1 /*
2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3  *                         University Research and Technology
4  *                         Corporation.  All rights reserved.
5  * Copyright (c) 2004-2011 The University of Tennessee and The University
6  *                         of Tennessee Research Foundation.  All rights
7  *                         reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  *                         University of Stuttgart.  All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  *                         All rights reserved.
12  * Copyright (c) 2007-2011 Oracle and/or its affiliates.  All rights reserved.
13  * Copyright (c) 2011      Oak Ridge National Labs.  All rights reserved.
14  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
15  *                         All rights reserved.
16  * Copyright (c) 2011-2017 Cisco Systems, Inc.  All rights reserved
17  * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
18  * Copyright (c) 2014-2018 Research Organization for Information Science
19  *                         and Technology (RIST).  All rights reserved.
20  * Copyright (c) 2017      Mellanox Technologies Ltd. All rights reserved.
21  * Copyright (c) 2017      IBM Corporation. All rights reserved.
22  * $COPYRIGHT$
23  *
24  * Additional copyrights may follow
25  *
26  * $HEADER$
27  */
28 
29 
30 #include "orte_config.h"
31 #include "orte/constants.h"
32 #include "orte/types.h"
33 
34 #ifdef HAVE_SYS_WAIT_H
35 #include <sys/wait.h>
36 #endif
37 #include <errno.h>
38 #ifdef HAVE_SYS_STAT_H
39 #include <sys/stat.h>
40 #endif  /* HAVE_SYS_STAT_H */
41 #ifdef HAVE_SYS_PARAM_H
42 #include <sys/param.h>
43 #endif
44 #include <time.h>
45 
46 #include <signal.h>
47 
48 #include "opal_stdint.h"
49 #include "opal/util/opal_environ.h"
50 #include "opal/util/argv.h"
51 #include "opal/util/os_dirpath.h"
52 #include "opal/util/os_path.h"
53 #include "opal/util/path.h"
54 #include "opal/util/sys_limits.h"
55 #include "opal/dss/dss.h"
56 #include "opal/mca/hwloc/hwloc-internal.h"
57 #include "opal/mca/shmem/base/base.h"
58 #include "opal/mca/pstat/pstat.h"
59 #include "opal/mca/pmix/pmix.h"
60 
61 #include "orte/mca/errmgr/errmgr.h"
62 #include "orte/mca/rml/rml.h"
63 #include "orte/mca/routed/routed.h"
64 #include "orte/mca/iof/iof.h"
65 #include "orte/mca/iof/base/iof_base_setup.h"
66 #include "orte/mca/ess/base/base.h"
67 #include "orte/mca/grpcomm/base/base.h"
68 #include "orte/mca/plm/base/base.h"
69 #include "orte/mca/regx/regx.h"
70 #include "orte/mca/rml/base/rml_contact.h"
71 #include "orte/mca/rmaps/rmaps_types.h"
72 #include "orte/mca/rmaps/base/base.h"
73 #include "orte/mca/rmaps/base/rmaps_private.h"
74 #include "orte/mca/rtc/rtc.h"
75 #include "orte/mca/schizo/schizo.h"
76 #include "orte/mca/state/state.h"
77 #include "orte/mca/filem/filem.h"
78 #include "orte/mca/dfs/dfs.h"
79 
80 #include "orte/util/context_fns.h"
81 #include "orte/util/name_fns.h"
82 #include "orte/util/session_dir.h"
83 #include "orte/util/proc_info.h"
84 #include "orte/util/show_help.h"
85 #include "orte/util/threads.h"
86 #include "orte/runtime/orte_globals.h"
87 #include "orte/runtime/orte_wait.h"
88 #include "orte/orted/orted.h"
89 #include "orte/orted/pmix/pmix_server.h"
90 
91 #if OPAL_ENABLE_FT_CR == 1
92 #include "orte/mca/snapc/snapc.h"
93 #include "orte/mca/snapc/base/base.h"
94 #include "orte/mca/sstore/sstore.h"
95 #include "orte/mca/sstore/base/base.h"
96 #include "opal/mca/crs/crs.h"
97 #include "opal/mca/crs/base/base.h"
98 #endif
99 
100 #include "orte/mca/odls/base/base.h"
101 #include "orte/mca/odls/base/odls_private.h"
102 
103 /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
104  * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
105 */
orte_odls_base_default_get_add_procs_data(opal_buffer_t * buffer,orte_jobid_t job)106 int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
107                                               orte_jobid_t job)
108 {
109     int rc, v;
110     orte_job_t *jdata=NULL, *jptr;
111     orte_job_map_t *map=NULL;
112     opal_buffer_t *wireup, jobdata;
113     opal_byte_object_t bo, *boptr;
114     int32_t numbytes, numjobs;
115     int8_t flag;
116     void *nptr;
117     uint32_t key;
118     char *nidmap;
119     orte_proc_t *dmn, *proc;
120     opal_value_t *val = NULL, *kv;
121     opal_list_t *modex;
122     int n;
123 
124 
125     /* get the job data pointer */
126     if (NULL == (jdata = orte_get_job_data_object(job))) {
127         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
128         return ORTE_ERR_BAD_PARAM;
129     }
130 
131     /* get a pointer to the job map */
132     map = jdata->map;
133     /* if there is no map, just return */
134     if (NULL == map) {
135         return ORTE_SUCCESS;
136     }
137 
138     /* if we couldn't provide the allocation regex on the orted
139      * cmd line, then we need to provide all the info here */
140     if (!orte_nidmap_communicated) {
141         if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &nidmap))) {
142             ORTE_ERROR_LOG(rc);
143             return rc;
144         }
145         orte_nidmap_communicated = true;
146     } else {
147         nidmap = NULL;
148     }
149     opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING);
150     if (NULL != nidmap) {
151         free(nidmap);
152     }
153 
154     /* if we haven't already done so, provide the info on the
155      * capabilities of each node */
156     if (1 < orte_process_info.num_procs &&
157         (!orte_node_info_communicated ||
158          orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL))) {
159         flag = 1;
160         opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
161         if (ORTE_SUCCESS != (rc = orte_regx.encode_nodemap(buffer))) {
162             ORTE_ERROR_LOG(rc);
163             return rc;
164         }
165         /* get wireup info for daemons */
166         if (NULL == (jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
167             ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
168             return ORTE_ERR_BAD_PARAM;
169         }
170         wireup = OBJ_NEW(opal_buffer_t);
171         /* always include data for mpirun as the daemons can't have it yet */
172         val = NULL;
173         if (opal_pmix.legacy_get()) {
174             if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
175                 ORTE_ERROR_LOG(rc);
176                 OBJ_RELEASE(wireup);
177                 return rc;
178             } else {
179                 /* pack the name of the daemon */
180                 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
181                     ORTE_ERROR_LOG(rc);
182                     OBJ_RELEASE(wireup);
183                     return rc;
184                 }
185                 /* pack the URI */
186                if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
187                     ORTE_ERROR_LOG(rc);
188                     OBJ_RELEASE(wireup);
189                     return rc;
190                 }
191                 OBJ_RELEASE(val);
192             }
193         } else {
194             if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
195                 ORTE_ERROR_LOG(rc);
196                 OBJ_RELEASE(wireup);
197                 return rc;
198             } else {
199                 /* the data is returned as a list of key-value pairs in the opal_value_t */
200                 if (OPAL_PTR != val->type) {
201                     ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
202                     OBJ_RELEASE(wireup);
203                     return ORTE_ERR_NOT_FOUND;
204                 }
205                 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
206                     ORTE_ERROR_LOG(rc);
207                     OBJ_RELEASE(wireup);
208                     return rc;
209                 }
210                 modex = (opal_list_t*)val->data.ptr;
211                 numbytes = (int32_t)opal_list_get_size(modex);
212                 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
213                     ORTE_ERROR_LOG(rc);
214                     OBJ_RELEASE(wireup);
215                     return rc;
216                 }
217                 OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
218                     if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
219                         ORTE_ERROR_LOG(rc);
220                         OBJ_RELEASE(wireup);
221                         return rc;
222                     }
223                 }
224                 OPAL_LIST_RELEASE(modex);
225                 OBJ_RELEASE(val);
226             }
227         }
228         /* if we didn't rollup the connection info, then we have
229          * to provide a complete map of connection info */
230         if (!orte_static_ports && !orte_fwd_mpirun_port) {
231             for (v=1; v < jptr->procs->size; v++) {
232                 if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
233                     continue;
234                 }
235                 val = NULL;
236                 if (opal_pmix.legacy_get()) {
237                     if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
238                         ORTE_ERROR_LOG(rc);
239                         OBJ_RELEASE(buffer);
240                         OBJ_RELEASE(wireup);
241                         return rc;
242                     } else {
243                         /* pack the name of the daemon */
244                         if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
245                             ORTE_ERROR_LOG(rc);
246                             OBJ_RELEASE(buffer);
247                             OBJ_RELEASE(wireup);
248                             return rc;
249                         }
250                         /* pack the URI */
251                        if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
252                             ORTE_ERROR_LOG(rc);
253                             OBJ_RELEASE(buffer);
254                             OBJ_RELEASE(wireup);
255                             return rc;
256                         }
257                         OBJ_RELEASE(val);
258                     }
259                 } else {
260                     if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
261                         ORTE_ERROR_LOG(rc);
262                         OBJ_RELEASE(buffer);
263                         return rc;
264                     } else {
265                         /* the data is returned as a list of key-value pairs in the opal_value_t */
266                         if (OPAL_PTR != val->type) {
267                             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
268                             OBJ_RELEASE(buffer);
269                             return ORTE_ERR_NOT_FOUND;
270                         }
271                         if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
272                             ORTE_ERROR_LOG(rc);
273                             OBJ_RELEASE(buffer);
274                             OBJ_RELEASE(wireup);
275                             return rc;
276                         }
277                         modex = (opal_list_t*)val->data.ptr;
278                         numbytes = (int32_t)opal_list_get_size(modex);
279                         if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
280                             ORTE_ERROR_LOG(rc);
281                             OBJ_RELEASE(buffer);
282                             OBJ_RELEASE(wireup);
283                             return rc;
284                         }
285                         OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
286                             if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
287                                 ORTE_ERROR_LOG(rc);
288                                 OBJ_RELEASE(buffer);
289                                 OBJ_RELEASE(wireup);
290                                 return rc;
291                             }
292                         }
293                         OPAL_LIST_RELEASE(modex);
294                         OBJ_RELEASE(val);
295                     }
296                 }
297             }
298         }
299         /* put it in a byte object for xmission */
300         opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
301         OBJ_RELEASE(wireup);
302         /* pack the byte object - zero-byte objects are fine */
303         bo.size = numbytes;
304         boptr = &bo;
305         if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &boptr, 1, OPAL_BYTE_OBJECT))) {
306             ORTE_ERROR_LOG(rc);
307             return rc;
308         }
309         /* release the data since it has now been copied into our buffer */
310         if (NULL != bo.bytes) {
311             free(bo.bytes);
312         }
313 
314         /* we need to ensure that any new daemons get a complete
315          * copy of all active jobs so the grpcomm collectives can
316          * properly work should a proc from one of the other jobs
317          * interact with this one */
318         if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) {
319             flag = 1;
320             opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
321             OBJ_CONSTRUCT(&jobdata, opal_buffer_t);
322             numjobs = 0;
323             rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr);
324             while (OPAL_SUCCESS == rc) {
325                 /* skip the one we are launching now */
326                 if (NULL != jptr && jptr != jdata &&
327                     ORTE_PROC_MY_NAME->jobid != jptr->jobid) {
328                     /* pack the job struct */
329                     if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) {
330                         ORTE_ERROR_LOG(rc);
331                         OBJ_DESTRUCT(&jobdata);
332                         return rc;
333                     }
334                     /* pack the location of each proc */
335                     for (n=0; n < jptr->procs->size; n++) {
336                         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, n))) {
337                             continue;
338                         }
339                         if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &proc->parent, 1, ORTE_VPID))) {
340                             ORTE_ERROR_LOG(rc);
341                             OBJ_DESTRUCT(&jobdata);
342                             return rc;
343                         }
344                     }
345                     ++numjobs;
346                 }
347                 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
348             }
349             /* pack the number of jobs */
350             if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &numjobs, 1, OPAL_INT32))) {
351                 ORTE_ERROR_LOG(rc);
352                 OBJ_DESTRUCT(&jobdata);
353                 return rc;
354             }
355             if (0 < numjobs) {
356                 /* pack the jobdata buffer */
357                 wireup = &jobdata;
358                 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &wireup, 1, OPAL_BUFFER))) {
359                     ORTE_ERROR_LOG(rc);
360                     OBJ_DESTRUCT(&jobdata);
361                     return rc;
362                 }
363                 OBJ_DESTRUCT(&jobdata);
364             }
365         } else {
366             flag = 0;
367             opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
368         }
369         orte_node_info_communicated = true;
370     } else {
371         /* mark that we didn't */
372         flag = 0;
373         opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
374         /* and that we didn't launch daemons */
375         flag = 0;
376         opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
377     }
378 
379     /* pack the job struct */
380     if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &jdata, 1, ORTE_JOB))) {
381         ORTE_ERROR_LOG(rc);
382         return rc;
383     }
384 
385     if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
386         /* compute and pack the ppn regex */
387         if (ORTE_SUCCESS != (rc = orte_regx.generate_ppn(jdata, &nidmap))) {
388             ORTE_ERROR_LOG(rc);
389             return rc;
390         }
391         if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING))) {
392             ORTE_ERROR_LOG(rc);
393             free(nidmap);
394             return rc;
395         }
396         free(nidmap);
397     }
398 
399     /* compute and pack the regex of ppn */
400 
401     return ORTE_SUCCESS;
402 }
403 
fm_release(void * cbdata)404 static void fm_release(void *cbdata)
405 {
406     opal_buffer_t *bptr = (opal_buffer_t*)cbdata;
407 
408     OBJ_RELEASE(bptr);
409 }
410 
/**
 * Parse a launch message and build the list of local children.
 *
 * Starting at the "new daemons were launched" flag, this unpacks:
 * optionally the other active jobs (each followed by the parent vpid
 * of each of its procs), then the job to be launched and, for jobs
 * that are not ORTE_JOB_FULLY_DESCRIBED and when we are not the HNP,
 * the ppn regex. Procs whose parent is this daemon are added to
 * orte_local_children, and the job is registered with the PMIx server.
 *
 * @param buffer  Launch message positioned at the "daemons launched"
 *                flag. The buffer is not released here.
 * @param job     Out: set to ORTE_JOBID_INVALID on entry and to the id
 *                of the job to launch once it has been unpacked.
 *
 * @return ORTE_SUCCESS, or an ORTE error code. On any error the
 *         ORTE_JOB_STATE_NEVER_LAUNCHED state is activated so that the
 *         failure gets reported instead of causing a hang.
 */
int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
                                                orte_jobid_t *job)
{
    int rc;
    orte_std_cntr_t cnt;
    orte_job_t *jdata=NULL, *daemons;
    orte_node_t *node;
    orte_vpid_t dmnvpid, v, nskip;
    int32_t n, k;
    opal_buffer_t *jobs = NULL;   /* buffer holding the other active jobs */
    opal_buffer_t *bptr;          /* file-map buffer taken from the job attributes */
    orte_proc_t *pptr, *dmn;
    orte_app_context_t *app;
    int8_t flag;
    char *ppn;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:constructing child list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* set a default response */
    *job = ORTE_JOBID_INVALID;
    /* get the daemon job object - checked for NULL where it is used */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

    /* unpack the flag to see if new daemons were launched */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }

    if (0 != flag) {
        /* see if additional jobs are included in the data */
        cnt=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &n, &cnt, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }

        if (0 < n) {
            /* unpack the buffer containing the info */
            cnt=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobs, &cnt, OPAL_BUFFER))) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
            for (k=0; k < n; k++) {
                /* unpack each job and add it to the local orte_job_data array */
                cnt=1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(jobs, &jdata, &cnt, ORTE_JOB))) {
                    ORTE_ERROR_LOG(rc);
                    goto REPORT_ERROR;
                }
                /* check to see if we already have this one */
                if (NULL != orte_get_job_data_object(jdata->jobid)) {
                    /* yep - so we can drop this copy. Invalidate the
                     * jobid first so that releasing the copy does not
                     * disturb the entry of the job we already hold */
                    nskip = jdata->num_procs;
                    jdata->jobid = ORTE_JOBID_INVALID;
                    OBJ_RELEASE(jdata);
                    /* the sender packed the parent vpid of each proc
                     * right behind the job struct - consume them so the
                     * next job is read from the correct position */
                    for (v=0; v < nskip; v++) {
                        cnt=1;
                        if (ORTE_SUCCESS != (rc = opal_dss.unpack(jobs, &dmnvpid, &cnt, ORTE_VPID))) {
                            ORTE_ERROR_LOG(rc);
                            goto REPORT_ERROR;
                        }
                    }
                    continue;
                }
                /* nope - add it */
                opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
                /* unpack the location of each proc in this job */
                for (v=0; v < jdata->num_procs; v++) {
                    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, v))) {
                        pptr = OBJ_NEW(orte_proc_t);
                        pptr->name.jobid = jdata->jobid;
                        pptr->name.vpid = v;
                        opal_pointer_array_set_item(jdata->procs, v, pptr);
                    }
                    cnt=1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(jobs, &dmnvpid, &cnt, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(jdata);
                        goto REPORT_ERROR;
                    }
                    /* lookup the daemon */
                    if (NULL == daemons ||
                        NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dmnvpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        rc = ORTE_ERR_NOT_FOUND;
                        goto REPORT_ERROR;
                    }
                    /* connect the two */
                    OBJ_RETAIN(dmn->node);
                    pptr->node = dmn->node;
                }
            }
            /* release the buffer */
            OBJ_RELEASE(jobs);
            jobs = NULL;
        }
    }

    /* unpack the job we are to launch */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &cnt, ORTE_JOB))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }
    if (ORTE_JOBID_INVALID == jdata->jobid) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        rc = ORTE_ERR_BAD_PARAM;
        goto REPORT_ERROR;
    }
    *job = jdata->jobid;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:construct_child_list unpacking data to launch job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));

    /* if we are the HNP, we don't need to unpack this buffer - we already
     * have all the required info in our local job array. So just build the
     * array of local children
     */
    if (ORTE_PROC_IS_HNP) {
        /* we don't want/need the extra copy of the orte_job_t, but
         * we can't just release it as that will NULL the location in
         * the orte_job_data array. So set the jobid to INVALID to
         * protect the array, and then release the object to free
         * the storage */
        jdata->jobid = ORTE_JOBID_INVALID;
        OBJ_RELEASE(jdata);
        /* get the correct job object - it will be completely filled out */
        if (NULL == (jdata = orte_get_job_data_object(*job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto REPORT_ERROR;
        }
    } else {
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* ensure the map object is present */
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
        }
    }

    /* if the job is fully described, then mpirun will have computed
     * and sent us the complete array of procs in the orte_job_t, so we
     * don't need to do anything more here */
    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
        if (!ORTE_PROC_IS_HNP) {
            /* extract the ppn regex */
            cnt = 1;
            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ppn, &cnt, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
            /* populate the node array of the job map and the proc array of
             * the job object so we know how many procs are on each node */
            rc = orte_regx.parse_ppn(jdata, ppn);
            free(ppn);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
            /* now assign locations to the procs */
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
        }
        /* compute the ranks and add the proc objects
         * to the jdata->procs array */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }
        /* and finally, compute the local and node ranks */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }
    }

    /* now that the node array in the job map and jdata are completely filled out,
     * we need to "wireup" the procs to their nodes so other utilities can
     * locate them */
    for (n=0; n < jdata->procs->size; n++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
            continue;
        }
        if (ORTE_PROC_STATE_UNDEF == pptr->state) {
            /* not ready for use yet */
            continue;
        }
        if (!ORTE_PROC_IS_HNP &&
            orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
            /* the parser will have already made the connection, but the fully described
             * case won't have done it, so connect the proc to its node here */
            opal_output_verbose(5, orte_odls_base_framework.framework_output,
                                "%s GETTING DAEMON FOR PROC %s WITH PARENT %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&pptr->name),
                                ORTE_VPID_PRINT(pptr->parent));
            if (ORTE_VPID_INVALID == pptr->parent) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                rc = ORTE_ERR_BAD_PARAM;
                goto REPORT_ERROR;
            }
            /* connect the proc to its node object */
            if (NULL == daemons ||
                NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, pptr->parent))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                goto REPORT_ERROR;
            }
            OBJ_RETAIN(dmn->node);
            pptr->node = dmn->node;
            /* add the node to the job map, if needed. The MAPPED flag
             * is only used here to avoid adding a node twice; it is
             * reset once this loop has finished */
            if (!ORTE_FLAG_TEST(pptr->node, ORTE_NODE_FLAG_MAPPED)) {
                OBJ_RETAIN(pptr->node);
                opal_pointer_array_add(jdata->map->nodes, pptr->node);
                jdata->map->num_nodes++;
                ORTE_FLAG_SET(pptr->node, ORTE_NODE_FLAG_MAPPED);
            }
            /* add this proc to that node */
            OBJ_RETAIN(pptr);
            opal_pointer_array_add(pptr->node->procs, pptr);
            pptr->node->num_procs++;
        }
        /* see if it belongs to us */
        if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
            /* is this child on our current list of children */
            if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
                /* not on the local list */
                OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                     "%s[%s:%d] adding proc %s to my local list",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     __FILE__, __LINE__,
                                     ORTE_NAME_PRINT(&pptr->name)));
                /* keep tabs of the number of local procs */
                jdata->num_local_procs++;
                /* add this proc to our child list */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
                opal_pointer_array_add(orte_local_children, pptr);
            }

            /* if the job is in restart mode, the child must not barrier when launched */
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
                orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            }
            /* mark that this app_context is being used on this node */
            app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
            if (NULL == app) {
                /* the proc refers to an app_context we do not have */
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                goto REPORT_ERROR;
            }
            ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
        }
    }
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
        /* reset the mapped flags */
        for (n=0; n < jdata->map->nodes->size; n++) {
            if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
                ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
            }
        }
    } else {
        /* compute and save bindings of local children */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }
    }

    /* if we wanted to see the map, now is the time to display it */
    if (jdata->map->display_map) {
        orte_rmaps_base_display_map(jdata);
    }

    /* if we have a file map, then we need to load it */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FILE_MAPS, (void**)&bptr, OPAL_BUFFER)) {
        if (NULL != orte_dfs.load_file_maps) {
            /* fm_release is passed as the callback that releases bptr */
            orte_dfs.load_file_maps(jdata->jobid, bptr, fm_release, bptr);
        } else {
            OBJ_RELEASE(bptr);
        }
    }

    /* load any controls into the job */
    orte_rtc.assign(jdata);

    /* register this job with the PMIx server - need to wait until after we
     * have computed the #local_procs before calling the function */
    if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace(jdata, false))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }

    return ORTE_SUCCESS;

  REPORT_ERROR:
    /* drop the buffer of other active jobs if we failed while parsing it */
    if (NULL != jobs) {
        OBJ_RELEASE(jobs);
    }
    /* we have to report an error back to the HNP so we don't just
     * hang. Although there shouldn't be any errors once this is
     * all debugged, it is still good practice to have a way
     * for it to happen - especially so developers don't have to
     * deal with the hang!
     */
    ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_NEVER_LAUNCHED);
    return rc;
}
720 
/*
 * Move into the working directory of an app and record where we ended up.
 *
 * Unless the app carries the ORTE_APP_SSNDIR_CWD attribute, this calls
 * orte_util_check_context_cwd(), captures the resulting current directory
 * with getcwd(), returns a heap copy of it through *wdir and stores it in
 * the app's environment as PWD and OPAL_MCA_PREFIX"initial_wdir".
 *
 * @param app   app context whose attributes are consulted and whose env
 *              is updated
 * @param wdir  out: strdup'd working directory (the caller frees it), or
 *              NULL when ORTE_APP_SSNDIR_CWD is set. Left untouched on
 *              any failure.
 *
 * @return ORTE_SUCCESS, the error returned by orte_util_check_context_cwd(),
 *         or ORTE_ERR_OUT_OF_RESOURCE when the directory cannot be
 *         captured or copied.
 */
static int setup_path(orte_app_context_t *app, char **wdir)
{
    int rc = ORTE_SUCCESS;
    char dir[MAXPATHLEN];
    char *copy;

    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* nothing to set up here - report "no directory" to the caller */
        *wdir = NULL;
        return ORTE_SUCCESS;
    }

    /* Try to change to the app's cwd and check that the app
     * exists and is executable. The function will
     * take care of outputting a pretty error message, if required
     */
    if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
        /* do not ERROR_LOG - it will be reported elsewhere */
        return rc;
    }

    /* The prior function will have done a chdir() to jump us to
     * wherever the app is to be executed. This could be either where
     * the user specified (via -wdir), or to the user's home directory
     * on this node if nothing was provided. It seems that chdir doesn't
     * adjust the $PWD enviro variable when it changes the directory. This
     * can cause a user to get a different response when doing getcwd vs
     * looking at the enviro variable. To keep this consistent, we explicitly
     * ensure that the PWD enviro variable matches the CWD we moved to.
     *
     * NOTE: if a user's program does a chdir(), then $PWD will once
     * again not match getcwd! This is beyond our control - we are only
     * ensuring they start out matching.
     */
    if (NULL == getcwd(dir, sizeof(dir))) {
        /* the buffer contents are undefined on failure - never use them */
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (NULL == (copy = strdup(dir))) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    *wdir = copy;
    opal_setenv("PWD", dir, true, &app->env);
    /* update the initial wdir value too */
    opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);

    return rc;
}
761 
762 
763 /* define a timer release point so that we can wait for
764  * file descriptors to come available, if necessary
765  */
timer_cb(int fd,short event,void * cbdata)766 static void timer_cb(int fd, short event, void *cbdata)
767 {
768     orte_timer_t *tm = (orte_timer_t*)cbdata;
769     orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
770 
771     ORTE_ACQUIRE_OBJECT(tm);
772 
773     /* increment the number of retries */
774     ll->retries++;
775 
776     /* re-attempt the launch */
777     opal_event_active(ll->ev, OPAL_EV_WRITE, 1);
778 
779     /* release the timer event */
780     OBJ_RELEASE(tm);
781 }
782 
/*
 * Count the entries of orte_local_children that are flagged
 * ORTE_PROC_FLAG_ALIVE and do not belong to the given job.
 *
 * Members of `job` are left out on purpose: the caller adds them
 * separately, if required.
 *
 * @param job  jobid whose procs are excluded from the count
 * @return number of alive local procs belonging to other jobs
 */
static int compute_num_procs_alive(orte_jobid_t job)
{
    int alive = 0;
    int idx;

    for (idx = 0; idx < orte_local_children->size; idx++) {
        orte_proc_t *proc = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);

        /* count only populated slots that are alive and from another job */
        if (NULL != proc &&
            ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ALIVE) &&
            job != proc->name.jobid) {
            ++alive;
        }
    }
    return alive;
}
806 
orte_odls_base_spawn_proc(int fd,short sd,void * cbdata)807 void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
808 {
809     orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cbdata;
810     orte_job_t *jobdat = cd->jdata;
811     orte_app_context_t *app = cd->app;
812     orte_proc_t *child = cd->child;
813     int rc, i;
814     bool found;
815     orte_proc_state_t state;
816     char **argvptr;
817     char *pathenv = NULL, *mpiexec_pathenv = NULL;
818     char *full_search;
819 
820     ORTE_ACQUIRE_OBJECT(cd);
821 
822     /* thread-protect common values */
823     cd->env = opal_argv_copy(app->env);
824 
825     /* ensure we clear any prior info regarding state or exit status in
826      * case this is a restart
827      */
828     child->exit_code = 0;
829     ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
830 
831     /* setup the pmix environment */
832     if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
833         ORTE_ERROR_LOG(rc);
834         state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
835         goto errorout;
836     }
837 
838     /* if we are not forwarding output for this job, then
839      * flag iof as complete
840      */
841     if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
842         ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
843     } else {
844         ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
845     }
846     child->pid = 0;
847     if (NULL != child->rml_uri) {
848         free(child->rml_uri);
849         child->rml_uri = NULL;
850     }
851 
852     /* setup the rest of the environment with the proc-specific items - these
853      * will be overwritten for each child
854      */
855     if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
856         ORTE_ERROR_LOG(rc);
857         state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
858         goto errorout;
859     }
860 
861     /* Search for the OMPI_exec_path and PATH settings in the environment. */
862     for (argvptr = app->env; *argvptr != NULL; argvptr++) {
863         if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
864             mpiexec_pathenv = *argvptr + 15;
865         }
866         if (0 == strncmp("PATH=", *argvptr, 5)) {
867             pathenv = *argvptr + 5;
868         }
869     }
870 
871     /* If OMPI_exec_path is set (meaning --path was used), then create a
872        temporary environment to be used in the search for the executable.
873        The PATH setting in this temporary environment is a combination of
874        the OMPI_exec_path and PATH values.  If OMPI_exec_path is not set,
875        then just use existing environment with PATH in it.  */
876     if (NULL != mpiexec_pathenv) {
877         argvptr = NULL;
878         if (pathenv != NULL) {
879             asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
880         } else {
881             asprintf(&full_search, "%s", mpiexec_pathenv);
882         }
883         opal_setenv("PATH", full_search, true, &argvptr);
884         free(full_search);
885     } else {
886         argvptr = app->env;
887     }
888 
889     rc = orte_util_check_context_app(app, argvptr);
890     /* do not ERROR_LOG - it will be reported elsewhere */
891     if (NULL != mpiexec_pathenv) {
892         opal_argv_free(argvptr);
893     }
894     if (ORTE_SUCCESS != rc) {
895         state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
896         goto errorout;
897     }
898 
899     /* did the user request we display output in xterms? */
900     if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
901         opal_list_item_t *nmitem;
902         orte_namelist_t *nm;
903         /* see if this rank is one of those requested */
904         found = false;
905         for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
906              nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
907              nmitem = opal_list_get_next(nmitem)) {
908             nm = (orte_namelist_t*)nmitem;
909             if (ORTE_VPID_WILDCARD == nm->name.vpid ||
910                 child->name.vpid == nm->name.vpid) {
911                 /* we want this one - modify the app's command to include
912                  * the orte xterm cmd that starts with the xtermcmd */
913                 cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
914                 /* insert the rank into the correct place as a window title */
915                 free(cd->argv[2]);
916                 asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
917                 /* add in the argv from the app */
918                 for (i=0; NULL != app->argv[i]; i++) {
919                     opal_argv_append_nosize(&cd->argv, app->argv[i]);
920                 }
921                 /* use the xterm cmd as the app string */
922                 cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
923                 found = true;
924                 break;
925             } else if (jobdat->num_procs <= nm->name.vpid) {  /* check for bozo case */
926                 /* can't be done! */
927                 orte_show_help("help-orte-odls-base.txt",
928                                "orte-odls-base:xterm-rank-out-of-bounds",
929                                true, orte_process_info.nodename,
930                                nm->name.vpid, jobdat->num_procs);
931                 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
932                 goto errorout;
933             }
934         }
935         if (!found) {
936             cd->cmd = strdup(app->app);
937             cd->argv = opal_argv_copy(app->argv);
938         }
939     } else if (NULL != orte_fork_agent) {
940         /* we were given a fork agent - use it */
941         cd->argv = opal_argv_copy(orte_fork_agent);
942         /* add in the argv from the app */
943         for (i=0; NULL != app->argv[i]; i++) {
944             opal_argv_append_nosize(&cd->argv, app->argv[i]);
945         }
946         cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
947         if (NULL == cd->cmd) {
948             orte_show_help("help-orte-odls-base.txt",
949                            "orte-odls-base:fork-agent-not-found",
950                            true, orte_process_info.nodename, orte_fork_agent[0]);
951             state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
952             goto errorout;
953         }
954     } else {
955         cd->cmd = strdup(app->app);
956         cd->argv = opal_argv_copy(app->argv);
957     }
958 
959     /* if we are indexing the argv by rank, do so now */
960     if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
961         char *param;
962         asprintf(&param, "%s-%d", cd->argv[0], (int)child->name.vpid);
963         free(cd->argv[0]);
964         cd->argv[0] = param;
965     }
966 
967     if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
968         opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
969                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
970                     ORTE_NAME_PRINT(&child->name));
971 
972         /* dump what is going to be exec'd */
973         if (7 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
974             opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
975         }
976     }
977 
978     if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
979         /* error message already output */
980         state = ORTE_PROC_STATE_FAILED_TO_START;
981         goto errorout;
982     }
983 
984     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
985     OBJ_RELEASE(cd);
986     return;
987 
988   errorout:
989     ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
990     child->exit_code = rc;
991     ORTE_ACTIVATE_PROC_STATE(&child->name, state);
992     OBJ_RELEASE(cd);
993 }
994 
orte_odls_base_default_launch_local(int fd,short sd,void * cbdata)995 void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
996 {
997     orte_app_context_t *app;
998     orte_proc_t *child=NULL;
999     int rc=ORTE_SUCCESS;
1000     char basedir[MAXPATHLEN];
1001     int j, idx;
1002     int total_num_local_procs = 0;
1003     orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
1004     orte_job_t *jobdat;
1005     orte_jobid_t job = caddy->job;
1006     orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
1007     bool index_argv;
1008     char *msg;
1009     orte_odls_spawn_caddy_t *cd;
1010     opal_event_base_t *evb;
1011     char *effective_dir = NULL;
1012 
1013     ORTE_ACQUIRE_OBJECT(caddy);
1014 
1015     opal_output_verbose(5, orte_odls_base_framework.framework_output,
1016                         "%s local:launch",
1017                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1018 
1019     /* establish our baseline working directory - we will be potentially
1020      * bouncing around as we execute various apps, but we will always return
1021      * to this place as our default directory
1022      */
1023     getcwd(basedir, sizeof(basedir));
1024 
1025     /* find the jobdat for this job */
1026     if (NULL == (jobdat = orte_get_job_data_object(job))) {
1027         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1028         /* not much we can do here - we are just hosed, so
1029          * report that to the error manager
1030          */
1031         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1032         goto ERROR_OUT;
1033     }
1034 
1035     /* do we have any local procs to launch? */
1036     if (0 == jobdat->num_local_procs) {
1037         /* indicate that we are done trying to launch them */
1038         opal_output_verbose(5, orte_odls_base_framework.framework_output,
1039                             "%s local:launch no local procs",
1040                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1041         goto GETOUT;
1042     }
1043 
1044     /* track if we are indexing argvs so we don't check every time */
1045     index_argv = orte_get_attribute(&jobdat->attributes, ORTE_JOB_INDEX_ARGV, NULL, OPAL_BOOL);
1046 
1047     /* compute the total number of local procs currently alive and about to be launched */
1048     total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
1049 
1050     /* check the system limits - if we are at our max allowed children, then
1051      * we won't be allowed to do this anyway, so we may as well abort now.
1052      * According to the documentation, num_procs = 0 is equivalent to
1053      * no limit, so treat it as unlimited here.
1054      */
1055     if (0 < opal_sys_limits.num_procs) {
1056         OPAL_OUTPUT_VERBOSE((10,  orte_odls_base_framework.framework_output,
1057                              "%s checking limit on num procs %d #children needed %d",
1058                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1059                              opal_sys_limits.num_procs, total_num_local_procs));
1060         if (opal_sys_limits.num_procs < total_num_local_procs) {
1061             if (2 < caddy->retries) {
1062                 /* if we have already tried too many times, then just give up */
1063                 ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1064                 goto ERROR_OUT;
1065             }
1066             /* set a timer event so we can retry later - this
1067              * gives the system a chance to let other procs
1068              * terminate, thus creating room for new ones
1069              */
1070             ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1071             return;
1072         }
1073     }
1074 
1075     /* check to see if we have enough available file descriptors
1076      * to launch these children - if not, then let's wait a little
1077      * while to see if some come free. This can happen if we are
1078      * in a tight loop over comm_spawn
1079      */
1080     if (0 < opal_sys_limits.num_files) {
1081         int limit;
1082         limit = 4*total_num_local_procs + 6*jobdat->num_local_procs;
1083         OPAL_OUTPUT_VERBOSE((10,  orte_odls_base_framework.framework_output,
1084                              "%s checking limit on file descriptors %d need %d",
1085                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1086                              opal_sys_limits.num_files, limit));
1087         if (opal_sys_limits.num_files < limit) {
1088             if (2 < caddy->retries) {
1089                 /* tried enough - give up */
1090                 for (idx=0; idx < orte_local_children->size; idx++) {
1091                     if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1092                         continue;
1093                     }
1094                     if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1095                         child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1096                         ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1097                     }
1098                 }
1099                 goto ERROR_OUT;
1100             }
1101             /* don't have enough - wait a little time */
1102             ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1103             return;
1104         }
1105     }
1106 
1107     for (j=0; j < jobdat->apps->size; j++) {
1108         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
1109             continue;
1110         }
1111 
1112         /* if this app isn't being used on our node, skip it */
1113         if (!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE)) {
1114             opal_output_verbose(5, orte_odls_base_framework.framework_output,
1115                                 "%s app %d not used on node",
1116                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
1117             continue;
1118         }
1119 
1120         /* setup the environment for this app */
1121         if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
1122 
1123             OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1124                                  "%s odls:launch:setup_fork failed with error %s",
1125                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1126                                  ORTE_ERROR_NAME(rc)));
1127 
1128             /* do not ERROR_LOG this failure - it will be reported
1129              * elsewhere. The launch is going to fail. Since we could have
1130              * multiple app_contexts, we need to ensure that we flag only
1131              * the correct one that caused this operation to fail. We then have
1132              * to flag all the other procs from the app_context as having "not failed"
1133              * so we can report things out correctly
1134              */
1135             /* cycle through children to find those for this jobid */
1136             for (idx=0; idx < orte_local_children->size; idx++) {
1137                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1138                     continue;
1139                 }
1140                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1141                     j == (int)child->app_idx) {
1142                     child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1143                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1144                 }
1145             }
1146             goto GETOUT;
1147         }
1148 
1149         /* setup the working directory for this app - will jump us
1150          * to that directory
1151          */
1152         if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
1153             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1154                                  "%s odls:launch:setup_path failed with error %s(%d)",
1155                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1156                                  ORTE_ERROR_NAME(rc), rc));
1157             /* do not ERROR_LOG this failure - it will be reported
1158              * elsewhere. The launch is going to fail. Since we could have
1159              * multiple app_contexts, we need to ensure that we flag only
1160              * the correct one that caused this operation to fail. We then have
1161              * to flag all the other procs from the app_context as having "not failed"
1162              * so we can report things out correctly
1163              */
1164             /* cycle through children to find those for this jobid */
1165             for (idx=0; idx < orte_local_children->size; idx++) {
1166                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1167                     continue;
1168                 }
1169                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1170                     j == (int)child->app_idx) {
1171                     child->exit_code = rc;
1172                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1173                 }
1174             }
1175             goto GETOUT;
1176         }
1177 
1178         /* setup any local files that were prepositioned for us */
1179         if (ORTE_SUCCESS != (rc = orte_filem.link_local_files(jobdat, app))) {
1180             /* cycle through children to find those for this jobid */
1181             for (idx=0; idx < orte_local_children->size; idx++) {
1182                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1183                     continue;
1184                 }
1185                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1186                     j == (int)child->app_idx) {
1187                     child->exit_code = rc;
1188                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1189                 }
1190             }
1191             goto GETOUT;
1192         }
1193 
1194         /* tell all children that they are being launched via ORTE */
1195         opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
1196 
1197         /* if the user requested it, set the system resource limits */
1198         if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
1199             orte_show_help("help-orte-odls-default.txt", "set limit", true,
1200                            orte_process_info.nodename, app,
1201                            __FILE__, __LINE__, msg);
1202             /* cycle through children to find those for this jobid */
1203             for (idx=0; idx < orte_local_children->size; idx++) {
1204                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1205                     continue;
1206                 }
1207                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1208                     j == (int)child->app_idx) {
1209                     child->exit_code = rc;
1210                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1211                 }
1212             }
1213             goto GETOUT;
1214         }
1215 
1216         /* reset our working directory back to our default location - if we
1217          * don't do this, then we will be looking for relative paths starting
1218          * from the last wdir option specified by the user. Thus, we would
1219          * be requiring that the user keep track on the cmd line of where
1220          * each app was located relative to the prior app, instead of relative
1221          * to their current location
1222          */
1223         chdir(basedir);
1224 
1225         /* okay, now let's launch all the local procs for this app using the provided fork_local fn */
1226         for (idx=0; idx < orte_local_children->size; idx++) {
1227             if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1228                 continue;
1229             }
1230             /* does this child belong to this app? */
1231             if (j != (int)child->app_idx) {
1232                 continue;
1233             }
1234 
1235             /* is this child already alive? This can happen if
1236              * we are asked to launch additional processes.
1237              * If it has been launched, then do nothing
1238              */
1239             if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1240 
1241                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1242                                      "%s odls:launch child %s has already been launched",
1243                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1244                                      ORTE_NAME_PRINT(&child->name)));
1245 
1246                 continue;
1247             }
1248             /* is this child a candidate to start? it may not be alive
1249              * because it already executed
1250              */
1251             if (ORTE_PROC_STATE_INIT != child->state &&
1252                 ORTE_PROC_STATE_RESTART != child->state) {
1253                 continue;
1254             }
1255             /* do we have a child from the specified job. Because the
1256              * job could be given as a WILDCARD value, we must use
1257              * the dss.compare function to check for equality.
1258              */
1259             if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1260 
1261                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1262                                      "%s odls:launch child %s is not in job %s being launched",
1263                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1264                                      ORTE_NAME_PRINT(&child->name),
1265                                      ORTE_JOBID_PRINT(job)));
1266 
1267                 continue;
1268             }
1269 
1270             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1271                                  "%s odls:launch working child %s",
1272                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1273                                  ORTE_NAME_PRINT(&child->name)));
1274 
1275             /* set the waitpid callback here for thread protection and
1276              * to ensure we can capture the callback on shortlived apps */
1277             ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
1278             orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);
1279 
1280             /* dispatch this child to the next available launch thread */
1281             cd = OBJ_NEW(orte_odls_spawn_caddy_t);
1282             if (NULL != effective_dir) {
1283                 cd->wdir = strdup(effective_dir);
1284             }
1285             cd->jdata = jobdat;
1286             cd->app = app;
1287             cd->child = child;
1288             cd->fork_local = fork_local;
1289             cd->index_argv = index_argv;
1290             /* setup any IOF */
1291             cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
1292 
1293             /* do we want to setup stdin? */
1294             if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
1295                  child->name.vpid == jobdat->stdin_target) {
1296                 cd->opts.connect_stdin = true;
1297             } else {
1298                 cd->opts.connect_stdin = false;
1299             }
1300             if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
1301                 ORTE_ERROR_LOG(rc);
1302                 child->exit_code = rc;
1303                 OBJ_RELEASE(cd);
1304                 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1305                 goto GETOUT;
1306             }
1307             if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
1308                 /* connect endpoints IOF */
1309                 rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
1310                 if (ORTE_SUCCESS != rc) {
1311                     ORTE_ERROR_LOG(rc);
1312                     OBJ_RELEASE(cd);
1313                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1314                     goto GETOUT;
1315                 }
1316             }
1317             ++orte_odls_globals.next_base;
1318             if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1319                 orte_odls_globals.next_base = 0;
1320             }
1321             opal_output_verbose(1, orte_odls_base_framework.framework_output,
1322                                 "%s odls:dispatch %s to thread %d",
1323                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1324                                 ORTE_NAME_PRINT(&child->name),
1325                                 orte_odls_globals.next_base);
1326             evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1327             opal_event_set(evb, &cd->ev, -1,
1328                            OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
1329             opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
1330             opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
1331 
1332         }
1333         if (NULL != effective_dir) {
1334             free(effective_dir);
1335             effective_dir = NULL;
1336         }
1337     }
1338 
1339   GETOUT:
1340     if (NULL != effective_dir) {
1341         free(effective_dir);
1342         effective_dir = NULL;
1343     }
1344     /* tell the state machine that all local procs for this job
1345      * were launched so that it can do whatever it needs to do,
1346      * like send a state update message for all procs to the HNP
1347      */
1348     ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE);
1349 
1350   ERROR_OUT:
1351     /* ensure we reset our working directory back to our default location  */
1352     chdir(basedir);
1353     /* release the event */
1354     OBJ_RELEASE(caddy);
1355 }
1356 
1357 /**
1358 *  Pass a signal to my local procs
1359  */
1360 
/*
 * Deliver a signal to one or all local procs through signal_local.
 *
 * @param proc          proc to signal, or NULL to signal every local proc
 * @param signal        signal number handed to signal_local
 * @param signal_local  function that delivers the signal to a pid
 *
 * Children with pid 0 or without ORTE_PROC_FLAG_ALIVE are never
 * signaled, in either mode.
 *
 * @return With proc == NULL: ORTE_SUCCESS if every delivery succeeded
 *         (or there was nothing to signal), otherwise the first error
 *         returned by signal_local; all children are still attempted.
 *         With a specific proc: the result of signal_local for the first
 *         matching running child, or ORTE_ERR_NOT_FOUND if no matching
 *         running child exists.
 */
int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal,
                                              orte_odls_base_signal_local_fn_t signal_local)
{
    int rc, i;
    int first_err = ORTE_SUCCESS;
    orte_proc_t *child;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls: signaling proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* if procs is NULL, then we want to signal all
     * of the local procs, so just do that case
     */
    if (NULL == proc) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (0 == child->pid || !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                /* skip this one as the child isn't alive */
                continue;
            }
            if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
                ORTE_ERROR_LOG(rc);
                /* keep going, but do not let a later success hide this */
                if (ORTE_SUCCESS == first_err) {
                    first_err = rc;
                }
            }
        }
        return first_err;
    }

    /* we want it sent to some specified process, so find it */
    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        if (OPAL_EQUAL != opal_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) {
            continue;
        }
        if (0 == child->pid || !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
            /* same guard as the signal-everyone case: never hand pid 0 or
             * the pid of a dead child to signal_local. Presumably
             * signal_local ends up in kill(2), where pid 0 addresses the
             * caller's whole process group - verify against the component */
            continue;
        }
        if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* only way to get here is if we couldn't find the specified proc,
     * or it is not currently running. report that as an error and
     * return it
     */
    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
    return ORTE_ERR_NOT_FOUND;
}
1411 
1412 /*
1413  *  Wait for a callback indicating the child has completed.
1414  */
1415 
ompi_odls_base_default_wait_local_proc(orte_proc_t * proc,void * cbdata)1416 void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
1417 {
1418     int i;
1419     orte_job_t *jobdat;
1420     orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
1421     orte_proc_t *cptr;
1422 
1423     opal_output_verbose(5, orte_odls_base_framework.framework_output,
1424                         "%s odls:wait_local_proc child process %s pid %ld terminated",
1425                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1426                         ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
1427 
1428     /* if the child was previously flagged as dead, then just
1429      * update its exit status and
1430      * ensure that its exit state gets reported to avoid hanging
1431      * don't forget to check if the process was signaled.
1432      */
1433     if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ALIVE)) {
1434         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1435                              "%s odls:waitpid_fired child %s was already dead exit code %d",
1436                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1437                              ORTE_NAME_PRINT(&proc->name),proc->exit_code));
1438         if (WIFEXITED(proc->exit_code)) {
1439             proc->exit_code = WEXITSTATUS(proc->exit_code);
1440             if (0 != proc->exit_code) {
1441                 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1442             }
1443         } else {
1444             if (WIFSIGNALED(proc->exit_code)) {
1445                 state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1446                 proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1447             }
1448         }
1449         goto MOVEON;
1450     }
1451 
1452     /* if the proc called "abort", then we just need to flag that it
1453      * came thru here */
1454     if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
1455         /* even though the process exited "normally", it happened
1456          * via an orte_abort call
1457          */
1458         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1459                              "%s odls:waitpid_fired child %s died by call to abort",
1460                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1461                              ORTE_NAME_PRINT(&proc->name)));
1462         state = ORTE_PROC_STATE_CALLED_ABORT;
1463         /* regardless of our eventual code path, we need to
1464          * flag that this proc has had its waitpid fired */
1465         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1466         goto MOVEON;
1467     }
1468 
1469     /* get the jobdat for this child */
1470     if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) {
1471         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1472         goto MOVEON;
1473     }
1474 
1475     /* if this is a debugger daemon, then just report the state
1476      * and return as we aren't monitoring it
1477      */
1478     if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON))  {
1479         goto MOVEON;
1480     }
1481 
1482     /* if this child was ordered to die, then just pass that along
1483      * so we don't hang
1484      */
1485     if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
1486         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1487                              "%s odls:waitpid_fired child %s was ordered to die",
1488                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1489                              ORTE_NAME_PRINT(&proc->name)));
1490         /* regardless of our eventual code path, we need to
1491          * flag that this proc has had its waitpid fired */
1492         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1493         goto MOVEON;
1494     }
1495 
1496     /* determine the state of this process */
1497     if (WIFEXITED(proc->exit_code)) {
1498 
1499         /* set the exit status appropriately */
1500         proc->exit_code = WEXITSTATUS(proc->exit_code);
1501 
1502         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1503                              "%s odls:waitpid_fired child %s exit code %d",
1504                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1505                              ORTE_NAME_PRINT(&proc->name), proc->exit_code));
1506 
1507         /* provide a default state */
1508         state = ORTE_PROC_STATE_WAITPID_FIRED;
1509 
1510         /* check to see if a sync was required and if it was received */
1511         if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_REG)) {
1512             if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_HAS_DEREG) ||
1513                 orte_allowed_exit_without_sync || 0 != proc->exit_code) {
1514                 /* if we did recv a finalize sync, or one is not required,
1515                  * then declare it normally terminated
1516                  * unless it returned with a non-zero status indicating the code
1517                  * felt it was non-normal - in this latter case, we do not
1518                  * require that the proc deregister before terminating
1519                  */
1520                 if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1521                     state = ORTE_PROC_STATE_TERM_NON_ZERO;
1522                     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1523                                          "%s odls:waitpid_fired child process %s terminated normally "
1524                                          "but with a non-zero exit status - it "
1525                                          "will be treated as an abnormal termination",
1526                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1527                                          ORTE_NAME_PRINT(&proc->name)));
1528                 } else {
1529                     /* indicate the waitpid fired */
1530                     state = ORTE_PROC_STATE_WAITPID_FIRED;
1531                 }
1532             } else {
1533                 /* we required a finalizing sync and didn't get it, so this
1534                  * is considered an abnormal termination and treated accordingly
1535                  */
1536                 state = ORTE_PROC_STATE_TERM_WO_SYNC;
1537                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1538                                      "%s odls:waitpid_fired child process %s terminated normally "
1539                                      "but did not provide a required finalize sync - it "
1540                                      "will be treated as an abnormal termination",
1541                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1542                                      ORTE_NAME_PRINT(&proc->name)));
1543             }
1544         } else {
1545             /* has any child in this job already registered? */
1546             for (i=0; i < orte_local_children->size; i++) {
1547                 if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1548                     continue;
1549                 }
1550                 if (cptr->name.jobid != proc->name.jobid) {
1551                     continue;
1552                 }
1553                 if (ORTE_FLAG_TEST(cptr, ORTE_PROC_FLAG_REG) && !orte_allowed_exit_without_sync) {
1554                     /* someone has registered, and we didn't before
1555                      * terminating - this is an abnormal termination unless
1556                      * the allowed_exit_without_sync flag is set
1557                      */
1558                     if (0 != proc->exit_code) {
1559                         state = ORTE_PROC_STATE_TERM_NON_ZERO;
1560                         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1561                                              "%s odls:waitpid_fired child process %s terminated normally "
1562                                              "but with a non-zero exit status - it "
1563                                              "will be treated as an abnormal termination",
1564                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1565                                              ORTE_NAME_PRINT(&proc->name)));
1566                     } else {
1567                         state = ORTE_PROC_STATE_TERM_WO_SYNC;
1568                         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1569                                              "%s odls:waitpid_fired child process %s terminated normally "
1570                                              "but did not provide a required init sync - it "
1571                                              "will be treated as an abnormal termination",
1572                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1573                                              ORTE_NAME_PRINT(&proc->name)));
1574                     }
1575                     goto MOVEON;
1576                 }
1577             }
1578             /* if no child has registered, then it is possible that
1579              * none of them will. This is considered acceptable. Still
1580              * flag it as abnormal if the exit code was non-zero
1581              */
1582             if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1583                 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1584             } else {
1585                 state = ORTE_PROC_STATE_WAITPID_FIRED;
1586             }
1587         }
1588 
1589         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1590                              "%s odls:waitpid_fired child process %s terminated %s",
1591                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1592                              ORTE_NAME_PRINT(&proc->name),
1593                              (0 == proc->exit_code) ? "normally" : "with non-zero status"));
1594     } else {
1595         /* the process was terminated with a signal! That's definitely
1596          * abnormal, so indicate that condition
1597          */
1598         state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1599         /* If a process was killed by a signal, then make the
1600          * exit code of orterun be "signo + 128" so that "prog"
1601          * and "orterun prog" will both yield the same exit code.
1602          *
1603          * This is actually what the shell does for you when
1604          * a process dies by signal, so this makes orterun treat
1605          * the termination code to exit status translation the
1606          * same way
1607          */
1608         proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1609 
1610         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1611                              "%s odls:waitpid_fired child process %s terminated with signal",
1612                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1613                              ORTE_NAME_PRINT(&proc->name) ));
1614         /* Do not decrement the number of local procs here. That is handled in the errmgr */
1615     }
1616 
1617  MOVEON:
1618     /* cancel the wait as this proc has already terminated */
1619     orte_wait_cb_cancel(proc);
1620     ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
1621 }
1622 
1623 typedef struct {
1624     opal_list_item_t super;
1625     orte_proc_t *child;
1626 } orte_odls_quick_caddy_t;
qcdcon(orte_odls_quick_caddy_t * p)1627 static void qcdcon(orte_odls_quick_caddy_t *p)
1628 {
1629     p->child = NULL;
1630 }
qcddes(orte_odls_quick_caddy_t * p)1631 static void qcddes(orte_odls_quick_caddy_t *p)
1632 {
1633     if (NULL != p->child) {
1634         OBJ_RELEASE(p->child);
1635     }
1636 }
1637 OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
1638                    opal_list_item_t,
1639                    qcdcon, qcddes);
1640 
/*
 * Sleep for the full number of seconds requested. sleep() returns early,
 * handing back the unslept remainder, whenever a signal is delivered to the
 * caller (SIGCHLD is the likely one while children are being killed), so
 * keep sleeping until nothing remains. A non-positive request returns
 * immediately.
 */
static void odls_sleep_uninterrupted(int seconds)
{
    unsigned int remaining = (0 < seconds) ? (unsigned int)seconds : 0;

    while (0 < remaining) {
        remaining = sleep(remaining);
    }
}

/**
 * Kill local child processes.
 *
 * Every live child matched by an entry of procs is sent SIGCONT, then -
 * after orte_odls_globals.timeout_before_sigkill seconds - SIGTERM, and
 * after the same delay again SIGKILL. Killed children are marked as not
 * alive with state ORTE_PROC_STATE_KILLED_BY_CMD. Matching children that
 * are not alive and still in an UNDEF/INIT/RUNNING state are moved to
 * ORTE_PROC_STATE_TERMINATED so nothing keeps waiting for them.
 *
 * @param procs       Array of orte_proc_t* naming the procs to kill; the
 *                    jobid and/or vpid of an entry may be a wildcard.
 *                    NULL means "kill every local child".
 * @param kill_local  Component-supplied function that delivers a signal to
 *                    a pid. Its return value is deliberately ignored.
 *
 * @return ORTE_SUCCESS, always.
 */
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
                                            orte_odls_base_kill_local_fn_t kill_local)
{
    orte_proc_t *child;
    opal_list_t procs_killed;
    orte_proc_t *proc, proctmp;
    int i, j;
    opal_pointer_array_t procarray, *procptr;
    bool do_cleanup;
    orte_odls_quick_caddy_t *cd;

    OBJ_CONSTRUCT(&procs_killed, opal_list_t);

    /* if the pointer array is NULL, then just kill everything: build a
     * one-entry array holding a fully wildcarded name so the matching
     * loop below accepts every child
     */
    if (NULL == procs) {
        OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                             "%s odls:kill_local_proc working on WILDCARD",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_CONSTRUCT(&procarray, opal_pointer_array_t);
        opal_pointer_array_init(&procarray, 1, 1, 1);
        OBJ_CONSTRUCT(&proctmp, orte_proc_t);
        proctmp.name.jobid = ORTE_JOBID_WILDCARD;
        proctmp.name.vpid = ORTE_VPID_WILDCARD;
        opal_pointer_array_add(&procarray, &proctmp);
        procptr = &procarray;
        do_cleanup = true;
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                             "%s odls:kill_local_proc working on provided array",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        procptr = procs;
        do_cleanup = false;
    }

    /* cycle through the provided array of processes to kill */
    for (i=0; i < procptr->size; i++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procptr, i))) {
            continue;
        }
        for (j=0; j < orte_local_children->size; j++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, j))) {
                continue;
            }

            OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                 "%s odls:kill_local_proc checking child process %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name)));

            /* do we have a child from the specified job? Because the
             *  job could be given as a WILDCARD value, we must
             *  check for that as well as for equality.
             */
            if (ORTE_JOBID_WILDCARD != proc->name.jobid &&
                proc->name.jobid != child->name.jobid) {

                OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                     "%s odls:kill_local_proc child %s is not part of job %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name),
                                     ORTE_JOBID_PRINT(proc->name.jobid)));
                continue;
            }

            /* see if this is the specified proc - could be a WILDCARD again, so check
             * appropriately
             */
            if (ORTE_VPID_WILDCARD != proc->name.vpid &&
                proc->name.vpid != child->name.vpid) {

                OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                     "%s odls:kill_local_proc child %s is not covered by rank %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name),
                                     ORTE_VPID_PRINT(proc->name.vpid)));
                continue;
            }

            /* is this process alive? if not, then nothing for us
             * to do to it
             */
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE) || 0 == child->pid) {

                OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                     "%s odls:kill_local_proc child %s is not alive",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name)));

                /* ensure, though, that the state is terminated so we don't lockup if
                 * the proc never started
                 */
                if (ORTE_PROC_STATE_UNDEF == child->state ||
                    ORTE_PROC_STATE_INIT == child->state ||
                    ORTE_PROC_STATE_RUNNING == child->state) {
                    /* we can't be sure what happened, but make sure we
                     * at least have a value that will let us eventually wakeup
                     */
                    child->state = ORTE_PROC_STATE_TERMINATED;
                    /* ensure we realize that the waitpid will never come, if
                     * it already hasn't
                     */
                    ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
                    child->pid = 0;
                    goto CLEANUP;
                } else {
                    continue;
                }
            }

            /* ensure the stdin IOF channel for this child is closed. The other
             * channels will automatically close when the proc is killed
             */
            if (NULL != orte_iof.close) {
                orte_iof.close(&child->name, ORTE_IOF_STDIN);
            }

            /* cancel the waitpid callback as this induces unmanageable race
             * conditions when we are deliberately killing the process
             */
            orte_wait_cb_cancel(child);

            /* First send a SIGCONT in case the process is in stopped state.
               If it is in a stopped state and we do not first change it to
               running, then SIGTERM will not get delivered.  Ignore return
               value. */
            OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                 "%s SENDING SIGCONT TO %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name)));
            /* remember the child (holding a reference) so the SIGTERM and
             * SIGKILL passes below can reach it
             */
            cd = OBJ_NEW(orte_odls_quick_caddy_t);
            OBJ_RETAIN(child);
            cd->child = child;
            opal_list_append(&procs_killed, &cd->super);
            kill_local(child->pid, SIGCONT);
            continue;

        CLEANUP:
            /* ensure the child's session directory is cleaned up */
            orte_session_dir_finalize(&child->name);
            /* check for everything complete - this will remove
             * the child object from our local list
             */
            if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
                ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID)) {
                ORTE_ACTIVATE_PROC_STATE(&child->name, child->state);
            }
        }
    }

    /* if we are issuing signals, then we need to wait a little
     * and send the next in sequence */
    if (0 < opal_list_get_size(&procs_killed)) {
        /* a bare sleep() can be cut short by a signal, which would
         * shrink the grace period - wait out the full interval
         */
        odls_sleep_uninterrupted(orte_odls_globals.timeout_before_sigkill);
        /* issue a SIGTERM to all */
        OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
            OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                 "%s SENDING SIGTERM TO %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&cd->child->name)));
            kill_local(cd->child->pid, SIGTERM);
        }
        /* wait a little again */
        odls_sleep_uninterrupted(orte_odls_globals.timeout_before_sigkill);
        /* issue a SIGKILL to all */
        OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
            OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                 "%s SENDING SIGKILL TO %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&cd->child->name)));
            kill_local(cd->child->pid, SIGKILL);
            /* indicate the waitpid fired as this is effectively what
             * has happened
             */
            ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);

            /* Since we are not going to wait for this process, make sure
             * we mark it as not-alive so that we don't wait for it
             * in orted_cmd
             */
            ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            cd->child->pid = 0;

            /* mark the child as "killed" */
            cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD;  /* we ordered it to die */

            /* ensure the child's session directory is cleaned up */
            orte_session_dir_finalize(&cd->child->name);
            /* check for everything complete - this will remove
             * the child object from our local list
             */
            if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
                ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
                ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
            }
        }
    }
    /* releases every caddy, and with it the reference held on each child */
    OPAL_LIST_DESTRUCT(&procs_killed);

    /* cleanup the temporary wildcard array, if we built one */
    if (do_cleanup) {
        OBJ_DESTRUCT(&procarray);
        OBJ_DESTRUCT(&proctmp);
    }

    return ORTE_SUCCESS;
}
1847 
/**
 * Collect process statistics for local children and pack them into a buffer.
 *
 * For each local child whose jobid equals proc->jobid and whose vpid equals
 * proc->vpid (or any vpid, when proc->vpid is ORTE_VPID_WILDCARD), the
 * statistics returned by opal_pstat.query() are packed into answer as the
 * pair (proc name as passed in, opal_pstats_t).
 *
 * @param answer  Buffer receiving the packed results.
 * @param proc    Name (jobid/vpid) selecting the children to sample.
 *
 * @return ORTE_SUCCESS when every matching child was sampled and packed
 *         (also when nothing matched); otherwise the first error from the
 *         query or from packing, at which point processing stops.
 */
int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
                                  orte_process_name_t *proc)
{
    int rc, idx;
    size_t host_len;
    orte_proc_t *child;
    opal_pstats_t stats, *statsptr;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:get_proc_stats for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    for (idx = 0; idx < orte_local_children->size; idx++) {
        child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);
        if (NULL == child) {
            continue;
        }
        /* must belong to the requested job ... */
        if (proc->jobid != child->name.jobid) {
            continue;
        }
        /* ... and be the requested rank, unless any rank will do */
        if (proc->vpid != child->name.vpid && ORTE_VPID_WILDCARD != proc->vpid) {
            continue;
        }

        OBJ_CONSTRUCT(&stats, opal_pstats_t);

        /* node name: everything before the first '.', capped so the last
         * byte of stats.node is never written
         */
        host_len = strcspn(orte_process_info.nodename, ".");
        if (host_len > (size_t)(OPAL_PSTAT_MAX_STRING_LEN - 1)) {
            host_len = (size_t)(OPAL_PSTAT_MAX_STRING_LEN - 1);
        }
        memcpy(stats.node, orte_process_info.nodename, host_len);

        stats.rank = child->name.vpid;

        /* sample the child; a failed query is returned without logging,
         * packing failures are logged
         */
        rc = opal_pstat.query(child->pid, &stats, NULL);
        if (ORTE_SUCCESS == rc) {
            rc = opal_dss.pack(answer, proc, 1, ORTE_NAME);
            if (ORTE_SUCCESS == rc) {
                statsptr = &stats;
                rc = opal_dss.pack(answer, &statsptr, 1, OPAL_PSTAT);
            }
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
            }
        }

        /* single teardown point for success and failure alike */
        OBJ_DESTRUCT(&stats);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
1903 
/**
 * Restart a local child process.
 *
 * Resets the child's bookkeeping (state, exit code, WAITPID/IOF_COMPLETE
 * flags, pid, rml_uri), rebuilds its environment and working directory,
 * prepares its IOF, registers the wait callback and hands the actual spawn
 * to the next launch event base in round-robin order.
 *
 * @param child       The proc to restart.
 * @param fork_local  Component-supplied function stored in the spawn caddy
 *                    for the launch thread to use.
 *
 * @return ORTE_SUCCESS if the spawn was dispatched;
 *         ORTE_ERROR if the current working directory cannot be determined;
 *         ORTE_ERR_NOT_FOUND if the child's job or app context is unknown;
 *         ORTE_ERR_OUT_OF_RESOURCE if the spawn caddy cannot be allocated;
 *         otherwise the error returned by the failing setup step.
 */
int orte_odls_base_default_restart_proc(orte_proc_t *child,
                                        orte_odls_base_fork_local_proc_fn_t fork_local)
{
    int rc;
    orte_app_context_t *app;
    orte_job_t *jobdat;
    char basedir[MAXPATHLEN];
    char *wdir = NULL;
    orte_odls_spawn_caddy_t *cd;
    opal_event_base_t *evb;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:restart_proc for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&child->name)));

    /* establish our baseline working directory - we will be potentially
     * bouncing around as we execute this app, but we will always return
     * to this place as our default directory. Without it, basedir would
     * be uninitialized when we chdir() back at the end, so bail out
     * before touching anything
     */
    if (NULL == getcwd(basedir, sizeof(basedir))) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    /* find this child's jobdat */
    if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) {
        /* not found */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* wipe what is left over from the previous incarnation */
    child->state = ORTE_PROC_STATE_FAILED_TO_START;
    child->exit_code = 0;
    ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
    ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    child->pid = 0;
    if (NULL != child->rml_uri) {
        free(child->rml_uri);
        child->rml_uri = NULL;
    }
    app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);
    if (NULL == app) {
        /* no app context at that index - it is dereferenced below */
        rc = ORTE_ERR_NOT_FOUND;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* reset envars to match this child */
    if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &app->env))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* setup the path */
    if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
        ORTE_ERROR_LOG(rc);
        free(wdir);
        goto CLEANUP;
    }

    /* dispatch this child to the next available launch thread */
    cd = OBJ_NEW(orte_odls_spawn_caddy_t);
    if (NULL == cd) {
        free(wdir);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    if (NULL != wdir) {
        /* hand our heap copy over to the caddy instead of duplicating it */
        cd->wdir = wdir;
        wdir = NULL;
    }
    cd->jdata = jobdat;
    cd->app = app;
    cd->child = child;
    cd->fork_local = fork_local;
    /* setup any IOF */
    cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;

    /* do we want to setup stdin? */
    if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
         child->name.vpid == jobdat->stdin_target) {
        cd->opts.connect_stdin = true;
    } else {
        cd->opts.connect_stdin = false;
    }
    if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
        ORTE_ERROR_LOG(rc);
        child->exit_code = rc;
        OBJ_RELEASE(cd);
        ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
        goto CLEANUP;
    }
    if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* connect endpoints IOF */
        rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(cd);
            ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
            goto CLEANUP;
        }
    }
    /* get told when the restarted child terminates */
    orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);

    /* pick the next launch event base, wrapping around at the end */
    ++orte_odls_globals.next_base;
    if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
        orte_odls_globals.next_base = 0;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s restarting app %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));

    /* queue the spawn on that base; the caddy travels with the event */
    evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
    opal_event_set(evb, &cd->ev, -1,
                   OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
    opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
    opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);

  CLEANUP:
    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:restart of proc %s %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&child->name),
                         (ORTE_SUCCESS == rc) ? "succeeded" : "failed"));

    /* reset our working directory back to our default location - if we
     * don't do this, then we will be looking for relative paths starting
     * from the last wdir option specified by the user. Thus, we would
     * be requiring that the user keep track on the cmd line of where
     * each app was located relative to the prior app, instead of relative
     * to their current location. A failure here is logged but does not
     * change the result of the restart itself
     */
    if (0 != chdir(basedir)) {
        ORTE_ERROR_LOG(ORTE_ERROR);
    }

    return rc;
}
2030