1 /*
2 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2011 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2007-2011 Oracle and/or its affiliates. All rights reserved.
13 * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
14 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
15 * All rights reserved.
16 * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
17 * Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
18 * Copyright (c) 2014-2018 Research Organization for Information Science
19 * and Technology (RIST). All rights reserved.
20 * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
21 * Copyright (c) 2017 IBM Corporation. All rights reserved.
22 * $COPYRIGHT$
23 *
24 * Additional copyrights may follow
25 *
26 * $HEADER$
27 */
28
29
30 #include "orte_config.h"
31 #include "orte/constants.h"
32 #include "orte/types.h"
33
34 #ifdef HAVE_SYS_WAIT_H
35 #include <sys/wait.h>
36 #endif
37 #include <errno.h>
38 #ifdef HAVE_SYS_STAT_H
39 #include <sys/stat.h>
40 #endif /* HAVE_SYS_STAT_H */
41 #ifdef HAVE_SYS_PARAM_H
42 #include <sys/param.h>
43 #endif
44 #include <time.h>
45
46 #include <signal.h>
47
48 #include "opal_stdint.h"
49 #include "opal/util/opal_environ.h"
50 #include "opal/util/argv.h"
51 #include "opal/util/os_dirpath.h"
52 #include "opal/util/os_path.h"
53 #include "opal/util/path.h"
54 #include "opal/util/sys_limits.h"
55 #include "opal/dss/dss.h"
56 #include "opal/mca/hwloc/hwloc-internal.h"
57 #include "opal/mca/shmem/base/base.h"
58 #include "opal/mca/pstat/pstat.h"
59 #include "opal/mca/pmix/pmix.h"
60
61 #include "orte/mca/errmgr/errmgr.h"
62 #include "orte/mca/rml/rml.h"
63 #include "orte/mca/routed/routed.h"
64 #include "orte/mca/iof/iof.h"
65 #include "orte/mca/iof/base/iof_base_setup.h"
66 #include "orte/mca/ess/base/base.h"
67 #include "orte/mca/grpcomm/base/base.h"
68 #include "orte/mca/plm/base/base.h"
69 #include "orte/mca/regx/regx.h"
70 #include "orte/mca/rml/base/rml_contact.h"
71 #include "orte/mca/rmaps/rmaps_types.h"
72 #include "orte/mca/rmaps/base/base.h"
73 #include "orte/mca/rmaps/base/rmaps_private.h"
74 #include "orte/mca/rtc/rtc.h"
75 #include "orte/mca/schizo/schizo.h"
76 #include "orte/mca/state/state.h"
77 #include "orte/mca/filem/filem.h"
78 #include "orte/mca/dfs/dfs.h"
79
80 #include "orte/util/context_fns.h"
81 #include "orte/util/name_fns.h"
82 #include "orte/util/session_dir.h"
83 #include "orte/util/proc_info.h"
84 #include "orte/util/show_help.h"
85 #include "orte/util/threads.h"
86 #include "orte/runtime/orte_globals.h"
87 #include "orte/runtime/orte_wait.h"
88 #include "orte/orted/orted.h"
89 #include "orte/orted/pmix/pmix_server.h"
90
91 #if OPAL_ENABLE_FT_CR == 1
92 #include "orte/mca/snapc/snapc.h"
93 #include "orte/mca/snapc/base/base.h"
94 #include "orte/mca/sstore/sstore.h"
95 #include "orte/mca/sstore/base/base.h"
96 #include "opal/mca/crs/crs.h"
97 #include "opal/mca/crs/base/base.h"
98 #endif
99
100 #include "orte/mca/odls/base/base.h"
101 #include "orte/mca/odls/base/odls_private.h"
102
103 /* IT IS CRITICAL THAT ANY CHANGE IN THE ORDER OF THE INFO PACKED IN
104 * THIS FUNCTION BE REFLECTED IN THE CONSTRUCT_CHILD_LIST PARSER BELOW
105 */
orte_odls_base_default_get_add_procs_data(opal_buffer_t * buffer,orte_jobid_t job)106 int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
107 orte_jobid_t job)
108 {
109 int rc, v;
110 orte_job_t *jdata=NULL, *jptr;
111 orte_job_map_t *map=NULL;
112 opal_buffer_t *wireup, jobdata;
113 opal_byte_object_t bo, *boptr;
114 int32_t numbytes, numjobs;
115 int8_t flag;
116 void *nptr;
117 uint32_t key;
118 char *nidmap;
119 orte_proc_t *dmn, *proc;
120 opal_value_t *val = NULL, *kv;
121 opal_list_t *modex;
122 int n;
123
124
125 /* get the job data pointer */
126 if (NULL == (jdata = orte_get_job_data_object(job))) {
127 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
128 return ORTE_ERR_BAD_PARAM;
129 }
130
131 /* get a pointer to the job map */
132 map = jdata->map;
133 /* if there is no map, just return */
134 if (NULL == map) {
135 return ORTE_SUCCESS;
136 }
137
138 /* if we couldn't provide the allocation regex on the orted
139 * cmd line, then we need to provide all the info here */
140 if (!orte_nidmap_communicated) {
141 if (ORTE_SUCCESS != (rc = orte_regx.nidmap_create(orte_node_pool, &nidmap))) {
142 ORTE_ERROR_LOG(rc);
143 return rc;
144 }
145 orte_nidmap_communicated = true;
146 } else {
147 nidmap = NULL;
148 }
149 opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING);
150 if (NULL != nidmap) {
151 free(nidmap);
152 }
153
154 /* if we haven't already done so, provide the info on the
155 * capabilities of each node */
156 if (1 < orte_process_info.num_procs &&
157 (!orte_node_info_communicated ||
158 orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL))) {
159 flag = 1;
160 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
161 if (ORTE_SUCCESS != (rc = orte_regx.encode_nodemap(buffer))) {
162 ORTE_ERROR_LOG(rc);
163 return rc;
164 }
165 /* get wireup info for daemons */
166 if (NULL == (jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
167 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
168 return ORTE_ERR_BAD_PARAM;
169 }
170 wireup = OBJ_NEW(opal_buffer_t);
171 /* always include data for mpirun as the daemons can't have it yet */
172 val = NULL;
173 if (opal_pmix.legacy_get()) {
174 if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
175 ORTE_ERROR_LOG(rc);
176 OBJ_RELEASE(wireup);
177 return rc;
178 } else {
179 /* pack the name of the daemon */
180 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
181 ORTE_ERROR_LOG(rc);
182 OBJ_RELEASE(wireup);
183 return rc;
184 }
185 /* pack the URI */
186 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
187 ORTE_ERROR_LOG(rc);
188 OBJ_RELEASE(wireup);
189 return rc;
190 }
191 OBJ_RELEASE(val);
192 }
193 } else {
194 if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
195 ORTE_ERROR_LOG(rc);
196 OBJ_RELEASE(wireup);
197 return rc;
198 } else {
199 /* the data is returned as a list of key-value pairs in the opal_value_t */
200 if (OPAL_PTR != val->type) {
201 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
202 OBJ_RELEASE(wireup);
203 return ORTE_ERR_NOT_FOUND;
204 }
205 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
206 ORTE_ERROR_LOG(rc);
207 OBJ_RELEASE(wireup);
208 return rc;
209 }
210 modex = (opal_list_t*)val->data.ptr;
211 numbytes = (int32_t)opal_list_get_size(modex);
212 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
213 ORTE_ERROR_LOG(rc);
214 OBJ_RELEASE(wireup);
215 return rc;
216 }
217 OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
218 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
219 ORTE_ERROR_LOG(rc);
220 OBJ_RELEASE(wireup);
221 return rc;
222 }
223 }
224 OPAL_LIST_RELEASE(modex);
225 OBJ_RELEASE(val);
226 }
227 }
228 /* if we didn't rollup the connection info, then we have
229 * to provide a complete map of connection info */
230 if (!orte_static_ports && !orte_fwd_mpirun_port) {
231 for (v=1; v < jptr->procs->size; v++) {
232 if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
233 continue;
234 }
235 val = NULL;
236 if (opal_pmix.legacy_get()) {
237 if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
238 ORTE_ERROR_LOG(rc);
239 OBJ_RELEASE(buffer);
240 OBJ_RELEASE(wireup);
241 return rc;
242 } else {
243 /* pack the name of the daemon */
244 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
245 ORTE_ERROR_LOG(rc);
246 OBJ_RELEASE(buffer);
247 OBJ_RELEASE(wireup);
248 return rc;
249 }
250 /* pack the URI */
251 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
252 ORTE_ERROR_LOG(rc);
253 OBJ_RELEASE(buffer);
254 OBJ_RELEASE(wireup);
255 return rc;
256 }
257 OBJ_RELEASE(val);
258 }
259 } else {
260 if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
261 ORTE_ERROR_LOG(rc);
262 OBJ_RELEASE(buffer);
263 return rc;
264 } else {
265 /* the data is returned as a list of key-value pairs in the opal_value_t */
266 if (OPAL_PTR != val->type) {
267 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
268 OBJ_RELEASE(buffer);
269 return ORTE_ERR_NOT_FOUND;
270 }
271 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
272 ORTE_ERROR_LOG(rc);
273 OBJ_RELEASE(buffer);
274 OBJ_RELEASE(wireup);
275 return rc;
276 }
277 modex = (opal_list_t*)val->data.ptr;
278 numbytes = (int32_t)opal_list_get_size(modex);
279 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
280 ORTE_ERROR_LOG(rc);
281 OBJ_RELEASE(buffer);
282 OBJ_RELEASE(wireup);
283 return rc;
284 }
285 OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
286 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
287 ORTE_ERROR_LOG(rc);
288 OBJ_RELEASE(buffer);
289 OBJ_RELEASE(wireup);
290 return rc;
291 }
292 }
293 OPAL_LIST_RELEASE(modex);
294 OBJ_RELEASE(val);
295 }
296 }
297 }
298 }
299 /* put it in a byte object for xmission */
300 opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
301 OBJ_RELEASE(wireup);
302 /* pack the byte object - zero-byte objects are fine */
303 bo.size = numbytes;
304 boptr = &bo;
305 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &boptr, 1, OPAL_BYTE_OBJECT))) {
306 ORTE_ERROR_LOG(rc);
307 return rc;
308 }
309 /* release the data since it has now been copied into our buffer */
310 if (NULL != bo.bytes) {
311 free(bo.bytes);
312 }
313
314 /* we need to ensure that any new daemons get a complete
315 * copy of all active jobs so the grpcomm collectives can
316 * properly work should a proc from one of the other jobs
317 * interact with this one */
318 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) {
319 flag = 1;
320 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
321 OBJ_CONSTRUCT(&jobdata, opal_buffer_t);
322 numjobs = 0;
323 rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr);
324 while (OPAL_SUCCESS == rc) {
325 /* skip the one we are launching now */
326 if (NULL != jptr && jptr != jdata &&
327 ORTE_PROC_MY_NAME->jobid != jptr->jobid) {
328 /* pack the job struct */
329 if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &jptr, 1, ORTE_JOB))) {
330 ORTE_ERROR_LOG(rc);
331 OBJ_DESTRUCT(&jobdata);
332 return rc;
333 }
334 /* pack the location of each proc */
335 for (n=0; n < jptr->procs->size; n++) {
336 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, n))) {
337 continue;
338 }
339 if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &proc->parent, 1, ORTE_VPID))) {
340 ORTE_ERROR_LOG(rc);
341 OBJ_DESTRUCT(&jobdata);
342 return rc;
343 }
344 }
345 ++numjobs;
346 }
347 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
348 }
349 /* pack the number of jobs */
350 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &numjobs, 1, OPAL_INT32))) {
351 ORTE_ERROR_LOG(rc);
352 OBJ_DESTRUCT(&jobdata);
353 return rc;
354 }
355 if (0 < numjobs) {
356 /* pack the jobdata buffer */
357 wireup = &jobdata;
358 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &wireup, 1, OPAL_BUFFER))) {
359 ORTE_ERROR_LOG(rc);
360 OBJ_DESTRUCT(&jobdata);
361 return rc;
362 }
363 OBJ_DESTRUCT(&jobdata);
364 }
365 } else {
366 flag = 0;
367 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
368 }
369 orte_node_info_communicated = true;
370 } else {
371 /* mark that we didn't */
372 flag = 0;
373 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
374 /* and that we didn't launch daemons */
375 flag = 0;
376 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
377 }
378
379 /* pack the job struct */
380 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &jdata, 1, ORTE_JOB))) {
381 ORTE_ERROR_LOG(rc);
382 return rc;
383 }
384
385 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
386 /* compute and pack the ppn regex */
387 if (ORTE_SUCCESS != (rc = orte_regx.generate_ppn(jdata, &nidmap))) {
388 ORTE_ERROR_LOG(rc);
389 return rc;
390 }
391 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING))) {
392 ORTE_ERROR_LOG(rc);
393 free(nidmap);
394 return rc;
395 }
396 free(nidmap);
397 }
398
399 /* compute and pack the regex of ppn */
400
401 return ORTE_SUCCESS;
402 }
403
fm_release(void * cbdata)404 static void fm_release(void *cbdata)
405 {
406 opal_buffer_t *bptr = (opal_buffer_t*)cbdata;
407
408 OBJ_RELEASE(bptr);
409 }
410
/*
 * Parse the launch message built by
 * orte_odls_base_default_get_add_procs_data (from the "daemons were
 * launched" flag onward) and set up the local view of the job.
 *
 * Steps, in order:
 *   - if new daemons were launched, unpack any additional jobs that came
 *     along and add the unknown ones to orte_job_data, connecting each of
 *     their procs to the node of its parent daemon
 *   - unpack the job to be launched; on the HNP the unpacked copy is
 *     dropped in favor of the job object already in orte_job_data
 *   - unless the job is fully described, rebuild the map from the ppn
 *     regex (non-HNP only) and compute vpids and local/node ranks
 *   - wire each proc to its node, add procs whose parent is this daemon
 *     to orte_local_children, and flag the app contexts in use here
 *   - compute bindings (non fully-described jobs), optionally display the
 *     map, load file maps, assign rtc controls and register the job with
 *     the PMIx server
 *
 * buffer - the message to parse; owned by the caller
 * job    - out: the jobid to launch, or ORTE_JOBID_INVALID if the
 *          message could not be parsed that far
 *
 * Returns ORTE_SUCCESS or an ORTE error code. On any error the
 * ORTE_JOB_STATE_NEVER_LAUNCHED job state is activated before returning.
 */
int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
                                                orte_jobid_t *job)
{
    int rc;
    orte_std_cntr_t cnt;
    orte_job_t *jdata=NULL, *daemons;
    orte_node_t *node;
    orte_vpid_t dmnvpid, v;
    int32_t n, k;
    opal_buffer_t *bptr;
    orte_proc_t *pptr, *dmn;
    orte_app_context_t *app;
    int8_t flag;
    char *ppn;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:constructing child list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* set a default response - it stays in place on every error
     * exit taken before the jobid has been unpacked */
    *job = ORTE_JOBID_INVALID;
    /* get the daemon job object */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

    /* unpack the flag to see if new daemons were launched */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }

    if (0 != flag) {
        /* see if additional jobs are included in the data */
        cnt=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &n, &cnt, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }

        if (0 < n) {
            /* unpack the buffer containing the info */
            cnt=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER))) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
            for (k=0; k < n; k++) {
                /* unpack each job and add it to the local orte_job_data array */
                cnt=1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &jdata, &cnt, ORTE_JOB))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(bptr);
                    goto REPORT_ERROR;
                }
                /* check to see if we already have this one */
                if (NULL == orte_get_job_data_object(jdata->jobid)) {
                    /* nope - add it */
                    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
                } else {
                    /* yep - so we can drop this copy. The packer still
                     * wrote one parent vpid per proc after the job struct,
                     * so if another job follows we must step over those
                     * entries or the next unpack would start in the
                     * middle of them */
                    if (k + 1 < n) {
                        for (v=0; v < jdata->num_procs; v++) {
                            cnt=1;
                            if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &dmnvpid, &cnt, ORTE_VPID))) {
                                ORTE_ERROR_LOG(rc);
                                jdata->jobid = ORTE_JOBID_INVALID;
                                OBJ_RELEASE(jdata);
                                OBJ_RELEASE(bptr);
                                goto REPORT_ERROR;
                            }
                        }
                    }
                    /* invalidate the jobid first so releasing the copy
                     * cannot disturb the entry already in orte_job_data */
                    jdata->jobid = ORTE_JOBID_INVALID;
                    OBJ_RELEASE(jdata);
                    continue;
                }
                /* unpack the location of each proc in this job */
                for (v=0; v < jdata->num_procs; v++) {
                    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, v))) {
                        pptr = OBJ_NEW(orte_proc_t);
                        pptr->name.jobid = jdata->jobid;
                        pptr->name.vpid = v;
                        opal_pointer_array_set_item(jdata->procs, v, pptr);
                    }
                    cnt=1;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(bptr, &dmnvpid, &cnt, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(jdata);
                        OBJ_RELEASE(bptr);
                        goto REPORT_ERROR;
                    }
                    /* lookup the daemon */
                    if (NULL == daemons ||
                        NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dmnvpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        rc = ORTE_ERR_NOT_FOUND;
                        /* same cleanup as the unpack failure just above */
                        OBJ_RELEASE(jdata);
                        OBJ_RELEASE(bptr);
                        goto REPORT_ERROR;
                    }
                    /* connect the two */
                    OBJ_RETAIN(dmn->node);
                    pptr->node = dmn->node;
                }
            }
            /* release the buffer */
            OBJ_RELEASE(bptr);
        }
    }

    /* unpack the job we are to launch */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &cnt, ORTE_JOB))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }
    if (ORTE_JOBID_INVALID == jdata->jobid) {
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        rc = ORTE_ERR_BAD_PARAM;
        goto REPORT_ERROR;
    }
    *job = jdata->jobid;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:construct_child_list unpacking data to launch job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));

    /* if we are the HNP, we don't need to unpack this buffer - we already
     * have all the required info in our local job array. So just build the
     * array of local children
     */
    if (ORTE_PROC_IS_HNP) {
        /* we don't want/need the extra copy of the orte_job_t, but
         * we can't just release it as that will NULL the location in
         * the orte_job_data array. So set the jobid to INVALID to
         * protect the array, and then release the object to free
         * the storage */
        jdata->jobid = ORTE_JOBID_INVALID;
        OBJ_RELEASE(jdata);
        /* get the correct job object - it will be completely filled out */
        if (NULL == (jdata = orte_get_job_data_object(*job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto REPORT_ERROR;
        }
    } else {
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* ensure the map object is present */
        if (NULL == jdata->map) {
            jdata->map = OBJ_NEW(orte_job_map_t);
        }
    }

    /* if the job is fully described, then mpirun will have computed
     * and sent us the complete array of procs in the orte_job_t, so we
     * don't need to do anything more here */
    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
        if (!ORTE_PROC_IS_HNP) {
            /* extract the ppn regex */
            cnt = 1;
            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ppn, &cnt, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
            /* populate the node array of the job map and the proc array of
             * the job object so we know how many procs are on each node */
            if (ORTE_SUCCESS != (rc = orte_regx.parse_ppn(jdata, ppn))) {
                ORTE_ERROR_LOG(rc);
                free(ppn);
                goto REPORT_ERROR;
            }
            free(ppn);
            /* now assign locations to the procs */
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
                ORTE_ERROR_LOG(rc);
                goto REPORT_ERROR;
            }
        }
        /* compute the ranks and add the proc objects
         * to the jdata->procs array */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }
        /* and finally, compute the local and node ranks */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }
    }

    /* now that the node array in the job map and jdata are completely filled out,
     * we need to "wireup" the procs to their nodes so other utilities can
     * locate them */
    for (n=0; n < jdata->procs->size; n++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
            continue;
        }
        if (ORTE_PROC_STATE_UNDEF == pptr->state) {
            /* not ready for use yet */
            continue;
        }
        if (!ORTE_PROC_IS_HNP &&
            orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
            /* the parser will have already made the connection, but the fully described
             * case won't have done it, so connect the proc to its node here */
            opal_output_verbose(5, orte_odls_base_framework.framework_output,
                                "%s GETTING DAEMON FOR PROC %s WITH PARENT %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&pptr->name),
                                ORTE_VPID_PRINT(pptr->parent));
            if (ORTE_VPID_INVALID == pptr->parent) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                rc = ORTE_ERR_BAD_PARAM;
                goto REPORT_ERROR;
            }
            /* connect the proc to its node object */
            if (NULL == daemons ||
                NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, pptr->parent))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                goto REPORT_ERROR;
            }
            OBJ_RETAIN(dmn->node);
            pptr->node = dmn->node;
            /* add the node to the job map, if needed */
            if (!ORTE_FLAG_TEST(pptr->node, ORTE_NODE_FLAG_MAPPED)) {
                OBJ_RETAIN(pptr->node);
                opal_pointer_array_add(jdata->map->nodes, pptr->node);
                jdata->map->num_nodes++;
                ORTE_FLAG_SET(pptr->node, ORTE_NODE_FLAG_MAPPED);
            }
            /* add this proc to that node */
            OBJ_RETAIN(pptr);
            opal_pointer_array_add(pptr->node->procs, pptr);
            pptr->node->num_procs++;
        }
        /* see if it belongs to us */
        if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
            /* is this child on our current list of children */
            if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
                /* not on the local list */
                OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                                     "%s[%s:%d] adding proc %s to my local list",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     __FILE__, __LINE__,
                                     ORTE_NAME_PRINT(&pptr->name)));
                /* keep tabs of the number of local procs */
                jdata->num_local_procs++;
                /* add this proc to our child list */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
                opal_pointer_array_add(orte_local_children, pptr);
            }

            /* if the job is in restart mode, the child must not barrier when launched */
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
                orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            }
            /* mark that this app_context is being used on this node */
            app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
            if (NULL == app) {
                /* a local proc without an app context cannot be launched */
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                goto REPORT_ERROR;
            }
            ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
        }
    }

    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
        /* reset the mapped flags that were set while wiring up above */
        for (n=0; n < jdata->map->nodes->size; n++) {
            if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
                ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
            }
        }
    } else {
        /* compute and save bindings of local children */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto REPORT_ERROR;
        }
    }

    /* if we wanted to see the map, now is the time to display it */
    if (jdata->map->display_map) {
        orte_rmaps_base_display_map(jdata);
    }

    /* if we have a file map, then we need to load it */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FILE_MAPS, (void**)&bptr, OPAL_BUFFER)) {
        if (NULL != orte_dfs.load_file_maps) {
            /* fm_release drops the buffer once the load is done with it */
            orte_dfs.load_file_maps(jdata->jobid, bptr, fm_release, bptr);
        } else {
            OBJ_RELEASE(bptr);
        }
    }

    /* load any controls into the job */
    orte_rtc.assign(jdata);

    /* register this job with the PMIx server - need to wait until after we
     * have computed the #local_procs before calling the function */
    if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace(jdata, false))) {
        ORTE_ERROR_LOG(rc);
        goto REPORT_ERROR;
    }

    return ORTE_SUCCESS;

 REPORT_ERROR:
    /* we have to report an error back to the HNP so we don't just
     * hang. Although there shouldn't be any errors once this is
     * all debugged, it is still good practice to have a way
     * for it to happen - especially so developers don't have to
     * deal with the hang!
     */
    ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_NEVER_LAUNCHED);
    return rc;
}
720
/*
 * Move to the app's working directory and record it.
 *
 * app  - the app context; its env gets PWD and the initial_wdir MCA
 *        param set to the directory we ended up in
 * wdir - out: a strdup'd copy of that directory, or NULL when the app
 *        carries the ORTE_APP_SSNDIR_CWD attribute (nothing is changed
 *        in that case). Left untouched on failure.
 *
 * Returns ORTE_SUCCESS, the error from orte_util_check_context_cwd, or
 * ORTE_ERR_NOT_FOUND if the current directory cannot be determined.
 */
static int setup_path(orte_app_context_t *app, char **wdir)
{
    int rc;
    char dir[MAXPATHLEN];

    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        *wdir = NULL;
        return ORTE_SUCCESS;
    }

    /* Try to change to the app's cwd and check that the app
       exists and is executable The function will
       take care of outputting a pretty error message, if required
    */
    if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
        /* do not ERROR_LOG - it will be reported elsewhere */
        return rc;
    }

    /* The prior function will have done a chdir() to jump us to
     * wherever the app is to be executed. This could be either where
     * the user specified (via -wdir), or to the user's home directory
     * on this node if nothing was provided. It seems that chdir doesn't
     * adjust the $PWD enviro variable when it changes the directory. This
     * can cause a user to get a different response when doing getcwd vs
     * looking at the enviro variable. To keep this consistent, we explicitly
     * ensure that the PWD enviro variable matches the CWD we moved to.
     *
     * NOTE: if a user's program does a chdir(), then $PWD will once
     * again not match getcwd! This is beyond our control - we are only
     * ensuring they start out matching.
     */
    if (NULL == getcwd(dir, sizeof(dir))) {
        /* dir is not filled in on failure - do not export garbage */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    *wdir = strdup(dir);
    opal_setenv("PWD", dir, true, &app->env);
    /* update the initial wdir value too */
    opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);

    return ORTE_SUCCESS;
}
761
762
763 /* define a timer release point so that we can wait for
764 * file descriptors to come available, if necessary
765 */
timer_cb(int fd,short event,void * cbdata)766 static void timer_cb(int fd, short event, void *cbdata)
767 {
768 orte_timer_t *tm = (orte_timer_t*)cbdata;
769 orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
770
771 ORTE_ACQUIRE_OBJECT(tm);
772
773 /* increment the number of retries */
774 ll->retries++;
775
776 /* re-attempt the launch */
777 opal_event_active(ll->ev, OPAL_EV_WRITE, 1);
778
779 /* release the timer event */
780 OBJ_RELEASE(tm);
781 }
782
/*
 * Count the entries of orte_local_children that carry the
 * ORTE_PROC_FLAG_ALIVE flag and do not belong to "job".
 *
 * Members of the specified job are left out as they will be
 * added later, if required.
 */
static int compute_num_procs_alive(orte_jobid_t job)
{
    int alive = 0;
    int idx;

    for (idx = 0; idx < orte_local_children->size; ++idx) {
        orte_proc_t *kid = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx);

        if (NULL != kid &&
            ORTE_FLAG_TEST(kid, ORTE_PROC_FLAG_ALIVE) &&
            job != kid->name.jobid) {
            alive++;
        }
    }
    return alive;
}
806
orte_odls_base_spawn_proc(int fd,short sd,void * cbdata)807 void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
808 {
809 orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cbdata;
810 orte_job_t *jobdat = cd->jdata;
811 orte_app_context_t *app = cd->app;
812 orte_proc_t *child = cd->child;
813 int rc, i;
814 bool found;
815 orte_proc_state_t state;
816 char **argvptr;
817 char *pathenv = NULL, *mpiexec_pathenv = NULL;
818 char *full_search;
819
820 ORTE_ACQUIRE_OBJECT(cd);
821
822 /* thread-protect common values */
823 cd->env = opal_argv_copy(app->env);
824
825 /* ensure we clear any prior info regarding state or exit status in
826 * case this is a restart
827 */
828 child->exit_code = 0;
829 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
830
831 /* setup the pmix environment */
832 if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
833 ORTE_ERROR_LOG(rc);
834 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
835 goto errorout;
836 }
837
838 /* if we are not forwarding output for this job, then
839 * flag iof as complete
840 */
841 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
842 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
843 } else {
844 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
845 }
846 child->pid = 0;
847 if (NULL != child->rml_uri) {
848 free(child->rml_uri);
849 child->rml_uri = NULL;
850 }
851
852 /* setup the rest of the environment with the proc-specific items - these
853 * will be overwritten for each child
854 */
855 if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
856 ORTE_ERROR_LOG(rc);
857 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
858 goto errorout;
859 }
860
861 /* Search for the OMPI_exec_path and PATH settings in the environment. */
862 for (argvptr = app->env; *argvptr != NULL; argvptr++) {
863 if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
864 mpiexec_pathenv = *argvptr + 15;
865 }
866 if (0 == strncmp("PATH=", *argvptr, 5)) {
867 pathenv = *argvptr + 5;
868 }
869 }
870
871 /* If OMPI_exec_path is set (meaning --path was used), then create a
872 temporary environment to be used in the search for the executable.
873 The PATH setting in this temporary environment is a combination of
874 the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
875 then just use existing environment with PATH in it. */
876 if (NULL != mpiexec_pathenv) {
877 argvptr = NULL;
878 if (pathenv != NULL) {
879 asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
880 } else {
881 asprintf(&full_search, "%s", mpiexec_pathenv);
882 }
883 opal_setenv("PATH", full_search, true, &argvptr);
884 free(full_search);
885 } else {
886 argvptr = app->env;
887 }
888
889 rc = orte_util_check_context_app(app, argvptr);
890 /* do not ERROR_LOG - it will be reported elsewhere */
891 if (NULL != mpiexec_pathenv) {
892 opal_argv_free(argvptr);
893 }
894 if (ORTE_SUCCESS != rc) {
895 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
896 goto errorout;
897 }
898
899 /* did the user request we display output in xterms? */
900 if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
901 opal_list_item_t *nmitem;
902 orte_namelist_t *nm;
903 /* see if this rank is one of those requested */
904 found = false;
905 for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
906 nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
907 nmitem = opal_list_get_next(nmitem)) {
908 nm = (orte_namelist_t*)nmitem;
909 if (ORTE_VPID_WILDCARD == nm->name.vpid ||
910 child->name.vpid == nm->name.vpid) {
911 /* we want this one - modify the app's command to include
912 * the orte xterm cmd that starts with the xtermcmd */
913 cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
914 /* insert the rank into the correct place as a window title */
915 free(cd->argv[2]);
916 asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
917 /* add in the argv from the app */
918 for (i=0; NULL != app->argv[i]; i++) {
919 opal_argv_append_nosize(&cd->argv, app->argv[i]);
920 }
921 /* use the xterm cmd as the app string */
922 cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
923 found = true;
924 break;
925 } else if (jobdat->num_procs <= nm->name.vpid) { /* check for bozo case */
926 /* can't be done! */
927 orte_show_help("help-orte-odls-base.txt",
928 "orte-odls-base:xterm-rank-out-of-bounds",
929 true, orte_process_info.nodename,
930 nm->name.vpid, jobdat->num_procs);
931 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
932 goto errorout;
933 }
934 }
935 if (!found) {
936 cd->cmd = strdup(app->app);
937 cd->argv = opal_argv_copy(app->argv);
938 }
939 } else if (NULL != orte_fork_agent) {
940 /* we were given a fork agent - use it */
941 cd->argv = opal_argv_copy(orte_fork_agent);
942 /* add in the argv from the app */
943 for (i=0; NULL != app->argv[i]; i++) {
944 opal_argv_append_nosize(&cd->argv, app->argv[i]);
945 }
946 cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
947 if (NULL == cd->cmd) {
948 orte_show_help("help-orte-odls-base.txt",
949 "orte-odls-base:fork-agent-not-found",
950 true, orte_process_info.nodename, orte_fork_agent[0]);
951 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
952 goto errorout;
953 }
954 } else {
955 cd->cmd = strdup(app->app);
956 cd->argv = opal_argv_copy(app->argv);
957 }
958
959 /* if we are indexing the argv by rank, do so now */
960 if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
961 char *param;
962 asprintf(¶m, "%s-%d", cd->argv[0], (int)child->name.vpid);
963 free(cd->argv[0]);
964 cd->argv[0] = param;
965 }
966
967 if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
968 opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
969 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
970 ORTE_NAME_PRINT(&child->name));
971
972 /* dump what is going to be exec'd */
973 if (7 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
974 opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
975 }
976 }
977
978 if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
979 /* error message already output */
980 state = ORTE_PROC_STATE_FAILED_TO_START;
981 goto errorout;
982 }
983
984 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
985 OBJ_RELEASE(cd);
986 return;
987
988 errorout:
989 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
990 child->exit_code = rc;
991 ORTE_ACTIVATE_PROC_STATE(&child->name, state);
992 OBJ_RELEASE(cd);
993 }
994
orte_odls_base_default_launch_local(int fd,short sd,void * cbdata)995 void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
996 {
997 orte_app_context_t *app;
998 orte_proc_t *child=NULL;
999 int rc=ORTE_SUCCESS;
1000 char basedir[MAXPATHLEN];
1001 int j, idx;
1002 int total_num_local_procs = 0;
1003 orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
1004 orte_job_t *jobdat;
1005 orte_jobid_t job = caddy->job;
1006 orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
1007 bool index_argv;
1008 char *msg;
1009 orte_odls_spawn_caddy_t *cd;
1010 opal_event_base_t *evb;
1011 char *effective_dir = NULL;
1012
1013 ORTE_ACQUIRE_OBJECT(caddy);
1014
1015 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1016 "%s local:launch",
1017 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1018
1019 /* establish our baseline working directory - we will be potentially
1020 * bouncing around as we execute various apps, but we will always return
1021 * to this place as our default directory
1022 */
1023 getcwd(basedir, sizeof(basedir));
1024
1025 /* find the jobdat for this job */
1026 if (NULL == (jobdat = orte_get_job_data_object(job))) {
1027 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1028 /* not much we can do here - we are just hosed, so
1029 * report that to the error manager
1030 */
1031 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1032 goto ERROR_OUT;
1033 }
1034
1035 /* do we have any local procs to launch? */
1036 if (0 == jobdat->num_local_procs) {
1037 /* indicate that we are done trying to launch them */
1038 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1039 "%s local:launch no local procs",
1040 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1041 goto GETOUT;
1042 }
1043
1044 /* track if we are indexing argvs so we don't check every time */
1045 index_argv = orte_get_attribute(&jobdat->attributes, ORTE_JOB_INDEX_ARGV, NULL, OPAL_BOOL);
1046
1047 /* compute the total number of local procs currently alive and about to be launched */
1048 total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
1049
1050 /* check the system limits - if we are at our max allowed children, then
1051 * we won't be allowed to do this anyway, so we may as well abort now.
1052 * According to the documentation, num_procs = 0 is equivalent to
1053 * no limit, so treat it as unlimited here.
1054 */
1055 if (0 < opal_sys_limits.num_procs) {
1056 OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1057 "%s checking limit on num procs %d #children needed %d",
1058 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1059 opal_sys_limits.num_procs, total_num_local_procs));
1060 if (opal_sys_limits.num_procs < total_num_local_procs) {
1061 if (2 < caddy->retries) {
1062 /* if we have already tried too many times, then just give up */
1063 ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1064 goto ERROR_OUT;
1065 }
1066 /* set a timer event so we can retry later - this
1067 * gives the system a chance to let other procs
1068 * terminate, thus creating room for new ones
1069 */
1070 ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1071 return;
1072 }
1073 }
1074
1075 /* check to see if we have enough available file descriptors
1076 * to launch these children - if not, then let's wait a little
1077 * while to see if some come free. This can happen if we are
1078 * in a tight loop over comm_spawn
1079 */
1080 if (0 < opal_sys_limits.num_files) {
1081 int limit;
1082 limit = 4*total_num_local_procs + 6*jobdat->num_local_procs;
1083 OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1084 "%s checking limit on file descriptors %d need %d",
1085 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1086 opal_sys_limits.num_files, limit));
1087 if (opal_sys_limits.num_files < limit) {
1088 if (2 < caddy->retries) {
1089 /* tried enough - give up */
1090 for (idx=0; idx < orte_local_children->size; idx++) {
1091 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1092 continue;
1093 }
1094 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1095 child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1096 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1097 }
1098 }
1099 goto ERROR_OUT;
1100 }
1101 /* don't have enough - wait a little time */
1102 ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1103 return;
1104 }
1105 }
1106
1107 for (j=0; j < jobdat->apps->size; j++) {
1108 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
1109 continue;
1110 }
1111
1112 /* if this app isn't being used on our node, skip it */
1113 if (!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE)) {
1114 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1115 "%s app %d not used on node",
1116 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
1117 continue;
1118 }
1119
1120 /* setup the environment for this app */
1121 if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
1122
1123 OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1124 "%s odls:launch:setup_fork failed with error %s",
1125 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1126 ORTE_ERROR_NAME(rc)));
1127
1128 /* do not ERROR_LOG this failure - it will be reported
1129 * elsewhere. The launch is going to fail. Since we could have
1130 * multiple app_contexts, we need to ensure that we flag only
1131 * the correct one that caused this operation to fail. We then have
1132 * to flag all the other procs from the app_context as having "not failed"
1133 * so we can report things out correctly
1134 */
1135 /* cycle through children to find those for this jobid */
1136 for (idx=0; idx < orte_local_children->size; idx++) {
1137 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1138 continue;
1139 }
1140 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1141 j == (int)child->app_idx) {
1142 child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1143 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1144 }
1145 }
1146 goto GETOUT;
1147 }
1148
1149 /* setup the working directory for this app - will jump us
1150 * to that directory
1151 */
1152 if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
1153 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1154 "%s odls:launch:setup_path failed with error %s(%d)",
1155 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1156 ORTE_ERROR_NAME(rc), rc));
1157 /* do not ERROR_LOG this failure - it will be reported
1158 * elsewhere. The launch is going to fail. Since we could have
1159 * multiple app_contexts, we need to ensure that we flag only
1160 * the correct one that caused this operation to fail. We then have
1161 * to flag all the other procs from the app_context as having "not failed"
1162 * so we can report things out correctly
1163 */
1164 /* cycle through children to find those for this jobid */
1165 for (idx=0; idx < orte_local_children->size; idx++) {
1166 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1167 continue;
1168 }
1169 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1170 j == (int)child->app_idx) {
1171 child->exit_code = rc;
1172 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1173 }
1174 }
1175 goto GETOUT;
1176 }
1177
1178 /* setup any local files that were prepositioned for us */
1179 if (ORTE_SUCCESS != (rc = orte_filem.link_local_files(jobdat, app))) {
1180 /* cycle through children to find those for this jobid */
1181 for (idx=0; idx < orte_local_children->size; idx++) {
1182 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1183 continue;
1184 }
1185 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1186 j == (int)child->app_idx) {
1187 child->exit_code = rc;
1188 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1189 }
1190 }
1191 goto GETOUT;
1192 }
1193
1194 /* tell all children that they are being launched via ORTE */
1195 opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
1196
1197 /* if the user requested it, set the system resource limits */
1198 if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
1199 orte_show_help("help-orte-odls-default.txt", "set limit", true,
1200 orte_process_info.nodename, app,
1201 __FILE__, __LINE__, msg);
1202 /* cycle through children to find those for this jobid */
1203 for (idx=0; idx < orte_local_children->size; idx++) {
1204 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1205 continue;
1206 }
1207 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1208 j == (int)child->app_idx) {
1209 child->exit_code = rc;
1210 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1211 }
1212 }
1213 goto GETOUT;
1214 }
1215
1216 /* reset our working directory back to our default location - if we
1217 * don't do this, then we will be looking for relative paths starting
1218 * from the last wdir option specified by the user. Thus, we would
1219 * be requiring that the user keep track on the cmd line of where
1220 * each app was located relative to the prior app, instead of relative
1221 * to their current location
1222 */
1223 chdir(basedir);
1224
1225 /* okay, now let's launch all the local procs for this app using the provided fork_local fn */
1226 for (idx=0; idx < orte_local_children->size; idx++) {
1227 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1228 continue;
1229 }
1230 /* does this child belong to this app? */
1231 if (j != (int)child->app_idx) {
1232 continue;
1233 }
1234
1235 /* is this child already alive? This can happen if
1236 * we are asked to launch additional processes.
1237 * If it has been launched, then do nothing
1238 */
1239 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1240
1241 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1242 "%s odls:launch child %s has already been launched",
1243 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1244 ORTE_NAME_PRINT(&child->name)));
1245
1246 continue;
1247 }
1248 /* is this child a candidate to start? it may not be alive
1249 * because it already executed
1250 */
1251 if (ORTE_PROC_STATE_INIT != child->state &&
1252 ORTE_PROC_STATE_RESTART != child->state) {
1253 continue;
1254 }
1255 /* do we have a child from the specified job. Because the
1256 * job could be given as a WILDCARD value, we must use
1257 * the dss.compare function to check for equality.
1258 */
1259 if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1260
1261 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1262 "%s odls:launch child %s is not in job %s being launched",
1263 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1264 ORTE_NAME_PRINT(&child->name),
1265 ORTE_JOBID_PRINT(job)));
1266
1267 continue;
1268 }
1269
1270 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1271 "%s odls:launch working child %s",
1272 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1273 ORTE_NAME_PRINT(&child->name)));
1274
1275 /* set the waitpid callback here for thread protection and
1276 * to ensure we can capture the callback on shortlived apps */
1277 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
1278 orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);
1279
1280 /* dispatch this child to the next available launch thread */
1281 cd = OBJ_NEW(orte_odls_spawn_caddy_t);
1282 if (NULL != effective_dir) {
1283 cd->wdir = strdup(effective_dir);
1284 }
1285 cd->jdata = jobdat;
1286 cd->app = app;
1287 cd->child = child;
1288 cd->fork_local = fork_local;
1289 cd->index_argv = index_argv;
1290 /* setup any IOF */
1291 cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
1292
1293 /* do we want to setup stdin? */
1294 if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
1295 child->name.vpid == jobdat->stdin_target) {
1296 cd->opts.connect_stdin = true;
1297 } else {
1298 cd->opts.connect_stdin = false;
1299 }
1300 if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
1301 ORTE_ERROR_LOG(rc);
1302 child->exit_code = rc;
1303 OBJ_RELEASE(cd);
1304 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1305 goto GETOUT;
1306 }
1307 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
1308 /* connect endpoints IOF */
1309 rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
1310 if (ORTE_SUCCESS != rc) {
1311 ORTE_ERROR_LOG(rc);
1312 OBJ_RELEASE(cd);
1313 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1314 goto GETOUT;
1315 }
1316 }
1317 ++orte_odls_globals.next_base;
1318 if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1319 orte_odls_globals.next_base = 0;
1320 }
1321 opal_output_verbose(1, orte_odls_base_framework.framework_output,
1322 "%s odls:dispatch %s to thread %d",
1323 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1324 ORTE_NAME_PRINT(&child->name),
1325 orte_odls_globals.next_base);
1326 evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1327 opal_event_set(evb, &cd->ev, -1,
1328 OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
1329 opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
1330 opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
1331
1332 }
1333 if (NULL != effective_dir) {
1334 free(effective_dir);
1335 effective_dir = NULL;
1336 }
1337 }
1338
1339 GETOUT:
1340 if (NULL != effective_dir) {
1341 free(effective_dir);
1342 effective_dir = NULL;
1343 }
1344 /* tell the state machine that all local procs for this job
1345 * were launched so that it can do whatever it needs to do,
1346 * like send a state update message for all procs to the HNP
1347 */
1348 ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE);
1349
1350 ERROR_OUT:
1351 /* ensure we reset our working directory back to our default location */
1352 chdir(basedir);
1353 /* release the event */
1354 OBJ_RELEASE(caddy);
1355 }
1356
1357 /**
1358 * Pass a signal to my local procs
1359 */
1360
orte_odls_base_default_signal_local_procs(const orte_process_name_t * proc,int32_t signal,orte_odls_base_signal_local_fn_t signal_local)1361 int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal,
1362 orte_odls_base_signal_local_fn_t signal_local)
1363 {
1364 int rc, i;
1365 orte_proc_t *child;
1366
1367 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1368 "%s odls: signaling proc %s",
1369 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1370 (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
1371
1372 /* if procs is NULL, then we want to signal all
1373 * of the local procs, so just do that case
1374 */
1375 if (NULL == proc) {
1376 rc = ORTE_SUCCESS; /* pre-set this as an empty list causes us to drop to bottom */
1377 for (i=0; i < orte_local_children->size; i++) {
1378 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1379 continue;
1380 }
1381 if (0 == child->pid || !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1382 /* skip this one as the child isn't alive */
1383 continue;
1384 }
1385 if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
1386 ORTE_ERROR_LOG(rc);
1387 }
1388 }
1389 return rc;
1390 }
1391
1392 /* we want it sent to some specified process, so find it */
1393 for (i=0; i < orte_local_children->size; i++) {
1394 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1395 continue;
1396 }
1397 if (OPAL_EQUAL == opal_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) {
1398 if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
1399 ORTE_ERROR_LOG(rc);
1400 }
1401 return rc;
1402 }
1403 }
1404
1405 /* only way to get here is if we couldn't find the specified proc.
1406 * report that as an error and return it
1407 */
1408 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1409 return ORTE_ERR_NOT_FOUND;
1410 }
1411
1412 /*
1413 * Wait for a callback indicating the child has completed.
1414 */
1415
ompi_odls_base_default_wait_local_proc(orte_proc_t * proc,void * cbdata)1416 void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
1417 {
1418 int i;
1419 orte_job_t *jobdat;
1420 orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
1421 orte_proc_t *cptr;
1422
1423 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1424 "%s odls:wait_local_proc child process %s pid %ld terminated",
1425 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1426 ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
1427
1428 /* if the child was previously flagged as dead, then just
1429 * update its exit status and
1430 * ensure that its exit state gets reported to avoid hanging
1431 * don't forget to check if the process was signaled.
1432 */
1433 if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ALIVE)) {
1434 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1435 "%s odls:waitpid_fired child %s was already dead exit code %d",
1436 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1437 ORTE_NAME_PRINT(&proc->name),proc->exit_code));
1438 if (WIFEXITED(proc->exit_code)) {
1439 proc->exit_code = WEXITSTATUS(proc->exit_code);
1440 if (0 != proc->exit_code) {
1441 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1442 }
1443 } else {
1444 if (WIFSIGNALED(proc->exit_code)) {
1445 state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1446 proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1447 }
1448 }
1449 goto MOVEON;
1450 }
1451
1452 /* if the proc called "abort", then we just need to flag that it
1453 * came thru here */
1454 if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
1455 /* even though the process exited "normally", it happened
1456 * via an orte_abort call
1457 */
1458 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1459 "%s odls:waitpid_fired child %s died by call to abort",
1460 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1461 ORTE_NAME_PRINT(&proc->name)));
1462 state = ORTE_PROC_STATE_CALLED_ABORT;
1463 /* regardless of our eventual code path, we need to
1464 * flag that this proc has had its waitpid fired */
1465 ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1466 goto MOVEON;
1467 }
1468
1469 /* get the jobdat for this child */
1470 if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) {
1471 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1472 goto MOVEON;
1473 }
1474
1475 /* if this is a debugger daemon, then just report the state
1476 * and return as we aren't monitoring it
1477 */
1478 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
1479 goto MOVEON;
1480 }
1481
1482 /* if this child was ordered to die, then just pass that along
1483 * so we don't hang
1484 */
1485 if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
1486 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1487 "%s odls:waitpid_fired child %s was ordered to die",
1488 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1489 ORTE_NAME_PRINT(&proc->name)));
1490 /* regardless of our eventual code path, we need to
1491 * flag that this proc has had its waitpid fired */
1492 ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1493 goto MOVEON;
1494 }
1495
1496 /* determine the state of this process */
1497 if (WIFEXITED(proc->exit_code)) {
1498
1499 /* set the exit status appropriately */
1500 proc->exit_code = WEXITSTATUS(proc->exit_code);
1501
1502 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1503 "%s odls:waitpid_fired child %s exit code %d",
1504 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1505 ORTE_NAME_PRINT(&proc->name), proc->exit_code));
1506
1507 /* provide a default state */
1508 state = ORTE_PROC_STATE_WAITPID_FIRED;
1509
1510 /* check to see if a sync was required and if it was received */
1511 if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_REG)) {
1512 if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_HAS_DEREG) ||
1513 orte_allowed_exit_without_sync || 0 != proc->exit_code) {
1514 /* if we did recv a finalize sync, or one is not required,
1515 * then declare it normally terminated
1516 * unless it returned with a non-zero status indicating the code
1517 * felt it was non-normal - in this latter case, we do not
1518 * require that the proc deregister before terminating
1519 */
1520 if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1521 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1522 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1523 "%s odls:waitpid_fired child process %s terminated normally "
1524 "but with a non-zero exit status - it "
1525 "will be treated as an abnormal termination",
1526 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1527 ORTE_NAME_PRINT(&proc->name)));
1528 } else {
1529 /* indicate the waitpid fired */
1530 state = ORTE_PROC_STATE_WAITPID_FIRED;
1531 }
1532 } else {
1533 /* we required a finalizing sync and didn't get it, so this
1534 * is considered an abnormal termination and treated accordingly
1535 */
1536 state = ORTE_PROC_STATE_TERM_WO_SYNC;
1537 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1538 "%s odls:waitpid_fired child process %s terminated normally "
1539 "but did not provide a required finalize sync - it "
1540 "will be treated as an abnormal termination",
1541 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1542 ORTE_NAME_PRINT(&proc->name)));
1543 }
1544 } else {
1545 /* has any child in this job already registered? */
1546 for (i=0; i < orte_local_children->size; i++) {
1547 if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1548 continue;
1549 }
1550 if (cptr->name.jobid != proc->name.jobid) {
1551 continue;
1552 }
1553 if (ORTE_FLAG_TEST(cptr, ORTE_PROC_FLAG_REG) && !orte_allowed_exit_without_sync) {
1554 /* someone has registered, and we didn't before
1555 * terminating - this is an abnormal termination unless
1556 * the allowed_exit_without_sync flag is set
1557 */
1558 if (0 != proc->exit_code) {
1559 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1560 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1561 "%s odls:waitpid_fired child process %s terminated normally "
1562 "but with a non-zero exit status - it "
1563 "will be treated as an abnormal termination",
1564 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1565 ORTE_NAME_PRINT(&proc->name)));
1566 } else {
1567 state = ORTE_PROC_STATE_TERM_WO_SYNC;
1568 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1569 "%s odls:waitpid_fired child process %s terminated normally "
1570 "but did not provide a required init sync - it "
1571 "will be treated as an abnormal termination",
1572 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1573 ORTE_NAME_PRINT(&proc->name)));
1574 }
1575 goto MOVEON;
1576 }
1577 }
1578 /* if no child has registered, then it is possible that
1579 * none of them will. This is considered acceptable. Still
1580 * flag it as abnormal if the exit code was non-zero
1581 */
1582 if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1583 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1584 } else {
1585 state = ORTE_PROC_STATE_WAITPID_FIRED;
1586 }
1587 }
1588
1589 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1590 "%s odls:waitpid_fired child process %s terminated %s",
1591 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1592 ORTE_NAME_PRINT(&proc->name),
1593 (0 == proc->exit_code) ? "normally" : "with non-zero status"));
1594 } else {
1595 /* the process was terminated with a signal! That's definitely
1596 * abnormal, so indicate that condition
1597 */
1598 state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1599 /* If a process was killed by a signal, then make the
1600 * exit code of orterun be "signo + 128" so that "prog"
1601 * and "orterun prog" will both yield the same exit code.
1602 *
1603 * This is actually what the shell does for you when
1604 * a process dies by signal, so this makes orterun treat
1605 * the termination code to exit status translation the
1606 * same way
1607 */
1608 proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1609
1610 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1611 "%s odls:waitpid_fired child process %s terminated with signal",
1612 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1613 ORTE_NAME_PRINT(&proc->name) ));
1614 /* Do not decrement the number of local procs here. That is handled in the errmgr */
1615 }
1616
1617 MOVEON:
1618 /* cancel the wait as this proc has already terminated */
1619 orte_wait_cb_cancel(proc);
1620 ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
1621 }
1622
1623 typedef struct {
1624 opal_list_item_t super;
1625 orte_proc_t *child;
1626 } orte_odls_quick_caddy_t;
qcdcon(orte_odls_quick_caddy_t * p)1627 static void qcdcon(orte_odls_quick_caddy_t *p)
1628 {
1629 p->child = NULL;
1630 }
qcddes(orte_odls_quick_caddy_t * p)1631 static void qcddes(orte_odls_quick_caddy_t *p)
1632 {
1633 if (NULL != p->child) {
1634 OBJ_RELEASE(p->child);
1635 }
1636 }
1637 OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
1638 opal_list_item_t,
1639 qcdcon, qcddes);
1640
orte_odls_base_default_kill_local_procs(opal_pointer_array_t * procs,orte_odls_base_kill_local_fn_t kill_local)1641 int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
1642 orte_odls_base_kill_local_fn_t kill_local)
1643 {
1644 orte_proc_t *child;
1645 opal_list_t procs_killed;
1646 orte_proc_t *proc, proctmp;
1647 int i, j;
1648 opal_pointer_array_t procarray, *procptr;
1649 bool do_cleanup;
1650 orte_odls_quick_caddy_t *cd;
1651
1652 OBJ_CONSTRUCT(&procs_killed, opal_list_t);
1653
1654 /* if the pointer array is NULL, then just kill everything */
1655 if (NULL == procs) {
1656 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1657 "%s odls:kill_local_proc working on WILDCARD",
1658 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1659 OBJ_CONSTRUCT(&procarray, opal_pointer_array_t);
1660 opal_pointer_array_init(&procarray, 1, 1, 1);
1661 OBJ_CONSTRUCT(&proctmp, orte_proc_t);
1662 proctmp.name.jobid = ORTE_JOBID_WILDCARD;
1663 proctmp.name.vpid = ORTE_VPID_WILDCARD;
1664 opal_pointer_array_add(&procarray, &proctmp);
1665 procptr = &procarray;
1666 do_cleanup = true;
1667 } else {
1668 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1669 "%s odls:kill_local_proc working on provided array",
1670 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1671 procptr = procs;
1672 do_cleanup = false;
1673 }
1674
1675 /* cycle through the provided array of processes to kill */
1676 for (i=0; i < procptr->size; i++) {
1677 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procptr, i))) {
1678 continue;
1679 }
1680 for (j=0; j < orte_local_children->size; j++) {
1681 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, j))) {
1682 continue;
1683 }
1684
1685 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1686 "%s odls:kill_local_proc checking child process %s",
1687 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1688 ORTE_NAME_PRINT(&child->name)));
1689
1690 /* do we have a child from the specified job? Because the
1691 * job could be given as a WILDCARD value, we must
1692 * check for that as well as for equality.
1693 */
1694 if (ORTE_JOBID_WILDCARD != proc->name.jobid &&
1695 proc->name.jobid != child->name.jobid) {
1696
1697 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1698 "%s odls:kill_local_proc child %s is not part of job %s",
1699 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1700 ORTE_NAME_PRINT(&child->name),
1701 ORTE_JOBID_PRINT(proc->name.jobid)));
1702 continue;
1703 }
1704
1705 /* see if this is the specified proc - could be a WILDCARD again, so check
1706 * appropriately
1707 */
1708 if (ORTE_VPID_WILDCARD != proc->name.vpid &&
1709 proc->name.vpid != child->name.vpid) {
1710
1711 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1712 "%s odls:kill_local_proc child %s is not covered by rank %s",
1713 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1714 ORTE_NAME_PRINT(&child->name),
1715 ORTE_VPID_PRINT(proc->name.vpid)));
1716 continue;
1717 }
1718
1719 /* is this process alive? if not, then nothing for us
1720 * to do to it
1721 */
1722 if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE) || 0 == child->pid) {
1723
1724 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1725 "%s odls:kill_local_proc child %s is not alive",
1726 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1727 ORTE_NAME_PRINT(&child->name)));
1728
1729 /* ensure, though, that the state is terminated so we don't lockup if
1730 * the proc never started
1731 */
1732 if (ORTE_PROC_STATE_UNDEF == child->state ||
1733 ORTE_PROC_STATE_INIT == child->state ||
1734 ORTE_PROC_STATE_RUNNING == child->state) {
1735 /* we can't be sure what happened, but make sure we
1736 * at least have a value that will let us eventually wakeup
1737 */
1738 child->state = ORTE_PROC_STATE_TERMINATED;
1739 /* ensure we realize that the waitpid will never come, if
1740 * it already hasn't
1741 */
1742 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
1743 child->pid = 0;
1744 goto CLEANUP;
1745 } else {
1746 continue;
1747 }
1748 }
1749
1750 /* ensure the stdin IOF channel for this child is closed. The other
1751 * channels will automatically close when the proc is killed
1752 */
1753 if (NULL != orte_iof.close) {
1754 orte_iof.close(&child->name, ORTE_IOF_STDIN);
1755 }
1756
1757 /* cancel the waitpid callback as this induces unmanageable race
1758 * conditions when we are deliberately killing the process
1759 */
1760 orte_wait_cb_cancel(child);
1761
1762 /* First send a SIGCONT in case the process is in stopped state.
1763 If it is in a stopped state and we do not first change it to
1764 running, then SIGTERM will not get delivered. Ignore return
1765 value. */
1766 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1767 "%s SENDING SIGCONT TO %s",
1768 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1769 ORTE_NAME_PRINT(&child->name)));
1770 cd = OBJ_NEW(orte_odls_quick_caddy_t);
1771 OBJ_RETAIN(child);
1772 cd->child = child;
1773 opal_list_append(&procs_killed, &cd->super);
1774 kill_local(child->pid, SIGCONT);
1775 continue;
1776
1777 CLEANUP:
1778 /* ensure the child's session directory is cleaned up */
1779 orte_session_dir_finalize(&child->name);
1780 /* check for everything complete - this will remove
1781 * the child object from our local list
1782 */
1783 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1784 ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID)) {
1785 ORTE_ACTIVATE_PROC_STATE(&child->name, child->state);
1786 }
1787 }
1788 }
1789
1790 /* if we are issuing signals, then we need to wait a little
1791 * and send the next in sequence */
1792 if (0 < opal_list_get_size(&procs_killed)) {
1793 sleep(orte_odls_globals.timeout_before_sigkill);
1794 /* issue a SIGTERM to all */
1795 OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1796 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1797 "%s SENDING SIGTERM TO %s",
1798 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1799 ORTE_NAME_PRINT(&cd->child->name)));
1800 kill_local(cd->child->pid, SIGTERM);
1801 }
1802 /* wait a little again */
1803 sleep(orte_odls_globals.timeout_before_sigkill);
1804 /* issue a SIGKILL to all */
1805 OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1806 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1807 "%s SENDING SIGKILL TO %s",
1808 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1809 ORTE_NAME_PRINT(&cd->child->name)));
1810 kill_local(cd->child->pid, SIGKILL);
1811 /* indicate the waitpid fired as this is effectively what
1812 * has happened
1813 */
1814 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
1815
1816 /* Since we are not going to wait for this process, make sure
1817 * we mark it as not-alive so that we don't wait for it
1818 * in orted_cmd
1819 */
1820 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
1821 cd->child->pid = 0;
1822
1823 /* mark the child as "killed" */
1824 cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
1825
1826 /* ensure the child's session directory is cleaned up */
1827 orte_session_dir_finalize(&cd->child->name);
1828 /* check for everything complete - this will remove
1829 * the child object from our local list
1830 */
1831 if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1832 ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
1833 ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
1834 }
1835 }
1836 }
1837 OPAL_LIST_DESTRUCT(&procs_killed);
1838
1839 /* cleanup arrays, if required */
1840 if (do_cleanup) {
1841 OBJ_DESTRUCT(&procarray);
1842 OBJ_DESTRUCT(&proctmp);
1843 }
1844
1845 return ORTE_SUCCESS;
1846 }
1847
/**
 * Gather process statistics for local children and pack them into a buffer.
 *
 * Walks orte_local_children and, for every child whose jobid equals
 * proc->jobid and whose vpid either equals proc->vpid or is selected by
 * proc->vpid being ORTE_VPID_WILDCARD, queries opal_pstat for the child's
 * pid and appends two items to the answer buffer: the requested name
 * (proc itself, not the child's own name) followed by the statistics.
 *
 * @param answer  buffer that receives the packed name/statistics pairs
 * @param proc    name of the process (or wildcard vpid) being queried
 *
 * @return ORTE_SUCCESS when every matching child was processed (including
 *         the case where no child matched); otherwise the error code from
 *         the first failing opal_pstat.query or opal_dss.pack call, at which
 *         point the walk stops.
 */
int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
                                  orte_process_name_t *proc)
{
    int rc;
    orte_proc_t *child;
    opal_pstats_t stats, *statsptr;
    size_t hostlen;
    int i;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:get_proc_stats for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* find this child */
    for (i=0; i < orte_local_children->size; i++) {
        child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i);
        if (NULL == child) {
            continue;
        }

        /* skip children that belong to another job */
        if (proc->jobid != child->name.jobid) {
            continue;
        }
        /* skip children that are neither the requested vpid nor covered
         * by a wildcard request
         */
        if (proc->vpid != child->name.vpid &&
            ORTE_VPID_WILDCARD != proc->vpid) {
            continue;
        }

        /* found it */
        OBJ_CONSTRUCT(&stats, opal_pstats_t);

        /* record node up to first '.', leaving room for the terminator.
         * The length is computed once per child instead of calling
         * strlen() on every character.
         */
        hostlen = strcspn(orte_process_info.nodename, ".");
        if (hostlen > (size_t)(OPAL_PSTAT_MAX_STRING_LEN - 1)) {
            hostlen = (size_t)(OPAL_PSTAT_MAX_STRING_LEN - 1);
        }
        memcpy(stats.node, orte_process_info.nodename, hostlen);
        /* terminate explicitly rather than relying on the constructor
         * having zeroed the field (assumes node holds
         * OPAL_PSTAT_MAX_STRING_LEN chars, as the clamp above implies)
         */
        stats.node[hostlen] = '\0';

        /* record rank */
        stats.rank = child->name.vpid;

        /* get stats - a query failure is returned without being logged */
        rc = opal_pstat.query(child->pid, &stats, NULL);
        if (ORTE_SUCCESS == rc) {
            /* pack the requested name first, then the statistics; the
             * second pack is only attempted if the first one succeeded
             */
            statsptr = &stats;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, proc, 1, ORTE_NAME)) ||
                ORTE_SUCCESS != (rc = opal_dss.pack(answer, &statsptr, 1, OPAL_PSTAT))) {
                ORTE_ERROR_LOG(rc);
            }
        }

        /* single release point for the stats object on every path */
        OBJ_DESTRUCT(&stats);
        if (ORTE_SUCCESS != rc) {
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
1903
/**
 * Restart a local child process.
 *
 * Resets the child's bookkeeping (state, exit code, pid, rml_uri and the
 * WAITPID / IOF_COMPLETE flags), redoes the environment and path setup for
 * the child's app context, prepares the IOF endpoints, registers the
 * waitpid callback, and activates an event on one of the ODLS event bases
 * whose callback is orte_odls_base_spawn_proc.
 *
 * The current working directory is captured on entry and restored before
 * returning. If it cannot be captured, no restore is attempted.
 *
 * @param child       proc object describing the process to restart
 * @param fork_local  fork function stored in the spawn caddy
 *
 * @return ORTE_SUCCESS if the spawn event was activated;
 *         ORTE_ERR_NOT_FOUND if the job data object or the app context
 *         cannot be located; otherwise the error code returned by the
 *         setup step that failed.
 */
int orte_odls_base_default_restart_proc(orte_proc_t *child,
                                        orte_odls_base_fork_local_proc_fn_t fork_local)
{
    int rc;
    orte_app_context_t *app;
    orte_job_t *jobdat;
    char basedir[MAXPATHLEN];
    bool have_basedir;
    char *wdir = NULL;
    orte_odls_spawn_caddy_t *cd;
    opal_event_base_t *evb;

    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:restart_proc for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&child->name)));

    /* establish our baseline working directory - we will be potentially
     * bouncing around as we execute this app, but we will always return
     * to this place as our default directory. If the directory cannot be
     * determined, basedir holds no usable path, so remember that and do
     * not try to return to it later
     */
    have_basedir = (NULL != getcwd(basedir, sizeof(basedir)));
    if (!have_basedir) {
        OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                             "%s odls:restart_proc unable to determine the current working directory",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

    /* find this child's jobdat */
    if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) {
        /* not found */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    /* reset the child's bookkeeping so that it looks like a proc that
     * has not been started yet
     */
    child->state = ORTE_PROC_STATE_FAILED_TO_START;
    child->exit_code = 0;
    ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
    ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    child->pid = 0;
    free(child->rml_uri);
    child->rml_uri = NULL;

    /* the pointer array can hand back NULL for an empty slot, and the
     * app context is dereferenced immediately below
     */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);
    if (NULL == app) {
        rc = ORTE_ERR_NOT_FOUND;
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* reset envars to match this child */
    if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &app->env))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* setup the path */
    if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
        ORTE_ERROR_LOG(rc);
        free(wdir);
        goto CLEANUP;
    }

    /* dispatch this child to the next available launch thread */
    cd = OBJ_NEW(orte_odls_spawn_caddy_t);
    /* hand our heap copy of the working directory to the caddy instead
     * of duplicating it and freeing the original
     */
    cd->wdir = wdir;
    wdir = NULL;
    cd->jdata = jobdat;
    cd->app = app;
    cd->child = child;
    cd->fork_local = fork_local;
    /* setup any IOF */
    cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;

    /* do we want to setup stdin? */
    cd->opts.connect_stdin = (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
                              child->name.vpid == jobdat->stdin_target);

    if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
        ORTE_ERROR_LOG(rc);
        child->exit_code = rc;
        OBJ_RELEASE(cd);
        ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
        goto CLEANUP;
    }
    if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* connect endpoints IOF */
        rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(cd);
            ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
            goto CLEANUP;
        }
    }
    orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);

    /* advance round-robin to the next event base, wrapping at the end */
    ++orte_odls_globals.next_base;
    if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
        orte_odls_globals.next_base = 0;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s restarting app %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));

    /* activate the spawn event on the selected event base */
    evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
    opal_event_set(evb, &cd->ev, -1,
                   OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
    opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
    opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);

 CLEANUP:
    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                         "%s odls:restart of proc %s %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&child->name),
                         (ORTE_SUCCESS == rc) ? "succeeded" : "failed"));

    /* reset our working directory back to our default location - if we
     * don't do this, then we will be looking for relative paths starting
     * from the last wdir option specified by the user. Thus, we would
     * be requiring that the user keep track on the cmd line of where
     * each app was located relative to the prior app, instead of relative
     * to their current location. This is only possible when the baseline
     * was captured on entry; a failure to return is reported but does not
     * change the result of the restart itself
     */
    if (have_basedir && 0 != chdir(basedir)) {
        OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                             "%s odls:restart_proc unable to return to directory %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), basedir));
    }

    return rc;
}
2030