1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2011 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
14 * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
15 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
16 * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
17 * reserved.
18 * Copyright (c) 2016-2017 Research Organization for Information Science
19 * and Technology (RIST). All rights reserved.
20 * $COPYRIGHT$
21 *
22 * Additional copyrights may follow
23 *
24 * $HEADER$
25 *
26 */
27
28 #include "orte_config.h"
29 #include "orte/constants.h"
30
31 #include <string.h>
32 #ifdef HAVE_SYS_TYPES_H
33 #include <sys/types.h>
34 #endif
35 #ifdef HAVE_UNISTD_H
36 #include <unistd.h>
37 #endif
38 #include <signal.h>
39 #include <errno.h>
40
41 #include "opal/hash_string.h"
42 #include "opal/util/argv.h"
43 #include "opal/util/opal_environ.h"
44 #include "opal/util/path.h"
45 #include "opal/runtime/opal_progress_threads.h"
46 #include "opal/mca/installdirs/installdirs.h"
47 #include "opal/mca/pmix/base/base.h"
48 #include "opal/mca/pmix/pmix.h"
49
50 #include "orte/util/show_help.h"
51 #include "orte/util/proc_info.h"
52 #include "orte/mca/errmgr/errmgr.h"
53 #include "orte/mca/plm/base/base.h"
54 #include "orte/util/name_fns.h"
55 #include "orte/runtime/orte_globals.h"
56 #include "orte/util/session_dir.h"
57 #include "orte/util/pre_condition_transports.h"
58
59 #include "orte/mca/ess/ess.h"
60 #include "orte/mca/ess/base/base.h"
61 #include "orte/mca/ess/singleton/ess_singleton.h"
62
63
64 static int rte_init(void);
65 static int rte_finalize(void);
66
67 orte_ess_base_module_t orte_ess_singleton_module = {
68 rte_init,
69 rte_finalize,
70 orte_ess_base_app_abort,
71 NULL /* ft_event */
72 };
73
74 extern char *orte_ess_singleton_server_uri;
75 static bool added_transport_keys=false;
76 static bool added_num_procs = false;
77 static bool added_app_ctx = false;
78 static bool added_pmix_envs = false;
79 static bool progress_thread_running = false;
80
81 static int fork_hnp(void);
82
rte_init(void)83 static int rte_init(void)
84 {
85 int rc, ret;
86 char *error = NULL;
87 int u32, *u32ptr;
88 uint16_t u16, *u16ptr;
89 orte_process_name_t name;
90
91 /* run the prolog */
92 if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) {
93 ORTE_ERROR_LOG(rc);
94 return rc;
95 }
96 u32ptr = &u32;
97 u16ptr = &u16;
98
99 if (NULL != mca_ess_singleton_component.server_uri) {
100 /* we are going to connect to a server HNP */
101 if (0 == strncmp(mca_ess_singleton_component.server_uri, "file", strlen("file")) ||
102 0 == strncmp(mca_ess_singleton_component.server_uri, "FILE", strlen("FILE"))) {
103 char input[1024], *filename;
104 FILE *fp;
105
106 /* it is a file - get the filename */
107 filename = strchr(mca_ess_singleton_component.server_uri, ':');
108 if (NULL == filename) {
109 /* filename is not correctly formatted */
110 orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
111 "singleton", mca_ess_singleton_component.server_uri);
112 return ORTE_ERROR;
113 }
114 ++filename; /* space past the : */
115
116 if (0 >= strlen(filename)) {
117 /* they forgot to give us the name! */
118 orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
119 "singleton", mca_ess_singleton_component.server_uri);
120 return ORTE_ERROR;
121 }
122
123 /* open the file and extract the uri */
124 fp = fopen(filename, "r");
125 if (NULL == fp) { /* can't find or read file! */
126 orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
127 "singleton", mca_ess_singleton_component.server_uri);
128 return ORTE_ERROR;
129 }
130 memset(input, 0, 1024); // initialize the array to ensure a NULL termination
131 if (NULL == fgets(input, 1023, fp)) {
132 /* something malformed about file */
133 fclose(fp);
134 orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
135 "singleton", mca_ess_singleton_component.server_uri, "singleton");
136 return ORTE_ERROR;
137 }
138 fclose(fp);
139 input[strlen(input)-1] = '\0'; /* remove newline */
140 orte_process_info.my_hnp_uri = strdup(input);
141 } else {
142 orte_process_info.my_hnp_uri = strdup(mca_ess_singleton_component.server_uri);
143 }
144 /* save the daemon uri - we will process it later */
145 orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
146 /* construct our name - we are in their job family, so we know that
147 * much. However, we cannot know how many other singletons and jobs
148 * this HNP is running. Oh well - if someone really wants to use this
149 * option, they can try to figure it out. For now, we'll just assume
150 * we are the only ones */
151 ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_HNP->jobid, 1);
152 /* obviously, we are vpid=0 for this job */
153 ORTE_PROC_MY_NAME->vpid = 0;
154
155 /* for convenience, push the pubsub version of this param into the environ */
156 opal_setenv (OPAL_MCA_PREFIX"pubsub_orte_server", orte_process_info.my_hnp_uri, true, &environ);
157 } else if (NULL != getenv("SINGULARITY_CONTAINER") ||
158 mca_ess_singleton_component.isolated) {
159 /* ensure we use the isolated pmix component */
160 opal_setenv(OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
161 } else {
162 /* we want to use PMIX_NAMESPACE that will be sent by the hnp as a jobid */
163 opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &environ);
164 /* spawn our very own HNP to support us */
165 if (ORTE_SUCCESS != (rc = fork_hnp())) {
166 ORTE_ERROR_LOG(rc);
167 return rc;
168 }
169 /* our name was given to us by the HNP */
170 opal_setenv(OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
171 }
172
173 /* get an async event base - we use the opal_async one so
174 * we don't startup extra threads if not needed */
175 orte_event_base = opal_progress_thread_init(NULL);
176 progress_thread_running = true;
177
178 /* open and setup pmix */
179 if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
180 error = "opening pmix";
181 goto error;
182 }
183 if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
184 error = "select pmix";
185 goto error;
186 }
187 /* set the event base */
188 opal_pmix_base_set_evbase(orte_event_base);
189 /* initialize the selected module */
190 if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
191 /* we cannot run */
192 error = "pmix init";
193 goto error;
194 }
195
196 /* pmix.init set our process name down in the OPAL layer,
197 * so carry it forward here */
198 ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
199 ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
200 name.jobid = OPAL_PROC_MY_NAME.jobid;
201 name.vpid = ORTE_VPID_WILDCARD;
202
203 /* get our local rank from PMI */
204 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
205 ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
206 if (OPAL_SUCCESS != ret) {
207 error = "getting local rank";
208 goto error;
209 }
210 orte_process_info.my_local_rank = u16;
211
212 /* get our node rank from PMI */
213 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
214 ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
215 if (OPAL_SUCCESS != ret) {
216 error = "getting node rank";
217 goto error;
218 }
219 orte_process_info.my_node_rank = u16;
220
221 /* get max procs */
222 OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
223 &name, &u32ptr, OPAL_UINT32);
224 if (OPAL_SUCCESS != ret) {
225 error = "getting max procs";
226 goto error;
227 }
228 orte_process_info.max_procs = u32;
229
230 /* we are a singleton, so there is only one proc in the job */
231 orte_process_info.num_procs = 1;
232 /* push into the environ for pickup in MPI layer for
233 * MPI-3 required info key
234 */
235 if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
236 char * num_procs;
237 asprintf(&num_procs, "%d", orte_process_info.num_procs);
238 opal_setenv(OPAL_MCA_PREFIX"orte_ess_num_procs", num_procs, true, &environ);
239 free(num_procs);
240 added_num_procs = true;
241 }
242 if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
243 char * num_procs;
244 asprintf(&num_procs, "%d", orte_process_info.num_procs);
245 opal_setenv("OMPI_APP_CTX_NUM_PROCS", num_procs, true, &environ);
246 free(num_procs);
247 added_app_ctx = true;
248 }
249
250
251 /* get our app number from PMI - ok if not found */
252 OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
253 ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
254 if (OPAL_SUCCESS == ret) {
255 orte_process_info.app_num = u32;
256 } else {
257 orte_process_info.app_num = 0;
258 }
259 /* set some other standard values */
260 orte_process_info.num_local_peers = 0;
261
262 /* setup transport keys in case the MPI layer needs them -
263 * we can use the jobfam and stepid as unique keys
264 * because they are unique values assigned by the RM
265 */
266 if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
267 char *key;
268 ret = orte_pre_condition_transports(NULL, &key);
269 if (ORTE_SUCCESS == ret) {
270 opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &environ);
271 free(key);
272 }
273 }
274
275 /* use the std app init to complete the procedure */
276 if (ORTE_SUCCESS != (rc = orte_ess_base_app_setup(true))) {
277 ORTE_ERROR_LOG(rc);
278 return rc;
279 }
280
281 return ORTE_SUCCESS;
282
283 error:
284 if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
285 orte_show_help("help-orte-runtime.txt",
286 "orte_init:startup:internal-failure",
287 true, error, ORTE_ERROR_NAME(ret), ret);
288 }
289 return ret;
290 }
291
rte_finalize(void)292 static int rte_finalize(void)
293 {
294 int ret;
295
296 /* remove the envars that we pushed into environ
297 * so we leave that structure intact
298 */
299 if (added_transport_keys) {
300 unsetenv(OPAL_MCA_PREFIX"orte_precondition_transports");
301 }
302 if (added_num_procs) {
303 unsetenv(OPAL_MCA_PREFIX"orte_ess_num_procs");
304 }
305 if (added_app_ctx) {
306 unsetenv("OMPI_APP_CTX_NUM_PROCS");
307 }
308 if (added_pmix_envs) {
309 unsetenv("PMIX_NAMESPACE");
310 unsetenv("PMIX_RANK");
311 unsetenv("PMIX_SERVER_URI");
312 unsetenv("PMIX_SECURITY_MODE");
313 }
314 /* use the default procedure to finish */
315 if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
316 ORTE_ERROR_LOG(ret);
317 }
318
319 /* mark us as finalized */
320 if (NULL != opal_pmix.finalize) {
321 opal_pmix.finalize();
322 (void) mca_base_framework_close(&opal_pmix_base_framework);
323 }
324
325 /* release the event base */
326 if (progress_thread_running) {
327 opal_progress_thread_finalize(NULL);
328 progress_thread_running = false;
329 }
330 return ret;
331 }
332
/* Size, in bytes, of the buffer fork_hnp() first allocates for the
 * contact info reported by the HNP, and of each increment by which
 * that buffer is grown when it fills up */
#define ORTE_URI_MSG_LGTH 256
334
/*
 * Put the disposition of signal `sig` back to SIG_DFL, with an empty
 * handler mask and no flags.  The result of sigaction() is not
 * checked, and the previous disposition is not retrieved.
 */
static void set_handler_default(int sig)
{
    struct sigaction dfl;

    sigemptyset(&dfl.sa_mask);
    dfl.sa_flags = 0;
    dfl.sa_handler = SIG_DFL;

    sigaction(sig, &dfl, NULL);
}
345
fork_hnp(void)346 static int fork_hnp(void)
347 {
348 int p[2], death_pipe[2];
349 char *cmd;
350 char **argv = NULL;
351 int argc;
352 char *param, *cptr;
353 sigset_t sigs;
354 int buffer_length, num_chars_read, chunk;
355 char *orted_uri;
356 int rc, i;
357
358 /* A pipe is used to communicate between the parent and child to
359 indicate whether the exec ultimately succeeded or failed. The
360 child sets the pipe to be close-on-exec; the child only ever
361 writes anything to the pipe if there is an error (e.g.,
362 executable not found, exec() fails, etc.). The parent does a
363 blocking read on the pipe; if the pipe closed with no data,
364 then the exec() succeeded. If the parent reads something from
365 the pipe, then the child was letting us know that it failed.
366 */
367 if (pipe(p) < 0) {
368 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
369 return ORTE_ERR_SYS_LIMITS_PIPES;
370 }
371
372 /* we also have to give the HNP a pipe it can watch to know when
373 * we terminated. Since the HNP is going to be a child of us, it
374 * can't just use waitpid to see when we leave - so it will watch
375 * the pipe instead
376 */
377 if (pipe(death_pipe) < 0) {
378 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
379 return ORTE_ERR_SYS_LIMITS_PIPES;
380 }
381
382 /* find the orted binary using the install_dirs support - this also
383 * checks to ensure that we can see this executable and it *is* executable by us
384 */
385 cmd = opal_path_access("orted", opal_install_dirs.bindir, X_OK);
386 if (NULL == cmd) {
387 /* guess we couldn't do it - best to abort */
388 ORTE_ERROR_LOG(ORTE_ERR_FILE_NOT_EXECUTABLE);
389 close(p[0]);
390 close(p[1]);
391 return ORTE_ERR_FILE_NOT_EXECUTABLE;
392 }
393
394 /* okay, setup an appropriate argv */
395 opal_argv_append(&argc, &argv, "orted");
396
397 /* tell the daemon it is to be the HNP */
398 opal_argv_append(&argc, &argv, "--hnp");
399
400 /* tell the daemon to get out of our process group */
401 opal_argv_append(&argc, &argv, "--set-sid");
402
403 /* tell the daemon to report back its uri so we can connect to it */
404 opal_argv_append(&argc, &argv, "--report-uri");
405 asprintf(¶m, "%d", p[1]);
406 opal_argv_append(&argc, &argv, param);
407 free(param);
408
409 /* give the daemon a pipe it can watch to tell when we have died */
410 opal_argv_append(&argc, &argv, "--singleton-died-pipe");
411 asprintf(¶m, "%d", death_pipe[0]);
412 opal_argv_append(&argc, &argv, param);
413 free(param);
414
415 /* add any debug flags */
416 if (orte_debug_flag) {
417 opal_argv_append(&argc, &argv, "--debug");
418 }
419
420 if (orte_debug_daemons_flag) {
421 opal_argv_append(&argc, &argv, "--debug-daemons");
422 }
423
424 if (orte_debug_daemons_file_flag) {
425 if (!orte_debug_daemons_flag) {
426 opal_argv_append(&argc, &argv, "--debug-daemons");
427 }
428 opal_argv_append(&argc, &argv, "--debug-daemons-file");
429 }
430
431 /* indicate that it must use the novm state machine */
432 opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
433 opal_argv_append(&argc, &argv, "state_novm_select");
434 opal_argv_append(&argc, &argv, "1");
435
436 /* direct the selection of the ess component */
437 opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
438 opal_argv_append(&argc, &argv, "ess");
439 opal_argv_append(&argc, &argv, "hnp");
440
441 /* direct the selection of the pmix component */
442 opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
443 opal_argv_append(&argc, &argv, "pmix");
444 opal_argv_append(&argc, &argv, "^s1,s2,cray,isolated");
445
446 /* Fork off the child */
447 orte_process_info.hnp_pid = fork();
448 if(orte_process_info.hnp_pid < 0) {
449 ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
450 close(p[0]);
451 close(p[1]);
452 close(death_pipe[0]);
453 close(death_pipe[1]);
454 free(cmd);
455 opal_argv_free(argv);
456 return ORTE_ERR_SYS_LIMITS_CHILDREN;
457 }
458
459 if (orte_process_info.hnp_pid == 0) {
460 close(p[0]);
461 close(death_pipe[1]);
462 /* I am the child - exec me */
463
464 /* Set signal handlers back to the default. Do this close
465 to the execve() because the event library may (and likely
466 will) reset them. If we don't do this, the event
467 library may have left some set that, at least on some
468 OS's, don't get reset via fork() or exec(). Hence, the
469 orted could be unkillable (for example). */
470 set_handler_default(SIGTERM);
471 set_handler_default(SIGINT);
472 set_handler_default(SIGHUP);
473 set_handler_default(SIGPIPE);
474 set_handler_default(SIGCHLD);
475
476 /* Unblock all signals, for many of the same reasons that
477 we set the default handlers, above. This is noticable
478 on Linux where the event library blocks SIGTERM, but we
479 don't want that blocked by the orted (or, more
480 specifically, we don't want it to be blocked by the
481 orted and then inherited by the ORTE processes that it
482 forks, making them unkillable by SIGTERM). */
483 sigprocmask(0, 0, &sigs);
484 sigprocmask(SIG_UNBLOCK, &sigs, 0);
485
486 execv(cmd, argv);
487
488 /* if I get here, the execv failed! */
489 orte_show_help("help-ess-base.txt", "ess-base:execv-error",
490 true, cmd, strerror(errno));
491 exit(1);
492
493 } else {
494 int count;
495
496 free(cmd);
497 /* I am the parent - wait to hear something back and
498 * report results
499 */
500 close(p[1]); /* parent closes the write - orted will write its contact info to it*/
501 close(death_pipe[0]); /* parent closes the death_pipe's read */
502 opal_argv_free(argv);
503
504 /* setup the buffer to read the HNP's uri */
505 buffer_length = ORTE_URI_MSG_LGTH;
506 chunk = ORTE_URI_MSG_LGTH-1;
507 num_chars_read = 0;
508 orted_uri = (char*)malloc(buffer_length);
509 memset(orted_uri, 0, buffer_length);
510
511 while (0 != (rc = read(p[0], &orted_uri[num_chars_read], chunk))) {
512 if (rc < 0 && (EAGAIN == errno || EINTR == errno)) {
513 continue;
514 } else if (rc < 0) {
515 num_chars_read = -1;
516 break;
517 }
518 /* we read something - better get more */
519 num_chars_read += rc;
520 chunk -= rc;
521 if (0 == chunk) {
522 chunk = ORTE_URI_MSG_LGTH;
523 orted_uri = realloc((void*)orted_uri, buffer_length+chunk);
524 memset(&orted_uri[buffer_length], 0, chunk);
525 buffer_length += chunk;
526 }
527 }
528 close(p[0]);
529
530 if (num_chars_read <= 0) {
531 /* we didn't get anything back - this is bad */
532 ORTE_ERROR_LOG(ORTE_ERR_HNP_COULD_NOT_START);
533 free(orted_uri);
534 return ORTE_ERR_HNP_COULD_NOT_START;
535 }
536
537 /* parse the sysinfo from the returned info - must
538 * start from the end of the string as the uri itself
539 * can contain brackets */
540 if (NULL == (param = strrchr(orted_uri, '['))) {
541 ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
542 free(orted_uri);
543 return ORTE_ERR_COMM_FAILURE;
544 }
545 *param = '\0'; /* terminate the uri string */
546 ++param; /* point to the start of the sysinfo */
547
548 /* find the end of the sysinfo */
549 if (NULL == (cptr = strchr(param, ']'))) {
550 ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
551 free(orted_uri);
552 return ORTE_ERR_COMM_FAILURE;
553 }
554 *cptr = '\0'; /* terminate the sysinfo string */
555 ++cptr; /* point to the start of the pmix uri */
556
557 /* convert the sysinfo string */
558 if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_sysinfo(&orte_local_cpu_type,
559 &orte_local_cpu_model, param))) {
560 ORTE_ERROR_LOG(rc);
561 free(orted_uri);
562 return rc;
563 }
564
565 /* save the daemon uri - we will process it later */
566 orte_process_info.my_daemon_uri = strdup(orted_uri);
567 /* likewise, since this is also the HNP, set that uri too */
568 orte_process_info.my_hnp_uri = orted_uri;
569
570 /* split the pmix_uri into its parts */
571 argv = opal_argv_split(cptr, '*');
572 count = opal_argv_count(argv);
573 /* push each piece into the environment */
574 for (i=0; i < count; i++) {
575 char *c = strchr(argv[i], '=');
576 assert(NULL != c);
577 *c++ = '\0';
578 opal_setenv(argv[i], c, true, &environ);
579 }
580 opal_argv_free(argv);
581 added_pmix_envs = true;
582
583 /* all done - report success */
584 return ORTE_SUCCESS;
585 }
586 }
587