1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2011 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
14  * Copyright (c) 2011      Cisco Systems, Inc.  All rights reserved.
15  * Copyright (c) 2013-2017 Intel, Inc.  All rights reserved.
16  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
17  *                         reserved.
18  * Copyright (c) 2016-2017 Research Organization for Information Science
19  *                         and Technology (RIST). All rights reserved.
20  * $COPYRIGHT$
21  *
22  * Additional copyrights may follow
23  *
24  * $HEADER$
25  *
26  */
27 
28 #include "orte_config.h"
29 #include "orte/constants.h"
30 
31 #include <string.h>
32 #ifdef HAVE_SYS_TYPES_H
33 #include <sys/types.h>
34 #endif
35 #ifdef HAVE_UNISTD_H
36 #include <unistd.h>
37 #endif
38 #include <signal.h>
39 #include <errno.h>
40 
41 #include "opal/hash_string.h"
42 #include "opal/util/argv.h"
43 #include "opal/util/opal_environ.h"
44 #include "opal/util/path.h"
45 #include "opal/runtime/opal_progress_threads.h"
46 #include "opal/mca/installdirs/installdirs.h"
47 #include "opal/mca/pmix/base/base.h"
48 #include "opal/mca/pmix/pmix.h"
49 
50 #include "orte/util/show_help.h"
51 #include "orte/util/proc_info.h"
52 #include "orte/mca/errmgr/errmgr.h"
53 #include "orte/mca/plm/base/base.h"
54 #include "orte/util/name_fns.h"
55 #include "orte/runtime/orte_globals.h"
56 #include "orte/util/session_dir.h"
57 #include "orte/util/pre_condition_transports.h"
58 
59 #include "orte/mca/ess/ess.h"
60 #include "orte/mca/ess/base/base.h"
61 #include "orte/mca/ess/singleton/ess_singleton.h"
62 
63 
/* Module entry points - defined further down in this file */
static int rte_init(void);
static int rte_finalize(void);

/* The singleton ess module table. The initializers are positional:
 * rte_init, rte_finalize, the base application abort routine, and
 * no fault-tolerance event handler. */
orte_ess_base_module_t orte_ess_singleton_module = {
    rte_init,
    rte_finalize,
    orte_ess_base_app_abort,
    NULL /* ft_event */
};

extern char *orte_ess_singleton_server_uri;
/* Bookkeeping flags: each one records that this module pushed the
 * corresponding variable(s) into environ, so that rte_finalize()
 * removes only what was added here. */
static bool added_transport_keys=false;
static bool added_num_procs = false;
static bool added_app_ctx = false;
static bool added_pmix_envs = false;
/* set once rte_init() has obtained the progress-thread event base;
 * cleared again by rte_finalize() after it releases that base */
static bool progress_thread_running = false;

/* launches a private HNP (orted) for this singleton - see definition below */
static int fork_hnp(void);
82 
rte_init(void)83 static int rte_init(void)
84 {
85     int rc, ret;
86     char *error = NULL;
87     int u32, *u32ptr;
88     uint16_t u16, *u16ptr;
89     orte_process_name_t name;
90 
91     /* run the prolog */
92     if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) {
93         ORTE_ERROR_LOG(rc);
94         return rc;
95     }
96     u32ptr = &u32;
97     u16ptr = &u16;
98 
99     if (NULL != mca_ess_singleton_component.server_uri) {
100         /* we are going to connect to a server HNP */
101         if (0 == strncmp(mca_ess_singleton_component.server_uri, "file", strlen("file")) ||
102             0 == strncmp(mca_ess_singleton_component.server_uri, "FILE", strlen("FILE"))) {
103             char input[1024], *filename;
104             FILE *fp;
105 
106             /* it is a file - get the filename */
107             filename = strchr(mca_ess_singleton_component.server_uri, ':');
108             if (NULL == filename) {
109                 /* filename is not correctly formatted */
110                 orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
111                                "singleton", mca_ess_singleton_component.server_uri);
112                 return ORTE_ERROR;
113             }
114             ++filename; /* space past the : */
115 
116             if (0 >= strlen(filename)) {
117                 /* they forgot to give us the name! */
118                 orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
119                                "singleton", mca_ess_singleton_component.server_uri);
120                 return ORTE_ERROR;
121             }
122 
123             /* open the file and extract the uri */
124             fp = fopen(filename, "r");
125             if (NULL == fp) { /* can't find or read file! */
126                 orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
127                                "singleton", mca_ess_singleton_component.server_uri);
128                 return ORTE_ERROR;
129             }
130             memset(input, 0, 1024);  // initialize the array to ensure a NULL termination
131             if (NULL == fgets(input, 1023, fp)) {
132                 /* something malformed about file */
133                 fclose(fp);
134                 orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
135                                "singleton", mca_ess_singleton_component.server_uri, "singleton");
136                 return ORTE_ERROR;
137             }
138             fclose(fp);
139             input[strlen(input)-1] = '\0';  /* remove newline */
140             orte_process_info.my_hnp_uri = strdup(input);
141         } else {
142             orte_process_info.my_hnp_uri = strdup(mca_ess_singleton_component.server_uri);
143         }
144         /* save the daemon uri - we will process it later */
145         orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
146         /* construct our name - we are in their job family, so we know that
147          * much. However, we cannot know how many other singletons and jobs
148          * this HNP is running. Oh well - if someone really wants to use this
149          * option, they can try to figure it out. For now, we'll just assume
150          * we are the only ones */
151         ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_HNP->jobid, 1);
152         /* obviously, we are vpid=0 for this job */
153         ORTE_PROC_MY_NAME->vpid = 0;
154 
155         /* for convenience, push the pubsub version of this param into the environ */
156         opal_setenv (OPAL_MCA_PREFIX"pubsub_orte_server", orte_process_info.my_hnp_uri, true, &environ);
157     } else if (NULL != getenv("SINGULARITY_CONTAINER") ||
158                mca_ess_singleton_component.isolated) {
159         /* ensure we use the isolated pmix component */
160         opal_setenv(OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
161     } else {
162         /* we want to use PMIX_NAMESPACE that will be sent by the hnp as a jobid */
163         opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &environ);
164         /* spawn our very own HNP to support us */
165         if (ORTE_SUCCESS != (rc = fork_hnp())) {
166             ORTE_ERROR_LOG(rc);
167             return rc;
168         }
169         /* our name was given to us by the HNP */
170         opal_setenv(OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
171     }
172 
173     /* get an async event base - we use the opal_async one so
174      * we don't startup extra threads if not needed */
175     orte_event_base = opal_progress_thread_init(NULL);
176     progress_thread_running = true;
177 
178     /* open and setup pmix */
179     if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
180         error = "opening pmix";
181         goto error;
182     }
183     if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
184         error = "select pmix";
185         goto error;
186     }
187     /* set the event base */
188     opal_pmix_base_set_evbase(orte_event_base);
189     /* initialize the selected module */
190     if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
191         /* we cannot run */
192         error = "pmix init";
193         goto error;
194     }
195 
196     /* pmix.init set our process name down in the OPAL layer,
197      * so carry it forward here */
198     ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
199     ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
200     name.jobid = OPAL_PROC_MY_NAME.jobid;
201     name.vpid = ORTE_VPID_WILDCARD;
202 
203     /* get our local rank from PMI */
204     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
205                           ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
206     if (OPAL_SUCCESS != ret) {
207         error = "getting local rank";
208         goto error;
209     }
210     orte_process_info.my_local_rank = u16;
211 
212     /* get our node rank from PMI */
213     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
214                           ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
215     if (OPAL_SUCCESS != ret) {
216         error = "getting node rank";
217         goto error;
218     }
219     orte_process_info.my_node_rank = u16;
220 
221     /* get max procs */
222     OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
223                           &name, &u32ptr, OPAL_UINT32);
224     if (OPAL_SUCCESS != ret) {
225         error = "getting max procs";
226         goto error;
227     }
228     orte_process_info.max_procs = u32;
229 
230     /* we are a singleton, so there is only one proc in the job */
231     orte_process_info.num_procs = 1;
232     /* push into the environ for pickup in MPI layer for
233      * MPI-3 required info key
234      */
235     if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
236         char * num_procs;
237         asprintf(&num_procs, "%d", orte_process_info.num_procs);
238         opal_setenv(OPAL_MCA_PREFIX"orte_ess_num_procs", num_procs, true, &environ);
239         free(num_procs);
240         added_num_procs = true;
241     }
242     if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
243         char * num_procs;
244         asprintf(&num_procs, "%d", orte_process_info.num_procs);
245         opal_setenv("OMPI_APP_CTX_NUM_PROCS", num_procs, true, &environ);
246         free(num_procs);
247         added_app_ctx = true;
248     }
249 
250 
251     /* get our app number from PMI - ok if not found */
252     OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
253                           ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
254     if (OPAL_SUCCESS == ret) {
255         orte_process_info.app_num = u32;
256     } else {
257         orte_process_info.app_num = 0;
258     }
259     /* set some other standard values */
260     orte_process_info.num_local_peers = 0;
261 
262     /* setup transport keys in case the MPI layer needs them -
263      * we can use the jobfam and stepid as unique keys
264      * because they are unique values assigned by the RM
265      */
266     if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
267         char *key;
268         ret = orte_pre_condition_transports(NULL, &key);
269         if (ORTE_SUCCESS == ret) {
270             opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &environ);
271             free(key);
272         }
273     }
274 
275     /* use the std app init to complete the procedure */
276     if (ORTE_SUCCESS != (rc = orte_ess_base_app_setup(true))) {
277         ORTE_ERROR_LOG(rc);
278         return rc;
279     }
280 
281     return ORTE_SUCCESS;
282 
283  error:
284     if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
285         orte_show_help("help-orte-runtime.txt",
286                        "orte_init:startup:internal-failure",
287                        true, error, ORTE_ERROR_NAME(ret), ret);
288     }
289     return ret;
290 }
291 
rte_finalize(void)292 static int rte_finalize(void)
293 {
294     int ret;
295 
296     /* remove the envars that we pushed into environ
297      * so we leave that structure intact
298      */
299     if (added_transport_keys) {
300         unsetenv(OPAL_MCA_PREFIX"orte_precondition_transports");
301     }
302     if (added_num_procs) {
303         unsetenv(OPAL_MCA_PREFIX"orte_ess_num_procs");
304     }
305     if (added_app_ctx) {
306         unsetenv("OMPI_APP_CTX_NUM_PROCS");
307     }
308     if (added_pmix_envs) {
309         unsetenv("PMIX_NAMESPACE");
310         unsetenv("PMIX_RANK");
311         unsetenv("PMIX_SERVER_URI");
312         unsetenv("PMIX_SECURITY_MODE");
313     }
314     /* use the default procedure to finish */
315     if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
316         ORTE_ERROR_LOG(ret);
317     }
318 
319     /* mark us as finalized */
320     if (NULL != opal_pmix.finalize) {
321         opal_pmix.finalize();
322         (void) mca_base_framework_close(&opal_pmix_base_framework);
323     }
324 
325     /* release the event base */
326     if (progress_thread_running) {
327         opal_progress_thread_finalize(NULL);
328         progress_thread_running = false;
329     }
330     return ret;
331 }
332 
/* size in bytes of the initial buffer - and of each growth step - that
 * fork_hnp() uses when reading the HNP's uri from the pipe */
#define ORTE_URI_MSG_LGTH   256
334 
/**
 * Restore the default disposition for a signal.
 *
 * @param sig  number of the signal whose handler is reset to SIG_DFL
 */
static void set_handler_default(int sig)
{
    struct sigaction dflt;

    /* no signals masked while handling, no special flags */
    sigemptyset(&dflt.sa_mask);
    dflt.sa_flags = 0;
    dflt.sa_handler = SIG_DFL;

    /* we do not care about the previous action */
    sigaction(sig, &dflt, NULL);
}
345 
fork_hnp(void)346 static int fork_hnp(void)
347 {
348     int p[2], death_pipe[2];
349     char *cmd;
350     char **argv = NULL;
351     int argc;
352     char *param, *cptr;
353     sigset_t sigs;
354     int buffer_length, num_chars_read, chunk;
355     char *orted_uri;
356     int rc, i;
357 
358     /* A pipe is used to communicate between the parent and child to
359        indicate whether the exec ultimately succeeded or failed.  The
360        child sets the pipe to be close-on-exec; the child only ever
361        writes anything to the pipe if there is an error (e.g.,
362        executable not found, exec() fails, etc.).  The parent does a
363        blocking read on the pipe; if the pipe closed with no data,
364        then the exec() succeeded.  If the parent reads something from
365        the pipe, then the child was letting us know that it failed.
366     */
367     if (pipe(p) < 0) {
368         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
369         return ORTE_ERR_SYS_LIMITS_PIPES;
370     }
371 
372     /* we also have to give the HNP a pipe it can watch to know when
373      * we terminated. Since the HNP is going to be a child of us, it
374      * can't just use waitpid to see when we leave - so it will watch
375      * the pipe instead
376      */
377     if (pipe(death_pipe) < 0) {
378         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
379         return ORTE_ERR_SYS_LIMITS_PIPES;
380     }
381 
382     /* find the orted binary using the install_dirs support - this also
383      * checks to ensure that we can see this executable and it *is* executable by us
384      */
385     cmd = opal_path_access("orted", opal_install_dirs.bindir, X_OK);
386     if (NULL == cmd) {
387         /* guess we couldn't do it - best to abort */
388         ORTE_ERROR_LOG(ORTE_ERR_FILE_NOT_EXECUTABLE);
389         close(p[0]);
390         close(p[1]);
391         return ORTE_ERR_FILE_NOT_EXECUTABLE;
392     }
393 
394     /* okay, setup an appropriate argv */
395     opal_argv_append(&argc, &argv, "orted");
396 
397     /* tell the daemon it is to be the HNP */
398     opal_argv_append(&argc, &argv, "--hnp");
399 
400     /* tell the daemon to get out of our process group */
401     opal_argv_append(&argc, &argv, "--set-sid");
402 
403     /* tell the daemon to report back its uri so we can connect to it */
404     opal_argv_append(&argc, &argv, "--report-uri");
405     asprintf(&param, "%d", p[1]);
406     opal_argv_append(&argc, &argv, param);
407     free(param);
408 
409     /* give the daemon a pipe it can watch to tell when we have died */
410     opal_argv_append(&argc, &argv, "--singleton-died-pipe");
411     asprintf(&param, "%d", death_pipe[0]);
412     opal_argv_append(&argc, &argv, param);
413     free(param);
414 
415     /* add any debug flags */
416     if (orte_debug_flag) {
417         opal_argv_append(&argc, &argv, "--debug");
418     }
419 
420     if (orte_debug_daemons_flag) {
421         opal_argv_append(&argc, &argv, "--debug-daemons");
422     }
423 
424     if (orte_debug_daemons_file_flag) {
425         if (!orte_debug_daemons_flag) {
426             opal_argv_append(&argc, &argv, "--debug-daemons");
427         }
428         opal_argv_append(&argc, &argv, "--debug-daemons-file");
429     }
430 
431     /* indicate that it must use the novm state machine */
432     opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
433     opal_argv_append(&argc, &argv, "state_novm_select");
434     opal_argv_append(&argc, &argv, "1");
435 
436     /* direct the selection of the ess component */
437     opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
438     opal_argv_append(&argc, &argv, "ess");
439     opal_argv_append(&argc, &argv, "hnp");
440 
441     /* direct the selection of the pmix component */
442     opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
443     opal_argv_append(&argc, &argv, "pmix");
444     opal_argv_append(&argc, &argv, "^s1,s2,cray,isolated");
445 
446     /* Fork off the child */
447     orte_process_info.hnp_pid = fork();
448     if(orte_process_info.hnp_pid < 0) {
449         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
450         close(p[0]);
451         close(p[1]);
452         close(death_pipe[0]);
453         close(death_pipe[1]);
454         free(cmd);
455         opal_argv_free(argv);
456         return ORTE_ERR_SYS_LIMITS_CHILDREN;
457     }
458 
459     if (orte_process_info.hnp_pid == 0) {
460         close(p[0]);
461         close(death_pipe[1]);
462         /* I am the child - exec me */
463 
464         /* Set signal handlers back to the default.  Do this close
465            to the execve() because the event library may (and likely
466            will) reset them.  If we don't do this, the event
467            library may have left some set that, at least on some
468            OS's, don't get reset via fork() or exec().  Hence, the
469            orted could be unkillable (for example). */
470         set_handler_default(SIGTERM);
471         set_handler_default(SIGINT);
472         set_handler_default(SIGHUP);
473         set_handler_default(SIGPIPE);
474         set_handler_default(SIGCHLD);
475 
476         /* Unblock all signals, for many of the same reasons that
477            we set the default handlers, above.  This is noticable
478            on Linux where the event library blocks SIGTERM, but we
479            don't want that blocked by the orted (or, more
480            specifically, we don't want it to be blocked by the
481            orted and then inherited by the ORTE processes that it
482            forks, making them unkillable by SIGTERM). */
483         sigprocmask(0, 0, &sigs);
484         sigprocmask(SIG_UNBLOCK, &sigs, 0);
485 
486         execv(cmd, argv);
487 
488         /* if I get here, the execv failed! */
489         orte_show_help("help-ess-base.txt", "ess-base:execv-error",
490                        true, cmd, strerror(errno));
491         exit(1);
492 
493     } else {
494         int count;
495 
496         free(cmd);
497         /* I am the parent - wait to hear something back and
498          * report results
499          */
500         close(p[1]);  /* parent closes the write - orted will write its contact info to it*/
501         close(death_pipe[0]);  /* parent closes the death_pipe's read */
502         opal_argv_free(argv);
503 
504         /* setup the buffer to read the HNP's uri */
505         buffer_length = ORTE_URI_MSG_LGTH;
506         chunk = ORTE_URI_MSG_LGTH-1;
507         num_chars_read = 0;
508         orted_uri = (char*)malloc(buffer_length);
509         memset(orted_uri, 0, buffer_length);
510 
511         while (0 != (rc = read(p[0], &orted_uri[num_chars_read], chunk))) {
512             if (rc < 0 && (EAGAIN == errno || EINTR == errno)) {
513                 continue;
514             } else if (rc < 0) {
515                 num_chars_read = -1;
516                 break;
517             }
518             /* we read something - better get more */
519             num_chars_read += rc;
520             chunk -= rc;
521             if (0 == chunk) {
522                 chunk = ORTE_URI_MSG_LGTH;
523                 orted_uri = realloc((void*)orted_uri, buffer_length+chunk);
524                 memset(&orted_uri[buffer_length], 0, chunk);
525                 buffer_length += chunk;
526             }
527         }
528         close(p[0]);
529 
530         if (num_chars_read <= 0) {
531             /* we didn't get anything back - this is bad */
532             ORTE_ERROR_LOG(ORTE_ERR_HNP_COULD_NOT_START);
533             free(orted_uri);
534             return ORTE_ERR_HNP_COULD_NOT_START;
535         }
536 
537         /* parse the sysinfo from the returned info - must
538          * start from the end of the string as the uri itself
539          * can contain brackets */
540         if (NULL == (param = strrchr(orted_uri, '['))) {
541             ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
542             free(orted_uri);
543             return ORTE_ERR_COMM_FAILURE;
544         }
545         *param = '\0'; /* terminate the uri string */
546         ++param;  /* point to the start of the sysinfo */
547 
548         /* find the end of the sysinfo */
549         if (NULL == (cptr = strchr(param, ']'))) {
550             ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
551             free(orted_uri);
552             return ORTE_ERR_COMM_FAILURE;
553         }
554         *cptr = '\0';  /* terminate the sysinfo string */
555         ++cptr;  /* point to the start of the pmix uri */
556 
557         /* convert the sysinfo string */
558         if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_sysinfo(&orte_local_cpu_type,
559                                       &orte_local_cpu_model, param))) {
560             ORTE_ERROR_LOG(rc);
561             free(orted_uri);
562             return rc;
563         }
564 
565         /* save the daemon uri - we will process it later */
566         orte_process_info.my_daemon_uri = strdup(orted_uri);
567         /* likewise, since this is also the HNP, set that uri too */
568         orte_process_info.my_hnp_uri = orted_uri;
569 
570         /* split the pmix_uri into its parts */
571         argv = opal_argv_split(cptr, '*');
572         count = opal_argv_count(argv);
573         /* push each piece into the environment */
574         for (i=0; i < count; i++) {
575             char *c = strchr(argv[i], '=');
576             assert(NULL != c);
577             *c++ = '\0';
578             opal_setenv(argv[i], c, true, &environ);
579         }
580         opal_argv_free(argv);
581         added_pmix_envs = true;
582 
583         /* all done - report success */
584         return ORTE_SUCCESS;
585     }
586 }
587