1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 /* OWNER=gropp */
7 
8 /* An example mpiexec program that uses a remote shell program to create
9    new processes on the selected hosts.
10 
11    This code also shows how to use the pmutil routines (in ../util)
12    to provide many of the services required by mpiexec
13 
14    Steps:
   1. Read and process the command line.  Build a ProcessList.  (A ProcessList
16    may have one entry for a request to create n separate processes)
17 
18    2. Convert the ProcessList into a ProcessTable.  In the forker mpiexec,
19    this simply expands the requested number of processes into an
20    array with one entry per process.  These entries contain information
21    on both the setup of the processes and the file descriptors used for
22    stdin,out,err, and for the PMI calls.
23 
24    3. (Optionally) allow the forked processes to use a host:port to
25    contact this program, rather than just sharing a pipe.  This allows the
26    forker to start other programs, such as debuggers.
27 
28    4. Establish a signal handler for SIGCHLD.  This will allow us to
29    get information about process termination; in particular, the exit
30    status.
31 
32    5. Start the programs.
33 
34    6. Process input from the programs; send stdin given to this process
35    to the selected processes (usually rank 0 or everyone).  Handle all
36    PMI commands, including spawn.  Another "input" is the expiration of the
37    specified timelimit for the run, if any.
38 
39    7. Process rundown commands and handle any abnormal termination.
40 
41    8. Wait for any processes to exit; gather the exit status and reason
42    for exit (if abnormal, such as signaled with SEGV or BUS)
43 
44    9. Release all resources and compute the exit status for this program
45    (using one of several approaches, such as taking the maximum of the
46    exit statuses).
47 
48   Special Case to support Singleton Init:
49   To support a singleton init of a process that then wants to
50   create processes with MPI_Comm_spawn(_multiple), a special form of
51   mpiexec is supported:
52 
53      mpiexec -pmi_args <port> <interfacename> <securitykey> <pid>
54 
55   The singleton process (in a routine in simple_pmi.c) forks a process and
  execs mpiexec with these arguments, where port is the port to which
57   mpiexec should connect, interfacename is the name of the network interface, securitykey
58   is a place-holder for a key used by the singleton init process to verify
59   that the process connecting on the port is the one that was intended, and
60   pid is the pid of the singleton init process.
61 
62   FIXME: The above has not been implemented yet.
63 */
64 
65 #include "mpichconf.h"
66 #include <stdio.h>
67 #include <string.h>
68 #ifdef HAVE_UNISTD_H
69 #include <unistd.h>
70 #endif
71 #include <stdlib.h>
72 
73 #include "pmutil.h"
74 #include "process.h"
75 #include "cmnargs.h"
76 #include "pmiserv.h"
77 #include "ioloop.h"
78 #include "labelout.h"
79 #include "rm.h"
80 #include "simple_pmiutil.h"
81 #include "env.h"        /* MPIE_Putenv */
82 /* mpir_mem.h contains prototypes for MPL_strncpy etc. */
83 /* We no longer can use these because they are MPI device specific */
84 /* #include "mpir_mem.h" */
85 
/* Setup state shared by the fork callbacks (mypreamble, mypostfork,
 * mypostamble) and by the spawn handler (myspawn).  main owns the single
 * instance and passes its address to PMIServInit and MPIE_ForkProcesses. */
typedef struct {
    PMISetup pmiinfo;           /* PMI setup; portName and pWorld are filled in by main/myspawn */
    IOLabelSetup labelinfo;     /* state handed to the IOLabelSetup* routines */
} SetupInfo;
90 
/* Forward declarations */
/* Callbacks handed to MPIE_ForkProcesses (see main and myspawn) */
int mypreamble(void *, ProcessState *);
int mypostfork(void *, void *, ProcessState *);
int mypostamble(void *, void *, ProcessState *);
/* Spawn handler registered with PMIServInit */
int myspawn(ProcessWorld *, void *);

/* Appends the shell words that set one environment variable to an
 * argument vector and returns the number of words appended */
static int AddEnvSetToCmdLine(const char *, const char *, const char **);

/* Set printFailure to 1 to get an explanation of the failure reason
   for each process when a process fails */
static int printFailure = 0;

/* Size of the buffer in main that receives the PMI port string; may be
 * overridden by defining MAX_PORT_STRING before this point */
#ifndef MAX_PORT_STRING
#define MAX_PORT_STRING 1024
#endif
106 
107 /* Note that envp is common but not standard */
main(int argc,char * argv[],char * envp[])108 int main(int argc, char *argv[], char *envp[])
109 {
110     int rc;
111     int erc = 0;                /* Other (exceptional) return codes */
112     int reason, signaled = 0;
113     SetupInfo s;
114     char portString[MAX_PORT_STRING];
115 
116     /* MPIE_ProcessInit initializes the global pUniv */
117     MPIE_ProcessInit();
118     /* Set a default for the universe size */
119     pUniv.size = 64;
120 
121     /* Set defaults for any arguments that are options.  Also check the
122      * environment for special options, such as debugging.  Set
123      * some defaults in pUniv */
124     MPIE_CheckEnv(&pUniv, 0, 0);
125     IOLabelCheckEnv();
126 
127     /* Handle the command line arguments.  Use the routine from util/cmnargs.c
128      * to fill in the universe */
129     MPIE_Args(argc, argv, &pUniv, 0, 0);
130     /* If there were any soft arguments, we need to handle them now */
131     rc = MPIE_InitWorldWithSoft(&pUniv.worlds[0], pUniv.size);
132     if (!rc) {
133         MPL_error_printf("Unable to process soft arguments\n");
134         exit(1);
135     }
136 
137     if (pUniv.fromSingleton) {
138         /* The MPI process is already running.  We create a simple entry
139          * for a single process rather than creating the process */
140         MPIE_SetupSingleton(&pUniv);
141     }
142 
143 
144     rc = MPIE_ChooseHosts(&pUniv.worlds[0], MPIE_ReadMachines, 0);
145     if (rc) {
146         MPL_error_printf("Unable to assign hosts to processes\n");
147         exit(1);
148     }
149 
150     if (MPIE_Debug)
151         MPIE_PrintProcessUniverse(stdout, &pUniv);
152 
153     DBG_PRINTF(("timeout_seconds = %d\n", pUniv.timeout));
154 
155     /* Get the common port for creating PMI connections to the created
156      * processes */
157     rc = PMIServSetupPort(&pUniv, portString, sizeof(portString));
158     if (rc) {
159         MPL_error_printf("Unable to setup port for listener\n");
160         exit(1);
161     }
162     s.pmiinfo.portName = portString;
163 
164 #ifdef USE_MPI_STAGE_EXECUTABLES
165     /* Hook for later use in staging executables */
166     if (? stageExes) {
167         rc = MPIE_StageExecutables(&pUniv.worlds[0]);
168         if (!rc)
169             ...;
170     }
171 #endif
172 
173     PMIServInit(myspawn, &s);
174     s.pmiinfo.pWorld = &pUniv.worlds[0];
175     PMISetupNewGroup(pUniv.worlds[0].nProcess, 0);
176     MPIE_ForwardCommonSignals();
177     if (!pUniv.fromSingleton) {
178         MPIE_ForkProcesses(&pUniv.worlds[0], envp, mypreamble, &s, mypostfork, 0, mypostamble, 0);
179     } else {
180         /* FIXME: The singleton code goes here */
181         MPL_error_printf("Singleton init not supported\n");
182         exit(1);
183     }
184     reason = MPIE_IOLoop(pUniv.timeout);
185 
186     if (reason == IOLOOP_TIMEOUT) {
187         /* Exited due to timeout.  Generate an error message and
188          * terminate the children */
189         if (pUniv.timeout > 60) {
190             MPL_error_printf("Timeout of %d minutes expired; job aborted\n", pUniv.timeout / 60);
191         } else {
192             MPL_error_printf("Timeout of %d seconds expired; job aborted\n", pUniv.timeout);
193         }
194         erc = 1;
195         MPIE_KillUniverse(&pUniv);
196     }
197 
198     /* Wait for all processes to exit and gather information on them.
199      * We do this through the SIGCHLD handler. We also bound the length
200      * of time that we wait to 2 seconds.
201      */
202     MPIE_WaitForProcesses(&pUniv, 2);
203 
204     /* Compute the return code (max for now) */
205     rc = MPIE_ProcessGetExitStatus(&signaled);
206 
207     /* Optionally provide detailed information about failed processes */
208     if ((rc && printFailure) || signaled)
209         MPIE_PrintFailureReasons(stderr);
210 
211     /* If the processes exited normally (or were already gone) but we
212      * had an exceptional exit, such as a timeout, use the erc value */
213     if (!rc && erc)
214         rc = erc;
215 
216     return (rc);
217 }
218 
/*
 * Print an optional message followed by the mpiexec usage line, then exit.
 *
 * msg - text printed before the usage line, or NULL for none.  A newline
 *       is appended when a non-empty msg does not already end with one.
 *
 * Does not return: the process always terminates through exit(-1).
 */
void mpiexec_usage(const char *msg)
{
    if (msg) {
        size_t len = strlen(msg);

        MPL_error_printf("%s", msg);
        /* Only look at the last character when there is one: for an
         * empty string msg[len - 1] would index before the buffer */
        if (len > 0 && msg[len - 1] != '\n') {
            MPL_error_printf("\n");
        }
    }
    MPL_usage_printf("Usage: mpiexec %s\n", MPIE_ArgDescription());
    exit(-1);
}
230 
231 /* Redirect stdout and stderr to a handler */
mypreamble(void * data,ProcessState * pState)232 int mypreamble(void *data, ProcessState * pState)
233 {
234     SetupInfo *s = (SetupInfo *) data;
235     int rc;
236 
237     IOLabelSetupFDs(&s->labelinfo);
238     rc = PMISetupSockets(1, &s->pmiinfo);
239     /* We must use communication over the socket, rather than the
240      * environment, to pass initialization data */
241     pState->initWithEnv = 0;
242 
243     return rc;
244 }
245 
246 /* Close one side of each pipe pair and replace stdout/err with the pipes */
mypostfork(void * predata,void * data,ProcessState * pState)247 int mypostfork(void *predata, void *data, ProcessState * pState)
248 {
249     SetupInfo *s = (SetupInfo *) predata;
250     int curarg = 0;
251 
252     IOLabelSetupInClient(&s->labelinfo);
253     PMISetupInClient(1, &s->pmiinfo);
254 
255     /* Now, we *also* change the process state to insert the
256      * interposed remote shell routine.  This is probably not
257      * where we want this in the final version (because MPIE_ExecProgram
258      * does a lot under the assumption that the started program will
259      * know what to do with new environment variables), but this
260      * will allow us to start. */
261     {
262         ProcessApp *app = pState->app;
263         const char **newargs = 0;
264         char *pmiDebugStr = 0;
265         int j;
266         char rankStr[12];
267 
268         /* Insert into app->args */
269         newargs = (const char **) MPL_malloc((app->nArgs + 14 + 1) * sizeof(char *), MPL_MEM_PM);
270         if (!pState->hostname) {
271             MPL_error_printf("No hostname avaliable for %s\n", app->exename);
272             exit(1);
273         }
274 
275         snprintf(rankStr, sizeof(rankStr) - 1, "%d", pState->id);
276         rankStr[12 - 1] = 0;
277         curarg = 0;
278         newargs[curarg++] = MPL_strdup("-Y");
279 
280         newargs[curarg++] = pState->hostname;
281         curarg += AddEnvSetToCmdLine("PMI_PORT", s->pmiinfo.portName, newargs + curarg);
282         curarg += AddEnvSetToCmdLine("PMI_ID", rankStr, newargs + curarg);
283         pmiDebugStr = getenv("PMI_DEBUG");
284         if (pmiDebugStr) {
285             /* Use this to help debug the connection process */
286             curarg += AddEnvSetToCmdLine("PMI_DEBUG", pmiDebugStr, newargs + curarg);
287         }
288 
289         newargs[curarg++] = app->exename;
290         for (j = 0; j < app->nArgs; j++) {
291             newargs[j + curarg] = app->args[j];
292         }
293         newargs[j + curarg] = 0;
294         app->exename = MPL_strdup("/usr/bin/ssh");
295 
296         app->args = newargs;
297         app->nArgs += curarg;
298 
299         if (MPIE_Debug) {
300             printf("cmd = %s\n", app->exename);
301             fflush(stdout);
302             printf("Number of args = %d\n", app->nArgs);
303             for (j = 0; j < app->nArgs; j++) {
304                 printf("argv[%d] = %s\n", j, app->args[j]);
305                 fflush(stdout);
306             }
307         }
308     }
309 
310     return 0;
311 }
312 
313 /* Close one side of the pipe pair and register a handler for the I/O */
mypostamble(void * predata,void * data,ProcessState * pState)314 int mypostamble(void *predata, void *data, ProcessState * pState)
315 {
316     SetupInfo *s = (SetupInfo *) predata;
317 
318     IOLabelSetupFinishInServer(&s->labelinfo, pState);
319     PMISetupFinishInServer(1, &s->pmiinfo, pState);
320 
321     return 0;
322 }
323 
/*
 * Spawn handler registered with PMIServInit: append a new process world
 * to the universe and start its processes.
 *
 * pWorld - the world to add and start
 * data   - the SetupInfo that was passed to PMIServInit
 *
 * Always returns 0.
 */
int myspawn(ProcessWorld * pWorld, void *data)
{
    SetupInfo *setup = data;
    ProcessWorld **tail;

    /* Walk the links until the one that ends the list of worlds, then
     * hang the new world on it */
    for (tail = &pUniv.worlds; *tail; tail = &(*tail)->nextWorld) {
    }
    *tail = pWorld;

    /* MPIE_ForkProcesses may call a routine that receives the setup data
     * but not pWorld; recording the world here lets every such routine
     * reach the current world */
    setup->pmiinfo.pWorld = pWorld;

    /* FIXME: This should be part of the PMI initialization in the clients */
    MPIE_Putenv(pWorld, "PMI_SPAWNED=1");

    MPIE_ForkProcesses(pWorld, 0, mypreamble, setup, mypostfork, 0, mypostamble, 0);
    return 0;
}
347 
348 /* Temp test for the replacement for the simple "spawn == fork" */
349 
350 /*
351  * Approach:
352  * Processes are created using a remote shell program. This requires
353  * changing the command line from
354  *
355  *  a.out args ...
356  *
357  * to
358  *
359  * remshell-program remshell-args /bin/sh -c PMI_PORT=string &&
360  *            export PMI_PORT && PMI_ID=rank-in-world && export PMI_ID &&
361  *            a.out args
362  *
363  * (the export PMI_PORT=string syntax is not valid in all versions of sh)
364  *
365  * Using PMI_ID ensures that we correctly identify each process (this was
366  * a major problem in the setup used by the p4 device in MPICH1).
367  * Using environment variables instead of command line arguments keeps
 * the command line clean.
369  *
370  * Two alternatives should be considered
371  * 1) Use an intermediate manager.  This would allow us to set up the
372  *    environment as well:
373  *    remshell-program remshell-args manager -port string
374  * 2) Use the secure server (even the same one as in MPICH1); then
375  *    there is no remote shell command.
376  *
 * We can handle the transformation of the command line by adding a step
378  * to the postfork routine; this is called after the fork but before the
379  * exec, and it can change the command line by making a copy of the app
380  * structure, changing the command line, and setting the pState structure
 * to point to this new app (after the fork, these changes are visible only
382  * to the forked process).
383  *
384  * Enhancements:
385  * Allow the code to avoid the remote shell if the process is being created
386  * on the local host.
387  *
 * Handle the use of the -l username and -n options to remshell
389  * (-n makes stdin /dev/null, necessary for backgrounding).
390  * (-l username allows login to hosts where the user's username is
391  * different)
392  *
393  * Provide an option to add a backslash before any - to deal with the
394  * serious bug in the GNU inetutils remote shell programs that process
395  * *all* arguments on the remote shell command line, even those for the
396  * *program*!
397  *
398  * To best support the errcodes return from MPI_Comm_spawn,
399  * we need a way to communicate the array of error codes back to the
400  * spawn and spawn multiple commands.  Query: how is that done in
401  * PMI?
402  *
403  */
404 
AddEnvSetToCmdLine(const char * envName,const char * envValue,const char ** args)405 static int AddEnvSetToCmdLine(const char *envName, const char *envValue, const char **args)
406 {
407     int nArgs = 0;
408     static int useCSHFormat = -1;
409 
410     /* Determine the Shell type the first time */
411     if (useCSHFormat == -1) {
412         char *shell = getenv("SHELL"), *sname;
413         if (shell) {
414             /* printf("Shell is %s\n", shell); */
415             sname = strrchr(shell, '/');
416             if (!sname)
417                 sname = shell;
418             else
419                 sname++;
420             /* printf("Sname is %s\n", sname); */
421             if (strcmp(sname, "bash") == 0 || strcmp(sname, "sh") == 0 || strcmp(sname, "ash") == 0)
422                 useCSHFormat = 0;
423             else
424                 useCSHFormat = 1;
425         } else {
426             /* Default is to assume csh (setenv) format */
427             useCSHFormat = 1;
428         }
429     }
430 
431     if (useCSHFormat) {
432         args[nArgs++] = MPL_strdup("setenv");
433         args[nArgs++] = MPL_strdup(envName);
434         args[nArgs++] = MPL_strdup(envValue);
435         args[nArgs++] = MPL_strdup(";");
436     } else {
437         char tmpBuf[1024];
438         args[nArgs++] = MPL_strdup("export");
439         MPL_strncpy(tmpBuf, envName, sizeof(tmpBuf));
440         MPL_strnapp(tmpBuf, "=", sizeof(tmpBuf));
441         MPL_strnapp(tmpBuf, envValue, sizeof(tmpBuf));
442         args[nArgs++] = MPL_strdup(tmpBuf);
443         args[nArgs++] = MPL_strdup(";");
444     }
445     return nArgs;
446 }
447