1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 /* OWNER=gropp */
7
8 /* An example mpiexec program that uses a remote shell program to create
9 new processes on the selected hosts.
10
11 This code also shows how to use the pmutil routines (in ../util)
12 to provide many of the services required by mpiexec
13
14 Steps:
15 1. Read and process the command line. Build a ProcessList. (A ProcessList
16 may have one entry for a request to create n separate processes)
17
18 2. Convert the ProcessList into a ProcessTable. In the forker mpiexec,
19 this simply expands the requested number of processes into an
20 array with one entry per process. These entries contain information
21 on both the setup of the processes and the file descriptors used for
22 stdin,out,err, and for the PMI calls.
23
24 3. (Optionally) allow the forked processes to use a host:port to
25 contact this program, rather than just sharing a pipe. This allows the
26 forker to start other programs, such as debuggers.
27
28 4. Establish a signal handler for SIGCHLD. This will allow us to
29 get information about process termination; in particular, the exit
30 status.
31
32 5. Start the programs.
33
34 6. Process input from the programs; send stdin given to this process
35 to the selected processes (usually rank 0 or everyone). Handle all
36 PMI commands, including spawn. Another "input" is the expiration of the
37 specified timelimit for the run, if any.
38
39 7. Process rundown commands and handle any abnormal termination.
40
41 8. Wait for any processes to exit; gather the exit status and reason
42 for exit (if abnormal, such as signaled with SEGV or BUS)
43
44 9. Release all resources and compute the exit status for this program
45 (using one of several approaches, such as taking the maximum of the
46 exit statuses).
47
48 Special Case to support Singleton Init:
49 To support a singleton init of a process that then wants to
50 create processes with MPI_Comm_spawn(_multiple), a special form of
51 mpiexec is supported:
52
53 mpiexec -pmi_args <port> <interfacename> <securitykey> <pid>
54
55 The singleton process (in a routine in simple_pmi.c) forks a process and
56 execs mpiexec with these arguments, where port is the port to which
57 mpiexec should connect, interfacename is the name of the network interface, securitykey
58 is a place-holder for a key used by the singleton init process to verify
59 that the process connecting on the port is the one that was intended, and
60 pid is the pid of the singleton init process.
61
62 FIXME: The above has not been implemented yet.
63 */
64
65 #include "mpichconf.h"
66 #include <stdio.h>
67 #include <string.h>
68 #ifdef HAVE_UNISTD_H
69 #include <unistd.h>
70 #endif
71 #include <stdlib.h>
72
73 #include "pmutil.h"
74 #include "process.h"
75 #include "cmnargs.h"
76 #include "pmiserv.h"
77 #include "ioloop.h"
78 #include "labelout.h"
79 #include "rm.h"
80 #include "simple_pmiutil.h"
81 #include "env.h" /* MPIE_Putenv */
82 /* mpir_mem.h contains prototypes for MPL_strncpy etc. */
83 /* We no longer can use these because they are MPI device specific */
84 /* #include "mpir_mem.h" */
85
86 typedef struct {
87 PMISetup pmiinfo;
88 IOLabelSetup labelinfo;
89 } SetupInfo;
90
91 /* Forward declarations */
92 int mypreamble(void *, ProcessState *);
93 int mypostfork(void *, void *, ProcessState *);
94 int mypostamble(void *, void *, ProcessState *);
95 int myspawn(ProcessWorld *, void *);
96
97 static int AddEnvSetToCmdLine(const char *, const char *, const char **);
98
/* When nonzero, main() prints the failure reason for each process whenever
 * the computed exit status is nonzero.  (Failure reasons are always printed
 * when a process was signaled, regardless of this flag.) */
static int printFailure = 0;

/* Size of the buffer that receives the PMI listener port name; a definition
 * supplied before this point takes precedence */
#if !defined(MAX_PORT_STRING)
#define MAX_PORT_STRING 1024
#endif
106
107 /* Note that envp is common but not standard */
main(int argc,char * argv[],char * envp[])108 int main(int argc, char *argv[], char *envp[])
109 {
110 int rc;
111 int erc = 0; /* Other (exceptional) return codes */
112 int reason, signaled = 0;
113 SetupInfo s;
114 char portString[MAX_PORT_STRING];
115
116 /* MPIE_ProcessInit initializes the global pUniv */
117 MPIE_ProcessInit();
118 /* Set a default for the universe size */
119 pUniv.size = 64;
120
121 /* Set defaults for any arguments that are options. Also check the
122 * environment for special options, such as debugging. Set
123 * some defaults in pUniv */
124 MPIE_CheckEnv(&pUniv, 0, 0);
125 IOLabelCheckEnv();
126
127 /* Handle the command line arguments. Use the routine from util/cmnargs.c
128 * to fill in the universe */
129 MPIE_Args(argc, argv, &pUniv, 0, 0);
130 /* If there were any soft arguments, we need to handle them now */
131 rc = MPIE_InitWorldWithSoft(&pUniv.worlds[0], pUniv.size);
132 if (!rc) {
133 MPL_error_printf("Unable to process soft arguments\n");
134 exit(1);
135 }
136
137 if (pUniv.fromSingleton) {
138 /* The MPI process is already running. We create a simple entry
139 * for a single process rather than creating the process */
140 MPIE_SetupSingleton(&pUniv);
141 }
142
143
144 rc = MPIE_ChooseHosts(&pUniv.worlds[0], MPIE_ReadMachines, 0);
145 if (rc) {
146 MPL_error_printf("Unable to assign hosts to processes\n");
147 exit(1);
148 }
149
150 if (MPIE_Debug)
151 MPIE_PrintProcessUniverse(stdout, &pUniv);
152
153 DBG_PRINTF(("timeout_seconds = %d\n", pUniv.timeout));
154
155 /* Get the common port for creating PMI connections to the created
156 * processes */
157 rc = PMIServSetupPort(&pUniv, portString, sizeof(portString));
158 if (rc) {
159 MPL_error_printf("Unable to setup port for listener\n");
160 exit(1);
161 }
162 s.pmiinfo.portName = portString;
163
164 #ifdef USE_MPI_STAGE_EXECUTABLES
165 /* Hook for later use in staging executables */
166 if (? stageExes) {
167 rc = MPIE_StageExecutables(&pUniv.worlds[0]);
168 if (!rc)
169 ...;
170 }
171 #endif
172
173 PMIServInit(myspawn, &s);
174 s.pmiinfo.pWorld = &pUniv.worlds[0];
175 PMISetupNewGroup(pUniv.worlds[0].nProcess, 0);
176 MPIE_ForwardCommonSignals();
177 if (!pUniv.fromSingleton) {
178 MPIE_ForkProcesses(&pUniv.worlds[0], envp, mypreamble, &s, mypostfork, 0, mypostamble, 0);
179 } else {
180 /* FIXME: The singleton code goes here */
181 MPL_error_printf("Singleton init not supported\n");
182 exit(1);
183 }
184 reason = MPIE_IOLoop(pUniv.timeout);
185
186 if (reason == IOLOOP_TIMEOUT) {
187 /* Exited due to timeout. Generate an error message and
188 * terminate the children */
189 if (pUniv.timeout > 60) {
190 MPL_error_printf("Timeout of %d minutes expired; job aborted\n", pUniv.timeout / 60);
191 } else {
192 MPL_error_printf("Timeout of %d seconds expired; job aborted\n", pUniv.timeout);
193 }
194 erc = 1;
195 MPIE_KillUniverse(&pUniv);
196 }
197
198 /* Wait for all processes to exit and gather information on them.
199 * We do this through the SIGCHLD handler. We also bound the length
200 * of time that we wait to 2 seconds.
201 */
202 MPIE_WaitForProcesses(&pUniv, 2);
203
204 /* Compute the return code (max for now) */
205 rc = MPIE_ProcessGetExitStatus(&signaled);
206
207 /* Optionally provide detailed information about failed processes */
208 if ((rc && printFailure) || signaled)
209 MPIE_PrintFailureReasons(stderr);
210
211 /* If the processes exited normally (or were already gone) but we
212 * had an exceptional exit, such as a timeout, use the erc value */
213 if (!rc && erc)
214 rc = erc;
215
216 return (rc);
217 }
218
/*
 * Print an optional error message followed by the usage text, then exit.
 *
 * msg - message to print before the usage line, or NULL for none.  A
 *       newline is appended when a non-empty message does not already end
 *       with one.
 *
 * Does not return: the process exits with exit(-1).
 */
void mpiexec_usage(const char *msg)
{
    if (msg) {
        size_t len = strlen(msg);

        MPL_error_printf("%s", msg);
        /* Test the length first: msg[len - 1] on an empty string would
         * index before the start of the buffer */
        if (len > 0 && msg[len - 1] != '\n') {
            MPL_error_printf("\n");
        }
    }
    MPL_usage_printf("Usage: mpiexec %s\n", MPIE_ArgDescription());
    exit(-1);
}
230
231 /* Redirect stdout and stderr to a handler */
mypreamble(void * data,ProcessState * pState)232 int mypreamble(void *data, ProcessState * pState)
233 {
234 SetupInfo *s = (SetupInfo *) data;
235 int rc;
236
237 IOLabelSetupFDs(&s->labelinfo);
238 rc = PMISetupSockets(1, &s->pmiinfo);
239 /* We must use communication over the socket, rather than the
240 * environment, to pass initialization data */
241 pState->initWithEnv = 0;
242
243 return rc;
244 }
245
246 /* Close one side of each pipe pair and replace stdout/err with the pipes */
mypostfork(void * predata,void * data,ProcessState * pState)247 int mypostfork(void *predata, void *data, ProcessState * pState)
248 {
249 SetupInfo *s = (SetupInfo *) predata;
250 int curarg = 0;
251
252 IOLabelSetupInClient(&s->labelinfo);
253 PMISetupInClient(1, &s->pmiinfo);
254
255 /* Now, we *also* change the process state to insert the
256 * interposed remote shell routine. This is probably not
257 * where we want this in the final version (because MPIE_ExecProgram
258 * does a lot under the assumption that the started program will
259 * know what to do with new environment variables), but this
260 * will allow us to start. */
261 {
262 ProcessApp *app = pState->app;
263 const char **newargs = 0;
264 char *pmiDebugStr = 0;
265 int j;
266 char rankStr[12];
267
268 /* Insert into app->args */
269 newargs = (const char **) MPL_malloc((app->nArgs + 14 + 1) * sizeof(char *), MPL_MEM_PM);
270 if (!pState->hostname) {
271 MPL_error_printf("No hostname avaliable for %s\n", app->exename);
272 exit(1);
273 }
274
275 snprintf(rankStr, sizeof(rankStr) - 1, "%d", pState->id);
276 rankStr[12 - 1] = 0;
277 curarg = 0;
278 newargs[curarg++] = MPL_strdup("-Y");
279
280 newargs[curarg++] = pState->hostname;
281 curarg += AddEnvSetToCmdLine("PMI_PORT", s->pmiinfo.portName, newargs + curarg);
282 curarg += AddEnvSetToCmdLine("PMI_ID", rankStr, newargs + curarg);
283 pmiDebugStr = getenv("PMI_DEBUG");
284 if (pmiDebugStr) {
285 /* Use this to help debug the connection process */
286 curarg += AddEnvSetToCmdLine("PMI_DEBUG", pmiDebugStr, newargs + curarg);
287 }
288
289 newargs[curarg++] = app->exename;
290 for (j = 0; j < app->nArgs; j++) {
291 newargs[j + curarg] = app->args[j];
292 }
293 newargs[j + curarg] = 0;
294 app->exename = MPL_strdup("/usr/bin/ssh");
295
296 app->args = newargs;
297 app->nArgs += curarg;
298
299 if (MPIE_Debug) {
300 printf("cmd = %s\n", app->exename);
301 fflush(stdout);
302 printf("Number of args = %d\n", app->nArgs);
303 for (j = 0; j < app->nArgs; j++) {
304 printf("argv[%d] = %s\n", j, app->args[j]);
305 fflush(stdout);
306 }
307 }
308 }
309
310 return 0;
311 }
312
313 /* Close one side of the pipe pair and register a handler for the I/O */
mypostamble(void * predata,void * data,ProcessState * pState)314 int mypostamble(void *predata, void *data, ProcessState * pState)
315 {
316 SetupInfo *s = (SetupInfo *) predata;
317
318 IOLabelSetupFinishInServer(&s->labelinfo, pState);
319 PMISetupFinishInServer(1, &s->pmiinfo, pState);
320
321 return 0;
322 }
323
/*
 * Spawn callback registered with PMIServInit.
 *
 * Appends the new world to the end of the universe's list of worlds, makes
 * it the current world in the shared SetupInfo, marks its processes as
 * spawned through the environment, and starts them.
 *
 * pWorld - the process world to start
 * data   - the SetupInfo registered with PMIServInit
 *
 * Always returns 0.
 */
int myspawn(ProcessWorld * pWorld, void *data)
{
    SetupInfo *setup = (SetupInfo *) data;
    ProcessWorld **link;

    /* Advance to the link that terminates the list, then hang the new
     * world there */
    for (link = &pUniv.worlds; *link; link = &(*link)->nextWorld) {
    }
    *link = pWorld;

    /* Fork Processes may call a routine that receives the SetupInfo but not
     * pWorld; recording the world here lets every routine reach it */
    setup->pmiinfo.pWorld = pWorld;

    /* FIXME: This should be part of the PMI initialization in the clients */
    MPIE_Putenv(pWorld, "PMI_SPAWNED=1");

    MPIE_ForkProcesses(pWorld, 0, mypreamble, setup, mypostfork, 0, mypostamble, 0);
    return 0;
}
347
348 /* Temp test for the replacement for the simple "spawn == fork" */
349
350 /*
351 * Approach:
352 * Processes are created using a remote shell program. This requires
353 * changing the command line from
354 *
355 * a.out args ...
356 *
357 * to
358 *
359 * remshell-program remshell-args /bin/sh -c PMI_PORT=string &&
360 * export PMI_PORT && PMI_ID=rank-in-world && export PMI_ID &&
361 * a.out args
362 *
363 * (the export PMI_PORT=string syntax is not valid in all versions of sh)
364 *
365 * Using PMI_ID ensures that we correctly identify each process (this was
366 * a major problem in the setup used by the p4 device in MPICH1).
367 * Using environment variables instead of command line arguments keeps
368 * the command line clean.
369 *
370 * Two alternatives should be considered
371 * 1) Use an intermediate manager. This would allow us to set up the
372 * environment as well:
373 * remshell-program remshell-args manager -port string
374 * 2) Use the secure server (even the same one as in MPICH1); then
375 * there is no remote shell command.
376 *
377 * We can handle the transformation of the command line by adding a step
378 * to the postfork routine; this is called after the fork but before the
379 * exec, and it can change the command line by making a copy of the app
380 * structure, changing the command line, and setting the pState structure
381 * to point to this new app (after the fork, these changes are visible only
382 * to the forked process).
383 *
384 * Enhancements:
385 * Allow the code to avoid the remote shell if the process is being created
386 * on the local host.
387 *
388 * Handle the use of -l username and -n options to remshell
389 * (-n makes stdin /dev/null, necessary for backgrounding).
390 * (-l username allows login to hosts where the user's username is
391 * different)
392 *
393 * Provide an option to add a backslash before any - to deal with the
394 * serious bug in the GNU inetutils remote shell programs that process
395 * *all* arguments on the remote shell command line, even those for the
396 * *program*!
397 *
398 * To best support the errcodes return from MPI_Comm_spawn,
399 * we need a way to communicate the array of error codes back to the
400 * spawn and spawn multiple commands. Query: how is that done in
401 * PMI?
402 *
403 */
404
AddEnvSetToCmdLine(const char * envName,const char * envValue,const char ** args)405 static int AddEnvSetToCmdLine(const char *envName, const char *envValue, const char **args)
406 {
407 int nArgs = 0;
408 static int useCSHFormat = -1;
409
410 /* Determine the Shell type the first time */
411 if (useCSHFormat == -1) {
412 char *shell = getenv("SHELL"), *sname;
413 if (shell) {
414 /* printf("Shell is %s\n", shell); */
415 sname = strrchr(shell, '/');
416 if (!sname)
417 sname = shell;
418 else
419 sname++;
420 /* printf("Sname is %s\n", sname); */
421 if (strcmp(sname, "bash") == 0 || strcmp(sname, "sh") == 0 || strcmp(sname, "ash") == 0)
422 useCSHFormat = 0;
423 else
424 useCSHFormat = 1;
425 } else {
426 /* Default is to assume csh (setenv) format */
427 useCSHFormat = 1;
428 }
429 }
430
431 if (useCSHFormat) {
432 args[nArgs++] = MPL_strdup("setenv");
433 args[nArgs++] = MPL_strdup(envName);
434 args[nArgs++] = MPL_strdup(envValue);
435 args[nArgs++] = MPL_strdup(";");
436 } else {
437 char tmpBuf[1024];
438 args[nArgs++] = MPL_strdup("export");
439 MPL_strncpy(tmpBuf, envName, sizeof(tmpBuf));
440 MPL_strnapp(tmpBuf, "=", sizeof(tmpBuf));
441 MPL_strnapp(tmpBuf, envValue, sizeof(tmpBuf));
442 args[nArgs++] = MPL_strdup(tmpBuf);
443 args[nArgs++] = MPL_strdup(";");
444 }
445 return nArgs;
446 }
447