1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3  *  (C) 2004 by Argonne National Laboratory.
4  *      See COPYRIGHT in top-level directory.
5  */
6 
7 /* OWNER=gropp */
8 
9 /* An example mpiexec program that uses a remote shell program to create
10    new processes on the selected hosts.
11 
12    This code also shows how to use the pmutil routines (in ../util)
13    to provide many of the services required by mpiexec
14 
15    Steps:
   1. Read and process the command line.  Build a ProcessList.  (A ProcessList
17    may have one entry for a request to create n separate processes)
18 
19    2. Convert the ProcessList into a ProcessTable.  In the forker mpiexec,
20    this simply expands the requested number of processes into an
21    array with one entry per process.  These entries contain information
22    on both the setup of the processes and the file descriptors used for
23    stdin,out,err, and for the PMI calls.
24 
25    3. (Optionally) allow the forked processes to use a host:port to
26    contact this program, rather than just sharing a pipe.  This allows the
27    forker to start other programs, such as debuggers.
28 
29    4. Establish a signal handler for SIGCHLD.  This will allow us to
30    get information about process termination; in particular, the exit
31    status.
32 
33    5. Start the programs.
34 
35    6. Process input from the programs; send stdin given to this process
36    to the selected processes (usually rank 0 or everyone).  Handle all
37    PMI commands, including spawn.  Another "input" is the expiration of the
38    specified timelimit for the run, if any.
39 
40    7. Process rundown commands and handle any abnormal termination.
41 
42    8. Wait for any processes to exit; gather the exit status and reason
43    for exit (if abnormal, such as signaled with SEGV or BUS)
44 
45    9. Release all resources and compute the exit status for this program
46    (using one of several approaches, such as taking the maximum of the
47    exit statuses).
48 
49   Special Case to support Singleton Init:
50   To support a singleton init of a process that then wants to
51   create processes with MPI_Comm_spawn(_multiple), a special form of
52   mpiexec is supported:
53 
54      mpiexec -pmi_args <port> <interfacename> <securitykey> <pid>
55 
56   The singleton process (in a routine in simple_pmi.c) forks a process and
  execs mpiexec with these arguments, where port is the port to which
58   mpiexec should connect, interfacename is the name of the network interface
59   (BUG: may not be correctly set as mpd currently ignores it), securitykey
60   is a place-holder for a key used by the singleton init process to verify
61   that the process connecting on the port is the one that was intended, and
62   pid is the pid of the singleton init process.
63 
64   FIXME: The above has not been implemented yet.
65 */
66 
67 #include "mpichconf.h"
68 #include <stdio.h>
69 #include <string.h>
70 #ifdef HAVE_UNISTD_H
71 #include <unistd.h>
72 #endif
73 #include <stdlib.h>
74 
75 #include "pmutil.h"
76 #include "process.h"
77 #include "cmnargs.h"
78 #include "pmiserv.h"
79 #include "ioloop.h"
80 #include "labelout.h"
81 #include "rm.h"
82 #include "simple_pmiutil.h"
83 #include "env.h"             /* MPIE_Putenv */
84 /* mpimem.h contains prototypes for MPIU_Strncpy etc. */
85 /* We no longer can use these because they are MPI device specific */
86 /* #include "mpimem.h" */
87 
88 typedef struct { PMISetup pmiinfo; IOLabelSetup labelinfo; } SetupInfo;
89 
90 /* Forward declarations */
91 int mypreamble( void *, ProcessState* );
92 int mypostfork( void *, void *, ProcessState* );
93 int mypostamble( void *, void *, ProcessState* );
94 int myspawn( ProcessWorld *, void * );
95 
96 static int AddEnvSetToCmdLine( const char *, const char *, const char ** );
97 
98 /* Set printFailure to 1 to get an explanation of the failure reason
99    for each process when a process fails */
100 static int printFailure = 0;
101 
102 #ifndef MAX_PORT_STRING
103 #define MAX_PORT_STRING 1024
104 #endif
105 
106 /* Note that envp is common but not standard */
main(int argc,char * argv[],char * envp[])107 int main( int argc, char *argv[], char *envp[] )
108 {
109     int          rc;
110     int          erc = 0;  /* Other (exceptional) return codes */
111     int          reason, signaled = 0;
112     SetupInfo    s;
113     char         portString[MAX_PORT_STRING];
114 
115     /* MPIE_ProcessInit initializes the global pUniv */
116     MPIE_ProcessInit();
117     /* Set a default for the universe size */
118     pUniv.size = 64;
119 
120     /* Set defaults for any arguments that are options.  Also check the
121        environment for special options, such as debugging.  Set
122        some defaults in pUniv */
123     MPIE_CheckEnv( &pUniv, 0, 0 );
124     IOLabelCheckEnv( );
125 
126     /* Handle the command line arguments.  Use the routine from util/cmnargs.c
127        to fill in the universe */
128     MPIE_Args( argc, argv, &pUniv, 0, 0 );
129     /* If there were any soft arguments, we need to handle them now */
130     rc = MPIE_InitWorldWithSoft( &pUniv.worlds[0], pUniv.size );
131     if (!rc) {
132 	MPIU_Error_printf( "Unable to process soft arguments\n" );
133 	exit(1);
134     }
135 
136     if (pUniv.fromSingleton) {
137 	/* The MPI process is already running.  We create a simple entry
138 	   for a single process rather than creating the process */
139 	MPIE_SetupSingleton( &pUniv );
140     }
141 
142 
143     rc = MPIE_ChooseHosts( &pUniv.worlds[0], MPIE_ReadMachines, 0 );
144     if (rc) {
145 	MPIU_Error_printf( "Unable to assign hosts to processes\n" );
146 	exit(1);
147     }
148 
149     if (MPIE_Debug) MPIE_PrintProcessUniverse( stdout, &pUniv );
150 
151     DBG_PRINTF( ("timeout_seconds = %d\n", pUniv.timeout) );
152 
153     /* Get the common port for creating PMI connections to the created
154        processes */
155     rc = PMIServSetupPort( &pUniv, portString, sizeof(portString) );
156     if (rc) {
157 	MPIU_Error_printf( "Unable to setup port for listener\n" );
158 	exit(1);
159     }
160     s.pmiinfo.portName = portString;
161 
162 #ifdef USE_MPI_STAGE_EXECUTABLES
163     /* Hook for later use in staging executables */
164     if (?stageExes) {
165 	rc = MPIE_StageExecutables( &pUniv.worlds[0] );
166 	if (!rc) ...;
167     }
168 #endif
169 
170     PMIServInit(myspawn,&s);
171     s.pmiinfo.pWorld = &pUniv.worlds[0];
172     PMISetupNewGroup( pUniv.worlds[0].nProcess, 0 );
173     MPIE_ForwardCommonSignals();
174     if (!pUniv.fromSingleton) {
175 	MPIE_ForkProcesses( &pUniv.worlds[0], envp, mypreamble, &s,
176 			mypostfork, 0, mypostamble, 0 );
177     }
178     else {
179 	/* FIXME: The singleton code goes here */
180 	MPIU_Error_printf( "Singleton init not supported\n" );
181 	exit(1);
182     }
183     reason = MPIE_IOLoop( pUniv.timeout );
184 
185     if (reason == IOLOOP_TIMEOUT) {
186 	/* Exited due to timeout.  Generate an error message and
187 	   terminate the children */
188 	if (pUniv.timeout > 60) {
189 	    MPIU_Error_printf( "Timeout of %d minutes expired; job aborted\n",
190 			       pUniv.timeout / 60 );
191 	}
192 	else {
193 	    MPIU_Error_printf( "Timeout of %d seconds expired; job aborted\n",
194 			       pUniv.timeout );
195 	}
196 	erc = 1;
197 	MPIE_KillUniverse( &pUniv );
198     }
199 
200     /* Wait for all processes to exit and gather information on them.
201        We do this through the SIGCHLD handler. We also bound the length
202        of time that we wait to 2 seconds.
203     */
204     MPIE_WaitForProcesses( &pUniv, 2 );
205 
206     /* Compute the return code (max for now) */
207     rc = MPIE_ProcessGetExitStatus( &signaled );
208 
209     /* Optionally provide detailed information about failed processes */
210     if ( (rc && printFailure) || signaled)
211 	MPIE_PrintFailureReasons( stderr );
212 
213     /* If the processes exited normally (or were already gone) but we
214        had an exceptional exit, such as a timeout, use the erc value */
215     if (!rc && erc) rc = erc;
216 
217     return( rc );
218 }
219 
/*
 * Print an optional error message followed by the usage summary, then
 * terminate the program.
 *
 * msg - message to print before the usage line, or NULL for none.  A
 *       newline is appended when the message does not already end with
 *       one.  An empty message prints nothing.
 *
 * This routine does not return; it always calls exit( -1 ).
 */
void mpiexec_usage( const char *msg )
{
    if (msg) {
        size_t len = strlen( msg );

        if (len > 0) {
            /* msg is data, not a format: passing it as the format string
               would interpret any '%' it contains as a conversion */
            MPIU_Error_printf( "%s", msg );
            /* len > 0 guarantees that msg[len-1] is inside the string */
            if (msg[len-1] != '\n') {
                MPIU_Error_printf( "\n" );
            }
        }
    }
    MPIU_Usage_printf( "Usage: mpiexec %s\n", MPIE_ArgDescription() );
    exit( -1 );
}
231 
232 /* Redirect stdout and stderr to a handler */
mypreamble(void * data,ProcessState * pState)233 int mypreamble( void *data, ProcessState *pState )
234 {
235     SetupInfo *s = (SetupInfo *)data;
236     int       rc;
237 
238     IOLabelSetupFDs( &s->labelinfo );
239     rc = PMISetupSockets( 1, &s->pmiinfo );
240     /* We must use communication over the socket, rather than the
241        environment, to pass initialization data */
242     pState->initWithEnv = 0;
243 
244     return rc;
245 }
246 
247 /* Close one side of each pipe pair and replace stdout/err with the pipes */
mypostfork(void * predata,void * data,ProcessState * pState)248 int mypostfork( void *predata, void *data, ProcessState *pState )
249 {
250     SetupInfo *s = (SetupInfo *)predata;
251     int curarg=0;
252 
253     IOLabelSetupInClient( &s->labelinfo );
254     PMISetupInClient( 1, &s->pmiinfo );
255 
256     /* Now, we *also* change the process state to insert the
257        interposed remote shell routine.  This is probably not
258        where we want this in the final version (because MPIE_ExecProgram
259        does a lot under the assumption that the started program will
260        know what to do with new environment variables), but this
261        will allow us to start. */
262     {
263 	ProcessApp *app = pState->app;
264 	const char **newargs = 0;
265 	char *pmiDebugStr = 0;
266 	int j;
267 	char rankStr[12];
268 
269 	/* Insert into app->args */
270 	newargs = (const char **) MPIU_Malloc( (app->nArgs + 14 + 1) *
271 					  sizeof(char *) );
272 	if (!pState->hostname) {
273 	    MPIU_Error_printf( "No hostname avaliable for %s\n", app->exename );
274 	    exit(1);
275 	}
276 
277 	snprintf( rankStr, sizeof(rankStr)-1, "%d", pState->id );
278 	rankStr[12-1] = 0;
279 	curarg = 0;
280         newargs[curarg++] = MPIU_Strdup( "-Y" );
281 
282 	newargs[curarg++] = pState->hostname;
283 	curarg += AddEnvSetToCmdLine( "PMI_PORT", s->pmiinfo.portName,
284 				      newargs + curarg );
285 	curarg += AddEnvSetToCmdLine( "PMI_ID", rankStr, newargs + curarg );
286 	pmiDebugStr = getenv( "PMI_DEBUG" );
287 	if (pmiDebugStr) {
288 	    /* Use this to help debug the connection process */
289 	    curarg += AddEnvSetToCmdLine( "PMI_DEBUG", pmiDebugStr,
290 					  newargs + curarg );
291 	}
292 
293 	newargs[curarg++] = app->exename;
294 	for (j=0; j<app->nArgs; j++) {
295 	    newargs[j+curarg] = app->args[j];
296 	}
297 	newargs[j+curarg] = 0;
298 	app->exename = MPIU_Strdup( "/usr/bin/ssh" );
299 
300 	app->args = newargs;
301 	app->nArgs += curarg;
302 
303 	if (MPIE_Debug) {
304 	    printf( "cmd = %s\n", app->exename ); fflush(stdout);
305 	    printf( "Number of args = %d\n", app->nArgs );
306 	    for (j=0; j<app->nArgs; j++) {
307 		printf( "argv[%d] = %s\n", j, app->args[j] ); fflush(stdout);
308 	    }
309 	}
310     }
311 
312     return 0;
313 }
314 
315 /* Close one side of the pipe pair and register a handler for the I/O */
mypostamble(void * predata,void * data,ProcessState * pState)316 int mypostamble( void *predata, void *data, ProcessState *pState )
317 {
318     SetupInfo *s = (SetupInfo *)predata;
319 
320     IOLabelSetupFinishInServer( &s->labelinfo, pState );
321     PMISetupFinishInServer( 1, &s->pmiinfo, pState );
322 
323     return 0;
324 }
325 
/*
 * Spawn handler registered with PMIServInit: append a new world to the
 * list of worlds in pUniv and start its processes.
 *
 * pWorld - the world to add and start
 * data   - the SetupInfo that was registered with PMIServInit
 *
 * Always returns 0; the return value of MPIE_ForkProcesses is not examined.
 */
int myspawn( ProcessWorld *pWorld, void *data )
{
    SetupInfo     *setup = (SetupInfo *)data;
    ProcessWorld **link;

    /* Advance to the null link that ends the list of worlds and hook
       the new world in at that point */
    for (link = &(pUniv.worlds); *link; link = &(*link)->nextWorld)
        ;
    *link = pWorld;

    /* MPIE_ForkProcesses may call a routine that receives the setup but
       not pWorld; recording the world here lets every routine reach the
       current world */
    setup->pmiinfo.pWorld = pWorld;

    /* FIXME: This should be part of the PMI initialization in the clients */
    MPIE_Putenv( pWorld, "PMI_SPAWNED=1" );

    MPIE_ForkProcesses( pWorld, 0, mypreamble, setup,
                        mypostfork, 0, mypostamble, 0 );
    return 0;
}
350 
351 /* Temp test for the replacement for the simple "spawn == fork" */
352 
353 /*
354  * Approach:
355  * Processes are created using a remote shell program. This requires
356  * changing the command line from
357  *
358  *  a.out args ...
359  *
360  * to
361  *
362  * remshell-program remshell-args /bin/sh -c PMI_PORT=string &&
363  *            export PMI_PORT && PMI_ID=rank-in-world && export PMI_ID &&
364  *            a.out args
365  *
366  * (the export PMI_PORT=string syntax is not valid in all versions of sh)
367  *
368  * Using PMI_ID ensures that we correctly identify each process (this was
369  * a major problem in the setup used by the p4 device in MPICH1).
370  * Using environment variables instead of command line arguments keeps
 * the command line clean.
372  *
373  * Two alternatives should be considered
374  * 1) Use an intermediate manager.  This would allow us to set up the
375  *    environment as well:
376  *    remshell-program remshell-args manager -port string
 *    One possibility for the manager is the mpd manager
378  * 2) Use the secure server (even the same one as in MPICH1); then
379  *    there is no remote shell command.
380  *
 * We can handle the transformation of the command line by adding a step
 * to the postfork routine; this is called after the fork but before the
 * exec, and it can change the command line by making a copy of the app
 * structure, changing the command line, and setting the pState structure
 * to point to this new app (after the fork, these changes are visible only
 * to the forked process).
387  *
388  * Enhancements:
389  * Allow the code to avoid the remote shell if the process is being created
390  * on the local host.
391  *
 * Handle the use of -l username and -n options to remshell
393  * (-n makes stdin /dev/null, necessary for backgrounding).
394  * (-l username allows login to hosts where the user's username is
395  * different)
396  *
397  * Provide an option to add a backslash before any - to deal with the
398  * serious bug in the GNU inetutils remote shell programs that process
399  * *all* arguments on the remote shell command line, even those for the
400  * *program*!
401  *
402  * To best support the errcodes return from MPI_Comm_spawn,
403  * we need a way to communicate the array of error codes back to the
404  * spawn and spawn multiple commands.  Query: how is that done in
405  * PMI?
406  *
407  */
408 
/*
 * Append to a command line the shell words that set one environment
 * variable in the remote shell.
 *
 * The form depends on the basename of the SHELL environment variable,
 * which is examined on the first call only:
 *   bash, sh, ash      ->  export NAME=VALUE ;     (3 entries)
 *   anything else, or
 *   SHELL not set      ->  setenv NAME VALUE ;     (4 entries)
 *
 * envName  - name of the environment variable
 * envValue - value to give it
 * args     - location at which the entries are stored; the caller must
 *            provide room for at least 4 entries
 *
 * Returns the number of entries stored in args.  Each entry is a copy
 * made with MPIU_Strdup.
 */
static int AddEnvSetToCmdLine( const char *envName, const char *envValue,
                               const char **args )
{
    static int useCSHFormat = -1;
    int nArgs = 0;

    /* Determine the shell type the first time */
    if (useCSHFormat == -1) {
        const char *shell = getenv( "SHELL" );

        /* Default is to assume csh (setenv) format */
        useCSHFormat = 1;
        if (shell) {
            /* Compare only the last path component of the shell name */
            const char *sname = strrchr( shell, '/' );
            sname = sname ? sname + 1 : shell;

            /* Each comparison must test for equality (== 0); a bare
               strcmp is true for every name *except* the one given */
            if (strcmp( sname, "bash" ) == 0 ||
                strcmp( sname, "sh" ) == 0 ||
                strcmp( sname, "ash" ) == 0) {
                useCSHFormat = 0;
            }
        }
    }

    if (useCSHFormat) {
        args[nArgs++] = MPIU_Strdup( "setenv" );
        args[nArgs++] = MPIU_Strdup( envName );
        args[nArgs++] = MPIU_Strdup( envValue );
        args[nArgs++] = MPIU_Strdup( ";" );
    }
    else {
        /* Build the single word NAME=VALUE for export */
        char tmpBuf[1024];
        args[nArgs++] = MPIU_Strdup( "export" );
        MPIU_Strncpy( tmpBuf, envName, sizeof(tmpBuf) );
        MPIU_Strnapp( tmpBuf, "=", sizeof(tmpBuf) );
        MPIU_Strnapp( tmpBuf, envValue, sizeof(tmpBuf) );
        args[nArgs++] = MPIU_Strdup( tmpBuf );
        args[nArgs++] = MPIU_Strdup( ";" );
    }
    return nArgs;
}
452