1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpidimpl.h"
7 
8 #define MAX_JOBID_LEN 1024
9 
10 #if defined(HAVE_LIMITS_H)
11 #include <limits.h>
12 #endif
13 #if defined(HAVE_UNISTD_H)
14 #include <unistd.h>
15 #endif
16 
/* FIXME: This does not belong here */
/* Marker string that is only compiled in when USE_MPIDI_DBG_PRINT_VC is
 * defined.  It starts out as "?" and init_pg() (later in this file) replaces
 * it with "+" when the process has a parent and with "" otherwise.
 * NOTE(review): presumably consumed by the VC debug-print code - confirm. */
#ifdef USE_MPIDI_DBG_PRINT_VC
char *MPIDI_DBG_parent_str = "?";
#endif
21 
22 /* FIXME: the PMI init function should ONLY do the PMI operations, not the
23    process group or bc operations.  These should be in a separate routine */
24 #ifdef USE_PMI2_API
25 #include "pmi2.h"
26 #else
27 #include "pmi.h"
28 #endif
29 
30 #include "datatype.h"
31 
/* Forward declarations of file-local helpers that are defined further down in
 * this file.  pg_compare_ids and pg_destroy are handed to MPIDI_PG_Init() by
 * init_pg(). */
static int init_pg(int *has_parent, int *pg_rank_p, MPIDI_PG_t **pg_p);
static int pg_compare_ids(void * id1, void * id2);
static int pg_destroy(MPIDI_PG_t * pg );

/* Device-wide state.  MPID_Init() stores this process's process group and
 * rank in MPIDI_Process (my_pg / my_pg_rank). */
MPIDI_Process_t MPIDI_Process = { NULL };
/* Not referenced anywhere else in this file; starts out empty. */
MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool = NULL;
/* Window function table: MPID_Init() fills it with defaults via
 * MPIDI_Win_fns_init() and then lets the channel adjust it via
 * MPIDI_CH3_Win_fns_init(). */
MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns = { NULL };
/* Window hooks, filled in by MPIDI_CH3_Win_hooks_init() during MPID_Init(). */
MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks = { NULL };
/* Window packet orderings, filled in by MPIDI_CH3_Win_pkt_orderings_init()
 * after the channel has been initialized in MPID_Init(). */
MPIDI_CH3U_Win_pkt_ordering_t MPIDI_CH3U_Win_pkt_orderings = { 0 };
41 
/* Debug logging classes used by the CH3 device.  They only exist when
 * MPL_USE_DBG_LOGGING is defined; MPID_Init() assigns each one from
 * MPL_dbg_class_alloc() before the channel is initialized. */
#if defined(MPL_USE_DBG_LOGGING)
MPL_dbg_class MPIDI_CH3_DBG_CONNECT;
MPL_dbg_class MPIDI_CH3_DBG_DISCONNECT;
MPL_dbg_class MPIDI_CH3_DBG_PROGRESS;
MPL_dbg_class MPIDI_CH3_DBG_CHANNEL;
MPL_dbg_class MPIDI_CH3_DBG_OTHER;
MPL_dbg_class MPIDI_CH3_DBG_MSG;
MPL_dbg_class MPIDI_CH3_DBG_VC;
MPL_dbg_class MPIDI_CH3_DBG_REFCOUNT;
#endif /* MPL_USE_DBG_LOGGING */
52 
finalize_failed_procs_group(void * param)53 static int finalize_failed_procs_group(void *param)
54 {
55     int mpi_errno = MPI_SUCCESS;
56     if (MPIDI_Failed_procs_group != MPIR_Group_empty) {
57         mpi_errno = MPIR_Group_free_impl(MPIDI_Failed_procs_group);
58         MPIR_ERR_CHECK(mpi_errno);
59     }
60 
61  fn_fail:
62     return mpi_errno;
63 }
64 
MPID_Init(int requested,int * provided)65 int MPID_Init(int requested, int *provided)
66 {
67     int pmi_errno;
68     int mpi_errno = MPI_SUCCESS;
69     int has_parent;
70     MPIDI_PG_t * pg=NULL;
71     int pg_rank=-1;
72     int pg_size;
73     MPIR_Comm * comm;
74     int p;
75     int val;
76     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_INIT);
77 
78     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_INIT);
79 
80     if (MPICH_THREAD_LEVEL >= requested)
81         *provided = requested;
82     else
83         *provided = MPICH_THREAD_LEVEL;
84 
85     /* initialization routine for ch3u_comm.c */
86     mpi_errno = MPIDI_CH3I_Comm_init();
87     MPIR_ERR_CHECK(mpi_errno);
88 
89     /* init group of failed processes, and set finalize callback */
90     MPIDI_Failed_procs_group = MPIR_Group_empty;
91     MPIR_Add_finalize(finalize_failed_procs_group, NULL, MPIR_FINALIZE_CALLBACK_PRIO-1);
92 
93     /* Create the string that will cache the last group of failed processes
94      * we received from PMI */
95 #ifdef USE_PMI2_API
96     MPIDI_failed_procs_string = MPL_malloc(sizeof(char) * PMI2_MAX_VALLEN, MPL_MEM_STRINGS);
97 #else
98     pmi_errno = PMI_KVS_Get_value_length_max(&val);
99     if (pmi_errno != PMI_SUCCESS)
100     {
101         MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
102                              "**pmi_kvs_get_value_length_max",
103                              "**pmi_kvs_get_value_length_max %d", pmi_errno);
104     }
105     MPIDI_failed_procs_string = MPL_malloc(sizeof(char) * (val+1), MPL_MEM_STRINGS);
106 #endif
107 
108     /*
109      * Set global process attributes.  These can be overridden by the channel
110      * if necessary.
111      */
112     MPIR_Process.attrs.io = MPI_ANY_SOURCE;
113 
114     /*
115      * Perform channel-independent PMI initialization
116      */
117     mpi_errno = init_pg(&has_parent, &pg_rank, &pg);
118     if (mpi_errno) {
119 	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|ch3_init");
120     }
121 
122     /* FIXME: Why are pg_size and pg_rank handled differently? */
123     pg_size = MPIDI_PG_Get_size(pg);
124     MPIDI_Process.my_pg = pg;  /* brad : this is rework for shared memories
125 				* because they need this set earlier
126                                 * for getting the business card
127                                 */
128     MPIDI_Process.my_pg_rank = pg_rank;
129     /* FIXME: Why do we add a ref to pg here? */
130     MPIDI_PG_add_ref(pg);
131 
132     /* We intentionally call this before the channel init so that the channel
133        can use the node_id info. */
134     /* Ideally this wouldn't be needed.  Once we have PMIv2 support for node
135        information we should probably eliminate this function. */
136     mpi_errno = MPIDI_Populate_vc_node_ids(pg, pg_rank);
137     MPIR_ERR_CHECK(mpi_errno);
138 
139     /* Initialize Window functions table with defaults, then call the channel's
140        init function. */
141     MPIDI_Win_fns_init(&MPIDI_CH3U_Win_fns);
142     MPIDI_CH3_Win_fns_init(&MPIDI_CH3U_Win_fns);
143     MPIDI_CH3_Win_hooks_init(&MPIDI_CH3U_Win_hooks);
144 
145 #ifdef MPL_USE_DBG_LOGGING
146     MPIDI_CH3_DBG_CONNECT = MPL_dbg_class_alloc("CH3_CONNECT", "ch3_connect");;
147     MPIDI_CH3_DBG_DISCONNECT = MPL_dbg_class_alloc("CH3_DISCONNECT", "ch3_disconnect");
148     MPIDI_CH3_DBG_PROGRESS = MPL_dbg_class_alloc("CH3_PROGRESS", "ch3_progress");
149     MPIDI_CH3_DBG_CHANNEL = MPL_dbg_class_alloc("CH3_CHANNEL", "ch3_channel");
150     MPIDI_CH3_DBG_OTHER = MPL_dbg_class_alloc("CH3_OTHER", "ch3_other");
151     MPIDI_CH3_DBG_MSG = MPL_dbg_class_alloc("CH3_MSG", "ch3_msg");
152     MPIDI_CH3_DBG_VC = MPL_dbg_class_alloc("VC", "vc");
153     MPIDI_CH3_DBG_REFCOUNT = MPL_dbg_class_alloc("REFCOUNT", "refcount");
154 #endif /* MPL_USE_DBG_LOGGING */
155 
156     /*
157      * Let the channel perform any necessary initialization
158      * The channel init should assume that PMI_Init has been called and that
159      * the basic information about the job has been extracted from PMI (e.g.,
160      * the size and rank of this process, and the process group id)
161      */
162     mpi_errno = MPIDI_CH3_Init(has_parent, pg, pg_rank);
163     if (mpi_errno != MPI_SUCCESS) {
164 	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|ch3_init");
165     }
166 
167     /* setup receive queue statistics */
168     mpi_errno = MPIDI_CH3U_Recvq_init();
169     MPIR_ERR_CHECK(mpi_errno);
170 
171     /* Ask channel to expose Window packet ordering. */
172     MPIDI_CH3_Win_pkt_orderings_init(&MPIDI_CH3U_Win_pkt_orderings);
173 
174     /*
175      * Initialize the MPI_COMM_WORLD object
176      */
177     comm = MPIR_Process.comm_world;
178 
179     comm->rank        = pg_rank;
180     comm->remote_size = pg_size;
181     comm->local_size  = pg_size;
182 
183     mpi_errno = MPIDI_VCRT_Create(comm->remote_size, &comm->dev.vcrt);
184     if (mpi_errno != MPI_SUCCESS)
185     {
186 	MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**dev|vcrt_create",
187 			     "**dev|vcrt_create %s", "MPI_COMM_WORLD");
188     }
189 
190     /* Initialize the connection table on COMM_WORLD from the process group's
191        connection table */
192     for (p = 0; p < pg_size; p++)
193     {
194 	MPIDI_VCR_Dup(&pg->vct[p], &comm->dev.vcrt->vcr_table[p]);
195     }
196 
197     mpi_errno = MPIR_Comm_commit(comm);
198     MPIR_ERR_CHECK(mpi_errno);
199 
200     /*
201      * Initialize the MPI_COMM_SELF object
202      */
203     comm = MPIR_Process.comm_self;
204     comm->rank        = 0;
205     comm->remote_size = 1;
206     comm->local_size  = 1;
207 
208     mpi_errno = MPIDI_VCRT_Create(comm->remote_size, &comm->dev.vcrt);
209     if (mpi_errno != MPI_SUCCESS)
210     {
211 	MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**dev|vcrt_create",
212 			     "**dev|vcrt_create %s", "MPI_COMM_SELF");
213     }
214 
215     MPIDI_VCR_Dup(&pg->vct[pg_rank], &comm->dev.vcrt->vcr_table[0]);
216 
217     mpi_errno = MPIR_Comm_commit(comm);
218     MPIR_ERR_CHECK(mpi_errno);
219 
220     /* Currently, mpidpre.h always defines MPID_NEEDS_ICOMM_WORLD. */
221 #ifdef MPID_NEEDS_ICOMM_WORLD
222     /*
223      * Initialize the MPIR_ICOMM_WORLD object (an internal, private version
224      * of MPI_COMM_WORLD)
225      */
226     comm = MPIR_Process.icomm_world;
227 
228     comm->rank        = pg_rank;
229     comm->remote_size = pg_size;
230     comm->local_size  = pg_size;
231     MPIDI_VCRT_Add_ref( MPIR_Process.comm_world->dev.vcrt );
232     comm->dev.vcrt = MPIR_Process.comm_world->dev.vcrt;
233 
234     mpi_errno = MPIR_Comm_commit(comm);
235     MPIR_ERR_CHECK(mpi_errno);
236 #endif
237 
238     MPIR_Process.has_parent = has_parent;
239 
240     MPIR_Comm_register_hint(MPIR_COMM_HINT_EAGER_THRESH, "eager_rendezvous_threshold",
241                             NULL, MPIR_COMM_HINT_TYPE_INT, 0);
242 
243     mpi_errno = MPIDI_RMA_init();
244     MPIR_ERR_CHECK(mpi_errno);
245 
246   fn_exit:
247     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_INIT);
248     return mpi_errno;
249 
250     /* --BEGIN ERROR HANDLING-- */
251   fn_fail:
252     goto fn_exit;
253     /* --END ERROR HANDLING-- */
254 }
255 
init_spawn(void)256 static int init_spawn(void)
257 {
258     int mpi_errno = MPI_SUCCESS;
259     char * parent_port;
260     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_INIT_SPAWN);
261     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_INIT_SPAWN);
262 #ifndef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
263 
264     /* FIXME: To allow just the "root" process to
265        request the port and then use MPIR_Bcast_allcomm_auto to
266        distribute it to the rest of the processes,
267        we need to perform the Bcast after MPI is
268        otherwise initialized.  We could do this
269        by adding another MPID call that the MPI_Init(_thread)
270        routine would make after the rest of MPI is
271        initialized, but before MPI_Init returns.
272        In fact, such a routine could be used to
273        perform various checks, including parameter
274        consistency value (e.g., all processes have the
275        same environment variable values). Alternately,
276        we could allow a few routines to operate with
277        predefined parameter choices (e.g., bcast, allreduce)
278        for the purposes of initialization. */
279     mpi_errno = MPIDI_CH3_GetParentPort(&parent_port);
280     if (mpi_errno != MPI_SUCCESS) {
281         MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
282                             "**ch3|get_parent_port");
283     }
284     MPL_DBG_MSG_S(MPIDI_CH3_DBG_CONNECT,VERBOSE,"Parent port is %s", parent_port);
285 
286     mpi_errno = MPID_Comm_connect(parent_port, NULL, 0, MPIR_Process.comm_world,
287                                   &MPIR_Process.comm_parent);
288     MPIR_ERR_CHKANDJUMP1(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER,
289                          "**ch3|conn_parent",
290                          "**ch3|conn_parent %s", parent_port);
291 
292     MPIR_Assert(MPIR_Process.comm_parent != NULL);
293     MPL_strncpy(MPIR_Process.comm_parent->name, "MPI_COMM_PARENT", MPI_MAX_OBJECT_NAME);
294 
295     /* FIXME: Check that this intercommunicator gets freed in MPI_Finalize
296        if not already freed.  */
297 #endif
298     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_INIT_SPAWN);
299   fn_exit:
300     return mpi_errno;
301   fn_fail:
302     goto fn_exit;
303 }
304 
305 /* This allows each channel to perform final initialization after the
306  rest of MPI_Init completes.  */
MPID_InitCompleted(void)307 int MPID_InitCompleted( void )
308 {
309     int mpi_errno;
310 
311     if (MPIR_Process.has_parent) {
312         mpi_errno = init_spawn();
313         MPIR_ERR_CHECK(mpi_errno);
314     }
315 
316     mpi_errno = MPIDI_CH3_InitCompleted();
317     MPIR_ERR_CHECK(mpi_errno);
318 
319   fn_exit:
320     return mpi_errno;
321 
322     /* --BEGIN ERROR HANDLING-- */
323   fn_fail:
324     goto fn_exit;
325     /* --END ERROR HANDLING-- */
326 }
327 
328 /*
329  * Initialize the process group structure by using PMI calls.
330  * This routine initializes PMI and uses PMI calls to setup the
331  * process group structures.
332  *
333  */
init_pg(int * has_parent,int * pg_rank_p,MPIDI_PG_t ** pg_p)334 static int init_pg(int *has_parent, int *pg_rank_p, MPIDI_PG_t **pg_p)
335 {
336     int mpi_errno = MPI_SUCCESS;
337     int pg_rank, pg_size, appnum;
338     int usePMI=1;
339     char *pg_id;
340     MPIDI_PG_t *pg = 0;
341 
342     /* See if the channel will provide the PMI values.  The channel
343      is responsible for defining HAVE_CH3_PRE_INIT and providing
344     the MPIDI_CH3_Pre_init function.  */
345     /* FIXME: Document this */
346 #ifdef HAVE_CH3_PRE_INIT
347     {
348 	int setvals;
349 	mpi_errno = MPIDI_CH3_Pre_init( &setvals, has_parent, &pg_rank,
350 					&pg_size );
351 	if (mpi_errno) {
352 	    goto fn_fail;
353 	}
354 	if (setvals) usePMI = 0;
355     }
356 #endif
357 
358     /* If we use PMI here, make the PMI calls to get the
359        basic values.  Note that systems that return setvals == true
360        do not make use of PMI for the KVS routines either (it is
361        assumed that the discover connection information through some
362        other mechanism */
363     /* FIXME: We may want to allow the channel to ifdef out the use
364        of PMI calls, or ask the channel to provide stubs that
365        return errors if the routines are in fact used */
366     if (usePMI) {
367 	/*
368 	 * Initialize the process manangement interface (PMI),
369 	 * and get rank and size information about our process group
370 	 */
371 
372         mpi_errno = MPIR_pmi_init();
373         MPIR_ERR_CHECK(mpi_errno);
374 
375         *has_parent = MPIR_Process.has_parent;
376         pg_rank = MPIR_Process.rank;
377         pg_size = MPIR_Process.size;
378         appnum = MPIR_Process.appnum;
379 
380 	/* Note that if pmi is not availble, the value of MPI_APPNUM is
381 	   not set */
382 	if (appnum != -1) {
383 	    MPIR_Process.attrs.appnum = appnum;
384 	}
385 
386         pg_id = MPL_strdup(MPIR_pmi_job_id());
387     }
388     else {
389 	pg_id = MPL_strdup("0");
390     }
391 
392     /*
393      * Initialize the process group tracking subsystem
394      */
395     mpi_errno = MPIDI_PG_Init(pg_compare_ids, pg_destroy);
396     if (mpi_errno != MPI_SUCCESS) {
397 	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**dev|pg_init");
398     }
399 
400     /*
401      * Create a new structure to track the process group for our MPI_COMM_WORLD
402      */
403     mpi_errno = MPIDI_PG_Create(pg_size, pg_id, &pg);
404     if (mpi_errno != MPI_SUCCESS) {
405 	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**dev|pg_create");
406     }
407 
408     /* FIXME: We can allow the channels to tell the PG how to get
409        connection information by passing the pg to the channel init routine */
410     if (usePMI) {
411 	/* Tell the process group how to get connection information */
412         mpi_errno = MPIDI_PG_InitConnKVS( pg );
413         MPIR_ERR_CHECK(mpi_errno);
414     }
415 
416     /* FIXME: Who is this for and where does it belong? */
417 #ifdef USE_MPIDI_DBG_PRINT_VC
418     MPIDI_DBG_parent_str = (*has_parent) ? "+" : "";
419 #endif
420 
421     *pg_p      = pg;
422     *pg_rank_p = pg_rank;
423 
424  fn_exit:
425     return mpi_errno;
426  fn_fail:
427     /* --BEGIN ERROR HANDLING-- */
428     if (pg) {
429 	MPIDI_PG_Destroy( pg );
430     }
431     goto fn_exit;
432     /* --END ERROR HANDLING-- */
433 }
434 
435 /*
436  * Create the storage for the business card.
437  *
438  * The routine MPIDI_CH3I_BCFree should be called with the original
439  * value *bc_val_p .  Note that the routines that set the value
440  * of the businesscard return a pointer to the first free location,
441  * so you need to remember the original location in order to free
442  * it later.
443  *
444  */
MPIDI_CH3I_BCInit(char ** bc_val_p,int * val_max_sz_p)445 int MPIDI_CH3I_BCInit( char **bc_val_p, int *val_max_sz_p )
446 {
447     int pmi_errno;
448     int mpi_errno = MPI_SUCCESS;
449 #ifdef USE_PMI2_API
450     *val_max_sz_p = PMI2_MAX_VALLEN;
451 #else
452     pmi_errno = PMI_KVS_Get_value_length_max(val_max_sz_p);
453     if (pmi_errno != PMI_SUCCESS)
454     {
455         MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,
456                              "**pmi_kvs_get_value_length_max",
457                              "**pmi_kvs_get_value_length_max %d", pmi_errno);
458     }
459 #endif
460     /* This memroy is returned by this routine */
461     *bc_val_p = MPL_malloc(*val_max_sz_p, MPL_MEM_ADDRESS);
462     if (*bc_val_p == NULL) {
463 	MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**nomem","**nomem %d",
464 			     *val_max_sz_p);
465     }
466 
467     /* Add a null to simplify looking at the bc */
468     **bc_val_p = 0;
469 
470   fn_exit:
471     return mpi_errno;
472 
473   fn_fail:
474     goto fn_exit;
475 }
476 
/*
 * Release a business card buffer.  Intended to be called once the business
 * card has been published.
 *
 * bc_val: the buffer to release; it is passed straight to MPL_free().
 *
 * Always returns 0.
 */
int MPIDI_CH3I_BCFree( char *bc_val )
{
    const int rc = 0;

    MPL_free(bc_val);

    return rc;
}
486 
487 /* FIXME: The PG code should supply these, since it knows how the
488    pg_ids and other data are represented */
pg_compare_ids(void * id1,void * id2)489 static int pg_compare_ids(void * id1, void * id2)
490 {
491     return (strcmp((char *) id1, (char *) id2) == 0) ? TRUE : FALSE;
492 }
493 
494 
/*
 * Process-group destroy callback handed to MPIDI_PG_Init().
 * Frees the id stored in the process group (pg->id) with MPL_free() and
 * always reports MPI_SUCCESS.
 */
static int pg_destroy(MPIDI_PG_t * pg)
{
    void *id = pg->id;

    MPL_free(id);

    return MPI_SUCCESS;
}
501 
502