1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "mpidimpl.h"
7
8 #define MAX_JOBID_LEN 1024
9
10 #if defined(HAVE_LIMITS_H)
11 #include <limits.h>
12 #endif
13 #if defined(HAVE_UNISTD_H)
14 #include <unistd.h>
15 #endif
16
/* Marker string used when USE_MPIDI_DBG_PRINT_VC is defined.  It starts
 * out as "?" and init_pg() replaces it with "+" when the process has a
 * parent and with "" otherwise.
 * FIXME: This does not belong here */
#ifdef USE_MPIDI_DBG_PRINT_VC
char *MPIDI_DBG_parent_str = "?";
#endif
21
22 /* FIXME: the PMI init function should ONLY do the PMI operations, not the
23 process group or bc operations. These should be in a separate routine */
24 #ifdef USE_PMI2_API
25 #include "pmi2.h"
26 #else
27 #include "pmi.h"
28 #endif
29
30 #include "datatype.h"
31
32 static int init_pg(int *has_parent, int *pg_rank_p, MPIDI_PG_t **pg_p);
33 static int pg_compare_ids(void * id1, void * id2);
34 static int pg_destroy(MPIDI_PG_t * pg );
35
36 MPIDI_Process_t MPIDI_Process = { NULL };
37 MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool = NULL;
38 MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns = { NULL };
39 MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks = { NULL };
40 MPIDI_CH3U_Win_pkt_ordering_t MPIDI_CH3U_Win_pkt_orderings = { 0 };
41
42 #if defined(MPL_USE_DBG_LOGGING)
43 MPL_dbg_class MPIDI_CH3_DBG_CONNECT;
44 MPL_dbg_class MPIDI_CH3_DBG_DISCONNECT;
45 MPL_dbg_class MPIDI_CH3_DBG_PROGRESS;
46 MPL_dbg_class MPIDI_CH3_DBG_CHANNEL;
47 MPL_dbg_class MPIDI_CH3_DBG_OTHER;
48 MPL_dbg_class MPIDI_CH3_DBG_MSG;
49 MPL_dbg_class MPIDI_CH3_DBG_VC;
50 MPL_dbg_class MPIDI_CH3_DBG_REFCOUNT;
51 #endif /* MPL_USE_DBG_LOGGING */
52
finalize_failed_procs_group(void * param)53 static int finalize_failed_procs_group(void *param)
54 {
55 int mpi_errno = MPI_SUCCESS;
56 if (MPIDI_Failed_procs_group != MPIR_Group_empty) {
57 mpi_errno = MPIR_Group_free_impl(MPIDI_Failed_procs_group);
58 MPIR_ERR_CHECK(mpi_errno);
59 }
60
61 fn_fail:
62 return mpi_errno;
63 }
64
MPID_Init(int requested,int * provided)65 int MPID_Init(int requested, int *provided)
66 {
67 int pmi_errno;
68 int mpi_errno = MPI_SUCCESS;
69 int has_parent;
70 MPIDI_PG_t * pg=NULL;
71 int pg_rank=-1;
72 int pg_size;
73 MPIR_Comm * comm;
74 int p;
75 int val;
76 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_INIT);
77
78 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_INIT);
79
80 if (MPICH_THREAD_LEVEL >= requested)
81 *provided = requested;
82 else
83 *provided = MPICH_THREAD_LEVEL;
84
85 /* initialization routine for ch3u_comm.c */
86 mpi_errno = MPIDI_CH3I_Comm_init();
87 MPIR_ERR_CHECK(mpi_errno);
88
89 /* init group of failed processes, and set finalize callback */
90 MPIDI_Failed_procs_group = MPIR_Group_empty;
91 MPIR_Add_finalize(finalize_failed_procs_group, NULL, MPIR_FINALIZE_CALLBACK_PRIO-1);
92
93 /* Create the string that will cache the last group of failed processes
94 * we received from PMI */
95 #ifdef USE_PMI2_API
96 MPIDI_failed_procs_string = MPL_malloc(sizeof(char) * PMI2_MAX_VALLEN, MPL_MEM_STRINGS);
97 #else
98 pmi_errno = PMI_KVS_Get_value_length_max(&val);
99 if (pmi_errno != PMI_SUCCESS)
100 {
101 MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER,
102 "**pmi_kvs_get_value_length_max",
103 "**pmi_kvs_get_value_length_max %d", pmi_errno);
104 }
105 MPIDI_failed_procs_string = MPL_malloc(sizeof(char) * (val+1), MPL_MEM_STRINGS);
106 #endif
107
108 /*
109 * Set global process attributes. These can be overridden by the channel
110 * if necessary.
111 */
112 MPIR_Process.attrs.io = MPI_ANY_SOURCE;
113
114 /*
115 * Perform channel-independent PMI initialization
116 */
117 mpi_errno = init_pg(&has_parent, &pg_rank, &pg);
118 if (mpi_errno) {
119 MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|ch3_init");
120 }
121
122 /* FIXME: Why are pg_size and pg_rank handled differently? */
123 pg_size = MPIDI_PG_Get_size(pg);
124 MPIDI_Process.my_pg = pg; /* brad : this is rework for shared memories
125 * because they need this set earlier
126 * for getting the business card
127 */
128 MPIDI_Process.my_pg_rank = pg_rank;
129 /* FIXME: Why do we add a ref to pg here? */
130 MPIDI_PG_add_ref(pg);
131
132 /* We intentionally call this before the channel init so that the channel
133 can use the node_id info. */
134 /* Ideally this wouldn't be needed. Once we have PMIv2 support for node
135 information we should probably eliminate this function. */
136 mpi_errno = MPIDI_Populate_vc_node_ids(pg, pg_rank);
137 MPIR_ERR_CHECK(mpi_errno);
138
139 /* Initialize Window functions table with defaults, then call the channel's
140 init function. */
141 MPIDI_Win_fns_init(&MPIDI_CH3U_Win_fns);
142 MPIDI_CH3_Win_fns_init(&MPIDI_CH3U_Win_fns);
143 MPIDI_CH3_Win_hooks_init(&MPIDI_CH3U_Win_hooks);
144
145 #ifdef MPL_USE_DBG_LOGGING
146 MPIDI_CH3_DBG_CONNECT = MPL_dbg_class_alloc("CH3_CONNECT", "ch3_connect");;
147 MPIDI_CH3_DBG_DISCONNECT = MPL_dbg_class_alloc("CH3_DISCONNECT", "ch3_disconnect");
148 MPIDI_CH3_DBG_PROGRESS = MPL_dbg_class_alloc("CH3_PROGRESS", "ch3_progress");
149 MPIDI_CH3_DBG_CHANNEL = MPL_dbg_class_alloc("CH3_CHANNEL", "ch3_channel");
150 MPIDI_CH3_DBG_OTHER = MPL_dbg_class_alloc("CH3_OTHER", "ch3_other");
151 MPIDI_CH3_DBG_MSG = MPL_dbg_class_alloc("CH3_MSG", "ch3_msg");
152 MPIDI_CH3_DBG_VC = MPL_dbg_class_alloc("VC", "vc");
153 MPIDI_CH3_DBG_REFCOUNT = MPL_dbg_class_alloc("REFCOUNT", "refcount");
154 #endif /* MPL_USE_DBG_LOGGING */
155
156 /*
157 * Let the channel perform any necessary initialization
158 * The channel init should assume that PMI_Init has been called and that
159 * the basic information about the job has been extracted from PMI (e.g.,
160 * the size and rank of this process, and the process group id)
161 */
162 mpi_errno = MPIDI_CH3_Init(has_parent, pg, pg_rank);
163 if (mpi_errno != MPI_SUCCESS) {
164 MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|ch3_init");
165 }
166
167 /* setup receive queue statistics */
168 mpi_errno = MPIDI_CH3U_Recvq_init();
169 MPIR_ERR_CHECK(mpi_errno);
170
171 /* Ask channel to expose Window packet ordering. */
172 MPIDI_CH3_Win_pkt_orderings_init(&MPIDI_CH3U_Win_pkt_orderings);
173
174 /*
175 * Initialize the MPI_COMM_WORLD object
176 */
177 comm = MPIR_Process.comm_world;
178
179 comm->rank = pg_rank;
180 comm->remote_size = pg_size;
181 comm->local_size = pg_size;
182
183 mpi_errno = MPIDI_VCRT_Create(comm->remote_size, &comm->dev.vcrt);
184 if (mpi_errno != MPI_SUCCESS)
185 {
186 MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**dev|vcrt_create",
187 "**dev|vcrt_create %s", "MPI_COMM_WORLD");
188 }
189
190 /* Initialize the connection table on COMM_WORLD from the process group's
191 connection table */
192 for (p = 0; p < pg_size; p++)
193 {
194 MPIDI_VCR_Dup(&pg->vct[p], &comm->dev.vcrt->vcr_table[p]);
195 }
196
197 mpi_errno = MPIR_Comm_commit(comm);
198 MPIR_ERR_CHECK(mpi_errno);
199
200 /*
201 * Initialize the MPI_COMM_SELF object
202 */
203 comm = MPIR_Process.comm_self;
204 comm->rank = 0;
205 comm->remote_size = 1;
206 comm->local_size = 1;
207
208 mpi_errno = MPIDI_VCRT_Create(comm->remote_size, &comm->dev.vcrt);
209 if (mpi_errno != MPI_SUCCESS)
210 {
211 MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**dev|vcrt_create",
212 "**dev|vcrt_create %s", "MPI_COMM_SELF");
213 }
214
215 MPIDI_VCR_Dup(&pg->vct[pg_rank], &comm->dev.vcrt->vcr_table[0]);
216
217 mpi_errno = MPIR_Comm_commit(comm);
218 MPIR_ERR_CHECK(mpi_errno);
219
220 /* Currently, mpidpre.h always defines MPID_NEEDS_ICOMM_WORLD. */
221 #ifdef MPID_NEEDS_ICOMM_WORLD
222 /*
223 * Initialize the MPIR_ICOMM_WORLD object (an internal, private version
224 * of MPI_COMM_WORLD)
225 */
226 comm = MPIR_Process.icomm_world;
227
228 comm->rank = pg_rank;
229 comm->remote_size = pg_size;
230 comm->local_size = pg_size;
231 MPIDI_VCRT_Add_ref( MPIR_Process.comm_world->dev.vcrt );
232 comm->dev.vcrt = MPIR_Process.comm_world->dev.vcrt;
233
234 mpi_errno = MPIR_Comm_commit(comm);
235 MPIR_ERR_CHECK(mpi_errno);
236 #endif
237
238 MPIR_Process.has_parent = has_parent;
239
240 MPIR_Comm_register_hint(MPIR_COMM_HINT_EAGER_THRESH, "eager_rendezvous_threshold",
241 NULL, MPIR_COMM_HINT_TYPE_INT, 0);
242
243 mpi_errno = MPIDI_RMA_init();
244 MPIR_ERR_CHECK(mpi_errno);
245
246 fn_exit:
247 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_INIT);
248 return mpi_errno;
249
250 /* --BEGIN ERROR HANDLING-- */
251 fn_fail:
252 goto fn_exit;
253 /* --END ERROR HANDLING-- */
254 }
255
init_spawn(void)256 static int init_spawn(void)
257 {
258 int mpi_errno = MPI_SUCCESS;
259 char * parent_port;
260 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_INIT_SPAWN);
261 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_INIT_SPAWN);
262 #ifndef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
263
264 /* FIXME: To allow just the "root" process to
265 request the port and then use MPIR_Bcast_allcomm_auto to
266 distribute it to the rest of the processes,
267 we need to perform the Bcast after MPI is
268 otherwise initialized. We could do this
269 by adding another MPID call that the MPI_Init(_thread)
270 routine would make after the rest of MPI is
271 initialized, but before MPI_Init returns.
272 In fact, such a routine could be used to
273 perform various checks, including parameter
274 consistency value (e.g., all processes have the
275 same environment variable values). Alternately,
276 we could allow a few routines to operate with
277 predefined parameter choices (e.g., bcast, allreduce)
278 for the purposes of initialization. */
279 mpi_errno = MPIDI_CH3_GetParentPort(&parent_port);
280 if (mpi_errno != MPI_SUCCESS) {
281 MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
282 "**ch3|get_parent_port");
283 }
284 MPL_DBG_MSG_S(MPIDI_CH3_DBG_CONNECT,VERBOSE,"Parent port is %s", parent_port);
285
286 mpi_errno = MPID_Comm_connect(parent_port, NULL, 0, MPIR_Process.comm_world,
287 &MPIR_Process.comm_parent);
288 MPIR_ERR_CHKANDJUMP1(mpi_errno != MPI_SUCCESS, mpi_errno, MPI_ERR_OTHER,
289 "**ch3|conn_parent",
290 "**ch3|conn_parent %s", parent_port);
291
292 MPIR_Assert(MPIR_Process.comm_parent != NULL);
293 MPL_strncpy(MPIR_Process.comm_parent->name, "MPI_COMM_PARENT", MPI_MAX_OBJECT_NAME);
294
295 /* FIXME: Check that this intercommunicator gets freed in MPI_Finalize
296 if not already freed. */
297 #endif
298 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_INIT_SPAWN);
299 fn_exit:
300 return mpi_errno;
301 fn_fail:
302 goto fn_exit;
303 }
304
305 /* This allows each channel to perform final initialization after the
306 rest of MPI_Init completes. */
MPID_InitCompleted(void)307 int MPID_InitCompleted( void )
308 {
309 int mpi_errno;
310
311 if (MPIR_Process.has_parent) {
312 mpi_errno = init_spawn();
313 MPIR_ERR_CHECK(mpi_errno);
314 }
315
316 mpi_errno = MPIDI_CH3_InitCompleted();
317 MPIR_ERR_CHECK(mpi_errno);
318
319 fn_exit:
320 return mpi_errno;
321
322 /* --BEGIN ERROR HANDLING-- */
323 fn_fail:
324 goto fn_exit;
325 /* --END ERROR HANDLING-- */
326 }
327
328 /*
329 * Initialize the process group structure by using PMI calls.
330 * This routine initializes PMI and uses PMI calls to setup the
331 * process group structures.
332 *
333 */
init_pg(int * has_parent,int * pg_rank_p,MPIDI_PG_t ** pg_p)334 static int init_pg(int *has_parent, int *pg_rank_p, MPIDI_PG_t **pg_p)
335 {
336 int mpi_errno = MPI_SUCCESS;
337 int pg_rank, pg_size, appnum;
338 int usePMI=1;
339 char *pg_id;
340 MPIDI_PG_t *pg = 0;
341
342 /* See if the channel will provide the PMI values. The channel
343 is responsible for defining HAVE_CH3_PRE_INIT and providing
344 the MPIDI_CH3_Pre_init function. */
345 /* FIXME: Document this */
346 #ifdef HAVE_CH3_PRE_INIT
347 {
348 int setvals;
349 mpi_errno = MPIDI_CH3_Pre_init( &setvals, has_parent, &pg_rank,
350 &pg_size );
351 if (mpi_errno) {
352 goto fn_fail;
353 }
354 if (setvals) usePMI = 0;
355 }
356 #endif
357
358 /* If we use PMI here, make the PMI calls to get the
359 basic values. Note that systems that return setvals == true
360 do not make use of PMI for the KVS routines either (it is
361 assumed that the discover connection information through some
362 other mechanism */
363 /* FIXME: We may want to allow the channel to ifdef out the use
364 of PMI calls, or ask the channel to provide stubs that
365 return errors if the routines are in fact used */
366 if (usePMI) {
367 /*
368 * Initialize the process manangement interface (PMI),
369 * and get rank and size information about our process group
370 */
371
372 mpi_errno = MPIR_pmi_init();
373 MPIR_ERR_CHECK(mpi_errno);
374
375 *has_parent = MPIR_Process.has_parent;
376 pg_rank = MPIR_Process.rank;
377 pg_size = MPIR_Process.size;
378 appnum = MPIR_Process.appnum;
379
380 /* Note that if pmi is not availble, the value of MPI_APPNUM is
381 not set */
382 if (appnum != -1) {
383 MPIR_Process.attrs.appnum = appnum;
384 }
385
386 pg_id = MPL_strdup(MPIR_pmi_job_id());
387 }
388 else {
389 pg_id = MPL_strdup("0");
390 }
391
392 /*
393 * Initialize the process group tracking subsystem
394 */
395 mpi_errno = MPIDI_PG_Init(pg_compare_ids, pg_destroy);
396 if (mpi_errno != MPI_SUCCESS) {
397 MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**dev|pg_init");
398 }
399
400 /*
401 * Create a new structure to track the process group for our MPI_COMM_WORLD
402 */
403 mpi_errno = MPIDI_PG_Create(pg_size, pg_id, &pg);
404 if (mpi_errno != MPI_SUCCESS) {
405 MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**dev|pg_create");
406 }
407
408 /* FIXME: We can allow the channels to tell the PG how to get
409 connection information by passing the pg to the channel init routine */
410 if (usePMI) {
411 /* Tell the process group how to get connection information */
412 mpi_errno = MPIDI_PG_InitConnKVS( pg );
413 MPIR_ERR_CHECK(mpi_errno);
414 }
415
416 /* FIXME: Who is this for and where does it belong? */
417 #ifdef USE_MPIDI_DBG_PRINT_VC
418 MPIDI_DBG_parent_str = (*has_parent) ? "+" : "";
419 #endif
420
421 *pg_p = pg;
422 *pg_rank_p = pg_rank;
423
424 fn_exit:
425 return mpi_errno;
426 fn_fail:
427 /* --BEGIN ERROR HANDLING-- */
428 if (pg) {
429 MPIDI_PG_Destroy( pg );
430 }
431 goto fn_exit;
432 /* --END ERROR HANDLING-- */
433 }
434
435 /*
436 * Create the storage for the business card.
437 *
438 * The routine MPIDI_CH3I_BCFree should be called with the original
439 * value *bc_val_p . Note that the routines that set the value
440 * of the businesscard return a pointer to the first free location,
441 * so you need to remember the original location in order to free
442 * it later.
443 *
444 */
MPIDI_CH3I_BCInit(char ** bc_val_p,int * val_max_sz_p)445 int MPIDI_CH3I_BCInit( char **bc_val_p, int *val_max_sz_p )
446 {
447 int pmi_errno;
448 int mpi_errno = MPI_SUCCESS;
449 #ifdef USE_PMI2_API
450 *val_max_sz_p = PMI2_MAX_VALLEN;
451 #else
452 pmi_errno = PMI_KVS_Get_value_length_max(val_max_sz_p);
453 if (pmi_errno != PMI_SUCCESS)
454 {
455 MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,
456 "**pmi_kvs_get_value_length_max",
457 "**pmi_kvs_get_value_length_max %d", pmi_errno);
458 }
459 #endif
460 /* This memroy is returned by this routine */
461 *bc_val_p = MPL_malloc(*val_max_sz_p, MPL_MEM_ADDRESS);
462 if (*bc_val_p == NULL) {
463 MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**nomem","**nomem %d",
464 *val_max_sz_p);
465 }
466
467 /* Add a null to simplify looking at the bc */
468 **bc_val_p = 0;
469
470 fn_exit:
471 return mpi_errno;
472
473 fn_fail:
474 goto fn_exit;
475 }
476
/*
 * MPIDI_CH3I_BCFree - free the business card.
 *
 * This routine should be called once the business card is published.
 * bc_val is handed straight to MPL_free(); the return value is always 0.
 */
int MPIDI_CH3I_BCFree(char *bc_val)
{
    MPL_free(bc_val);
    return 0;
}
486
487 /* FIXME: The PG code should supply these, since it knows how the
488 pg_ids and other data are represented */
pg_compare_ids(void * id1,void * id2)489 static int pg_compare_ids(void * id1, void * id2)
490 {
491 return (strcmp((char *) id1, (char *) id2) == 0) ? TRUE : FALSE;
492 }
493
494
/*
 * pg_destroy - process group destroy callback handed to MPIDI_PG_Init().
 *
 * Frees the id stored in the process group with MPL_free(); the pg
 * structure itself is not touched here.  Always returns MPI_SUCCESS.
 */
static int pg_destroy(MPIDI_PG_t *pg)
{
    MPL_free(pg->id);
    return MPI_SUCCESS;
}
501
502