/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2014 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved
 * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2006-2009 University of Houston. All rights reserved.
 * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
 * Copyright (c) 2012-2013 Inria. All rights reserved.
 * Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2016      Mellanox Technologies Ltd. All rights reserved.
 *
 * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif /* HAVE_SYS_TIME_H */
#include <pthread.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "mpi.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/threads.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "opal/util/stacktrace.h"
#include "opal/util/show_help.h"
#include "opal/runtime/opal.h"
#include "opal/mca/event/event.h"
#include "opal/mca/allocator/base/base.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/timings.h"
#include "opal/util/opal_environ.h"

#include "ompi/constants.h"
#include "ompi/mpi/fortran/base/constants.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
#include "ompi/info/info.h"
#include "ompi/errhandler/errcode.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/interlib/interlib.h"
#include "ompi/request/request.h"
#include "ompi/message/message.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
#include "ompi/mca/op/base/base.h"
#include "ompi/file/file.h"
#include "ompi/attribute/attribute.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/bml/bml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/io/io.h"
#include "ompi/mca/io/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/mca/rte/base/base.h"
#include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/dpm/dpm.h"
#include "ompi/mpiext/mpiext.h"
#include "ompi/mca/hook/base/base.h"
#include "ompi/util/timings.h"

#if OPAL_ENABLE_FT_CR == 1
#include "ompi/mca/crcp/crcp.h"
#include "ompi/mca/crcp/base/base.h"
#endif
#include "ompi/runtime/ompi_cr.h"

/* newer versions of gcc have poisoned this deprecated feature */
#ifdef HAVE___MALLOC_INITIALIZE_HOOK
#include "opal/mca/memory/base/base.h"
/* So this sucks, but with OPAL in its own library that is brought in
   implicitly from libmpi, there are times when the malloc initialize
   hook in the memory component doesn't work. So we have to do it
   from here, since any MPI code is going to call MPI_Init... */
OPAL_DECLSPEC void (*__malloc_initialize_hook) (void) =
    opal_memory_base_malloc_init_hook;
#endif

/* This is required for the boundaries of the hash tables used to store
 * the F90 types returned by the MPI_Type_create_f90_XXX functions.
 */
#include <float.h>

#if OPAL_CC_USE_PRAGMA_IDENT
#pragma ident OMPI_IDENT_STRING
#elif OPAL_CC_USE_IDENT
#ident OMPI_IDENT_STRING
#endif
const char ompi_version_string[] = OMPI_IDENT_STRING;

/*
 * Global variables and symbols for the MPI layer
 */

volatile int32_t ompi_mpi_state = OMPI_MPI_STATE_NOT_INITIALIZED;
volatile bool ompi_rte_initialized = false;

bool ompi_mpi_thread_multiple = false;
int ompi_mpi_thread_requested = MPI_THREAD_SINGLE;
int ompi_mpi_thread_provided = MPI_THREAD_SINGLE;

opal_thread_t *ompi_mpi_main_thread = NULL;

/*
 * These variables are for the MPI F08 bindings (F08 must bind Fortran
 * variables to symbols; it cannot bind Fortran variables to the
 * address of a C variable).
 */

ompi_predefined_datatype_t *ompi_mpi_character_addr = &ompi_mpi_character;
ompi_predefined_datatype_t *ompi_mpi_logical_addr = &ompi_mpi_logical;
ompi_predefined_datatype_t *ompi_mpi_logical1_addr = &ompi_mpi_logical1;
ompi_predefined_datatype_t *ompi_mpi_logical2_addr = &ompi_mpi_logical2;
ompi_predefined_datatype_t *ompi_mpi_logical4_addr = &ompi_mpi_logical4;
ompi_predefined_datatype_t *ompi_mpi_logical8_addr = &ompi_mpi_logical8;
ompi_predefined_datatype_t *ompi_mpi_integer_addr = &ompi_mpi_integer;
ompi_predefined_datatype_t *ompi_mpi_integer1_addr = &ompi_mpi_integer1;
ompi_predefined_datatype_t *ompi_mpi_integer2_addr = &ompi_mpi_integer2;
ompi_predefined_datatype_t *ompi_mpi_integer4_addr = &ompi_mpi_integer4;
ompi_predefined_datatype_t *ompi_mpi_integer8_addr = &ompi_mpi_integer8;
ompi_predefined_datatype_t *ompi_mpi_integer16_addr = &ompi_mpi_integer16;
ompi_predefined_datatype_t *ompi_mpi_real_addr = &ompi_mpi_real;
ompi_predefined_datatype_t *ompi_mpi_real4_addr = &ompi_mpi_real4;
ompi_predefined_datatype_t *ompi_mpi_real8_addr = &ompi_mpi_real8;
ompi_predefined_datatype_t *ompi_mpi_real16_addr = &ompi_mpi_real16;
ompi_predefined_datatype_t *ompi_mpi_dblprec_addr = &ompi_mpi_dblprec;
ompi_predefined_datatype_t *ompi_mpi_cplex_addr = &ompi_mpi_cplex;
ompi_predefined_datatype_t *ompi_mpi_complex8_addr = &ompi_mpi_complex8;
ompi_predefined_datatype_t *ompi_mpi_complex16_addr = &ompi_mpi_complex16;
ompi_predefined_datatype_t *ompi_mpi_complex32_addr = &ompi_mpi_complex32;
ompi_predefined_datatype_t *ompi_mpi_dblcplex_addr = &ompi_mpi_dblcplex;
ompi_predefined_datatype_t *ompi_mpi_2real_addr = &ompi_mpi_2real;
ompi_predefined_datatype_t *ompi_mpi_2dblprec_addr = &ompi_mpi_2dblprec;
ompi_predefined_datatype_t *ompi_mpi_2integer_addr = &ompi_mpi_2integer;

struct ompi_status_public_t *ompi_mpi_status_ignore_addr =
    (ompi_status_public_t *) 0;
struct ompi_status_public_t *ompi_mpi_statuses_ignore_addr =
    (ompi_status_public_t *) 0;

/*
 * These variables are here, rather than under ompi/mpi/c/foo.c
 * because it is not sufficient to have a .c file that only contains
 * variables -- you must have a function that is invoked from
 * elsewhere in the code to guarantee that all linkers will pull in
 * the .o file from the library. Hence, although these are MPI
 * constants, we might as well just define them here (i.e., in a file
 * that already has a function that is guaranteed to be linked in,
 * rather than make a new .c file with the constants and a
 * corresponding dummy function that is invoked from this function).
 *
 * Additionally, there can be/are strange linking paths such that
 * ompi_info needs symbols such as ompi_fortran_status_ignore,
 * which, if they weren't here with a collection of other global
 * symbols that are initialized (which seems to force this .o file to
 * be pulled into the resolution process, because ompi_info certainly
 * does not call ompi_mpi_init()), would not be able to be found by
 * the OSX linker.
 *
 * NOTE: See the big comment in ompi/mpi/fortran/base/constants.h
 * about why we have four symbols for each of the common blocks (e.g.,
 * the Fortran equivalent(s) of MPI_STATUS_IGNORE). Here, we can only
 * have *one* value (not four). So the only thing we can do is make
 * it equal to the fortran compiler convention that was selected at
 * configure time. Note that this is also true for the value of
 * .TRUE. from the Fortran compiler, so even though Open MPI supports
 * all four Fortran symbol conventions, it can only support one
 * convention for the two C constants (MPI_FORTRAN_STATUS[ES]_IGNORE)
 * and only support one compiler for the value of .TRUE. Ugh!!
 *
 * Note that the casts here are ok -- we're *only* comparing pointer
 * values (i.e., they'll never be de-referenced). The global symbols
 * are actually of type (ompi_fortran_common_t) (for alignment
 * issues), but MPI says that MPI_F_STATUS[ES]_IGNORE must be of type
 * (MPI_Fint*). Hence, we have to cast to make compilers not
 * complain.
 */
#if OMPI_BUILD_FORTRAN_BINDINGS
# if OMPI_FORTRAN_CAPS
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &MPI_FORTRAN_STATUS_IGNORE;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &MPI_FORTRAN_STATUSES_IGNORE;
# elif OMPI_FORTRAN_PLAIN
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore;
# elif OMPI_FORTRAN_SINGLE_UNDERSCORE
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore_;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore_;
# elif OMPI_FORTRAN_DOUBLE_UNDERSCORE
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore__;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore__;
# else
# error Unrecognized Fortran name mangling scheme
# endif
#else
MPI_Fint *MPI_F_STATUS_IGNORE = NULL;
MPI_Fint *MPI_F_STATUSES_IGNORE = NULL;
#endif /* OMPI_BUILD_FORTRAN_BINDINGS */


/* Constants for the Fortran layer. These values are referred to via
   common blocks in the Fortran equivalents. See
   ompi/mpi/fortran/base/constants.h for a more detailed explanation.

   The values are *NOT* initialized. We do not use the values of
   these constants; only their addresses (because they're always
   passed by reference by Fortran).

   Initializing these upon instantiation can reveal size and/or
   alignment differences between Fortran and C (!) which can cause
   warnings or errors upon linking (e.g., making static libraries with
   the intel 9.0 compilers on 64 bit platforms shows alignment
   differences between libmpi.a and the user's application, resulting
   in a linker warning). FWIW, if you initialize these variables in
   functions (i.e., not at the instantiation in the global scope), the
   linker somehow "figures it all out" (w.r.t. different alignments
   between Fortran common blocks and the corresponding C variables) and
   no linker warnings occur.

   Note that the rationale for the types of each of these variables is
   discussed in ompi/include/mpif-common.h. Do not change the types
   without also modifying ompi/mpi/fortran/base/constants.h and
   ompi/include/mpif-common.h.
*/

#include "mpif-c-constants.h"

/*
 * Hash tables for MPI_Type_create_f90* functions
 */
opal_hash_table_t ompi_mpi_f90_integer_hashtable = {{0}};
opal_hash_table_t ompi_mpi_f90_real_hashtable = {{0}};
opal_hash_table_t ompi_mpi_f90_complex_hashtable = {{0}};

/*
 * Per MPI-2:9.5.3, MPI_REGISTER_DATAREP is a memory leak. There is
 * no way to *de*register datareps once they've been registered. So
 * we have to track all registrations here so that they can be
 * de-registered during MPI_FINALIZE so that memory-tracking debuggers
 * don't show Open MPI as leaking memory.
 */
opal_list_t ompi_registered_datareps = {{0}};

bool ompi_enable_timing = false;
extern bool ompi_mpi_yield_when_idle;
extern int ompi_mpi_event_tick_rate;

/**
 * Static functions used to configure the interactions between the OPAL and
 * the runtime.
 */
static char*
_process_name_print_for_opal(const opal_process_name_t procname)
{
    ompi_process_name_t* rte_name = (ompi_process_name_t*)&procname;
    return OMPI_NAME_PRINT(rte_name);
}

static int
_process_name_compare(const opal_process_name_t p1, const opal_process_name_t p2)
{
    ompi_process_name_t* o1 = (ompi_process_name_t*)&p1;
    ompi_process_name_t* o2 = (ompi_process_name_t*)&p2;
    return ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, o1, o2);
}

static int _convert_string_to_process_name(opal_process_name_t *name,
                                           const char* name_string)
{
    return ompi_rte_convert_string_to_process_name(name, name_string);
}

static int _convert_process_name_to_string(char** name_string,
                                           const opal_process_name_t *name)
{
    return ompi_rte_convert_process_name_to_string(name_string, name);
}

void ompi_mpi_thread_level(int requested, int *provided)
{
    /**
     * These values are monotonic; MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED
     * < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE.
     * If possible, the call will return provided = required. Failing this,
     * the call will return the least supported level such that
     * provided > required. Finally, if the user requirement cannot be
     * satisfied, then the call will return in provided the highest
     * supported level.
     */
    ompi_mpi_thread_requested = requested;

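    /* Note: no clamping is performed here -- provided is simply set to the
       requested level; ompi_mpi_thread_multiple below just records whether
       MPI_THREAD_MULTIPLE was granted. */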
    ompi_mpi_thread_provided = *provided = requested;

    if (!ompi_mpi_main_thread) {
        ompi_mpi_main_thread = opal_thread_get_self();
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided ==
                                MPI_THREAD_MULTIPLE);
}

static int ompi_register_mca_variables(void)
{
    int ret;

    /* Register MPI variables */
    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        return ret;
    }

    /* check to see if we want timing information */
    /* TODO: enable OMPI init and OMPI finalize timings if
     * this variable was set to 1!
     */
    ompi_enable_timing = false;
    (void) mca_base_var_register("ompi", "ompi", NULL, "timing",
                                 "Request that critical timing loops be measured",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_enable_timing);

    return OMPI_SUCCESS;
}

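/* Completion callback passed to opal_pmix.fence_nb(); it clears the flag
   that OMPI_LAZY_WAIT_FOR_COMPLETION() spins on. */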
static void fence_release(int status, void *cbdata)
{
    volatile bool *active = (volatile bool*)cbdata;
    *active = false;
}

int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
                  bool reinit_ok)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    ompi_errhandler_errtrk_t errtrk;
    volatile bool active;
    opal_list_t info;
    opal_value_t *kv;

    OMPI_TIMING_INIT(32);

    ompi_hook_base_mpi_init_top(argc, argv, requested, provided);

    /* Ensure that we were not already initialized or finalized. */
    int32_t expected = OMPI_MPI_STATE_NOT_INITIALIZED;
    int32_t desired = OMPI_MPI_STATE_INIT_STARTED;
    opal_atomic_wmb();
    if (!opal_atomic_cmpset_32(&ompi_mpi_state, expected, desired)) {
        // If we failed to atomically transition ompi_mpi_state from
        // NOT_INITIALIZED to INIT_STARTED, then someone else already
        // did that, and we should return.
        if (expected >= OMPI_MPI_STATE_FINALIZE_STARTED) {
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init: already finalized", true);
            return MPI_ERR_OTHER;
        } else if (expected >= OMPI_MPI_STATE_INIT_STARTED) {
            // In some cases (e.g., oshmem_shmem_init()), we may call
            // ompi_mpi_init() multiple times. In such cases, just
            // silently return successfully once the initializing
            // thread has completed.
            if (reinit_ok) {
                while (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) {
                    usleep(1);
                }
                return MPI_SUCCESS;
            }

            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init: invoked multiple times", true);
            return MPI_ERR_OTHER;
        }
    }

    /* Figure out the final MPI thread levels. If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE. Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_level(requested, provided);

    /* Setup enough to check get/set MCA params */
    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for them. This must be done
     * early to prevent a race condition that can occur with orte_init(). */
    if (*provided != MPI_THREAD_SINGLE) {
        opal_set_using_threads(true);
    }

    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_compare_proc = _process_name_compare;
    opal_convert_string_to_process_name = _convert_string_to_process_name;
    opal_convert_process_name_to_string = _convert_process_name_to_string;
    opal_proc_for_name = ompi_proc_for_name;

    /* Register MCA variables */
    if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) {
        error = "ompi_mpi_init: ompi_register_mca_variables failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends). Per opal/event/event.c, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */

    ret = mca_base_var_find("opal", "event", "*", "event_include");
    if (ret >= 0) {
        char *allvalue = "all";
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintended
           side-effects with launching RTE tools... */
        mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    }

    /* open the ompi hook framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_hook_base_framework, 0))) {
        error = "ompi_hook_base_open() failed";
        goto error;
    }

    ompi_hook_base_mpi_init_top_post_opal(argc, argv, requested, provided);


    OMPI_TIMING_NEXT("initialization");

    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        opal_setenv("OMPI_COMMAND", argv[0], true, &environ);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        opal_setenv("OMPI_ARGV", tmp, true, &environ);
        free(tmp);
    }

    /* open the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rte_base_framework, 0))) {
        error = "ompi_rte_base_open() failed";
        goto error;
    }
    /* no select is required as this is a static framework */

    /* Setup RTE */
    if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) {
        error = "ompi_mpi_init: ompi_rte_init failed";
        goto error;
    }

    OMPI_TIMING_NEXT("rte_init");

    ompi_rte_initialized = true;

    /* Register the default errhandler callback */
    errtrk.status = OPAL_ERROR;
    errtrk.active = true;
    /* we want to go first */
    OBJ_CONSTRUCT(&info, opal_list_t);
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_EVENT_ORDER_PREPEND);
    opal_list_append(&info, &kv->super);
    /* give it a name so we can distinguish it */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
    kv->type = OPAL_STRING;
    kv->data.string = strdup("MPI-Default");
    opal_list_append(&info, &kv->super);
    opal_pmix.register_evhandler(NULL, &info, ompi_errhandler_callback,
                                 ompi_errhandler_registration_callback,
                                 (void*)&errtrk);
    OMPI_LAZY_WAIT_FOR_COMPLETION(errtrk.active);

    OPAL_LIST_DESTRUCT(&info);
    if (OPAL_SUCCESS != errtrk.status) {
        error = "Error handler registration";
        ret = errtrk.status;
        goto error;
    }

    /* declare our presence for interlib coordination, and
     * register for callbacks when other libs declare */
    if (OMPI_SUCCESS != (ret = ompi_interlib_declare(*provided, OMPI_IDENT_STRING))) {
        error = "ompi_interlib_declare";
        goto error;
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but before mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_op_base_framework, 0))) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS !=
        (ret = ompi_op_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                           ompi_mpi_thread_multiple))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_allocator_base_framework, 0))) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_rcache_base_framework, 0))) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_mpool_base_framework, 0))) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_bml_base_framework, 0))) {
        error = "mca_bml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_bml_base_init (1, ompi_mpi_thread_multiple))) {
        error = "mca_bml_base_init() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pml_base_framework, 0))) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_coll_base_framework, 0))) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_osc_base_framework, 0))) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_crcp_base_framework, 0))) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS !=
        (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS,
                                   ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    OMPI_TIMING_IMPORT_OPAL("orte_init");
    OMPI_TIMING_IMPORT_OPAL("opal_init_util");
    OMPI_TIMING_NEXT("rte_init-commit");


    /* exchange connection info - this function may also act as a barrier
     * if data exchange is required. The modex occurs solely across procs
     * in our job. If a barrier is required, the "modex" function will
     * perform it internally */
    opal_pmix.commit();
    OMPI_TIMING_NEXT("commit");

    /* If we have a non-blocking fence:
     * if we are doing an async modex, but we are collecting all
     * data, then execute the non-blocking modex in the background.
     * All calls to modex_recv will be cached until the background
     * modex completes. If collect_all_data is false, then we skip
     * the fence completely and retrieve data on-demand from the
     * source node.
     *
     * If we do not have a non-blocking fence, then we must always
     * execute the blocking fence as the system does not support
     * later data retrieval. */
    if (NULL != opal_pmix.fence_nb) {
        if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
            /* execute the fence_nb in the background to collect
             * the data */
            if (!ompi_async_mpi_init) {
                /* we are going to execute a barrier at the
                 * end of MPI_Init. We can only have ONE fence
                 * operation with the identical involved procs
                 * at a time, so we will need to wait when we
                 * get there */
                active = true;
                ret = opal_pmix.fence_nb(NULL, true, fence_release,
                                         (void*)&active);
            } else {
                ret = opal_pmix.fence_nb(NULL, true, NULL, NULL);
            }
            if (OMPI_SUCCESS != ret) {
                error = "opal_pmix.fence_nb() failed";
                goto error;
            }
        } else if (!opal_pmix_base_async_modex) {
            active = true;
            if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL,
                                    opal_pmix_collect_all_data, fence_release,
                                    (void*)&active))) {
                error = "opal_pmix.fence_nb() failed";
                goto error;
            }
            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
        }
    } else {
        if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL,
                                opal_pmix_collect_all_data))) {
            error = "opal_pmix.fence() failed";
            goto error;
        }
    }

    OMPI_TIMING_NEXT("modex");

    /* select buffered send allocator component to be used */
    if( OMPI_SUCCESS !=
        (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                            ompi_mpi_thread_multiple))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
        (ret = ompi_osc_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                            ompi_mpi_thread_multiple))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }

    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }

    /* initialize groups */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* some btls/mtls require we call add_procs with all procs in the job.
     * since the btls/mtls have no visibility here it is up to the pml to
     * convey this requirement */
    if (mca_pml_base_requires_world ()) {
        if (NULL == (procs = ompi_proc_world (&nprocs))) {
            error = "ompi_proc_world () failed";
            goto error;
        }
    } else {
        /* add all allocated ompi_proc_t's to PML (below the add_procs limit this
         * behaves identically to ompi_proc_world ()) */
        if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
            error = "ompi_proc_get_allocated () failed";
            goto error;
        }
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 ompi_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_rte_wait_for_debugger();

    /* Next timing measurement */
    OMPI_TIMING_NEXT("modex-barrier");

    /* wait for everyone to reach this point - this is a hard
     * barrier requirement at this time, though we hope to relax
     * it at a later point */
    if (!ompi_async_mpi_init) {
        /* if we executed the above fence in the background, then
         * we have to wait here for it to complete. However, there
         * is no reason to do two barriers! */
        if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
        } else {
            active = true;
            if (NULL != opal_pmix.fence_nb) {
                if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, false,
                                        fence_release, (void*)&active))) {
                    error = "opal_pmix.fence_nb() failed";
                    goto error;
                }
                OMPI_LAZY_WAIT_FOR_COMPLETION(active);
            } else {
                if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, false))) {
                    error = "opal_pmix.fence() failed";
                    goto error;
                }
            }
        }
    }

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    OMPI_TIMING_NEXT("barrier");

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations. Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code. This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on RTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif

    /* wire up the mpi interface, if requested. Do this after the
       non-block switch for non-TCP performance. Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the dynamic process management (DPM) subsystem */
    if (OMPI_SUCCESS != (ret = ompi_dpm_init())) {
        error = "ompi_dpm_init() failed";
        goto error;
    }

    /* Determine the overall threadlevel support of all processes
       in MPI_COMM_WORLD. This has to be done before calling
       coll_base_comm_select, since some of the collective components
       (e.g., hierarch) might create subcommunicators. The threadlevel
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }

    /* Init coll for the comms. This has to be done after dpm_base_select
       (since dpm.mark_dyncomm is not otherwise set in the communicator
       creation function), but before dpm.dyncom_init, since this function
       might require collectives for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }

    /* Check whether we have been spawned or not. We do this at
       the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm_dyn_init())) {
        error = "ompi_dpm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during
       opal_init, to get better latency when not using TCP. Do
       this *after* dyn_init, as dyn init uses lots of RTE
       communication and we don't want to hinder the performance of
       that code. */
    opal_progress_event_users_decrement();

    /* see if yield_when_idle was specified - if so, use it */
    opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);

    /* negative value means use default - just don't do anything */
    if (ompi_mpi_event_tick_rate >= 0) {
        opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);
    }

    /* At this point, we are fully configured and in MPI mode. Any
       communication calls here will work exactly like they would in
       the user's code. Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
 error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error && OMPI_ERR_SILENT != ret) {
            const char *err_msg = opal_strerror(ret);
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        }
        ompi_hook_base_mpi_init_error(argc, argv, requested, provided);
        OMPI_TIMING_FINALIZE;
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     * MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

    /* All done. Wasn't that simple? */
    opal_atomic_wmb();
    opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);

    /* Finish last measurement, output results
     * and clear timing structure */
    OMPI_TIMING_NEXT("barrier-finish");
    OMPI_TIMING_OUT;
    OMPI_TIMING_FINALIZE;

    ompi_hook_base_mpi_init_bottom(argc, argv, requested, provided);

    return MPI_SUCCESS;
}