/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2014 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2018 Cisco Systems, Inc.  All rights reserved
 * Copyright (c) 2006-2015 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2006-2009 University of Houston. All rights reserved.
 * Copyright (c) 2008-2009 Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
 * Copyright (c) 2012-2013 Inria.  All rights reserved.
 * Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2016      Mellanox Technologies Ltd. All rights reserved.
 *
 * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif  /* HAVE_SYS_TIME_H */
#include <pthread.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "mpi.h"
#include "opal/class/opal_list.h"
#include "opal/mca/base/base.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/runtime/opal_progress.h"
#include "opal/threads/threads.h"
#include "opal/util/arch.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/error.h"
#include "opal/util/stacktrace.h"
#include "opal/util/show_help.h"
#include "opal/runtime/opal.h"
#include "opal/mca/event/event.h"
#include "opal/mca/allocator/base/base.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/timings.h"
#include "opal/util/opal_environ.h"

#include "ompi/constants.h"
#include "ompi/mpi/fortran/base/constants.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
#include "ompi/info/info.h"
#include "ompi/errhandler/errcode.h"
#include "ompi/errhandler/errhandler.h"
#include "ompi/interlib/interlib.h"
#include "ompi/request/request.h"
#include "ompi/message/message.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
#include "ompi/mca/op/base/base.h"
#include "ompi/file/file.h"
#include "ompi/attribute/attribute.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/bml/bml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/io/io.h"
#include "ompi/mca/io/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/mca/rte/base/base.h"
#include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/dpm/dpm.h"
#include "ompi/mpiext/mpiext.h"
#include "ompi/mca/hook/base/base.h"
#include "ompi/util/timings.h"

#if OPAL_ENABLE_FT_CR == 1
#include "ompi/mca/crcp/crcp.h"
#include "ompi/mca/crcp/base/base.h"
#endif
#include "ompi/runtime/ompi_cr.h"

/* newer versions of gcc have poisoned this deprecated feature */
#ifdef HAVE___MALLOC_INITIALIZE_HOOK
#include "opal/mca/memory/base/base.h"
/* So this sucks, but with OPAL in its own library that is brought in
   implicitly from libmpi, there are times when the malloc initialize
   hook in the memory component doesn't work.  So we have to do it
   from here, since any MPI code is going to call MPI_Init... */
OPAL_DECLSPEC void (*__malloc_initialize_hook) (void) =
    opal_memory_base_malloc_init_hook;
#endif

/* This is required for the boundaries of the hash tables used to store
 * the F90 types returned by the MPI_Type_create_f90_XXX functions.
 */
#include <float.h>

#if OPAL_CC_USE_PRAGMA_IDENT
#pragma ident OMPI_IDENT_STRING
#elif OPAL_CC_USE_IDENT
#ident OMPI_IDENT_STRING
#endif
const char ompi_version_string[] = OMPI_IDENT_STRING;

/*
 * Global variables and symbols for the MPI layer
 */

volatile int32_t ompi_mpi_state = OMPI_MPI_STATE_NOT_INITIALIZED;
volatile bool ompi_rte_initialized = false;

bool ompi_mpi_thread_multiple = false;
int ompi_mpi_thread_requested = MPI_THREAD_SINGLE;
int ompi_mpi_thread_provided = MPI_THREAD_SINGLE;

opal_thread_t *ompi_mpi_main_thread = NULL;

/*
 * These variables are for the MPI F08 bindings (F08 must bind Fortran
 * variables to symbols; it cannot bind Fortran variables to the
 * address of a C variable).
 */

ompi_predefined_datatype_t *ompi_mpi_character_addr = &ompi_mpi_character;
ompi_predefined_datatype_t *ompi_mpi_logical_addr   = &ompi_mpi_logical;
ompi_predefined_datatype_t *ompi_mpi_logical1_addr  = &ompi_mpi_logical1;
ompi_predefined_datatype_t *ompi_mpi_logical2_addr  = &ompi_mpi_logical2;
ompi_predefined_datatype_t *ompi_mpi_logical4_addr  = &ompi_mpi_logical4;
ompi_predefined_datatype_t *ompi_mpi_logical8_addr  = &ompi_mpi_logical8;
ompi_predefined_datatype_t *ompi_mpi_integer_addr   = &ompi_mpi_integer;
ompi_predefined_datatype_t *ompi_mpi_integer1_addr  = &ompi_mpi_integer1;
ompi_predefined_datatype_t *ompi_mpi_integer2_addr  = &ompi_mpi_integer2;
ompi_predefined_datatype_t *ompi_mpi_integer4_addr  = &ompi_mpi_integer4;
ompi_predefined_datatype_t *ompi_mpi_integer8_addr  = &ompi_mpi_integer8;
ompi_predefined_datatype_t *ompi_mpi_integer16_addr = &ompi_mpi_integer16;
ompi_predefined_datatype_t *ompi_mpi_real_addr      = &ompi_mpi_real;
ompi_predefined_datatype_t *ompi_mpi_real4_addr     = &ompi_mpi_real4;
ompi_predefined_datatype_t *ompi_mpi_real8_addr     = &ompi_mpi_real8;
ompi_predefined_datatype_t *ompi_mpi_real16_addr    = &ompi_mpi_real16;
ompi_predefined_datatype_t *ompi_mpi_dblprec_addr   = &ompi_mpi_dblprec;
ompi_predefined_datatype_t *ompi_mpi_cplex_addr     = &ompi_mpi_cplex;
ompi_predefined_datatype_t *ompi_mpi_complex8_addr  = &ompi_mpi_complex8;
ompi_predefined_datatype_t *ompi_mpi_complex16_addr = &ompi_mpi_complex16;
ompi_predefined_datatype_t *ompi_mpi_complex32_addr = &ompi_mpi_complex32;
ompi_predefined_datatype_t *ompi_mpi_dblcplex_addr  = &ompi_mpi_dblcplex;
ompi_predefined_datatype_t *ompi_mpi_2real_addr     = &ompi_mpi_2real;
ompi_predefined_datatype_t *ompi_mpi_2dblprec_addr  = &ompi_mpi_2dblprec;
ompi_predefined_datatype_t *ompi_mpi_2integer_addr  = &ompi_mpi_2integer;

struct ompi_status_public_t *ompi_mpi_status_ignore_addr =
    (ompi_status_public_t *) 0;
struct ompi_status_public_t *ompi_mpi_statuses_ignore_addr =
    (ompi_status_public_t *) 0;

/*
 * These variables are here, rather than under ompi/mpi/c/foo.c
 * because it is not sufficient to have a .c file that only contains
 * variables -- you must have a function that is invoked from
 * elsewhere in the code to guarantee that all linkers will pull in
 * the .o file from the library.  Hence, although these are MPI
 * constants, we might as well just define them here (i.e., in a file
 * that already has a function that is guaranteed to be linked in,
 * rather than make a new .c file with the constants and a
 * corresponding dummy function that is invoked from this function).
 *
 * Additionally, there can be/are strange linking paths such that
 * ompi_info needs symbols such as ompi_fortran_status_ignore,
 * which, if they weren't here with a collection of other global
 * symbols that are initialized (which seems to force this .o file to
 * be pulled into the resolution process, because ompi_info certainly
 * does not call ompi_mpi_init()), would not be able to be found by
 * the OSX linker.
 *
 * NOTE: See the big comment in ompi/mpi/fortran/base/constants.h
 * about why we have four symbols for each of the common blocks (e.g.,
 * the Fortran equivalent(s) of MPI_STATUS_IGNORE).  Here, we can only
 * have *one* value (not four).  So the only thing we can do is make
 * it equal to the fortran compiler convention that was selected at
 * configure time.  Note that this is also true for the value of
 * .TRUE. from the Fortran compiler, so even though Open MPI supports
 * all four Fortran symbol conventions, it can only support one
 * convention for the two C constants (MPI_FORTRAN_STATUS[ES]_IGNORE)
 * and only support one compiler for the value of .TRUE.  Ugh!!
 *
 * Note that the casts here are ok -- we're *only* comparing pointer
 * values (i.e., they'll never be de-referenced).  The global symbols
 * are actually of type (ompi_fortran_common_t) (for alignment
 * issues), but MPI says that MPI_F_STATUS[ES]_IGNORE must be of type
 * (MPI_Fint*).  Hence, we have to cast to make compilers not
 * complain.
 */
#if OMPI_BUILD_FORTRAN_BINDINGS
#  if OMPI_FORTRAN_CAPS
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &MPI_FORTRAN_STATUS_IGNORE;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &MPI_FORTRAN_STATUSES_IGNORE;
#  elif OMPI_FORTRAN_PLAIN
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore;
#  elif OMPI_FORTRAN_SINGLE_UNDERSCORE
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore_;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore_;
#  elif OMPI_FORTRAN_DOUBLE_UNDERSCORE
MPI_Fint *MPI_F_STATUS_IGNORE = (MPI_Fint*) &mpi_fortran_status_ignore__;
MPI_Fint *MPI_F_STATUSES_IGNORE = (MPI_Fint*) &mpi_fortran_statuses_ignore__;
#  else
#    error Unrecognized Fortran name mangling scheme
#  endif
#else
MPI_Fint *MPI_F_STATUS_IGNORE = NULL;
MPI_Fint *MPI_F_STATUSES_IGNORE = NULL;
#endif  /* OMPI_BUILD_FORTRAN_BINDINGS */


/* Constants for the Fortran layer.  These values are referred to via
   common blocks in the Fortran equivalents.  See
   ompi/mpi/fortran/base/constants.h for a more detailed explanation.

   The values are *NOT* initialized.  We do not use the values of
   these constants; only their addresses (because they're always
   passed by reference by Fortran).

   Initializing these at instantiation can reveal size and/or
   alignment differences between Fortran and C (!) which can cause
   warnings or errors upon linking (e.g., making static libraries with
   the intel 9.0 compilers on 64 bit platforms shows alignment
   differences between libmpi.a and the user's application, resulting
   in a linker warning).  FWIW, if you initialize these variables in
   functions (i.e., not at the instantiation in the global scope), the
   linker somehow "figures it all out" (w.r.t. different alignments
   between fortran common blocks and the corresponding C variables) and
   no linker warnings occur.

   Note that the rationale for the types of each of these variables is
   discussed in ompi/include/mpif-common.h.  Do not change the types
   without also modifying ompi/mpi/fortran/base/constants.h and
   ompi/include/mpif-common.h.
 */

#include "mpif-c-constants.h"

/*
 * Hash tables for MPI_Type_create_f90* functions
 */
opal_hash_table_t ompi_mpi_f90_integer_hashtable = {{0}};
opal_hash_table_t ompi_mpi_f90_real_hashtable = {{0}};
opal_hash_table_t ompi_mpi_f90_complex_hashtable = {{0}};

/*
 * Per MPI-2:9.5.3, MPI_REGISTER_DATAREP is a memory leak.  There is
 * no way to *de*register datareps once they've been registered.  So
 * we have to track all registrations here so that they can be
 * de-registered during MPI_FINALIZE so that memory-tracking debuggers
 * don't show Open MPI as leaking memory.
 */
opal_list_t ompi_registered_datareps = {{0}};
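
/* Illustrative sketch only (an assumption about the finalize side; it is
   not code used in this file): MPI_FINALIZE is expected to drain and
   destroy this list, roughly

       OPAL_LIST_DESTRUCT(&ompi_registered_datareps);

   so that memory-tracking debuggers see a balanced allocation count. */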

bool ompi_enable_timing = false;
extern bool ompi_mpi_yield_when_idle;
extern int ompi_mpi_event_tick_rate;

/**
 * Static functions used to configure the interactions between the OPAL and
 * the runtime.
 */
static char*
_process_name_print_for_opal(const opal_process_name_t procname)
{
    ompi_process_name_t* rte_name = (ompi_process_name_t*)&procname;
    return OMPI_NAME_PRINT(rte_name);
}

static int
_process_name_compare(const opal_process_name_t p1, const opal_process_name_t p2)
{
    ompi_process_name_t* o1 = (ompi_process_name_t*)&p1;
    ompi_process_name_t* o2 = (ompi_process_name_t*)&p2;
    return ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, o1, o2);
}

static int _convert_string_to_process_name(opal_process_name_t *name,
                                           const char* name_string)
{
    return ompi_rte_convert_string_to_process_name(name, name_string);
}

static int _convert_process_name_to_string(char** name_string,
                                           const opal_process_name_t *name)
{
    return ompi_rte_convert_process_name_to_string(name_string, name);
}

void ompi_mpi_thread_level(int requested, int *provided)
{
    /**
     * These values are monotonic; MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED
     *                             < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE.
     * If possible, the call will return provided = required. Failing this,
     * the call will return the least supported level such that
     * provided > required. Finally, if the user requirement cannot be
     * satisfied, then the call will return in provided the highest
     * supported level.
     */
    ompi_mpi_thread_requested = requested;

    ompi_mpi_thread_provided = *provided = requested;

    if (!ompi_mpi_main_thread) {
        ompi_mpi_main_thread = opal_thread_get_self();
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided ==
                                MPI_THREAD_MULTIPLE);
}
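
/* Illustrative only (standard MPI API usage, not part of this file's logic):
   the clamping above is what a caller observes through the public entry
   point, e.g.

       int provided;
       MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
       if (provided < MPI_THREAD_MULTIPLE) {
           // the build lacks full thread support; fall back accordingly
       }

   where argc/argv are simply the caller's own variables. */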

static int ompi_register_mca_variables(void)
{
    int ret;

    /* Register MPI variables */
    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        return ret;
    }

    /* check to see if we want timing information */
    /* TODO: enable OMPI init and OMPI finalize timings if
     * this variable was set to 1!
     */
    ompi_enable_timing = false;
    (void) mca_base_var_register("ompi", "ompi", NULL, "timing",
                                 "Request that critical timing loops be measured",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_enable_timing);

    return OMPI_SUCCESS;
}
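
/* Note (illustrative, with an assumption about the synthesized MCA variable
   name): a user would typically flip the switch registered above from the
   command line, e.g.

       mpirun --mca ompi_timing true ./a.out

   where "ompi_timing" is a guess at the full name produced by
   mca_base_var_register("ompi", "ompi", NULL, "timing", ...); consult
   "ompi_info --all" for the authoritative parameter name. */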

static void fence_release(int status, void *cbdata)
{
    volatile bool *active = (volatile bool*)cbdata;
    *active = false;
}
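
/* Illustrative sketch of how this callback is used below (simplified; the
   claim that OMPI_LAZY_WAIT_FOR_COMPLETION also drives progress while it
   spins is an assumption about that macro's internals):

       volatile bool active = true;
       opal_pmix.fence_nb(NULL, false, fence_release, (void*)&active);
       OMPI_LAZY_WAIT_FOR_COMPLETION(active);
*/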

int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
                  bool reinit_ok)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    ompi_errhandler_errtrk_t errtrk;
    volatile bool active;
    opal_list_t info;
    opal_value_t *kv;

    OMPI_TIMING_INIT(32);

    ompi_hook_base_mpi_init_top(argc, argv, requested, provided);

    /* Ensure that we were not already initialized or finalized. */
    int32_t expected = OMPI_MPI_STATE_NOT_INITIALIZED;
    int32_t desired  = OMPI_MPI_STATE_INIT_STARTED;
    opal_atomic_wmb();
    if (!opal_atomic_cmpset_32(&ompi_mpi_state, expected, desired)) {
        // If we failed to atomically transition ompi_mpi_state from
        // NOT_INITIALIZED to INIT_STARTED, then someone else already
        // did that, and we should return.  opal_atomic_cmpset_32()
        // takes "expected" by value and cannot report the state it
        // actually observed, so re-read the state before deciding why
        // we lost the race.
        expected = ompi_mpi_state;
        if (expected >= OMPI_MPI_STATE_FINALIZE_STARTED) {
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init: already finalized", true);
            return MPI_ERR_OTHER;
        } else if (expected >= OMPI_MPI_STATE_INIT_STARTED) {
            // In some cases (e.g., oshmem_shmem_init()), we may call
            // ompi_mpi_init() multiple times.  In such cases, just
            // silently return successfully once the initializing
            // thread has completed.
            if (reinit_ok) {
                while (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) {
                    usleep(1);
                }
                return MPI_SUCCESS;
            }

            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init: invoked multiple times", true);
            return MPI_ERR_OTHER;
        }
    }
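
    /* Note (descriptive, added for clarity): the ">=" comparisons above rely
       on the OMPI_MPI_STATE_* values being ordered
       NOT_INITIALIZED < INIT_STARTED < INIT_COMPLETED < FINALIZE_STARTED,
       which is an assumption about how that enum is declared in
       ompi/runtime/mpiruntime.h. */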

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_level(requested, provided);

    /* Setup enough to check get/set MCA params */
    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for them. This must be done
     * early to prevent a race condition that can occur with orte_init(). */
    if (*provided != MPI_THREAD_SINGLE) {
        opal_set_using_threads(true);
    }

    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_compare_proc = _process_name_compare;
    opal_convert_string_to_process_name = _convert_string_to_process_name;
    opal_convert_process_name_to_string = _convert_process_name_to_string;
    opal_proc_for_name = ompi_proc_for_name;

    /* Register MCA variables */
    if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) {
        error = "ompi_mpi_init: ompi_register_mca_variables failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */

    ret = mca_base_var_find("opal", "event", "*", "event_include");
    if (ret >= 0) {
        char *allvalue = "all";
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintended
           side-effects with launching RTE tools... */
        mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    }
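
    /* Aside (illustrative; the externally visible parameter name is an
       assumption -- the code above only looks it up by its
       project/framework/name components): a user who explicitly wants a
       different event mechanism could still override this default with
       something like

           mpirun --mca opal_event_include poll ./a.out

       and that explicit, non-default setting would take precedence over
       the value forced here. */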

    /* open the ompi hook framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_hook_base_framework, 0))) {
        error = "ompi_hook_base_open() failed";
        goto error;
    }

    ompi_hook_base_mpi_init_top_post_opal(argc, argv, requested, provided);


    OMPI_TIMING_NEXT("initialization");

    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        opal_setenv("OMPI_COMMAND", argv[0], true, &environ);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        opal_setenv("OMPI_ARGV", tmp, true, &environ);
        free(tmp);
    }
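
    /* For reference (illustrative only; "command" is one of the MPI
       standard's reserved MPI_INFO_ENV keys, not something defined here):
       an application can later read the values staged above through the
       predefined info object, e.g.

           char cmd[MPI_MAX_INFO_VAL + 1];
           int flag;
           MPI_Info_get(MPI_INFO_ENV, "command", MPI_MAX_INFO_VAL, cmd, &flag);
    */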

    /* open the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rte_base_framework, 0))) {
        error = "ompi_rte_base_open() failed";
        goto error;
    }
    /* no select is required as this is a static framework */

    /* Setup RTE */
    if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) {
        error = "ompi_mpi_init: ompi_rte_init failed";
        goto error;
    }

    OMPI_TIMING_NEXT("rte_init");

    ompi_rte_initialized = true;

    /* Register the default errhandler callback  */
    errtrk.status = OPAL_ERROR;
    errtrk.active = true;
    /* we want to go first */
    OBJ_CONSTRUCT(&info, opal_list_t);
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_EVENT_ORDER_PREPEND);
    opal_list_append(&info, &kv->super);
    /* give it a name so we can distinguish it */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
    kv->type = OPAL_STRING;
    kv->data.string = strdup("MPI-Default");
    opal_list_append(&info, &kv->super);
    opal_pmix.register_evhandler(NULL, &info, ompi_errhandler_callback,
                                 ompi_errhandler_registration_callback,
                                 (void*)&errtrk);
    OMPI_LAZY_WAIT_FOR_COMPLETION(errtrk.active);

    OPAL_LIST_DESTRUCT(&info);
    if (OPAL_SUCCESS != errtrk.status) {
        error = "Error handler registration";
        ret = errtrk.status;
        goto error;
    }

    /* declare our presence for interlib coordination, and
     * register for callbacks when other libs declare */
    if (OMPI_SUCCESS != (ret = ompi_interlib_declare(*provided, OMPI_IDENT_STRING))) {
        error = "ompi_interlib_declare";
        goto error;
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "ompi_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but before mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_op_base_framework, 0))) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS !=
        (ret = ompi_op_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                           ompi_mpi_thread_multiple))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_allocator_base_framework, 0))) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_rcache_base_framework, 0))) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_mpool_base_framework, 0))) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_bml_base_framework, 0))) {
        error = "mca_bml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_bml_base_init (1, ompi_mpi_thread_multiple))) {
        error = "mca_bml_base_init() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pml_base_framework, 0))) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_coll_base_framework, 0))) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_osc_base_framework, 0))) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_crcp_base_framework, 0))) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS !=
        (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS,
                                   ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    OMPI_TIMING_IMPORT_OPAL("orte_init");
    OMPI_TIMING_IMPORT_OPAL("opal_init_util");
    OMPI_TIMING_NEXT("rte_init-commit");


    /* exchange connection info - this function may also act as a barrier
     * if data exchange is required. The modex occurs solely across procs
     * in our job. If a barrier is required, the "modex" function will
     * perform it internally */
    opal_pmix.commit();
    OMPI_TIMING_NEXT("commit");

    /* If we have a non-blocking fence:
     * if we are doing an async modex, but we are collecting all
     * data, then execute the non-blocking modex in the background.
     * All calls to modex_recv will be cached until the background
     * modex completes. If collect_all_data is false, then we skip
     * the fence completely and retrieve data on-demand from the
     * source node.
     *
     * If we do not have a non-blocking fence, then we must always
     * execute the blocking fence as the system does not support
     * later data retrieval. */
    if (NULL != opal_pmix.fence_nb) {
        if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
            /* execute the fence_nb in the background to collect
             * the data */
            if (!ompi_async_mpi_init) {
                /* we are going to execute a barrier at the
                 * end of MPI_Init. We can only have ONE fence
                 * operation with the identical involved procs
                 * at a time, so we will need to wait when we
                 * get there */
                active = true;
                ret = opal_pmix.fence_nb(NULL, true, fence_release,
                                         (void*)&active);
            } else {
                ret = opal_pmix.fence_nb(NULL, true, NULL, NULL);
            }
            if (OMPI_SUCCESS != ret) {
                error = "opal_pmix.fence_nb() failed";
                goto error;
            }
        } else if (!opal_pmix_base_async_modex) {
            active = true;
            if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL,
                    opal_pmix_collect_all_data, fence_release,
                    (void*)&active))) {
                error = "opal_pmix.fence_nb() failed";
                goto error;
            }
            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
        }
    } else {
        if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL,
                opal_pmix_collect_all_data))) {
            error = "opal_pmix.fence() failed";
            goto error;
        }
    }

    OMPI_TIMING_NEXT("modex");

    /* select buffered send allocator component to be used */
    if( OMPI_SUCCESS !=
        (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                            ompi_mpi_thread_multiple))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
        (ret = ompi_osc_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                            ompi_mpi_thread_multiple))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }

    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }

    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* some btls/mtls require we call add_procs with all procs in the job.
     * since the btls/mtls have no visibility here it is up to the pml to
     * convey this requirement */
    if (mca_pml_base_requires_world ()) {
        if (NULL == (procs = ompi_proc_world (&nprocs))) {
            error = "ompi_proc_world () failed";
            goto error;
        }
    } else {
        /* add all allocated ompi_proc_t's to PML (below the add_procs limit this
         * behaves identically to ompi_proc_world ()) */
        if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
            error = "ompi_proc_get_allocated () failed";
            goto error;
        }
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 ompi_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_rte_wait_for_debugger();

    /* Next timing measurement */
    OMPI_TIMING_NEXT("modex-barrier");

    /* wait for everyone to reach this point - this is a hard
     * barrier requirement at this time, though we hope to relax
     * it at a later point */
    if (!ompi_async_mpi_init) {
        /* if we executed the above fence in the background, then
         * we have to wait here for it to complete. However, there
         * is no reason to do two barriers! */
        if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
        } else {
            active = true;
            if (NULL != opal_pmix.fence_nb) {
                if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, false,
                                   fence_release, (void*)&active))) {
                    error = "opal_pmix.fence_nb() failed";
                    goto error;
                }
                OMPI_LAZY_WAIT_FOR_COMPLETION(active);
            } else {
                if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, false))) {
                    error = "opal_pmix.fence() failed";
                    goto error;
                }
            }
        }
    }

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    OMPI_TIMING_NEXT("barrier");

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on RTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif

    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_init_preconnect_mpi() failed";
        goto error;
    }

    /* Setup the dynamic process management (DPM) subsystem */
    if (OMPI_SUCCESS != (ret = ompi_dpm_init())) {
        error = "ompi_dpm_init() failed";
        goto error;
    }

    /* Determine the overall thread level support of all processes
       in MPI_COMM_WORLD. This has to be done before calling
       coll_base_comm_select, since some of the collective components,
       e.g. hierarch, might create subcommunicators. The thread level
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select
       (since dpm.mark_dyncomm is not otherwise set in the communicator
       creation function), but before dpm.dyncom_init, since this function
       might require collectives for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }

    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm_dyn_init())) {
        error = "ompi_dpm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during
       opal_init, to get better latency when not using TCP.  Do
       this *after* dyn_init, as dyn init uses lots of RTE
       communication and we don't want to hinder the performance of
       that code. */
    opal_progress_event_users_decrement();

    /* see if yield_when_idle was specified - if so, use it */
    opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);

    /* negative value means use default - just don't do anything */
    if (ompi_mpi_event_tick_rate >= 0) {
        opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
 error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error && OMPI_ERR_SILENT != ret) {
            const char *err_msg = opal_strerror(ret);
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        }
        ompi_hook_base_mpi_init_error(argc, argv, requested, provided);
        OMPI_TIMING_FINALIZE;
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
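
    /* Illustrative only (standard MPI API, not something defined in this
       file): these tables back calls such as

           MPI_Datatype t;
           MPI_Type_create_f90_real(6, 70, &t);

       where repeated queries with the same (precision, range) pair must
       return the same datatype handle, hence the caching above. */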

    /* All done.  Wasn't that simple? */
    opal_atomic_wmb();
    opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);

    /* Finish last measurement, output results
     * and clear timing structure */
    OMPI_TIMING_NEXT("barrier-finish");
    OMPI_TIMING_OUT;
    OMPI_TIMING_FINALIZE;

    ompi_hook_base_mpi_init_bottom(argc, argv, requested, provided);

    return MPI_SUCCESS;
}