1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2017 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved
14 * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
15 * reserved.
16 * Copyright (c) 2006 University of Houston. All rights reserved.
17 * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
18 * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
19 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
20 * Copyright (c) 2016 Research Organization for Information Science
21 * and Technology (RIST). All rights reserved.
22 *
23 * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
24 * $COPYRIGHT$
25 *
26 * Additional copyrights may follow
27 *
28 * $HEADER$
29 */
30
31 #include "ompi_config.h"
32
33 #ifdef HAVE_SYS_TYPES_H
34 #include <sys/types.h>
35 #endif
36 #ifdef HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #ifdef HAVE_SYS_PARAM_H
40 #include <sys/param.h>
41 #endif
42 #ifdef HAVE_NETDB_H
43 #include <netdb.h>
44 #endif
45
46 #include "opal/mca/event/event.h"
47 #include "opal/util/output.h"
48 #include "opal/runtime/opal_progress.h"
49 #include "opal/mca/base/base.h"
50 #include "opal/sys/atomic.h"
51 #include "opal/runtime/opal.h"
52 #include "opal/util/show_help.h"
53 #include "opal/mca/mpool/base/base.h"
54 #include "opal/mca/mpool/base/mpool_base_tree.h"
55 #include "opal/mca/rcache/base/base.h"
56 #include "opal/mca/allocator/base/base.h"
57 #include "opal/mca/pmix/pmix.h"
58 #include "opal/util/timings.h"
59
60 #include "mpi.h"
61 #include "ompi/constants.h"
62 #include "ompi/errhandler/errcode.h"
63 #include "ompi/communicator/communicator.h"
64 #include "ompi/datatype/ompi_datatype.h"
65 #include "ompi/message/message.h"
66 #include "ompi/op/op.h"
67 #include "ompi/file/file.h"
68 #include "ompi/info/info.h"
69 #include "ompi/runtime/mpiruntime.h"
70 #include "ompi/attribute/attribute.h"
71 #include "ompi/mca/pml/pml.h"
72 #include "ompi/mca/bml/bml.h"
73 #include "ompi/mca/pml/base/base.h"
74 #include "ompi/mca/bml/base/base.h"
75 #include "ompi/mca/osc/base/base.h"
76 #include "ompi/mca/coll/base/base.h"
77 #include "ompi/mca/rte/rte.h"
78 #include "ompi/mca/rte/base/base.h"
79 #include "ompi/mca/topo/base/base.h"
80 #include "ompi/mca/io/io.h"
81 #include "ompi/mca/io/base/base.h"
82 #include "ompi/mca/pml/base/pml_base_bsend.h"
83 #include "ompi/runtime/params.h"
84 #include "ompi/dpm/dpm.h"
85 #include "ompi/mpiext/mpiext.h"
86 #include "ompi/mca/hook/base/base.h"
87
88 #if OPAL_ENABLE_FT_CR == 1
89 #include "ompi/mca/crcp/crcp.h"
90 #include "ompi/mca/crcp/base/base.h"
91 #endif
92 #include "ompi/runtime/ompi_cr.h"
93
94 extern bool ompi_enable_timing;
95
/*
 * Completion callback for the non-blocking PMIx fence issued in
 * ompi_mpi_finalize().  "cbdata" points at the caller's volatile bool
 * completion flag; clearing it releases the caller's
 * OMPI_LAZY_WAIT_FOR_COMPLETION() spin.  "status" is ignored: the
 * fence outcome is not actionable this late in finalization.
 */
static void fence_cbfunc(int status, void *cbdata)
{
    volatile bool *active = (volatile bool*)cbdata;
    OPAL_ACQUIRE_OBJECT(active);
    *active = false;
    OPAL_POST_OBJECT(active);
}
103
ompi_mpi_finalize(void)104 int ompi_mpi_finalize(void)
105 {
106 int ret = MPI_SUCCESS;
107 opal_list_item_t *item;
108 ompi_proc_t** procs;
109 size_t nprocs;
110 volatile bool active;
111 uint32_t key;
112 ompi_datatype_t * datatype;
113
114 ompi_hook_base_mpi_finalize_top();
115
116 int32_t state = ompi_mpi_state;
117 if (state < OMPI_MPI_STATE_INIT_COMPLETED ||
118 state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
119 /* Note that if we're not initialized or already finalized, we
120 cannot raise an MPI exception. The best that we can do is
121 write something to stderr. */
122 char hostname[OPAL_MAXHOSTNAMELEN];
123 pid_t pid = getpid();
124 gethostname(hostname, sizeof(hostname));
125
126 if (state < OMPI_MPI_STATE_INIT_COMPLETED) {
127 opal_show_help("help-mpi-runtime.txt",
128 "mpi_finalize: not initialized",
129 true, hostname, pid);
130 } else if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
131 opal_show_help("help-mpi-runtime.txt",
132 "mpi_finalize:invoked_multiple_times",
133 true, hostname, pid);
134 }
135 return MPI_ERR_OTHER;
136 }
137 opal_atomic_wmb();
138 opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_STARTED);
139
140 ompi_mpiext_fini();
141
142 /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing
143 anything else in MPI_FINALIZE (to include setting up such that
144 MPI_FINALIZED will return true). */
145
146 if (NULL != ompi_mpi_comm_self.comm.c_keyhash) {
147 ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self,
148 ompi_mpi_comm_self.comm.c_keyhash);
149 OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash);
150 ompi_mpi_comm_self.comm.c_keyhash = NULL;
151 }
152
153 /* Mark that we are past COMM_SELF destruction so that
154 MPI_FINALIZED can return an accurate value (per MPI-3.1,
155 FINALIZED needs to return FALSE to MPI_FINALIZED until after
156 COMM_SELF is destroyed / all the attribute callbacks have been
157 invoked) */
158 opal_atomic_wmb();
159 opal_atomic_swap_32(&ompi_mpi_state,
160 OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
161
162 /* As finalize is the last legal MPI call, we are allowed to force the release
163 * of the user buffer used for bsend, before going anywhere further.
164 */
165 (void)mca_pml_base_bsend_detach(NULL, NULL);
166
167 #if OPAL_ENABLE_PROGRESS_THREADS == 0
168 opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK);
169 #endif
170
171 /* Redo ORTE calling opal_progress_event_users_increment() during
172 MPI lifetime, to get better latency when not using TCP */
173 opal_progress_event_users_increment();
174
175 /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across
176 *all* connected processes. This only means that all processes
177 have to call it. It does *not* mean that all connected
178 processes need to synchronize (either directly or indirectly).
179
180 For example, it is quite easy to construct complicated
181 scenarios where one job is "connected" to another job via
182 transitivity, but have no direct knowledge of each other.
183 Consider the following case: job A spawns job B, and job B
184 later spawns job C. A "connectedness" graph looks something
185 like this:
186
187 A <--> B <--> C
188
189 So what are we *supposed* to do in this case? If job A is
190 still connected to B when it calls FINALIZE, should it block
191 until jobs B and C also call FINALIZE?
192
193 After lengthy discussions many times over the course of this
194 project, the issue was finally decided at the Louisville Feb
195 2009 meeting: no.
196
197 Rationale:
198
199 - "Collective" does not mean synchronizing. It only means that
200 every process call it. Hence, in this scenario, every
201 process in A, B, and C must call FINALIZE.
202
203 - KEY POINT: if A calls FINALIZE, then it is erroneous for B or
204 C to try to communicate with A again.
205
206 - Hence, OMPI is *correct* to only effect a barrier across each
207 jobs' MPI_COMM_WORLD before exiting. Specifically, if A
208 calls FINALIZE long before B or C, it's *correct* if A exits
209 at any time (and doesn't notify B or C that it is exiting).
210
211 - Arguably, if B or C do try to communicate with the now-gone
212 A, OMPI should try to print a nice error ("you tried to
213 communicate with a job that is already gone...") instead of
214 segv or other Badness. However, that is an *extremely*
215 difficult problem -- sure, it's easy for A to tell B that it
216 is finalizing, but how can A tell C? A doesn't even know
217 about C. You'd need to construct a "connected" graph in a
218 distributed fashion, which is fraught with race conditions,
219 etc.
220
221 Hence, our conclusion is: OMPI is *correct* in its current
222 behavior (of only doing a barrier across its own COMM_WORLD)
223 before exiting. Any problems that occur are as a result of
224 erroneous MPI applications. We *could* tighten up the erroneous
225 cases and ensure that we print nice error messages / don't
226 crash, but that is such a difficult problem that we decided we
227 have many other, much higher priority issues to handle that deal
228 with non-erroneous cases. */
229
230 /* Wait for everyone to reach this point. This is a PMIx
231 barrier instead of an MPI barrier for (at least) two reasons:
232
233 1. An MPI barrier doesn't ensure that all messages have been
234 transmitted before exiting (e.g., a BTL can lie and buffer a
235 message without actually injecting it to the network, and
236 therefore require further calls to that BTL's progress), so
237 the possibility of a stranded message exists.
238
239 2. If the MPI communication is using an unreliable transport,
240 there's a problem of knowing that everyone has *left* the
241 barrier. E.g., one proc can send its ACK to the barrier
242 message to a peer and then leave the barrier, but the ACK
243 can get lost and therefore the peer is left in the barrier.
244
245 Point #1 has been known for a long time; point #2 emerged after
246 we added the first unreliable BTL to Open MPI and fixed the
247 del_procs behavior around May of 2014 (see
248 https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for
249 more details). */
250 if (!ompi_async_mpi_finalize) {
251 if (NULL != opal_pmix.fence_nb) {
252 active = true;
253 OPAL_POST_OBJECT(&active);
254 /* Note that use of the non-blocking PMIx fence will
255 * allow us to lazily cycle calling
256 * opal_progress(), which will allow any other pending
257 * communications/actions to complete. See
258 * https://github.com/open-mpi/ompi/issues/1576 for the
259 * original bug report. */
260 if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
261 (void*)&active))) {
262 OMPI_ERROR_LOG(ret);
263 /* Reset the active flag to false, to avoid waiting for
264 * completion when the fence was failed. */
265 active = false;
266 }
267 OMPI_LAZY_WAIT_FOR_COMPLETION(active);
268 } else {
269 /* However, we cannot guarantee that the provided PMIx has
270 * fence_nb. If it doesn't, then do the best we can: an MPI
271 * barrier on COMM_WORLD (which isn't the best because of the
272 * reasons cited above), followed by a blocking PMIx fence
273 * (which does not call opal_progress()). */
274 ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
275 comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
276
277 if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
278 OMPI_ERROR_LOG(ret);
279 }
280 }
281 }
282
283 /*
284 * Shutdown the Checkpoint/Restart Mech.
285 */
286 if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) {
287 OMPI_ERROR_LOG(ret);
288 }
289
290 /* Shut down any bindings-specific issues: C++, F77, F90 */
291
292 /* Remove all memory associated by MPI_REGISTER_DATAREP (per
293 MPI-2:9.5.3, there is no way for an MPI application to
294 *un*register datareps, but we don't want the OMPI layer causing
295 memory leaks). */
296 while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) {
297 OBJ_RELEASE(item);
298 }
299 OBJ_DESTRUCT(&ompi_registered_datareps);
300
301 /* Remove all F90 types from the hash tables */
302 OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_integer_hashtable)
303 OBJ_RELEASE(datatype);
304 OBJ_DESTRUCT(&ompi_mpi_f90_integer_hashtable);
305 OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_real_hashtable)
306 OBJ_RELEASE(datatype);
307 OBJ_DESTRUCT(&ompi_mpi_f90_real_hashtable);
308 OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_complex_hashtable)
309 OBJ_RELEASE(datatype);
310 OBJ_DESTRUCT(&ompi_mpi_f90_complex_hashtable);
311
312 /* Free communication objects */
313
314 /* free file resources */
315 if (OMPI_SUCCESS != (ret = ompi_file_finalize())) {
316 goto done;
317 }
318
319 /* free window resources */
320 if (OMPI_SUCCESS != (ret = ompi_win_finalize())) {
321 goto done;
322 }
323 if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) {
324 goto done;
325 }
326
327 /* free communicator resources. this MUST come before finalizing the PML
328 * as this will call into the pml */
329 if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) {
330 goto done;
331 }
332
333 /* call del_procs on all allocated procs even though some may not be known
334 * to the pml layer. the pml layer is expected to be resilient and ignore
335 * any unknown procs. */
336 nprocs = 0;
337 procs = ompi_proc_get_allocated (&nprocs);
338 MCA_PML_CALL(del_procs(procs, nprocs));
339 free(procs);
340
341 /* free pml resource */
342 if(OMPI_SUCCESS != (ret = mca_pml_base_finalize())) {
343 goto done;
344 }
345
346 /* free requests */
347 if (OMPI_SUCCESS != (ret = ompi_request_finalize())) {
348 goto done;
349 }
350
351 if (OMPI_SUCCESS != (ret = ompi_message_finalize())) {
352 goto done;
353 }
354
355 /* If requested, print out a list of memory allocated by ALLOC_MEM
356 but not freed by FREE_MEM */
357 if (0 != ompi_debug_show_mpi_alloc_mem_leaks) {
358 mca_mpool_base_tree_print(ompi_debug_show_mpi_alloc_mem_leaks);
359 }
360
361 /* Now that all MPI objects dealing with communications are gone,
362 shut down MCA types having to do with communications */
363 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework) ) ) {
364 OMPI_ERROR_LOG(ret);
365 goto done;
366 }
367
368 /* shut down buffered send code */
369 mca_pml_base_bsend_fini();
370
371 #if OPAL_ENABLE_FT_CR == 1
372 /*
373 * Shutdown the CRCP Framework, must happen after PML shutdown
374 */
375 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework) ) ) {
376 OMPI_ERROR_LOG(ret);
377 goto done;
378 }
379 #endif
380
381 /* Free secondary resources */
382
383 /* free attr resources */
384 if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) {
385 goto done;
386 }
387
388 /* free group resources */
389 if (OMPI_SUCCESS != (ret = ompi_group_finalize())) {
390 goto done;
391 }
392
393 /* finalize the DPM subsystem */
394 if ( OMPI_SUCCESS != (ret = ompi_dpm_finalize())) {
395 goto done;
396 }
397
398 /* free internal error resources */
399 if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) {
400 goto done;
401 }
402
403 /* free error code resources */
404 if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) {
405 goto done;
406 }
407
408 /* free errhandler resources */
409 if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) {
410 goto done;
411 }
412
413 /* Free all other resources */
414
415 /* free op resources */
416 if (OMPI_SUCCESS != (ret = ompi_op_finalize())) {
417 goto done;
418 }
419
420 /* free ddt resources */
421 if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) {
422 goto done;
423 }
424
425 /* free info resources */
426 if (OMPI_SUCCESS != (ret = ompi_mpiinfo_finalize())) {
427 goto done;
428 }
429
430 /* Close down MCA modules */
431
432 /* io is opened lazily, so it's only necessary to close it if it
433 was actually opened */
434 if (0 < ompi_io_base_framework.framework_refcnt) {
435 /* May have been "opened" multiple times. We want it closed now */
436 ompi_io_base_framework.framework_refcnt = 1;
437
438 if (OMPI_SUCCESS != mca_base_framework_close(&ompi_io_base_framework)) {
439 goto done;
440 }
441 }
442 (void) mca_base_framework_close(&ompi_topo_base_framework);
443 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) {
444 goto done;
445 }
446 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) {
447 goto done;
448 }
449 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) {
450 goto done;
451 }
452 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_mpool_base_framework))) {
453 goto done;
454 }
455 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_rcache_base_framework))) {
456 goto done;
457 }
458 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_allocator_base_framework))) {
459 goto done;
460 }
461
462 /* free proc resources */
463 if ( OMPI_SUCCESS != (ret = ompi_proc_finalize())) {
464 goto done;
465 }
466
467 if (NULL != ompi_mpi_main_thread) {
468 OBJ_RELEASE(ompi_mpi_main_thread);
469 ompi_mpi_main_thread = NULL;
470 }
471
472 /* Clean up memory/resources from the MPI dynamic process
473 functionality checker */
474 ompi_mpi_dynamics_finalize();
475
476 /* Leave the RTE */
477
478 if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {
479 goto done;
480 }
481 ompi_rte_initialized = false;
482
483 /* now close the rte framework */
484 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) {
485 OMPI_ERROR_LOG(ret);
486 goto done;
487 }
488
489 /* Now close the hook framework */
490 if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_hook_base_framework) ) ) {
491 OMPI_ERROR_LOG(ret);
492 goto done;
493 }
494
495 if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
496 goto done;
497 }
498
499 if (0 == opal_initialized) {
500 /* if there is no MPI_T_init_thread that has been MPI_T_finalize'd,
501 * then be gentle to the app and release all the memory now (instead
502 * of the opal library destructor */
503 opal_class_finalize();
504 }
505
506 /* cleanup environment */
507 opal_unsetenv("OMPI_COMMAND", &environ);
508 opal_unsetenv("OMPI_ARGV", &environ);
509
510 /* All done */
511
512 done:
513 opal_atomic_wmb();
514 opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_COMPLETED);
515
516 ompi_hook_base_mpi_finalize_bottom();
517
518 return ret;
519 }
520