/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2017 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2018 Cisco Systems, Inc.  All rights reserved
 * Copyright (c) 2006-2014 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2006      University of Houston. All rights reserved.
 * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 * Copyright (c) 2016      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 *
 * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif

#include "opal/mca/event/event.h"
#include "opal/util/output.h"
#include "opal/runtime/opal_progress.h"
#include "opal/mca/base/base.h"
#include "opal/sys/atomic.h"
#include "opal/runtime/opal.h"
#include "opal/util/show_help.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/mpool/base/mpool_base_tree.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/allocator/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/util/timings.h"

#include "mpi.h"
#include "ompi/constants.h"
#include "ompi/errhandler/errcode.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h"
#include "ompi/message/message.h"
#include "ompi/op/op.h"
#include "ompi/file/file.h"
#include "ompi/info/info.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/attribute/attribute.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/bml/bml.h"
#include "ompi/mca/pml/base/base.h"
#include "ompi/mca/bml/base/base.h"
#include "ompi/mca/osc/base/base.h"
#include "ompi/mca/coll/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/mca/rte/base/base.h"
#include "ompi/mca/topo/base/base.h"
#include "ompi/mca/io/io.h"
#include "ompi/mca/io/base/base.h"
#include "ompi/mca/pml/base/pml_base_bsend.h"
#include "ompi/runtime/params.h"
#include "ompi/dpm/dpm.h"
#include "ompi/mpiext/mpiext.h"
#include "ompi/mca/hook/base/base.h"

#if OPAL_ENABLE_FT_CR == 1
#include "ompi/mca/crcp/crcp.h"
#include "ompi/mca/crcp/base/base.h"
#endif
#include "ompi/runtime/ompi_cr.h"

extern bool ompi_enable_timing;

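/* Completion callback for the non-blocking PMIx fence used in
 * ompi_mpi_finalize() below: clears the flag that
 * OMPI_LAZY_WAIT_FOR_COMPLETION() polls on. */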
static void fence_cbfunc(int status, void *cbdata)
{
    volatile bool *active = (volatile bool*)cbdata;
    OPAL_ACQUIRE_OBJECT(active);
    *active = false;
    OPAL_POST_OBJECT(active);
}

int ompi_mpi_finalize(void)
{
    int ret = MPI_SUCCESS;
    opal_list_item_t *item;
    ompi_proc_t** procs;
    size_t nprocs;
    volatile bool active;
    uint32_t key;
    ompi_datatype_t * datatype;

    ompi_hook_base_mpi_finalize_top();

    int32_t state = ompi_mpi_state;
    if (state < OMPI_MPI_STATE_INIT_COMPLETED ||
        state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
        /* Note that if we're not initialized or already finalized, we
           cannot raise an MPI exception.  The best that we can do is
           write something to stderr. */
        char hostname[OPAL_MAXHOSTNAMELEN];
        pid_t pid = getpid();
        gethostname(hostname, sizeof(hostname));

        if (state < OMPI_MPI_STATE_INIT_COMPLETED) {
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_finalize: not initialized",
                           true, hostname, pid);
        } else if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_finalize:invoked_multiple_times",
                           true, hostname, pid);
        }
        return MPI_ERR_OTHER;
    }
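    /* Publish the transition into finalize; the write barrier makes the
       new state visible to any other thread that queries it. */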
    opal_atomic_wmb();
    opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_STARTED);

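    /* Tear down the MPI extensions first, while the rest of the MPI
       infrastructure is still available to them. */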
    ompi_mpiext_fini();

    /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing
       anything else in MPI_FINALIZE (to include setting up such that
       MPI_FINALIZED will return true). */

    if (NULL != ompi_mpi_comm_self.comm.c_keyhash) {
        ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self,
                             ompi_mpi_comm_self.comm.c_keyhash);
        OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash);
        ompi_mpi_comm_self.comm.c_keyhash = NULL;
    }

    /* Mark that we are past COMM_SELF destruction so that
       MPI_FINALIZED can return an accurate value (per MPI-3.1,
       MPI_FINALIZED needs to return FALSE until after COMM_SELF is
       destroyed / all the attribute callbacks have been invoked). */
    opal_atomic_wmb();
    opal_atomic_swap_32(&ompi_mpi_state,
                        OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);

    /* As finalize is the last legal MPI call, we are allowed to force
     * the release of the user buffer used for bsend, before going any
     * further. */
    (void)mca_pml_base_bsend_detach(NULL, NULL);

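    /* Without a progress thread, have opal_progress() drive the event
       loop itself: one non-blocking pass per invocation. */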
#if OPAL_ENABLE_PROGRESS_THREADS == 0
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK);
#endif

    /* Redo ORTE calling opal_progress_event_users_increment() during
       MPI lifetime, to get better latency when not using TCP */
    opal_progress_event_users_increment();

    /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across
       *all* connected processes.  This only means that all processes
       have to call it.  It does *not* mean that all connected
       processes need to synchronize (either directly or indirectly).

       For example, it is quite easy to construct complicated
       scenarios where one job is "connected" to another job via
       transitivity, but the jobs have no direct knowledge of each
       other.  Consider the following case: job A spawns job B, and
       job B later spawns job C.  A "connectedness" graph looks
       something like this:

           A <--> B <--> C

       So what are we *supposed* to do in this case?  If job A is
       still connected to B when it calls FINALIZE, should it block
       until jobs B and C also call FINALIZE?

       After lengthy discussions many times over the course of this
       project, the issue was finally decided at the Louisville Feb
       2009 meeting: no.

       Rationale:

       - "Collective" does not mean synchronizing.  It only means that
         every process must call it.  Hence, in this scenario, every
         process in A, B, and C must call FINALIZE.

       - KEY POINT: if A calls FINALIZE, then it is erroneous for B or
         C to try to communicate with A again.

       - Hence, OMPI is *correct* to only effect a barrier across each
         job's MPI_COMM_WORLD before exiting.  Specifically, if A
         calls FINALIZE long before B or C, it's *correct* if A exits
         at any time (and doesn't notify B or C that it is exiting).

       - Arguably, if B or C do try to communicate with the now-gone
         A, OMPI should try to print a nice error ("you tried to
         communicate with a job that is already gone...") instead of
         segv or other Badness.  However, that is an *extremely*
         difficult problem -- sure, it's easy for A to tell B that it
         is finalizing, but how can A tell C?  A doesn't even know
         about C.  You'd need to construct a "connected" graph in a
         distributed fashion, which is fraught with race conditions,
         etc.

      Hence, our conclusion is: OMPI is *correct* in its current
      behavior (of only doing a barrier across its own COMM_WORLD)
      before exiting.  Any problems that occur are as a result of
      erroneous MPI applications.  We *could* tighten up the erroneous
      cases and ensure that we print nice error messages / don't
      crash, but that is such a difficult problem that we decided we
      have many other, much higher priority issues to handle that deal
      with non-erroneous cases. */

    /* Wait for everyone to reach this point.  This is a PMIx
       barrier instead of an MPI barrier for (at least) two reasons:

       1. An MPI barrier doesn't ensure that all messages have been
          transmitted before exiting (e.g., a BTL can lie and buffer a
          message without actually injecting it to the network, and
          therefore require further calls to that BTL's progress), so
          the possibility of a stranded message exists.

       2. If the MPI communication is using an unreliable transport,
          there's a problem of knowing that everyone has *left* the
          barrier.  E.g., one proc can send its ACK to the barrier
          message to a peer and then leave the barrier, but the ACK
          can get lost and therefore the peer is left in the barrier.

       Point #1 has been known for a long time; point #2 emerged after
       we added the first unreliable BTL to Open MPI and fixed the
       del_procs behavior around May of 2014 (see
       https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for
       more details). */
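    /* If the user requested an asynchronous finalize
       (ompi_async_mpi_finalize), skip the barrier entirely. */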
    if (!ompi_async_mpi_finalize) {
        if (NULL != opal_pmix.fence_nb) {
            active = true;
            OPAL_POST_OBJECT(&active);
            /* Note that use of the non-blocking PMIx fence will
             * allow us to lazily cycle calling
             * opal_progress(), which will allow any other pending
             * communications/actions to complete.  See
             * https://github.com/open-mpi/ompi/issues/1576 for the
             * original bug report. */
            if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
                                                          (void*)&active))) {
                OMPI_ERROR_LOG(ret);
                /* Reset the active flag to false, to avoid waiting
                 * for a completion callback that will never arrive
                 * after a failed fence. */
                active = false;
            }
            OMPI_LAZY_WAIT_FOR_COMPLETION(active);
        } else {
            /* However, we cannot guarantee that the provided PMIx has
             * fence_nb.  If it doesn't, then do the best we can: an MPI
             * barrier on COMM_WORLD (which isn't the best because of the
             * reasons cited above), followed by a blocking PMIx fence
             * (which does not call opal_progress()). */
            ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
            comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);

            if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
                OMPI_ERROR_LOG(ret);
            }
        }
    }

    /*
     * Shut down the Checkpoint/Restart mechanism.
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) {
        OMPI_ERROR_LOG(ret);
    }

    /* Shut down any bindings-specific issues: C++, F77, F90 */

    /* Free all memory associated with MPI_REGISTER_DATAREP (per
       MPI-2:9.5.3, there is no way for an MPI application to
       *un*register datareps, but we don't want the OMPI layer causing
       memory leaks). */
    while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&ompi_registered_datareps);

    /* Remove all F90 types from the hash tables */
    OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_integer_hashtable)
        OBJ_RELEASE(datatype);
    OBJ_DESTRUCT(&ompi_mpi_f90_integer_hashtable);
    OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_real_hashtable)
        OBJ_RELEASE(datatype);
    OBJ_DESTRUCT(&ompi_mpi_f90_real_hashtable);
    OPAL_HASH_TABLE_FOREACH(key, uint32, datatype, &ompi_mpi_f90_complex_hashtable)
        OBJ_RELEASE(datatype);
    OBJ_DESTRUCT(&ompi_mpi_f90_complex_hashtable);

    /* Free communication objects */

    /* free file resources */
    if (OMPI_SUCCESS != (ret = ompi_file_finalize())) {
        goto done;
    }

    /* free window resources */
    if (OMPI_SUCCESS != (ret = ompi_win_finalize())) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) {
        goto done;
    }

    /* free communicator resources. this MUST come before finalizing the PML
     * as this will call into the pml */
    if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) {
        goto done;
    }

    /* call del_procs on all allocated procs even though some may not be known
     * to the pml layer. the pml layer is expected to be resilient and ignore
     * any unknown procs. */
    nprocs = 0;
    procs = ompi_proc_get_allocated(&nprocs);
    MCA_PML_CALL(del_procs(procs, nprocs));
    free(procs);

    /* free pml resources */
    if (OMPI_SUCCESS != (ret = mca_pml_base_finalize())) {
        goto done;
    }

    /* free requests */
    if (OMPI_SUCCESS != (ret = ompi_request_finalize())) {
        goto done;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_finalize())) {
        goto done;
    }

    /* If requested, print out a list of memory allocated by ALLOC_MEM
       but not freed by FREE_MEM */
    if (0 != ompi_debug_show_mpi_alloc_mem_leaks) {
        mca_mpool_base_tree_print(ompi_debug_show_mpi_alloc_mem_leaks);
    }

    /* Now that all MPI objects dealing with communications are gone,
       shut down MCA types having to do with communications */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework))) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }

    /* shut down buffered send code */
    mca_pml_base_bsend_fini();

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Shut down the CRCP framework; this must happen after the PML
     * shutdown.
     */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework))) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }
#endif

    /* Free secondary resources */

    /* free attr resources */
    if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) {
        goto done;
    }

    /* free group resources */
    if (OMPI_SUCCESS != (ret = ompi_group_finalize())) {
        goto done;
    }

    /* finalize the DPM subsystem */
    if (OMPI_SUCCESS != (ret = ompi_dpm_finalize())) {
        goto done;
    }

    /* free internal error resources */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) {
        goto done;
    }

    /* free error code resources */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) {
        goto done;
    }

    /* free errhandler resources */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) {
        goto done;
    }

    /* Free all other resources */

    /* free op resources */
    if (OMPI_SUCCESS != (ret = ompi_op_finalize())) {
        goto done;
    }

    /* free ddt resources */
    if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) {
        goto done;
    }

    /* free info resources */
    if (OMPI_SUCCESS != (ret = ompi_mpiinfo_finalize())) {
        goto done;
    }

    /* Close down MCA modules */

    /* io is opened lazily, so it's only necessary to close it if it
       was actually opened */
    if (0 < ompi_io_base_framework.framework_refcnt) {
        /* May have been "opened" multiple times. We want it closed now */
        ompi_io_base_framework.framework_refcnt = 1;

        if (OMPI_SUCCESS != mca_base_framework_close(&ompi_io_base_framework)) {
            goto done;
        }
    }
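    /* Any error from closing the topo framework is deliberately
       ignored here. */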
    (void) mca_base_framework_close(&ompi_topo_base_framework);
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_mpool_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_rcache_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_allocator_base_framework))) {
        goto done;
    }

    /* free proc resources */
    if (OMPI_SUCCESS != (ret = ompi_proc_finalize())) {
        goto done;
    }

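    /* Drop the reference to the main thread object and clear the
       pointer so it cannot be used past this point. */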
    if (NULL != ompi_mpi_main_thread) {
        OBJ_RELEASE(ompi_mpi_main_thread);
        ompi_mpi_main_thread = NULL;
    }

    /* Clean up memory/resources from the MPI dynamic process
       functionality checker */
    ompi_mpi_dynamics_finalize();

    /* Leave the RTE */

    if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {
        goto done;
    }
    ompi_rte_initialized = false;

    /* now close the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework))) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }

    /* Now close the hook framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_hook_base_framework))) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }

    if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
        goto done;
    }

    if (0 == opal_initialized) {
        /* If there is no MPI_T_init_thread that has not been
         * MPI_T_finalize'd, then be gentle to the app and release all
         * the memory now (instead of waiting for the opal library
         * destructor). */
        opal_class_finalize();
    }

    /* cleanup environment */
    opal_unsetenv("OMPI_COMMAND", &environ);
    opal_unsetenv("OMPI_ARGV", &environ);

    /* All done */

  done:
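    /* Reached on both the success and error (goto) paths: publish the
       final state before running the bottom finalize hooks. */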
    opal_atomic_wmb();
    opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_COMPLETED);

    ompi_hook_base_mpi_finalize_bottom();

    return ret;
}