1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpidimpl.h"
7 #include "ch4r_init.h"
8 
/* Register a newly created communicator with the MPIDIG (CH4 "generic")
 * layer: publish it in the global comm_req_lists lookup table, initialize
 * its posted/unexpected receive queues, and claim any unexpected messages
 * that arrived for this context id before the communicator existed.
 *
 * Returns MPI_SUCCESS (no failure paths in this routine). */
int MPIDIG_init_comm(MPIR_Comm * comm)
{
    int mpi_errno = MPI_SUCCESS, comm_idx, subcomm_type, is_localcomm;
    MPIDIG_rreq_t **uelist;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIG_INIT_COMM);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIG_INIT_COMM);

    MPIR_Assert(MPIDI_global.is_ch4u_initialized);

    /* communicators created for dynamic processes are not tracked in the
     * comm_req_lists table; nothing to do for them */
    if (MPIR_CONTEXT_READ_FIELD(DYNAMIC_PROC, comm->recvcontext_id))
        goto fn_exit;

    /* decompose the receive context id into the table coordinates */
    comm_idx = MPIDIG_get_context_index(comm->recvcontext_id);
    subcomm_type = MPIR_CONTEXT_READ_FIELD(SUBCOMM, comm->recvcontext_id);
    is_localcomm = MPIR_CONTEXT_READ_FIELD(IS_LOCALCOMM, comm->recvcontext_id);

    /* the table is dimensioned [2][4]; guard the indices */
    MPIR_Assert(subcomm_type <= 3);
    MPIR_Assert(is_localcomm <= 1);

    /* There is a potential race between this code (likely called by a user/main thread)
     * and an MPIDIG callback handler (called by a progress thread, when async progress
     * is turned on).
     * Thus we take a lock here to make sure the following operations are atomically done.
     * (transferring unexpected messages from a global queue to the newly created communicator) */
    MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_MPIDIG_GLOBAL_MUTEX);
    MPIDI_global.comm_req_lists[comm_idx].comm[is_localcomm][subcomm_type] = comm;
    MPIDIG_COMM(comm, posted_list) = NULL;
    MPIDIG_COMM(comm, unexp_list) = NULL;

    /* drain the global unexpected-message list for this context id into the
     * communicator's own unexp_list */
    uelist = MPIDIG_context_id_to_uelist(comm->context_id);
    if (*uelist) {
        MPIDIG_rreq_t *curr, *tmp;
        DL_FOREACH_SAFE(*uelist, curr, tmp) {
            DL_DELETE(*uelist, curr);
            MPIR_Comm_add_ref(comm);    /* +1 for each entry in unexp_list */
            DL_APPEND(MPIDIG_COMM(comm, unexp_list), curr);
        }
        *uelist = NULL;
    }
    MPID_THREAD_CS_EXIT(VCI, MPIDIU_THREAD_MPIDIG_GLOBAL_MUTEX);

    MPIDIG_COMM(comm, window_instance) = 0;
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIG_INIT_COMM);
    return mpi_errno;
}
56 
/* Deregister a communicator from the MPIDIG layer: verify its receive
 * queues are drained and clear its slot in the global comm_req_lists table.
 *
 * Returns MPI_SUCCESS (no failure paths in this routine). */
int MPIDIG_destroy_comm(MPIR_Comm * comm)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIG_DESTROY_COMM);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIG_DESTROY_COMM);

    /* dynamic-process communicators were never registered; nothing to undo */
    if (MPIR_CONTEXT_READ_FIELD(DYNAMIC_PROC, comm->recvcontext_id))
        goto fn_exit;

    /* recover the table coordinates from the receive context id */
    int idx = MPIDIG_get_context_index(comm->recvcontext_id);
    int subcomm = MPIR_CONTEXT_READ_FIELD(SUBCOMM, comm->recvcontext_id);
    int localcomm = MPIR_CONTEXT_READ_FIELD(IS_LOCALCOMM, comm->recvcontext_id);

    MPIR_Assert(subcomm <= 3);
    MPIR_Assert(localcomm <= 1);

    /* serialize against async progress; see MPIDIG_init_comm */
    MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_MPIDIG_GLOBAL_MUTEX);
    MPIR_Comm *registered = MPIDI_global.comm_req_lists[idx].comm[localcomm][subcomm];
    MPIR_Assert(registered != NULL);

    if (registered) {
        /* both receive queues must be empty before teardown */
        MPIR_Assert(MPIDIG_COMM(registered, posted_list) == NULL);
        MPIR_Assert(MPIDIG_COMM(registered, unexp_list) == NULL);
    }
    MPIDI_global.comm_req_lists[idx].comm[localcomm][subcomm] = NULL;
    MPID_THREAD_CS_EXIT(VCI, MPIDIU_THREAD_MPIDIG_GLOBAL_MUTEX);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIG_DESTROY_COMM);
    return mpi_errno;
}
90 
/* Linked list internally used to keep track of
 * allocated memory for which memory binding is
 * requested by the user. */
typedef struct mem_node {
    void *ptr;                  /* base address of the mmap'ed region handed to the user */
    size_t size;                /* length of the mapping, needed later for munmap */
    struct mem_node *next;      /* singly-linked list link (LL_* macros) */
} mem_node_t;

/* head/tail of the tracked-allocation list; manipulated via LL_APPEND /
 * LL_DELETE / LL_FOREACH under the callers' threading rules */
static mem_node_t *mem_list_head = NULL;
static mem_node_t *mem_list_tail = NULL;
102 
/* Allocate 'size' bytes of memory, honoring the "bind_memory" info hint
 * ("ddr" or "hbm"; unrecognized values fall back to "ddr").
 *
 * Returns the allocation on success, or NULL when the requested non-default
 * memory kind cannot be bound or the allocation itself fails.  Bound
 * (mmap'ed) allocations are recorded in mem_list so MPIDIG_mpi_free_mem
 * knows to munmap rather than MPL_free them. */
void *MPIDIG_mpi_alloc_mem(size_t size, MPIR_Info * info_ptr)
{
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIG_MPI_ALLOC_MEM);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIG_MPI_ALLOC_MEM);
    void *p;
    MPIR_hwtopo_type_e mem_type = MPIR_HWTOPO_TYPE__DDR;
    MPIR_hwtopo_gid_t mem_gid = MPIR_HWTOPO_GID_ROOT;
    int flag = 0;
    char hint_str[MPI_MAX_INFO_VAL + 1];

    /* retrieve requested memory type for allocation */
    if (info_ptr) {
        MPIR_Info_get_impl(info_ptr, "bind_memory", MPI_MAX_INFO_VAL, hint_str, &flag);
    }

    if (flag) {
        if (!strcmp(hint_str, "hbm")) {
            mem_type = MPIR_HWTOPO_TYPE__HBM;
        } else {
            /* "ddr" and any unrecognized hint both map to the default */
            mem_type = MPIR_HWTOPO_TYPE__DDR;
        }
        mem_gid = MPIR_hwtopo_get_obj_by_type(mem_type);
    }

    if (mem_gid != MPIR_HWTOPO_GID_ROOT) {
        /* requested memory type is available in the system and process is bound
         * to the corresponding device; allocate memory and bind it to device. */
        p = MPL_mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0,
                     MPL_MEM_USER);
        if (p == MAP_FAILED || p == NULL) {
            /* mapping failed: report failure to the upper layer instead of
             * handing back MAP_FAILED (mmap's error sentinel) */
            p = NULL;
        } else {
            MPIR_hwtopo_mem_bind(p, size, mem_gid);

            /* keep track of bound memory for freeing it later */
            mem_node_t *el = MPL_malloc(sizeof(*el), MPL_MEM_OTHER);
            if (el == NULL) {
                /* without a tracking node, MPIDIG_mpi_free_mem would later
                 * MPL_free() a mapped region; unmap and fail the allocation */
                MPL_munmap(p, size, MPL_MEM_USER);
                p = NULL;
            } else {
                el->ptr = p;
                el->size = size;
                LL_APPEND(mem_list_head, mem_list_tail, el);
            }
        }
    } else if (mem_type != MPIR_HWTOPO_TYPE__DDR) {
        /* if mem_gid = MPIR_HWTOPO_GID_ROOT and mem_type is non-default (DDR)
         * it can mean either that the requested memory type is not available
         * in the system or the requested memory type is available but there
         * are many devices of such type and the process requesting memory is
         * not bound to any of them. Regardless the reason we do not fall back
         * to the default allocation and return a NULL pointer to the upper layer
         * instead. */
        p = NULL;
    } else {
        /* if mem_gid = MPIR_HWTOPO_GID_ROOT and mem_type is default (DDR) it
         * means that we cannot bind memory to a single device explicitly. In
         * this case we still allocate memory and leave the binding to the OS
         * (first touch policy in Linux). */
        p = MPL_malloc(size, MPL_MEM_USER);
    }
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIG_MPI_ALLOC_MEM);
    return p;
}
160 
MPIDIG_mpi_free_mem(void * ptr)161 int MPIDIG_mpi_free_mem(void *ptr)
162 {
163     int mpi_errno = MPI_SUCCESS;
164     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDIG_MPI_FREE_MEM);
165     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDIG_MPI_FREE_MEM);
166     mem_node_t *el = NULL;
167 
168     /* scan memory list for allocations */
169     LL_FOREACH(mem_list_head, el) {
170         if (el->ptr == ptr) {
171             LL_DELETE(mem_list_head, mem_list_tail, el);
172             break;
173         }
174     }
175 
176     if (el) {
177         MPL_munmap(el->ptr, el->size, MPL_MEM_USER);
178         MPL_free(el);
179     } else {
180         MPL_free(ptr);
181     }
182     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDIG_MPI_FREE_MEM);
183     return mpi_errno;
184 }
185