1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2007-2016 Los Alamos National Security, LLC.  All rights
4  *                         reserved.
5  * Copyright (c) 2015      Research Organization for Information Science
6  *                         and Technology (RIST). All rights reserved.
7  * Copyright (c) 2016 Cisco Systems, Inc.  All rights reserved.
8  * $COPYRIGHT$
9  *
10  * Additional copyrights may follow
11  *
12  * $HEADER$
13  */
14 
15 #include "ompi_config.h"
16 
17 #ifdef HAVE_ALLOCA_H
18 #include <alloca.h>
19 #endif
20 
21 #include "osc_rdma_comm.h"
22 
23 #include "ompi/mca/bml/base/base.h"
24 
/* Map a node id to the rank of that node's leader. The expansion only reads the len field of
 * peer_data (a pointer to the node's region entry), which is where the node leader rank is
 * stored. The module and node_id arguments are accepted but do not appear in the expansion. */
25 #define NODE_ID_TO_RANK(module, peer_data, node_id) ((int)(peer_data)->len)
26 
27 /**
28  * @brief find the btl endpoint for a process
29  *
30  * @param[in] module         osc rdma module
31  * @param[in] peer_id        process rank in the module communicator
32  *
33  * @returns NULL on error
34  * @returns btl endpoint on success
35  */
ompi_osc_rdma_peer_btl_endpoint(struct ompi_osc_rdma_module_t * module,int peer_id)36 struct mca_btl_base_endpoint_t *ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, int peer_id)
37 {
38     ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id);
39     mca_bml_base_endpoint_t *bml_endpoint;
40     int num_btls;
41 
42     /* for not just use the bml to get the btl endpoint */
43     bml_endpoint = mca_bml_base_get_endpoint (proc);
44 
45     num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
46 
47     for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) {
48         if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btl) {
49             return bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint;
50         }
51     }
52 
53     /* very unlikely. if this happened the btl section process is broken */
54     return NULL;
55 }
56 
/**
 * @brief allocate a new (not yet configured) peer object for a rank
 *
 * @param[in]  module         osc rdma module
 * @param[in]  peer_id        process rank in the module communicator
 * @param[out] peer_out       new peer object on success, NULL on any error
 *
 * @returns OMPI_SUCCESS on success
 * @returns OMPI_ERR_UNREACH if no btl endpoint was found for the rank
 * @returns OMPI_ERR_OUT_OF_RESOURCE if the peer object could not be allocated
 *
 * The type of the allocated object depends on the window: dynamic windows get a dynamic peer,
 * windows with both same_size and same_disp_unit set get the smaller basic peer, and every
 * other window gets an extended peer. Only the data endpoint and rank are filled in here.
 */
int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out)
{
    struct mca_btl_base_endpoint_t *endpoint;
    ompi_osc_rdma_peer_t *peer;

    *peer_out = NULL;

    /* a missing endpoint is tolerated only when the peer is this process itself and the selected
     * btl sets MCA_BTL_ATOMIC_SUPPORTS_GLOB in its atomic flags */
    endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_id);
    if (OPAL_UNLIKELY(NULL == endpoint && !((module->selected_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) &&
                                            peer_id == ompi_comm_rank (module->comm)))) {
        return OMPI_ERR_UNREACH;
    }

    if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_dynamic_t);
    } else if (module->same_size && module->same_disp_unit) {
        /* use a smaller peer object when same_size and same_disp_unit are set */
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_basic_t);
    } else {
        peer = (ompi_osc_rdma_peer_t *) OBJ_NEW(ompi_osc_rdma_peer_extended_t);
    }

    /* the object allocation can fail. do not touch the peer in that case */
    if (OPAL_UNLIKELY(NULL == peer)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    peer->data_endpoint = endpoint;
    peer->rank          = peer_id;

    *peer_out = peer;

    return OMPI_SUCCESS;
}
85 
86 /**
87  * @brief finish initializing a peer object
88  *
89  * @param[in] module         osc rdma module
90  * @param[in] peer           peer object to set up
91  *
92  * This function reads the registration handle and state pointer from the peer that holds that data. If necessary
93  * it will then ready information about the peer from its state data structure. This information includes the
94  * displacement unit, base pointer, window size, and registation handle (if applicable).
95  */
ompi_osc_rdma_peer_setup(ompi_osc_rdma_module_t * module,ompi_osc_rdma_peer_t * peer)96 static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer)
97 {
98     ompi_osc_rdma_peer_extended_t *ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;
99     uint64_t peer_data_size;
100     uint64_t peer_data_offset, array_pointer;
101     struct mca_btl_base_endpoint_t *array_endpoint;
102     ompi_osc_rdma_region_t *array_peer_data, *node_peer_data;
103     ompi_osc_rdma_rank_data_t rank_data;
104     int registration_handle_size = 0;
105     int node_id, node_rank, array_index;
106     int ret, disp_unit, comm_size;
107     char *peer_data;
108 
109     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "configuring peer for rank %d", peer->rank);
110 
111     if (module->selected_btl->btl_register_mem) {
112         registration_handle_size = module->selected_btl->btl_registration_handle_size;
113     }
114 
115     comm_size = ompi_comm_size (module->comm);
116 
117     /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
118      * calculates the node and offset the mapping can be found. once the mapping has been read the state
119      * part of the peer structure can be initialized. */
120     node_id = (peer->rank * module->node_count) / comm_size;
121     array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
122 
123     /* the node leader rank is stored in the length field */
124     node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
125     array_index = peer->rank % ((comm_size + module->node_count - 1) / module->node_count);
126 
127     array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
128 
129     /* lookup the btl endpoint needed to retrieve the mapping */
130     array_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, node_rank);
131     if (OPAL_UNLIKELY(NULL == array_endpoint)) {
132         return OMPI_ERR_UNREACH;
133     }
134 
135     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
136                      ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
137 
138     ret = ompi_osc_get_data_blocking (module, array_endpoint, array_pointer, (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
139                                       &rank_data, sizeof (rank_data));
140     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
141         return ret;
142     }
143 
144     /* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on
145      * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
146      * of this by re-using the endpoint and pointer stored in the node_comm_info array. */
147     node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
148 
149     peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
150 
151     if (registration_handle_size) {
152         peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
153     }
154 
155     peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id));
156     if (OPAL_UNLIKELY(NULL == peer->state_endpoint)) {
157         return OPAL_ERR_UNREACH;
158     }
159 
160     /* nothing more to do for dynamic memory windows */
161     if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
162         return OMPI_SUCCESS;
163     }
164 
165     /* read window data from the target rank */
166     if (module->same_disp_unit) {
167         /* do not bother reading the displacement unit as it is already known */
168         peer_data_offset = offsetof (ompi_osc_rdma_state_t, regions);
169     } else {
170         peer_data_offset = offsetof (ompi_osc_rdma_state_t, disp_unit);
171     }
172 
173     peer_data_size = module->state_size - peer_data_offset;
174     peer_data = alloca (peer_data_size);
175 
176     /* read window data from the end of the target's state structure */
177     ret = ompi_osc_get_data_blocking (module, peer->state_endpoint, peer->state + peer_data_offset, peer->state_handle,
178                                       peer_data, peer_data_size);
179     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
180         return ret;
181     }
182 
183     if (!module->same_disp_unit) {
184         /* unpack displacement */
185         memcpy (&ex_peer->disp_unit, peer_data, sizeof (ex_peer->disp_unit));
186         peer_data += offsetof (ompi_osc_rdma_state_t, regions) - offsetof (ompi_osc_rdma_state_t, disp_unit);
187         disp_unit = ex_peer->disp_unit;
188     } else {
189         disp_unit = module->disp_unit;
190     }
191 
192     ompi_osc_rdma_region_t *base_region = (ompi_osc_rdma_region_t *) peer_data;
193 
194     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "peer %d: remote base region: 0x%" PRIx64 ", size: %" PRId64
195                      ", flags: 0x%x, disp_unit: %d", peer->rank, base_region->base, base_region->len,
196                      peer->flags, disp_unit);
197     (void)disp_unit;  // silence compiler warning
198 
199     if (ompi_osc_rdma_peer_local_base (peer)) {
200         /* for now we store the local address in the standard place. do no overwrite it */
201         return OMPI_SUCCESS;
202     }
203 
204     ex_peer->super.base = base_region->base;
205 
206     /* save size and base */
207     if (!module->same_size) {
208         ex_peer->size = base_region->len;
209     }
210 
211     if (base_region->len) {
212         if (registration_handle_size) {
213             ex_peer->super.base_handle = malloc (registration_handle_size);
214             if (OPAL_UNLIKELY(NULL == ex_peer->super.base_handle)) {
215                 return OMPI_ERR_OUT_OF_RESOURCE;
216             }
217 
218             peer->flags |= OMPI_OSC_RDMA_PEER_BASE_FREE;
219 
220             memcpy (ex_peer->super.base_handle, base_region->btl_handle_data, registration_handle_size);
221         }
222 
223         if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
224             ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint;
225         }
226     }
227 
228     return OMPI_SUCCESS;
229 }
230 
231 /**
232  * @brief lookup (or allocate) a peer for a rank (internal)
233  *
234  * @param[in] module         osc rdma module
235  * @param[in] peer_id        rank of remote peer (in module communicator)
236  *
237  * @returns peer object on success
238  * @returns NULL on error
239  *
240  * This is an internal function for looking up or allocating a peer object for a window rank. This
241  * function requires the peer lock to be held and is only expected to be called from itself or
242  * the ompi_osc_rdma_peer_lookup() helper function.
243  */
ompi_osc_rdma_peer_lookup_internal(struct ompi_osc_rdma_module_t * module,int peer_id)244 static struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup_internal (struct ompi_osc_rdma_module_t *module, int peer_id)
245 {
246     ompi_osc_rdma_peer_t *peer;
247     int ret;
248 
249     OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "looking up peer data for rank %d", peer_id);
250 
251     peer = ompi_osc_module_get_peer (module, peer_id);
252     if (NULL != peer) {
253         return peer;
254     }
255 
256     ret = ompi_osc_rdma_new_peer (module, peer_id, &peer);
257     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
258         return NULL;
259     }
260 
261     ret = ompi_osc_rdma_peer_setup (module, peer);
262     if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
263         OBJ_RELEASE(peer);
264         return NULL;
265     }
266 
267     ret = ompi_osc_module_add_peer (module, peer);
268     if (OPAL_SUCCESS != ret) {
269         /* out of memory */
270         OBJ_RELEASE(peer);
271         return NULL;
272     }
273 
274     /* ensure the peer hash is updated before we drop the lock */
275     opal_atomic_wmb ();
276 
277     return peer;
278 }
279 
ompi_osc_rdma_peer_lookup(struct ompi_osc_rdma_module_t * module,int peer_id)280 struct ompi_osc_rdma_peer_t *ompi_osc_rdma_peer_lookup (struct ompi_osc_rdma_module_t *module, int peer_id)
281 {
282     struct ompi_osc_rdma_peer_t *peer;
283 
284     opal_mutex_lock (&module->peer_lock);
285     peer = ompi_osc_rdma_peer_lookup_internal (module, peer_id);
286     opal_mutex_unlock (&module->peer_lock);
287 
288     return peer;
289 }
290 
291 
292 /******* peer objects *******/
293 
/* constructor for the common peer object: zero every byte of the object that follows the
 * parent-class portion (peer->super) */
static void ompi_osc_rdma_peer_construct (ompi_osc_rdma_peer_t *peer)
{
    char *own_fields = (char *) peer + sizeof (peer->super);
    size_t own_length = sizeof (*peer) - sizeof (peer->super);

    memset (own_fields, 0, own_length);
}

/* destructor for the common peer object: release the state handle, but only if there is one
 * and the peer's flags mark it with OMPI_OSC_RDMA_PEER_STATE_FREE */
static void ompi_osc_rdma_peer_destruct (ompi_osc_rdma_peer_t *peer)
{
    if (NULL == peer->state_handle || !(peer->flags & OMPI_OSC_RDMA_PEER_STATE_FREE)) {
        return;
    }

    free (peer->state_handle);
}

OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_t,
                   opal_list_item_t,
                   ompi_osc_rdma_peer_construct,
                   ompi_osc_rdma_peer_destruct);
309 
ompi_osc_rdma_peer_basic_construct(ompi_osc_rdma_peer_basic_t * peer)310 static void ompi_osc_rdma_peer_basic_construct (ompi_osc_rdma_peer_basic_t *peer)
311 {
312     memset ((char *) peer + sizeof (peer->super), 0, sizeof (*peer) - sizeof (peer->super));
313 }
314 
ompi_osc_rdma_peer_basic_destruct(ompi_osc_rdma_peer_basic_t * peer)315 static void ompi_osc_rdma_peer_basic_destruct (ompi_osc_rdma_peer_basic_t *peer)
316 {
317     if (peer->base_handle && (peer->super.flags & OMPI_OSC_RDMA_PEER_BASE_FREE)) {
318         free (peer->base_handle);
319     }
320 }
321 
322 OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_basic_t, ompi_osc_rdma_peer_t,
323                    ompi_osc_rdma_peer_basic_construct,
324                    ompi_osc_rdma_peer_basic_destruct);
325 
326 OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_extended_t, ompi_osc_rdma_peer_basic_t,
327                    NULL, NULL);
328 
/* constructor for the dynamic peer object: zero the bytes that follow the parent-class
 * portion (peer->super), which has its own constructor */
static void ompi_osc_rdma_peer_dynamic_construct (ompi_osc_rdma_peer_dynamic_t *peer)
{
    char *own_fields = (char *) peer + sizeof (peer->super);
    size_t own_length = sizeof (*peer) - sizeof (peer->super);

    memset (own_fields, 0, own_length);
}

/* destructor for the dynamic peer object: release the regions array if one is attached */
static void ompi_osc_rdma_peer_dynamic_destruct (ompi_osc_rdma_peer_dynamic_t *peer)
{
    if (NULL == peer->regions) {
        return;
    }

    free (peer->regions);
}

OBJ_CLASS_INSTANCE(ompi_osc_rdma_peer_dynamic_t,
                   ompi_osc_rdma_peer_t,
                   ompi_osc_rdma_peer_dynamic_construct,
                   ompi_osc_rdma_peer_dynamic_destruct);
344