/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_endpoint.h"
#include "btl_ugni_prepare.h"
#include "btl_ugni_smsg.h"

static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
                   mca_btl_base_descriptor_t *des);

static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl);

static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          struct mca_btl_base_endpoint_t *endpoint,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags);

static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
                           size_t size, uint32_t flags);

static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);
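
/* Module template. mca_btl_ugni_module_init() copies this structure
 * (including the function pointers that make up the BTL interface) into
 * each new module instance before filling in per-module state. */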
mca_btl_ugni_module_t mca_btl_ugni_module = {
    .super = {
        .btl_component = &mca_btl_ugni_component.super,
        .btl_add_procs = mca_btl_ugni_add_procs,
        .btl_del_procs = mca_btl_ugni_del_procs,
        .btl_finalize = mca_btl_ugni_module_finalize,
        .btl_alloc = mca_btl_ugni_alloc,
        .btl_free = mca_btl_ugni_free,
        .btl_prepare_src = mca_btl_ugni_prepare_src,
        .btl_send = mca_btl_ugni_send,
        .btl_sendi = mca_btl_ugni_sendi,
        .btl_put = mca_btl_ugni_put,
        .btl_get = mca_btl_ugni_get,
        .btl_register_mem = mca_btl_ugni_register_mem,
        .btl_deregister_mem = mca_btl_ugni_deregister_mem,
        .btl_atomic_op = mca_btl_ugni_aop,
        .btl_atomic_fop = mca_btl_ugni_afop,
        .btl_atomic_cswap = mca_btl_ugni_acswap,
        .btl_flush = mca_btl_ugni_flush,
    }
};
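
/* Timer callback: drain any pending connection datagrams on the first
 * device, then re-arm the timer so connection setup makes progress even
 * when the application is not otherwise driving the BTL. The foo/bar
 * arguments are required by the event callback signature and are unused. */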
static void mca_btl_ugni_datagram_event (int foo, short bar, void *arg)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) arg;
    mca_btl_ugni_device_t *device = ugni_module->devices;
    struct timeval tv = {.tv_sec = 0, .tv_usec = MCA_BTL_UGNI_CONNECT_USEC};

    mca_btl_ugni_progress_datagram (device);

    opal_event_evtimer_add (&ugni_module->connection_event, &tv);
}
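
/* One-time initialization of a uGNI BTL module: copy in the module template,
 * construct the internal lists/locks/tables, bring up the virtual device
 * handles, and post a wildcard datagram so peers can connect. Intended to be
 * called once per module before it is handed to the upper layer. */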
int
mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
{
    int rc;

    BTL_VERBOSE(("binding module %p to device 0", (void *) ugni_module));

    /* copy module defaults (and function pointers) */
    memmove (ugni_module, &mca_btl_ugni_module, sizeof (mca_btl_ugni_module));

    ugni_module->initialized = false;
    ugni_module->nlocal_procs = 0;
    ugni_module->active_datagrams = 0;
    ugni_module->active_rdma_count = 0;

    opal_event_evtimer_set (opal_sync_event_base, &ugni_module->connection_event,
                            mca_btl_ugni_datagram_event, ugni_module);

    OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t);

    OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->eager_get_pending_lock, opal_mutex_t);

    for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {
        OBJ_CONSTRUCT(ugni_module->frags_lists + i, opal_free_list_t);
    }

    OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t);
    OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
    OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
    OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);

    /* set up virtual device handles */
    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        rc = mca_btl_ugni_device_init (ugni_module->devices + i, i);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            BTL_VERBOSE(("error initializing uGNI device handle"));
            return rc;
        }
    }

    /* create wildcard endpoint on the first device to listen for connections.
     * there is no need to bind this endpoint. we are single threaded here so
     * there is no need for a device lock. */
    rc = GNI_EpCreate (ugni_module->devices[0].dev_handle, NULL,
                       &ugni_module->wildcard_ep);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
        BTL_ERROR(("error creating wildcard ugni endpoint"));
        return mca_btl_rc_ugni_to_opal (rc);
    }

    /* post wildcard datagram */
    rc = mca_btl_ugni_wildcard_ep_post (ugni_module);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        BTL_ERROR(("error posting wildcard datagram"));
        return rc;
    }

    return OPAL_SUCCESS;
}
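
/* Tear down a module in roughly the reverse order of mca_btl_ugni_module_init():
 * release endpoints, stop the progress thread and connection timer, destroy
 * the CQs and the wildcard endpoint, then destruct the internal
 * lists/locks/tables and virtual devices. Safe to call on a module that never
 * finished initializing; the uGNI teardown is skipped unless the module was
 * marked initialized. */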
static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
    mca_btl_base_endpoint_t *ep;
    uint64_t key;
    int rc;

    if (ugni_module->initialized) {
        /* close all open connections and release endpoints */
        OPAL_HASH_TABLE_FOREACH(key, uint64, ep, &ugni_module->id_to_endpoint) {
            if (NULL != ep) {
                mca_btl_ugni_release_ep (ep);
            }
        }

        if (mca_btl_ugni_component.progress_thread_enabled) {
            mca_btl_ugni_kill_progress_thread();
        }

        /* destroy all cqs */
        rc = GNI_CqDestroy (ugni_module->smsg_remote_cq);
        if (GNI_RC_SUCCESS != rc) {
            BTL_ERROR(("error tearing down RX SMSG CQ - %s", gni_err_str[rc]));
        }

        if (mca_btl_ugni_component.progress_thread_enabled) {
            rc = GNI_CqDestroy (ugni_module->smsg_remote_irq_cq);
            if (GNI_RC_SUCCESS != rc) {
                BTL_ERROR(("error tearing down remote SMSG CQ - %s", gni_err_str[rc]));
            }
        }

        /* cancel wildcard post */
        rc = GNI_EpPostDataCancelById (ugni_module->wildcard_ep,
                                       MCA_BTL_UGNI_CONNECT_WILDCARD_ID |
                                       OPAL_PROC_MY_NAME.vpid);
        if (GNI_RC_SUCCESS != rc) {
            BTL_VERBOSE(("btl/ugni error cancelling wildcard post"));
        }

        /* tear down wildcard endpoint */
        rc = GNI_EpDestroy (ugni_module->wildcard_ep);
        if (GNI_RC_SUCCESS != rc) {
            BTL_VERBOSE(("btl/ugni error destroying endpoint - %s", gni_err_str[rc]));
        }

        opal_event_del (&ugni_module->connection_event);
    }

    for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {
        OBJ_DESTRUCT(ugni_module->frags_lists + i);
    }

    OBJ_DESTRUCT(&ugni_module->ep_wait_list);
    OBJ_DESTRUCT(&ugni_module->smsg_mboxes);
    OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb);
    OBJ_DESTRUCT(&ugni_module->id_to_endpoint);
    OBJ_DESTRUCT(&ugni_module->endpoint_lock);
    OBJ_DESTRUCT(&ugni_module->endpoints);

    OBJ_DESTRUCT(&ugni_module->eager_get_pending);
    OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);

    if (ugni_module->rcache) {
        mca_rcache_base_module_destroy (ugni_module->rcache);
    }

    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        mca_btl_ugni_device_fini (ugni_module->devices + i);
    }

    ugni_module->initialized = false;

    return OPAL_SUCCESS;
}
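
/* Allocate a send fragment for up to btl_eager_limit bytes. Messages that fit
 * in an SMSG message take a fragment from the SMSG free list; larger messages
 * take an eager-send fragment whose registered buffer lets the peer fetch the
 * payload with an RDMA get. Returns NULL if no fragment is available or the
 * endpoint's wait list is already long (back-pressure). */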
mca_btl_base_descriptor_t *
mca_btl_ugni_alloc (struct mca_btl_base_module_t *btl,
                    struct mca_btl_base_endpoint_t *endpoint,
                    uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_ugni_base_frag_t *frag = NULL;

    /* do not allocate a fragment unless the wait list is relatively small. this
     * reduces the potential for resource exhaustion. note the wait list only exists
     * because we have no way to notify the sender that credits are available. */
    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) {
        return NULL;
    }

    if (size <= mca_btl_ugni_component.smsg_max_data) {
        frag = mca_btl_ugni_frag_alloc_smsg (endpoint);
    } else if (size <= btl->btl_eager_limit) {
        frag = mca_btl_ugni_frag_alloc_eager_send (endpoint);
    }

    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    BTL_VERBOSE(("btl/ugni_module allocated frag of size: %u, flags: %x. frag = %p",
                 (unsigned int) size, flags, (void *) frag));

    frag->base.des_flags = flags;
    frag->base.order = order;
    /* expose only the data segment to the caller; segments[0] carries no
     * payload for allocated fragments */
    frag->base.des_segments = &frag->segments[1];
    frag->base.des_segment_count = 1;

    frag->segments[0].seg_addr.pval = NULL;
    frag->segments[0].seg_len = 0;
    frag->segments[1].seg_addr.pval = frag->base.super.ptr;
    frag->segments[1].seg_len = size;

    frag->flags = MCA_BTL_UGNI_FRAG_BUFFERED;
    if (size > mca_btl_ugni_component.smsg_max_data) {
        mca_btl_ugni_reg_t *registration;

        frag->hdr_size = sizeof (frag->hdr.eager);
        frag->flags |= MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;

        /* pass the buffer's registration handle so the peer can get the payload */
        registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;
        frag->hdr.eager.memory_handle = registration->handle;
    } else {
        frag->hdr_size = sizeof (frag->hdr.send);
    }

    return &frag->base;
}
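
/* Return a fragment allocated by mca_btl_ugni_alloc() or
 * mca_btl_ugni_prepare_src() to its free list. */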
static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
                   mca_btl_base_descriptor_t *des)
{
    return mca_btl_ugni_frag_return ((mca_btl_ugni_base_frag_t *) des);
}
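
/* Pack (if necessary) and describe outgoing data for a send. Applies the same
 * wait-list back-pressure check as mca_btl_ugni_alloc() before deferring to
 * the send-side preparation code. */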
static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          mca_btl_base_endpoint_t *endpoint,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags)
{
    /* do not allocate a fragment unless the wait list is relatively small. this
     * reduces the potential for resource exhaustion. note the wait list only exists
     * because we have no way to notify the sender that credits are available. */
    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) {
        return NULL;
    }

    return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
                                          order, reserve, size, flags);
}
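
/* Register a memory region for RDMA through the registration cache. Returns a
 * handle embedded in the cached registration, or NULL on failure (for example,
 * when registration resources are exhausted). */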
static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
                           size_t size, uint32_t flags)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
    mca_btl_ugni_reg_t *reg;
    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
    int rc;

    rc = ugni_module->rcache->rcache_register (ugni_module->rcache, base, size, 0, access_flags,
                                               (mca_rcache_base_registration_t **) &reg);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return NULL;
    }

    return &reg->handle;
}
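
/* Release a registration obtained from mca_btl_ugni_register_mem(). The
 * caller only holds a pointer to the handle member, so the containing
 * registration is recovered with the usual container_of idiom:
 *   reg = (mca_btl_ugni_reg_t *) ((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle)) */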
static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
    mca_btl_ugni_reg_t *reg =
        (mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle));

    (void) ugni_module->rcache->rcache_deregister (ugni_module->rcache, &reg->base);

    return OPAL_SUCCESS;
}
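
/* Report an unrecoverable CQ failure. Distinguishes a CQ overrun (no useful
 * event data) from a plain error return and translates the uGNI return code
 * into an OPAL error code for the caller. */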
int mca_btl_ugni_event_fatal_error (gni_return_t grc, gni_cq_entry_t event_data)
{
    /* combined error check for get event and get completed. we might miss exactly
     * what happened but it is unrecoverable anyway. fwiw, this error path has
     * never been seen in production. */
    if (GNI_CQ_OVERRUN(event_data)) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("CQ overrun detected in RDMA event data. cannot recover"));
    } else {
        BTL_ERROR(("Error in GNI_GetCompleted %s", gni_err_str[grc]));
    }

    return mca_btl_rc_ugni_to_opal (grc);
}
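
/* Handle a CQ error event for an RDMA descriptor. Recoverable errors are
 * retried by reposting the descriptor, up to rdma_max_retries attempts;
 * anything else is logged with the CQ error string and reported as a
 * failure. */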
int mca_btl_ugni_device_handle_event_error (mca_btl_ugni_rdma_desc_t *rdma_desc, gni_cq_entry_t event_data)
{
    mca_btl_ugni_device_t *device = rdma_desc->device;
    uint32_t recoverable = 1;

    (void) GNI_CqErrorRecoverable (event_data, &recoverable);

    if (OPAL_UNLIKELY(++rdma_desc->tries >= mca_btl_ugni_component.rdma_max_retries || !recoverable)) {
        char char_buffer[1024];
        GNI_CqErrorStr (event_data, char_buffer, sizeof (char_buffer));

        BTL_ERROR(("giving up on descriptor %p, recoverable %d: %s", (void *) rdma_desc, recoverable, char_buffer));

        return OPAL_ERROR;
    }

    return _mca_btl_ugni_repost_rdma_desc_device (device, rdma_desc);
}