/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      Intel, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_endpoint.h"
#include "btl_ugni_prepare.h"
#include "btl_ugni_smsg.h"

static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
                   mca_btl_base_descriptor_t *des);

static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t* btl);

static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          struct mca_btl_base_endpoint_t *endpoint,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags);

static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
                           size_t size, uint32_t flags);

static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle);

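/* default module template. mca_btl_ugni_module_init() copies this structure
 * (including the function pointer table) into each newly created module. */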
mca_btl_ugni_module_t mca_btl_ugni_module = {
    .super = {
        .btl_component      = &mca_btl_ugni_component.super,
        .btl_add_procs      = mca_btl_ugni_add_procs,
        .btl_del_procs      = mca_btl_ugni_del_procs,
        .btl_finalize       = mca_btl_ugni_module_finalize,
        .btl_alloc          = mca_btl_ugni_alloc,
        .btl_free           = mca_btl_ugni_free,
        .btl_prepare_src    = mca_btl_ugni_prepare_src,
        .btl_send           = mca_btl_ugni_send,
        .btl_sendi          = mca_btl_ugni_sendi,
        .btl_put            = mca_btl_ugni_put,
        .btl_get            = mca_btl_ugni_get,
        .btl_register_mem   = mca_btl_ugni_register_mem,
        .btl_deregister_mem = mca_btl_ugni_deregister_mem,
        .btl_atomic_op      = mca_btl_ugni_aop,
        .btl_atomic_fop     = mca_btl_ugni_afop,
        .btl_atomic_cswap   = mca_btl_ugni_acswap,
        .btl_flush          = mca_btl_ugni_flush,
    }
};

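/* timer callback used to poll for datagram (connection) completions on the
 * first virtual device and then re-arm the connection timer. */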
static void mca_btl_ugni_datagram_event (int foo, short bar, void *arg)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) arg;
    mca_btl_ugni_device_t *device = ugni_module->devices;
    struct timeval tv = {.tv_sec = 0, .tv_usec = MCA_BTL_UGNI_CONNECT_USEC};

    mca_btl_ugni_progress_datagram (device);

    opal_event_evtimer_add (&ugni_module->connection_event, &tv);
}

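/* one-time setup for a uGNI BTL module: copy in the module defaults, construct
 * the module's lists and locks, initialize the virtual device handles, and post
 * the wildcard datagram used to accept incoming connections. */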
int
mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
{
    int rc;

    BTL_VERBOSE(("binding module %p to device 0", (void *) ugni_module));

    /* copy module defaults (and function pointers) */
    memmove (ugni_module, &mca_btl_ugni_module, sizeof (mca_btl_ugni_module));

    ugni_module->initialized = false;
    ugni_module->nlocal_procs = 0;
    ugni_module->active_datagrams = 0;
    ugni_module->active_rdma_count = 0;

    opal_event_evtimer_set (opal_sync_event_base, &ugni_module->connection_event,
                            mca_btl_ugni_datagram_event, ugni_module);

    OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t);

    OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->eager_get_pending_lock, opal_mutex_t);

    for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {
        OBJ_CONSTRUCT(ugni_module->frags_lists + i, opal_free_list_t);
    }

    OBJ_CONSTRUCT(&ugni_module->pending_smsg_frags_bb, opal_pointer_array_t);
    OBJ_CONSTRUCT(&ugni_module->ep_wait_list_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ugni_module->ep_wait_list, opal_list_t);
    OBJ_CONSTRUCT(&ugni_module->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ugni_module->endpoints, opal_pointer_array_t);
    OBJ_CONSTRUCT(&ugni_module->id_to_endpoint, opal_hash_table_t);
    OBJ_CONSTRUCT(&ugni_module->smsg_mboxes, opal_free_list_t);
    OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);

    /* set up virtual device handles */
    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        rc = mca_btl_ugni_device_init (ugni_module->devices + i, i);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            BTL_VERBOSE(("error initializing uGNI device handle"));
            return rc;
        }
    }

    /* create wildcard endpoint on first device to listen for connections.
     * there is no need to bind this endpoint. We are single threaded
     * here so there is no need for a device lock. */
    rc = GNI_EpCreate (ugni_module->devices[0].dev_handle, NULL,
                       &ugni_module->wildcard_ep);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) {
        BTL_ERROR(("error creating wildcard ugni endpoint"));
        return mca_btl_rc_ugni_to_opal (rc);
    }

    /* post wildcard datagram */
    rc = mca_btl_ugni_wildcard_ep_post (ugni_module);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        BTL_ERROR(("error posting wildcard datagram"));
        return rc;
    }

    return OPAL_SUCCESS;
}

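/* tear down a uGNI BTL module. GNI resources (CQs, the wildcard endpoint, and
 * any open connections) are released only if the module finished initializing;
 * the constructed lists and locks are destroyed unconditionally. */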
static int
mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *)btl;
    mca_btl_base_endpoint_t *ep;
    uint64_t key;
    int rc;

    if (ugni_module->initialized) {
        /* close all open connections and release endpoints */
        OPAL_HASH_TABLE_FOREACH(key, uint64, ep, &ugni_module->id_to_endpoint) {
            if (NULL != ep) {
                mca_btl_ugni_release_ep (ep);
            }
        }

        if (mca_btl_ugni_component.progress_thread_enabled) {
            mca_btl_ugni_kill_progress_thread();
        }

        /* destroy all cqs */
        rc = GNI_CqDestroy (ugni_module->smsg_remote_cq);
        if (GNI_RC_SUCCESS != rc) {
            BTL_ERROR(("error tearing down RX SMSG CQ - %s", gni_err_str[rc]));
        }

        if (mca_btl_ugni_component.progress_thread_enabled) {
            rc = GNI_CqDestroy (ugni_module->smsg_remote_irq_cq);
            if (GNI_RC_SUCCESS != rc) {
                BTL_ERROR(("error tearing down remote SMSG CQ - %s", gni_err_str[rc]));
            }
        }

        /* cancel wildcard post */
        rc = GNI_EpPostDataCancelById (ugni_module->wildcard_ep,
                                       MCA_BTL_UGNI_CONNECT_WILDCARD_ID |
                                       OPAL_PROC_MY_NAME.vpid);
        if (GNI_RC_SUCCESS != rc) {
            BTL_VERBOSE(("btl/ugni error cancelling wildcard post"));
        }

        /* tear down wildcard endpoint */
        rc = GNI_EpDestroy (ugni_module->wildcard_ep);
        if (GNI_RC_SUCCESS != rc) {
            BTL_VERBOSE(("btl/ugni error destroying endpoint - %s", gni_err_str[rc]));
        }

        opal_event_del (&ugni_module->connection_event);
    }

    for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {
        OBJ_DESTRUCT(ugni_module->frags_lists + i);
    }

    OBJ_DESTRUCT(&ugni_module->ep_wait_list);
    OBJ_DESTRUCT(&ugni_module->smsg_mboxes);
    OBJ_DESTRUCT(&ugni_module->pending_smsg_frags_bb);
    OBJ_DESTRUCT(&ugni_module->id_to_endpoint);
    OBJ_DESTRUCT(&ugni_module->endpoint_lock);
    OBJ_DESTRUCT(&ugni_module->endpoints);

    OBJ_DESTRUCT(&ugni_module->eager_get_pending);
    OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);

    if (ugni_module->rcache) {
        mca_rcache_base_module_destroy (ugni_module->rcache);
    }

    for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
        mca_btl_ugni_device_fini (ugni_module->devices + i);
    }

    ugni_module->initialized = false;

    return OPAL_SUCCESS;
}


mca_btl_base_descriptor_t *
mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,
                   struct mca_btl_base_endpoint_t *endpoint,
                   uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_ugni_base_frag_t *frag = NULL;

    /* do not allocate a fragment unless the wait list is relatively small. this
     * reduces the potential for resource exhaustion. note the wait list only exists
     * because we have no way to notify the sender that credits are available. */
    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) {
        return NULL;
    }

    if (size <= mca_btl_ugni_component.smsg_max_data) {
        frag = mca_btl_ugni_frag_alloc_smsg (endpoint);
    } else if (size <= btl->btl_eager_limit) {
        frag = mca_btl_ugni_frag_alloc_eager_send (endpoint);
    }

    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    BTL_VERBOSE(("btl/ugni_module allocated frag of size: %u, flags: %x. frag = %p",
                 (unsigned int)size, flags, (void *) frag));

    frag->base.des_flags = flags;
    frag->base.order = order;
    frag->base.des_segments = &frag->segments[1];
    frag->base.des_segment_count = 1;

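    /* segment 0 is left empty (btl_alloc'd fragments carry no reserved header
     * data); segment 1 describes the buffered payload and is the only segment
     * exposed to the caller via des_segments/des_segment_count above. */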
    frag->segments[0].seg_addr.pval = NULL;
    frag->segments[0].seg_len       = 0;
    frag->segments[1].seg_addr.pval = frag->base.super.ptr;
    frag->segments[1].seg_len       = size;

    frag->flags = MCA_BTL_UGNI_FRAG_BUFFERED;
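    /* fragments larger than the SMSG limit use the eager protocol: the send
     * buffer's registration handle is placed in the eager header so the remote
     * side can fetch the payload (hence the eager_get_pending list above). */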
    if (size > mca_btl_ugni_component.smsg_max_data) {
        mca_btl_ugni_reg_t *registration;

        frag->hdr_size = sizeof (frag->hdr.eager);
        frag->flags    |= MCA_BTL_UGNI_FRAG_EAGER | MCA_BTL_UGNI_FRAG_IGNORE;

        registration = (mca_btl_ugni_reg_t *) frag->base.super.registration;

        frag->hdr.eager.memory_handle = registration->handle;
    } else {
        frag->hdr_size = sizeof (frag->hdr.send);
    }

    return &frag->base;
}

static int
mca_btl_ugni_free (struct mca_btl_base_module_t *btl,
                   mca_btl_base_descriptor_t *des)
{
    return mca_btl_ugni_frag_return ((mca_btl_ugni_base_frag_t *) des);
}

static struct mca_btl_base_descriptor_t *
mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl,
                          mca_btl_base_endpoint_t *endpoint,
                          struct opal_convertor_t *convertor,
                          uint8_t order, size_t reserve, size_t *size,
                          uint32_t flags)
{
    /* do not allocate a fragment unless the wait list is relatively small. this
     * reduces the potential for resource exhaustion. note the wait list only exists
     * because we have no way to notify the sender that credits are available. */
    if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list) > 32)) {
        return NULL;
    }

    return mca_btl_ugni_prepare_src_send (btl, endpoint, convertor,
                                          order, reserve, size, flags);
}

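/* register a memory region with the rcache and hand back the registration
 * handle embedded in the resulting mca_btl_ugni_reg_t. the matching
 * deregister function below recovers the containing registration from
 * that handle. */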
static mca_btl_base_registration_handle_t *
mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base,
                           size_t size, uint32_t flags)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
    mca_btl_ugni_reg_t *reg;
    int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY;
    int rc;

    rc = ugni_module->rcache->rcache_register (ugni_module->rcache, base, size, 0, access_flags,
                                               (mca_rcache_base_registration_t **) &reg);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return NULL;
    }

    return &reg->handle;
}

static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
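    /* the handle returned by register_mem lives inside an mca_btl_ugni_reg_t;
     * use offsetof to recover the containing registration before deregistering. */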
    mca_btl_ugni_reg_t *reg =
        (mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle));

    (void) ugni_module->rcache->rcache_deregister (ugni_module->rcache, &reg->base);

    return OPAL_SUCCESS;
}

int mca_btl_ugni_event_fatal_error (gni_return_t grc, gni_cq_entry_t event_data)
{
    /* combined error check for get event and get completed. we might miss exactly
     * what happened but it is unrecoverable anyway. fwiw, this error path has
     * never been seen in production. */
    if (GNI_CQ_OVERRUN(event_data)) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("CQ overrun detected in RDMA event data. can not recover"));
    } else {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
    }

    return mca_btl_rc_ugni_to_opal (grc);
}

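/* handle a CQ error event for an RDMA descriptor. recoverable errors are retried
 * by reposting the descriptor until rdma_max_retries is reached; anything else
 * (or exhausting the retries) is reported and the descriptor is given up on. */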
int mca_btl_ugni_device_handle_event_error (mca_btl_ugni_rdma_desc_t *rdma_desc, gni_cq_entry_t event_data)
{
    mca_btl_ugni_device_t *device = rdma_desc->device;
    uint32_t recoverable = 1;

    (void) GNI_CqErrorRecoverable (event_data, &recoverable);

    if (OPAL_UNLIKELY(++rdma_desc->tries >= mca_btl_ugni_component.rdma_max_retries || !recoverable)) {
        char char_buffer[1024];
        GNI_CqErrorStr (event_data, char_buffer, sizeof (char_buffer));

        BTL_ERROR(("giving up on descriptor %p, recoverable %d: %s", (void *) rdma_desc, recoverable, char_buffer));

        return OPAL_ERROR;
    }

    return _mca_btl_ugni_repost_rdma_desc_device (device, rdma_desc);
}