1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2014 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2006-2017 Los Alamos National Security, LLC.  All rights
15  *                         reserved.
16  * Copyright (c) 2006-2007 Voltaire All rights reserved.
17  * Copyright (c) 2006-2009 Mellanox Technologies, Inc.  All rights reserved.
18  * Copyright (c) 2010-2011 IBM Corporation.  All rights reserved.
19  * Copyright (c) 2010-2011 Oracle and/or its affiliates.  All rights reserved
20  * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
21  * Copyright (c) 2013-2015 NVIDIA Corporation.  All rights reserved.
22  * Copyright (c) 2014      Bull SAS.  All rights reserved.
23  * Copyright (c) 2015      Research Organization for Information Science
24  *                         and Technology (RIST). All rights reserved.
25  *
26  * $COPYRIGHT$
27  *
28  * Additional copyrights may follow
29  *
30  * $HEADER$
31  */
32 
33 #include "opal_config.h"
34 
35 #ifdef HAVE_SYS_TIME_H
36 #include <sys/time.h>
37 #endif
38 #include <time.h>
39 #include <errno.h>
40 #include <string.h>
41 
42 #include "opal_stdint.h"
43 #include "opal/util/output.h"
44 #include "opal/util/proc.h"
45 #include "opal/util/show_help.h"
46 #include "opal/class/opal_free_list.h"
47 
48 #include "btl_openib_endpoint.h"
49 #include "btl_openib_proc.h"
50 #include "btl_openib_xrc.h"
51 #include "btl_openib_async.h"
52 #include "connect/connect.h"
53 
54 static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
55 static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
56 
/*
 * Take one send WQE on the QP named by the fragment's descriptor order.
 *
 * When qp_get_wqe() reports a negative value the WQE is handed back with
 * qp_put_wqe() and the fragment is appended to that QP's
 * no_wqe_pending_frags list: slot 0 for fragments flagged
 * MCA_BTL_DES_FLAGS_PRIORITY, slot 1 for all others.
 *
 * Returns OPAL_SUCCESS when a WQE was taken, OPAL_ERR_OUT_OF_RESOURCE when
 * the fragment was queued instead.
 */
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
        mca_btl_openib_send_frag_t *frag)
{
    const int qp_idx = to_base_frag(frag)->base.order;
    /* 0 when the priority flag is set, 1 otherwise */
    const int list_idx =
        !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);

    if (qp_get_wqe(ep, qp_idx) >= 0) {
        return OPAL_SUCCESS;
    }

    /* nothing left on this QP: undo the get and defer the fragment */
    qp_put_wqe(ep, qp_idx);
    opal_list_append(&ep->qps[qp_idx].no_wqe_pending_frags[list_idx],
                     (opal_list_item_t *) frag);

    return OPAL_ERR_OUT_OF_RESOURCE;
}
72 
73 /* this function is called with endpoint->endpoint_lock held */
mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t * endpoint,mca_btl_openib_send_frag_t * frag)74 int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
75         mca_btl_openib_send_frag_t *frag)
76 {
77     int prio = to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY;
78     mca_btl_openib_header_t *hdr = frag->hdr;
79     mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
80     int qp, ib_rc, rc;
81     bool do_rdma = false;
82     size_t size;
83 
84     if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
85         des->order = frag->qp_idx;
86 
87     qp = des->order;
88 
89     if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
90         return OPAL_ERR_RESOURCE_BUSY;
91 
92     size = des->des_segments->seg_len + frag->coalesced_length;
93 
94     rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
95                                                  &do_rdma, frag, true);
96     if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
97         qp_put_wqe(endpoint, qp);
98         return OPAL_ERR_RESOURCE_BUSY;
99     }
100 
101     qp_reset_signal_count(endpoint, qp);
102     ib_rc = post_send(endpoint, frag, do_rdma, 1);
103 
104     if(!ib_rc)
105         return OPAL_SUCCESS;
106 
107     if(endpoint->nbo)
108         BTL_OPENIB_HEADER_NTOH(*hdr);
109 
110     mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);
111 
112     qp_put_wqe(endpoint, qp);
113 
114     BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
115                ib_rc, strerror(ib_rc), size));
116     return OPAL_ERROR;
117 }
118 
119 
120 
/* Class instance for mca_btl_openib_endpoint_t, declared with
 * opal_list_item_t as its parent and the construct/destruct functions of
 * this file as its constructor and destructor. */
OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t, opal_list_item_t,
                   mca_btl_openib_endpoint_construct,
                   mca_btl_openib_endpoint_destruct);
124 
125 /*
126  * Initialize state of the endpoint instance.
127  *
128  */
endpoint_alloc_qp(void)129 static mca_btl_openib_qp_t *endpoint_alloc_qp(void)
130 {
131     mca_btl_openib_qp_t *qp = (mca_btl_openib_qp_t *) calloc(1, sizeof(mca_btl_openib_qp_t));
132     if(!qp) {
133         BTL_ERROR(("Failed to allocate memory for qp"));
134         return NULL;
135     }
136 
137     OBJ_CONSTRUCT(&qp->lock, opal_mutex_t);
138 
139     return qp;
140 }
141 
142 static void
endpoint_init_qp_pp(mca_btl_openib_endpoint_qp_t * ep_qp,const int qp)143 endpoint_init_qp_pp(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
144 {
145     mca_btl_openib_qp_info_t *qp_info = &mca_btl_openib_component.qp_infos[qp];
146     ep_qp->qp = endpoint_alloc_qp();
147     ep_qp->qp->users++;
148 
149     /* local credits are set here such that on initial posting
150      * of the receive buffers we end up with zero credits to return
151      * to our peer. The peer initializes his sd_credits to reflect this
152      * below. Note that this may be a problem for iWARP as the sender
153      * now has credits even if the receive buffers are not yet posted
154      */
155     ep_qp->u.pp_qp.rd_credits = -qp_info->rd_num;
156 
157     ep_qp->u.pp_qp.rd_posted = 0;
158     ep_qp->u.pp_qp.cm_sent = 0;
159     ep_qp->u.pp_qp.cm_return = -qp_info->u.pp_qp.rd_rsv;
160     ep_qp->u.pp_qp.cm_received = qp_info->u.pp_qp.rd_rsv;
161 
162     /* initialize the local view of credits */
163     ep_qp->u.pp_qp.sd_credits = qp_info->rd_num;
164 
165     /* number of available send WQEs */
166     ep_qp->qp->sd_wqe = qp_info->rd_num;
167 }
168 
169 static void
endpoint_init_qp_srq(mca_btl_openib_endpoint_qp_t * ep_qp,const int qp)170 endpoint_init_qp_srq(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
171 {
172     ep_qp->qp = endpoint_alloc_qp();
173     ep_qp->qp->users++;
174 
175     /* number of available send WQEs */
176     ep_qp->qp->sd_wqe = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
177 }
178 
179 static void
endpoint_init_qp_xrc(mca_btl_base_endpoint_t * ep,const int qp)180 endpoint_init_qp_xrc(mca_btl_base_endpoint_t *ep, const int qp)
181 {
182     int max = ep->endpoint_btl->device->ib_dev_attr.max_qp_wr -
183         (mca_btl_openib_component.use_eager_rdma ?
184          mca_btl_openib_component.max_eager_rdma : 0);
185     mca_btl_openib_endpoint_qp_t *ep_qp = &ep->qps[qp];
186     int32_t wqe, incr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
187     int rc;
188 
189     opal_mutex_lock (&ep->ib_addr->addr_lock);
190 
191     ep_qp->qp = ep->ib_addr->qp;
192     if (ep->ib_addr->max_wqe + incr > max) {
193         /* make sure that we don't overrun maximum supported by device */
194         incr = max - ep->ib_addr->max_wqe;
195     }
196 
197     wqe = ep->ib_addr->max_wqe + incr +
198         (mca_btl_openib_component.use_eager_rdma ?
199          mca_btl_openib_component.max_eager_rdma : 0);
200 
201     ep->ib_addr->max_wqe += incr;
202 
203     if (NULL != ep_qp->qp->lcl_qp) {
204         struct ibv_qp_attr qp_attr;
205 
206         /* if this is modified the code in udcm_xrc_send_qp_create may
207          * need to be updated as well */
208         qp_attr.cap.max_recv_wr = 0;
209         qp_attr.cap.max_send_wr = wqe;
210         qp_attr.cap.max_inline_data = ep->endpoint_btl->device->max_inline_data;
211         qp_attr.cap.max_send_sge = 1;
212         qp_attr.cap.max_recv_sge = 1; /* we do not use SG list */
213         rc = ibv_modify_qp (ep_qp->qp->lcl_qp, &qp_attr, IBV_QP_CAP);
214         if (0 == rc) {
215             opal_atomic_add_32 (&ep_qp->qp->sd_wqe, incr);
216         }
217     } else {
218         ep_qp->qp->sd_wqe = ep->ib_addr->max_wqe;
219     }
220     ep_qp->qp->users++;
221     opal_mutex_unlock (&ep->ib_addr->addr_lock);
222 }
223 
/*
 * Initialize one QP slot of an endpoint.
 *
 * Resets the credit-send state, constructs the four pending-fragment lists
 * and then dispatches on the QP type for the type-specific setup.  For an
 * unknown QP type an error is logged and the function returns before the
 * in-flight counter and the batch counter are set.
 */
static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
{
    mca_btl_openib_endpoint_qp_t *slot = &ep->qps[qp];
    int list_idx;

    slot->credit_frag = NULL;
    slot->rd_credit_send_lock = 0;

    for (list_idx = 0; list_idx < 2; ++list_idx) {
        OBJ_CONSTRUCT(&slot->no_wqe_pending_frags[list_idx], opal_list_t);
        OBJ_CONSTRUCT(&slot->no_credits_pending_frags[list_idx], opal_list_t);
    }

    switch (BTL_OPENIB_QP_TYPE(qp)) {
    case MCA_BTL_OPENIB_PP_QP:
        endpoint_init_qp_pp(slot, qp);
        break;

    case MCA_BTL_OPENIB_SRQ_QP:
        endpoint_init_qp_srq(slot, qp);
        break;

    case MCA_BTL_OPENIB_XRC_QP:
        /* the descriptor lives in ib_addr; create it on first use */
        if (NULL == ep->ib_addr->qp) {
            ep->ib_addr->qp = endpoint_alloc_qp();
        }
        endpoint_init_qp_xrc(ep, qp);
        break;

    default:
        BTL_ERROR(("Wrong QP type"));
        return;
    }

    slot->qp->wqe_count = QP_TX_BATCH_COUNT;
    slot->qp->sd_wqe_inflight = 0;
}
258 
/*
 * Bind an endpoint to its BTL module, its connection module (CPC) and the
 * remote port it talks to, then initialize every configured QP slot.
 *
 * btl               module that owns the endpoint
 * ep                endpoint to fill in
 * local_cpc         local connection module stored on the endpoint
 * remote_proc_info  modex entry whose pm_port_info supplies the remote LID,
 *                   subnet id, MTU, vendor ids and transport type
 * remote_cpc_data   remote CPC data stored on the endpoint
 */
void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
                                  mca_btl_base_endpoint_t *ep,
                                  opal_btl_openib_connect_base_module_t *local_cpc,
                                  mca_btl_openib_proc_modex_t *remote_proc_info,
                                  opal_btl_openib_connect_base_module_data_t *remote_cpc_data)
{
    int qp_idx;

    /* local side */
    ep->endpoint_btl = btl;
    ep->endpoint_local_cpc = local_cpc;
    ep->endpoint_remote_cpc_data = remote_cpc_data;
    ep->subnet_id = btl->port_info.subnet_id;
    /* eager RDMA is used only when both the device and the component allow it */
    ep->use_eager_rdma = btl->device->use_eager_rdma &
        mca_btl_openib_component.use_eager_rdma;

    /* remote port description taken from the modex */
    ep->rem_info.rem_lid = remote_proc_info->pm_port_info.lid;
    ep->rem_info.rem_subnet_id = remote_proc_info->pm_port_info.subnet_id;
    ep->rem_info.rem_mtu = remote_proc_info->pm_port_info.mtu;
    ep->rem_info.rem_vendor_id = remote_proc_info->pm_port_info.vendor_id;
    ep->rem_info.rem_vendor_part_id = remote_proc_info->pm_port_info.vendor_part_id;
    ep->rem_info.rem_transport_type =
        (mca_btl_openib_transport_type_t) remote_proc_info->pm_port_info.transport_type;

    opal_output(-1, "Got remote LID, subnet, MTU: %d, 0x%" PRIx64 ", %d",
                ep->rem_info.rem_lid,
                ep->rem_info.rem_subnet_id,
                ep->rem_info.rem_mtu);

    qp_idx = 0;
    while (qp_idx < mca_btl_openib_component.num_qps) {
        endpoint_init_qp(ep, qp_idx);
        ++qp_idx;
    }
}
292 
/*
 * Class constructor: put a new endpoint into its pristine, closed state.
 *
 * Allocates the zero-filled per-QP array and the remote QP/SRQ info arrays,
 * constructs the endpoint lock, the eager RDMA lock and the three pending
 * lists, and resets every scalar field.
 *
 * NOTE(review): none of the calloc() results is checked here.
 */
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    /* setup qp structures */
    endpoint->qps = calloc(mca_btl_openib_component.num_qps,
                           sizeof(mca_btl_openib_endpoint_qp_t));

    if (!(MCA_BTL_XRC_ENABLED)) {
        /* one remote QP entry per configured QP, no SRQ table */
        endpoint->rem_info.rem_qps = calloc(mca_btl_openib_component.num_qps,
                                            sizeof(mca_btl_openib_rem_qp_info_t));
        endpoint->rem_info.rem_srqs = NULL;
    } else {
        /* XRC: a single remote QP entry plus one SRQ entry per XRC QP */
        endpoint->rem_info.rem_qps = calloc(1, sizeof(mca_btl_openib_rem_qp_info_t));
        endpoint->rem_info.rem_srqs = calloc(mca_btl_openib_component.num_xrc_qps,
                                             sizeof(mca_btl_openib_rem_srq_info_t));
    }

    endpoint->ib_addr = NULL;
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    endpoint->xrc_recv_qp = NULL;
#else
    endpoint->xrc_recv_qp_num = 0;
#endif

    /* identity and connection bookkeeping */
    endpoint->endpoint_btl = NULL;
    endpoint->endpoint_proc = NULL;
    endpoint->endpoint_local_cpc = NULL;
    endpoint->endpoint_remote_cpc_data = NULL;
    endpoint->endpoint_initiator = false;
    endpoint->endpoint_tstamp = 0.0;
    endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
    endpoint->endpoint_retries = 0;

    OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&endpoint->pending_lazy_frags, opal_list_t);
    OBJ_CONSTRUCT(&endpoint->pending_get_frags, opal_list_t);
    OBJ_CONSTRUCT(&endpoint->pending_put_frags, opal_list_t);

    endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom;

    /* initialize RDMA eager related parts; the structures are cleared
     * before the lock embedded in the local one is constructed */
    endpoint->eager_recv_count = 0;
    memset(&endpoint->eager_rdma_remote, 0,
           sizeof(mca_btl_openib_eager_rdma_remote_t));
    memset(&endpoint->eager_rdma_local, 0,
           sizeof(mca_btl_openib_eager_rdma_local_t));
    OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t);

    /* remote port description, filled in later by endpoint_init() */
    endpoint->rem_info.rem_lid = 0;
    endpoint->rem_info.rem_subnet_id = 0;
    endpoint->rem_info.rem_mtu = 0;

    endpoint->nbo = false;
    endpoint->use_eager_rdma = false;
    endpoint->eager_rdma_remote.tokens = 0;
    endpoint->eager_rdma_local.credits = 0;

    /* CTS handshake state */
    endpoint->endpoint_cts_mr = NULL;
    endpoint->endpoint_cts_frag.super.super.base.super.registration = NULL;
    endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL;
    endpoint->endpoint_posted_recvs = false;
    endpoint->endpoint_cts_received = false;
    endpoint->endpoint_cts_sent = false;
}
355 
356 /*
357  * Destroy a endpoint
358  *
359  */
360 
mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t * endpoint)361 static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
362 {
363     bool pval_clean = false;
364     int qp;
365 
366     /* If the CPC has an endpoint_finalize function, call it */
367     if (NULL != endpoint->endpoint_local_cpc->cbm_endpoint_finalize) {
368         endpoint->endpoint_local_cpc->cbm_endpoint_finalize(endpoint);
369     }
370 
371     /* Release CTS buffer */
372     opal_btl_openib_connect_base_free_cts(endpoint);
373 
374     /* Release memory resources */
375     do {
376         /* Make sure that mca_btl_openib_endpoint_connect_eager_rdma ()
377          * was not in "connect" or "bad" flow (failed to allocate memory)
378          * and changed the pointer back to NULL
379          */
380         if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, (void*)1)) {
381             if (NULL != endpoint->eager_rdma_local.reg) {
382                 endpoint->endpoint_btl->device->rcache->rcache_deregister (endpoint->endpoint_btl->device->rcache,
383                                                                            &endpoint->eager_rdma_local.reg->base);
384                 endpoint->eager_rdma_local.reg = NULL;
385             }
386 
387             void *alloc_base = opal_atomic_swap_ptr (&endpoint->eager_rdma_local.alloc_base, NULL);
388             if (alloc_base) {
389                 endpoint->endpoint_btl->super.btl_mpool->mpool_free (endpoint->endpoint_btl->super.btl_mpool, alloc_base);
390                 pval_clean = true;
391             }
392         } else {
393             pval_clean=true;
394         }
395     } while (!pval_clean);
396 
397     /* Close opened QPs if we have them*/
398    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
399         MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[0]);
400         MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[1]);
401         OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[0]);
402         OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[1]);
403 
404         MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
405                 &endpoint->qps[qp].no_wqe_pending_frags[0]);
406         MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
407                 &endpoint->qps[qp].no_wqe_pending_frags[1]);
408         OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]);
409         OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]);
410 
411 
412         if(--endpoint->qps[qp].qp->users != 0)
413             continue;
414 
415         if(endpoint->qps[qp].qp->lcl_qp != NULL)
416             if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
417                 BTL_ERROR(("Failed to destroy QP:%d\n", qp));
418 
419         free(endpoint->qps[qp].qp);
420     }
421 
422     /* free the qps */
423     free(endpoint->qps);
424     endpoint->qps = NULL;
425 
426     free(endpoint->rem_info.rem_qps);
427     free(endpoint->rem_info.rem_srqs);
428 
429     /* unregister xrc recv qp */
430 #if HAVE_XRC
431 #if OPAL_HAVE_CONNECTX_XRC_DOMAINS
432     if (NULL != endpoint->xrc_recv_qp) {
433         if(ibv_destroy_qp(endpoint->xrc_recv_qp)) {
434             BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp->qp_num));
435         } else {
436             BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp->qp_num));
437         }
438     }
439 #else
440     if (0 != endpoint->xrc_recv_qp_num) {
441         if(ibv_unreg_xrc_rcv_qp(endpoint->endpoint_btl->device->xrc_domain,
442                     endpoint->xrc_recv_qp_num)) {
443             BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp_num));
444         } else {
445             BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp_num));
446         }
447     }
448 #endif
449 #endif
450 
451     OBJ_DESTRUCT(&endpoint->endpoint_lock);
452     /* Clean pending lists */
453     MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
454     OBJ_DESTRUCT(&endpoint->pending_lazy_frags);
455 
456     MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags);
457     OBJ_DESTRUCT(&endpoint->pending_get_frags);
458 
459     MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags);
460     OBJ_DESTRUCT(&endpoint->pending_put_frags);
461 }
462 
463 
464 /*
465  * Called when the connect module has created all the qp's on an
466  * endpoint and needs to have some receive buffers posted.
467  */
mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t * endpoint)468 int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
469 {
470     int qp;
471 
472     for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
473         if (BTL_OPENIB_QP_TYPE_PP(qp)) {
474             mca_btl_openib_endpoint_post_rr_nolock(endpoint, qp);
475         } else {
476             mca_btl_openib_post_srr(endpoint->endpoint_btl, qp);
477         }
478     }
479 
480     return OPAL_SUCCESS;
481 }
482 
/*
 * Completion callback of the CTS control fragment.
 *
 * There is nothing to do on completion; the function exists only because a
 * NULL pointer cannot be passed as des_cbfunc.  It emits a debug trace
 * naming the peer host.
 */
static void cts_sent(mca_btl_base_module_t* btl,
                     struct mca_btl_base_endpoint_t* ep,
                     struct mca_btl_base_descriptor_t* des,
                     int status)
{
    /* these callback arguments are not needed here */
    (void) btl;
    (void) des;
    (void) status;

    OPAL_OUTPUT((-1, "CTS send to %s completed",
                 opal_get_proc_hostname(ep->endpoint_proc->proc_opal)));
}
493 
494 /*
495  * Send CTS control fragment
496  */
mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t * endpoint)497 void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
498 {
499     mca_btl_openib_send_control_frag_t *sc_frag;
500     mca_btl_base_descriptor_t *base_des;
501     mca_btl_openib_frag_t *openib_frag;
502     mca_btl_openib_com_frag_t *com_frag;
503     mca_btl_openib_control_header_t *ctl_hdr;
504 
505     OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
506                  opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
507                  mca_btl_openib_component.credits_qp,
508                  endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));
509     sc_frag = alloc_control_frag(endpoint->endpoint_btl);
510     if (OPAL_UNLIKELY(NULL == sc_frag)) {
511         BTL_ERROR(("Failed to allocate control buffer"));
512         mca_btl_openib_endpoint_invoke_error(endpoint);
513         return;
514     }
515 
516     /* I dislike using the "to_<foo>()" macros; I prefer using the
517        explicit member fields to ensure I get the types right.  Since
518        this is not a performance-criticial part of the code, it's
519        ok. */
520     com_frag = &(sc_frag->super.super);
521     openib_frag = &(com_frag->super);
522     base_des = &(openib_frag->base);
523 
524     base_des->des_cbfunc = cts_sent;
525     base_des->des_cbdata = NULL;
526     base_des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
527     base_des->order = mca_btl_openib_component.credits_qp;
528     openib_frag->segment.seg_len = sizeof(mca_btl_openib_control_header_t);
529     com_frag->endpoint = endpoint;
530 
531     sc_frag->hdr->tag = MCA_BTL_TAG_IB;
532     sc_frag->hdr->cm_seen = 0;
533     sc_frag->hdr->credits = 0;
534 
535     ctl_hdr = (mca_btl_openib_control_header_t*)
536         openib_frag->segment.seg_addr.pval;
537     ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;
538 
539     /* Send the fragment */
540     if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
541         BTL_ERROR(("Failed to post CTS send"));
542         mca_btl_openib_endpoint_invoke_error(endpoint);
543     }
544     endpoint->endpoint_cts_sent = true;
545 }
546 
547 /*
548  * Called when the CPC has established a connection on an endpoint
549  */
mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t * endpoint)550 void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
551 {
552     /* If the CPC uses the CTS protocol, then start it up */
553     if (endpoint->endpoint_local_cpc->cbm_uses_cts) {
554         int transport_type_ib_p = 0;
555         /* Post our receives, which will make credit management happy
556            (i.e., rd_credits will be 0) */
557         if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_recvs(endpoint)) {
558             BTL_ERROR(("Failed to post receive buffers"));
559             mca_btl_openib_endpoint_invoke_error(endpoint);
560             return;
561         }
562         endpoint->endpoint_posted_recvs = true;
563 
564         /* If this is IB, send the CTS immediately.  If this is iWARP,
565            then only send the CTS if this endpoint was the initiator
566            of the connection (the receiver will send its CTS when it
567            receives this side's CTS).  Also send the CTS if we already
568            received the peer's CTS (e.g., if this process was slow to
569            call cpc_complete(). */
570 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
571         transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
572 #endif
573         OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d",
574                      opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
575                      transport_type_ib_p,
576                      endpoint->endpoint_initiator,
577                      endpoint->endpoint_cts_received));
578         if (transport_type_ib_p ||
579             endpoint->endpoint_initiator ||
580             endpoint->endpoint_cts_received) {
581             mca_btl_openib_endpoint_send_cts(endpoint);
582 
583             /* If we've already got the CTS from the other side, then
584                mark us as connected */
585             if (endpoint->endpoint_cts_received) {
586                 OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
587                              opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
588                 mca_btl_openib_endpoint_connected(endpoint);
589             } else {
590                 /* the caller hold the lock and expects us to drop it */
591                 OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
592             }
593         }
594 
595         OPAL_OUTPUT((-1, "cpc_complete to %s -- done",
596                      opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
597         return;
598     }
599 
600     /* Otherwise, just set the endpoint to "connected" */
601     mca_btl_openib_endpoint_connected(endpoint);
602 }
603 
604 /*
605  * called when the connect module has completed setup of an endpoint
606  */
mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t * endpoint)607 void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
608 {
609     opal_list_item_t *frag_item, *ep_item;
610     mca_btl_openib_send_frag_t *frag;
611     mca_btl_openib_endpoint_t *ep;
612     bool master = false;
613 
614     opal_output(-1, "Now we are CONNECTED");
615     if (MCA_BTL_XRC_ENABLED) {
616         opal_mutex_lock (&endpoint->ib_addr->addr_lock);
617         if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
618             /* We are not xrc master */
619             /* set our qp pointer to master qp */
620             master = false;
621         } else {
622             /* I'm master of XRC */
623             endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED;
624             master = true;
625         }
626     }
627 
628     /* Run over all qps and load alternative path */
629     if (APM_ENABLED) {
630         int i;
631         if (MCA_BTL_XRC_ENABLED) {
632             if (master) {
633                 mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint);
634             }
635         } else {
636             for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
637                 mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint);
638             }
639         }
640     }
641 
642     endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
643     endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;
644 
645     if(MCA_BTL_XRC_ENABLED) {
646         if (master) {
647             while (NULL != (ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep))) {
648                 ep = (mca_btl_openib_endpoint_t *)ep_item;
649                 if (OPAL_SUCCESS !=
650                     opal_btl_openib_connect_base_start(endpoint->endpoint_local_cpc, ep)) {
651                     BTL_ERROR(("Failed to connect pending endpoint\n"));
652                 }
653             }
654         }
655         opal_mutex_unlock (&endpoint->ib_addr->addr_lock);
656     }
657 
658 
659     /* Process pending packet on the endpoint */
660 
661     /* While there are frags in the list, process them */
662     while (NULL != (frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags)))) {
663         frag = to_send_frag(frag_item);
664         /* We need to post this one */
665 
666         if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
667             BTL_ERROR(("Error posting send"));
668         }
669     }
670     OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
671 
672     /* if upper layer called put or get before connection moved to connected
673      * state then we restart them here */
674     mca_btl_openib_frag_progress_pending_put_get(endpoint,
675             mca_btl_openib_component.rdma_qp);
676 }
677 
678 /*
679  * Attempt to send a fragment using a given endpoint. If the endpoint is not
680  * connected, queue the fragment and start the connection as required.
681  */
mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t * ep,mca_btl_openib_send_frag_t * frag)682 int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep,
683                                  mca_btl_openib_send_frag_t* frag)
684 {
685     int rc;
686 
687     OPAL_THREAD_LOCK(&ep->endpoint_lock);
688     rc = check_endpoint_state(ep, &to_base_frag(frag)->base,
689             &ep->pending_lazy_frags);
690 
691     if(OPAL_LIKELY(OPAL_SUCCESS == rc)) {
692         rc = mca_btl_openib_endpoint_post_send(ep, frag);
693     }
694     OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
695     if (OPAL_UNLIKELY(OPAL_ERR_RESOURCE_BUSY == rc)) {
696         rc = OPAL_SUCCESS;
697     }
698 
699     return rc;
700 }
701 
702 /**
703  * Return control fragment.
704  */
705 
mca_btl_openib_endpoint_credits(mca_btl_base_module_t * btl,struct mca_btl_base_endpoint_t * ep,struct mca_btl_base_descriptor_t * des,int status)706 static void mca_btl_openib_endpoint_credits(
707     mca_btl_base_module_t* btl,
708     struct mca_btl_base_endpoint_t* ep,
709     struct mca_btl_base_descriptor_t* des,
710     int status)
711 {
712 
713     int qp;
714 
715     mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(des);
716 
717     qp = frag->qp_idx;
718 
719     /* we don't acquire a WQE for credit message - so decrement.
720      * Note: doing it for QP used for credit management */
721     (void) qp_get_wqe(ep, des->order);
722 
723     if(check_send_credits(ep, qp) || check_eager_rdma_credits(ep))
724         mca_btl_openib_endpoint_send_credits(ep, qp);
725     else {
726         BTL_OPENIB_CREDITS_SEND_UNLOCK(ep, qp);
727         /* check one more time if credits are available after unlock */
728         send_credits(ep, qp);
729     }
730 }
731 
732 /**
733  * Return credits to peer
734  */
735 
mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t * endpoint,const int qp)736 void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
737         const int qp)
738 {
739     mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
740     mca_btl_openib_send_control_frag_t* frag;
741     mca_btl_openib_rdma_credits_header_t *credits_hdr;
742     int rc;
743     bool do_rdma = false;
744     int32_t cm_return;
745 
746     frag = endpoint->qps[qp].credit_frag;
747 
748     if(OPAL_UNLIKELY(NULL == frag)) {
749         frag = alloc_control_frag(openib_btl);
750         frag->qp_idx = qp;
751         endpoint->qps[qp].credit_frag = frag;
752         /* set those once and forever */
753         to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
754         to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
755         to_base_frag(frag)->base.des_cbdata = NULL;
756         to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;;
757         to_com_frag(frag)->endpoint = endpoint;
758         frag->hdr->tag = MCA_BTL_TAG_IB;
759         to_base_frag(frag)->segment.seg_len =
760             sizeof(mca_btl_openib_rdma_credits_header_t);
761     }
762 
763     assert(frag->qp_idx == qp);
764     credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
765         to_base_frag(frag)->segment.seg_addr.pval;
766     if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
767         do_rdma = true;
768     } else {
769         if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
770                 (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
771             OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
772             BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
773             return;
774         }
775      }
776 
777     BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
778 
779     frag->hdr->cm_seen = 0;
780     BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
781     if(cm_return > 255) {
782         frag->hdr->cm_seen = 255;
783         cm_return -= 255;
784         OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
785     } else {
786         frag->hdr->cm_seen = cm_return;
787     }
788 
789     BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
790     credits_hdr->qpn = qp;
791     credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;
792 
793     if(endpoint->nbo)
794          BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);
795 
796     qp_reset_signal_count(endpoint, qp);
797     if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
798         return;
799 
800     if(endpoint->nbo) {
801         BTL_OPENIB_HEADER_NTOH(*frag->hdr);
802         BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
803     }
804     BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
805     OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
806             frag->hdr->credits);
807     OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
808             credits_hdr->rdma_credits);
809     if(do_rdma)
810         OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
811     else
812         OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
813 
814     BTL_ERROR(("error posting send request errno %d says %s", rc,
815                 strerror(errno)));
816 }
817 
818 /* local callback function for completion of eager rdma connect */
mca_btl_openib_endpoint_eager_rdma_connect_cb(mca_btl_base_module_t * btl,struct mca_btl_base_endpoint_t * endpoint,struct mca_btl_base_descriptor_t * descriptor,int status)819 static void mca_btl_openib_endpoint_eager_rdma_connect_cb(
820     mca_btl_base_module_t* btl,
821     struct mca_btl_base_endpoint_t* endpoint,
822     struct mca_btl_base_descriptor_t* descriptor,
823     int status)
824 {
825     mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
826     OPAL_THREAD_ADD32(&device->non_eager_rdma_endpoints, -1);
827     assert(device->non_eager_rdma_endpoints >= 0);
828     MCA_BTL_IB_FRAG_RETURN(descriptor);
829 }
830 
831 /* send the eager rdma connect message to the remote endpoint */
mca_btl_openib_endpoint_send_eager_rdma(mca_btl_base_endpoint_t * endpoint)832 static int mca_btl_openib_endpoint_send_eager_rdma(
833     mca_btl_base_endpoint_t* endpoint)
834 {
835     mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
836     mca_btl_openib_eager_rdma_header_t *rdma_hdr;
837     mca_btl_openib_send_control_frag_t* frag;
838     int rc;
839 
840     frag = alloc_control_frag(openib_btl);
841     if(NULL == frag) {
842         return -1;
843     }
844 
845     to_base_frag(frag)->base.des_cbfunc =
846         mca_btl_openib_endpoint_eager_rdma_connect_cb;
847     to_base_frag(frag)->base.des_cbdata = NULL;
848     to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
849     to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
850     to_base_frag(frag)->segment.seg_len =
851         sizeof(mca_btl_openib_eager_rdma_header_t);
852     to_com_frag(frag)->endpoint = endpoint;
853 
854     frag->hdr->tag = MCA_BTL_TAG_IB;
855     rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
856     rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
857     rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
858     rdma_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);
859     BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
860                  ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
861                  rdma_hdr->rkey,
862                  rdma_hdr->rdma_start.lval,
863                  rdma_hdr->rdma_start.pval,
864                  rdma_hdr->rdma_start.ival,
865                  rdma_hdr->control.type,
866                  (int) sizeof(mca_btl_openib_eager_rdma_header_t)
867                  ));
868 
869     if(endpoint->nbo) {
870         BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));
871 
872         BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
873                      rdma_hdr->rkey,
874                      rdma_hdr->rdma_start.lval,
875                      rdma_hdr->rdma_start.pval,
876                      rdma_hdr->rdma_start.ival
877                      ));
878     }
879     rc = mca_btl_openib_endpoint_send(endpoint, frag);
880     if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc)
881         return OPAL_SUCCESS;
882 
883     MCA_BTL_IB_FRAG_RETURN(frag);
884     BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
885     return rc;
886 }
887 
888 /* Setup eager RDMA buffers and notify the remote endpoint*/
mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t * endpoint)889 void mca_btl_openib_endpoint_connect_eager_rdma(
890         mca_btl_openib_endpoint_t* endpoint)
891 {
892     mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
893     char *buf, *alloc_base;
894     mca_btl_openib_recv_frag_t *headers_buf;
895     int i, rc;
896     uint32_t flag = MCA_RCACHE_FLAGS_CACHE_BYPASS;
897 
898     /* Set local rdma pointer to 1 temporarily so other threads will not try
899      * to enter the function */
900     if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
901                 (void*)1))
902         return;
903 
904     headers_buf = (mca_btl_openib_recv_frag_t*)
905         malloc(sizeof(mca_btl_openib_recv_frag_t) *
906             mca_btl_openib_component.eager_rdma_num);
907 
908     if(NULL == headers_buf)
909        goto unlock_rdma_local;
910 
911 #if HAVE_DECL_IBV_ACCESS_SO
912     /* Solaris implements the Relaxed Ordering feature defined in the
913        PCI Specification. With this in mind any memory region which
914        relies on a buffer being written in a specific order, for
915        example the eager rdma connections created in this routinue,
916        must set a strong order flag when registering the memory for
917        rdma operations.
918 
919        The following flag will be interpreted and the appropriate
920        steps will be taken when the memory is registered in
921        openib_reg_mr(). */
922     flag |= MCA_RCACHE_FLAGS_SO_MEM;
923 #endif
924 
925     alloc_base = buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
926                                                             openib_btl->eager_rdma_frag_size *
927                                                             mca_btl_openib_component.eager_rdma_num,
928                                                             mca_btl_openib_component.buffer_alignment,
929                                                             0);
930 
931     if(!buf)
932        goto free_headers_buf;
933 
934     rc = openib_btl->device->rcache->rcache_register (openib_btl->device->rcache, buf, openib_btl->eager_rdma_frag_size *
935                                                       mca_btl_openib_component.eager_rdma_num, flag, MCA_RCACHE_ACCESS_ANY,
936                                                       (mca_rcache_base_registration_t**)&endpoint->eager_rdma_local.reg);
937     if (OPAL_SUCCESS != rc) {
938         openib_btl->super.btl_mpool->mpool_free (openib_btl->super.btl_mpool, alloc_base);
939         goto free_headers_buf;
940     }
941 
942     buf = buf + openib_btl->eager_rdma_frag_size -
943         sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
944         sizeof(mca_btl_openib_header_t);
945 
946     for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
947         opal_free_list_item_t *item;
948         mca_btl_openib_recv_frag_t * frag;
949         mca_btl_openib_frag_init_data_t init_data;
950 
951         item = (opal_free_list_item_t*)&headers_buf[i];
952         item->registration = (mca_rcache_base_registration_t *)endpoint->eager_rdma_local.reg;
953         item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
954         OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);
955 
956         init_data.order = mca_btl_openib_component.credits_qp;
957         init_data.list = NULL;
958 
959         mca_btl_openib_frag_init(item, &init_data);
960         frag = to_recv_frag(item);
961         to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
962         to_com_frag(frag)->endpoint = endpoint;
963         frag->ftr = (mca_btl_openib_footer_t*)
964             ((char*)to_base_frag(frag)->segment.seg_addr.pval +
965              mca_btl_openib_component.eager_limit);
966 
967         MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
968     }
969 
970     endpoint->eager_rdma_local.frags = headers_buf;
971 
972     endpoint->eager_rdma_local.rd_win =
973         mca_btl_openib_component.eager_rdma_num >> 2;
974     endpoint->eager_rdma_local.rd_win =
975         endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;
976 
977     /* set local rdma pointer to real value */
978     (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
979                                  (void*)1, buf);
980     endpoint->eager_rdma_local.alloc_base = alloc_base;
981 
982     if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) {
983         mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
984         mca_btl_openib_endpoint_t **p;
985         OBJ_RETAIN(endpoint);
986         assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
987         do {
988             p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
989         } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));
990 
991         OPAL_THREAD_ADD32(&openib_btl->eager_rdma_channels, 1);
992         /* from this point progress function starts to poll new buffer */
993         OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
994         return;
995     }
996 
997     openib_btl->device->rcache->rcache_deregister (openib_btl->device->rcache,
998                                                    (mca_rcache_base_registration_t*)endpoint->eager_rdma_local.reg);
999     openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, buf);
1000 free_headers_buf:
1001     free(headers_buf);
1002 unlock_rdma_local:
1003     /* set local rdma pointer back to zero. Will retry later */
1004     (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
1005                                  endpoint->eager_rdma_local.base.pval, NULL);
1006     endpoint->eager_rdma_local.frags = NULL;
1007 }
1008 
1009 /*
1010  * Invoke an error on the btl associated with an endpoint.  If we
1011  * don't have an endpoint, then just use the first one on the
1012  * component list of BTLs.
1013  */
mca_btl_openib_endpoint_invoke_error(void * context)1014 void *mca_btl_openib_endpoint_invoke_error(void *context)
1015 {
1016     mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t*) context;
1017     mca_btl_openib_module_t *btl = NULL;
1018 
1019     if (NULL == endpoint) {
1020         int i;
1021         for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i) {
1022             if (NULL != mca_btl_openib_component.openib_btls[i] &&
1023                 NULL != mca_btl_openib_component.openib_btls[i]->error_cb) {
1024                 btl = mca_btl_openib_component.openib_btls[i];
1025                 break;
1026             }
1027         }
1028     } else {
1029         btl = endpoint->endpoint_btl;
1030     }
1031 
1032     /* If we didn't find a BTL, then just bail :-( */
1033     if (NULL == btl || NULL == btl->error_cb) {
1034         opal_show_help("help-mpi-btl-openib.txt",
1035                        "cannot raise btl error", true,
1036                        opal_process_info.nodename,
1037                        __FILE__, __LINE__);
1038         exit(1);
1039     }
1040 
1041     /* Invoke the callback to the upper layer */
1042     btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
1043 
1044     /* Will likely never get here */
1045     return NULL;
1046 }
1047