1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2014 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2006-2017 Los Alamos National Security, LLC.  All rights
15  *                         reserved.
16  * Copyright (c) 2006-2007 Voltaire All rights reserved.
17  * Copyright (c) 2006-2009 Mellanox Technologies, Inc.  All rights reserved.
18  * Copyright (c) 2010-2011 IBM Corporation.  All rights reserved.
19  * Copyright (c) 2010-2011 Oracle and/or its affiliates.  All rights reserved
20  * Copyright (c) 2013-2014 Intel, Inc. All rights reserved
21  * Copyright (c) 2013-2015 NVIDIA Corporation.  All rights reserved.
22  * Copyright (c) 2014      Bull SAS.  All rights reserved.
23  * Copyright (c) 2015      Research Organization for Information Science
24  *                         and Technology (RIST). All rights reserved.
25  *
26  * $COPYRIGHT$
27  *
28  * Additional copyrights may follow
29  *
30  * $HEADER$
31  */
32 
33 #include "opal_config.h"
34 
35 #ifdef HAVE_SYS_TIME_H
36 #include <sys/time.h>
37 #endif
38 #include <time.h>
39 #include <errno.h>
40 #include <string.h>
41 
42 #include "opal_stdint.h"
43 #include "opal/util/output.h"
44 #include "opal/util/proc.h"
45 #include "opal/util/show_help.h"
46 #include "opal/class/opal_free_list.h"
47 
48 #include "btl_openib_endpoint.h"
49 #include "btl_openib_proc.h"
50 #include "btl_openib_xrc.h"
51 #include "btl_openib_async.h"
52 #include "connect/connect.h"
53 
54 static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint);
55 static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint);
56 
/*
 * Try to obtain a send WQE on the QP the fragment is ordered on.
 *
 * A negative result from qp_get_wqe() is treated as "no WQE available":
 * the attempt is undone with qp_put_wqe() and the fragment is appended to
 * the QP's no_wqe_pending_frags list (slot 0 when the descriptor carries
 * MCA_BTL_DES_FLAGS_PRIORITY, slot 1 otherwise).
 *
 * Returns OPAL_SUCCESS when a WQE was obtained, or
 * OPAL_ERR_OUT_OF_RESOURCE after the fragment has been queued.
 */
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
        mca_btl_openib_send_frag_t *frag)
{
    const int qp_index = to_base_frag(frag)->base.order;
    int list_slot;

    if (qp_get_wqe(ep, qp_index) >= 0) {
        return OPAL_SUCCESS;
    }

    /* nothing available: give the count back and park the fragment */
    qp_put_wqe(ep, qp_index);

    list_slot = (to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) ? 0 : 1;
    opal_list_append(&ep->qps[qp_index].no_wqe_pending_frags[list_slot],
                     (opal_list_item_t *) frag);

    return OPAL_ERR_OUT_OF_RESOURCE;
}
72 
73 /* this function is called with endpoint->endpoint_lock held */
/*
 * Post a send fragment on an endpoint.
 *
 * Must be called with endpoint->endpoint_lock held.
 *
 * If the descriptor has no ordering constraint (MCA_BTL_NO_ORDER) the
 * fragment's own qp_idx is used as the QP.  A send WQE and then send
 * credits are acquired before the fragment is handed to post_send().
 *
 * Returns:
 *   OPAL_SUCCESS            the fragment was posted
 *   OPAL_ERR_RESOURCE_BUSY  no WQE (acquire_wqe() queued the fragment) or
 *                           credit acquisition did not succeed
 *   OPAL_ERROR              post_send() failed; the WQE and credits taken
 *                           here are released again
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    int prio = to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY;
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc, rc;
    bool do_rdma = false;
    size_t size;

    if (OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) {
        des->order = frag->qp_idx;
    }

    qp = des->order;

    if (acquire_wqe(endpoint, frag) != OPAL_SUCCESS) {
        return OPAL_ERR_RESOURCE_BUSY;
    }

    size = des->des_segments->seg_len + frag->coalesced_length;

    rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
                                                 &do_rdma, frag, true);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        /* hand back the WQE obtained above */
        qp_put_wqe(endpoint, qp);
        return OPAL_ERR_RESOURCE_BUSY;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if (!ib_rc) {
        return OPAL_SUCCESS;
    }

    /* Posting failed: undo the byte-order conversion of the header (only
     * relevant when the endpoint uses network byte order) and release
     * everything acquired above. */
    if (endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*hdr);
    }

    mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);

    qp_put_wqe(endpoint, qp);

    /* size is a size_t; cast explicitly so the argument matches %lu */
    BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
               ib_rc, strerror(ib_rc), (unsigned long) size));
    return OPAL_ERROR;
}
118 
119 
120 
/* Class instance for mca_btl_openib_endpoint_t: names opal_list_item_t
 * together with the construct/destruct functions forward-declared at the
 * top of this file. */
OBJ_CLASS_INSTANCE(mca_btl_openib_endpoint_t,
                   opal_list_item_t, mca_btl_openib_endpoint_construct,
                   mca_btl_openib_endpoint_destruct);
124 
125 /*
126  * Initialize state of the endpoint instance.
127  *
128  */
endpoint_alloc_qp(void)129 static mca_btl_openib_qp_t *endpoint_alloc_qp(void)
130 {
131     mca_btl_openib_qp_t *qp = (mca_btl_openib_qp_t *) calloc(1, sizeof(mca_btl_openib_qp_t));
132     if(!qp) {
133         BTL_ERROR(("Failed to allocate memory for qp"));
134         return NULL;
135     }
136 
137     OBJ_CONSTRUCT(&qp->lock, opal_mutex_t);
138 
139     return qp;
140 }
141 
142 static void
endpoint_init_qp_pp(mca_btl_openib_endpoint_qp_t * ep_qp,const int qp)143 endpoint_init_qp_pp(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
144 {
145     mca_btl_openib_qp_info_t *qp_info = &mca_btl_openib_component.qp_infos[qp];
146     ep_qp->qp = endpoint_alloc_qp();
147     ep_qp->qp->users++;
148 
149     /* local credits are set here such that on initial posting
150      * of the receive buffers we end up with zero credits to return
151      * to our peer. The peer initializes his sd_credits to reflect this
152      * below. Note that this may be a problem for iWARP as the sender
153      * now has credits even if the receive buffers are not yet posted
154      */
155     ep_qp->u.pp_qp.rd_credits = -qp_info->rd_num;
156 
157     ep_qp->u.pp_qp.rd_posted = 0;
158     ep_qp->u.pp_qp.cm_sent = 0;
159     ep_qp->u.pp_qp.cm_return = -qp_info->u.pp_qp.rd_rsv;
160     ep_qp->u.pp_qp.cm_received = qp_info->u.pp_qp.rd_rsv;
161 
162     /* initialize the local view of credits */
163     ep_qp->u.pp_qp.sd_credits = qp_info->rd_num;
164 
165     /* number of available send WQEs */
166     ep_qp->qp->sd_wqe = qp_info->rd_num;
167 }
168 
169 static void
endpoint_init_qp_srq(mca_btl_openib_endpoint_qp_t * ep_qp,const int qp)170 endpoint_init_qp_srq(mca_btl_openib_endpoint_qp_t *ep_qp, const int qp)
171 {
172     ep_qp->qp = endpoint_alloc_qp();
173     ep_qp->qp->users++;
174 
175     /* number of available send WQEs */
176     ep_qp->qp->sd_wqe = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
177 }
178 
/*
 * Attach an endpoint QP slot to the QP object shared through ep->ib_addr
 * (XRC case) and grow the shared send-WQE budget for it.
 *
 * ep  endpoint being initialized; ep->ib_addr->qp must already be set
 *     (endpoint_init_qp() allocates it when it is NULL)
 * qp  index of the QP slot / entry in mca_btl_openib_component.qp_infos
 *
 * All updates of ep->ib_addr and the shared QP happen with
 * ep->ib_addr->addr_lock held.
 */
static void
endpoint_init_qp_xrc(mca_btl_base_endpoint_t *ep, const int qp)
{
    /* upper bound for max_wqe: the device limit minus the WQEs set aside
     * for eager RDMA (when eager RDMA is enabled) */
    int max = ep->endpoint_btl->device->ib_dev_attr.max_qp_wr -
        (mca_btl_openib_component.use_eager_rdma ?
         mca_btl_openib_component.max_eager_rdma : 0);
    mca_btl_openib_endpoint_qp_t *ep_qp = &ep->qps[qp];
    int32_t wqe, incr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
    int rc;

    opal_mutex_lock (&ep->ib_addr->addr_lock);

    ep_qp->qp = ep->ib_addr->qp;
    if (ep->ib_addr->max_wqe + incr > max) {
        /* make sure that we don't overrun maximum supported by device */
        incr = max - ep->ib_addr->max_wqe;
    }

    /* total send WQEs to request: the new budget plus the eager RDMA
     * share that was subtracted from max above */
    wqe = ep->ib_addr->max_wqe + incr +
        (mca_btl_openib_component.use_eager_rdma ?
         mca_btl_openib_component.max_eager_rdma : 0);

    ep->ib_addr->max_wqe += incr;

    if (NULL != ep_qp->qp->lcl_qp) {
        /* the verbs QP already exists: resize it */
        struct ibv_qp_attr qp_attr;

        /* if this is modified the code in udcm_xrc_send_qp_create may
         * need to be updated as well */
        qp_attr.cap.max_recv_wr = 0;
        qp_attr.cap.max_send_wr = wqe;
        qp_attr.cap.max_inline_data = ep->endpoint_btl->device->max_inline_data;
        qp_attr.cap.max_send_sge = 1;
        qp_attr.cap.max_recv_sge = 1; /* we do not use SG list */
        rc = ibv_modify_qp (ep_qp->qp->lcl_qp, &qp_attr, IBV_QP_CAP);
        /* NOTE(review): when ibv_modify_qp() fails nothing is reported and
         * max_wqe keeps the increment applied above, while sd_wqe does
         * not grow -- confirm this is intended. */
        if (0 == rc) {
            opal_atomic_add_fetch_32 (&ep_qp->qp->sd_wqe, incr);
        }
    } else {
        /* no verbs QP yet: just record the current budget */
        ep_qp->qp->sd_wqe = ep->ib_addr->max_wqe;
    }
    ep_qp->qp->users++;
    opal_mutex_unlock (&ep->ib_addr->addr_lock);
}
223 
/*
 * Initialize one QP slot of an endpoint.
 *
 * Resets the credit-send state, constructs the four pending-fragment
 * lists and then dispatches on the configured QP type (PP, SRQ or XRC).
 * For XRC the shared QP object on ep->ib_addr is allocated on first use.
 * An unknown QP type is logged and leaves the slot without a QP object.
 *
 * NOTE(review): a NULL result from endpoint_alloc_qp() is not handled on
 * any of the three paths.
 */
static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
{
    mca_btl_openib_endpoint_qp_t *slot = &ep->qps[qp];
    int prio;

    slot->credit_frag = NULL;
    slot->rd_credit_send_lock = 0;

    /* one list per priority slot (0 and 1) for each kind of stall */
    for (prio = 0; prio < 2; ++prio) {
        OBJ_CONSTRUCT(&slot->no_wqe_pending_frags[prio], opal_list_t);
        OBJ_CONSTRUCT(&slot->no_credits_pending_frags[prio], opal_list_t);
    }

    switch (BTL_OPENIB_QP_TYPE(qp)) {
    case MCA_BTL_OPENIB_PP_QP:
        endpoint_init_qp_pp(slot, qp);
        break;

    case MCA_BTL_OPENIB_SRQ_QP:
        endpoint_init_qp_srq(slot, qp);
        break;

    case MCA_BTL_OPENIB_XRC_QP:
        if (NULL == ep->ib_addr->qp) {
            ep->ib_addr->qp = endpoint_alloc_qp();
        }
        endpoint_init_qp_xrc(ep, qp);
        break;

    default:
        BTL_ERROR(("Wrong QP type"));
        return;
    }

    slot->qp->wqe_count = QP_TX_BATCH_COUNT;
    slot->qp->sd_wqe_inflight = 0;
}
258 
/*
 * Bind a constructed endpoint to its BTL module and remote peer.
 *
 * btl               owning openib module
 * ep                endpoint to fill in
 * local_cpc         connect module used locally for this endpoint
 * remote_proc_info  peer's modex data; its pm_port_info is copied into
 *                   ep->rem_info
 * remote_cpc_data   peer's connect-module data, stored as given
 *
 * Eager RDMA is enabled on the endpoint only when both the device and the
 * component have it enabled.  Finally every configured QP slot is
 * initialized.
 */
void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
                                  mca_btl_base_endpoint_t *ep,
                                  opal_btl_openib_connect_base_module_t *local_cpc,
                                  mca_btl_openib_proc_modex_t *remote_proc_info,
                                  opal_btl_openib_connect_base_module_data_t *remote_cpc_data)
{
    /* local side */
    ep->endpoint_btl = btl;
    ep->subnet_id = btl->port_info.subnet_id;
    ep->use_eager_rdma = btl->device->use_eager_rdma &
        mca_btl_openib_component.use_eager_rdma;
    ep->endpoint_local_cpc = local_cpc;
    ep->endpoint_remote_cpc_data = remote_cpc_data;

    /* remote side, taken from the peer's port info */
    ep->rem_info.rem_lid = remote_proc_info->pm_port_info.lid;
    ep->rem_info.rem_subnet_id = remote_proc_info->pm_port_info.subnet_id;
    ep->rem_info.rem_mtu = remote_proc_info->pm_port_info.mtu;
    ep->rem_info.rem_vendor_id = remote_proc_info->pm_port_info.vendor_id;
    ep->rem_info.rem_vendor_part_id = remote_proc_info->pm_port_info.vendor_part_id;
    ep->rem_info.rem_transport_type =
        (mca_btl_openib_transport_type_t) remote_proc_info->pm_port_info.transport_type;

    opal_output(-1, "Got remote LID, subnet, MTU: %d, 0x%" PRIx64 ", %d",
                ep->rem_info.rem_lid,
                ep->rem_info.rem_subnet_id,
                ep->rem_info.rem_mtu);

    for (int idx = 0; idx < mca_btl_openib_component.num_qps; ++idx) {
        endpoint_init_qp(ep, idx);
    }
}
292 
/*
 * Object constructor for an endpoint.
 *
 * Allocates the zero-filled QP slot array and the remote QP/SRQ info
 * arrays, puts every scalar member into its "closed / not connected"
 * default and constructs the endpoint's locks and pending lists.
 *
 * NOTE(review): the calloc() results are stored without being checked.
 */
static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    /* QP slots */
    endpoint->qps = calloc(mca_btl_openib_component.num_qps,
                           sizeof(mca_btl_openib_endpoint_qp_t));

    /* Remote QP/SRQ info: with XRC there is a single remote QP entry plus
     * one SRQ entry per XRC QP, otherwise one remote QP entry per QP. */
    if (MCA_BTL_XRC_ENABLED) {
        endpoint->rem_info.rem_qps =
            calloc(1, sizeof(mca_btl_openib_rem_qp_info_t));
        endpoint->rem_info.rem_srqs =
            calloc(mca_btl_openib_component.num_xrc_qps,
                   sizeof(mca_btl_openib_rem_srq_info_t));
    } else {
        endpoint->rem_info.rem_qps =
            calloc(mca_btl_openib_component.num_qps,
                   sizeof(mca_btl_openib_rem_qp_info_t));
        endpoint->rem_info.rem_srqs = NULL;
    }

    /* XRC receive QP handle */
    endpoint->ib_addr = NULL;
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    endpoint->xrc_recv_qp = NULL;
#else
    endpoint->xrc_recv_qp_num = 0;
#endif

    /* ownership and connection state */
    endpoint->endpoint_btl = NULL;
    endpoint->endpoint_proc = NULL;
    endpoint->endpoint_local_cpc = NULL;
    endpoint->endpoint_remote_cpc_data = NULL;
    endpoint->endpoint_initiator = false;
    endpoint->endpoint_tstamp = 0.0;
    endpoint->endpoint_state = MCA_BTL_IB_CLOSED;
    endpoint->endpoint_retries = 0;

    /* lock and pending-fragment lists */
    OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&endpoint->pending_lazy_frags, opal_list_t);
    OBJ_CONSTRUCT(&endpoint->pending_get_frags, opal_list_t);
    OBJ_CONSTRUCT(&endpoint->pending_put_frags, opal_list_t);

    endpoint->get_tokens = mca_btl_openib_component.ib_qp_ous_rd_atom;

    /* eager RDMA: clear both descriptors first, then build the local lock
     * inside the freshly cleared structure */
    endpoint->eager_recv_count = 0;
    memset(&endpoint->eager_rdma_remote, 0,
           sizeof(mca_btl_openib_eager_rdma_remote_t));
    memset(&endpoint->eager_rdma_local, 0,
           sizeof(mca_btl_openib_eager_rdma_local_t));
    OBJ_CONSTRUCT(&endpoint->eager_rdma_local.lock, opal_mutex_t);
    endpoint->eager_rdma_remote.tokens = 0;
    endpoint->eager_rdma_local.credits = 0;
    endpoint->use_eager_rdma = false;

    /* remote port info and byte order */
    endpoint->rem_info.rem_lid = 0;
    endpoint->rem_info.rem_subnet_id = 0;
    endpoint->rem_info.rem_mtu = 0;
    endpoint->nbo = false;

    /* CTS protocol state */
    endpoint->endpoint_cts_mr = NULL;
    endpoint->endpoint_cts_frag.super.super.base.super.registration = NULL;
    endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL;
    endpoint->endpoint_posted_recvs = false;
    endpoint->endpoint_cts_received = false;
    endpoint->endpoint_cts_sent = false;
}
355 
/*
 * Destroy an endpoint
 *
 */
360 
/*
 * Object destructor for an endpoint.
 *
 * In order: lets the connect module finalize the endpoint, frees the CTS
 * buffer, releases the local eager RDMA buffer, drains and destroys the
 * per-QP pending lists, drops this endpoint's reference on each QP object
 * (destroying the verbs QP and freeing the object when the last user
 * leaves), frees the QP and remote-info arrays, releases the XRC receive
 * QP, and finally destroys the endpoint lock and the endpoint-level
 * pending lists.
 *
 * NOTE(review): endpoint->endpoint_local_cpc is dereferenced
 * unconditionally although the constructor sets it to NULL; this assumes
 * the endpoint went through mca_btl_openib_endpoint_init() -- confirm.
 */
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
{
    bool pval_clean = false;
    int qp;

    /* If the CPC has an endpoint_finalize function, call it */
    if (NULL != endpoint->endpoint_local_cpc->cbm_endpoint_finalize) {
        endpoint->endpoint_local_cpc->cbm_endpoint_finalize(endpoint);
    }

    /* Release CTS buffer */
    opal_btl_openib_connect_base_free_cts(endpoint);

    /* Release memory resources */
    do {
        void *_tmp_ptr = NULL;
        /* Make sure that mca_btl_openib_endpoint_connect_eager_rdma ()
         * was not in "connect" or "bad" flow (failed to allocate memory)
         * and changed the pointer back to NULL
         */
        if(!opal_atomic_compare_exchange_strong_ptr(&endpoint->eager_rdma_local.base.pval, (void *) &_tmp_ptr, (void *) 1)) {
            /* pval was not NULL: release the registration and the buffer;
             * loop again until the allocation base has been seen */
            if (NULL != endpoint->eager_rdma_local.reg) {
                endpoint->endpoint_btl->device->rcache->rcache_deregister (endpoint->endpoint_btl->device->rcache,
                                                                           &endpoint->eager_rdma_local.reg->base);
                endpoint->eager_rdma_local.reg = NULL;
            }

            void *alloc_base = opal_atomic_swap_ptr (&endpoint->eager_rdma_local.alloc_base, NULL);
            if (alloc_base) {
                endpoint->endpoint_btl->super.btl_mpool->mpool_free (endpoint->endpoint_btl->super.btl_mpool, alloc_base);
                pval_clean = true;
            }
        } else {
            /* pval was NULL and is now marked with 1: nothing to free */
            pval_clean=true;
        }
    } while (!pval_clean);

    /* Close opened QPs if we have them*/
    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[0]);
        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[1]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[0]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[1]);

        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
                &endpoint->qps[qp].no_wqe_pending_frags[0]);
        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
                &endpoint->qps[qp].no_wqe_pending_frags[1]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]);

        /* the QP object may be shared; only the last user tears it down */
        if(--endpoint->qps[qp].qp->users != 0)
            continue;

        if(endpoint->qps[qp].qp->lcl_qp != NULL)
            if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
                BTL_ERROR(("Failed to destroy QP:%d\n", qp));

        free(endpoint->qps[qp].qp);
    }

    /* free the qps */
    free(endpoint->qps);
    endpoint->qps = NULL;

    free(endpoint->rem_info.rem_qps);
    free(endpoint->rem_info.rem_srqs);

    /* unregister xrc recv qp */
#if HAVE_XRC
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    if (NULL != endpoint->xrc_recv_qp) {
        /* Read the QP number before destroying the QP: after a successful
         * ibv_destroy_qp() the ibv_qp must not be accessed any more. */
        const uint32_t xrc_qp_num = endpoint->xrc_recv_qp->qp_num;

        if(ibv_destroy_qp(endpoint->xrc_recv_qp)) {
            BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", xrc_qp_num));
        } else {
            endpoint->xrc_recv_qp = NULL;
            BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", xrc_qp_num));
        }
    }
#else
    if (0 != endpoint->xrc_recv_qp_num) {
        if(ibv_unreg_xrc_rcv_qp(endpoint->endpoint_btl->device->xrc_domain,
                    endpoint->xrc_recv_qp_num)) {
            BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp_num));
        } else {
            BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp_num));
        }
    }
#endif
#endif

    OBJ_DESTRUCT(&endpoint->endpoint_lock);
    /* Clean pending lists */
    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
    OBJ_DESTRUCT(&endpoint->pending_lazy_frags);

    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags);
    OBJ_DESTRUCT(&endpoint->pending_get_frags);

    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags);
    OBJ_DESTRUCT(&endpoint->pending_put_frags);
}
463 
464 
465 /*
466  * Called when the connect module has created all the qp's on an
467  * endpoint and needs to have some receive buffers posted.
468  */
/*
 * Post receive buffers for every configured QP of an endpoint: per-peer
 * QPs through mca_btl_openib_endpoint_post_rr_nolock(), all other QP
 * types through mca_btl_openib_post_srr() on the owning BTL.
 *
 * The results of the individual post calls are not inspected; the
 * function always returns OPAL_SUCCESS.
 */
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
{
    for (int idx = 0; idx < mca_btl_openib_component.num_qps; ++idx) {
        if (!BTL_OPENIB_QP_TYPE_PP(idx)) {
            mca_btl_openib_post_srr(endpoint->endpoint_btl, idx);
            continue;
        }

        mca_btl_openib_endpoint_post_rr_nolock(endpoint, idx);
    }

    return OPAL_SUCCESS;
}
483 
/*
 * Completion callback for the CTS control fragment (installed as
 * des_cbfunc by mca_btl_openib_endpoint_send_cts()).  It only emits a
 * debug message naming the peer; btl, des and status are unused.
 */
static void cts_sent(mca_btl_base_module_t* btl,
                     struct mca_btl_base_endpoint_t* ep,
                     struct mca_btl_base_descriptor_t* des,
                     int status)
{
    /* Nothing to do/empty function (we can't pass in a NULL pointer
       for the des_cbfunc) */
    OPAL_OUTPUT((-1, "CTS send to %s completed",
                 opal_get_proc_hostname(ep->endpoint_proc->proc_opal)));
}
494 
495 /*
496  * Send CTS control fragment
497  */
/*
 * Build a CTS control fragment and post it on the credits QP.
 *
 * The fragment is sent with priority and with an unconditional completion
 * callback (cts_sent).  If no control fragment can be allocated, or the
 * post does not return OPAL_SUCCESS, the endpoint's error handler is
 * invoked.
 *
 * NOTE(review): endpoint_cts_sent is set to true even when posting the
 * fragment failed -- confirm this is intended.
 */
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
{
    mca_btl_openib_send_control_frag_t *cts_frag;
    mca_btl_openib_com_frag_t *com_part;
    mca_btl_openib_frag_t *frag_part;
    mca_btl_base_descriptor_t *desc;
    mca_btl_openib_control_header_t *payload;

    OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                 mca_btl_openib_component.credits_qp,
                 endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));

    cts_frag = alloc_control_frag(endpoint->endpoint_btl);
    if (OPAL_UNLIKELY(NULL == cts_frag)) {
        BTL_ERROR(("Failed to allocate control buffer"));
        mca_btl_openib_endpoint_invoke_error(endpoint);
        return;
    }

    /* Walk down the nested fragment types through their explicit member
       fields rather than the "to_<foo>()" macros, so each layer's type is
       spelled out.  This path is not performance-critical. */
    com_part = &cts_frag->super.super;
    frag_part = &com_part->super;
    desc = &frag_part->base;

    /* fragment header */
    cts_frag->hdr->tag = MCA_BTL_TAG_IB;
    cts_frag->hdr->cm_seen = 0;
    cts_frag->hdr->credits = 0;

    /* control payload: a bare control header of type CTS */
    payload = (mca_btl_openib_control_header_t *) frag_part->segment.seg_addr.pval;
    payload->type = MCA_BTL_OPENIB_CONTROL_CTS;
    frag_part->segment.seg_len = sizeof(*payload);

    /* descriptor: priority send on the credits QP, always call back */
    com_part->endpoint = endpoint;
    desc->order = mca_btl_openib_component.credits_qp;
    desc->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    desc->des_cbfunc = cts_sent;
    desc->des_cbdata = NULL;

    /* Send the fragment */
    if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, cts_frag)) {
        BTL_ERROR(("Failed to post CTS send"));
        mca_btl_openib_endpoint_invoke_error(endpoint);
    }

    endpoint->endpoint_cts_sent = true;
}
547 
548 /*
549  * Called when the CPC has established a connection on an endpoint
550  */
/*
 * Called by the connect module once it has established a connection on
 * the endpoint.
 *
 * When the CPC uses the CTS protocol, receive buffers are posted and a
 * CTS is sent according to the rules described below; otherwise the
 * endpoint is marked connected right away.
 *
 * Locking: per the comment in the body, the caller holds
 * endpoint->endpoint_lock and expects it to be dropped here.
 * mca_btl_openib_endpoint_connected() releases it, as does the explicit
 * unlock below.
 * NOTE(review): the post_recvs failure path, and the CTS path where no
 * CTS is sent (not IB, not initiator, no CTS received yet), return
 * without an unlock in this function -- confirm the caller's
 * expectations.
 */
void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
{
    /* If the CPC uses the CTS protocol, then start it up */
    if (endpoint->endpoint_local_cpc->cbm_uses_cts) {
        /* stays 0 when the transport type is not available at build time */
        int transport_type_ib_p = 0;
        /* Post our receives, which will make credit management happy
           (i.e., rd_credits will be 0) */
        if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_recvs(endpoint)) {
            BTL_ERROR(("Failed to post receive buffers"));
            mca_btl_openib_endpoint_invoke_error(endpoint);
            return;
        }
        endpoint->endpoint_posted_recvs = true;

        /* If this is IB, send the CTS immediately.  If this is iWARP,
           then only send the CTS if this endpoint was the initiator
           of the connection (the receiver will send its CTS when it
           receives this side's CTS).  Also send the CTS if we already
           received the peer's CTS (e.g., if this process was slow to
           call cpc_complete()). */
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
        transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
#endif
        OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiatior %d, cts received: %d",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                     transport_type_ib_p,
                     endpoint->endpoint_initiator,
                     endpoint->endpoint_cts_received));
        if (transport_type_ib_p ||
            endpoint->endpoint_initiator ||
            endpoint->endpoint_cts_received) {
            mca_btl_openib_endpoint_send_cts(endpoint);

            /* If we've already got the CTS from the other side, then
               mark us as connected */
            if (endpoint->endpoint_cts_received) {
                OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
                             opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
                mca_btl_openib_endpoint_connected(endpoint);
            } else {
                /* the caller holds the lock and expects us to drop it */
                OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
            }
        }

        OPAL_OUTPUT((-1, "cpc_complete to %s -- done",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
        return;
    }

    /* Otherwise, just set the endpoint to "connected" */
    mca_btl_openib_endpoint_connected(endpoint);
}
604 
605 /*
606  * called when the connect module has completed setup of an endpoint
607  */
/*
 * Move an endpoint to the connected state and flush the work that was
 * queued while it was connecting.
 *
 * Steps: (XRC only) decide under ib_addr->addr_lock whether this endpoint
 * is the XRC master; load the alternative path on the QPs when APM is
 * enabled; mark the endpoint connected; (XRC master only) start the
 * connection of every endpoint waiting on ib_addr->pending_ep; post all
 * fragments from pending_lazy_frags; release endpoint->endpoint_lock; and
 * restart pending put/get operations.
 *
 * Locking: endpoint->endpoint_lock is released here but not acquired
 * here -- presumably the caller holds it on entry; verify against callers.
 */
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item, *ep_item;
    mca_btl_openib_send_frag_t *frag;
    mca_btl_openib_endpoint_t *ep;
    bool master = false;

    opal_output(-1, "Now we are CONNECTED");
    if (MCA_BTL_XRC_ENABLED) {
        /* addr_lock stays held until the matching unlock further down */
        opal_mutex_lock (&endpoint->ib_addr->addr_lock);
        if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
            /* We are not xrc master */
            /* set our qp pointer to master qp */
            master = false;
        } else {
            /* I'm master of XRC */
            endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED;
            master = true;
        }
    }

    /* Run over all qps and load alternative path */
    if (APM_ENABLED) {
        int i;
        if (MCA_BTL_XRC_ENABLED) {
            /* with XRC only the master loads it, on the shared QP */
            if (master) {
                mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint);
            }
        } else {
            for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
                mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint);
            }
        }
    }

    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;

    if(MCA_BTL_XRC_ENABLED) {
        if (master) {
            /* start the endpoints that were waiting on this address */
            while (NULL != (ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep))) {
                ep = (mca_btl_openib_endpoint_t *)ep_item;
                if (OPAL_SUCCESS !=
                    opal_btl_openib_connect_base_start(endpoint->endpoint_local_cpc, ep)) {
                    BTL_ERROR(("Failed to connect pending endpoint\n"));
                }
            }
        }
        opal_mutex_unlock (&endpoint->ib_addr->addr_lock);
    }


    /* Process pending packet on the endpoint */

    /* While there are frags in the list, process them */
    while (NULL != (frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags)))) {
        frag = to_send_frag(frag_item);
        /* We need to post this one */

        /* only OPAL_ERROR is reported; other non-success results
         * (e.g. OPAL_ERR_RESOURCE_BUSY) are not treated as errors here */
        if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
            BTL_ERROR(("Error posting send"));
        }
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* if upper layer called put or get before connection moved to connected
     * state then we restart them here */
    mca_btl_openib_frag_progress_pending_put_get(endpoint,
            mca_btl_openib_component.rdma_qp);
}
678 
679 /*
680  * Attempt to send a fragment using a given endpoint. If the endpoint is not
681  * connected, queue the fragment and start the connection as required.
682  */
/*
 * Send a fragment on an endpoint.
 *
 * Under endpoint_lock, check_endpoint_state() is consulted with the
 * endpoint's pending_lazy_frags list; only when it reports OPAL_SUCCESS is
 * the fragment posted.  OPAL_ERR_RESOURCE_BUSY, from either step, is
 * reported to the caller as OPAL_SUCCESS; any other status is returned
 * unchanged.
 */
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep,
                                 mca_btl_openib_send_frag_t* frag)
{
    int status;

    OPAL_THREAD_LOCK(&ep->endpoint_lock);

    status = check_endpoint_state(ep, &to_base_frag(frag)->base,
                                  &ep->pending_lazy_frags);
    if (OPAL_LIKELY(OPAL_SUCCESS == status)) {
        status = mca_btl_openib_endpoint_post_send(ep, frag);
    }

    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

    /* "busy" is not an error from the caller's point of view */
    return OPAL_UNLIKELY(OPAL_ERR_RESOURCE_BUSY == status) ? OPAL_SUCCESS : status;
}
702 
703 /**
704  * Return control fragment.
705  */
706 
/*
 * Completion callback of the credit control fragment (installed as
 * des_cbfunc by mca_btl_openib_endpoint_send_credits()).
 *
 * btl     unused
 * ep      endpoint the credit message was sent on
 * des     descriptor of the completed credit fragment
 * status  unused
 *
 * If more credits are due, another credit message is sent right away;
 * otherwise the credit-send lock is released and the credits are checked
 * once more.
 */
static void mca_btl_openib_endpoint_credits(
    mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* ep,
    struct mca_btl_base_descriptor_t* des,
    int status)
{

    int qp;

    mca_btl_openib_send_control_frag_t *frag = to_send_control_frag(des);

    /* the QP whose credits this fragment carries; the fragment itself was
     * sent on des->order */
    qp = frag->qp_idx;

    /* we don't acquire a WQE for credit message - so decrement.
     * Note: doing it for QP used for credit management */
    (void) qp_get_wqe(ep, des->order);

    if(check_send_credits(ep, qp) || check_eager_rdma_credits(ep))
        mca_btl_openib_endpoint_send_credits(ep, qp);
    else {
        BTL_OPENIB_CREDITS_SEND_UNLOCK(ep, qp);
        /* check one more time if credits are available after unlock */
        send_credits(ep, qp);
    }
}
732 
733 /**
734  * Return credits to peer
735  */
736 
mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t * endpoint,const int qp)737 void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
738         const int qp)
739 {
740     mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
741     mca_btl_openib_send_control_frag_t* frag;
742     mca_btl_openib_rdma_credits_header_t *credits_hdr;
743     int rc;
744     bool do_rdma = false;
745     int32_t cm_return;
746 
747     frag = endpoint->qps[qp].credit_frag;
748 
749     if(OPAL_UNLIKELY(NULL == frag)) {
750         frag = alloc_control_frag(openib_btl);
751         frag->qp_idx = qp;
752         endpoint->qps[qp].credit_frag = frag;
753         /* set those once and forever */
754         to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
755         to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
756         to_base_frag(frag)->base.des_cbdata = NULL;
757         to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;;
758         to_com_frag(frag)->endpoint = endpoint;
759         frag->hdr->tag = MCA_BTL_TAG_IB;
760         to_base_frag(frag)->segment.seg_len =
761             sizeof(mca_btl_openib_rdma_credits_header_t);
762     }
763 
764     assert(frag->qp_idx == qp);
765     credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
766         to_base_frag(frag)->segment.seg_addr.pval;
767     if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
768         do_rdma = true;
769     } else {
770         if(OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
771                 (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
772             OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
773             BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
774             return;
775         }
776      }
777 
778     BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);
779 
780     frag->hdr->cm_seen = 0;
781     BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
782     if(cm_return > 255) {
783         frag->hdr->cm_seen = 255;
784         cm_return -= 255;
785         OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
786     } else {
787         frag->hdr->cm_seen = cm_return;
788     }
789 
790     BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
791     credits_hdr->qpn = qp;
792     credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;
793 
794     if(endpoint->nbo)
795          BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);
796 
797     qp_reset_signal_count(endpoint, qp);
798     if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
799         return;
800 
801     if(endpoint->nbo) {
802         BTL_OPENIB_HEADER_NTOH(*frag->hdr);
803         BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
804     }
805     BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
806     OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.rd_credits,
807             frag->hdr->credits);
808     OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_local.credits,
809             credits_hdr->rdma_credits);
810     if(do_rdma)
811         OPAL_THREAD_ADD_FETCH32(&endpoint->eager_rdma_remote.tokens, 1);
812     else
813         OPAL_THREAD_ADD_FETCH32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
814 
815     BTL_ERROR(("error posting send request errno %d says %s", rc,
816                 strerror(errno)));
817 }
818 
819 /* local callback function for completion of eager rdma connect */
mca_btl_openib_endpoint_eager_rdma_connect_cb(mca_btl_base_module_t * btl,struct mca_btl_base_endpoint_t * endpoint,struct mca_btl_base_descriptor_t * descriptor,int status)820 static void mca_btl_openib_endpoint_eager_rdma_connect_cb(
821     mca_btl_base_module_t* btl,
822     struct mca_btl_base_endpoint_t* endpoint,
823     struct mca_btl_base_descriptor_t* descriptor,
824     int status)
825 {
826     mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
827     OPAL_THREAD_ADD_FETCH32(&device->non_eager_rdma_endpoints, -1);
828     assert(device->non_eager_rdma_endpoints >= 0);
829     MCA_BTL_IB_FRAG_RETURN(descriptor);
830 }
831 
832 /* send the eager rdma connect message to the remote endpoint */
mca_btl_openib_endpoint_send_eager_rdma(mca_btl_base_endpoint_t * endpoint)833 static int mca_btl_openib_endpoint_send_eager_rdma(
834     mca_btl_base_endpoint_t* endpoint)
835 {
836     mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
837     mca_btl_openib_eager_rdma_header_t *rdma_hdr;
838     mca_btl_openib_send_control_frag_t* frag;
839     int rc;
840 
841     frag = alloc_control_frag(openib_btl);
842     if(NULL == frag) {
843         return -1;
844     }
845 
846     to_base_frag(frag)->base.des_cbfunc =
847         mca_btl_openib_endpoint_eager_rdma_connect_cb;
848     to_base_frag(frag)->base.des_cbdata = NULL;
849     to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
850     to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
851     to_base_frag(frag)->segment.seg_len =
852         sizeof(mca_btl_openib_eager_rdma_header_t);
853     to_com_frag(frag)->endpoint = endpoint;
854 
855     frag->hdr->tag = MCA_BTL_TAG_IB;
856     rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
857     rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
858     rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
859     rdma_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);
860     BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
861                  ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
862                  rdma_hdr->rkey,
863                  rdma_hdr->rdma_start.lval,
864                  rdma_hdr->rdma_start.pval,
865                  rdma_hdr->rdma_start.ival,
866                  rdma_hdr->control.type,
867                  (int) sizeof(mca_btl_openib_eager_rdma_header_t)
868                  ));
869 
870     if(endpoint->nbo) {
871         BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));
872 
873         BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
874                      rdma_hdr->rkey,
875                      rdma_hdr->rdma_start.lval,
876                      rdma_hdr->rdma_start.pval,
877                      rdma_hdr->rdma_start.ival
878                      ));
879     }
880     rc = mca_btl_openib_endpoint_send(endpoint, frag);
881     if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc)
882         return OPAL_SUCCESS;
883 
884     MCA_BTL_IB_FRAG_RETURN(frag);
885     BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
886     return rc;
887 }
888 
889 /* Setup eager RDMA buffers and notify the remote endpoint*/
mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t * endpoint)890 void mca_btl_openib_endpoint_connect_eager_rdma(
891         mca_btl_openib_endpoint_t* endpoint)
892 {
893     mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
894     char *buf, *alloc_base;
895     mca_btl_openib_recv_frag_t *headers_buf;
896     int i, rc;
897     uint32_t flag = MCA_RCACHE_FLAGS_CACHE_BYPASS;
898     void *_tmp_ptr = NULL;
899 
900     /* Set local rdma pointer to 1 temporarily so other threads will not try
901      * to enter the function */
902     if(!opal_atomic_compare_exchange_strong_ptr (&endpoint->eager_rdma_local.base.pval, (void *) &_tmp_ptr,
903                                                  (void *) 1)) {
904         return;
905     }
906 
907     headers_buf = (mca_btl_openib_recv_frag_t*)
908         malloc(sizeof(mca_btl_openib_recv_frag_t) *
909             mca_btl_openib_component.eager_rdma_num);
910 
911     if(NULL == headers_buf)
912        goto unlock_rdma_local;
913 
914 #if HAVE_DECL_IBV_ACCESS_SO
915     /* Solaris implements the Relaxed Ordering feature defined in the
916        PCI Specification. With this in mind any memory region which
917        relies on a buffer being written in a specific order, for
918        example the eager rdma connections created in this routinue,
919        must set a strong order flag when registering the memory for
920        rdma operations.
921 
922        The following flag will be interpreted and the appropriate
923        steps will be taken when the memory is registered in
924        openib_reg_mr(). */
925     flag |= MCA_RCACHE_FLAGS_SO_MEM;
926 #endif
927 
928     alloc_base = buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
929                                                             openib_btl->eager_rdma_frag_size *
930                                                             mca_btl_openib_component.eager_rdma_num,
931                                                             mca_btl_openib_component.buffer_alignment,
932                                                             0);
933 
934     if(!buf)
935        goto free_headers_buf;
936 
937     rc = openib_btl->device->rcache->rcache_register (openib_btl->device->rcache, buf, openib_btl->eager_rdma_frag_size *
938                                                       mca_btl_openib_component.eager_rdma_num, flag, MCA_RCACHE_ACCESS_ANY,
939                                                       (mca_rcache_base_registration_t**)&endpoint->eager_rdma_local.reg);
940     if (OPAL_SUCCESS != rc) {
941         openib_btl->super.btl_mpool->mpool_free (openib_btl->super.btl_mpool, alloc_base);
942         goto free_headers_buf;
943     }
944 
945     buf = buf + openib_btl->eager_rdma_frag_size -
946         sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
947         sizeof(mca_btl_openib_header_t);
948 
949     for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
950         opal_free_list_item_t *item;
951         mca_btl_openib_recv_frag_t * frag;
952         mca_btl_openib_frag_init_data_t init_data;
953 
954         item = (opal_free_list_item_t*)&headers_buf[i];
955         item->registration = (mca_rcache_base_registration_t *)endpoint->eager_rdma_local.reg;
956         item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
957         OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);
958 
959         init_data.order = mca_btl_openib_component.credits_qp;
960         init_data.list = NULL;
961 
962         mca_btl_openib_frag_init(item, &init_data);
963         frag = to_recv_frag(item);
964         to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
965         to_com_frag(frag)->endpoint = endpoint;
966         frag->ftr = (mca_btl_openib_footer_t*)
967             ((char*)to_base_frag(frag)->segment.seg_addr.pval +
968              mca_btl_openib_component.eager_limit);
969 
970         MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
971     }
972 
973     endpoint->eager_rdma_local.frags = headers_buf;
974 
975     endpoint->eager_rdma_local.rd_win =
976         mca_btl_openib_component.eager_rdma_num >> 2;
977     endpoint->eager_rdma_local.rd_win =
978         endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;
979 
980     /* set local rdma pointer to real value */
981     endpoint->eager_rdma_local.base.pval = buf;
982     endpoint->eager_rdma_local.alloc_base = alloc_base;
983 
984     if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) {
985         mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
986         mca_btl_openib_endpoint_t **p;
987         void *_tmp_ptr;
988         OBJ_RETAIN(endpoint);
989         assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
990         do {
991             _tmp_ptr = NULL;
992             p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
993         } while(!opal_atomic_compare_exchange_strong_ptr (p, (void *) &_tmp_ptr, endpoint));
994 
995         OPAL_THREAD_ADD_FETCH32(&openib_btl->eager_rdma_channels, 1);
996         /* from this point progress function starts to poll new buffer */
997         OPAL_THREAD_ADD_FETCH32(&device->eager_rdma_buffers_count, 1);
998         return;
999     }
1000 
1001     openib_btl->device->rcache->rcache_deregister (openib_btl->device->rcache,
1002                                                    (mca_rcache_base_registration_t*)endpoint->eager_rdma_local.reg);
1003     openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, buf);
1004 free_headers_buf:
1005     free(headers_buf);
1006 unlock_rdma_local:
1007     /* set local rdma pointer back to zero. Will retry later */
1008     endpoint->eager_rdma_local.base.pval = NULL;
1009     endpoint->eager_rdma_local.frags = NULL;
1010 }
1011 
1012 /*
1013  * Invoke an error on the btl associated with an endpoint.  If we
1014  * don't have an endpoint, then just use the first one on the
1015  * component list of BTLs.
1016  */
mca_btl_openib_endpoint_invoke_error(void * context)1017 void *mca_btl_openib_endpoint_invoke_error(void *context)
1018 {
1019     mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t*) context;
1020     mca_btl_openib_module_t *btl = NULL;
1021 
1022     if (NULL == endpoint) {
1023         int i;
1024         for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i) {
1025             if (NULL != mca_btl_openib_component.openib_btls[i] &&
1026                 NULL != mca_btl_openib_component.openib_btls[i]->error_cb) {
1027                 btl = mca_btl_openib_component.openib_btls[i];
1028                 break;
1029             }
1030         }
1031     } else {
1032         btl = endpoint->endpoint_btl;
1033     }
1034 
1035     /* If we didn't find a BTL, then just bail :-( */
1036     if (NULL == btl || NULL == btl->error_cb) {
1037         opal_show_help("help-mpi-btl-openib.txt",
1038                        "cannot raise btl error", true,
1039                        opal_process_info.nodename,
1040                        __FILE__, __LINE__);
1041         exit(1);
1042     }
1043 
1044     /* Invoke the callback to the upper layer */
1045     btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
1046 
1047     /* Will likely never get here */
1048     return NULL;
1049 }
1050