1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2013 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2017 Cisco Systems, Inc.  All rights reserved
14  * Copyright (c) 2006-2015 Mellanox Technologies. All rights reserved.
15  * Copyright (c) 2006-2015 Los Alamos National Security, LLC.  All rights
16  *                         reserved.
17  * Copyright (c) 2006-2007 Voltaire All rights reserved.
18  * Copyright (c) 2009-2012 Oracle and/or its affiliates.  All rights reserved.
19  * Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
20  * Copyright (c) 2012      Oak Ridge National Laboratory.  All rights reserved
21  * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
22  * Copyright (c) 2014-2017 Research Organization for Information Science
23  *                         and Technology (RIST). All rights reserved.
24  * Copyright (c) 2014      Bull SAS.  All rights reserved.
25  * $COPYRIGHT$
26  *
27  * Additional copyrights may follow
28  *
29  * $HEADER$
30  */
31 
32 #include "opal_config.h"
33 
34 #include <infiniband/verbs.h>
35 #include <errno.h>
36 #include <string.h>
37 #ifdef HAVE_UNISTD_H
38 #include <unistd.h>
39 #endif
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <fcntl.h>
43 #include <stdlib.h>
44 #include <stddef.h>
45 
46 #include "opal/mca/memory/memory.h"
47 #include "opal/mca/event/event.h"
48 #include "opal/align.h"
49 #include "opal/util/output.h"
50 #include "opal/util/argv.h"
51 #include "opal/mca/timer/base/base.h"
52 #include "opal/sys/atomic.h"
53 #include "opal/util/sys_limits.h"
54 #include "opal/util/argv.h"
55 #include "opal/memoryhooks/memory.h"
56 /* Define this before including hwloc.h so that we also get the hwloc
57    verbs helper header file, too.  We have to do this level of
58    indirection because the hwloc subsystem is a component -- we don't
59    know its exact path.  We have to rely on the framework header files
60    to find the right hwloc verbs helper file for us. */
61 #define OPAL_HWLOC_WANT_VERBS_HELPER 1
62 #include "opal/mca/hwloc/hwloc-internal.h"
63 #include "opal/mca/hwloc/base/base.h"
64 #include "opal/mca/installdirs/installdirs.h"
65 #include "opal_stdint.h"
66 #include "opal/util/show_help.h"
67 #include "opal/mca/btl/btl.h"
68 #include "opal/mca/btl/base/base.h"
69 #include "opal/mca/mpool/base/base.h"
70 #include "opal/mca/rcache/rcache.h"
71 #include "opal/mca/rcache/base/base.h"
72 #include "opal/mca/common/cuda/common_cuda.h"
73 #include "opal/mca/common/verbs/common_verbs.h"
74 #include "opal/runtime/opal_params.h"
75 #include "opal/runtime/opal.h"
76 #include "opal/mca/pmix/pmix.h"
77 #include "opal/util/proc.h"
78 
79 #include "btl_openib.h"
80 #include "btl_openib_frag.h"
81 #include "btl_openib_endpoint.h"
82 #include "btl_openib_eager_rdma.h"
83 #include "btl_openib_proc.h"
84 #include "btl_openib_ini.h"
85 #include "btl_openib_mca.h"
86 #include "btl_openib_xrc.h"
87 #include "btl_openib_async.h"
88 #include "connect/base.h"
89 #include "btl_openib_ip.h"
90 
91 #define EPS 1.e-6
92 /*
93  * Local functions
94  */
/* Component entry points; they are wired into mca_btl_openib_component
   (defined right after these declarations). */
static int btl_openib_component_register(void);
static int btl_openib_component_open(void);
static int btl_openib_component_close(void);
static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
static int btl_openib_component_progress(void);
/* The completion handler below is only declared when Open MPI is built
   with CUDA support. */
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
                                                  mca_btl_openib_endpoint_t *ep,
                                                  mca_btl_base_descriptor_t* des,
                                                  int status);
#endif /* OPAL_CUDA_SUPPORT */
/*
 * Local variables (file scope, both start out cleared)
 */
/* NOTE(review): presumably the device whose receive_queues setting was
   adopted -- not referenced in this part of the file, confirm. */
static mca_btl_openib_device_t *receive_queues_device = NULL;
/* NOTE(review): presumably counts devices that were skipped on purpose
   -- not referenced in this part of the file, confirm. */
static int num_devices_intentionally_ignored = 0;
111 
/*
 * The openib BTL component instance.  Only the embedded base component
 * (.super) is initialized statically here; it names the component
 * "openib" and hooks up the register/open/close/init/progress entry
 * points declared above.  All remaining members are left
 * zero-initialized by this definition.
 */
mca_btl_openib_component_t mca_btl_openib_component = {
    .super = {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */

        .btl_version = {
            MCA_BTL_DEFAULT_VERSION("openib"),
            /* MCA framework lifecycle callbacks */
            .mca_open_component = btl_openib_component_open,
            .mca_close_component = btl_openib_component_close,
            .mca_register_component_params = btl_openib_component_register,
        },
        .btl_data = {
            /* The component is checkpoint ready */
            .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        /* BTL-specific entry points: module creation and progress */
        .btl_init = btl_openib_component_init,
        .btl_progress = btl_openib_component_progress,
    }
};
132 
btl_openib_component_register(void)133 static int btl_openib_component_register(void)
134 {
135     int ret;
136 
137     /* register IB component parameters */
138     if (OPAL_SUCCESS != (ret = btl_openib_register_mca_params())) {
139         return ret;
140     }
141 
142     mca_btl_openib_component.max_send_size =
143         mca_btl_openib_module.super.btl_max_send_size;
144     mca_btl_openib_component.eager_limit =
145         mca_btl_openib_module.super.btl_eager_limit;
146 
147     /* if_include and if_exclude need to be mutually exclusive */
148     if (OPAL_SUCCESS !=
149         mca_base_var_check_exclusive("ompi",
150         mca_btl_openib_component.super.btl_version.mca_type_name,
151         mca_btl_openib_component.super.btl_version.mca_component_name,
152         "if_include",
153         mca_btl_openib_component.super.btl_version.mca_type_name,
154         mca_btl_openib_component.super.btl_version.mca_component_name,
155         "if_exclude")) {
156         /* Return ERR_NOT_AVAILABLE so that a warning message about
157            "open" failing is not printed */
158         return OPAL_ERR_NOT_AVAILABLE;
159     }
160 
161 #if OPAL_CUDA_SUPPORT
162     mca_common_cuda_register_mca_variables();
163 #endif
164 
165     return OPAL_SUCCESS;
166 }
167 
168 /*
169  *  Called by MCA framework to open the component
170  */
btl_openib_component_open(void)171 static int btl_openib_component_open(void)
172 {
173     opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock;
174     opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table;
175 
176     /* Construct hash table that stores pointers to SRQs */
177     OBJ_CONSTRUCT(lock, opal_mutex_t);
178     OBJ_CONSTRUCT(srq_addr_table, opal_hash_table_t);
179 
180     /* initialize state */
181     mca_btl_openib_component.ib_num_btls = 0;
182     mca_btl_openib_component.num_default_gid_btls = 0;
183     mca_btl_openib_component.openib_btls = NULL;
184     OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
185     mca_btl_openib_component.devices_count = 0;
186     mca_btl_openib_component.cpc_explicitly_defined = false;
187 
188     /* initialize objects */
189     OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
190     mca_btl_openib_component.memory_registration_verbose = -1;
191 
192 #if OPAL_CUDA_SUPPORT
193     mca_common_cuda_stage_one_init();
194 #endif /* OPAL_CUDA_SUPPORT */
195 
196     return OPAL_SUCCESS;
197 }
198 
/*
 *  Called by MCA framework to close the component and release the
 *  resources it holds
 */
202 
btl_openib_component_close(void)203 static int btl_openib_component_close(void)
204 {
205     int rc = OPAL_SUCCESS;
206 
207     /* remove the async event from the event base */
208     mca_btl_openib_async_fini ();
209 
210     OBJ_DESTRUCT(&mca_btl_openib_component.srq_manager.lock);
211     OBJ_DESTRUCT(&mca_btl_openib_component.srq_manager.srq_addr_table);
212 
213     opal_btl_openib_connect_base_finalize();
214     opal_btl_openib_ini_finalize();
215 
216     if (NULL != mca_btl_openib_component.default_recv_qps) {
217         free(mca_btl_openib_component.default_recv_qps);
218     }
219 
220     /* close memory registration debugging output */
221     opal_output_close (mca_btl_openib_component.memory_registration_verbose);
222 
223 #if OPAL_CUDA_SUPPORT
224     mca_common_cuda_fini();
225 #endif /* OPAL_CUDA_SUPPORT */
226 
227     return rc;
228 }
229 
/*
 * Store one byte at the current write position and advance that
 * position by one.
 *
 * dest:  in/out cursor into the output buffer; on return it points
 *        one byte past the byte just written
 * value: the byte to store
 */
static inline void pack8(char **dest, uint8_t value)
{
    char *cursor = *dest;

    /* Copy one character */
    *cursor = (char) value;
    /* Move the dest ahead one */
    *dest = cursor + 1;
}
237 
238 /*
239  *  Register local openib port information with the modex so that it
240  *  can be shared with all other peers.
241  */
btl_openib_modex_send(void)242 static int btl_openib_modex_send(void)
243 {
244     int rc, i, j;
245     int modex_message_size;
246     char *message, *offset;
247     size_t size, msg_size;
248     opal_btl_openib_connect_base_module_t *cpc;
249 
250     opal_output(-1, "Starting to modex send");
251     if (0 == mca_btl_openib_component.ib_num_btls) {
252         return 0;
253     }
254     modex_message_size = offsetof(mca_btl_openib_modex_message_t, end);
255 
256     /* The message is packed into multiple parts:
257      * 1. a uint8_t indicating the number of modules (ports) in the message
258      * 2. for each module:
259      *    a. the common module data
260      *    b. a uint8_t indicating how many CPCs follow
261      *    c. for each CPC:
262      *       a. a uint8_t indicating the index of the CPC in the all[]
263      *          array in btl_openib_connect_base.c
264      *       b. a uint8_t indicating the priority of this CPC
265      *       c. a uint8_t indicating the length of the blob to follow
266      *       d. a blob that is only meaningful to that CPC
267      */
268     msg_size =
269         /* uint8_t for number of modules in the message */
270         1 +
271         /* For each module: */
272         mca_btl_openib_component.ib_num_btls *
273         (
274          /* Common module data */
275          modex_message_size +
276          /* uint8_t for how many CPCs follow */
277          1
278          );
279     /* For each module, add in the size of the per-CPC data */
280     for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
281         for (j = 0;
282              j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
283              ++j) {
284             msg_size +=
285                 /* uint8_t for the index of the CPC */
286                 1 +
287                 /* uint8_t for the CPC's priority */
288                 1 +
289                 /* uint8_t for the blob length */
290                 1 +
291                 /* blob length */
292                 mca_btl_openib_component.openib_btls[i]->cpcs[j]->data.cbm_modex_message_len;
293         }
294     }
295     message = (char *) malloc(msg_size);
296     if (NULL == message) {
297         BTL_ERROR(("Failed malloc"));
298         return OPAL_ERR_OUT_OF_RESOURCE;
299     }
300 
301     /* Pack the number of modules */
302     offset = message;
303     pack8(&offset, mca_btl_openib_component.ib_num_btls);
304     opal_output(-1, "modex sending %d btls (packed: %d, offset now at %d)", mca_btl_openib_component.ib_num_btls, *((uint8_t*) message), (int) (offset - message));
305 
306     /* Pack each of the modules */
307     for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
308 
309         /* Pack the modex common message struct.  */
310         size = modex_message_size;
311 
312         (mca_btl_openib_component.openib_btls[i]->port_info).vendor_id =
313             (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id;
314 
315         (mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id =
316             (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id;
317 
318         (mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
319             mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
320 
321         memcpy(offset,
322                &(mca_btl_openib_component.openib_btls[i]->port_info),
323                size);
324         opal_output(-1, "modex packed btl port modex message: 0x%" PRIx64 ", %d, %d (size: %d)",
325                     mca_btl_openib_component.openib_btls[i]->port_info.subnet_id,
326                     mca_btl_openib_component.openib_btls[i]->port_info.mtu,
327                     mca_btl_openib_component.openib_btls[i]->port_info.lid,
328                     (int) size);
329 
330 #if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
331         MCA_BTL_OPENIB_MODEX_MSG_HTON(*(mca_btl_openib_modex_message_t *)offset);
332 #endif
333         offset += size;
334         opal_output(-1, "modex packed btl %d: modex message, offset now %d",
335                     i, (int) (offset -message));
336 
337         /* Pack the number of CPCs that follow */
338         pack8(&offset,
339               mca_btl_openib_component.openib_btls[i]->num_cpcs);
340         opal_output(-1, "modex packed btl %d: to pack %d cpcs (packed: %d, offset now %d)",
341                     i, mca_btl_openib_component.openib_btls[i]->num_cpcs,
342                     *((uint8_t*) (offset - 1)), (int) (offset-message));
343 
344         /* Pack each CPC */
345         for (j = 0;
346              j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
347              ++j) {
348             uint8_t u8;
349 
350             cpc = mca_btl_openib_component.openib_btls[i]->cpcs[j];
351             opal_output(-1, "modex packed btl %d: packing cpc %s",
352                         i, cpc->data.cbm_component->cbc_name);
353             /* Pack the CPC index */
354             u8 = opal_btl_openib_connect_base_get_cpc_index(cpc->data.cbm_component);
355             pack8(&offset, u8);
356             opal_output(-1, "packing btl %d: cpc %d: index %d (packed %d, offset now %d)",
357                         i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message));
358             /* Pack the CPC priority */
359             pack8(&offset, cpc->data.cbm_priority);
360             opal_output(-1, "packing btl %d: cpc %d: priority %d (packed %d, offset now %d)",
361                         i, j, cpc->data.cbm_priority, *((uint8_t*) (offset-1)), (int)(offset-message));
362             /* Pack the blob length */
363             u8 = cpc->data.cbm_modex_message_len;
364             pack8(&offset, u8);
365             opal_output(-1, "packing btl %d: cpc %d: message len %d (packed %d, offset now %d)",
366                         i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message));
367             /* If the blob length is > 0, pack the blob */
368             if (u8 > 0) {
369                 memcpy(offset, cpc->data.cbm_modex_message, u8);
370                 offset += u8;
371                 opal_output(-1, "packing btl %d: cpc %d: blob packed %d %x (offset now %d)",
372                             i, j,
373                             ((uint32_t*)cpc->data.cbm_modex_message)[0],
374                             ((uint32_t*)cpc->data.cbm_modex_message)[1],
375                             (int)(offset-message));
376             }
377 
378             /* Sanity check */
379             assert((size_t) (offset - message) <= msg_size);
380         }
381     }
382 
383     /* All done -- send it! */
384     OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
385                     &mca_btl_openib_component.super.btl_version,
386                     message, msg_size);
387     free(message);
388     opal_output(-1, "Modex sent!  %d calculated, %d actual\n", (int) msg_size, (int) (offset - message));
389 
390     return rc;
391 }
392 
393 /*
394  * Active Message Callback function on control message.
395  */
396 
/*
 * Handle an incoming openib control message.
 *
 * The control header at the start of the descriptor's segment selects
 * the action:
 *  - CREDITS:   must never arrive here (asserts)
 *  - RDMA:      records the peer's eager RDMA rkey/base address and
 *               seeds the token count
 *  - COALESCED: walks the packed sub-messages and invokes the
 *               registered active-message callback for each of them
 *  - CTS:       notes that the peer's CTS arrived and, if our receives
 *               are already posted, answers with our own CTS (when not
 *               sent yet) and marks the endpoint connected
 *
 * The tag and cbdata arguments are not used.
 */
static void btl_openib_control(mca_btl_base_module_t* btl,
        mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des,
        void* cbdata)
{
    /* don't return credits used for control messages */
    mca_btl_openib_module_t *openib_btl = (mca_btl_openib_module_t *) btl;
    mca_btl_openib_endpoint_t *endpoint = to_com_frag(des)->endpoint;
    mca_btl_openib_control_header_t *header =
        (mca_btl_openib_control_header_t *) to_base_frag(des)->segment.seg_addr.pval;
    /* Number of payload bytes that follow the control header */
    size_t remaining = des->des_segments->seg_len - sizeof(*header);

    switch (header->type) {
    case MCA_BTL_OPENIB_CONTROL_CREDITS:
        assert(0); /* Credit message is handled elsewhere */
        break;

    case MCA_BTL_OPENIB_CONTROL_RDMA: {
        mca_btl_openib_eager_rdma_header_t *rdma =
            (mca_btl_openib_eager_rdma_header_t *) header;

        BTL_VERBOSE(("prior to NTOH received  rkey %" PRIu32
                     ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32,
                     rdma->rkey,
                     rdma->rdma_start.lval,
                     rdma->rdma_start.pval,
                     rdma->rdma_start.ival
                   ));

        /* Byte-swap in place when the endpoint is flagged as nbo */
        if (endpoint->nbo) {
            BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(*rdma);
        }

        BTL_VERBOSE(("received  rkey %" PRIu32
                     ", rdma_start.lval %" PRIx64 ", pval %p,"
                     " ival %" PRIu32, rdma->rkey,
                     rdma->rdma_start.lval,
                     rdma->rdma_start.pval, rdma->rdma_start.ival));

        /* A base address that is already set means the peer's RDMA
           region was announced before */
        if (endpoint->eager_rdma_remote.base.pval) {
            BTL_ERROR(("Got RDMA connect twice!"));
            return;
        }
        endpoint->eager_rdma_remote.rkey = rdma->rkey;
        endpoint->eager_rdma_remote.base.lval = rdma->rdma_start.lval;
        endpoint->eager_rdma_remote.tokens =
            mca_btl_openib_component.eager_rdma_num - 1;
        break;
    }

    case MCA_BTL_OPENIB_CONTROL_COALESCED: {
        /* The first coalesced header sits right behind the control
           header */
        mca_btl_openib_header_coalesced_t *coalesced =
            (mca_btl_openib_header_coalesced_t *) (header + 1);
        size_t padding = 0;

        while (remaining > 0) {
            mca_btl_openib_header_coalesced_t *unaligned = 0;
            mca_btl_active_message_callback_t *handler;
            mca_btl_base_descriptor_t frag_des;
            mca_btl_base_segment_t frag_seg;
            size_t advance;

            assert(remaining >= sizeof(*coalesced));

            if (endpoint->nbo) {
                BTL_OPENIB_HEADER_COALESCED_NTOH(*coalesced);
            }

            /* Distance from this header to the (unaligned) start of
               the next one */
            advance = (sizeof(*coalesced) + coalesced->alloc_size - padding);

            /* Temporary single-segment descriptor covering just this
               sub-message's payload */
            frag_seg.seg_addr.pval = coalesced + 1;
            frag_seg.seg_len = coalesced->size;
            frag_des.des_segments = &frag_seg;
            frag_des.des_segment_count = 1;

            /* call registered callback */
            handler = &mca_btl_base_active_message_trigger[coalesced->tag];
            handler->cbfunc(&openib_btl->super, coalesced->tag, &frag_des,
                            handler->cbdata);

            remaining -= (advance + padding);
            unaligned = (mca_btl_openib_header_coalesced_t *)
                ((unsigned char *) coalesced + advance);
            padding = (size_t) BTL_OPENIB_COALESCE_HDR_PADDING(unaligned);
            coalesced = (mca_btl_openib_header_coalesced_t *)
                ((unsigned char *) unaligned + padding);
        }
        break;
    }

    case MCA_BTL_OPENIB_CONTROL_CTS:
        OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                     (void*) header,
                     endpoint->endpoint_posted_recvs, endpoint->endpoint_cts_sent));
        endpoint->endpoint_cts_received = true;

        /* Only send the CTS back and mark connected if:
           - we have posted our receives (it's possible that we can
             get this CTS before this side's CPC has called
             cpc_complete())
           - we have not yet sent our CTS

           We don't even want to mark the endpoint connected() until
           we have posted our receives because otherwise we will
           trigger credit management (because the rd_credits will
           still be negative), and Bad Things will happen. */
        if (endpoint->endpoint_posted_recvs) {
            /* The lock must be held for both send_cts and connected.
               NOTE(review): there is no matching unlock in this
               function; presumably mca_btl_openib_endpoint_connected()
               releases it -- confirm. */
            OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
            if (!endpoint->endpoint_cts_sent) {
                mca_btl_openib_endpoint_send_cts(endpoint);
            }
            mca_btl_openib_endpoint_connected(endpoint);
        }
        break;

    default:
        BTL_ERROR(("Unknown message type received by BTL"));
        break;
    }
}
509 
/*
 * Register a memory region with the device's protection domain.
 *
 * reg_data: the mca_btl_openib_device_t to register with
 * base:     start of the region
 * size:     length of the region in bytes
 * reg:      registration object; it is treated as a
 *           mca_btl_openib_reg_t and receives the ibv_mr handle and
 *           the lkey/rkey pair
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE when the device's
 * registration limit (mem_reg_max, 0 meaning "no limit") would be
 * exceeded or when ibv_reg_mr() fails.
 */
static int openib_reg_mr (void *reg_data, void *base, size_t size,
                          mca_rcache_base_registration_t *reg)
{
    mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data;
    mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
    enum ibv_access_flags access_flag = 0;

    /* Translate the rcache access flags into verbs access flags.  The
       remote write and remote atomic cases also set local write. */
    if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) {
        access_flag |= IBV_ACCESS_REMOTE_READ;
    }

    if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_WRITE) {
        access_flag |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
    }

    if (reg->access_flags & MCA_RCACHE_ACCESS_LOCAL_WRITE) {
        access_flag |= IBV_ACCESS_LOCAL_WRITE;
    }

#if HAVE_DECL_IBV_ATOMIC_HCA
    if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_ATOMIC) {
        access_flag |= IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_LOCAL_WRITE;
    }
#endif

#if HAVE_DECL_IBV_ACCESS_SO
    if (reg->flags & MCA_RCACHE_FLAGS_SO_MEM) {
        access_flag |= IBV_ACCESS_SO;
    }
#endif

    /* Refuse up front if this registration would push the device past
       its configured limit */
    if (device->mem_reg_max &&
        device->mem_reg_max < (device->mem_reg_active + size)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    openib_reg->mr = ibv_reg_mr(device->ib_pd, base, size, access_flag);

    if (NULL == openib_reg->mr) {
        OPAL_OUTPUT_VERBOSE((5, mca_btl_openib_component.memory_registration_verbose,
                             "ibv_reg_mr() failed: base=%p, bound=%p, size=%d, flags=0x%x, errno=%d",
                              reg->base, reg->bound, (int) (reg->bound - reg->base + 1), reg->flags, errno));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Account for the region only once it is really registered.
       Charging it before the ibv_reg_mr() call and returning on
       failure without undoing that would leave mem_reg_active
       permanently inflated by every failed attempt.
       NOTE(review): assumes the deregistration hook is not invoked for
       registrations that failed here -- confirm against the rcache. */
    device->mem_reg_active += size;

    openib_reg->btl_handle.lkey = openib_reg->mr->lkey;
    openib_reg->btl_handle.rkey = openib_reg->mr->rkey;

    OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
                         "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
                         (int) (reg->bound - reg->base + 1), reg->flags));

#if OPAL_CUDA_SUPPORT
    if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) {
        mca_common_cuda_register (base, size,
            openib_reg->base.rcache->rcache_component->rcache_version.mca_component_name);
    }
#endif

    return OPAL_SUCCESS;
}
573 
/*
 * Undo a registration made by openib_reg_mr().
 *
 * reg_data: the mca_btl_openib_device_t the region was registered with
 * reg:      registration object, treated as a mca_btl_openib_reg_t
 *
 * Returns OPAL_ERROR if ibv_dereg_mr() fails (nothing else is changed
 * in that case), otherwise OPAL_SUCCESS.  On success the region's
 * length (bound - base + 1) is subtracted from the device's active
 * total and the stored mr handle is cleared.
 */
static int openib_dereg_mr(void *reg_data, mca_rcache_base_registration_t *reg)
{
    mca_btl_openib_reg_t *ib_reg = (mca_btl_openib_reg_t *) reg;
    mca_btl_openib_device_t *dev = (mca_btl_openib_device_t *) reg_data;

    OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
                         "openib_dereg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
                         (int) (reg->bound - reg->base + 1), reg->flags));

    if (NULL != ib_reg->mr) {
        /* ibv_dereg_mr() reports failure with a non-zero result */
        if (0 != ibv_dereg_mr(ib_reg->mr)) {
            BTL_ERROR(("%s: error unpinning openib memory errno says %s",
                       __func__, strerror(errno)));
            return OPAL_ERROR;
        }

#if OPAL_CUDA_SUPPORT
        if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) {
            mca_common_cuda_unregister(ib_reg->base.base,
                ib_reg->base.rcache->rcache_component->rcache_version.mca_component_name);
        }
#endif
    }

    /* The accounting is adjusted whether or not an mr handle was
       present */
    dev->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);
    ib_reg->mr = NULL;

    return OPAL_SUCCESS;
}
604 
param_register_uint(const char * param_name,unsigned int default_value,unsigned int * storage)605 static inline int param_register_uint(const char* param_name, unsigned int default_value, unsigned int *storage)
606 {
607     *storage = default_value;
608     (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
609                                            param_name, NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
610                                            NULL, 0, 0, OPAL_INFO_LVL_9,
611                                            MCA_BASE_VAR_SCOPE_READONLY, storage);
612     return *storage;
613 }
614 
init_one_port(opal_list_t * btl_list,mca_btl_openib_device_t * device,uint8_t port_num,uint16_t pkey_index,struct ibv_port_attr * ib_port_attr)615 static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
616                          uint8_t port_num, uint16_t pkey_index,
617                          struct ibv_port_attr *ib_port_attr)
618 {
619     uint16_t lid, i, lmc, lmc_step;
620     mca_btl_openib_module_t *openib_btl;
621     mca_btl_base_selected_module_t *ib_selected;
622     union ibv_gid gid;
623     uint64_t subnet_id;
624 
625     /* Ensure that the requested GID index (via the
626        btl_openib_gid_index MCA param) is within the GID table
627        size. */
628     if (mca_btl_openib_component.gid_index >
629         ib_port_attr->gid_tbl_len) {
630         opal_show_help("help-mpi-btl-openib.txt", "gid index too large",
631                        true, opal_process_info.nodename,
632                        ibv_get_device_name(device->ib_dev), port_num,
633                        mca_btl_openib_component.gid_index,
634                        ib_port_attr->gid_tbl_len);
635         return OPAL_ERR_NOT_FOUND;
636     }
637     BTL_VERBOSE(("looking for %s:%d GID index %d",
638                  ibv_get_device_name(device->ib_dev), port_num,
639                  mca_btl_openib_component.gid_index));
640 
641     /* If we have struct ibv_device.transport_type, then we're >= OFED
642        v1.2, and the transport could be iWarp or IB.  If we don't have
643        that member, then we're < OFED v1.2, and it can only be IB. */
644 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
645     if (IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) {
646         subnet_id = mca_btl_openib_get_ip_subnet_id(device->ib_dev, port_num);
647         BTL_VERBOSE(("my iWARP subnet_id is %016" PRIx64, subnet_id));
648     } else {
649         memset(&gid, 0, sizeof(gid));
650         if (0 != ibv_query_gid(device->ib_dev_context, port_num,
651                                mca_btl_openib_component.gid_index, &gid)) {
652             BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n",
653                        ibv_get_device_name(device->ib_dev), port_num,
654                        mca_btl_openib_component.gid_index));
655             return OPAL_ERR_NOT_FOUND;
656         }
657 
658 #if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
659         if (IBV_LINK_LAYER_ETHERNET == ib_port_attr->link_layer) {
660             subnet_id = mca_btl_openib_component.rroce_enable ? 0 :
661                    mca_btl_openib_get_ip_subnet_id(device->ib_dev, port_num);
662         } else {
663             subnet_id = ntoh64(gid.global.subnet_prefix);
664         }
665 #else
666         subnet_id = ntoh64(gid.global.subnet_prefix);
667 #endif
668 
669         BTL_VERBOSE(("my IB subnet_id for HCA %s port %d is %016" PRIx64,
670                      ibv_get_device_name(device->ib_dev), port_num, subnet_id));
671     }
672 #else
673     if (0 != ibv_query_gid(device->ib_dev_context, port_num,
674                            mca_btl_openib_component.gid_index, &gid)) {
675         BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n",
676                    ibv_get_device_name(device->ib_dev), port_num,
677                    mca_btl_openib_component.gid_index));
678         return OPAL_ERR_NOT_FOUND;
679     }
680     subnet_id = ntoh64(gid.global.subnet_prefix);
681     BTL_VERBOSE(("my IB-only subnet_id for HCA %s port %d is %016" PRIx64,
682                  ibv_get_device_name(device->ib_dev), port_num, subnet_id));
683 #endif
684 
685     if(mca_btl_openib_component.num_default_gid_btls > 0 &&
686             IB_DEFAULT_GID_PREFIX == subnet_id &&
687             mca_btl_openib_component.warn_default_gid_prefix) {
688         opal_show_help("help-mpi-btl-openib.txt", "default subnet prefix",
689                 true, opal_process_info.nodename);
690     }
691 
692     if (IB_DEFAULT_GID_PREFIX == subnet_id) {
693         mca_btl_openib_component.num_default_gid_btls++;
694     }
695 
696     lmc = (1 << ib_port_attr->lmc);
697     lmc_step = 1;
698 
699     if (0 != mca_btl_openib_component.max_lmc &&
700         mca_btl_openib_component.max_lmc < lmc) {
701         lmc = mca_btl_openib_component.max_lmc;
702     }
703 
704     /* APM support -- only meaningful if async event support is
705        enabled.  If async events are not enabled, then there's nothing
706        to listen for the APM event to load the new path, so it's not
707        worth enabling APM.  */
708     if (lmc > 1){
709         if (-1 == mca_btl_openib_component.apm_lmc) {
710             lmc_step = lmc;
711             mca_btl_openib_component.apm_lmc = lmc - 1;
712         } else if (0 == lmc % (mca_btl_openib_component.apm_lmc + 1)) {
713             lmc_step = mca_btl_openib_component.apm_lmc + 1;
714         } else {
715             opal_show_help("help-mpi-btl-openib.txt", "apm with wrong lmc",true,
716                     mca_btl_openib_component.apm_lmc, lmc);
717             return OPAL_ERROR;
718         }
719     } else {
720         if (mca_btl_openib_component.apm_lmc) {
721             /* Disable apm and report warning */
722             mca_btl_openib_component.apm_lmc = 0;
723             opal_show_help("help-mpi-btl-openib.txt", "apm without lmc",true);
724         }
725     }
726 
727     for(lid = ib_port_attr->lid;
728             lid < ib_port_attr->lid + lmc; lid += lmc_step){
729         for(i = 0; i < mca_btl_openib_component.btls_per_lid; i++){
730             char param[40];
731 
732             openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t));
733             if(NULL == openib_btl) {
734                 BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
735                 return OPAL_ERR_OUT_OF_RESOURCE;
736             }
737             memcpy(openib_btl, &mca_btl_openib_module,
738                     sizeof(mca_btl_openib_module));
739             memcpy(&openib_btl->ib_port_attr, ib_port_attr,
740                     sizeof(struct ibv_port_attr));
741             ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
742             ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
743             openib_btl->device = device;
744             openib_btl->port_num = (uint8_t) port_num;
745             openib_btl->pkey_index = pkey_index;
746             openib_btl->lid = lid;
747             openib_btl->apm_port = 0;
748             openib_btl->src_path_bits = lid - ib_port_attr->lid;
749 
750             openib_btl->port_info.subnet_id = subnet_id;
751             openib_btl->port_info.mtu = device->mtu;
752             openib_btl->port_info.lid = lid;
753 
754             openib_btl->cpcs = NULL;
755             openib_btl->num_cpcs = 0;
756             openib_btl->local_procs = 0;
757 
758             mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
759             mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
760 
761             if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) {
762                 openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz;
763             }
764 
765             openib_btl->super.btl_get_alignment = 0;
766 
767             if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) {
768                 openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz;
769             }
770 
771             openib_btl->super.btl_put_local_registration_threshold = openib_btl->device->max_inline_data;
772             openib_btl->super.btl_get_local_registration_threshold = 0;
773 
774 #if HAVE_DECL_IBV_ATOMIC_HCA
775             openib_btl->atomic_ops_be = false;
776 
777 #ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXT_ATOM
778             /* check that 8-byte atomics are supported */
779             if (!(device->ib_exp_dev_attr.ext_atom.log_atomic_arg_sizes & (1<<3ull))) {
780                 openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
781                 openib_btl->super.btl_atomic_flags = 0;
782                 openib_btl->super.btl_atomic_fop = NULL;
783                 openib_btl->super.btl_atomic_cswap = NULL;
784             }
785 #endif
786 
787 #ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXP_ATOMIC_CAP
788             switch (openib_btl->device->ib_exp_dev_attr.exp_atomic_cap)
789 #else
790             switch (openib_btl->device->ib_dev_attr.atomic_cap)
791 #endif
792             {
793             case IBV_ATOMIC_GLOB:
794                 openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
795                 break;
796 #if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
797             case IBV_EXP_ATOMIC_HCA_REPLY_BE:
798                 openib_btl->atomic_ops_be = true;
799                 break;
800 #endif
801             case IBV_ATOMIC_HCA:
802                 break;
803             case IBV_ATOMIC_NONE:
804             default:
805                 /* no atomics or an unsupported atomic type */
806                 openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
807                 openib_btl->super.btl_atomic_flags = 0;
808                 openib_btl->super.btl_atomic_fop = NULL;
809                 openib_btl->super.btl_atomic_cswap = NULL;
810             }
811 #endif
812 
813             openib_btl->super.btl_put_alignment = 0;
814 
815             openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);
816 
817             /* Check bandwidth configured for this device */
818             sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev));
819            param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth);
820 
821             /* Check bandwidth configured for this device/port */
822             sprintf(param, "bandwidth_%s:%d", ibv_get_device_name(device->ib_dev),
823                     port_num);
824            param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth);
825 
826             /* Check bandwidth configured for this device/port/LID */
827             sprintf(param, "bandwidth_%s:%d:%d",
828                     ibv_get_device_name(device->ib_dev), port_num, lid);
829            param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth);
830 
831             /* Check latency configured for this device */
832             sprintf(param, "latency_%s", ibv_get_device_name(device->ib_dev));
833            param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency);
834 
835             /* Check latency configured for this device/port */
836             sprintf(param, "latency_%s:%d", ibv_get_device_name(device->ib_dev),
837                     port_num);
838            param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency);
839 
840             /* Check latency configured for this device/port/LID */
841             sprintf(param, "latency_%s:%d:%d", ibv_get_device_name(device->ib_dev),
842                     port_num, lid);
843            param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency);
844 
845             /* Auto-detect the port bandwidth */
846             if (0 == openib_btl->super.btl_bandwidth) {
847                 if (OPAL_SUCCESS !=
848                     opal_common_verbs_port_bw(ib_port_attr,
849                                               &openib_btl->super.btl_bandwidth)) {
                    /* If we can't figure out the bandwidth, declare
                       this port unreachable (do *not* return
                       OPAL_ERR_VALUE_OUT_OF_BOUNDS; that is reserved
                       for when we exceed the number of allowable
                       BTLs). */
855                     return OPAL_ERR_UNREACH;
856                 }
857             }
858 
859             opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
860             opal_pointer_array_add(device->device_btls, (void*) openib_btl);
861             ++device->btls;
862             ++mca_btl_openib_component.ib_num_btls;
863             if (-1 != mca_btl_openib_component.ib_max_btls &&
864                 mca_btl_openib_component.ib_num_btls >=
865                 mca_btl_openib_component.ib_max_btls) {
866                 return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
867             }
868         }
869     }
870 
871     return OPAL_SUCCESS;
872 }
873 
/*
 * Constructor for mca_btl_openib_device_t objects.
 *
 * Resets the fields listed below to an "empty" state and constructs the
 * embedded lock and control-fragment free list.  No verbs resources are
 * acquired here.
 */
static void device_construct(mca_btl_openib_device_t *device)
{
    /* verbs handles */
    device->ib_dev = NULL;
    device->ib_dev_context = NULL;
    device->ib_pd = NULL;
#if OPAL_ENABLE_PROGRESS_THREADS == 1
    device->ib_channel = NULL;
#endif
#if HAVE_XRC
    device->xrc_fd = -1;
#endif

    /* memory pool / registration cache */
    device->mpool = NULL;
    device->rcache = NULL;

    /* modules and endpoints attached to this device */
    device->btls = 0;
    device->device_btls = NULL;
    device->endpoints = NULL;
    device->non_eager_rdma_endpoints = 0;

    /* completion queues (high and low priority) */
    device->ib_cq[BTL_OPENIB_HP_CQ] = NULL;
    device->cq_size[BTL_OPENIB_HP_CQ] = 0;
    device->ib_cq[BTL_OPENIB_LP_CQ] = NULL;
    device->cq_size[BTL_OPENIB_LP_CQ] = 0;

    /* polling state, seeded from the component-level ratios */
    device->pollme = true;
    device->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio;
    device->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio;

    /* eager RDMA bookkeeping */
    device->eager_rdma_buffers = NULL;
    device->eager_rdma_buffers_count = 0;

    /* per-QP data and miscellaneous flags */
    device->qps = NULL;
    device->max_inline_data = 0;
    device->ready_for_use = false;

    /* embedded OPAL objects */
    OBJ_CONSTRUCT(&device->device_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&device->send_free_control, opal_free_list_t);
}
906 
/*
 * Destructor for mca_btl_openib_device_t objects.
 *
 * Releases the device's resources in a fixed order.  The first step that
 * reports a failure logs a message and jumps to device_error, so every
 * resource that would have been released after that step is left untouched.
 */
static void device_destruct(mca_btl_openib_device_t *device)
{
    int idx;

#if OPAL_ENABLE_PROGRESS_THREADS == 1
    /* Stop the progress thread (if one is running) before closing the
       completion channel. */
    if (device->progress) {
        device->progress = false;
        if (0 != pthread_cancel(device->thread.t_handle)) {
            BTL_ERROR(("Failed to cancel OpenIB progress thread"));
            goto device_error;
        }
        opal_thread_join(&device->thread, NULL);
    }

    if (0 != ibv_destroy_comp_channel(device->ib_channel)) {
        BTL_VERBOSE(("Failed to close comp_channel"));
        goto device_error;
    }
#endif

    /* Tell the async thread to stop polling this device */
    mca_btl_openib_async_rem_device (device);

    /* Drop our reference on every eager RDMA buffer, then the array itself */
    if (device->eager_rdma_buffers) {
        for (idx = 0; idx < device->eager_rdma_buffers_count; ++idx) {
            if (device->eager_rdma_buffers[idx]) {
                OBJ_RELEASE(device->eager_rdma_buffers[idx]);
            }
        }
        free(device->eager_rdma_buffers);
    }

    /* Per-QP free lists */
    if (NULL != device->qps) {
        idx = 0;
        while (idx < mca_btl_openib_component.num_qps) {
            OBJ_DESTRUCT(&device->qps[idx].send_free);
            OBJ_DESTRUCT(&device->qps[idx].recv_free);
            ++idx;
        }
        free(device->qps);
    }

    OBJ_DESTRUCT(&device->send_free_control);

    /* Completion queues: high priority first, then low priority */
    if (NULL != device->ib_cq[BTL_OPENIB_HP_CQ] &&
        ibv_destroy_cq(device->ib_cq[BTL_OPENIB_HP_CQ])) {
        BTL_VERBOSE(("Failed to close HP CQ"));
        goto device_error;
    }

    if (NULL != device->ib_cq[BTL_OPENIB_LP_CQ] &&
        ibv_destroy_cq(device->ib_cq[BTL_OPENIB_LP_CQ])) {
        BTL_VERBOSE(("Failed to close LP CQ"));
        goto device_error;
    }

    /* Registration cache */
    if (OPAL_SUCCESS != mca_rcache_base_module_destroy (device->rcache)) {
        BTL_VERBOSE(("failed to release registration cache"));
        goto device_error;
    }

#if HAVE_XRC

    if (MCA_BTL_XRC_ENABLED &&
        OPAL_SUCCESS != mca_btl_openib_close_xrc_domain(device)) {
        BTL_VERBOSE(("XRC Internal error. Failed to close xrc domain"));
        goto device_error;
    }
#endif

    /* Protection domain */
    if (ibv_dealloc_pd(device->ib_pd)) {
        BTL_VERBOSE(("Warning! Failed to release PD"));
        goto device_error;
    }

    OBJ_DESTRUCT(&device->device_lock);

    /* Finally the device context.  A failure here is only reported at
       verbose level when leave-pinned is in effect, and as an error
       otherwise; either way the destructor gives up. */
    if (ibv_close_device(device->ib_dev_context)) {
        if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
            BTL_VERBOSE(("Warning! Failed to close device"));
        } else {
            BTL_ERROR(("Error! Failed to close device"));
        }
        goto device_error;
    }

    BTL_VERBOSE(("device was successfully released"));
    return;

device_error:
    BTL_VERBOSE(("Failed to destroy device resources"));
}
999 
/* Class instance for mca_btl_openib_device_t: opal_object_t is given as the
 * parent class, and device_construct / device_destruct (defined above) are
 * passed as the constructor and destructor arguments. */
OBJ_CLASS_INSTANCE(mca_btl_openib_device_t, opal_object_t, device_construct,
        device_destruct);
1002 
1003 static int
get_port_list(mca_btl_openib_device_t * device,int * allowed_ports)1004 get_port_list(mca_btl_openib_device_t *device, int *allowed_ports)
1005 {
1006     int i, j, k, num_ports = 0;
1007     const char *dev_name;
1008     char *name;
1009 
1010     dev_name = ibv_get_device_name(device->ib_dev);
1011     name = (char*) malloc(strlen(dev_name) + 4);
1012     if (NULL == name) {
1013         return 0;
1014     }
1015 
1016     /* Assume that all ports are allowed.  num_ports will be adjusted
1017        below to reflect whether this is true or not. */
1018     for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1019         allowed_ports[num_ports++] = i;
1020     }
1021     num_ports = 0;
1022     if (NULL != mca_btl_openib_component.if_include_list) {
1023         /* If only the device name is given (eg. mtdevice0,mtdevice1) use all
1024            ports */
1025         i = 0;
1026         while (mca_btl_openib_component.if_include_list[i]) {
1027             if (0 == strcmp(dev_name,
1028                             mca_btl_openib_component.if_include_list[i])) {
1029                 num_ports = device->ib_dev_attr.phys_port_cnt;
1030                 goto done;
1031             }
1032             ++i;
1033         }
1034         /* Include only requested ports on the device */
1035         for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1036             sprintf(name,"%s:%d",dev_name,i);
1037             for (j = 0;
1038                  NULL != mca_btl_openib_component.if_include_list[j]; ++j) {
1039                 if (0 == strcmp(name,
1040                                 mca_btl_openib_component.if_include_list[j])) {
1041                     allowed_ports[num_ports++] = i;
1042                     break;
1043                 }
1044             }
1045         }
1046     } else if (NULL != mca_btl_openib_component.if_exclude_list) {
1047         /* If only the device name is given (eg. mtdevice0,mtdevice1) exclude
1048            all ports */
1049         i = 0;
1050         while (mca_btl_openib_component.if_exclude_list[i]) {
1051             if (0 == strcmp(dev_name,
1052                             mca_btl_openib_component.if_exclude_list[i])) {
1053                 num_ports = 0;
1054                 goto done;
1055             }
1056             ++i;
1057         }
1058         /* Exclude the specified ports on this device */
1059         for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1060             sprintf(name,"%s:%d",dev_name,i);
1061             for (j = 0;
1062                  NULL != mca_btl_openib_component.if_exclude_list[j]; ++j) {
1063                 if (0 == strcmp(name,
1064                                 mca_btl_openib_component.if_exclude_list[j])) {
1065                     /* If found, set a sentinel value */
1066                     j = -1;
1067                     break;
1068                 }
1069             }
1070             /* If we didn't find it, it's ok to include in the list */
1071             if (-1 != j) {
1072                 allowed_ports[num_ports++] = i;
1073             }
1074         }
1075     } else {
1076         num_ports = device->ib_dev_attr.phys_port_cnt;
1077     }
1078 
1079 done:
1080 
1081     /* Remove the following from the error-checking if_list:
1082        - bare device name
1083        - device name suffixed with port number */
1084     if (NULL != mca_btl_openib_component.if_list) {
1085         for (i = 0; NULL != mca_btl_openib_component.if_list[i]; ++i) {
1086 
1087             /* Look for raw device name */
1088             if (0 == strcmp(mca_btl_openib_component.if_list[i], dev_name)) {
1089                 j = opal_argv_count(mca_btl_openib_component.if_list);
1090                 opal_argv_delete(&j, &(mca_btl_openib_component.if_list),
1091                                  i, 1);
1092                 --i;
1093             }
1094         }
1095         for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1096             sprintf(name, "%s:%d", dev_name, i);
1097             for (j = 0; NULL != mca_btl_openib_component.if_list[j]; ++j) {
1098                 if (0 == strcmp(mca_btl_openib_component.if_list[j], name)) {
1099                     k = opal_argv_count(mca_btl_openib_component.if_list);
1100                     opal_argv_delete(&k, &(mca_btl_openib_component.if_list),
1101                                      j, 1);
1102                     --j;
1103                     break;
1104                 }
1105             }
1106         }
1107     }
1108 
1109     free(name);
1110 
1111     return num_ports;
1112 }
1113 
1114 /*
1115  * Prefer values that are already in the target
1116  */
merge_values(opal_btl_openib_ini_values_t * target,opal_btl_openib_ini_values_t * src)1117 static void merge_values(opal_btl_openib_ini_values_t *target,
1118                          opal_btl_openib_ini_values_t *src)
1119 {
1120     if (!target->mtu_set && src->mtu_set) {
1121         target->mtu = src->mtu;
1122         target->mtu_set = true;
1123     }
1124 
1125     if (!target->use_eager_rdma_set && src->use_eager_rdma_set) {
1126         target->use_eager_rdma = src->use_eager_rdma;
1127         target->use_eager_rdma_set = true;
1128     }
1129 
1130     if (NULL == target->receive_queues && NULL != src->receive_queues) {
1131         target->receive_queues = strdup(src->receive_queues);
1132     }
1133 
1134     if (!target->max_inline_data_set && src->max_inline_data_set) {
1135         target->max_inline_data = src->max_inline_data;
1136         target->max_inline_data_set = true;
1137     }
1138 }
1139 
is_credit_message(const mca_btl_openib_recv_frag_t * frag)1140 static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
1141 {
1142     mca_btl_openib_control_header_t* chdr =
1143         (mca_btl_openib_control_header_t *) to_base_frag(frag)->segment.seg_addr.pval;
1144     return (MCA_BTL_TAG_IB == frag->hdr->tag) &&
1145         (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type);
1146 }
1147 
is_cts_message(const mca_btl_openib_recv_frag_t * frag)1148 static bool inline is_cts_message(const mca_btl_openib_recv_frag_t *frag)
1149 {
1150     mca_btl_openib_control_header_t* chdr =
1151         (mca_btl_openib_control_header_t *) to_base_frag(frag)->segment.seg_addr.pval;
1152     return (MCA_BTL_TAG_IB == frag->hdr->tag) &&
1153         (MCA_BTL_OPENIB_CONTROL_CTS == chdr->type);
1154 }
1155 
/*
 * Convert one textual queue parameter to an int32_t.
 *
 * param  text to convert; may be NULL or empty
 * dflt   value to use when param is NULL or empty
 *
 * Returns:
 *  - for a NULL/empty param: dflt, or 1 when dflt is 0 (a missing parameter
 *    never yields 0);
 *  - otherwise the leading decimal integer in param (0 when there is none),
 *    saturated to [INT32_MIN, INT32_MAX].
 *
 * strtol() is used instead of atoi() because atoi() has undefined behaviour
 * when the value does not fit in an int; strtol() clamps to the range of
 * long, and the result is then clamped to int32_t here.
 */
static int32_t atoi_param(char *param, int32_t dflt)
{
    long parsed;

    if (NULL == param || '\0' == param[0]) {
        return dflt ? dflt : 1;
    }

    parsed = strtol(param, NULL, 10);
    if (parsed > INT32_MAX) {
        return INT32_MAX;
    }
    if (parsed < INT32_MIN) {
        return INT32_MIN;
    }

    return (int32_t) parsed;
}
1164 
/*
 * Record `port` as the alternate (APM) port on every BTL module already
 * attached to `device`.
 *
 * For each of the device->btls modules stored in device->device_btls this
 * sets apm_port to `port`, sets port_info.apm_lid to `lid` plus the
 * module's own src_path_bits, and bumps the component-wide apm_ports
 * counter.
 */
static void init_apm_port(mca_btl_openib_device_t *device, int port, uint16_t lid)
{
    int slot = 0;

    while (slot < device->btls) {
        /* These modules were created for the first port; the port passed
         * in here becomes their alternate path. */
        mca_btl_openib_module_t *module = (mca_btl_openib_module_t *)
            opal_pointer_array_get_item(device->device_btls, slot);

        module->apm_port = port;
        module->port_info.apm_lid = lid + module->src_path_bits;
        mca_btl_openib_component.apm_ports++;
        BTL_VERBOSE(("APM-PORT: Setting alternative port - %d, lid - %d"
                    ,port ,lid));
        ++slot;
    }
}
1180 
/*
 * Look up the MCA variable registered as opal/btl/openib/<var_name> and
 * fetch its source into *source.
 *
 * Returns the (negative) result of mca_base_var_find() when the lookup
 * fails, otherwise whatever mca_base_var_get_value() returns.
 */
static int get_var_source (const char *var_name, mca_base_var_source_t *source)
{
    const int var_index = mca_base_var_find ("opal", "btl", "openib", var_name);

    return (var_index < 0)
        ? var_index
        : mca_base_var_get_value (var_index, NULL, source, NULL);
}
1190 
/*
 * Parse mca_btl_openib_component.receive_queues and fill in the component's
 * queue-pair configuration.
 *
 * The string is split on ':' into one specification per queue pair; each
 * specification is split on ',' into a type letter ("P", "S" or "X")
 * followed by numeric parameters.
 *
 * On success this sets, on mca_btl_openib_component: num_pp_qps,
 * num_srq_qps, num_xrc_qps, num_qps, qp_infos (freshly malloc'ed, one entry
 * per queue), rdma_qp and credits_qp.  It may also lower
 * mca_btl_openib_module.super.btl_eager_limit / btl_max_send_size to the
 * size of the last queue pair when those values come from their defaults.
 *
 * Returns OPAL_SUCCESS, or an OPAL error code after reporting the problem
 * via opal_show_help().  The temporary argv arrays are freed on every path.
 *
 * NOTE(review): qp_infos is not released here when a later check fails --
 * confirm that it is freed elsewhere.
 */
static int setup_qps(void)
{
    char **queues, **params = NULL;
    int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
    uint32_t max_qp_size, max_size_needed;
    int32_t min_freelist_size = 0;
    int smallest_pp_qp = INT_MAX, ret = OPAL_ERROR;

    /* One entry per queue pair specification */
    queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':');
    if (0 == opal_argv_count(queues)) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "no qps in receive_queues", true,
                       opal_process_info.nodename,
                       mca_btl_openib_component.receive_queues);
        ret = OPAL_ERROR;
        goto error;
    }

    /* First pass: count the queue pairs of each type and remember the
       index of the first per-peer ("P") queue pair */
    while (queues[qp] != NULL) {
        if (0 == strncmp("P,", queues[qp], 2)) {
            num_pp_qps++;
            if (smallest_pp_qp > qp) {
                smallest_pp_qp = qp;
            }
        } else if (0 == strncmp("S,", queues[qp], 2)) {
            num_srq_qps++;
        } else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
            num_xrc_qps++;
#else
            opal_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
                           opal_process_info.nodename,
                           mca_btl_openib_component.receive_queues);
            ret = OPAL_ERR_NOT_AVAILABLE;
            goto error;
#endif
        } else {
            opal_show_help("help-mpi-btl-openib.txt",
                           "invalid qp type in receive_queues", true,
                           opal_process_info.nodename,
                           mca_btl_openib_component.receive_queues,
                           queues[qp]);
            ret = OPAL_ERR_BAD_PARAM;
            goto error;
        }
        qp++;
    }

#if HAVE_XRC
    /* Current XRC implementation can't be used with other QP types - PP
       and SRQ */
    if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
        opal_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
                       opal_process_info.nodename,
                       mca_btl_openib_component.receive_queues);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    }

    /* Current XRC implementation can't be used with btls_per_lid > 1 */
    if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
        opal_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID",
                       true, opal_process_info.nodename,
                       mca_btl_openib_component.receive_queues, num_xrc_qps);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    }
#endif

    /* Publish the counts and allocate one info record per queue pair */
    mca_btl_openib_component.num_pp_qps = num_pp_qps;
    mca_btl_openib_component.num_srq_qps = num_srq_qps;
    mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
    mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;

    mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
        malloc(sizeof(mca_btl_openib_qp_info_t) *
                mca_btl_openib_component.num_qps);
    if (NULL == mca_btl_openib_component.qp_infos) {
        ret = OPAL_ERR_OUT_OF_RESOURCE;
        goto error;
    }

    /* Second pass: parse the parameters of every queue pair.
       P(N) is the N-th comma-separated field of the current specification,
       or NULL when N is greater than the field count.  For N == count it
       reads params[count], presumably the argv NULL terminator -- TODO
       confirm.  A NULL/empty field makes atoi_param() fall back to the
       supplied default (or to 1 if that default is 0). */
    qp = 0;
#define P(N) (((N) > count) ? NULL : params[(N)])
    while (queues[qp] != NULL) {
        int count;
        int32_t rd_low, rd_num;
        params = opal_argv_split_with_empty(queues[qp], ',');
        count = opal_argv_count(params);

        if ('P' == params[0][0]) {
            /* Per-peer QP: P,<size>,<rd_num>,<rd_low>,<rd_win>,<rd_rsv> */
            int32_t rd_win, rd_rsv;
            if (count < 3 || count > 6) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid pp qp specification", true,
                               opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }
            mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
            mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
            rd_num = atoi_param(P(2), 256);
            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);

            /* rd_win is used as a divisor just below, so it must be
               strictly positive */
            if (0 >= rd_win) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid pp qp specification", true,
                               opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }

            rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);

            BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d",
                         rd_num, rd_low, rd_win, rd_rsv));

            /* Calculate the smallest freelist size that can be allowed */
            if (rd_num + rd_rsv > min_freelist_size) {
                min_freelist_size = rd_num + rd_rsv;
            }

            mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
            mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
            /* Only a warning: the configuration is accepted as is */
            if ((rd_num - rd_low) > rd_win) {
                opal_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
                        true, rd_win, rd_num - rd_low);
            }
        } else {
            /* Shared receive queue ("S") or XRC ("X") QP:
               <type>,<size>,<rd_num>,<rd_low>,<sd_max>,<rd_init>,<srq_limit> */
            int32_t sd_max, rd_init, srq_limit;
            if (count < 3 || count > 7) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid srq specification", true,
                               opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }
            mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ?
                MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP;
            mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
            rd_num = atoi_param(P(2), 256);
            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            sd_max = atoi_param(P(4), rd_low / 4);
            /* rd_init is initial value for rd_curr_num of all SRQs, 1/4 of rd_num by default */
            rd_init = atoi_param(P(5), rd_num / 4);
            /* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of rd_low_local,
               the value of rd_low_local we calculate in create_srq function) */
            srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);

            /* If we set srq_limit less or greater than rd_init
               (init value for rd_curr_num) => we receive the IBV_EVENT_SRQ_LIMIT_REACHED
               event immediately and the value of rd_curr_num will be increased */

            /* If we set srq_limit to zero, but size of SRQ greater than 1 => set it to be 1 */
            if((0 == srq_limit) && (1 < rd_num)) {
                srq_limit = 1;
            }

            BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_max is %d srq_limit is %d",
                         rd_num, rd_low, sd_max, rd_init, srq_limit));

            /* Calculate the smallest freelist size that can be allowed */
            if (rd_num > min_freelist_size) {
                min_freelist_size = rd_num;
            }

            if (rd_num < rd_init) {
                opal_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init",
                        true, opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }

            /* Rejects srq_limit values larger than rd_num */
            if (rd_num < srq_limit) {
                opal_show_help("help-mpi-btl-openib.txt", "srq_limit must be > rd_num",
                        true, opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }

            mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
            mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
            mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit;
        }

        /* Common to all QP types */
        if (rd_num <= rd_low) {
            opal_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
                    true, opal_process_info.nodename, queues[qp]);
            ret = OPAL_ERR_BAD_PARAM;
            goto error;
        }
        mca_btl_openib_component.qp_infos[qp].rd_num = rd_num;
        mca_btl_openib_component.qp_infos[qp].rd_low = rd_low;
        opal_argv_free(params);
        qp++;
    }
    /* Already freed inside the loop; keep the error path from freeing it
       a second time */
    params = NULL;

    /* Sanity check some sizes */

    /* The last queue pair in the list is treated as the largest one */
    max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
    max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
                       mca_btl_openib_module.super.btl_max_send_size) ?
        mca_btl_openib_module.super.btl_eager_limit :
        mca_btl_openib_module.super.btl_max_send_size;

    if (max_qp_size < max_size_needed) {
        mca_base_var_source_t eager_source = MCA_BASE_VAR_SOURCE_DEFAULT;
        mca_base_var_source_t max_send_source = MCA_BASE_VAR_SOURCE_DEFAULT;

        /* Lookup failures are ignored: the sources then stay at DEFAULT */
        (void) get_var_source ("max_send_size", &max_send_source);
        (void) get_var_source ("eager_limit", &eager_source);

        /* the largest queue pair is too small for either the max send size or eager
         * limit. check where we got the max_send_size and eager_limit and adjust if
         * the user did not specify one or the other. */
        if (mca_btl_openib_module.super.btl_eager_limit > max_qp_size &&
            MCA_BASE_VAR_SOURCE_DEFAULT == eager_source) {
            mca_btl_openib_module.super.btl_eager_limit = max_qp_size;
        }

        if (mca_btl_openib_module.super.btl_max_send_size > max_qp_size &&
            MCA_BASE_VAR_SOURCE_DEFAULT == max_send_source) {
            mca_btl_openib_module.super.btl_max_send_size = max_qp_size;
        }

        /* Recompute with the (possibly) lowered limits */
        max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
                       mca_btl_openib_module.super.btl_max_send_size) ?
        mca_btl_openib_module.super.btl_eager_limit :
        mca_btl_openib_module.super.btl_max_send_size;
    }

    /* Too small is fatal; too big is only reported */
    if (max_qp_size < max_size_needed) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "biggest qp size is too small", true,
                       opal_process_info.nodename, max_qp_size,
                       max_size_needed);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    } else if (max_qp_size > max_size_needed) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "biggest qp size is too big", true,
                       opal_process_info.nodename, max_qp_size,
                       max_size_needed);
    }

    /* A positive ib_free_list_max must be able to hold the largest
       per-queue requirement computed above */
    if (mca_btl_openib_component.ib_free_list_max > 0 &&
        min_freelist_size > mca_btl_openib_component.ib_free_list_max) {
        opal_show_help("help-mpi-btl-openib.txt", "freelist too small", true,
                       opal_process_info.nodename,
                       mca_btl_openib_component.ib_free_list_max,
                       min_freelist_size);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    }

    /* RDMA uses the last queue pair; credits use the first per-peer queue
       pair if there is one (smallest_pp_qp is still INT_MAX otherwise),
       else the last queue pair */
    mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
    if (mca_btl_openib_component.num_qps > smallest_pp_qp) {
        mca_btl_openib_component.credits_qp = smallest_pp_qp;
    } else {
        mca_btl_openib_component.credits_qp = mca_btl_openib_component.num_qps - 1;
    }

    ret = OPAL_SUCCESS;
error:
    /* Shared exit path for success and failure: release the argv arrays */
    if (NULL != params) {
        opal_argv_free(params);
    }

    if (NULL != queues) {
        opal_argv_free(queues);
    }

    return ret;
}
1469 
/**
 * Read a single unsigned decimal integer from a Linux module parameter file.
 *
 * @param file   path of the file to read
 * @param value  fallback returned when the file cannot be opened or read,
 *               is empty, does not begin with a number, or holds a number
 *               that strtoull() reports as out of range
 * @param max    upper bound; a larger parsed value is clamped to this
 *
 * @return the parsed (possibly clamped) number, or value on any failure
 */
static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
{
    char buffer[64];
    char *end = NULL;
    uint64_t ret;
    ssize_t rc;
    int fd;

    fd = open(file, O_RDONLY);
    if (0 > fd) {
        return value;
    }

    /* read() does not NUL-terminate; keep one byte free for the
       terminator that strtoull() needs */
    rc = read (fd, buffer, sizeof(buffer) - 1);

    close (fd);

    if (0 >= rc) {
        /* read error (-1) or empty file (0) */
        return value;
    }
    buffer[rc] = '\0';

    errno = 0;
    ret = strtoull(buffer, &end, 10);
    if (0 != errno || end == buffer) {
        /* out of range, or no digits were consumed at all */
        return value;
    }

    if (ret > max) {
        /* NTH: probably should report a bogus value */
        ret = max;
    }

    return ret;
}
1500 
1501 /* calculate memory registation limits */
calculate_total_mem(void)1502 static uint64_t calculate_total_mem (void)
1503 {
1504     hwloc_obj_t machine;
1505     int rc;
1506     uint64_t mem, *mptr;
1507     opal_process_name_t wildcard_rank;
1508 
1509     /* first try to retrieve it from PMIx as it may have
1510      * been provided */
1511     wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
1512     wildcard_rank.vpid = OPAL_VPID_WILDCARD;
1513     mptr = &mem;
1514     OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
1515                                    &wildcard_rank, &mptr, OPAL_UINT64);
1516     if (OPAL_SUCCESS == rc) {
1517         return mem;
1518     }
1519 
1520     /* if not available, then ensure that the topology has been
1521      * loaded and try to get it from there */
1522     if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
1523         machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
1524         if (NULL == machine) {
1525             return 0;
1526         }
1527         return machine->memory.total_memory;
1528     }
1529 
1530     /* couldn't find it */
1531     return 0;
1532 }
1533 
1534 
calculate_max_reg(const char * device_name)1535 static uint64_t calculate_max_reg (const char *device_name)
1536 {
1537     struct stat statinfo;
1538     uint64_t mtts_per_seg = 1;
1539     uint64_t num_mtt = 1 << 19;
1540     uint64_t reserved_mtt = 0;
1541     uint64_t max_reg, mem_total;
1542 
1543     mem_total = calculate_total_mem ();
1544 
1545     /* On older OFED(<2.0), may need to turn off this parameter*/
1546     if (mca_btl_openib_component.allow_max_memory_registration) {
1547         max_reg = 2 * mem_total;
1548         /* Limit us to 87.5% of the registered memory (some fluff for QPs,
1549         file systems, etc) */
1550         return (max_reg * 7) >> 3;
1551     }
1552 
1553     /* Default to being able to register everything (to ensure that
1554        max_reg is initialized in all cases) */
1555     max_reg = mem_total;
1556     if (!strncmp(device_name, "mlx5", 4)) {
1557         max_reg = 2 * mem_total;
1558 
1559     } else if (!strncmp(device_name, "mlx4", 4)) {
1560         if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) {
1561             mtts_per_seg = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1, 63);
1562             num_mtt = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1, 63);
1563             if (1 == num_mtt) {
1564                 /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
1565                 num_mtt = 1 << 19;
1566                 max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
1567             } else  {
1568                 max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
1569             }
1570         }
1571 
1572     } else if (!strncmp(device_name, "mthca", 5)) {
1573         if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) {
1574             mtts_per_seg = 1ull << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1, 63);
1575             num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20, (uint64_t) -1);
1576             reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0, (uint64_t) -1);
1577 
1578             max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
1579         } else {
1580             max_reg = mem_total;
1581         }
1582 
1583     } else {
1584         /* Need to update to determine the registration limit for this
1585            configuration */
1586         max_reg = mem_total;
1587     }
1588 
1589     /* Print a warning if we can't register more than 75% of physical
1590        memory.  Abort if the abort_not_enough_reg_mem MCA param was
1591        set. */
1592     if (max_reg < mem_total * 3 / 4) {
1593         char *action;
1594 
1595         if (mca_btl_openib_component.abort_not_enough_reg_mem) {
1596             action = "Your MPI job will now abort.";
1597         } else {
1598             action = "Your MPI job will continue, but may be behave poorly and/or hang.";
1599         }
1600         opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
1601                        opal_process_info.nodename, (unsigned long)(max_reg >> 20),
1602                        (unsigned long)(mem_total >> 20), action);
1603         return 0;  /* signal that we can't have enough memory */
1604     }
1605 
1606     /* Limit us to 87.5% of the registered memory (some fluff for QPs,
1607        file systems, etc) */
1608     return (max_reg * 7) >> 3;
1609 }
1610 
init_one_device(opal_list_t * btl_list,struct ibv_device * ib_dev)1611 static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
1612 {
1613     mca_rcache_base_resources_t rcache_resources;
1614     mca_btl_openib_device_t *device;
1615     uint8_t i, k = 0;
1616     int ret = -1, port_cnt;
1617     opal_btl_openib_ini_values_t values, default_values;
1618     int *allowed_ports = NULL;
1619     bool need_search;
1620     struct ibv_context *dev_context = NULL;
1621 
1622     /* Open up the device */
1623     dev_context = ibv_open_device(ib_dev);
1624     if (NULL == dev_context) {
1625         return OPAL_ERR_NOT_SUPPORTED;
1626     }
1627 
1628     /* Find out if this device supports RC QPs */
1629     if (OPAL_SUCCESS != opal_common_verbs_qp_test(dev_context,
1630                                                   OPAL_COMMON_VERBS_FLAGS_RC)) {
1631         ibv_close_device(dev_context);
1632         BTL_VERBOSE(("openib: RC QPs not supported -- skipping %s",
1633                      ibv_get_device_name(ib_dev)));
1634         ++num_devices_intentionally_ignored;
1635         return OPAL_ERR_NOT_SUPPORTED;
1636     }
1637 
1638     device = OBJ_NEW(mca_btl_openib_device_t);
1639     if(NULL == device){
1640         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
1641         ibv_close_device(dev_context);
1642         return OPAL_ERR_OUT_OF_RESOURCE;
1643     }
1644 
1645     device->mem_reg_active = 0;
1646     device->mem_reg_max_total = calculate_max_reg(ibv_get_device_name(ib_dev));
1647     device->mem_reg_max = device->mem_reg_max_total;
1648     if(( 0 == device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) {
1649         return OPAL_ERROR;
1650     }
1651 
1652     device->ib_dev = ib_dev;
1653     device->ib_dev_context = dev_context;
1654     device->ib_pd = NULL;
1655     device->device_btls = OBJ_NEW(opal_pointer_array_t);
1656     if (OPAL_SUCCESS != opal_pointer_array_init(device->device_btls, 2, INT_MAX, 2)) {
1657         BTL_ERROR(("Failed to initialize device_btls array: %s:%d", __FILE__, __LINE__));
1658         return OPAL_ERR_OUT_OF_RESOURCE;
1659     }
1660 
1661     if(NULL == device->ib_dev_context){
1662         BTL_ERROR(("error obtaining device context for %s errno says %s",
1663                     ibv_get_device_name(device->ib_dev), strerror(errno)));
1664         goto error;
1665     }
1666 #if HAVE_DECL_IBV_EXP_QUERY_DEVICE
1667     memset(&device->ib_exp_dev_attr, 0, sizeof(device->ib_exp_dev_attr));
1668     device->ib_exp_dev_attr.comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1;
1669     if(ibv_exp_query_device(device->ib_dev_context, &device->ib_exp_dev_attr)){
1670         BTL_ERROR(("error obtaining device attributes for %s errno says %s",
1671                     ibv_get_device_name(device->ib_dev), strerror(errno)));
1672         goto error;
1673     }
1674 #endif
1675     if(ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)){
1676         BTL_ERROR(("error obtaining device attributes for %s errno says %s",
1677                     ibv_get_device_name(device->ib_dev), strerror(errno)));
1678         goto error;
1679     }
1680     /* If mca_btl_if_include/exclude were specified, get usable ports */
1681     allowed_ports = (int*)malloc(device->ib_dev_attr.phys_port_cnt * sizeof(int));
1682     if (NULL == allowed_ports) {
1683         ret = OPAL_ERR_OUT_OF_RESOURCE;
1684         goto error;
1685     }
1686 
1687     port_cnt = get_port_list(device, allowed_ports);
1688     if (0 == port_cnt) {
1689         ret = OPAL_SUCCESS;
1690         ++num_devices_intentionally_ignored;
1691         goto error;
1692     }
1693 
1694     /* Load in vendor/part-specific device parameters.  Note that even if
1695        we don't find values for this vendor/part, "values" will be set
1696        indicating that it does not have good values */
1697     ret = opal_btl_openib_ini_query(device->ib_dev_attr.vendor_id,
1698                                     device->ib_dev_attr.vendor_part_id,
1699                                     &values);
1700     if (OPAL_SUCCESS != ret &&
1701         OPAL_ERR_NOT_FOUND != ret) {
1702         /* If we get a serious error, propagate it upwards */
1703         goto error;
1704     }
1705     if (OPAL_ERR_NOT_FOUND == ret) {
1706         /* If we didn't find a matching device in the INI files, output a
1707            warning that we're using default values (unless overridden
1708            that we don't want to see these warnings) */
1709         if (mca_btl_openib_component.warn_no_device_params_found) {
1710             opal_show_help("help-mpi-btl-openib.txt",
1711                            "no device params found", true,
1712                            opal_process_info.nodename,
1713                            ibv_get_device_name(device->ib_dev),
1714                            device->ib_dev_attr.vendor_id,
1715                            device->ib_dev_attr.vendor_part_id);
1716         }
1717     }
1718 
1719     /* If we're supposed to ignore devices of this vendor/part ID,
1720        then do so */
1721     if (values.ignore_device_set && values.ignore_device) {
1722         BTL_VERBOSE(("device %s skipped; ignore_device=1",
1723                      ibv_get_device_name(device->ib_dev)));
1724         ret = OPAL_SUCCESS;
1725         ++num_devices_intentionally_ignored;
1726         goto error;
1727     }
1728 
1729     /* Note that even if we don't find default values, "values" will
1730        be set indicating that it does not have good values */
1731     ret = opal_btl_openib_ini_query(0, 0, &default_values);
1732     if (OPAL_SUCCESS != ret &&
1733         OPAL_ERR_NOT_FOUND != ret) {
1734         /* If we get a serious error, propagate it upwards */
1735         goto error;
1736     }
1737 
1738     /* If we did find values for this device (or in the defaults
1739        section), handle them */
1740     merge_values(&values, &default_values);
1741     /*  If MCA param was set, use it. If not, check the INI file
1742         or default to IBV_MTU_1024 */
1743     if (0 < mca_btl_openib_component.ib_mtu) {
1744         device->mtu = mca_btl_openib_component.ib_mtu;
1745     } else if (values.mtu_set) {
1746         switch (values.mtu) {
1747         case 256:
1748             device->mtu = IBV_MTU_256;
1749             break;
1750         case 512:
1751             device->mtu = IBV_MTU_512;
1752             break;
1753         case 1024:
1754             device->mtu = IBV_MTU_1024;
1755             break;
1756         case 2048:
1757             device->mtu = IBV_MTU_2048;
1758             break;
1759         case 4096:
1760             device->mtu = IBV_MTU_4096;
1761             break;
1762         default:
1763             BTL_ERROR(("invalid MTU value specified in INI file (%d); ignored", values.mtu));
1764             device->mtu = IBV_MTU_1024 ;
1765             break;
1766         }
1767     } else {
1768         device->mtu = IBV_MTU_1024 ;
1769     }
1770 
1771     /* Allocate the protection domain for the device */
1772     device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
1773     if(NULL == device->ib_pd){
1774         BTL_ERROR(("error allocating protection domain for %s errno says %s",
1775                     ibv_get_device_name(device->ib_dev), strerror(errno)));
1776         goto error;
1777     }
1778 
1779     /* Figure out what the max_inline_data value should be for all
1780        ports and QPs on this device */
1781     need_search = false;
1782     if(-2 != mca_btl_openib_component.ib_max_inline_data) {
1783         /* User has explicitly set btl_openib_max_inline_data MCA parameter
1784            Per setup in _mca.c, we know that the MCA param value is guaranteed
1785            to be >= -1 */
1786         if (-1 == mca_btl_openib_component.ib_max_inline_data) {
1787             need_search = true;
1788         } else {
1789             device->max_inline_data = (uint32_t)
1790                 mca_btl_openib_component.ib_max_inline_data;
1791         }
1792     } else if (values.max_inline_data_set) {
1793         if (-1 == values.max_inline_data) {
1794             need_search = true;
1795         } else if (values.max_inline_data >= 0) {
1796             device->max_inline_data = (uint32_t) values.max_inline_data;
1797         } else {
1798             if(default_values.max_inline_data_set &&
1799                default_values.max_inline_data >= -1) {
1800                 BTL_ERROR(("Invalid max_inline_data value specified "
1801                            "in INI file (%d); using default value (%d)",
1802                             values.max_inline_data,
1803                             default_values.max_inline_data));
1804                 device->max_inline_data = (uint32_t)
1805                     default_values.max_inline_data;
1806             } else {
1807                 BTL_ERROR(("Invalid max_inline_data value specified "
1808                            "in INI file (%d)", values.max_inline_data));
1809                 ret = OPAL_ERR_BAD_PARAM;
1810                 goto error;
1811             }
1812         }
1813     }
1814 
1815     /* If we don't have a set max inline data size, search for it */
1816     if (need_search) {
1817         opal_common_verbs_find_max_inline(device->ib_dev,
1818                                           device->ib_dev_context,
1819                                           device->ib_pd,
1820                                           &device->max_inline_data);
1821     }
1822 
1823     /* Should we use RDMA for short / eager messages?  First check MCA
1824        param, then check INI file values. */
1825     if (mca_btl_openib_component.use_eager_rdma >= 0) {
1826         device->use_eager_rdma = mca_btl_openib_component.use_eager_rdma;
1827     } else if (values.use_eager_rdma_set) {
1828         device->use_eager_rdma = values.use_eager_rdma;
1829     }
1830     /* Eager RDMA is not currently supported with progress threads */
1831     if (device->use_eager_rdma && OPAL_ENABLE_PROGRESS_THREADS) {
1832         device->use_eager_rdma = 0;
1833         opal_show_help("help-mpi-btl-openib.txt",
1834                        "eager RDMA and progress threads", true);
1835     }
1836 
1837     asprintf (&rcache_resources.cache_name, "verbs.%" PRIu64, device->ib_dev_attr.node_guid);
1838     rcache_resources.reg_data = (void*)device;
1839     rcache_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t);
1840     rcache_resources.register_mem = openib_reg_mr;
1841     rcache_resources.deregister_mem = openib_dereg_mr;
1842     device->rcache =
1843         mca_rcache_base_module_create (mca_btl_openib_component.ib_rcache_name,
1844                                        device, &rcache_resources);
1845     if (NULL == device->rcache) {
1846         /* Don't print an error message here -- we'll get one from
1847            mpool_create anyway */
1848          goto error;
1849     }
1850 
1851     device->mpool = mca_mpool_base_module_lookup (mca_btl_openib_component.ib_mpool_hints);
1852     if (NULL == device->mpool) {
1853         goto error;
1854     }
1855 
1856 #if OPAL_ENABLE_PROGRESS_THREADS
1857     device->ib_channel = ibv_create_comp_channel(device->ib_dev_context);
1858     if (NULL == device->ib_channel) {
1859         BTL_ERROR(("error creating channel for %s errno says %s",
1860                     ibv_get_device_name(device->ib_dev),
1861                     strerror(errno)));
1862         goto error;
1863     }
1864 #endif
1865 
1866     ret = OPAL_SUCCESS;
1867 
1868     /* Note ports are 1 based (i >= 1) */
1869     for(k = 0; k < port_cnt; k++){
1870         struct ibv_port_attr ib_port_attr;
1871         i = allowed_ports[k];
1872         if(ibv_query_port(device->ib_dev_context, i, &ib_port_attr)){
1873             BTL_ERROR(("error getting port attributes for device %s "
1874                         "port number %d errno says %s",
1875                         ibv_get_device_name(device->ib_dev), i, strerror(errno)));
1876             break;
1877         }
1878         if(IBV_PORT_ACTIVE == ib_port_attr.state) {
1879             /* Select the lower of the HCA and port active speed. With QLogic
1880                HCAs that are capable of 4K MTU we had an issue when connected
1881                to switches with 2K MTU. This fix is valid for other IB vendors
1882                as well. */
1883             if (ib_port_attr.active_mtu < device->mtu){
1884                 device->mtu = ib_port_attr.active_mtu;
1885             }
1886             if (mca_btl_openib_component.apm_ports && device->btls > 0) {
1887                 init_apm_port(device, i, ib_port_attr.lid);
1888                 break;
1889             }
1890             if (0 == mca_btl_openib_component.ib_pkey_val) {
1891                 ret = init_one_port(btl_list, device, i, 0, &ib_port_attr);
1892             } else {
1893                 uint16_t pkey,j;
1894                 for (j = 0; j < device->ib_dev_attr.max_pkeys; j++) {
1895                     if(ibv_query_pkey(device->ib_dev_context, i, j, &pkey)){
1896                         BTL_ERROR(("error getting pkey for index %d, device %s "
1897                                     "port number %d errno says %s",
1898                                     j, ibv_get_device_name(device->ib_dev), i, strerror(errno)));
1899                     }
1900                     pkey = ntohs(pkey) & MCA_BTL_IB_PKEY_MASK;
1901                     if(pkey == mca_btl_openib_component.ib_pkey_val){
1902                         ret = init_one_port(btl_list, device, i, j, &ib_port_attr);
1903                         break;
1904                     }
1905                 }
1906             }
1907             if (OPAL_SUCCESS != ret) {
1908                 /* Out of bounds error indicates that we hit max btl number
1909                  * don't propagate the error to the caller */
1910                 if (OPAL_ERR_VALUE_OUT_OF_BOUNDS == ret) {
1911                     ret = OPAL_SUCCESS;
1912                 }
1913                 break;
1914             }
1915         }
1916     }
1917     free(allowed_ports);
1918     allowed_ports = NULL;
1919 
1920     /* If we made a BTL, check APM status and return.  Otherwise, fall
1921        through and destroy everything */
1922     if (device->btls > 0) {
1923         /* if apm was enabled it should be > 1 */
1924         if (1 == mca_btl_openib_component.apm_ports) {
1925             opal_show_help("help-mpi-btl-openib.txt",
1926                            "apm not enough ports", true);
1927             mca_btl_openib_component.apm_ports = 0;
1928         }
1929 
1930         /* Check to ensure that all devices used in this process have
1931            compatible receive_queues values (we check elsewhere to see
1932            if all devices used in other processes in this job have
1933            compatible receive_queues values).
1934 
1935            Not only is the check complex, but the reasons behind what
1936            it does (and does not do) are complex.  Before explaining
1937            the code below, here's some notes:
1938 
1939            1. The openib BTL component only supports 1 value of the
1940               receive_queues between all of its modules.
1941 
1942               --> This could be changed to allow every module to have
1943                   its own receive_queues.  But that would be a big
1944                   deal; no one has time to code this up right now.
1945 
1946            2. The receive_queues value can be specified either as an
1947               MCA parameter or in the INI file.  Specifying the value
1948               as an MCA parameter overrides all INI file values
1949               (meaning: that MCA param value will be used for all
1950               openib BTL modules in the process).
1951 
1952            Effectively, the first device through init_one_device()
1953            gets to decide what the receive_queues will be for the all
1954            modules in this process.  This is an unfortunate artifact
1955            of the openib BTL startup sequence (see below for more
1956            details).  The first device will choose the receive_queues
1957            value from: (in priority order):
1958 
1959            1. If the btl_openib_receive_queues MCA param was
1960               specified, use that.
1961            2. If this device has a receive_queues value specified in
1962               the INI file, use that.
1963            3. Otherwise, use the default MCA param value for
1964               btl_openib_receive_queues.
1965 
1966            If any successive device has a different value specified in
1967            the INI file, we show_help and return up the stack that
1968            this device failed.
1969 
1970            In the case that the user does not specify a
1971            mca_btl_openib_receive_queues value, the short description
1972            of what is allowed is that either a) no devices specify a
1973            receive_queues value in the INI file (in which case we use
1974            the default MCA param value), b) all devices specify the
1975            same receive_queues value in the INI value, or c) some/all
1976            devices specify the same receive_queues value in the INI
1977            value as the default MCA param value.
1978 
1979            Let's take some sample cases to explain this more clearly...
1980 
1981            THESE ARE THE "GOOD" CASES
1982            --------------------------
1983 
1984            Case 1: no INI values
1985            - MCA parameter: not specified
1986            - default receive_queues: value A
1987            - device 0: no receive_queues in INI file
1988            - device 1: no receive_queues in INI file
1989            - device 2: no receive_queues in INI file
1990            --> use receive_queues value A with all devices
1991 
1992            Case 2: all INI values the same (same as default)
1993            - MCA parameter: not specified
1994            - default receive_queues: value A
1995            - device 0: receive_queues value A in the INI file
1996            - device 1: receive_queues value A in the INI file
1997            - device 2: receive_queues value A in the INI file
1998            --> use receive_queues value A with all devices
1999 
2000            Case 3: all INI values the same (but different than default)
2001            - MCA parameter: not specified
2002            - default receive_queues: value A
2003            - device 0: receive_queues value B in the INI file
2004            - device 1: receive_queues value B in the INI file
2005            - device 2: receive_queues value B in the INI file
2006            --> use receive_queues value B with all devices
2007 
2008            Case 4: some INI unspecified, but rest same as default
2009            - MCA parameter: not specified
2010            - default receive_queues: value A
2011            - device 0: receive_queues value A in the INI file
2012            - device 1: no receive_queues in INI file
2013            - device 2: receive_queues value A in the INI file
2014            --> use receive_queues value A with all devices
2015 
2016            Case 5: some INI unspecified (including device 0), but rest same as default
2017            - MCA parameter: not specified
2018            - default receive_queues: value A
2019            - device 0: no receive_queues in INI file
2020            - device 1: no receive_queues in INI file
2021            - device 2: receive_queues value A in the INI file
2022            --> use receive_queues value A with all devices
2023 
2024            Case 6: different default/INI values, but MCA param is specified
2025            - MCA parameter: value D
2026            - default receive_queues: value A
2027            - device 0: no receive_queues in INI file
2028            - device 1: receive_queues value B in INI file
2029            - device 2: receive_queues value C in INI file
2030            --> use receive_queues value D with all devices
2031 
2032            What this means is that this selection process is
2033            unfortunately tied to the order of devices.  :-( Device 0
2034            effectively sets what the receive_queues value will be for
2035            that process.  If any later device disagrees, that's
2036            problematic and we have to error/abort.
2037 
2038            ALL REMAINING CASES WILL FAIL
2039            -----------------------------
2040 
2041            Case 7: one INI value (different than default)
2042            - MCA parameter: not specified
2043            - default receive_queues: value A
2044            - device 0: receive_queues value B in INI file
2045            - device 1: no receive_queues in INI file
2046            - device 2: no receive_queues in INI file
2047            --> Jeff thinks that it would be great to use
2048                receive_queues value B with all devices.  However, it
2049                shares one of the problems cited in case 8, below.  So
2050                we need to fail this scenario; print an error and
2051                abort.
2052 
2053            Case 8: one INI value, different than default
2054            - MCA parameter: not specified
2055            - default receive_queues: value A
2056            - device 0: no receive_queues in INI file
2057            - device 1: receive_queues value B in INI file
2058            - device 2: no receive_queues in INI file
2059 
2060            --> Jeff thinks that it would be great to use
2061                receive_queues value B with all devices.  However, it
2062                has (at least) 2 problems:
2063 
2064                1. The check for local receive_queue compatibility is
2065                   done here in init_one_device().  By the time we call
2066                   init_one_device() for device 1, we have already
2067                   called init_one_device() for device 0, meaning that
2068                   device 0's QPs have already been created and setup
2069                   using the MCA parameter's default receive_queues
2070                   value.  So if device 1 *changes* the
2071                   component.receive_queues value, then device 0 and
2072                   device 1 now have different receive_queue sets (more
2073                   specifically: the QPs setup for device 0 are now
2074                   effectively lost).  This is Bad.
2075 
2076                   It would be great if we didn't have this restriction
2077                   -- either by letting each module have its own
2078                   receive_queues value or by scanning all devices and
2079                   figuring out a final receive_queues value *before*
2080                   actually setting up any QPs.  But that's not the
2081                   current flow of the code (patches would be greatly
2082                   appreciated here, of course!).  Unfortunately, no
2083                   one has time to code this up right now, so we're
2084                   leaving this as explicitly documented for some
2085                   future implementer...
2086 
               2. Consider a scenario with server 1 having HCA A/subnet
2088                   X, and server 2 having HCA B/subnet X and HCA
2089                   C/subnet Y.  And let's assume:
2090 
2091                   Server 1:
2092                   HCA A: no receive_queues in INI file
2093 
2094                   Server 2:
2095                   HCA B: no receive_queues in INI file
2096                   HCA C: receive_queues specified in INI file
2097 
2098                   A will therefore use the default receive_queues
2099                   value.  B and C will use C's INI receive_queues.
2100                   But note that modex [currently] only sends around
2101                   vendor/part IDs for OpenFabrics devices -- not the
2102                   actual receive_queues value (it was felt that
2103                   including the final receive_queues string value in
2104                   the modex would dramatically increase the size of
2105                   the modex).  So processes on server 1 will get the
2106                   vendor/part ID for HCA B, look it up in the INI
2107                   file, see that it has no receive_queues value
2108                   specified, and then assume that it uses the default
2109                   receive_queues value.  Hence, procs on server 1 will
2110                   try to connect HCA A-->HCA B with the wrong
2111                   receive_queues value.  Bad.  Further, the error
2112                   won't be discovered by checks like this because A
                  won't check C's receive_queues because C is on a
2114                   different subnet.
2115 
2116                   This could be fixed, of course; either by a) send
2117                   the final receive_queues value in the modex (perhaps
2118                   compressing or encoding it so that it can be much
2119                   shorter than the string -- the current vendor/part
2120                   ID stuff takes 8 bytes for each device), or b)
2121                   replicating the determination process of each host
2122                   in each process (i.e., procs on server 1 would see
2123                   both B and C, and use them both to figure out what
2124                   the "final" receive_queues value is for B).
2125                   Unfortunately, no one has time to code this up right
2126                   now, so we're leaving this as explicitly documented
2127                   for some future implementer...
2128 
2129                Because of both of these problems, this case is
2130                problematic and must fail with a show_help error.
2131 
2132            Case 9: two devices with same INI value (different than default)
2133            - MCA parameter: not specified
2134            - default receive_queues: value A
2135            - device 0: no receive_queues in INI file
2136            - device 1: receive_queues value B in INI file
2137            - device 2: receive_queues value B in INI file
2138            --> per case 8, fail with a show_help message.
2139 
2140            Case 10: two devices with different INI values
2141            - MCA parameter: not specified
2142            - default receive_queues: value A
2143            - device 0: no receive_queues in INI file
2144            - device 1: receive_queues value B in INI file
2145            - device 2: receive_queues value C in INI file
2146            --> per case 8, fail with a show_help message.
2147 
2148         */
2149 
2150         {
2151             /* we need to read this MCA param at this point in case someone
2152              * altered it via MPI_T */
2153             mca_base_var_source_t source;
2154 
2155             if (OPAL_SUCCESS != (ret = get_var_source ("receive_queues", &source))) {
2156                 BTL_ERROR(("mca_base_var_get_value failed to get value for receive_queues: %s:%d",
2157                            __FILE__, __LINE__));
2158                 goto error;
2159             }
2160 
2161             mca_btl_openib_component.receive_queues_source = source;
2162         }
2163 
2164         /* If the MCA param was specified, skip all the checks */
2165         if (MCA_BASE_VAR_SOURCE_DEFAULT != mca_btl_openib_component.receive_queues_source) {
2166             goto good;
2167         }
2168 
2169         /* If we're the first device and we have a receive_queues
2170            value from the INI file *that is different than the
2171            already-existing default value*, then set the component to
2172            use that. */
2173         if (0 == mca_btl_openib_component.devices_count) {
2174             if (NULL != values.receive_queues &&
2175                 0 != strcmp(values.receive_queues,
2176                             mca_btl_openib_component.receive_queues)) {
2177                 if (NULL != mca_btl_openib_component.receive_queues) {
2178                     free(mca_btl_openib_component.receive_queues);
2179                 }
2180                 mca_btl_openib_component.receive_queues =
2181                     strdup(values.receive_queues);
2182                 mca_btl_openib_component.receive_queues_source =
2183                     BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
2184             }
2185         }
2186 
2187         /* If we're not the first device, then we have to conform to
2188            either the default value if the first device didn't set
2189            anything, or to whatever the first device decided. */
2190         else {
2191             /* In all cases, if this device has a receive_queues value
2192                in the INI, then it must agree with
2193                component.receive_queues. */
2194             if (NULL != values.receive_queues) {
2195                 if (0 != strcmp(values.receive_queues,
2196                                 mca_btl_openib_component.receive_queues)) {
2197                     opal_show_help("help-mpi-btl-openib.txt",
2198                                    "locally conflicting receive_queues", true,
2199                                    opal_install_dirs.opaldatadir,
2200                                    opal_process_info.nodename,
2201                                    ibv_get_device_name(receive_queues_device->ib_dev),
2202                                    receive_queues_device->ib_dev_attr.vendor_id,
2203                                    receive_queues_device->ib_dev_attr.vendor_part_id,
2204                                    mca_btl_openib_component.receive_queues,
2205                                    ibv_get_device_name(device->ib_dev),
2206                                    device->ib_dev_attr.vendor_id,
2207                                    device->ib_dev_attr.vendor_part_id,
2208                                    values.receive_queues);
2209                     ret = OPAL_ERR_RESOURCE_BUSY;
2210                     goto error;
2211                 }
2212             }
2213 
2214             /* If this device doesn't have an INI receive_queues
2215                value, then if the component.receive_queues value came
2216                from the default, we're ok.  But if the
2217                component.receive_queues value came from the 1st
2218                device's INI file, we must error. */
2219             else if ((mca_base_var_source_t) BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
2220                 mca_btl_openib_component.receive_queues_source) {
2221                 opal_show_help("help-mpi-btl-openib.txt",
2222                                "locally conflicting receive_queues", true,
2223                                opal_install_dirs.opaldatadir,
2224                                opal_process_info.nodename,
2225                                ibv_get_device_name(receive_queues_device->ib_dev),
2226                                receive_queues_device->ib_dev_attr.vendor_id,
2227                                receive_queues_device->ib_dev_attr.vendor_part_id,
2228                                mca_btl_openib_component.receive_queues,
2229                                ibv_get_device_name(device->ib_dev),
2230                                device->ib_dev_attr.vendor_id,
2231                                device->ib_dev_attr.vendor_part_id,
2232                                mca_btl_openib_component.default_recv_qps);
2233                 ret = OPAL_ERR_RESOURCE_BUSY;
2234                 goto error;
2235             }
2236         }
2237 
2238         receive_queues_device = device;
2239 
2240     good:
2241         mca_btl_openib_component.devices_count++;
2242         return OPAL_SUCCESS;
2243     }
2244 
2245 error:
2246     if (OPAL_SUCCESS != ret) {
2247         opal_show_help("help-mpi-btl-openib.txt",
2248                        "error in device init", true,
2249                        opal_process_info.nodename,
2250                        ibv_get_device_name(device->ib_dev));
2251     }
2252 
2253     if (NULL != allowed_ports) {
2254         free(allowed_ports);
2255     }
2256     OBJ_RELEASE(device);
2257     return ret;
2258 }
2259 
/*
 * Finish setting up one openib BTL module.
 *
 * Constructs the module lock, allocates the zero-filled per-QP array and
 * initializes every non per-peer (i.e. SRQ) entry, points the module at
 * its device's memory pool, and computes the eager RDMA fragment size.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE when the per-QP array
 * cannot be allocated.
 */
static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
{
    int idx;
    size_t frag_bytes;

    openib_btl->num_peers = 0;

    /* Initialize module state */
    OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);

    /* One zeroed descriptor per configured QP */
    openib_btl->qps = (mca_btl_openib_module_qp_t *)
        calloc(mca_btl_openib_component.num_qps,
               sizeof(mca_btl_openib_module_qp_t));
    if (NULL == openib_btl->qps) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Per-peer QPs need nothing beyond the zero fill; only the
       remaining (SRQ) ones get their lists and credits set up here. */
    for (idx = 0; idx < mca_btl_openib_component.num_qps; ++idx) {
        mca_btl_openib_module_qp_t *module_qp;

        if (BTL_OPENIB_QP_TYPE_PP(idx)) {
            continue;
        }

        module_qp = &openib_btl->qps[idx];
        OBJ_CONSTRUCT(&module_qp->u.srq_qp.pending_frags[0], opal_list_t);
        OBJ_CONSTRUCT(&module_qp->u.srq_qp.pending_frags[1], opal_list_t);
        module_qp->u.srq_qp.sd_credits =
            mca_btl_openib_component.qp_infos[idx].u.srq_qp.sd_max;
        module_qp->u.srq_qp.srq = NULL;
    }

    /* The module shares the memory pool owned by its device */
    openib_btl->super.btl_mpool = openib_btl->device->mpool;

    openib_btl->eager_rdma_channels = 0;

    /* Eager RDMA fragment: all headers + footer + eager payload,
       rounded up to the configured buffer alignment. */
    frag_bytes = sizeof(mca_btl_openib_header_t) +
                 sizeof(mca_btl_openib_header_coalesced_t) +
                 sizeof(mca_btl_openib_control_header_t) +
                 sizeof(mca_btl_openib_footer_t) +
                 openib_btl->super.btl_eager_limit;
    openib_btl->eager_rdma_frag_size =
        OPAL_ALIGN(frag_bytes, mca_btl_openib_component.buffer_alignment, size_t);

    opal_output_verbose(1, opal_btl_base_framework.framework_output,
                        "[rank=%d] openib: using port %s:%d",
                        OPAL_PROC_MY_NAME.vpid,
                        ibv_get_device_name(openib_btl->device->ib_dev),
                        openib_btl->port_num);

    return OPAL_SUCCESS;
}
2309 
/* Pairs an IB device with its distance from this process.  An array of
   these is built and qsort()ed by sort_devs_by_distance(). */
struct dev_distance {
    /* Device this entry describes */
    struct ibv_device *ib_dev;
    /* Result of get_ib_dev_distance(), or 0 when the process is not bound */
    float distance;
};
2314 
compare_distance(const void * p1,const void * p2)2315 static int compare_distance(const void *p1, const void *p2)
2316 {
2317     const struct dev_distance *d1 = (const struct dev_distance *) p1;
2318     const struct dev_distance *d2 = (const struct dev_distance *) p2;
2319 
2320     if (d1->distance > (d2->distance+EPS)) {
2321         return 1;
2322     } else if ((d1->distance + EPS) < d2->distance) {
2323         return -1;
2324     } else {
2325         return 0;
2326     }
2327 }
2328 
get_ib_dev_distance(struct ibv_device * dev)2329 static float get_ib_dev_distance(struct ibv_device *dev)
2330 {
2331     /* If we don't have hwloc, we'll default to a distance of 0,
2332        because we have no way of measuring. */
2333     float distance = 0;
2334     float a, b;
2335     int i;
2336     hwloc_cpuset_t my_cpuset = NULL, ibv_cpuset = NULL;
2337     hwloc_obj_t my_obj, ibv_obj, node_obj;
2338     struct hwloc_distances_s *hwloc_distances = NULL;
2339 
2340     /* Override any distance logic so all devices are used */
2341     if (0 != mca_btl_openib_component.ignore_locality ||
2342         OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
2343         return distance;
2344     }
2345 
2346 #if HWLOC_API_VERSION >= 0x20000
2347     unsigned int j, distances_nr = 1;
2348     int ibvindex, myindex;
2349 #endif
2350 
2351     if (NULL == hwloc_distances) {
2352         #if HWLOC_API_VERSION < 0x20000
2353             hwloc_distances =
2354                 (struct hwloc_distances_s*)hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology,
2355                                                                                    HWLOC_OBJ_NODE);
2356             /* If we got no info, just return 0 */
2357             if (NULL == hwloc_distances || NULL == hwloc_distances->latency) {
2358                 goto out;
2359             }
2360 
2361         #else
2362             if (0 != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
2363                                                  &distances_nr, &hwloc_distances,
2364                                                  HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || 0 == distances_nr) {
2365                 hwloc_distances = NULL;
2366                 goto out;
2367             }
2368         #endif
2369     }
2370 
2371     /* Next, find the NUMA node where this IBV device is located */
2372     ibv_cpuset = hwloc_bitmap_alloc();
2373     if (NULL == ibv_cpuset) {
2374         goto out;
2375     }
2376     if (0 != hwloc_ibv_get_device_cpuset(opal_hwloc_topology, dev, ibv_cpuset)) {
2377         goto out;
2378     }
2379     ibv_obj = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, ibv_cpuset);
2380     if (NULL == ibv_obj) {
2381         goto out;
2382     }
2383 
2384     opal_output_verbose(5, opal_btl_base_framework.framework_output,
2385                         "hwloc_distances->nbobjs=%d", hwloc_distances->nbobjs);
2386 #if HWLOC_API_VERSION < 0x20000
2387     for (i = 0; i < (int)(2 *  hwloc_distances->nbobjs); i++) {
2388         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2389                             "hwloc_distances->latency[%d]=%f", i, hwloc_distances->latency[i]);
2390     }
2391 #else
2392     for (i = 0; i < (int)hwloc_distances->nbobjs; i++) {
2393         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2394                             "hwloc_distances->values[%d]=%"PRIu64, i, hwloc_distances->values[i]);
2395     }
2396 #endif
2397 
2398     /* If ibv_obj is a NUMA node or below, we're good. */
2399     switch (ibv_obj->type) {
2400     case HWLOC_OBJ_NODE:
2401     case HWLOC_OBJ_SOCKET:
2402 #if HWLOC_API_VERSION < 0x20000
2403     case HWLOC_OBJ_CACHE:
2404 #else
2405     case HWLOC_OBJ_L1CACHE:
2406     case HWLOC_OBJ_L2CACHE:
2407     case HWLOC_OBJ_L3CACHE:
2408     case HWLOC_OBJ_L4CACHE:
2409     case HWLOC_OBJ_L5CACHE:
2410 #endif
2411     case HWLOC_OBJ_CORE:
2412     case HWLOC_OBJ_PU:
2413         while (NULL != ibv_obj && ibv_obj->type != HWLOC_OBJ_NODE) {
2414             ibv_obj = ibv_obj->parent;
2415         }
2416         break;
2417 
2418     default:
2419         /* If it's above a NUMA node, then I don't know how to compute
2420            the distance... */
2421         opal_output_verbose(5, opal_btl_base_framework.framework_output, "ibv_obj->type set to NULL");
2422         ibv_obj = NULL;
2423         break;
2424     }
2425 
2426     /* If we don't have an object for this ibv device, give up */
2427     if (NULL == ibv_obj) {
2428         goto out;
2429     }
2430     #if HWLOC_API_VERSION >= 0x20000
2431         /* the new matrix format isn't quite as friendly, so we have to
2432          * do an exhaustive search to find the index of this object
2433          * in that array */
2434         ibvindex = -1;
2435         for (j=0; j < distances_nr; j++) {
2436             if (ibv_obj == hwloc_distances->objs[j]) {
2437                 ibvindex = j;
2438                 break;
2439             }
2440         }
2441         if (-1 == ibvindex) {
2442             OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
2443             goto out;
2444         }
2445     #endif
2446 
2447     opal_output_verbose(5, opal_btl_base_framework.framework_output,
2448                         "ibv_obj->logical_index=%d", ibv_obj->logical_index);
2449     /* This function is only called if the process is bound, so let's
2450        find out where we are bound to.  For the moment, we only care
2451        about the NUMA node to which we are bound. */
2452     my_cpuset = hwloc_bitmap_alloc();
2453     if (NULL == my_cpuset) {
2454         goto out;
2455     }
2456     if (0 != hwloc_get_cpubind(opal_hwloc_topology, my_cpuset, 0)) {
2457         goto out;
2458     }
2459     my_obj = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, my_cpuset);
2460     if (NULL == my_obj) {
2461         goto out;
2462     }
2463 
2464     /* If my_obj is a NUMA node or below, we're good. */
2465     switch (my_obj->type) {
2466     case HWLOC_OBJ_NODE:
2467     case HWLOC_OBJ_SOCKET:
2468     #if HWLOC_API_VERSION < 0x20000
2469         case HWLOC_OBJ_CACHE:
2470     #else
2471         case HWLOC_OBJ_L1CACHE:
2472         case HWLOC_OBJ_L2CACHE:
2473         case HWLOC_OBJ_L3CACHE:
2474         case HWLOC_OBJ_L4CACHE:
2475         case HWLOC_OBJ_L5CACHE:
2476     #endif
2477     case HWLOC_OBJ_CORE:
2478     case HWLOC_OBJ_PU:
2479         while (NULL != my_obj && my_obj->type != HWLOC_OBJ_NODE) {
2480             my_obj = my_obj->parent;
2481         }
2482         if (NULL != my_obj) {
2483             opal_output_verbose(5, opal_btl_base_framework.framework_output,
2484                                 "my_obj->logical_index=%d", my_obj->logical_index);
2485             /* Distance may be asymetrical, so calculate both of them
2486                and take the max */
2487             #if HWLOC_API_VERSION < 0x20000
2488                 a = hwloc_distances->latency[my_obj->logical_index +
2489                                              (ibv_obj->logical_index *
2490                                               hwloc_distances->nbobjs)];
2491                 b = hwloc_distances->latency[ibv_obj->logical_index +
2492                                              (my_obj->logical_index *
2493                                               hwloc_distances->nbobjs)];
2494             #else
2495                 /* the new matrix format isn't quite as friendly, so we have to
2496                  * do an exhaustive search to find the index of this object
2497                  * in that array */
2498                 myindex = -1;
2499                 for (j=0; j < distances_nr; j++) {
2500                     if (my_obj == hwloc_distances->objs[j]) {
2501                         myindex = j;
2502                         break;
2503                     }
2504                 }
2505                 if (-1 == myindex) {
2506                     OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
2507                     goto out;
2508                 }
2509                 a = (float)hwloc_distances->values[myindex + (ibvindex * hwloc_distances->nbobjs)];
2510                 b = (float)hwloc_distances->values[ibvindex + (myindex * hwloc_distances->nbobjs)];
2511             #endif
2512             distance = (a > b) ? a : b;
2513         }
2514         break;
2515 
2516     default:
2517         /* If the obj is above a NUMA node, then we're bound to more than
2518            one NUMA node.  Find the max distance. */
2519         i = 0;
2520         for (node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology,
2521                                                             ibv_obj->cpuset,
2522                                                             HWLOC_OBJ_NODE, i);
2523              NULL != node_obj;
2524              node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology,
2525                                                             ibv_obj->cpuset,
2526                                                             HWLOC_OBJ_NODE, ++i)) {
2527             #if HWLOC_API_VERSION < 0x20000
2528                 a = hwloc_distances->latency[node_obj->logical_index +
2529                                              (ibv_obj->logical_index *
2530                                               hwloc_distances->nbobjs)];
2531                 b = hwloc_distances->latency[ibv_obj->logical_index +
2532                                              (node_obj->logical_index *
2533                                               hwloc_distances->nbobjs)];
2534             #else
2535                 unsigned int j;
2536                 j = node_obj->logical_index + (ibv_obj->logical_index * hwloc_distances->nbobjs);
2537                 if (j < distances_nr) {
2538                     a = (float)hwloc_distances->values[j];
2539                 } else {
2540                     goto out;
2541                 }
2542                 j = ibv_obj->logical_index + (node_obj->logical_index * hwloc_distances->nbobjs);
2543                 if (j < distances_nr) {
2544                     b = (float)hwloc_distances->values[j];
2545                 } else {
2546                     goto out;
2547                 }
2548             #endif
2549             a = (a > b) ? a : b;
2550             distance = (a > distance) ? a : distance;
2551         }
2552         break;
2553     }
2554 
2555  out:
2556     if (NULL != ibv_cpuset) {
2557         hwloc_bitmap_free(ibv_cpuset);
2558     }
2559     if (NULL != my_cpuset) {
2560         hwloc_bitmap_free(my_cpuset);
2561     }
2562 
2563 #if HWLOC_API_VERSION >= 0x20000
2564     if (NULL != hwloc_distances) {
2565         hwloc_distances_release(opal_hwloc_topology, hwloc_distances);
2566     }
2567 #endif
2568     return distance;
2569 }
2570 
2571 static struct dev_distance *
sort_devs_by_distance(struct ibv_device ** ib_devs,int count)2572 sort_devs_by_distance(struct ibv_device **ib_devs, int count)
2573 {
2574     int i;
2575     struct dev_distance *devs = (struct dev_distance *) malloc(count * sizeof(struct dev_distance));
2576     if (NULL == devs) {
2577         return NULL;
2578     }
2579 
2580     for (i = 0; i < count; i++) {
2581         devs[i].ib_dev = ib_devs[i];
2582         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2583                             "Checking distance from this process to device=%s", ibv_get_device_name(ib_devs[i]));
2584         /* If we're not bound, just assume that the device is close. */
2585         devs[i].distance = 0;
2586         if (opal_process_info.cpuset) {
2587             /* If this process is bound to one or more PUs, we can get
2588                an accurate distance. */
2589             devs[i].distance = get_ib_dev_distance(ib_devs[i]);
2590         }
2591         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2592                             "Process is %s: distance to device is %f",
2593                             (opal_process_info.cpuset ? "bound" : "not bound"), devs[i].distance);
2594     }
2595 
2596     qsort(devs, count, sizeof(struct dev_distance), compare_distance);
2597 
2598     return devs;
2599 }
2600 
2601 
2602 /*
2603  *  IB component initialization:
2604  *  (1) read interface list from kernel and compare against component parameters
2605  *      then create a BTL instance for selected interfaces
2606  *  (2) setup IB listen socket for incoming connection attempts
2607  *  (3) register BTL parameters with the MCA
2608  */
2609 
2610 static mca_btl_base_module_t**
btl_openib_component_init(int * num_btl_modules,bool enable_progress_threads,bool enable_mpi_threads)2611 btl_openib_component_init(int *num_btl_modules,
2612                           bool enable_progress_threads,
2613                           bool enable_mpi_threads)
2614 {
2615     struct ibv_device **ib_devs;
2616     mca_btl_base_module_t** btls = NULL;
2617     int i, ret, num_devs, length;
2618     opal_list_t btl_list;
2619     mca_btl_openib_module_t * openib_btl;
2620     mca_btl_base_selected_module_t* ib_selected;
2621     opal_list_item_t* item;
2622     mca_btl_openib_frag_init_data_t *init_data;
2623     struct dev_distance *dev_sorted;
2624     float distance;
2625     int index;
2626     bool found;
2627     mca_base_var_source_t source;
2628     int list_count = 0;
2629 
2630     /* initialization */
2631     *num_btl_modules = 0;
2632     num_devs = 0;
2633 
2634     /* If we got this far, then setup the memory alloc hook (because
2635        we're most likely going to be using this component). The hook
2636        is to be set up as early as possible in this function since we
2637        want most of the allocated resources be aligned.
2638      */
2639     opal_memory->memoryc_set_alignment(32, mca_btl_openib_module.super.btl_eager_limit);
2640 
2641     /* Per https://svn.open-mpi.org/trac/ompi/ticket/1305, check to
2642        see if $sysfsdir/class/infiniband exists.  If it does not,
2643        assume that the RDMA hardware drivers are not loaded, and
2644        therefore we don't want OpenFabrics verbs support in this OMPI
2645        job.  No need to print a warning. */
2646     if (!opal_common_verbs_check_basics()) {
2647         goto no_btls;
2648     }
2649 
2650     /* Read in INI files with device-specific parameters */
2651     if (OPAL_SUCCESS != (ret = opal_btl_openib_ini_init())) {
2652         goto no_btls;
2653     }
2654 
2655     index = mca_base_var_find("ompi", "btl", "openib", "max_inline_data");
2656     if (index >= 0) {
2657         if (OPAL_SUCCESS == mca_base_var_get_value(index, NULL, &source, NULL)) {
2658             if (-1 == mca_btl_openib_component.ib_max_inline_data  &&
2659                 MCA_BASE_VAR_SOURCE_DEFAULT == source) {
2660                 /* If the user has not explicitly set this MCA parameter
2661                    use max_inline_data value specified in the
2662                    device-specific parameters INI file */
2663                 mca_btl_openib_component.ib_max_inline_data = -2;
2664             }
2665         }
2666     }
2667 
2668     OBJ_CONSTRUCT(&mca_btl_openib_component.send_free_coalesced, opal_free_list_t);
2669     OBJ_CONSTRUCT(&mca_btl_openib_component.send_user_free, opal_free_list_t);
2670     OBJ_CONSTRUCT(&mca_btl_openib_component.recv_user_free, opal_free_list_t);
2671 
2672     init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t));
2673     if (NULL == init_data) {
2674         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
2675         goto no_btls;
2676     }
2677 
2678     init_data->order = mca_btl_openib_component.rdma_qp;
2679     init_data->list = &mca_btl_openib_component.send_user_free;
2680 
2681     /* Align fragments on 8-byte boundaries (instead of 2) to fix bus errors that
2682        occur on some 32-bit platforms. Depending on the size of the fragment this
2683        will waste 2-6 bytes of space per frag. In most cases this shouldn't waste
2684        any space. */
2685     if (OPAL_SUCCESS != opal_free_list_init (
2686                 &mca_btl_openib_component.send_user_free,
2687                 sizeof(mca_btl_openib_put_frag_t), 8,
2688                 OBJ_CLASS(mca_btl_openib_put_frag_t),
2689                 0, 0,
2690                 mca_btl_openib_component.ib_free_list_num,
2691                 mca_btl_openib_component.ib_free_list_max,
2692                 mca_btl_openib_component.ib_free_list_inc,
2693                 NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) {
2694         goto no_btls;
2695     }
2696 
2697     init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t));
2698     if (NULL == init_data) {
2699         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
2700         goto no_btls;
2701     }
2702 
2703     init_data->order = mca_btl_openib_component.rdma_qp;
2704     init_data->list = &mca_btl_openib_component.recv_user_free;
2705 
2706     if(OPAL_SUCCESS != opal_free_list_init (
2707                 &mca_btl_openib_component.recv_user_free,
2708                 sizeof(mca_btl_openib_get_frag_t), 8,
2709                 OBJ_CLASS(mca_btl_openib_get_frag_t),
2710                 0, 0,
2711                 mca_btl_openib_component.ib_free_list_num,
2712                 mca_btl_openib_component.ib_free_list_max,
2713                 mca_btl_openib_component.ib_free_list_inc,
2714                 NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) {
2715         goto no_btls;
2716     }
2717 
2718     init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t));
2719     if (NULL == init_data) {
2720         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
2721         goto no_btls;
2722     }
2723     length = sizeof(mca_btl_openib_coalesced_frag_t);
2724 
2725     init_data->list = &mca_btl_openib_component.send_free_coalesced;
2726 
2727     if(OPAL_SUCCESS != opal_free_list_init (
2728                 &mca_btl_openib_component.send_free_coalesced,
2729                 length, 8, OBJ_CLASS(mca_btl_openib_coalesced_frag_t),
2730                 0, 0, mca_btl_openib_component.ib_free_list_num,
2731                 mca_btl_openib_component.ib_free_list_max,
2732                 mca_btl_openib_component.ib_free_list_inc,
2733                 NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) {
2734         goto no_btls;
2735     }
2736 
2737     /* If fork support is requested, try to enable it */
2738     if (OPAL_SUCCESS != (ret = opal_common_verbs_fork_test())) {
2739         goto no_btls;
2740     }
2741 
2742     /* Parse the include and exclude lists, checking for errors */
2743     mca_btl_openib_component.if_include_list =
2744         mca_btl_openib_component.if_exclude_list =
2745         mca_btl_openib_component.if_list = NULL;
2746 
2747     if (NULL != mca_btl_openib_component.if_include)
2748       list_count++;
2749     if (NULL != mca_btl_openib_component.if_exclude)
2750       list_count++;
2751     if (NULL != mca_btl_openib_component.ipaddr_include)
2752       list_count++;
2753     if (NULL != mca_btl_openib_component.ipaddr_exclude)
2754       list_count++;
2755 
2756     if (list_count > 1) {
2757         opal_show_help("help-mpi-btl-openib.txt",
2758                        "specified include and exclude", true,
2759                        NULL == mca_btl_openib_component.if_include ?
2760                         "<not specified>" : mca_btl_openib_component.if_include,
2761                        NULL == mca_btl_openib_component.if_exclude ?
2762                         "<not specified>" : mca_btl_openib_component.if_exclude,
2763                        NULL == mca_btl_openib_component.ipaddr_include ?
2764                         "<not specified>" :mca_btl_openib_component.ipaddr_include,
2765                        NULL == mca_btl_openib_component.ipaddr_exclude ?
2766                          "<not specified>" :mca_btl_openib_component.ipaddr_exclude,
2767                        NULL);
2768         goto no_btls;
2769     } else if (NULL != mca_btl_openib_component.if_include) {
2770         mca_btl_openib_component.if_include_list =
2771             opal_argv_split(mca_btl_openib_component.if_include, ',');
2772         mca_btl_openib_component.if_list =
2773             opal_argv_copy(mca_btl_openib_component.if_include_list);
2774     } else if (NULL != mca_btl_openib_component.if_exclude) {
2775         mca_btl_openib_component.if_exclude_list =
2776             opal_argv_split(mca_btl_openib_component.if_exclude, ',');
2777         mca_btl_openib_component.if_list =
2778             opal_argv_copy(mca_btl_openib_component.if_exclude_list);
2779     }
2780 
2781     ib_devs = opal_ibv_get_device_list(&num_devs);
2782 
2783     if(0 == num_devs || NULL == ib_devs) {
2784         mca_btl_base_error_no_nics("OpenFabrics (openib)", "device");
2785         goto no_btls;
2786     }
2787 
2788     dev_sorted = sort_devs_by_distance(ib_devs, num_devs);
2789     if (NULL == dev_sorted) {
2790         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
2791         goto no_btls;
2792     }
2793 
2794     OBJ_CONSTRUCT(&btl_list, opal_list_t);
2795     OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
2796 
2797     distance = dev_sorted[0].distance;
2798     for (found = false, i = 0;
2799          i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls ||
2800                 mca_btl_openib_component.ib_num_btls <
2801                 mca_btl_openib_component.ib_max_btls); i++) {
2802         if (0 != mca_btl_openib_component.ib_num_btls &&
2803             (dev_sorted[i].distance - distance) > EPS) {
2804             opal_output_verbose(1, opal_btl_base_framework.framework_output,
2805                                 "[rank=%d] openib: skipping device %s; it is too far away",
2806                                 OPAL_PROC_MY_NAME.vpid,
2807                                 ibv_get_device_name(dev_sorted[i].ib_dev));
2808             break;
2809         }
2810 
2811         /* Only take devices that match the type specified by
2812            btl_openib_device_type */
2813         switch (mca_btl_openib_component.device_type) {
2814         case BTL_OPENIB_DT_IB:
2815 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
2816             if (IBV_TRANSPORT_IWARP == dev_sorted[i].ib_dev->transport_type) {
2817                 BTL_VERBOSE(("openib: only taking infiniband devices -- skipping %s",
2818                              ibv_get_device_name(dev_sorted[i].ib_dev)));
2819                 continue;
2820             }
2821 #endif
2822             break;
2823 
2824         case BTL_OPENIB_DT_IWARP:
2825 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
2826             if (IBV_TRANSPORT_IB == dev_sorted[i].ib_dev->transport_type) {
2827                 BTL_VERBOSE(("openib: only taking iwarp devices -- skipping %s",
2828                              ibv_get_device_name(dev_sorted[i].ib_dev)));
2829                 continue;
2830             }
2831 #else
2832             opal_show_help("help-mpi-btl-openib.txt", "no iwarp support",
2833                            true);
2834 #endif
2835             break;
2836 
2837         case BTL_OPENIB_DT_ALL:
2838             break;
2839         }
2840 
2841         found = true;
2842         ret = init_one_device(&btl_list, dev_sorted[i].ib_dev);
2843         if (OPAL_ERR_NOT_SUPPORTED == ret) {
2844             ++num_devices_intentionally_ignored;
2845             continue;
2846         } else if (OPAL_SUCCESS != ret) {
2847             free(dev_sorted);
2848             goto no_btls;
2849         }
2850     }
2851     free(dev_sorted);
2852     if (!found) {
2853         opal_show_help("help-mpi-btl-openib.txt", "no devices right type",
2854                        true, opal_process_info.nodename,
2855                        ((BTL_OPENIB_DT_IB == mca_btl_openib_component.device_type) ?
2856                         "InfiniBand" :
2857                         (BTL_OPENIB_DT_IWARP == mca_btl_openib_component.device_type) ?
2858                         "iWARP" : "<any>"));
2859         goto no_btls;
2860     }
2861 
2862     /* If we got back from checking all the devices and find that
2863        there are still items in the component.if_list, that means that
2864        they didn't exist.  Show an appropriate warning if the warning
2865        was not disabled. */
2866 
2867     if (0 != opal_argv_count(mca_btl_openib_component.if_list) &&
2868         mca_btl_openib_component.warn_nonexistent_if) {
2869         char *str = opal_argv_join(mca_btl_openib_component.if_list, ',');
2870         opal_show_help("help-mpi-btl-openib.txt", "nonexistent port",
2871                        true, opal_process_info.nodename,
2872                        ((NULL != mca_btl_openib_component.if_include) ?
2873                         "in" : "ex"), str);
2874         free(str);
2875     }
2876 
2877     if(0 == mca_btl_openib_component.ib_num_btls) {
2878         /* If there were unusable devices that weren't specifically
2879            ignored, warn about it */
2880         if (num_devices_intentionally_ignored < num_devs) {
2881             opal_show_help("help-mpi-btl-openib.txt",
2882                            "no active ports found", true,
2883                            opal_process_info.nodename);
2884         }
2885         goto no_btls;
2886     }
2887 
2888     /* Now that we know we have devices and ports that we want to use,
2889        init CPC components */
2890     if (OPAL_SUCCESS != (ret = opal_btl_openib_connect_base_init())) {
2891         goto no_btls;
2892     }
2893 
2894     /* Setup the BSRQ QP's based on the final value of
2895        mca_btl_openib_component.receive_queues. */
2896     if (OPAL_SUCCESS != setup_qps()) {
2897         goto no_btls;
2898     }
2899     if (mca_btl_openib_component.num_srq_qps > 0 ||
2900                      mca_btl_openib_component.num_xrc_qps > 0) {
2901         opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table;
2902         if(OPAL_SUCCESS != opal_hash_table_init(
2903                 srq_addr_table, (mca_btl_openib_component.num_srq_qps +
2904                                  mca_btl_openib_component.num_xrc_qps) *
2905                                  mca_btl_openib_component.ib_num_btls)) {
2906             BTL_ERROR(("SRQ internal error. Failed to allocate SRQ addr hash table"));
2907             goto no_btls;
2908         }
2909     }
2910 
2911     /* For XRC:
2912      * from this point we know if MCA_BTL_XRC_ENABLED it true or false */
2913 
2914     /* Init XRC IB Addr hash table */
2915     if (MCA_BTL_XRC_ENABLED) {
2916         OBJ_CONSTRUCT(&mca_btl_openib_component.ib_addr_table,
2917                 opal_hash_table_t);
2918     }
2919 
2920     /* Allocate space for btl modules */
2921     mca_btl_openib_component.openib_btls =
2922         (mca_btl_openib_module_t **) malloc(sizeof(mca_btl_openib_module_t*) *
2923                 mca_btl_openib_component.ib_num_btls);
2924     if(NULL == mca_btl_openib_component.openib_btls) {
2925         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
2926         goto no_btls;
2927     }
2928     btls = (struct mca_btl_base_module_t **)
2929         malloc(mca_btl_openib_component.ib_num_btls *
2930                sizeof(struct mca_btl_base_module_t*));
2931     if(NULL == btls) {
2932         BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
2933         goto no_btls;
2934     }
2935 
2936     /* Copy the btl module structs into a contiguous array and fully
2937        initialize them */
2938     i = 0;
2939     while (NULL != (item = opal_list_remove_first(&btl_list))) {
2940         ib_selected = (mca_btl_base_selected_module_t*)item;
2941         openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module;
2942 
2943         /* Search for a CPC that can handle this port */
2944         ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
2945         /* If we get NOT_SUPPORTED, then no CPC was found for this
2946            port.  But that's not a fatal error -- just keep going;
2947            let's see if we find any usable openib modules or not. */
2948         if (OPAL_ERR_NOT_SUPPORTED == ret) {
2949             continue;
2950         } else if (OPAL_SUCCESS != ret) {
2951             /* All others *are* fatal.  Note that we already did a
2952                show_help in the lower layer */
2953             goto no_btls;
2954         }
2955 
2956         if (mca_btl_openib_component.max_hw_msg_size > 0 &&
2957             (uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
2958             BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
2959                 mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
2960         }
2961 
2962         mca_btl_openib_component.openib_btls[i] = openib_btl;
2963         OBJ_RELEASE(ib_selected);
2964         btls[i] = &openib_btl->super;
2965         if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
2966             goto no_btls;
2967         }
2968         ++i;
2969     }
2970     /* If we got nothing, then error out */
2971     if (0 == i) {
2972         goto no_btls;
2973     }
2974     /* Otherwise reset to the number of openib modules that we
2975        actually got */
2976     mca_btl_openib_component.ib_num_btls = i;
2977 
2978     btl_openib_modex_send();
2979 
2980     *num_btl_modules = mca_btl_openib_component.ib_num_btls;
2981     opal_ibv_free_device_list(ib_devs);
2982     if (NULL != mca_btl_openib_component.if_include_list) {
2983         opal_argv_free(mca_btl_openib_component.if_include_list);
2984         mca_btl_openib_component.if_include_list = NULL;
2985     }
2986     if (NULL != mca_btl_openib_component.if_exclude_list) {
2987         opal_argv_free(mca_btl_openib_component.if_exclude_list);
2988         mca_btl_openib_component.if_exclude_list = NULL;
2989     }
2990 
2991 #if OPAL_CUDA_SUPPORT
2992    if (mca_btl_openib_component.cuda_want_gdr && (0 == opal_leave_pinned)) {
2993         opal_show_help("help-mpi-btl-openib.txt",
2994                        "CUDA_gdr_and_nopinned", true,
2995                        opal_process_info.nodename);
2996         goto no_btls;
2997     }
2998 #endif /* OPAL_CUDA_SUPPORT */
2999 
3000     mca_btl_openib_component.memory_registration_verbose = opal_output_open(NULL);
3001     opal_output_set_verbosity (mca_btl_openib_component.memory_registration_verbose,
3002                                mca_btl_openib_component.memory_registration_verbose_level);
3003 
3004     /* setup the fork warning message as we are sensitive
3005      * to memory corruption issues when fork is called
3006      */
3007     opal_warn_fork();
3008     return btls;
3009 
3010  no_btls:
3011     /* If we fail early enough in the setup, we just modex around that
3012        there are no openib BTL's in this process and return NULL. */
3013 
3014     mca_btl_openib_component.ib_num_btls = 0;
3015     btl_openib_modex_send();
3016     if (NULL != btls) {
3017         free(btls);
3018     }
3019     return NULL;
3020 }
3021 
3022 /*
3023  * Progress the no_credits_pending_frags lists on all qp's
3024  */
progress_no_credits_pending_frags(mca_btl_base_endpoint_t * ep)3025 static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep)
3026 {
3027     int qp, pri, rc, len;
3028     opal_list_item_t *frag;
3029 
3030     OPAL_THREAD_LOCK(&ep->endpoint_lock);
3031 
3032     /* Traverse all QPs and all priorities */
3033     for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
3034         for (pri = 0; pri < 2; ++pri) {
3035             /* Note that entries in the no_credits_pending_frags list
3036                may be eager RDMA or send fragments.  So be sure to
3037                check that we have at least 1 RDMA or send credit.
3038 
3039                This loop needs a little explaining.  :-\
3040 
3041                In the body of the loop, we call _endpoint_post_send().
3042                The frag will either be successfully sent, or it will
3043                be [re]added to the no_credit_pending_frags list.  So
3044                if we keep trying to drain the no_credits_pending_frag
3045                list, we could end up in an infinite loop.  So instead,
3046                we get the initial length of the list and ensure to run
3047                through every entry at least once.  This attempts to
3048                send *every* frag once and catches the case where a
3049                frag may be on the RDMA list, but because of
3050                coalescing, is now too big for RDMA and defaults over
3051                to sending -- but then we're out of send credits, so it
3052                doesn't go.  But if we *do* still have some RDMA
3053                credits and there are RDMA frags on the list behind
3054                this now-too-big frag, they'll get a chance to go.
3055 
3056                Specifically, the condition in this for loop is as follows:
3057 
3058                - len > 0: ensure to go through all entries in the list once
3059                - the 2nd part of the conditional checks to see if we
3060                  have any credits at all.  Specifically, do we have
3061                  any RDMA credits or any send credits, *or* are we on
3062                  an SRQ, in which case we define that we *always* have
3063                  credits (because the hardware will continually
3064                  retransmit for us).
3065             */
3066             for (len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
3067                  len > 0 &&
3068                      (ep->eager_rdma_remote.tokens > 0 ||
3069                       ep->qps[qp].u.pp_qp.sd_credits > 0 ||
3070                       !BTL_OPENIB_QP_TYPE_PP(qp)); --len) {
3071                 frag = opal_list_remove_first(&ep->qps[qp].no_credits_pending_frags[pri]);
3072                 assert (NULL != frag);
3073 
3074                 /* If _endpoint_post_send() fails because of
3075                    RESOURCE_BUSY, then the frag was re-added to the
3076                    no_credits_pending list.  Specifically: either the
3077                    frag was initially an RDMA frag, but there were no
3078                    RDMA credits so it fell through the trying to send,
3079                    but we had no send credits and therefore re-added
3080                    the frag to the no_credits list, or the frag was a
3081                    send frag initially (and the same sequence
3082                    occurred, starting at the send frag out-of-credits
3083                    scenario).  In this case, just continue and try the
3084                    rest of the frags in the list.
3085 
3086                    If it fails because of another error, return the
3087                    error upward. */
3088                 rc = mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
3089                 if (OPAL_UNLIKELY(OPAL_SUCCESS != rc &&
3090                                   OPAL_ERR_RESOURCE_BUSY != rc)) {
3091                     OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
3092                     return rc;
3093                 }
3094             }
3095         }
3096     }
3097 
3098     OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
3099     return OPAL_SUCCESS;
3100 }
3101 
/*
 * Retry the RDMA get and put fragments queued on an endpoint.
 *
 * ep: endpoint whose pending_get_frags / pending_put_frags lists are
 *     processed (gets first, then puts).
 * qp: index of the QP whose free send WQE count (sd_wqe) gates the retries.
 *
 * Each list is visited for at most as many entries as it held when its
 * pass started.  A pass ends early when the gating resources run out,
 * when the list turns out to be empty, or when an operation reports
 * OPAL_ERR_OUT_OF_RESOURCE -- in which case the fragment is put back at
 * the head of its list.
 */
void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
        const int qp)
{
    mca_btl_base_module_t *module = (mca_btl_base_module_t *) ep->endpoint_btl;
    opal_list_item_t *item;
    size_t budget;

    /* Pending gets: need a free send WQE and a get token */
    for (budget = opal_list_get_size(&ep->pending_get_frags);
         budget > 0 && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0;
         --budget) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        item = opal_list_remove_first(&ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (NULL == item) {
            break;
        }

        if (OPAL_ERR_OUT_OF_RESOURCE ==
            mca_btl_openib_get_internal (module, ep, to_get_frag(item))) {
            /* still no room: requeue at the front and stop */
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_prepend (&ep->pending_get_frags, item);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
            break;
        }
    }

    /* Pending puts: only need a free send WQE */
    for (budget = opal_list_get_size(&ep->pending_put_frags);
         budget > 0 && ep->qps[qp].qp->sd_wqe > 0;
         --budget) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        item = opal_list_remove_first(&ep->pending_put_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (NULL == item) {
            break;
        }

        if (OPAL_ERR_OUT_OF_RESOURCE ==
            mca_btl_openib_put_internal (module, ep, to_put_frag(item))) {
            /* still no room: requeue at the front and stop */
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_prepend (&ep->pending_put_frags, item);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
            break;
        }
    }
}
3143 
/**
 * Process one received fragment: deliver it to the registered
 * active-message callback (unless it is a credit message), extract the
 * flow-control credits piggybacked on it, hand the receive buffer back,
 * and use any newly granted credits to retry fragments that were
 * waiting for them.
 *
 * @param openib_btl  Module passed to the active-message callback.
 * @param ep          Endpoint the fragment was received from.
 * @param frag        The received fragment.
 * @param byte_len    Number of bytes received, including the openib header.
 *
 * @return OPAL_SUCCESS on success (including the CUDA asynchronous-copy
 *         early return), otherwise the error returned by
 *         opal_btl_openib_connect_base_free_cts() or
 *         progress_no_credits_pending_frags().
 */
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
                                         mca_btl_openib_endpoint_t *ep,
                                         mca_btl_openib_recv_frag_t *frag,
                                         size_t byte_len)
{
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    mca_btl_openib_header_t *hdr = frag->hdr;
    /* rqp: QP index stored in the descriptor's "order" field;
       cqp: QP index the credits carried by this fragment apply to */
    int rqp = to_base_frag(frag)->base.order, cqp;
    /* rcredits: eager RDMA credits; credits: send credits */
    uint16_t rcredits = 0, credits;
    bool is_credit_msg;

    /* Convert the header to host byte order if this endpoint uses
       network byte order */
    if(ep->nbo) {
        BTL_OPENIB_HEADER_NTOH(*hdr);
    }

    /* Exclude the openib header from the segment length (only the
     * length is adjusted here; the segment address is not modified). */
    des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t);

    if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
        /* call registered callback */
        mca_btl_active_message_callback_t* reg;

#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
        /* The COPY_ASYNC flag should not be set */
        assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
#endif /* OPAL_CUDA_SUPPORT */
        /* The callback table is indexed by the tag carried in the header */
        reg = mca_btl_base_active_message_trigger + hdr->tag;
        reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
            /* Since ASYNC flag is set, we know this descriptor is being used
             * for asynchronous copy and cannot be freed yet. Therefore, set
             * up callback for PML to call when complete, add argument into
             * descriptor and return. */
            des->des_cbfunc = btl_openib_handle_incoming_completion;
            to_in_frag(des)->endpoint = ep;
            return OPAL_SUCCESS;
        }
#endif /* OPAL_CUDA_SUPPORT */
        if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
            /* For eager RDMA fragments, bits 11..14 of the credits field
               carry the QP index; extract it, then clear those bits
               (mask 0x87ff) so only the remaining credit bits are left */
            cqp = (hdr->credits >> 11) & 0x0f;
            hdr->credits &= 0x87ff;
        } else {
            cqp = rqp;
        }
        /* If the field is flagged as RDMA credits, move its value to
           rcredits and leave no send credits */
        if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
            rcredits = BTL_OPENIB_CREDITS(hdr->credits);
            hdr->credits = 0;
        }
    } else {
        /* Credit message: the payload is an RDMA credits header that
           names the QP and the number of RDMA credits */
        mca_btl_openib_rdma_credits_header_t *chdr =
            (mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval;
        if(ep->nbo) {
            BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
        }
        cqp = chdr->qpn;
        rcredits = chdr->rdma_credits;
    }

    /* Save the send credits now; hdr must not be read once the fragment
       has been returned below */
    credits = hdr->credits;

    /* Subtract the acknowledged count (cm_seen) from cm_sent on the
       credit QP */
    if(hdr->cm_seen)
         OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);

    /* Now return fragment. Don't touch hdr after this point! */
    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
        mca_btl_openib_eager_rdma_local_t *erl = &ep->eager_rdma_local;
        OPAL_THREAD_LOCK(&erl->lock);
        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
        /* Advance the tail up to (not past) head, stopping at the first
           fragment for which MCA_BTL_OPENIB_RDMA_FRAG_LOCAL() is true;
           each fragment passed over adds one to erl->credits */
        while(erl->tail != erl->head) {
            mca_btl_openib_recv_frag_t *tf;
            tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(ep, erl->tail);
            if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(tf))
                break;
            OPAL_THREAD_ADD32(&erl->credits, 1);
            MCA_BTL_OPENIB_RDMA_NEXT_INDEX(erl->tail);
        }
        OPAL_THREAD_UNLOCK(&erl->lock);
    } else {
        if (is_cts_message(frag)) {
            /* If this was a CTS, free it here (it was
               malloc'ed+ibv_reg_mr'ed -- so it should *not* be
               FRAG_RETURN'ed). */
            int rc = opal_btl_openib_connect_base_free_cts(ep);
            if (OPAL_SUCCESS != rc) {
                return rc;
            }
        } else {
            /* Otherwise, FRAG_RETURN it and repost if necessary */
            MCA_BTL_IB_FRAG_RETURN(frag);
            if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
                /* Per-peer QP: credit messages are counted in
                   cm_received on the credit QP, any other message
                   decrements rd_posted on the QP in rqp; then
                   mca_btl_openib_endpoint_post_rr() is called for cqp */
                if (OPAL_UNLIKELY(is_credit_msg)) {
                    OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
                } else {
                    OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
                }
                mca_btl_openib_endpoint_post_rr(ep, cqp);
            } else {
                /* Non-PP QP: the posted-receive count lives in the
                   module's srq_qp state; decrement it and call
                   mca_btl_openib_post_srr() */
                mca_btl_openib_module_t *btl = ep->endpoint_btl;
                OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
                mca_btl_openib_post_srr(btl, rqp);
            }
        }
    }

    /* Send credits are only expected for a valid per-peer credit QP */
    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);

    /* If we got any credits (RDMA or send), then try to progress all
       the no_credits_pending_frags lists */
    if (rcredits > 0) {
        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
    }
    if (credits > 0) {
        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
    }
    if (rcredits + credits > 0) {
        int rc;

        if (OPAL_SUCCESS !=
            (rc = progress_no_credits_pending_frags(ep))) {
            return rc;
        }
    }

    /* NOTE(review): presumably returns accumulated credits to the peer
       when due -- send_credits() is defined outside this view; confirm */
    send_credits(ep, cqp);

    return OPAL_SUCCESS;
}
3273 
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
/**
 * Called by the PML when the copying of the data out of the fragment
 * is complete.
 *
 * Performs the credit extraction, fragment return / receive repost and
 * pending-fragment progress that btl_openib_handle_incoming() skipped
 * when it returned early for an asynchronous CUDA copy.
 *
 * @param btl     Module; only used to reach error_cb if progressing the
 *                pending fragments fails.
 * @param ep      Ignored on entry: overwritten with the endpoint that
 *                was stored in the descriptor.
 * @param des     Descriptor of the received fragment.
 * @param status  Not used by this function.
 */
static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
                                                 mca_btl_base_endpoint_t *ep,
                                                 mca_btl_base_descriptor_t* des,
                                                 int status)
{
    mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des;
    mca_btl_openib_header_t *hdr = frag->hdr;
    /* rqp: QP index stored in the descriptor's "order" field;
       cqp: QP index the credits carried by this fragment apply to */
    int rqp = to_base_frag(frag)->base.order, cqp;
    /* rcredits: eager RDMA credits; credits: send credits */
    uint16_t rcredits = 0, credits;

    /* Use the endpoint saved in the descriptor, not the one passed in */
    ep = to_in_frag (des)->endpoint;

    OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des));

    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
        /* Bits 11..14 of the credits field carry the QP index; extract
           it, then clear those bits (mask 0x87ff) */
        cqp = (hdr->credits >> 11) & 0x0f;
        hdr->credits &= 0x87ff;
    } else {
        cqp = rqp;
    }
    /* If the field is flagged as RDMA credits, move its value to
       rcredits and leave no send credits */
    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        rcredits = BTL_OPENIB_CREDITS(hdr->credits);
        hdr->credits = 0;
    }

    /* Save the send credits before the fragment is returned below */
    credits = hdr->credits;

    /* Subtract the acknowledged count (cm_seen) from cm_sent on the
       credit QP */
    if(hdr->cm_seen)
         OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);

    /* We should not be here with eager, control, or credit messages */
    assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA);
    assert(0 == is_cts_message(frag));
    assert(0 == is_credit_message(frag));
    /* HACK - clear out flags.  Must be better way */
    des->des_flags = 0;
    /* Otherwise, FRAG_RETURN it and repost if necessary */
    MCA_BTL_IB_FRAG_RETURN(frag);
    if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
        OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
        mca_btl_openib_endpoint_post_rr(ep, cqp);
    } else {
        /* NOTE: this local "btl" shadows the function parameter of the
           same name for the extent of this block only */
        mca_btl_openib_module_t *btl = ep->endpoint_btl;
        OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
        mca_btl_openib_post_srr(btl, rqp);
    }

    /* Send credits are only expected for a valid per-peer credit QP */
    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);

    /* If we got any credits (RDMA or send), then try to progress all
       the no_credits_pending_frags lists */
    if (rcredits > 0) {
        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
    }
    if (credits > 0) {
        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
    }
    if (rcredits + credits > 0) {
        int rc;

        if (OPAL_SUCCESS !=
            (rc = progress_no_credits_pending_frags(ep))) {
            /* This is a fatal issue so call into PML and let it know. */
            mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
            openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
                                 NULL, NULL);
            return;
        }
    }

    /* NOTE(review): presumably returns accumulated credits to the peer
       when due -- send_credits() is defined outside this view; confirm */
    send_credits(ep, cqp);

}
#endif /* OPAL_CUDA_SUPPORT */
3353 
btl_openib_component_status_to_string(enum ibv_wc_status status)3354 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
3355 {
3356     switch(status) {
3357     case IBV_WC_SUCCESS:
3358         return "SUCCESS";
3359         break;
3360     case IBV_WC_LOC_LEN_ERR:
3361         return "LOCAL LENGTH ERROR";
3362         break;
3363     case IBV_WC_LOC_QP_OP_ERR:
3364         return "LOCAL QP OPERATION ERROR";
3365         break;
3366     case IBV_WC_LOC_PROT_ERR:
3367         return "LOCAL PROTOCOL ERROR";
3368         break;
3369     case IBV_WC_WR_FLUSH_ERR:
3370         return "WORK REQUEST FLUSHED ERROR";
3371         break;
3372     case IBV_WC_MW_BIND_ERR:
3373         return "MEMORY WINDOW BIND ERROR";
3374         break;
3375     case IBV_WC_BAD_RESP_ERR:
3376         return "BAD RESPONSE ERROR";
3377         break;
3378     case IBV_WC_LOC_ACCESS_ERR:
3379         return "LOCAL ACCESS ERROR";
3380         break;
3381     case IBV_WC_REM_INV_REQ_ERR:
3382         return "INVALID REQUEST ERROR";
3383         break;
3384     case IBV_WC_REM_ACCESS_ERR:
3385         return "REMOTE ACCESS ERROR";
3386         break;
3387     case IBV_WC_REM_OP_ERR:
3388         return "REMOTE OPERATION ERROR";
3389         break;
3390     case IBV_WC_RETRY_EXC_ERR:
3391         return "RETRY EXCEEDED ERROR";
3392         break;
3393     case IBV_WC_RNR_RETRY_EXC_ERR:
3394         return "RECEIVER NOT READY RETRY EXCEEDED ERROR";
3395         break;
3396     case IBV_WC_LOC_RDD_VIOL_ERR:
3397         return "LOCAL RDD VIOLATION ERROR";
3398         break;
3399     case IBV_WC_REM_INV_RD_REQ_ERR:
3400         return "INVALID READ REQUEST ERROR";
3401         break;
3402     case IBV_WC_REM_ABORT_ERR:
3403         return "REMOTE ABORT ERROR";
3404         break;
3405     case IBV_WC_INV_EECN_ERR:
3406         return "INVALID EECN ERROR";
3407         break;
3408     case IBV_WC_INV_EEC_STATE_ERR:
3409         return "INVALID EEC STATE ERROR";
3410         break;
3411     case IBV_WC_FATAL_ERR:
3412         return "FATAL ERROR";
3413         break;
3414     case IBV_WC_RESP_TIMEOUT_ERR:
3415         return "RESPONSE TIMEOUT ERROR";
3416         break;
3417     case IBV_WC_GENERAL_ERR:
3418         return "GENERAL ERROR";
3419         break;
3420     default:
3421         return "STATUS UNDEFINED";
3422         break;
3423     }
3424 }
3425 
3426 static void
progress_pending_frags_wqe(mca_btl_base_endpoint_t * ep,const int qpn)3427 progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn)
3428 {
3429     int ret;
3430     opal_list_item_t *frag;
3431     mca_btl_openib_qp_t *qp = ep->qps[qpn].qp;
3432 
3433     OPAL_THREAD_LOCK(&ep->endpoint_lock);
3434     for(int i = 0; i < 2; i++) {
3435        while(qp->sd_wqe > 0) {
3436             mca_btl_base_endpoint_t *tmp_ep;
3437             frag = opal_list_remove_first(&ep->qps[qpn].no_wqe_pending_frags[i]);
3438             if(NULL == frag)
3439                 break;
3440             assert(0 == frag->opal_list_item_refcount);
3441             tmp_ep = to_com_frag(frag)->endpoint;
3442             ret = mca_btl_openib_endpoint_post_send(tmp_ep, to_send_frag(frag));
3443             if (OPAL_SUCCESS != ret) {
3444                 /* NTH: this handles retrying if we are out of credits but other errors are not
3445                  * handled (maybe abort?). */
3446                 if (OPAL_ERR_RESOURCE_BUSY != ret) {
3447                     opal_list_prepend (&ep->qps[qpn].no_wqe_pending_frags[i], (opal_list_item_t *) frag);
3448                 }
3449                 break;
3450             }
3451        }
3452     }
3453     OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
3454 }
3455 
/*
 * Send fragments queued on a module's SRQ/XRC pending lists (both
 * priorities, list 0 first) for as long as the QP has send credits.
 * The module lock is held only while a fragment is dequeued, not while
 * it is being sent.
 */
static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl,
        const int qp)
{
    int prio;

    assert(BTL_OPENIB_QP_TYPE_SRQ(qp) || BTL_OPENIB_QP_TYPE_XRC(qp));

    for (prio = 0; prio < 2; ++prio) {
        opal_list_item_t *item = NULL;

        do {
            /* out of SRQ send credits: done with this list */
            if (openib_btl->qps[qp].u.srq_qp.sd_credits <= 0) {
                break;
            }

            OPAL_THREAD_LOCK(&openib_btl->ib_lock);
            item = opal_list_remove_first(
                    &openib_btl->qps[qp].u.srq_qp.pending_frags[prio]);
            OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);

            if (NULL != item) {
                mca_btl_openib_endpoint_send(to_com_frag(item)->endpoint,
                        to_send_frag(item));
            }
        } while (NULL != item);
    }
}
3479 
/* Display names for the completion queues.
   NOTE(review): presumably indexed by the `cq` argument of handle_wc()
   (0 -> "HP CQ", 1 -> "LP CQ") -- its uses are outside this view; confirm. */
static char *cq_name[] = {"HP CQ", "LP CQ"};
handle_wc(mca_btl_openib_device_t * device,const uint32_t cq,struct ibv_wc * wc)3481 static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
3482         struct ibv_wc *wc)
3483 {
3484     static int flush_err_printed[] = {0, 0};
3485     mca_btl_openib_com_frag_t* frag;
3486     mca_btl_base_descriptor_t *des;
3487     mca_btl_openib_endpoint_t* endpoint;
3488     mca_btl_openib_module_t *openib_btl = NULL;
3489     const opal_proc_t* remote_proc = NULL;
3490     int qp, btl_ownership;
3491     int n;
3492 
3493     des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
3494     frag = to_com_frag(des);
3495 
3496     /* For receive fragments "order" contains QP idx the fragment was posted
3497      * to. For send fragments "order" contains QP idx the fragment was send
3498      * through */
3499     qp = des->order;
3500 
3501     if (IBV_WC_RECV == wc->opcode && (wc->wc_flags & IBV_WC_WITH_IMM)) {
3502 #if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
3503         wc->imm_data = ntohl(wc->imm_data);
3504 #endif
3505         frag->endpoint = (mca_btl_openib_endpoint_t*)
3506             opal_pointer_array_get_item(device->endpoints, wc->imm_data);
3507     }
3508 
3509     endpoint = frag->endpoint;
3510 
3511     assert (NULL != endpoint);
3512 
3513     openib_btl = endpoint->endpoint_btl;
3514 
3515     if(wc->status != IBV_WC_SUCCESS) {
3516         OPAL_OUTPUT((-1, "Got WC: ERROR"));
3517         goto error;
3518     }
3519 
3520     /* Handle work completions */
3521     switch(wc->opcode) {
3522         case IBV_WC_RDMA_READ:
3523         case IBV_WC_COMP_SWAP:
3524         case IBV_WC_FETCH_ADD:
3525             OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE"));
3526 
3527             OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
3528 
3529             mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
3530 
3531             /* check if atomic result needs to be byte swapped (mlx5) */
3532             if (openib_btl->atomic_ops_be && IBV_WC_RDMA_READ != wc->opcode) {
3533                 *((int64_t *) frag->sg_entry.addr) = ntoh64 (*((int64_t *) frag->sg_entry.addr));
3534             }
3535 
3536             get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
3537                                get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
3538                                OPAL_SUCCESS);
3539             /* fall through */
3540         case IBV_WC_RDMA_WRITE:
3541             if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
3542                 mca_btl_openib_put_frag_t *put_frag = to_put_frag(des);
3543 
3544                 put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
3545                                    put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data,
3546                                    OPAL_SUCCESS);
3547                 put_frag->cb.func = NULL;
3548             }
3549             /* fall through */
3550         case IBV_WC_SEND:
3551             OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND"));
3552             if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
3553                 opal_list_item_t *i;
3554                 while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
3555                     btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
3556                         to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
3557                                 &to_base_frag(i)->base, OPAL_SUCCESS);
3558                     if( btl_ownership ) {
3559                         mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
3560                     }
3561                 }
3562             }
3563             /* Process a completed send/put/get */
3564             btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
3565             if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
3566                 des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS);
3567             }
3568             if( btl_ownership ) {
3569                 mca_btl_openib_free(&openib_btl->super, des);
3570             }
3571 
3572             /* return send wqe */
3573             qp_put_wqe(endpoint, qp);
3574 
3575             /* return wqes that were sent before this frag */
3576             n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des));
3577 
3578             if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
3579                 OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n);
3580 
3581                 /* new SRQ credit available. Try to progress pending frags*/
3582                 progress_pending_frags_srq(openib_btl, qp);
3583             }
3584             /* new wqe or/and get token available. Try to progress pending frags */
3585             progress_pending_frags_wqe(endpoint, qp);
3586             mca_btl_openib_frag_progress_pending_put_get(endpoint, qp);
3587             break;
3588         case IBV_WC_RECV:
3589             OPAL_OUTPUT((-1, "Got WC: RDMA_RECV, qp %d, src qp %d, WR ID %" PRIx64,
3590                          wc->qp_num, wc->src_qp, wc->wr_id));
3591 
3592             /* Process a RECV */
3593             if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag),
3594                         wc->byte_len) != OPAL_SUCCESS) {
3595                 openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3596                                      NULL, NULL);
3597                 break;
3598             }
3599 
3600             /* decide if it is time to setup an eager rdma channel */
3601             if(!endpoint->eager_rdma_local.base.pval && endpoint->use_eager_rdma &&
3602                     wc->byte_len < mca_btl_openib_component.eager_limit &&
3603                     openib_btl->eager_rdma_channels <
3604                     mca_btl_openib_component.max_eager_rdma &&
3605                     OPAL_THREAD_ADD32(&endpoint->eager_recv_count, 1) ==
3606                     mca_btl_openib_component.eager_rdma_threshold) {
3607                 mca_btl_openib_endpoint_connect_eager_rdma(endpoint);
3608             }
3609             break;
3610         default:
3611             BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode));
3612             if(openib_btl)
3613                 openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3614                                      NULL, NULL);
3615             break;
3616     }
3617 
3618     return;
3619 
3620 error:
3621     if(endpoint->endpoint_proc && endpoint->endpoint_proc->proc_opal)
3622         remote_proc = endpoint->endpoint_proc->proc_opal;
3623 
3624     /* For iWARP, the TCP connection is tied to the QP once the QP is
3625      * in RTS.  And destroying the QP is thus tied to connection
3626      * teardown for iWARP.  To destroy the connection in iWARP you
3627      * must move the QP out of RTS, either into CLOSING for a nice
3628      * graceful close (e.g., via rdma_disconnect()), or to ERROR if
3629      * you want to be rude (e.g., just destroying the QP without
3630      * disconnecting first).  In both cases, all pending non-completed
3631      * SQ and RQ WRs will automatically be flushed.
3632      */
3633 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
3634     if (IBV_WC_WR_FLUSH_ERR == wc->status &&
3635         IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) {
3636         return;
3637     }
3638 #endif
3639 
3640     if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) {
3641         BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
3642                     "status number %d for wr_id %" PRIx64 " opcode %d  vendor error %d qp_idx %d",
3643                     cq_name[cq], btl_openib_component_status_to_string(wc->status),
3644                     wc->status, wc->wr_id,
3645                     wc->opcode, wc->vendor_err, qp));
3646     }
3647 
3648     if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
3649         IBV_WC_RETRY_EXC_ERR == wc->status) {
3650         const char *peer_hostname;
3651         peer_hostname = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
3652         const char *device_name =
3653             ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
3654 
3655         if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) {
3656             // The show_help checker script gets confused if the topic
3657             // is an inline logic check, so separate it into two calls
3658             // to show_help.
3659             if (BTL_OPENIB_QP_TYPE_PP(qp)) {
3660                 opal_show_help("help-mpi-btl-openib.txt",
3661                                "pp rnr retry exceeded",
3662                                true,
3663                                opal_process_info.nodename,
3664                                device_name,
3665                                peer_hostname);
3666             } else {
3667                 opal_show_help("help-mpi-btl-openib.txt",
3668                                "srq rnr retry exceeded",
3669                                true,
3670                                opal_process_info.nodename,
3671                                device_name,
3672                                peer_hostname);
3673             }
3674         } else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
3675             opal_show_help("help-mpi-btl-openib.txt",
3676                            "pp retry exceeded", true,
3677                            opal_process_info.nodename,
3678                            device_name, peer_hostname);
3679         }
3680     }
3681 
3682     if(openib_btl)
3683         openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3684                              (struct opal_proc_t*)remote_proc, NULL);
3685 }
3686 
/**
 * Poll the completion queues of one device and dispatch every work
 * completion that was retrieved to handle_wc().
 *
 * Polling starts at CQ index 0 and only advances to the next CQ once the
 * current one returns no completions.  The loop also stops once the number
 * of non-empty batches taken from the high priority CQ reaches
 * mca_btl_openib_component.cq_poll_progress.
 *
 * @param device  device whose completion queues are polled
 * @param count   number of events already progressed by the caller; it is
 *                incremented once for every non-empty batch polled here
 *
 * @return the updated value of count.  If ibv_poll_cq() reports a failure
 *         the error is logged and the count accumulated so far is returned.
 */
static int poll_device(mca_btl_openib_device_t* device, int count)
{
    struct ibv_wc completions[MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT];
    uint32_t hp_batches = 0;
    int n_done = 0;
    int cq = 0;
    int idx;

    device->pollme = false;

    while (cq < 2 && hp_batches < mca_btl_openib_component.cq_poll_progress) {
        n_done = ibv_poll_cq(device->ib_cq[cq],
                             mca_btl_openib_component.cq_poll_batch,
                             completions);

        if (n_done < 0) {
            BTL_ERROR(("error polling %s with %d errno says %s", cq_name[cq], n_done,
                        strerror(errno)));
            return count;
        }

        if (0 == n_done) {
            /* Skip the low priority CQ when the high priority CQ already
             * produced something, except that the low priority CQ gets one
             * poll for every cq_poll_ratio high priority polls. */
            if (count && device->hp_cq_polls) {
                break;
            }
            ++cq;
            device->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio;
            continue;
        }

        ++count;

        if (BTL_OPENIB_HP_CQ == cq) {
            /* The high priority CQ had work: remember that for the next
             * progress pass and charge this batch against the budgets. */
            device->pollme = true;
            ++hp_batches;
            --device->hp_cq_polls;
        }

        for (idx = 0; idx < n_done; ++idx) {
            handle_wc(device, cq, &completions[idx]);
        }
    }

    return count;
}
3729 
3730 #if OPAL_ENABLE_PROGRESS_THREADS
/**
 * Body of the per-device progress thread.
 *
 * While device->progress is set, the thread waits for a completion queue
 * event on the device's completion channel, re-arms notification on the CQ
 * that fired, acknowledges the event and then drains the device with
 * poll_device() until it reports no more work.
 *
 * @param arg  the opal_thread_t object for this thread; its t_arg member
 *             is the mca_btl_openib_device_t to progress
 *
 * @return PTHREAD_CANCELED once device->progress becomes false
 */
void* mca_btl_openib_progress_thread(opal_object_t* arg)
{
    opal_thread_t* thread = (opal_thread_t*)arg;
    mca_btl_openib_device_t* device = thread->t_arg;
    struct ibv_cq *ev_cq = NULL;
    void *ev_ctx = NULL;

    /* This thread enter in a cancel enabled state */
    pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
    pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );

    opal_output(-1, "WARNING: the openib btl progress thread code *does not yet work*.  Your run is likely to hang, crash, break the kitchen sink, and/or eat your cat.  You have been warned.");

    while (device->progress) {
#if 0
        while(ompi_progress_threads()) {
            while(ompi_progress_threads())
                sched_yield();
            usleep(100); /* give app a chance to re-enter library */
        }
#endif

        if(ibv_get_cq_event(device->ib_channel, &ev_cq, &ev_ctx)) {
            BTL_ERROR(("Failed to get CQ event with error %s",
                        strerror(errno)));
            /* No event was delivered, so ev_cq does not identify a CQ for
             * this iteration: re-arming or acknowledging it would use an
             * invalid (or stale) pointer.  Go back and wait again.
             * NOTE(review): a persistent failure makes this loop spin and
             * log repeatedly - confirm whether it should give up instead. */
            continue;
        }

        if(ibv_req_notify_cq(ev_cq, 0)) {
            BTL_ERROR(("Couldn't request CQ notification with error %s",
                        strerror(errno)));
        }

        /* Exactly one event was obtained above, so acknowledge one. */
        ibv_ack_cq_events(ev_cq, 1);

        /* Drain the device until a pass finds nothing to do. */
        while(poll_device(device, 0));
    }

    return PTHREAD_CANCELED;
}
3768 #endif
3769 
/**
 * Progress one device: first check the head slot of every eager RDMA
 * buffer registered with the device, then (depending on what was found)
 * poll the device's completion queues.
 *
 * @param device  device to progress
 *
 * @return number of eager RDMA fragments handled plus the value returned
 *         by poll_device() when it was called; 0 if handling an incoming
 *         fragment failed (the BTL error callback is invoked first).
 */
static int progress_one_device(mca_btl_openib_device_t *device)
{
    const int num_buffers = device->eager_rdma_buffers_count;
    const uint32_t non_eager_rdma_endpoints =
        device->non_eager_rdma_endpoints + device->pollme;
    mca_btl_openib_recv_frag_t* frag;
    mca_btl_openib_endpoint_t* endpoint;
    int slot, rc;
    int count = 0;

    for (slot = 0; slot < num_buffers; ++slot) {
        endpoint = device->eager_rdma_buffers[slot];
        if (!endpoint) {
            continue;
        }

        OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
        frag = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint,
                endpoint->eager_rdma_local.head);

        if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(frag)) {
            mca_btl_openib_module_t *module = endpoint->endpoint_btl;
            uint32_t frag_len;

            /* Barrier between the check above and reading the footer. */
            opal_atomic_mb();

            if (endpoint->nbo) {
                BTL_OPENIB_FOOTER_NTOH(*frag->ftr);
            }
            frag_len = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
#if OPAL_ENABLE_DEBUG
            if (frag->ftr->seq != endpoint->eager_rdma_local.seq)
                BTL_ERROR(("Eager RDMA wrong SEQ: received %d expected %d",
                           frag->ftr->seq,
                           endpoint->eager_rdma_local.seq));
            endpoint->eager_rdma_local.seq++;
#endif
            /* Advance the head while the lock is still held; everything
             * after the unlock only touches this fragment. */
            MCA_BTL_OPENIB_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.head);
            OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);

            /* The header address is computed backwards from the footer
             * using the size stored in the footer. */
            frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
                frag_len - BTL_OPENIB_FTR_PADDING(frag_len) + sizeof(mca_btl_openib_footer_t));
            to_base_frag(frag)->segment.seg_addr.pval =
                ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);

            rc = btl_openib_handle_incoming(module, to_com_frag(frag)->endpoint,
                    frag, frag_len - sizeof(mca_btl_openib_footer_t));
            if (OPAL_SUCCESS != rc) {
                module->error_cb(&module->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
                return 0;
            }

            ++count;
        } else {
            OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
        }
    }

    --device->eager_rdma_polls;

    /* Poll the CQs when eager RDMA found nothing, when the device has
     * non eager RDMA endpoints or was flagged via pollme, or when the
     * eager_rdma_polls countdown has reached zero. */
    if (0 == count || 0 != non_eager_rdma_endpoints || !device->eager_rdma_polls) {
        /* NOTE(review): poll_device() returns a value that already includes
         * the count passed in, so adding it counts the fragments above
         * twice - confirm callers only care about zero versus non-zero. */
        count += poll_device(device, count);
        device->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio;
    }

    return count;
}
3836 
3837 /*
3838  *  IB component progress.
3839  */
btl_openib_component_progress(void)3840 static int btl_openib_component_progress(void)
3841 {
3842     int i;
3843     int count = 0;
3844 
3845     if(OPAL_UNLIKELY(mca_btl_openib_component.use_async_event_thread &&
3846             mca_btl_openib_component.error_counter)) {
3847         goto error;
3848     }
3849 
3850     for(i = 0; i < mca_btl_openib_component.devices_count; i++) {
3851         mca_btl_openib_device_t *device =
3852             (mca_btl_openib_device_t *) opal_pointer_array_get_item(&mca_btl_openib_component.devices, i);
3853         if (NULL != device) {
3854             count += progress_one_device(device);
3855         }
3856     }
3857 
3858 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
3859     /* Check to see if there are any outstanding dtoh CUDA events that
3860      * have completed.  If so, issue the PML callbacks on the fragments.
3861      * The only thing that gets completed here are asynchronous copies
3862      * so there is no need to free anything.
3863      */
3864     {
3865         int local_count = 0;
3866         mca_btl_base_descriptor_t *frag;
3867         while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) {
3868             OPAL_OUTPUT((-1, "btl_openib: event completed on frag=%p", (void *)frag));
3869             frag->des_cbfunc(NULL, NULL, frag, OPAL_SUCCESS);
3870             local_count++;
3871         }
3872         count += local_count;
3873     }
3874     if (count > 0) {
3875         OPAL_OUTPUT((-1, "btl_openib: DONE with openib progress, count=%d", count));
3876     }
3877 #endif /* OPAL_CUDA_SUPPORT */
3878 
3879     return count;
3880 
3881 error:
3882     /* Set the fatal counter to zero */
3883     mca_btl_openib_component.error_counter = 0;
3884     /* Lets find all error events */
3885     for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
3886         mca_btl_openib_module_t* openib_btl =
3887             mca_btl_openib_component.openib_btls[i];
3888         if(openib_btl->device->got_fatal_event) {
3889             openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3890                                  NULL, NULL);
3891         }
3892         if(openib_btl->device->got_port_event) {
3893             /* These are non-fatal so just ignore it. */
3894             openib_btl->device->got_port_event = false;
3895         }
3896     }
3897     return count;
3898 }
3899 
/**
 * Refill the shared receive queue (SRQ) of a non per-peer QP.
 *
 * Nothing is posted while the number of posted receives is still above
 * the rd_low_local watermark.  Otherwise enough receive fragments are
 * taken from the device's free list to bring the posted count back up to
 * rd_curr_num, chained together and handed to ibv_post_srq_recv() in a
 * single call.  After a successful post, the SRQ limit event is requested
 * again if srq_limit_event_flag is set.
 *
 * The whole operation runs with openib_btl->ib_lock held.
 *
 * @param openib_btl  BTL module owning the SRQ
 * @param qp          index of the QP; must not be a per-peer QP
 *
 * @return OPAL_SUCCESS if nothing needed posting or the post succeeded,
 *         OPAL_ERROR if ibv_post_srq_recv() or ibv_modify_srq() failed.
 */
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
{
    const int low_watermark = openib_btl->qps[qp].u.srq_qp.rd_low_local;
    const int target_depth = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
    struct ibv_recv_wr *bad_wr, *head = NULL, *tail = NULL;
    int result = OPAL_SUCCESS;
    int num_post, idx;

    assert(!BTL_OPENIB_QP_TYPE_PP(qp));

    OPAL_THREAD_LOCK(&openib_btl->ib_lock);

    if (openib_btl->qps[qp].u.srq_qp.rd_posted > low_watermark) {
        goto unlock;
    }

    num_post = target_depth - openib_btl->qps[qp].u.srq_qp.rd_posted;
    if (0 == num_post) {
        goto unlock;
    }

    /* Build a singly linked chain of receive work requests. */
    for (idx = 0; idx < num_post; ++idx) {
        opal_free_list_item_t *item =
            opal_free_list_wait (&openib_btl->device->qps[qp].recv_free);
        struct ibv_recv_wr *desc;

        to_base_frag(item)->base.order = qp;
        to_com_frag(item)->endpoint = NULL;

        desc = &to_recv_frag(item)->rd_desc;
        if (NULL == tail) {
            head = desc;
        } else {
            tail->next = desc;
        }
        tail = desc;
    }
    tail->next = NULL;

    if (OPAL_LIKELY(0 == ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq,
                                           head, &bad_wr))) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);

        if (true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
            struct ibv_srq_attr srq_attr;

            srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
            srq_attr.max_sge = 1;
            srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;

            /* The flag is cleared before the request is issued, so it
             * stays cleared even when ibv_modify_srq() fails. */
            openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
            if (ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) {
                BTL_ERROR(("Failed to request limit event for srq on  %s.  "
                   "Fatal error, stoping asynch event thread",
                   ibv_get_device_name(openib_btl->device->ib_dev)));
                result = OPAL_ERROR;
            }
        }
        goto unlock;
    }

    /* The post failed: count how many requests precede the one that was
     * reported as bad, purely for the error message. */
    idx = 0;
    while (head && head != bad_wr) {
        ++idx;
        head = head->next;
    }

    BTL_ERROR(("error posting receive descriptors to shared receive "
                "queue %d (%d from %d)", qp, idx, num_post));
    result = OPAL_ERROR;

unlock:
    OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
    return result;
}
3968 
3969 
3970 struct mca_btl_openib_event_t {
3971     opal_event_t super;
3972     void *(*fn)(void *);
3973     void *arg;
3974     opal_event_t *event;
3975 };
3976 
3977 typedef struct mca_btl_openib_event_t mca_btl_openib_event_t;
3978 
mca_btl_openib_run_once_cb(int fd,int flags,void * context)3979 static void *mca_btl_openib_run_once_cb (int fd, int flags, void *context)
3980 {
3981     mca_btl_openib_event_t *event = (mca_btl_openib_event_t *) context;
3982     void *ret;
3983 
3984     ret = event->fn (event->arg);
3985     opal_event_del (&event->super);
3986     free (event);
3987     return ret;
3988 }
3989 
mca_btl_openib_run_in_main(void * (* fn)(void *),void * arg)3990 int mca_btl_openib_run_in_main (void *(*fn)(void *), void *arg)
3991 {
3992     mca_btl_openib_event_t *event = malloc (sizeof (mca_btl_openib_event_t));
3993 
3994     if (OPAL_UNLIKELY(NULL == event)) {
3995         return OPAL_ERR_OUT_OF_RESOURCE;
3996     }
3997 
3998     event->fn = fn;
3999     event->arg = arg;
4000 
4001     opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
4002                     mca_btl_openib_run_once_cb, event);
4003 
4004     opal_event_active (&event->super, OPAL_EV_READ, 1);
4005 
4006     return OPAL_SUCCESS;
4007 }
4008