1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2013 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved
14 * Copyright (c) 2006-2015 Mellanox Technologies. All rights reserved.
15 * Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
16 * reserved.
17 * Copyright (c) 2006-2007 Voltaire All rights reserved.
18 * Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
19 * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
20 * Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
21 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
22 * Copyright (c) 2014-2017 Research Organization for Information Science
23 * and Technology (RIST). All rights reserved.
24 * Copyright (c) 2014 Bull SAS. All rights reserved.
25 * $COPYRIGHT$
26 *
27 * Additional copyrights may follow
28 *
29 * $HEADER$
30 */
31
#include "opal_config.h"

#include <infiniband/verbs.h>

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
45
46 #include "opal/mca/memory/memory.h"
47 #include "opal/mca/event/event.h"
48 #include "opal/align.h"
49 #include "opal/util/output.h"
50 #include "opal/util/argv.h"
51 #include "opal/mca/timer/base/base.h"
52 #include "opal/sys/atomic.h"
53 #include "opal/util/sys_limits.h"
54 #include "opal/util/argv.h"
55 #include "opal/memoryhooks/memory.h"
56 /* Define this before including hwloc.h so that we also get the hwloc
57 verbs helper header file, too. We have to do this level of
58 indirection because the hwloc subsystem is a component -- we don't
59 know its exact path. We have to rely on the framework header files
60 to find the right hwloc verbs helper file for us. */
61 #define OPAL_HWLOC_WANT_VERBS_HELPER 1
62 #include "opal/mca/hwloc/hwloc-internal.h"
63 #include "opal/mca/hwloc/base/base.h"
64 #include "opal/mca/installdirs/installdirs.h"
65 #include "opal_stdint.h"
66 #include "opal/util/show_help.h"
67 #include "opal/mca/btl/btl.h"
68 #include "opal/mca/btl/base/base.h"
69 #include "opal/mca/mpool/base/base.h"
70 #include "opal/mca/rcache/rcache.h"
71 #include "opal/mca/rcache/base/base.h"
72 #include "opal/mca/common/cuda/common_cuda.h"
73 #include "opal/mca/common/verbs/common_verbs.h"
74 #include "opal/runtime/opal_params.h"
75 #include "opal/runtime/opal.h"
76 #include "opal/mca/pmix/pmix.h"
77 #include "opal/util/proc.h"
78
79 #include "btl_openib.h"
80 #include "btl_openib_frag.h"
81 #include "btl_openib_endpoint.h"
82 #include "btl_openib_eager_rdma.h"
83 #include "btl_openib_proc.h"
84 #include "btl_openib_ini.h"
85 #include "btl_openib_mca.h"
86 #include "btl_openib_xrc.h"
87 #include "btl_openib_async.h"
88 #include "connect/base.h"
89 #include "btl_openib_ip.h"
90
/* Small tolerance for floating-point comparisons (NOTE(review): use sites
   are not visible in this chunk -- confirm) */
#define EPS 1.e-6
/*
 * Local functions
 */
static int btl_openib_component_register(void);
static int btl_openib_component_open(void);
static int btl_openib_component_close(void);
static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
static int btl_openib_component_progress(void);
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
                                                  mca_btl_openib_endpoint_t *ep,
                                                  mca_btl_base_descriptor_t* des,
                                                  int status);
#endif /* OPAL_CUDA_SUPPORT */
/*
 * Local variables
 */
/* NOTE(review): presumably remembers which device supplied a non-default
   receive_queues value so later devices can be checked for conflicts --
   confirm at use sites (not visible in this chunk) */
static mca_btl_openib_device_t *receive_queues_device = NULL;
/* Count of devices deliberately skipped (e.g. via include/exclude
   filtering); NOTE(review): confirm exact increment sites elsewhere */
static int num_devices_intentionally_ignored = 0;
111
/* Component-global singleton: function table and metadata through which
   the MCA framework drives the openib BTL (open/close/register lifecycle,
   init and progress entry points). */
mca_btl_openib_component_t mca_btl_openib_component = {
    .super = {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */

        .btl_version = {
            MCA_BTL_DEFAULT_VERSION("openib"),
            .mca_open_component = btl_openib_component_open,
            .mca_close_component = btl_openib_component_close,
            .mca_register_component_params = btl_openib_component_register,
        },
        .btl_data = {
            /* The component is checkpoint ready */
            .param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        /* Component entry points: module discovery and event progress */
        .btl_init = btl_openib_component_init,
        .btl_progress = btl_openib_component_progress,
    }
};
132
btl_openib_component_register(void)133 static int btl_openib_component_register(void)
134 {
135 int ret;
136
137 /* register IB component parameters */
138 if (OPAL_SUCCESS != (ret = btl_openib_register_mca_params())) {
139 return ret;
140 }
141
142 mca_btl_openib_component.max_send_size =
143 mca_btl_openib_module.super.btl_max_send_size;
144 mca_btl_openib_component.eager_limit =
145 mca_btl_openib_module.super.btl_eager_limit;
146
147 /* if_include and if_exclude need to be mutually exclusive */
148 if (OPAL_SUCCESS !=
149 mca_base_var_check_exclusive("ompi",
150 mca_btl_openib_component.super.btl_version.mca_type_name,
151 mca_btl_openib_component.super.btl_version.mca_component_name,
152 "if_include",
153 mca_btl_openib_component.super.btl_version.mca_type_name,
154 mca_btl_openib_component.super.btl_version.mca_component_name,
155 "if_exclude")) {
156 /* Return ERR_NOT_AVAILABLE so that a warning message about
157 "open" failing is not printed */
158 return OPAL_ERR_NOT_AVAILABLE;
159 }
160
161 #if OPAL_CUDA_SUPPORT
162 mca_common_cuda_register_mca_variables();
163 #endif
164
165 return OPAL_SUCCESS;
166 }
167
168 /*
169 * Called by MCA framework to open the component
170 */
btl_openib_component_open(void)171 static int btl_openib_component_open(void)
172 {
173 opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock;
174 opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table;
175
176 /* Construct hash table that stores pointers to SRQs */
177 OBJ_CONSTRUCT(lock, opal_mutex_t);
178 OBJ_CONSTRUCT(srq_addr_table, opal_hash_table_t);
179
180 /* initialize state */
181 mca_btl_openib_component.ib_num_btls = 0;
182 mca_btl_openib_component.num_default_gid_btls = 0;
183 mca_btl_openib_component.openib_btls = NULL;
184 OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
185 mca_btl_openib_component.devices_count = 0;
186 mca_btl_openib_component.cpc_explicitly_defined = false;
187
188 /* initialize objects */
189 OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
190 mca_btl_openib_component.memory_registration_verbose = -1;
191
192 #if OPAL_CUDA_SUPPORT
193 mca_common_cuda_stage_one_init();
194 #endif /* OPAL_CUDA_SUPPORT */
195
196 return OPAL_SUCCESS;
197 }
198
199 /*
200 * component cleanup - sanity checking of queue lengths
201 */
202
btl_openib_component_close(void)203 static int btl_openib_component_close(void)
204 {
205 int rc = OPAL_SUCCESS;
206
207 /* remove the async event from the event base */
208 mca_btl_openib_async_fini ();
209
210 OBJ_DESTRUCT(&mca_btl_openib_component.srq_manager.lock);
211 OBJ_DESTRUCT(&mca_btl_openib_component.srq_manager.srq_addr_table);
212
213 opal_btl_openib_connect_base_finalize();
214 opal_btl_openib_ini_finalize();
215
216 if (NULL != mca_btl_openib_component.default_recv_qps) {
217 free(mca_btl_openib_component.default_recv_qps);
218 }
219
220 /* close memory registration debugging output */
221 opal_output_close (mca_btl_openib_component.memory_registration_verbose);
222
223 #if OPAL_CUDA_SUPPORT
224 mca_common_cuda_fini();
225 #endif /* OPAL_CUDA_SUPPORT */
226
227 return rc;
228 }
229
/**
 * Append a single byte to a packed message buffer.
 *
 * @param dest  [in/out] pointer to the current write cursor; on return it
 *              has been advanced by one byte
 * @param value the byte to store at the cursor
 */
static inline void pack8(char **dest, uint8_t value)
{
    /* Copy one byte */
    **dest = (char) value;
    /* Move the dest cursor ahead one */
    ++*dest;
}
237
238 /*
239 * Register local openib port information with the modex so that it
240 * can be shared with all other peers.
241 */
btl_openib_modex_send(void)242 static int btl_openib_modex_send(void)
243 {
244 int rc, i, j;
245 int modex_message_size;
246 char *message, *offset;
247 size_t size, msg_size;
248 opal_btl_openib_connect_base_module_t *cpc;
249
250 opal_output(-1, "Starting to modex send");
251 if (0 == mca_btl_openib_component.ib_num_btls) {
252 return 0;
253 }
254 modex_message_size = offsetof(mca_btl_openib_modex_message_t, end);
255
256 /* The message is packed into multiple parts:
257 * 1. a uint8_t indicating the number of modules (ports) in the message
258 * 2. for each module:
259 * a. the common module data
260 * b. a uint8_t indicating how many CPCs follow
261 * c. for each CPC:
262 * a. a uint8_t indicating the index of the CPC in the all[]
263 * array in btl_openib_connect_base.c
264 * b. a uint8_t indicating the priority of this CPC
265 * c. a uint8_t indicating the length of the blob to follow
266 * d. a blob that is only meaningful to that CPC
267 */
268 msg_size =
269 /* uint8_t for number of modules in the message */
270 1 +
271 /* For each module: */
272 mca_btl_openib_component.ib_num_btls *
273 (
274 /* Common module data */
275 modex_message_size +
276 /* uint8_t for how many CPCs follow */
277 1
278 );
279 /* For each module, add in the size of the per-CPC data */
280 for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
281 for (j = 0;
282 j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
283 ++j) {
284 msg_size +=
285 /* uint8_t for the index of the CPC */
286 1 +
287 /* uint8_t for the CPC's priority */
288 1 +
289 /* uint8_t for the blob length */
290 1 +
291 /* blob length */
292 mca_btl_openib_component.openib_btls[i]->cpcs[j]->data.cbm_modex_message_len;
293 }
294 }
295 message = (char *) malloc(msg_size);
296 if (NULL == message) {
297 BTL_ERROR(("Failed malloc"));
298 return OPAL_ERR_OUT_OF_RESOURCE;
299 }
300
301 /* Pack the number of modules */
302 offset = message;
303 pack8(&offset, mca_btl_openib_component.ib_num_btls);
304 opal_output(-1, "modex sending %d btls (packed: %d, offset now at %d)", mca_btl_openib_component.ib_num_btls, *((uint8_t*) message), (int) (offset - message));
305
306 /* Pack each of the modules */
307 for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
308
309 /* Pack the modex common message struct. */
310 size = modex_message_size;
311
312 (mca_btl_openib_component.openib_btls[i]->port_info).vendor_id =
313 (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id;
314
315 (mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id =
316 (mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id;
317
318 (mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
319 mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
320
321 memcpy(offset,
322 &(mca_btl_openib_component.openib_btls[i]->port_info),
323 size);
324 opal_output(-1, "modex packed btl port modex message: 0x%" PRIx64 ", %d, %d (size: %d)",
325 mca_btl_openib_component.openib_btls[i]->port_info.subnet_id,
326 mca_btl_openib_component.openib_btls[i]->port_info.mtu,
327 mca_btl_openib_component.openib_btls[i]->port_info.lid,
328 (int) size);
329
330 #if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
331 MCA_BTL_OPENIB_MODEX_MSG_HTON(*(mca_btl_openib_modex_message_t *)offset);
332 #endif
333 offset += size;
334 opal_output(-1, "modex packed btl %d: modex message, offset now %d",
335 i, (int) (offset -message));
336
337 /* Pack the number of CPCs that follow */
338 pack8(&offset,
339 mca_btl_openib_component.openib_btls[i]->num_cpcs);
340 opal_output(-1, "modex packed btl %d: to pack %d cpcs (packed: %d, offset now %d)",
341 i, mca_btl_openib_component.openib_btls[i]->num_cpcs,
342 *((uint8_t*) (offset - 1)), (int) (offset-message));
343
344 /* Pack each CPC */
345 for (j = 0;
346 j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
347 ++j) {
348 uint8_t u8;
349
350 cpc = mca_btl_openib_component.openib_btls[i]->cpcs[j];
351 opal_output(-1, "modex packed btl %d: packing cpc %s",
352 i, cpc->data.cbm_component->cbc_name);
353 /* Pack the CPC index */
354 u8 = opal_btl_openib_connect_base_get_cpc_index(cpc->data.cbm_component);
355 pack8(&offset, u8);
356 opal_output(-1, "packing btl %d: cpc %d: index %d (packed %d, offset now %d)",
357 i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message));
358 /* Pack the CPC priority */
359 pack8(&offset, cpc->data.cbm_priority);
360 opal_output(-1, "packing btl %d: cpc %d: priority %d (packed %d, offset now %d)",
361 i, j, cpc->data.cbm_priority, *((uint8_t*) (offset-1)), (int)(offset-message));
362 /* Pack the blob length */
363 u8 = cpc->data.cbm_modex_message_len;
364 pack8(&offset, u8);
365 opal_output(-1, "packing btl %d: cpc %d: message len %d (packed %d, offset now %d)",
366 i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message));
367 /* If the blob length is > 0, pack the blob */
368 if (u8 > 0) {
369 memcpy(offset, cpc->data.cbm_modex_message, u8);
370 offset += u8;
371 opal_output(-1, "packing btl %d: cpc %d: blob packed %d %x (offset now %d)",
372 i, j,
373 ((uint32_t*)cpc->data.cbm_modex_message)[0],
374 ((uint32_t*)cpc->data.cbm_modex_message)[1],
375 (int)(offset-message));
376 }
377
378 /* Sanity check */
379 assert((size_t) (offset - message) <= msg_size);
380 }
381 }
382
383 /* All done -- send it! */
384 OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
385 &mca_btl_openib_component.super.btl_version,
386 message, msg_size);
387 free(message);
388 opal_output(-1, "Modex sent! %d calculated, %d actual\n", (int) msg_size, (int) (offset - message));
389
390 return rc;
391 }
392
/*
 * Active Message Callback function on control message.
 *
 * Dispatches one incoming MCA_BTL_TAG_IB control fragment based on the
 * type field of its control header:
 *   - CREDITS:   must not arrive here (handled elsewhere) -- asserts
 *   - RDMA:      records the peer's eager-RDMA region (rkey/base/tokens)
 *   - COALESCED: walks the coalesced sub-fragments in the payload and
 *                re-triggers the registered callback for each one
 *   - CTS:       marks CTS received; if our receives are posted, sends
 *                our own CTS (once) and marks the endpoint connected
 */
static void btl_openib_control(mca_btl_base_module_t* btl,
                               mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des,
                               void* cbdata)
{
    /* don't return credits used for control messages */
    mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
    mca_btl_openib_endpoint_t* ep = to_com_frag(des)->endpoint;
    mca_btl_openib_control_header_t *ctl_hdr =
        (mca_btl_openib_control_header_t *) to_base_frag(des)->segment.seg_addr.pval;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    /* A coalesced payload, when present, begins immediately after the
       control header */
    mca_btl_openib_header_coalesced_t *clsc_hdr =
        (mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1);
    mca_btl_active_message_callback_t* reg;
    /* Number of payload bytes remaining after the control header */
    size_t len = des->des_segments->seg_len - sizeof(*ctl_hdr);

    switch (ctl_hdr->type) {
    case MCA_BTL_OPENIB_CONTROL_CREDITS:
        assert(0); /* Credit message is handled elsewhere */
        break;
    case MCA_BTL_OPENIB_CONTROL_RDMA:
        rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr;

        BTL_VERBOSE(("prior to NTOH received rkey %" PRIu32
                     ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32,
                     rdma_hdr->rkey,
                     rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval,
                     rdma_hdr->rdma_start.ival
                     ));

        /* Peer uses the opposite byte order: fix up the header in place */
        if(ep->nbo) {
            BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(*rdma_hdr);
        }

        BTL_VERBOSE(("received rkey %" PRIu32
                     ", rdma_start.lval %" PRIx64 ", pval %p,"
                     " ival %" PRIu32, rdma_hdr->rkey,
                     rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival));

        /* A non-NULL remote base means we already processed an eager-RDMA
           connect from this peer; a second one is a protocol error */
        if (ep->eager_rdma_remote.base.pval) {
            BTL_ERROR(("Got RDMA connect twice!"));
            return;
        }
        ep->eager_rdma_remote.rkey = rdma_hdr->rkey;
        ep->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval;
        ep->eager_rdma_remote.tokens=mca_btl_openib_component.eager_rdma_num - 1;
        break;
    case MCA_BTL_OPENIB_CONTROL_COALESCED:
        {
            size_t pad = 0;
            while(len > 0) {
                size_t skip;
                mca_btl_openib_header_coalesced_t* unalign_hdr = 0;
                mca_btl_base_descriptor_t tmp_des;
                mca_btl_base_segment_t tmp_seg;

                assert(len >= sizeof(*clsc_hdr));

                /* Fix byte order of this sub-fragment's header, in place */
                if(ep->nbo)
                    BTL_OPENIB_HEADER_COALESCED_NTOH(*clsc_hdr);

                /* Bytes this sub-fragment occupies: its header plus its
                   allocated payload, minus the alignment padding already
                   accounted for by the previous iteration */
                skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size - pad);

                /* Present the payload to the callback as an ordinary
                   single-segment descriptor */
                tmp_des.des_segments = &tmp_seg;
                tmp_des.des_segment_count = 1;
                tmp_seg.seg_addr.pval = clsc_hdr + 1;
                tmp_seg.seg_len = clsc_hdr->size;

                /* call registered callback */
                reg = mca_btl_base_active_message_trigger + clsc_hdr->tag;
                reg->cbfunc( &obtl->super, clsc_hdr->tag, &tmp_des, reg->cbdata );
                len -= (skip + pad);
                /* Advance to the next (possibly unaligned) header and
                   compute the padding needed to realign it */
                unalign_hdr = (mca_btl_openib_header_coalesced_t*)
                    ((unsigned char*)clsc_hdr + skip);
                pad = (size_t)BTL_OPENIB_COALESCE_HDR_PADDING(unalign_hdr);
                clsc_hdr = (mca_btl_openib_header_coalesced_t*)((unsigned char*)unalign_hdr +
                                                                pad);
            }
        }
        break;
    case MCA_BTL_OPENIB_CONTROL_CTS:
        OPAL_OUTPUT((-1, "received CTS from %s (buffer %p): posted recvs %d, sent cts %d",
                     opal_get_proc_hostname(ep->endpoint_proc->proc_opal),
                     (void*) ctl_hdr,
                     ep->endpoint_posted_recvs, ep->endpoint_cts_sent));
        ep->endpoint_cts_received = true;

        /* Only send the CTS back and mark connected if:
           - we have posted our receives (it's possible that we can
             get this CTS before this side's CPC has called
             cpc_complete())
           - we have not yet sent our CTS

           We don't even want to mark the endpoint connected() until
           we have posted our receives because otherwise we will
           trigger credit management (because the rd_credits will
           still be negative), and Bad Things will happen. */
        if (ep->endpoint_posted_recvs) {
            /* need to hold to lock for both send_cts and connected.
               NOTE(review): no matching OPAL_THREAD_UNLOCK appears in this
               function -- presumably mca_btl_openib_endpoint_connected()
               releases endpoint_lock; confirm in btl_openib_endpoint.c */
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            if (!ep->endpoint_cts_sent) {
                mca_btl_openib_endpoint_send_cts(ep);
            }
            mca_btl_openib_endpoint_connected(ep);
        }
        break;
    default:
        BTL_ERROR(("Unknown message type received by BTL"));
        break;
    }
}
509
/*
 * rcache callback: register the memory range [base, base+size) with the
 * device's protection domain.  Translates rcache access flags into verbs
 * access flags and enforces the per-device registered-memory cap
 * (mem_reg_max).
 *
 * @param reg_data the owning mca_btl_openib_device_t (opaque to rcache)
 * @param base     start of the range to register
 * @param size     number of bytes to register
 * @param reg      rcache registration record; lkey/rkey are filled in
 *
 * @return OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE when the cap is hit
 *         or ibv_reg_mr() fails.
 */
static int openib_reg_mr (void *reg_data, void *base, size_t size,
                          mca_rcache_base_registration_t *reg)
{
    mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data;
    mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
    enum ibv_access_flags access_flag = 0;

    if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) {
        access_flag |= IBV_ACCESS_REMOTE_READ;
    }

    if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_WRITE) {
        /* verbs requires LOCAL_WRITE whenever REMOTE_WRITE is granted */
        access_flag |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
    }

    if (reg->access_flags & MCA_RCACHE_ACCESS_LOCAL_WRITE) {
        access_flag |= IBV_ACCESS_LOCAL_WRITE;
    }

#if HAVE_DECL_IBV_ATOMIC_HCA
    if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_ATOMIC) {
        access_flag |= IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_LOCAL_WRITE;
    }
#endif

    /* Enforce the configured cap on total registered memory.
       NOTE(review): this check-then-increment is not atomic; confirm the
       rcache serializes registration calls per device. */
    if (device->mem_reg_max &&
        device->mem_reg_max < (device->mem_reg_active + size)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    device->mem_reg_active += size;

#if HAVE_DECL_IBV_ACCESS_SO
    if (reg->flags & MCA_RCACHE_FLAGS_SO_MEM) {
        access_flag |= IBV_ACCESS_SO;
    }
#endif

    openib_reg->mr = ibv_reg_mr(device->ib_pd, base, size, access_flag);

    if (NULL == openib_reg->mr) {
        /* Roll back the accounting done above: openib_dereg_mr() is never
           called for a failed registration, so leaving the increment in
           place would permanently leak quota from mem_reg_active. */
        device->mem_reg_active -= size;
        OPAL_OUTPUT_VERBOSE((5, mca_btl_openib_component.memory_registration_verbose,
                             "ibv_reg_mr() failed: base=%p, bound=%p, size=%d, flags=0x%x, errno=%d",
                             reg->base, reg->bound, (int) (reg->bound - reg->base + 1), reg->flags, errno));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Export the keys callers need to build RDMA work requests */
    openib_reg->btl_handle.lkey = openib_reg->mr->lkey;
    openib_reg->btl_handle.rkey = openib_reg->mr->rkey;

    OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
                         "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound,
                         (int) (reg->bound - reg->base + 1), reg->flags));

#if OPAL_CUDA_SUPPORT
    if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) {
        mca_common_cuda_register (base, size,
            openib_reg->base.rcache->rcache_component->rcache_version.mca_component_name);
    }
#endif

    return OPAL_SUCCESS;
}
573
/*
 * rcache callback: release a memory registration created by
 * openib_reg_mr() and update the device's registered-byte accounting.
 *
 * @param reg_data the owning mca_btl_openib_device_t
 * @param reg      the registration record to tear down
 *
 * @return OPAL_SUCCESS, or OPAL_ERROR if ibv_dereg_mr() fails (in which
 *         case the accounting is left untouched).
 */
static int openib_dereg_mr(void *reg_data, mca_rcache_base_registration_t *reg)
{
    mca_btl_openib_device_t *dev = (mca_btl_openib_device_t *) reg_data;
    mca_btl_openib_reg_t *ib_reg = (mca_btl_openib_reg_t *) reg;

    OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose,
                         "openib_dereg_mr: base=%p, bound=%p, size=%d, flags=0x%x",
                         reg->base, reg->bound,
                         (int) (reg->bound - reg->base + 1), reg->flags));

    if (NULL != ib_reg->mr) {
        if (ibv_dereg_mr(ib_reg->mr)) {
            BTL_ERROR(("%s: error unpinning openib memory errno says %s",
                       __func__, strerror(errno)));
            return OPAL_ERROR;
        }

#if OPAL_CUDA_SUPPORT
        if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) {
            mca_common_cuda_unregister(ib_reg->base.base,
                ib_reg->base.rcache->rcache_component->rcache_version.mca_component_name);
        }
#endif
    }

    /* Account the bytes back against the per-device cap */
    dev->mem_reg_active -= (uint64_t) (reg->bound - reg->base + 1);

    ib_reg->mr = NULL;
    return OPAL_SUCCESS;
}
604
param_register_uint(const char * param_name,unsigned int default_value,unsigned int * storage)605 static inline int param_register_uint(const char* param_name, unsigned int default_value, unsigned int *storage)
606 {
607 *storage = default_value;
608 (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
609 param_name, NULL, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
610 NULL, 0, 0, OPAL_INFO_LVL_9,
611 MCA_BASE_VAR_SCOPE_READONLY, storage);
612 return *storage;
613 }
614
/*
 * Create one BTL module for each (LID, btls_per_lid) combination on a
 * single active device port and append the modules to btl_list.
 *
 * @param btl_list     list that newly created modules are appended to
 * @param device       the opened verbs device owning the port
 * @param port_num     1-based port number on the device
 * @param pkey_index   pkey index the new modules will use
 * @param ib_port_attr attributes previously queried for this port
 *
 * @return OPAL_SUCCESS on success; OPAL_ERR_VALUE_OUT_OF_BOUNDS when the
 *         configured maximum number of BTLs is reached (reserved for
 *         exactly that case); OPAL_ERR_NOT_FOUND / OPAL_ERR_UNREACH /
 *         OPAL_ERROR / OPAL_ERR_OUT_OF_RESOURCE on various failures.
 */
static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
                         uint8_t port_num, uint16_t pkey_index,
                         struct ibv_port_attr *ib_port_attr)
{
    uint16_t lid, i, lmc, lmc_step;
    mca_btl_openib_module_t *openib_btl;
    mca_btl_base_selected_module_t *ib_selected;
    union ibv_gid gid;
    uint64_t subnet_id;

    /* Ensure that the requested GID index (via the
       btl_openib_gid_index MCA param) is within the GID table
       size. */
    if (mca_btl_openib_component.gid_index >
        ib_port_attr->gid_tbl_len) {
        opal_show_help("help-mpi-btl-openib.txt", "gid index too large",
                       true, opal_process_info.nodename,
                       ibv_get_device_name(device->ib_dev), port_num,
                       mca_btl_openib_component.gid_index,
                       ib_port_attr->gid_tbl_len);
        return OPAL_ERR_NOT_FOUND;
    }
    BTL_VERBOSE(("looking for %s:%d GID index %d",
                 ibv_get_device_name(device->ib_dev), port_num,
                 mca_btl_openib_component.gid_index));

    /* If we have struct ibv_device.transport_type, then we're >= OFED
       v1.2, and the transport could be iWarp or IB.  If we don't have
       that member, then we're < OFED v1.2, and it can only be IB. */
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
    if (IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) {
        subnet_id = mca_btl_openib_get_ip_subnet_id(device->ib_dev, port_num);
        BTL_VERBOSE(("my iWARP subnet_id is %016" PRIx64, subnet_id));
    } else {
        memset(&gid, 0, sizeof(gid));
        if (0 != ibv_query_gid(device->ib_dev_context, port_num,
                               mca_btl_openib_component.gid_index, &gid)) {
            BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n",
                       ibv_get_device_name(device->ib_dev), port_num,
                       mca_btl_openib_component.gid_index));
            return OPAL_ERR_NOT_FOUND;
        }

#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
        /* RoCE ports derive the subnet from the IP configuration unless
           routable RoCE is enabled (then all ports share subnet 0) */
        if (IBV_LINK_LAYER_ETHERNET == ib_port_attr->link_layer) {
            subnet_id = mca_btl_openib_component.rroce_enable ? 0 :
                mca_btl_openib_get_ip_subnet_id(device->ib_dev, port_num);
        } else {
            subnet_id = ntoh64(gid.global.subnet_prefix);
        }
#else
        subnet_id = ntoh64(gid.global.subnet_prefix);
#endif

        BTL_VERBOSE(("my IB subnet_id for HCA %s port %d is %016" PRIx64,
                     ibv_get_device_name(device->ib_dev), port_num, subnet_id));
    }
#else
    if (0 != ibv_query_gid(device->ib_dev_context, port_num,
                           mca_btl_openib_component.gid_index, &gid)) {
        BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n",
                   ibv_get_device_name(device->ib_dev), port_num,
                   mca_btl_openib_component.gid_index));
        return OPAL_ERR_NOT_FOUND;
    }
    subnet_id = ntoh64(gid.global.subnet_prefix);
    BTL_VERBOSE(("my IB-only subnet_id for HCA %s port %d is %016" PRIx64,
                 ibv_get_device_name(device->ib_dev), port_num, subnet_id));
#endif

    /* Warn (once per run, effectively) if ports are still on the factory
       default subnet prefix */
    if(mca_btl_openib_component.num_default_gid_btls > 0 &&
       IB_DEFAULT_GID_PREFIX == subnet_id &&
       mca_btl_openib_component.warn_default_gid_prefix) {
        opal_show_help("help-mpi-btl-openib.txt", "default subnet prefix",
                       true, opal_process_info.nodename);
    }

    if (IB_DEFAULT_GID_PREFIX == subnet_id) {
        mca_btl_openib_component.num_default_gid_btls++;
    }

    /* The port's LMC gives it 2^lmc consecutive LIDs; honor the
       user-configured cap */
    lmc = (1 << ib_port_attr->lmc);
    lmc_step = 1;

    if (0 != mca_btl_openib_component.max_lmc &&
        mca_btl_openib_component.max_lmc < lmc) {
        lmc = mca_btl_openib_component.max_lmc;
    }

    /* APM support -- only meaningful if async event support is
       enabled.  If async events are not enabled, then there's nothing
       to listen for the APM event to load the new path, so it's not
       worth enabling APM.  */
    if (lmc > 1){
        if (-1 == mca_btl_openib_component.apm_lmc) {
            lmc_step = lmc;
            mca_btl_openib_component.apm_lmc = lmc - 1;
        } else if (0 == lmc % (mca_btl_openib_component.apm_lmc + 1)) {
            lmc_step = mca_btl_openib_component.apm_lmc + 1;
        } else {
            opal_show_help("help-mpi-btl-openib.txt", "apm with wrong lmc",true,
                           mca_btl_openib_component.apm_lmc, lmc);
            return OPAL_ERROR;
        }
    } else {
        if (mca_btl_openib_component.apm_lmc) {
            /* Disable apm and report warning */
            mca_btl_openib_component.apm_lmc = 0;
            opal_show_help("help-mpi-btl-openib.txt", "apm without lmc",true);
        }
    }

    /* One module per (LID, btls_per_lid slot) pair */
    for(lid = ib_port_attr->lid;
        lid < ib_port_attr->lid + lmc; lid += lmc_step){
        for(i = 0; i < mca_btl_openib_component.btls_per_lid; i++){
            char param[40];

            openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t));
            if(NULL == openib_btl) {
                BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
            /* Start from the template module, then customize per-port */
            memcpy(openib_btl, &mca_btl_openib_module,
                   sizeof(mca_btl_openib_module));
            memcpy(&openib_btl->ib_port_attr, ib_port_attr,
                   sizeof(struct ibv_port_attr));
            ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
            ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
            openib_btl->device = device;
            openib_btl->port_num = (uint8_t) port_num;
            openib_btl->pkey_index = pkey_index;
            openib_btl->lid = lid;
            openib_btl->apm_port = 0;
            openib_btl->src_path_bits = lid - ib_port_attr->lid;

            openib_btl->port_info.subnet_id = subnet_id;
            openib_btl->port_info.mtu = device->mtu;
            openib_btl->port_info.lid = lid;

            openib_btl->cpcs = NULL;
            openib_btl->num_cpcs = 0;
            openib_btl->local_procs = 0;

            mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
            mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;

            /* Clamp get/put limits to what the port can actually move in
               one message */
            if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) {
                openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz;
            }

            openib_btl->super.btl_get_alignment = 0;

            if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) {
                openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz;
            }

            openib_btl->super.btl_put_local_registration_threshold = openib_btl->device->max_inline_data;
            openib_btl->super.btl_get_local_registration_threshold = 0;

#if HAVE_DECL_IBV_ATOMIC_HCA
            openib_btl->atomic_ops_be = false;

#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXT_ATOM
            /* check that 8-byte atomics are supported */
            if (!(device->ib_exp_dev_attr.ext_atom.log_atomic_arg_sizes & (1<<3ull))) {
                openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
                openib_btl->super.btl_atomic_flags = 0;
                openib_btl->super.btl_atomic_fop = NULL;
                openib_btl->super.btl_atomic_cswap = NULL;
            }
#endif

#ifdef HAVE_STRUCT_IBV_EXP_DEVICE_ATTR_EXP_ATOMIC_CAP
            switch (openib_btl->device->ib_exp_dev_attr.exp_atomic_cap)
#else
            switch (openib_btl->device->ib_dev_attr.atomic_cap)
#endif
            {
            case IBV_ATOMIC_GLOB:
                openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
                break;
#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
            case IBV_EXP_ATOMIC_HCA_REPLY_BE:
                openib_btl->atomic_ops_be = true;
                break;
#endif
            case IBV_ATOMIC_HCA:
                break;
            case IBV_ATOMIC_NONE:
            default:
                /* no atomics or an unsupported atomic type */
                openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
                openib_btl->super.btl_atomic_flags = 0;
                openib_btl->super.btl_atomic_fop = NULL;
                openib_btl->super.btl_atomic_cswap = NULL;
            }
#endif

            openib_btl->super.btl_put_alignment = 0;

            openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t);

            /* Use snprintf instead of sprintf throughout: param is only 40
               bytes and the device name is externally controlled, so an
               unbounded sprintf could overflow the stack buffer */

            /* Check bandwidth configured for this device */
            snprintf(param, sizeof(param), "bandwidth_%s",
                     ibv_get_device_name(device->ib_dev));
            param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth);

            /* Check bandwidth configured for this device/port */
            snprintf(param, sizeof(param), "bandwidth_%s:%d",
                     ibv_get_device_name(device->ib_dev), port_num);
            param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth);

            /* Check bandwidth configured for this device/port/LID */
            snprintf(param, sizeof(param), "bandwidth_%s:%d:%d",
                     ibv_get_device_name(device->ib_dev), port_num, lid);
            param_register_uint(param, openib_btl->super.btl_bandwidth, &openib_btl->super.btl_bandwidth);

            /* Check latency configured for this device */
            snprintf(param, sizeof(param), "latency_%s",
                     ibv_get_device_name(device->ib_dev));
            param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency);

            /* Check latency configured for this device/port */
            snprintf(param, sizeof(param), "latency_%s:%d",
                     ibv_get_device_name(device->ib_dev), port_num);
            param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency);

            /* Check latency configured for this device/port/LID */
            snprintf(param, sizeof(param), "latency_%s:%d:%d",
                     ibv_get_device_name(device->ib_dev), port_num, lid);
            param_register_uint(param, openib_btl->super.btl_latency, &openib_btl->super.btl_latency);

            /* Auto-detect the port bandwidth */
            if (0 == openib_btl->super.btl_bandwidth) {
                if (OPAL_SUCCESS !=
                    opal_common_verbs_port_bw(ib_port_attr,
                                              &openib_btl->super.btl_bandwidth)) {
                    /* If we can't figure out the bandwidth, declare
                       this port unreachable (do not* return
                       ERR_VALUE_OF_OUT_OF_BOUNDS; that is reserved
                       for when we exceed the number of allowable
                       BTLs). */
                    return OPAL_ERR_UNREACH;
                }
            }

            opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
            opal_pointer_array_add(device->device_btls, (void*) openib_btl);
            ++device->btls;
            ++mca_btl_openib_component.ib_num_btls;
            if (-1 != mca_btl_openib_component.ib_max_btls &&
                mca_btl_openib_component.ib_num_btls >=
                mca_btl_openib_component.ib_max_btls) {
                return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
            }
        }
    }

    return OPAL_SUCCESS;
}
873
/* Object-class constructor for mca_btl_openib_device_t: put every
   field into a safe "empty" state.  Real resources (device context,
   PD, CQs, QPs, ...) are attached later during device setup. */
static void device_construct(mca_btl_openib_device_t *device)
{
    device->ib_dev = NULL;
    device->ib_dev_context = NULL;
    device->ib_pd = NULL;
    device->mpool = NULL;
    device->rcache = NULL;
#if OPAL_ENABLE_PROGRESS_THREADS == 1
    device->ib_channel = NULL;
#endif
    device->btls = 0;
    device->endpoints = NULL;
    device->device_btls = NULL;
    /* separate high-priority and low-priority completion queues */
    device->ib_cq[BTL_OPENIB_HP_CQ] = NULL;
    device->ib_cq[BTL_OPENIB_LP_CQ] = NULL;
    device->cq_size[BTL_OPENIB_HP_CQ] = 0;
    device->cq_size[BTL_OPENIB_LP_CQ] = 0;
    device->non_eager_rdma_endpoints = 0;
    /* poll ratios are seeded from component-level MCA parameters */
    device->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio;
    device->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio;
    device->pollme = true;
    device->eager_rdma_buffers_count = 0;
    device->eager_rdma_buffers = NULL;
#if HAVE_XRC
    device->xrc_fd = -1;   /* no XRC domain file opened yet */
#endif
    device->qps = NULL;
    OBJ_CONSTRUCT(&device->device_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&device->send_free_control, opal_free_list_t);
    device->max_inline_data = 0;
    device->ready_for_use = false;
}
906
/*
 * Object-class destructor for mca_btl_openib_device_t.  Releases all
 * resources owned by the device in dependency order: progress/async
 * threads first, then eager-RDMA buffers, QP free lists, CQs, the
 * registration cache, the XRC domain, the protection domain, and
 * finally the device context itself.  On any verbs failure we bail
 * out and just log -- there is no meaningful recovery at teardown.
 */
static void device_destruct(mca_btl_openib_device_t *device)
{
    int i;

#if OPAL_ENABLE_PROGRESS_THREADS == 1
    if (device->progress) {
        device->progress = false;
        if (pthread_cancel(device->thread.t_handle)) {
            BTL_ERROR(("Failed to cancel OpenIB progress thread"));
            goto device_error;
        }
        opal_thread_join(&device->thread, NULL);
    }

    if (ibv_destroy_comp_channel(device->ib_channel)) {
        BTL_VERBOSE(("Failed to close comp_channel"));
        goto device_error;
    }
#endif

    /* signal the async thread to stop polling this device */
    mca_btl_openib_async_rem_device (device);

    if (device->eager_rdma_buffers) {
        /* note: the previous version declared a second 'int i' here,
           shadowing the one above; the outer declaration is used now */
        for (i = 0; i < device->eager_rdma_buffers_count; i++) {
            if (device->eager_rdma_buffers[i]) {
                OBJ_RELEASE(device->eager_rdma_buffers[i]);
            }
        }
        free(device->eager_rdma_buffers);
    }

    if (NULL != device->qps) {
        for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
            OBJ_DESTRUCT(&device->qps[i].send_free);
            OBJ_DESTRUCT(&device->qps[i].recv_free);
        }
        free(device->qps);
    }

    OBJ_DESTRUCT(&device->send_free_control);

    /* Release CQs */
    if (device->ib_cq[BTL_OPENIB_HP_CQ] != NULL) {
        if (ibv_destroy_cq(device->ib_cq[BTL_OPENIB_HP_CQ])) {
            BTL_VERBOSE(("Failed to close HP CQ"));
            goto device_error;
        }
    }

    if (device->ib_cq[BTL_OPENIB_LP_CQ] != NULL) {
        if (ibv_destroy_cq(device->ib_cq[BTL_OPENIB_LP_CQ])) {
            BTL_VERBOSE(("Failed to close LP CQ"));
            goto device_error;
        }
    }

    if (OPAL_SUCCESS != mca_rcache_base_module_destroy (device->rcache)) {
        BTL_VERBOSE(("failed to release registration cache"));
        goto device_error;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED) {
        if (OPAL_SUCCESS != mca_btl_openib_close_xrc_domain(device)) {
            BTL_VERBOSE(("XRC Internal error. Failed to close xrc domain"));
            goto device_error;
        }
    }
#endif

    if (ibv_dealloc_pd(device->ib_pd)) {
        BTL_VERBOSE(("Warning! Failed to release PD"));
        goto device_error;
    }

    OBJ_DESTRUCT(&device->device_lock);

    if (ibv_close_device(device->ib_dev_context)) {
        /* with leave_pinned, registrations may legitimately outlive
           us, so only warn in that case; otherwise it is an error */
        if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
            BTL_VERBOSE(("Warning! Failed to close device"));
            goto device_error;
        } else {
            BTL_ERROR(("Error! Failed to close device"));
            goto device_error;
        }
    }
    BTL_VERBOSE(("device was successfully released"));
    return;
device_error:
    BTL_VERBOSE(("Failed to destroy device resources"));
}
999
/* Register the class so that OBJ_NEW / OBJ_RELEASE on
   mca_btl_openib_device_t run the constructor/destructor above. */
OBJ_CLASS_INSTANCE(mca_btl_openib_device_t, opal_object_t, device_construct,
                   device_destruct);
1002
1003 static int
get_port_list(mca_btl_openib_device_t * device,int * allowed_ports)1004 get_port_list(mca_btl_openib_device_t *device, int *allowed_ports)
1005 {
1006 int i, j, k, num_ports = 0;
1007 const char *dev_name;
1008 char *name;
1009
1010 dev_name = ibv_get_device_name(device->ib_dev);
1011 name = (char*) malloc(strlen(dev_name) + 4);
1012 if (NULL == name) {
1013 return 0;
1014 }
1015
1016 /* Assume that all ports are allowed. num_ports will be adjusted
1017 below to reflect whether this is true or not. */
1018 for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1019 allowed_ports[num_ports++] = i;
1020 }
1021 num_ports = 0;
1022 if (NULL != mca_btl_openib_component.if_include_list) {
1023 /* If only the device name is given (eg. mtdevice0,mtdevice1) use all
1024 ports */
1025 i = 0;
1026 while (mca_btl_openib_component.if_include_list[i]) {
1027 if (0 == strcmp(dev_name,
1028 mca_btl_openib_component.if_include_list[i])) {
1029 num_ports = device->ib_dev_attr.phys_port_cnt;
1030 goto done;
1031 }
1032 ++i;
1033 }
1034 /* Include only requested ports on the device */
1035 for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1036 sprintf(name,"%s:%d",dev_name,i);
1037 for (j = 0;
1038 NULL != mca_btl_openib_component.if_include_list[j]; ++j) {
1039 if (0 == strcmp(name,
1040 mca_btl_openib_component.if_include_list[j])) {
1041 allowed_ports[num_ports++] = i;
1042 break;
1043 }
1044 }
1045 }
1046 } else if (NULL != mca_btl_openib_component.if_exclude_list) {
1047 /* If only the device name is given (eg. mtdevice0,mtdevice1) exclude
1048 all ports */
1049 i = 0;
1050 while (mca_btl_openib_component.if_exclude_list[i]) {
1051 if (0 == strcmp(dev_name,
1052 mca_btl_openib_component.if_exclude_list[i])) {
1053 num_ports = 0;
1054 goto done;
1055 }
1056 ++i;
1057 }
1058 /* Exclude the specified ports on this device */
1059 for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1060 sprintf(name,"%s:%d",dev_name,i);
1061 for (j = 0;
1062 NULL != mca_btl_openib_component.if_exclude_list[j]; ++j) {
1063 if (0 == strcmp(name,
1064 mca_btl_openib_component.if_exclude_list[j])) {
1065 /* If found, set a sentinel value */
1066 j = -1;
1067 break;
1068 }
1069 }
1070 /* If we didn't find it, it's ok to include in the list */
1071 if (-1 != j) {
1072 allowed_ports[num_ports++] = i;
1073 }
1074 }
1075 } else {
1076 num_ports = device->ib_dev_attr.phys_port_cnt;
1077 }
1078
1079 done:
1080
1081 /* Remove the following from the error-checking if_list:
1082 - bare device name
1083 - device name suffixed with port number */
1084 if (NULL != mca_btl_openib_component.if_list) {
1085 for (i = 0; NULL != mca_btl_openib_component.if_list[i]; ++i) {
1086
1087 /* Look for raw device name */
1088 if (0 == strcmp(mca_btl_openib_component.if_list[i], dev_name)) {
1089 j = opal_argv_count(mca_btl_openib_component.if_list);
1090 opal_argv_delete(&j, &(mca_btl_openib_component.if_list),
1091 i, 1);
1092 --i;
1093 }
1094 }
1095 for (i = 1; i <= device->ib_dev_attr.phys_port_cnt; ++i) {
1096 sprintf(name, "%s:%d", dev_name, i);
1097 for (j = 0; NULL != mca_btl_openib_component.if_list[j]; ++j) {
1098 if (0 == strcmp(mca_btl_openib_component.if_list[j], name)) {
1099 k = opal_argv_count(mca_btl_openib_component.if_list);
1100 opal_argv_delete(&k, &(mca_btl_openib_component.if_list),
1101 j, 1);
1102 --j;
1103 break;
1104 }
1105 }
1106 }
1107 }
1108
1109 free(name);
1110
1111 return num_ports;
1112 }
1113
1114 /*
1115 * Prefer values that are already in the target
1116 */
merge_values(opal_btl_openib_ini_values_t * target,opal_btl_openib_ini_values_t * src)1117 static void merge_values(opal_btl_openib_ini_values_t *target,
1118 opal_btl_openib_ini_values_t *src)
1119 {
1120 if (!target->mtu_set && src->mtu_set) {
1121 target->mtu = src->mtu;
1122 target->mtu_set = true;
1123 }
1124
1125 if (!target->use_eager_rdma_set && src->use_eager_rdma_set) {
1126 target->use_eager_rdma = src->use_eager_rdma;
1127 target->use_eager_rdma_set = true;
1128 }
1129
1130 if (NULL == target->receive_queues && NULL != src->receive_queues) {
1131 target->receive_queues = strdup(src->receive_queues);
1132 }
1133
1134 if (!target->max_inline_data_set && src->max_inline_data_set) {
1135 target->max_inline_data = src->max_inline_data;
1136 target->max_inline_data_set = true;
1137 }
1138 }
1139
is_credit_message(const mca_btl_openib_recv_frag_t * frag)1140 static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
1141 {
1142 mca_btl_openib_control_header_t* chdr =
1143 (mca_btl_openib_control_header_t *) to_base_frag(frag)->segment.seg_addr.pval;
1144 return (MCA_BTL_TAG_IB == frag->hdr->tag) &&
1145 (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type);
1146 }
1147
is_cts_message(const mca_btl_openib_recv_frag_t * frag)1148 static bool inline is_cts_message(const mca_btl_openib_recv_frag_t *frag)
1149 {
1150 mca_btl_openib_control_header_t* chdr =
1151 (mca_btl_openib_control_header_t *) to_base_frag(frag)->segment.seg_addr.pval;
1152 return (MCA_BTL_TAG_IB == frag->hdr->tag) &&
1153 (MCA_BTL_OPENIB_CONTROL_CTS == chdr->type);
1154 }
1155
/*
 * Parse one integer field of a receive_queues QP specification.
 *
 * Returns "dflt" when the string is NULL or empty -- or 1 when dflt
 * is itself 0, because 0 is never a usable value for the parameters
 * parsed with this helper.  Uses strtol() instead of atoi() so that
 * out-of-range input saturates rather than invoking undefined
 * behavior; for in-range decimal input the result is identical.
 */
static int32_t atoi_param(char *param, int32_t dflt)
{
    if (NULL == param || '\0' == param[0]) {
        return dflt ? dflt : 1;
    }

    return (int32_t) strtol(param, NULL, 10);
}
1164
/*
 * Record an alternate (APM) port/LID on every BTL module created for
 * this device.  The first port owns the BTL; the port given here is
 * kept as the failover path.
 */
static void init_apm_port(mca_btl_openib_device_t *device, int port, uint16_t lid)
{
    int btl_idx;

    for (btl_idx = 0; btl_idx < device->btls; btl_idx++) {
        struct mca_btl_openib_module_t *module =
            (mca_btl_openib_module_t *)
            opal_pointer_array_get_item(device->device_btls, btl_idx);
        /* Ok, we already have btl for the first port,
         * second one will be used for APM */
        module->apm_port = port;
        module->port_info.apm_lid = lid + module->src_path_bits;
        mca_btl_openib_component.apm_ports++;
        BTL_VERBOSE(("APM-PORT: Setting alternative port - %d, lid - %d"
                    ,port ,lid));
    }
}
1180
/* Look up where the named btl_openib MCA variable got its current
   value (default, environment, file, ...).  Returns a negative error
   code when the variable cannot be found. */
static int get_var_source (const char *var_name, mca_base_var_source_t *source)
{
    int var_index = mca_base_var_find ("opal", "btl", "openib", var_name);

    if (var_index < 0) {
        return var_index;
    }

    return mca_base_var_get_value (var_index, NULL, source, NULL);
}
1190
/*
 * Parse the btl_openib_receive_queues MCA parameter and populate
 * mca_btl_openib_component.qp_infos (one entry per QP).  The string
 * is a ':'-separated list of QP specifications; each specification is
 * a ','-separated list whose first field is the QP type: "P"
 * (per-peer), "S" (shared receive queue) or "X" (XRC).  Remaining
 * fields are sizes/thresholds with computed defaults.  Also
 * sanity-checks the largest QP size against the eager limit and max
 * send size, validates the free-list bound, and selects the
 * component's rdma_qp and credits_qp indices.
 *
 * Returns OPAL_SUCCESS, or an OPAL error code on bad input.
 */
static int setup_qps(void)
{
    char **queues, **params = NULL;
    int num_xrc_qps = 0, num_pp_qps = 0, num_srq_qps = 0, qp = 0;
    uint32_t max_qp_size, max_size_needed;
    int32_t min_freelist_size = 0;
    int smallest_pp_qp = INT_MAX, ret = OPAL_ERROR;

    queues = opal_argv_split(mca_btl_openib_component.receive_queues, ':');
    if (0 == opal_argv_count(queues)) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "no qps in receive_queues", true,
                       opal_process_info.nodename,
                       mca_btl_openib_component.receive_queues);
        ret = OPAL_ERROR;
        goto error;
    }

    /* First pass: count how many QPs of each type were requested and
       remember the index of the first per-peer QP (used for credits) */
    while (queues[qp] != NULL) {
        if (0 == strncmp("P,", queues[qp], 2)) {
            num_pp_qps++;
            if (smallest_pp_qp > qp) {
                smallest_pp_qp = qp;
            }
        } else if (0 == strncmp("S,", queues[qp], 2)) {
            num_srq_qps++;
        } else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
            num_xrc_qps++;
#else
            opal_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
                           opal_process_info.nodename,
                           mca_btl_openib_component.receive_queues);
            ret = OPAL_ERR_NOT_AVAILABLE;
            goto error;
#endif
        } else {
            opal_show_help("help-mpi-btl-openib.txt",
                           "invalid qp type in receive_queues", true,
                           opal_process_info.nodename,
                           mca_btl_openib_component.receive_queues,
                           queues[qp]);
            ret = OPAL_ERR_BAD_PARAM;
            goto error;
        }
        qp++;
    }

#if HAVE_XRC
    /* The current XRC implementation can't be used with other QP
       types - PP and SRQ */
    if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
        opal_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
                       opal_process_info.nodename,
                       mca_btl_openib_component.receive_queues);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    }

    /* The current XRC implementation can't be used with btls_per_lid > 1 */
    if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
        opal_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID",
                       true, opal_process_info.nodename,
                       mca_btl_openib_component.receive_queues, num_xrc_qps);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    }
#endif

    mca_btl_openib_component.num_pp_qps = num_pp_qps;
    mca_btl_openib_component.num_srq_qps = num_srq_qps;
    mca_btl_openib_component.num_xrc_qps = num_xrc_qps;
    mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps + num_xrc_qps;

    mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
        malloc(sizeof(mca_btl_openib_qp_info_t) *
               mca_btl_openib_component.num_qps);
    if (NULL == mca_btl_openib_component.qp_infos) {
        ret = OPAL_ERR_OUT_OF_RESOURCE;
        goto error;
    }

    /* Second pass: parse each specification's fields.  P(N) yields
       the Nth field or NULL when fewer fields were given (params is
       NULL-terminated, so params[count] is also NULL). */
    qp = 0;
#define P(N) (((N) > count) ? NULL : params[(N)])
    while (queues[qp] != NULL) {
        int count;
        int32_t rd_low, rd_num;
        params = opal_argv_split_with_empty(queues[qp], ',');
        count = opal_argv_count(params);

        if ('P' == params[0][0]) {
            int32_t rd_win, rd_rsv;
            if (count < 3 || count > 6) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid pp qp specification", true,
                               opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }
            mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
            mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
            rd_num = atoi_param(P(2), 256);
            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);

            if (0 >= rd_win) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid pp qp specification", true,
                               opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }

            rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);

            BTL_VERBOSE(("pp: rd_num is %d rd_low is %d rd_win %d rd_rsv %d",
                         rd_num, rd_low, rd_win, rd_rsv));

            /* Calculate the smallest freelist size that can be allowed */
            if (rd_num + rd_rsv > min_freelist_size) {
                min_freelist_size = rd_num + rd_rsv;
            }

            mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
            mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
            if ((rd_num - rd_low) > rd_win) {
                opal_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
                               true, rd_win, rd_num - rd_low);
            }
        } else {
            /* "S" or "X" specification: shared receive queue */
            int32_t sd_max, rd_init, srq_limit;
            if (count < 3 || count > 7) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid srq specification", true,
                               opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }
            mca_btl_openib_component.qp_infos[qp].type = (params[0][0] =='X') ?
                MCA_BTL_OPENIB_XRC_QP : MCA_BTL_OPENIB_SRQ_QP;
            mca_btl_openib_component.qp_infos[qp].size = atoi_param(P(1), 0);
            rd_num = atoi_param(P(2), 256);
            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            sd_max = atoi_param(P(4), rd_low / 4);
            /* rd_init is initial value for rd_curr_num of all SRQs, 1/4 of rd_num by default */
            rd_init = atoi_param(P(5), rd_num / 4);
            /* by default set srq_limit to be 3/16 of rd_init (it's 1/4 of rd_low_local,
               the value of rd_low_local we calculate in create_srq function) */
            srq_limit = atoi_param(P(6), (rd_init - (rd_init / 4)) / 4);

            /* If we set srq_limit less or greater than rd_init
               (init value for rd_curr_num) => we receive the IBV_EVENT_SRQ_LIMIT_REACHED
               event immediately and the value of rd_curr_num will be increased */

            /* If we set srq_limit to zero, but size of SRQ greater than 1 => set it to be 1 */
            if((0 == srq_limit) && (1 < rd_num)) {
                srq_limit = 1;
            }

            BTL_VERBOSE(("srq: rd_num is %d rd_low is %d sd_max is %d rd_max is %d srq_limit is %d",
                         rd_num, rd_low, sd_max, rd_init, srq_limit));

            /* Calculate the smallest freelist size that can be allowed */
            if (rd_num > min_freelist_size) {
                min_freelist_size = rd_num;
            }

            if (rd_num < rd_init) {
                opal_show_help("help-mpi-btl-openib.txt", "rd_num must be >= rd_init",
                               true, opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }

            if (rd_num < srq_limit) {
                opal_show_help("help-mpi-btl-openib.txt", "srq_limit must be > rd_num",
                               true, opal_process_info.nodename, queues[qp]);
                ret = OPAL_ERR_BAD_PARAM;
                goto error;
            }

            mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
            mca_btl_openib_component.qp_infos[qp].u.srq_qp.rd_init = rd_init;
            mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit = srq_limit;
        }

        if (rd_num <= rd_low) {
            opal_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
                           true, opal_process_info.nodename, queues[qp]);
            ret = OPAL_ERR_BAD_PARAM;
            goto error;
        }
        mca_btl_openib_component.qp_infos[qp].rd_num = rd_num;
        mca_btl_openib_component.qp_infos[qp].rd_low = rd_low;
        opal_argv_free(params);
        qp++;
    }
    params = NULL;

    /* Sanity check some sizes */

    max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
    max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
                       mca_btl_openib_module.super.btl_max_send_size) ?
        mca_btl_openib_module.super.btl_eager_limit :
        mca_btl_openib_module.super.btl_max_send_size;

    if (max_qp_size < max_size_needed) {
        mca_base_var_source_t eager_source = MCA_BASE_VAR_SOURCE_DEFAULT;
        mca_base_var_source_t max_send_source = MCA_BASE_VAR_SOURCE_DEFAULT;

        (void) get_var_source ("max_send_size", &max_send_source);
        (void) get_var_source ("eager_limit", &eager_source);

        /* the largest queue pair is too small for either the max send size or eager
         * limit. check where we got the max_send_size and eager_limit and adjust if
         * the user did not specify one or the other. */
        if (mca_btl_openib_module.super.btl_eager_limit > max_qp_size &&
            MCA_BASE_VAR_SOURCE_DEFAULT == eager_source) {
            mca_btl_openib_module.super.btl_eager_limit = max_qp_size;
        }

        if (mca_btl_openib_module.super.btl_max_send_size > max_qp_size &&
            MCA_BASE_VAR_SOURCE_DEFAULT == max_send_source) {
            mca_btl_openib_module.super.btl_max_send_size = max_qp_size;
        }

        /* recompute after any adjustments above */
        max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
                           mca_btl_openib_module.super.btl_max_send_size) ?
            mca_btl_openib_module.super.btl_eager_limit :
            mca_btl_openib_module.super.btl_max_send_size;
    }

    if (max_qp_size < max_size_needed) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "biggest qp size is too small", true,
                       opal_process_info.nodename, max_qp_size,
                       max_size_needed);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    } else if (max_qp_size > max_size_needed) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "biggest qp size is too big", true,
                       opal_process_info.nodename, max_qp_size,
                       max_size_needed);
    }

    if (mca_btl_openib_component.ib_free_list_max > 0 &&
        min_freelist_size > mca_btl_openib_component.ib_free_list_max) {
        opal_show_help("help-mpi-btl-openib.txt", "freelist too small", true,
                       opal_process_info.nodename,
                       mca_btl_openib_component.ib_free_list_max,
                       min_freelist_size);
        ret = OPAL_ERR_BAD_PARAM;
        goto error;
    }

    /* RDMA uses the last (largest) QP; credits use the first PP QP
       when one exists, otherwise the last QP */
    mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
    if (mca_btl_openib_component.num_qps > smallest_pp_qp) {
        mca_btl_openib_component.credits_qp = smallest_pp_qp;
    } else {
        mca_btl_openib_component.credits_qp = mca_btl_openib_component.num_qps - 1;
    }

    ret = OPAL_SUCCESS;
error:
    if (NULL != params) {
        opal_argv_free(params);
    }

    if (NULL != queues) {
        opal_argv_free(queues);
    }

    return ret;
}
1469
1470 /* read a single integer from a linux module parameters file */
read_module_param(char * file,uint64_t value,uint64_t max)1471 static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
1472 {
1473 int fd = open(file, O_RDONLY);
1474 char buffer[64];
1475 uint64_t ret;
1476 int rc;
1477
1478 if (0 > fd) {
1479 return value;
1480 }
1481
1482 rc = read (fd, buffer, 64);
1483
1484 close (fd);
1485
1486 if (0 == rc) {
1487 return value;
1488 }
1489
1490 errno = 0;
1491 ret = strtoull(buffer, NULL, 10);
1492
1493 if (ret > max) {
1494 /* NTH: probably should report a bogus value */
1495 ret = max;
1496 }
1497
1498 return (0 == errno) ? ret : value;
1499 }
1500
1501 /* calculate memory registation limits */
calculate_total_mem(void)1502 static uint64_t calculate_total_mem (void)
1503 {
1504 hwloc_obj_t machine;
1505 int rc;
1506 uint64_t mem, *mptr;
1507 opal_process_name_t wildcard_rank;
1508
1509 /* first try to retrieve it from PMIx as it may have
1510 * been provided */
1511 wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
1512 wildcard_rank.vpid = OPAL_VPID_WILDCARD;
1513 mptr = &mem;
1514 OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
1515 &wildcard_rank, &mptr, OPAL_UINT64);
1516 if (OPAL_SUCCESS == rc) {
1517 return mem;
1518 }
1519
1520 /* if not available, then ensure that the topology has been
1521 * loaded and try to get it from there */
1522 if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
1523 machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
1524 if (NULL == machine) {
1525 return 0;
1526 }
1527 return machine->memory.total_memory;
1528 }
1529
1530 /* couldn't find it */
1531 return 0;
1532 }
1533
1534
calculate_max_reg(const char * device_name)1535 static uint64_t calculate_max_reg (const char *device_name)
1536 {
1537 struct stat statinfo;
1538 uint64_t mtts_per_seg = 1;
1539 uint64_t num_mtt = 1 << 19;
1540 uint64_t reserved_mtt = 0;
1541 uint64_t max_reg, mem_total;
1542
1543 mem_total = calculate_total_mem ();
1544
1545 /* On older OFED(<2.0), may need to turn off this parameter*/
1546 if (mca_btl_openib_component.allow_max_memory_registration) {
1547 max_reg = 2 * mem_total;
1548 /* Limit us to 87.5% of the registered memory (some fluff for QPs,
1549 file systems, etc) */
1550 return (max_reg * 7) >> 3;
1551 }
1552
1553 /* Default to being able to register everything (to ensure that
1554 max_reg is initialized in all cases) */
1555 max_reg = mem_total;
1556 if (!strncmp(device_name, "mlx5", 4)) {
1557 max_reg = 2 * mem_total;
1558
1559 } else if (!strncmp(device_name, "mlx4", 4)) {
1560 if (0 == stat("/sys/module/mlx4_core/parameters/log_num_mtt", &statinfo)) {
1561 mtts_per_seg = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1, 63);
1562 num_mtt = 1ull << read_module_param("/sys/module/mlx4_core/parameters/log_mtts_per_seg", 1, 63);
1563 if (1 == num_mtt) {
1564 /* NTH: is 19 a minimum? when log_num_mtt is set to 0 use 19 */
1565 num_mtt = 1 << 19;
1566 max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
1567 } else {
1568 max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
1569 }
1570 }
1571
1572 } else if (!strncmp(device_name, "mthca", 5)) {
1573 if (0 == stat("/sys/module/ib_mthca/parameters/num_mtt", &statinfo)) {
1574 mtts_per_seg = 1ull << read_module_param("/sys/module/ib_mthca/parameters/log_mtts_per_seg", 1, 63);
1575 num_mtt = read_module_param("/sys/module/ib_mthca/parameters/num_mtt", 1 << 20, (uint64_t) -1);
1576 reserved_mtt = read_module_param("/sys/module/ib_mthca/parameters/fmr_reserved_mtts", 0, (uint64_t) -1);
1577
1578 max_reg = (num_mtt - reserved_mtt) * opal_getpagesize () * mtts_per_seg;
1579 } else {
1580 max_reg = mem_total;
1581 }
1582
1583 } else {
1584 /* Need to update to determine the registration limit for this
1585 configuration */
1586 max_reg = mem_total;
1587 }
1588
1589 /* Print a warning if we can't register more than 75% of physical
1590 memory. Abort if the abort_not_enough_reg_mem MCA param was
1591 set. */
1592 if (max_reg < mem_total * 3 / 4) {
1593 char *action;
1594
1595 if (mca_btl_openib_component.abort_not_enough_reg_mem) {
1596 action = "Your MPI job will now abort.";
1597 } else {
1598 action = "Your MPI job will continue, but may be behave poorly and/or hang.";
1599 }
1600 opal_show_help("help-mpi-btl-openib.txt", "reg mem limit low", true,
1601 opal_process_info.nodename, (unsigned long)(max_reg >> 20),
1602 (unsigned long)(mem_total >> 20), action);
1603 return 0; /* signal that we can't have enough memory */
1604 }
1605
1606 /* Limit us to 87.5% of the registered memory (some fluff for QPs,
1607 file systems, etc) */
1608 return (max_reg * 7) >> 3;
1609 }
1610
init_one_device(opal_list_t * btl_list,struct ibv_device * ib_dev)1611 static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
1612 {
1613 mca_rcache_base_resources_t rcache_resources;
1614 mca_btl_openib_device_t *device;
1615 uint8_t i, k = 0;
1616 int ret = -1, port_cnt;
1617 opal_btl_openib_ini_values_t values, default_values;
1618 int *allowed_ports = NULL;
1619 bool need_search;
1620 struct ibv_context *dev_context = NULL;
1621
1622 /* Open up the device */
1623 dev_context = ibv_open_device(ib_dev);
1624 if (NULL == dev_context) {
1625 return OPAL_ERR_NOT_SUPPORTED;
1626 }
1627
1628 /* Find out if this device supports RC QPs */
1629 if (OPAL_SUCCESS != opal_common_verbs_qp_test(dev_context,
1630 OPAL_COMMON_VERBS_FLAGS_RC)) {
1631 ibv_close_device(dev_context);
1632 BTL_VERBOSE(("openib: RC QPs not supported -- skipping %s",
1633 ibv_get_device_name(ib_dev)));
1634 ++num_devices_intentionally_ignored;
1635 return OPAL_ERR_NOT_SUPPORTED;
1636 }
1637
1638 device = OBJ_NEW(mca_btl_openib_device_t);
1639 if(NULL == device){
1640 BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
1641 ibv_close_device(dev_context);
1642 return OPAL_ERR_OUT_OF_RESOURCE;
1643 }
1644
1645 device->mem_reg_active = 0;
1646 device->mem_reg_max_total = calculate_max_reg(ibv_get_device_name(ib_dev));
1647 device->mem_reg_max = device->mem_reg_max_total;
1648 if(( 0 == device->mem_reg_max) && mca_btl_openib_component.abort_not_enough_reg_mem) {
1649 return OPAL_ERROR;
1650 }
1651
1652 device->ib_dev = ib_dev;
1653 device->ib_dev_context = dev_context;
1654 device->ib_pd = NULL;
1655 device->device_btls = OBJ_NEW(opal_pointer_array_t);
1656 if (OPAL_SUCCESS != opal_pointer_array_init(device->device_btls, 2, INT_MAX, 2)) {
1657 BTL_ERROR(("Failed to initialize device_btls array: %s:%d", __FILE__, __LINE__));
1658 return OPAL_ERR_OUT_OF_RESOURCE;
1659 }
1660
1661 if(NULL == device->ib_dev_context){
1662 BTL_ERROR(("error obtaining device context for %s errno says %s",
1663 ibv_get_device_name(device->ib_dev), strerror(errno)));
1664 goto error;
1665 }
1666 #if HAVE_DECL_IBV_EXP_QUERY_DEVICE
1667 memset(&device->ib_exp_dev_attr, 0, sizeof(device->ib_exp_dev_attr));
1668 device->ib_exp_dev_attr.comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1;
1669 if(ibv_exp_query_device(device->ib_dev_context, &device->ib_exp_dev_attr)){
1670 BTL_ERROR(("error obtaining device attributes for %s errno says %s",
1671 ibv_get_device_name(device->ib_dev), strerror(errno)));
1672 goto error;
1673 }
1674 #endif
1675 if(ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)){
1676 BTL_ERROR(("error obtaining device attributes for %s errno says %s",
1677 ibv_get_device_name(device->ib_dev), strerror(errno)));
1678 goto error;
1679 }
1680 /* If mca_btl_if_include/exclude were specified, get usable ports */
1681 allowed_ports = (int*)malloc(device->ib_dev_attr.phys_port_cnt * sizeof(int));
1682 if (NULL == allowed_ports) {
1683 ret = OPAL_ERR_OUT_OF_RESOURCE;
1684 goto error;
1685 }
1686
1687 port_cnt = get_port_list(device, allowed_ports);
1688 if (0 == port_cnt) {
1689 ret = OPAL_SUCCESS;
1690 ++num_devices_intentionally_ignored;
1691 goto error;
1692 }
1693
1694 /* Load in vendor/part-specific device parameters. Note that even if
1695 we don't find values for this vendor/part, "values" will be set
1696 indicating that it does not have good values */
1697 ret = opal_btl_openib_ini_query(device->ib_dev_attr.vendor_id,
1698 device->ib_dev_attr.vendor_part_id,
1699 &values);
1700 if (OPAL_SUCCESS != ret &&
1701 OPAL_ERR_NOT_FOUND != ret) {
1702 /* If we get a serious error, propagate it upwards */
1703 goto error;
1704 }
1705 if (OPAL_ERR_NOT_FOUND == ret) {
1706 /* If we didn't find a matching device in the INI files, output a
1707 warning that we're using default values (unless overridden
1708 that we don't want to see these warnings) */
1709 if (mca_btl_openib_component.warn_no_device_params_found) {
1710 opal_show_help("help-mpi-btl-openib.txt",
1711 "no device params found", true,
1712 opal_process_info.nodename,
1713 ibv_get_device_name(device->ib_dev),
1714 device->ib_dev_attr.vendor_id,
1715 device->ib_dev_attr.vendor_part_id);
1716 }
1717 }
1718
1719 /* If we're supposed to ignore devices of this vendor/part ID,
1720 then do so */
1721 if (values.ignore_device_set && values.ignore_device) {
1722 BTL_VERBOSE(("device %s skipped; ignore_device=1",
1723 ibv_get_device_name(device->ib_dev)));
1724 ret = OPAL_SUCCESS;
1725 ++num_devices_intentionally_ignored;
1726 goto error;
1727 }
1728
1729 /* Note that even if we don't find default values, "values" will
1730 be set indicating that it does not have good values */
1731 ret = opal_btl_openib_ini_query(0, 0, &default_values);
1732 if (OPAL_SUCCESS != ret &&
1733 OPAL_ERR_NOT_FOUND != ret) {
1734 /* If we get a serious error, propagate it upwards */
1735 goto error;
1736 }
1737
1738 /* If we did find values for this device (or in the defaults
1739 section), handle them */
1740 merge_values(&values, &default_values);
1741 /* If MCA param was set, use it. If not, check the INI file
1742 or default to IBV_MTU_1024 */
1743 if (0 < mca_btl_openib_component.ib_mtu) {
1744 device->mtu = mca_btl_openib_component.ib_mtu;
1745 } else if (values.mtu_set) {
1746 switch (values.mtu) {
1747 case 256:
1748 device->mtu = IBV_MTU_256;
1749 break;
1750 case 512:
1751 device->mtu = IBV_MTU_512;
1752 break;
1753 case 1024:
1754 device->mtu = IBV_MTU_1024;
1755 break;
1756 case 2048:
1757 device->mtu = IBV_MTU_2048;
1758 break;
1759 case 4096:
1760 device->mtu = IBV_MTU_4096;
1761 break;
1762 default:
1763 BTL_ERROR(("invalid MTU value specified in INI file (%d); ignored", values.mtu));
1764 device->mtu = IBV_MTU_1024 ;
1765 break;
1766 }
1767 } else {
1768 device->mtu = IBV_MTU_1024 ;
1769 }
1770
1771 /* Allocate the protection domain for the device */
1772 device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
1773 if(NULL == device->ib_pd){
1774 BTL_ERROR(("error allocating protection domain for %s errno says %s",
1775 ibv_get_device_name(device->ib_dev), strerror(errno)));
1776 goto error;
1777 }
1778
1779 /* Figure out what the max_inline_data value should be for all
1780 ports and QPs on this device */
1781 need_search = false;
1782 if(-2 != mca_btl_openib_component.ib_max_inline_data) {
1783 /* User has explicitly set btl_openib_max_inline_data MCA parameter
1784 Per setup in _mca.c, we know that the MCA param value is guaranteed
1785 to be >= -1 */
1786 if (-1 == mca_btl_openib_component.ib_max_inline_data) {
1787 need_search = true;
1788 } else {
1789 device->max_inline_data = (uint32_t)
1790 mca_btl_openib_component.ib_max_inline_data;
1791 }
1792 } else if (values.max_inline_data_set) {
1793 if (-1 == values.max_inline_data) {
1794 need_search = true;
1795 } else if (values.max_inline_data >= 0) {
1796 device->max_inline_data = (uint32_t) values.max_inline_data;
1797 } else {
1798 if(default_values.max_inline_data_set &&
1799 default_values.max_inline_data >= -1) {
1800 BTL_ERROR(("Invalid max_inline_data value specified "
1801 "in INI file (%d); using default value (%d)",
1802 values.max_inline_data,
1803 default_values.max_inline_data));
1804 device->max_inline_data = (uint32_t)
1805 default_values.max_inline_data;
1806 } else {
1807 BTL_ERROR(("Invalid max_inline_data value specified "
1808 "in INI file (%d)", values.max_inline_data));
1809 ret = OPAL_ERR_BAD_PARAM;
1810 goto error;
1811 }
1812 }
1813 }
1814
1815 /* If we don't have a set max inline data size, search for it */
1816 if (need_search) {
1817 opal_common_verbs_find_max_inline(device->ib_dev,
1818 device->ib_dev_context,
1819 device->ib_pd,
1820 &device->max_inline_data);
1821 }
1822
1823 /* Should we use RDMA for short / eager messages? First check MCA
1824 param, then check INI file values. */
1825 if (mca_btl_openib_component.use_eager_rdma >= 0) {
1826 device->use_eager_rdma = mca_btl_openib_component.use_eager_rdma;
1827 } else if (values.use_eager_rdma_set) {
1828 device->use_eager_rdma = values.use_eager_rdma;
1829 }
1830 /* Eager RDMA is not currently supported with progress threads */
1831 if (device->use_eager_rdma && OPAL_ENABLE_PROGRESS_THREADS) {
1832 device->use_eager_rdma = 0;
1833 opal_show_help("help-mpi-btl-openib.txt",
1834 "eager RDMA and progress threads", true);
1835 }
1836
1837 asprintf (&rcache_resources.cache_name, "verbs.%" PRIu64, device->ib_dev_attr.node_guid);
1838 rcache_resources.reg_data = (void*)device;
1839 rcache_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t);
1840 rcache_resources.register_mem = openib_reg_mr;
1841 rcache_resources.deregister_mem = openib_dereg_mr;
1842 device->rcache =
1843 mca_rcache_base_module_create (mca_btl_openib_component.ib_rcache_name,
1844 device, &rcache_resources);
1845 if (NULL == device->rcache) {
1846 /* Don't print an error message here -- we'll get one from
1847 mpool_create anyway */
1848 goto error;
1849 }
1850
1851 device->mpool = mca_mpool_base_module_lookup (mca_btl_openib_component.ib_mpool_hints);
1852 if (NULL == device->mpool) {
1853 goto error;
1854 }
1855
1856 #if OPAL_ENABLE_PROGRESS_THREADS
1857 device->ib_channel = ibv_create_comp_channel(device->ib_dev_context);
1858 if (NULL == device->ib_channel) {
1859 BTL_ERROR(("error creating channel for %s errno says %s",
1860 ibv_get_device_name(device->ib_dev),
1861 strerror(errno)));
1862 goto error;
1863 }
1864 #endif
1865
1866 ret = OPAL_SUCCESS;
1867
1868 /* Note ports are 1 based (i >= 1) */
1869 for(k = 0; k < port_cnt; k++){
1870 struct ibv_port_attr ib_port_attr;
1871 i = allowed_ports[k];
1872 if(ibv_query_port(device->ib_dev_context, i, &ib_port_attr)){
1873 BTL_ERROR(("error getting port attributes for device %s "
1874 "port number %d errno says %s",
1875 ibv_get_device_name(device->ib_dev), i, strerror(errno)));
1876 break;
1877 }
1878 if(IBV_PORT_ACTIVE == ib_port_attr.state) {
1879 /* Select the lower of the HCA and port active speed. With QLogic
1880 HCAs that are capable of 4K MTU we had an issue when connected
1881 to switches with 2K MTU. This fix is valid for other IB vendors
1882 as well. */
1883 if (ib_port_attr.active_mtu < device->mtu){
1884 device->mtu = ib_port_attr.active_mtu;
1885 }
1886 if (mca_btl_openib_component.apm_ports && device->btls > 0) {
1887 init_apm_port(device, i, ib_port_attr.lid);
1888 break;
1889 }
1890 if (0 == mca_btl_openib_component.ib_pkey_val) {
1891 ret = init_one_port(btl_list, device, i, 0, &ib_port_attr);
1892 } else {
1893 uint16_t pkey,j;
1894 for (j = 0; j < device->ib_dev_attr.max_pkeys; j++) {
1895 if(ibv_query_pkey(device->ib_dev_context, i, j, &pkey)){
1896 BTL_ERROR(("error getting pkey for index %d, device %s "
1897 "port number %d errno says %s",
1898 j, ibv_get_device_name(device->ib_dev), i, strerror(errno)));
1899 }
1900 pkey = ntohs(pkey) & MCA_BTL_IB_PKEY_MASK;
1901 if(pkey == mca_btl_openib_component.ib_pkey_val){
1902 ret = init_one_port(btl_list, device, i, j, &ib_port_attr);
1903 break;
1904 }
1905 }
1906 }
1907 if (OPAL_SUCCESS != ret) {
1908 /* Out of bounds error indicates that we hit max btl number
1909 * don't propagate the error to the caller */
1910 if (OPAL_ERR_VALUE_OUT_OF_BOUNDS == ret) {
1911 ret = OPAL_SUCCESS;
1912 }
1913 break;
1914 }
1915 }
1916 }
1917 free(allowed_ports);
1918 allowed_ports = NULL;
1919
1920 /* If we made a BTL, check APM status and return. Otherwise, fall
1921 through and destroy everything */
1922 if (device->btls > 0) {
1923 /* if apm was enabled it should be > 1 */
1924 if (1 == mca_btl_openib_component.apm_ports) {
1925 opal_show_help("help-mpi-btl-openib.txt",
1926 "apm not enough ports", true);
1927 mca_btl_openib_component.apm_ports = 0;
1928 }
1929
1930 /* Check to ensure that all devices used in this process have
1931 compatible receive_queues values (we check elsewhere to see
1932 if all devices used in other processes in this job have
1933 compatible receive_queues values).
1934
1935 Not only is the check complex, but the reasons behind what
1936 it does (and does not do) are complex. Before explaining
1937 the code below, here's some notes:
1938
1939 1. The openib BTL component only supports 1 value of the
1940 receive_queues between all of its modules.
1941
1942 --> This could be changed to allow every module to have
1943 its own receive_queues. But that would be a big
1944 deal; no one has time to code this up right now.
1945
1946 2. The receive_queues value can be specified either as an
1947 MCA parameter or in the INI file. Specifying the value
1948 as an MCA parameter overrides all INI file values
1949 (meaning: that MCA param value will be used for all
1950 openib BTL modules in the process).
1951
1952 Effectively, the first device through init_one_device()
1953 gets to decide what the receive_queues will be for the all
1954 modules in this process. This is an unfortunate artifact
1955 of the openib BTL startup sequence (see below for more
1956 details). The first device will choose the receive_queues
1957 value from: (in priority order):
1958
1959 1. If the btl_openib_receive_queues MCA param was
1960 specified, use that.
1961 2. If this device has a receive_queues value specified in
1962 the INI file, use that.
1963 3. Otherwise, use the default MCA param value for
1964 btl_openib_receive_queues.
1965
1966 If any successive device has a different value specified in
1967 the INI file, we show_help and return up the stack that
1968 this device failed.
1969
1970 In the case that the user does not specify a
1971 mca_btl_openib_receive_queues value, the short description
1972 of what is allowed is that either a) no devices specify a
1973 receive_queues value in the INI file (in which case we use
1974 the default MCA param value), b) all devices specify the
1975 same receive_queues value in the INI value, or c) some/all
1976 devices specify the same receive_queues value in the INI
1977 value as the default MCA param value.
1978
1979 Let's take some sample cases to explain this more clearly...
1980
1981 THESE ARE THE "GOOD" CASES
1982 --------------------------
1983
1984 Case 1: no INI values
1985 - MCA parameter: not specified
1986 - default receive_queues: value A
1987 - device 0: no receive_queues in INI file
1988 - device 1: no receive_queues in INI file
1989 - device 2: no receive_queues in INI file
1990 --> use receive_queues value A with all devices
1991
1992 Case 2: all INI values the same (same as default)
1993 - MCA parameter: not specified
1994 - default receive_queues: value A
1995 - device 0: receive_queues value A in the INI file
1996 - device 1: receive_queues value A in the INI file
1997 - device 2: receive_queues value A in the INI file
1998 --> use receive_queues value A with all devices
1999
2000 Case 3: all INI values the same (but different than default)
2001 - MCA parameter: not specified
2002 - default receive_queues: value A
2003 - device 0: receive_queues value B in the INI file
2004 - device 1: receive_queues value B in the INI file
2005 - device 2: receive_queues value B in the INI file
2006 --> use receive_queues value B with all devices
2007
2008 Case 4: some INI unspecified, but rest same as default
2009 - MCA parameter: not specified
2010 - default receive_queues: value A
2011 - device 0: receive_queues value A in the INI file
2012 - device 1: no receive_queues in INI file
2013 - device 2: receive_queues value A in the INI file
2014 --> use receive_queues value A with all devices
2015
2016 Case 5: some INI unspecified (including device 0), but rest same as default
2017 - MCA parameter: not specified
2018 - default receive_queues: value A
2019 - device 0: no receive_queues in INI file
2020 - device 1: no receive_queues in INI file
2021 - device 2: receive_queues value A in the INI file
2022 --> use receive_queues value A with all devices
2023
2024 Case 6: different default/INI values, but MCA param is specified
2025 - MCA parameter: value D
2026 - default receive_queues: value A
2027 - device 0: no receive_queues in INI file
2028 - device 1: receive_queues value B in INI file
2029 - device 2: receive_queues value C in INI file
2030 --> use receive_queues value D with all devices
2031
2032 What this means is that this selection process is
2033 unfortunately tied to the order of devices. :-( Device 0
2034 effectively sets what the receive_queues value will be for
2035 that process. If any later device disagrees, that's
2036 problematic and we have to error/abort.
2037
2038 ALL REMAINING CASES WILL FAIL
2039 -----------------------------
2040
2041 Case 7: one INI value (different than default)
2042 - MCA parameter: not specified
2043 - default receive_queues: value A
2044 - device 0: receive_queues value B in INI file
2045 - device 1: no receive_queues in INI file
2046 - device 2: no receive_queues in INI file
2047 --> Jeff thinks that it would be great to use
2048 receive_queues value B with all devices. However, it
2049 shares one of the problems cited in case 8, below. So
2050 we need to fail this scenario; print an error and
2051 abort.
2052
2053 Case 8: one INI value, different than default
2054 - MCA parameter: not specified
2055 - default receive_queues: value A
2056 - device 0: no receive_queues in INI file
2057 - device 1: receive_queues value B in INI file
2058 - device 2: no receive_queues in INI file
2059
2060 --> Jeff thinks that it would be great to use
2061 receive_queues value B with all devices. However, it
2062 has (at least) 2 problems:
2063
2064 1. The check for local receive_queue compatibility is
2065 done here in init_one_device(). By the time we call
2066 init_one_device() for device 1, we have already
2067 called init_one_device() for device 0, meaning that
2068 device 0's QPs have already been created and setup
2069 using the MCA parameter's default receive_queues
2070 value. So if device 1 *changes* the
2071 component.receive_queues value, then device 0 and
2072 device 1 now have different receive_queue sets (more
2073 specifically: the QPs setup for device 0 are now
2074 effectively lost). This is Bad.
2075
2076 It would be great if we didn't have this restriction
2077 -- either by letting each module have its own
2078 receive_queues value or by scanning all devices and
2079 figuring out a final receive_queues value *before*
2080 actually setting up any QPs. But that's not the
2081 current flow of the code (patches would be greatly
2082 appreciated here, of course!). Unfortunately, no
2083 one has time to code this up right now, so we're
2084 leaving this as explicitly documented for some
2085 future implementer...
2086
                 2. Consider a scenario with server 1 having HCA A/subnet
2088 X, and server 2 having HCA B/subnet X and HCA
2089 C/subnet Y. And let's assume:
2090
2091 Server 1:
2092 HCA A: no receive_queues in INI file
2093
2094 Server 2:
2095 HCA B: no receive_queues in INI file
2096 HCA C: receive_queues specified in INI file
2097
2098 A will therefore use the default receive_queues
2099 value. B and C will use C's INI receive_queues.
2100 But note that modex [currently] only sends around
2101 vendor/part IDs for OpenFabrics devices -- not the
2102 actual receive_queues value (it was felt that
2103 including the final receive_queues string value in
2104 the modex would dramatically increase the size of
2105 the modex). So processes on server 1 will get the
2106 vendor/part ID for HCA B, look it up in the INI
2107 file, see that it has no receive_queues value
2108 specified, and then assume that it uses the default
2109 receive_queues value. Hence, procs on server 1 will
2110 try to connect HCA A-->HCA B with the wrong
2111 receive_queues value. Bad. Further, the error
2112 won't be discovered by checks like this because A
                    won't check C's receive_queues because C is on a
2114 different subnet.
2115
2116 This could be fixed, of course; either by a) send
2117 the final receive_queues value in the modex (perhaps
2118 compressing or encoding it so that it can be much
2119 shorter than the string -- the current vendor/part
2120 ID stuff takes 8 bytes for each device), or b)
2121 replicating the determination process of each host
2122 in each process (i.e., procs on server 1 would see
2123 both B and C, and use them both to figure out what
2124 the "final" receive_queues value is for B).
2125 Unfortunately, no one has time to code this up right
2126 now, so we're leaving this as explicitly documented
2127 for some future implementer...
2128
2129 Because of both of these problems, this case is
2130 problematic and must fail with a show_help error.
2131
2132 Case 9: two devices with same INI value (different than default)
2133 - MCA parameter: not specified
2134 - default receive_queues: value A
2135 - device 0: no receive_queues in INI file
2136 - device 1: receive_queues value B in INI file
2137 - device 2: receive_queues value B in INI file
2138 --> per case 8, fail with a show_help message.
2139
2140 Case 10: two devices with different INI values
2141 - MCA parameter: not specified
2142 - default receive_queues: value A
2143 - device 0: no receive_queues in INI file
2144 - device 1: receive_queues value B in INI file
2145 - device 2: receive_queues value C in INI file
2146 --> per case 8, fail with a show_help message.
2147
2148 */
2149
2150 {
2151 /* we need to read this MCA param at this point in case someone
2152 * altered it via MPI_T */
2153 mca_base_var_source_t source;
2154
2155 if (OPAL_SUCCESS != (ret = get_var_source ("receive_queues", &source))) {
2156 BTL_ERROR(("mca_base_var_get_value failed to get value for receive_queues: %s:%d",
2157 __FILE__, __LINE__));
2158 goto error;
2159 }
2160
2161 mca_btl_openib_component.receive_queues_source = source;
2162 }
2163
2164 /* If the MCA param was specified, skip all the checks */
2165 if (MCA_BASE_VAR_SOURCE_DEFAULT != mca_btl_openib_component.receive_queues_source) {
2166 goto good;
2167 }
2168
2169 /* If we're the first device and we have a receive_queues
2170 value from the INI file *that is different than the
2171 already-existing default value*, then set the component to
2172 use that. */
2173 if (0 == mca_btl_openib_component.devices_count) {
2174 if (NULL != values.receive_queues &&
2175 0 != strcmp(values.receive_queues,
2176 mca_btl_openib_component.receive_queues)) {
2177 if (NULL != mca_btl_openib_component.receive_queues) {
2178 free(mca_btl_openib_component.receive_queues);
2179 }
2180 mca_btl_openib_component.receive_queues =
2181 strdup(values.receive_queues);
2182 mca_btl_openib_component.receive_queues_source =
2183 BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
2184 }
2185 }
2186
2187 /* If we're not the first device, then we have to conform to
2188 either the default value if the first device didn't set
2189 anything, or to whatever the first device decided. */
2190 else {
2191 /* In all cases, if this device has a receive_queues value
2192 in the INI, then it must agree with
2193 component.receive_queues. */
2194 if (NULL != values.receive_queues) {
2195 if (0 != strcmp(values.receive_queues,
2196 mca_btl_openib_component.receive_queues)) {
2197 opal_show_help("help-mpi-btl-openib.txt",
2198 "locally conflicting receive_queues", true,
2199 opal_install_dirs.opaldatadir,
2200 opal_process_info.nodename,
2201 ibv_get_device_name(receive_queues_device->ib_dev),
2202 receive_queues_device->ib_dev_attr.vendor_id,
2203 receive_queues_device->ib_dev_attr.vendor_part_id,
2204 mca_btl_openib_component.receive_queues,
2205 ibv_get_device_name(device->ib_dev),
2206 device->ib_dev_attr.vendor_id,
2207 device->ib_dev_attr.vendor_part_id,
2208 values.receive_queues);
2209 ret = OPAL_ERR_RESOURCE_BUSY;
2210 goto error;
2211 }
2212 }
2213
2214 /* If this device doesn't have an INI receive_queues
2215 value, then if the component.receive_queues value came
2216 from the default, we're ok. But if the
2217 component.receive_queues value came from the 1st
2218 device's INI file, we must error. */
2219 else if ((mca_base_var_source_t) BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
2220 mca_btl_openib_component.receive_queues_source) {
2221 opal_show_help("help-mpi-btl-openib.txt",
2222 "locally conflicting receive_queues", true,
2223 opal_install_dirs.opaldatadir,
2224 opal_process_info.nodename,
2225 ibv_get_device_name(receive_queues_device->ib_dev),
2226 receive_queues_device->ib_dev_attr.vendor_id,
2227 receive_queues_device->ib_dev_attr.vendor_part_id,
2228 mca_btl_openib_component.receive_queues,
2229 ibv_get_device_name(device->ib_dev),
2230 device->ib_dev_attr.vendor_id,
2231 device->ib_dev_attr.vendor_part_id,
2232 mca_btl_openib_component.default_recv_qps);
2233 ret = OPAL_ERR_RESOURCE_BUSY;
2234 goto error;
2235 }
2236 }
2237
2238 receive_queues_device = device;
2239
2240 good:
2241 mca_btl_openib_component.devices_count++;
2242 return OPAL_SUCCESS;
2243 }
2244
2245 error:
2246 if (OPAL_SUCCESS != ret) {
2247 opal_show_help("help-mpi-btl-openib.txt",
2248 "error in device init", true,
2249 opal_process_info.nodename,
2250 ibv_get_device_name(device->ib_dev));
2251 }
2252
2253 if (NULL != allowed_ports) {
2254 free(allowed_ports);
2255 }
2256 OBJ_RELEASE(device);
2257 return ret;
2258 }
2259
/*
 * Final per-module initialization: set up the module lock, the
 * per-QP bookkeeping array, the memory pool pointer, and the eager
 * RDMA fragment size.
 *
 * @param openib_btl  module to finish initializing (must have a valid
 *                    device attached)
 * @return OPAL_SUCCESS, or OPAL_ERR_OUT_OF_RESOURCE if the QP array
 *         cannot be allocated
 */
static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
{
    int qp_index;
    size_t eager_hdr_overhead;

    openib_btl->num_peers = 0;

    /* Initialize module state */
    OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);

    /* Allocate the per-QP state array (zeroed) */
    openib_btl->qps = (mca_btl_openib_module_qp_t*)
        calloc(mca_btl_openib_component.num_qps,
               sizeof(mca_btl_openib_module_qp_t));
    if (NULL == openib_btl->qps) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Set up SRQ-based QPs; per-peer (PP) QPs need no module-level
       state here */
    for (qp_index = 0; qp_index < mca_btl_openib_component.num_qps; qp_index++) {
        mca_btl_openib_module_qp_t *mqp = &openib_btl->qps[qp_index];

        if (BTL_OPENIB_QP_TYPE_PP(qp_index)) {
            continue;
        }
        OBJ_CONSTRUCT(&mqp->u.srq_qp.pending_frags[0], opal_list_t);
        OBJ_CONSTRUCT(&mqp->u.srq_qp.pending_frags[1], opal_list_t);
        mqp->u.srq_qp.sd_credits =
            mca_btl_openib_component.qp_infos[qp_index].u.srq_qp.sd_max;
        mqp->u.srq_qp.srq = NULL;
    }

    /* The module shares the device's memory pool */
    openib_btl->super.btl_mpool = openib_btl->device->mpool;

    openib_btl->eager_rdma_channels = 0;

    /* Eager RDMA fragments carry all of our protocol headers plus the
       eager payload, rounded up to the configured buffer alignment */
    eager_hdr_overhead =
        sizeof(mca_btl_openib_header_t) +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t) +
        sizeof(mca_btl_openib_footer_t);
    openib_btl->eager_rdma_frag_size = OPAL_ALIGN(
        eager_hdr_overhead + openib_btl->super.btl_eager_limit,
        mca_btl_openib_component.buffer_alignment, size_t);

    opal_output_verbose(1, opal_btl_base_framework.framework_output,
                        "[rank=%d] openib: using port %s:%d",
                        OPAL_PROC_MY_NAME.vpid,
                        ibv_get_device_name(openib_btl->device->ib_dev),
                        openib_btl->port_num);
    return OPAL_SUCCESS;
}
2309
/* Pairing of a verbs device with its NUMA distance from the calling
   process, used to sort devices so the closest ones are used first. */
struct dev_distance {
    struct ibv_device *ib_dev;  /* the device being ranked */
    float distance;             /* hwloc-derived distance; 0 = closest/unknown */
};
2314
compare_distance(const void * p1,const void * p2)2315 static int compare_distance(const void *p1, const void *p2)
2316 {
2317 const struct dev_distance *d1 = (const struct dev_distance *) p1;
2318 const struct dev_distance *d2 = (const struct dev_distance *) p2;
2319
2320 if (d1->distance > (d2->distance+EPS)) {
2321 return 1;
2322 } else if ((d1->distance + EPS) < d2->distance) {
2323 return -1;
2324 } else {
2325 return 0;
2326 }
2327 }
2328
/*
 * Compute the hwloc NUMA (latency) distance between the calling
 * process and the given verbs device.
 *
 * Returns 0 ("closest") whenever locality cannot be determined:
 * locality checks disabled, no topology, no distance matrix, the
 * device or the process cannot be mapped to a NUMA node, etc.
 *
 * Supports both the hwloc 1.x whole-matrix API and the hwloc 2.x
 * hwloc_distances_get_by_type() API via HWLOC_API_VERSION guards.
 */
static float get_ib_dev_distance(struct ibv_device *dev)
{
    /* If we don't have hwloc, we'll default to a distance of 0,
       because we have no way of measuring. */
    float distance = 0;
    float a, b;
    int i;
    hwloc_cpuset_t my_cpuset = NULL, ibv_cpuset = NULL;
    hwloc_obj_t my_obj, ibv_obj, node_obj;
    struct hwloc_distances_s *hwloc_distances = NULL;

    /* Override any distance logic so all devices are used */
    if (0 != mca_btl_openib_component.ignore_locality ||
        OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
        return distance;
    }

#if HWLOC_API_VERSION >= 0x20000
    unsigned int j, distances_nr = 1;
    int ibvindex, myindex;
#endif

    /* Fetch the NUMA-node latency matrix from hwloc.
       NOTE(review): hwloc_distances was initialized to NULL just
       above, so this guard is always true; kept as-is. */
    if (NULL == hwloc_distances) {
#if HWLOC_API_VERSION < 0x20000
        /* hwloc 1.x: the returned matrix is owned by the topology, so
           we must not free it */
        hwloc_distances =
            (struct hwloc_distances_s*)hwloc_get_whole_distance_matrix_by_type(opal_hwloc_topology,
                                                                               HWLOC_OBJ_NODE);
        /* If we got no info, just return 0 */
        if (NULL == hwloc_distances || NULL == hwloc_distances->latency) {
            goto out;
        }

#else
        /* hwloc 2.x: the structure must be released with
           hwloc_distances_release() (done on the "out" path below) */
        if (0 != hwloc_distances_get_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
                                             &distances_nr, &hwloc_distances,
                                             HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0) || 0 == distances_nr) {
            hwloc_distances = NULL;
            goto out;
        }
#endif
    }

    /* Next, find the NUMA node where this IBV device is located */
    ibv_cpuset = hwloc_bitmap_alloc();
    if (NULL == ibv_cpuset) {
        goto out;
    }
    if (0 != hwloc_ibv_get_device_cpuset(opal_hwloc_topology, dev, ibv_cpuset)) {
        goto out;
    }
    ibv_obj = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, ibv_cpuset);
    if (NULL == ibv_obj) {
        goto out;
    }

    /* Debug dump of the raw distance matrix */
    opal_output_verbose(5, opal_btl_base_framework.framework_output,
                        "hwloc_distances->nbobjs=%d", hwloc_distances->nbobjs);
#if HWLOC_API_VERSION < 0x20000
    for (i = 0; i < (int)(2 * hwloc_distances->nbobjs); i++) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "hwloc_distances->latency[%d]=%f", i, hwloc_distances->latency[i]);
    }
#else
    for (i = 0; i < (int)hwloc_distances->nbobjs; i++) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "hwloc_distances->values[%d]=%"PRIu64, i, hwloc_distances->values[i]);
    }
#endif

    /* If ibv_obj is a NUMA node or below, we're good. */
    switch (ibv_obj->type) {
    case HWLOC_OBJ_NODE:
    case HWLOC_OBJ_SOCKET:
#if HWLOC_API_VERSION < 0x20000
    case HWLOC_OBJ_CACHE:
#else
    case HWLOC_OBJ_L1CACHE:
    case HWLOC_OBJ_L2CACHE:
    case HWLOC_OBJ_L3CACHE:
    case HWLOC_OBJ_L4CACHE:
    case HWLOC_OBJ_L5CACHE:
#endif
    case HWLOC_OBJ_CORE:
    case HWLOC_OBJ_PU:
        /* Walk up to the enclosing NUMA node */
        while (NULL != ibv_obj && ibv_obj->type != HWLOC_OBJ_NODE) {
            ibv_obj = ibv_obj->parent;
        }
        break;

    default:
        /* If it's above a NUMA node, then I don't know how to compute
           the distance... */
        opal_output_verbose(5, opal_btl_base_framework.framework_output, "ibv_obj->type set to NULL");
        ibv_obj = NULL;
        break;
    }

    /* If we don't have an object for this ibv device, give up */
    if (NULL == ibv_obj) {
        goto out;
    }
#if HWLOC_API_VERSION >= 0x20000
    /* the new matrix format isn't quite as friendly, so we have to
     * do an exhaustive search to find the index of this object
     * in that array */
    ibvindex = -1;
    for (j=0; j < distances_nr; j++) {
        if (ibv_obj == hwloc_distances->objs[j]) {
            ibvindex = j;
            break;
        }
    }
    if (-1 == ibvindex) {
        OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
        goto out;
    }
#endif

    opal_output_verbose(5, opal_btl_base_framework.framework_output,
                        "ibv_obj->logical_index=%d", ibv_obj->logical_index);
    /* This function is only called if the process is bound, so let's
       find out where we are bound to. For the moment, we only care
       about the NUMA node to which we are bound. */
    my_cpuset = hwloc_bitmap_alloc();
    if (NULL == my_cpuset) {
        goto out;
    }
    if (0 != hwloc_get_cpubind(opal_hwloc_topology, my_cpuset, 0)) {
        goto out;
    }
    my_obj = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, my_cpuset);
    if (NULL == my_obj) {
        goto out;
    }

    /* If my_obj is a NUMA node or below, we're good. */
    switch (my_obj->type) {
    case HWLOC_OBJ_NODE:
    case HWLOC_OBJ_SOCKET:
#if HWLOC_API_VERSION < 0x20000
    case HWLOC_OBJ_CACHE:
#else
    case HWLOC_OBJ_L1CACHE:
    case HWLOC_OBJ_L2CACHE:
    case HWLOC_OBJ_L3CACHE:
    case HWLOC_OBJ_L4CACHE:
    case HWLOC_OBJ_L5CACHE:
#endif
    case HWLOC_OBJ_CORE:
    case HWLOC_OBJ_PU:
        /* Walk up to the enclosing NUMA node */
        while (NULL != my_obj && my_obj->type != HWLOC_OBJ_NODE) {
            my_obj = my_obj->parent;
        }
        if (NULL != my_obj) {
            opal_output_verbose(5, opal_btl_base_framework.framework_output,
                                "my_obj->logical_index=%d", my_obj->logical_index);
            /* Distance may be asymmetrical, so calculate both of them
               and take the max */
#if HWLOC_API_VERSION < 0x20000
            a = hwloc_distances->latency[my_obj->logical_index +
                                         (ibv_obj->logical_index *
                                          hwloc_distances->nbobjs)];
            b = hwloc_distances->latency[ibv_obj->logical_index +
                                         (my_obj->logical_index *
                                          hwloc_distances->nbobjs)];
#else
            /* the new matrix format isn't quite as friendly, so we have to
             * do an exhaustive search to find the index of this object
             * in that array */
            myindex = -1;
            for (j=0; j < distances_nr; j++) {
                if (my_obj == hwloc_distances->objs[j]) {
                    myindex = j;
                    break;
                }
            }
            if (-1 == myindex) {
                OPAL_ERROR_LOG(OPAL_ERR_NOT_FOUND);
                goto out;
            }
            a = (float)hwloc_distances->values[myindex + (ibvindex * hwloc_distances->nbobjs)];
            b = (float)hwloc_distances->values[ibvindex + (myindex * hwloc_distances->nbobjs)];
#endif
            distance = (a > b) ? a : b;
        }
        break;

    default:
        /* If the obj is above a NUMA node, then we're bound to more than
           one NUMA node. Find the max distance. */
        i = 0;
        for (node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology,
                                                            ibv_obj->cpuset,
                                                            HWLOC_OBJ_NODE, i);
             NULL != node_obj;
             node_obj = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology,
                                                            ibv_obj->cpuset,
                                                            HWLOC_OBJ_NODE, ++i)) {
#if HWLOC_API_VERSION < 0x20000
            a = hwloc_distances->latency[node_obj->logical_index +
                                         (ibv_obj->logical_index *
                                          hwloc_distances->nbobjs)];
            b = hwloc_distances->latency[ibv_obj->logical_index +
                                         (node_obj->logical_index *
                                          hwloc_distances->nbobjs)];
#else
            /* Bounds-check the flat index before reading the matrix */
            unsigned int j;
            j = node_obj->logical_index + (ibv_obj->logical_index * hwloc_distances->nbobjs);
            if (j < distances_nr) {
                a = (float)hwloc_distances->values[j];
            } else {
                goto out;
            }
            j = ibv_obj->logical_index + (node_obj->logical_index * hwloc_distances->nbobjs);
            if (j < distances_nr) {
                b = (float)hwloc_distances->values[j];
            } else {
                goto out;
            }
#endif
            a = (a > b) ? a : b;
            distance = (a > distance) ? a : distance;
        }
        break;
    }

 out:
    /* Common cleanup: free the cpusets, and (hwloc 2.x only) release
       the distance structure */
    if (NULL != ibv_cpuset) {
        hwloc_bitmap_free(ibv_cpuset);
    }
    if (NULL != my_cpuset) {
        hwloc_bitmap_free(my_cpuset);
    }

#if HWLOC_API_VERSION >= 0x20000
    if (NULL != hwloc_distances) {
        hwloc_distances_release(opal_hwloc_topology, hwloc_distances);
    }
#endif
    return distance;
}
2570
2571 static struct dev_distance *
sort_devs_by_distance(struct ibv_device ** ib_devs,int count)2572 sort_devs_by_distance(struct ibv_device **ib_devs, int count)
2573 {
2574 int i;
2575 struct dev_distance *devs = (struct dev_distance *) malloc(count * sizeof(struct dev_distance));
2576 if (NULL == devs) {
2577 return NULL;
2578 }
2579
2580 for (i = 0; i < count; i++) {
2581 devs[i].ib_dev = ib_devs[i];
2582 opal_output_verbose(5, opal_btl_base_framework.framework_output,
2583 "Checking distance from this process to device=%s", ibv_get_device_name(ib_devs[i]));
2584 /* If we're not bound, just assume that the device is close. */
2585 devs[i].distance = 0;
2586 if (opal_process_info.cpuset) {
2587 /* If this process is bound to one or more PUs, we can get
2588 an accurate distance. */
2589 devs[i].distance = get_ib_dev_distance(ib_devs[i]);
2590 }
2591 opal_output_verbose(5, opal_btl_base_framework.framework_output,
2592 "Process is %s: distance to device is %f",
2593 (opal_process_info.cpuset ? "bound" : "not bound"), devs[i].distance);
2594 }
2595
2596 qsort(devs, count, sizeof(struct dev_distance), compare_distance);
2597
2598 return devs;
2599 }
2600
2601
2602 /*
2603 * IB component initialization:
2604 * (1) read interface list from kernel and compare against component parameters
2605 * then create a BTL instance for selected interfaces
2606 * (2) setup IB listen socket for incoming connection attempts
2607 * (3) register BTL parameters with the MCA
2608 */
2609
2610 static mca_btl_base_module_t**
/*
 * Component initialization: probe the verbs device list, filter it by
 * the user's include/exclude and device-type MCA parameters, create one
 * openib BTL module per usable device/port, and return the array of
 * modules to the BTL framework.
 *
 * @param num_btl_modules [out] number of modules in the returned array
 * @param enable_progress_threads  not referenced by this function
 * @param enable_mpi_threads       not referenced by this function
 * @return malloc'ed array of initialized BTL modules, or NULL when no
 *         usable device was found (not fatal: the no_btls path still
 *         performs the modex send so peers learn this process has no
 *         openib BTLs).
 */
btl_openib_component_init(int *num_btl_modules,
                          bool enable_progress_threads,
                          bool enable_mpi_threads)
{
    struct ibv_device **ib_devs;
    mca_btl_base_module_t** btls = NULL;
    int i, ret, num_devs, length;
    opal_list_t btl_list;
    mca_btl_openib_module_t * openib_btl;
    mca_btl_base_selected_module_t* ib_selected;
    opal_list_item_t* item;
    mca_btl_openib_frag_init_data_t *init_data;
    struct dev_distance *dev_sorted;
    float distance;
    int index;
    bool found;
    mca_base_var_source_t source;
    int list_count = 0;

    /* initialization */
    *num_btl_modules = 0;
    num_devs = 0;

    /* If we got this far, then setup the memory alloc hook (because
       we're most likely going to be using this component). The hook
       is to be set up as early as possible in this function since we
       want most of the allocated resources be aligned.
     */
    opal_memory->memoryc_set_alignment(32, mca_btl_openib_module.super.btl_eager_limit);

    /* Per https://svn.open-mpi.org/trac/ompi/ticket/1305, check to
       see if $sysfsdir/class/infiniband exists.  If it does not,
       assume that the RDMA hardware drivers are not loaded, and
       therefore we don't want OpenFabrics verbs support in this OMPI
       job.  No need to print a warning. */
    if (!opal_common_verbs_check_basics()) {
        goto no_btls;
    }

    /* Read in INI files with device-specific parameters */
    if (OPAL_SUCCESS != (ret = opal_btl_openib_ini_init())) {
        goto no_btls;
    }

    /* If the user did not explicitly set btl_openib_max_inline_data,
       flag it (-2) so the per-device INI value is used instead. */
    index = mca_base_var_find("ompi", "btl", "openib", "max_inline_data");
    if (index >= 0) {
        if (OPAL_SUCCESS == mca_base_var_get_value(index, NULL, &source, NULL)) {
            if (-1 == mca_btl_openib_component.ib_max_inline_data &&
                MCA_BASE_VAR_SOURCE_DEFAULT == source) {
                /* If the user has not explicitly set this MCA parameter
                   use max_inline_data value specified in the
                   device-specific parameters INI file */
                mca_btl_openib_component.ib_max_inline_data = -2;
            }
        }
    }

    OBJ_CONSTRUCT(&mca_btl_openib_component.send_free_coalesced, opal_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_openib_component.send_user_free, opal_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_openib_component.recv_user_free, opal_free_list_t);

    /* init_data is handed to the free list as the frag-init context;
       NOTE(review): it is not freed on the error paths below --
       presumably reclaimed when the free list is destructed; confirm. */
    init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t));
    if (NULL == init_data) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        goto no_btls;
    }

    init_data->order = mca_btl_openib_component.rdma_qp;
    init_data->list = &mca_btl_openib_component.send_user_free;

    /* Align fragments on 8-byte boundaries (instead of 2) to fix bus errors that
       occur on some 32-bit platforms. Depending on the size of the fragment this
       will waste 2-6 bytes of space per frag. In most cases this shouldn't waste
       any space. */
    if (OPAL_SUCCESS != opal_free_list_init (
                &mca_btl_openib_component.send_user_free,
                sizeof(mca_btl_openib_put_frag_t), 8,
                OBJ_CLASS(mca_btl_openib_put_frag_t),
                0, 0,
                mca_btl_openib_component.ib_free_list_num,
                mca_btl_openib_component.ib_free_list_max,
                mca_btl_openib_component.ib_free_list_inc,
                NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) {
        goto no_btls;
    }

    /* Free list of fragments for RDMA get (receive side of user RDMA). */
    init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t));
    if (NULL == init_data) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        goto no_btls;
    }

    init_data->order = mca_btl_openib_component.rdma_qp;
    init_data->list = &mca_btl_openib_component.recv_user_free;

    if(OPAL_SUCCESS != opal_free_list_init (
                &mca_btl_openib_component.recv_user_free,
                sizeof(mca_btl_openib_get_frag_t), 8,
                OBJ_CLASS(mca_btl_openib_get_frag_t),
                0, 0,
                mca_btl_openib_component.ib_free_list_num,
                mca_btl_openib_component.ib_free_list_max,
                mca_btl_openib_component.ib_free_list_inc,
                NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) {
        goto no_btls;
    }

    /* Free list of coalesced send fragments.
       NOTE(review): unlike the two lists above, init_data->order is
       not assigned here -- confirm whether that is intentional. */
    init_data = (mca_btl_openib_frag_init_data_t *) malloc(sizeof(mca_btl_openib_frag_init_data_t));
    if (NULL == init_data) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        goto no_btls;
    }
    length = sizeof(mca_btl_openib_coalesced_frag_t);

    init_data->list = &mca_btl_openib_component.send_free_coalesced;

    if(OPAL_SUCCESS != opal_free_list_init (
                &mca_btl_openib_component.send_free_coalesced,
                length, 8, OBJ_CLASS(mca_btl_openib_coalesced_frag_t),
                0, 0, mca_btl_openib_component.ib_free_list_num,
                mca_btl_openib_component.ib_free_list_max,
                mca_btl_openib_component.ib_free_list_inc,
                NULL, 0, NULL, mca_btl_openib_frag_init, init_data)) {
        goto no_btls;
    }

    /* If fork support is requested, try to enable it */
    if (OPAL_SUCCESS != (ret = opal_common_verbs_fork_test())) {
        goto no_btls;
    }

    /* Parse the include and exclude lists, checking for errors */
    mca_btl_openib_component.if_include_list =
        mca_btl_openib_component.if_exclude_list =
        mca_btl_openib_component.if_list = NULL;

    /* The four include/exclude parameters are mutually exclusive;
       count how many were given so we can error out if more than one. */
    if (NULL != mca_btl_openib_component.if_include)
        list_count++;
    if (NULL != mca_btl_openib_component.if_exclude)
        list_count++;
    if (NULL != mca_btl_openib_component.ipaddr_include)
        list_count++;
    if (NULL != mca_btl_openib_component.ipaddr_exclude)
        list_count++;

    if (list_count > 1) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "specified include and exclude", true,
                       NULL == mca_btl_openib_component.if_include ?
                        "<not specified>" : mca_btl_openib_component.if_include,
                       NULL == mca_btl_openib_component.if_exclude ?
                        "<not specified>" : mca_btl_openib_component.if_exclude,
                       NULL == mca_btl_openib_component.ipaddr_include ?
                        "<not specified>" :mca_btl_openib_component.ipaddr_include,
                       NULL == mca_btl_openib_component.ipaddr_exclude ?
                        "<not specified>" :mca_btl_openib_component.ipaddr_exclude,
                       NULL);
        goto no_btls;
    } else if (NULL != mca_btl_openib_component.if_include) {
        mca_btl_openib_component.if_include_list =
            opal_argv_split(mca_btl_openib_component.if_include, ',');
        mca_btl_openib_component.if_list =
            opal_argv_copy(mca_btl_openib_component.if_include_list);
    } else if (NULL != mca_btl_openib_component.if_exclude) {
        mca_btl_openib_component.if_exclude_list =
            opal_argv_split(mca_btl_openib_component.if_exclude, ',');
        mca_btl_openib_component.if_list =
            opal_argv_copy(mca_btl_openib_component.if_exclude_list);
    }

    ib_devs = opal_ibv_get_device_list(&num_devs);

    if(0 == num_devs || NULL == ib_devs) {
        mca_btl_base_error_no_nics("OpenFabrics (openib)", "device");
        goto no_btls;
    }

    /* Sort the devices by distance from this process (closest first)
       so we prefer locally-attached HCAs. */
    dev_sorted = sort_devs_by_distance(ib_devs, num_devs);
    if (NULL == dev_sorted) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        goto no_btls;
    }

    OBJ_CONSTRUCT(&btl_list, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);

    distance = dev_sorted[0].distance;
    for (found = false, i = 0;
         i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls ||
                mca_btl_openib_component.ib_num_btls <
                mca_btl_openib_component.ib_max_btls); i++) {
        /* Once at least one BTL exists, stop at the first device that
           is measurably farther away than the closest one. */
        if (0 != mca_btl_openib_component.ib_num_btls &&
            (dev_sorted[i].distance - distance) > EPS) {
            opal_output_verbose(1, opal_btl_base_framework.framework_output,
                                "[rank=%d] openib: skipping device %s; it is too far away",
                                OPAL_PROC_MY_NAME.vpid,
                                ibv_get_device_name(dev_sorted[i].ib_dev));
            break;
        }

        /* Only take devices that match the type specified by
           btl_openib_device_type */
        switch (mca_btl_openib_component.device_type) {
        case BTL_OPENIB_DT_IB:
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
            if (IBV_TRANSPORT_IWARP == dev_sorted[i].ib_dev->transport_type) {
                BTL_VERBOSE(("openib: only taking infiniband devices -- skipping %s",
                             ibv_get_device_name(dev_sorted[i].ib_dev)));
                continue;
            }
#endif
            break;

        case BTL_OPENIB_DT_IWARP:
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
            if (IBV_TRANSPORT_IB == dev_sorted[i].ib_dev->transport_type) {
                BTL_VERBOSE(("openib: only taking iwarp devices -- skipping %s",
                             ibv_get_device_name(dev_sorted[i].ib_dev)));
                continue;
            }
#else
            /* verbs build has no transport_type field, so iWARP cannot
               be distinguished: warn and fall through. */
            opal_show_help("help-mpi-btl-openib.txt", "no iwarp support",
                           true);
#endif
            break;

        case BTL_OPENIB_DT_ALL:
            break;
        }

        found = true;
        ret = init_one_device(&btl_list, dev_sorted[i].ib_dev);
        if (OPAL_ERR_NOT_SUPPORTED == ret) {
            /* Device deliberately skipped (e.g. by configuration);
               remember that so we can suppress the warning later. */
            ++num_devices_intentionally_ignored;
            continue;
        } else if (OPAL_SUCCESS != ret) {
            free(dev_sorted);
            goto no_btls;
        }
    }
    free(dev_sorted);
    if (!found) {
        opal_show_help("help-mpi-btl-openib.txt", "no devices right type",
                       true, opal_process_info.nodename,
                       ((BTL_OPENIB_DT_IB == mca_btl_openib_component.device_type) ?
                        "InfiniBand" :
                        (BTL_OPENIB_DT_IWARP == mca_btl_openib_component.device_type) ?
                        "iWARP" : "<any>"));
        goto no_btls;
    }

    /* If we got back from checking all the devices and find that
       there are still items in the component.if_list, that means that
       they didn't exist.  Show an appropriate warning if the warning
       was not disabled. */

    if (0 != opal_argv_count(mca_btl_openib_component.if_list) &&
        mca_btl_openib_component.warn_nonexistent_if) {
        char *str = opal_argv_join(mca_btl_openib_component.if_list, ',');
        opal_show_help("help-mpi-btl-openib.txt", "nonexistent port",
                       true, opal_process_info.nodename,
                       ((NULL != mca_btl_openib_component.if_include) ?
                        "in" : "ex"), str);
        free(str);
    }

    if(0 == mca_btl_openib_component.ib_num_btls) {
        /* If there were unusable devices that weren't specifically
           ignored, warn about it */
        if (num_devices_intentionally_ignored < num_devs) {
            opal_show_help("help-mpi-btl-openib.txt",
                           "no active ports found", true,
                           opal_process_info.nodename);
        }
        goto no_btls;
    }

    /* Now that we know we have devices and ports that we want to use,
       init CPC components */
    if (OPAL_SUCCESS != (ret = opal_btl_openib_connect_base_init())) {
        goto no_btls;
    }

    /* Setup the BSRQ QP's based on the final value of
       mca_btl_openib_component.receive_queues. */
    if (OPAL_SUCCESS != setup_qps()) {
        goto no_btls;
    }
    /* SRQ/XRC QPs share one address hash table sized for all of them. */
    if (mca_btl_openib_component.num_srq_qps > 0 ||
        mca_btl_openib_component.num_xrc_qps > 0) {
        opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table;
        if(OPAL_SUCCESS != opal_hash_table_init(
                srq_addr_table, (mca_btl_openib_component.num_srq_qps +
                                 mca_btl_openib_component.num_xrc_qps) *
                mca_btl_openib_component.ib_num_btls)) {
            BTL_ERROR(("SRQ internal error. Failed to allocate SRQ addr hash table"));
            goto no_btls;
        }
    }

    /* For XRC:
     * from this point we know if MCA_BTL_XRC_ENABLED it true or false */

    /* Init XRC IB Addr hash table */
    if (MCA_BTL_XRC_ENABLED) {
        OBJ_CONSTRUCT(&mca_btl_openib_component.ib_addr_table,
                      opal_hash_table_t);
    }

    /* Allocate space for btl modules */
    mca_btl_openib_component.openib_btls =
        (mca_btl_openib_module_t **) malloc(sizeof(mca_btl_openib_module_t*) *
                                            mca_btl_openib_component.ib_num_btls);
    if(NULL == mca_btl_openib_component.openib_btls) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        goto no_btls;
    }
    btls = (struct mca_btl_base_module_t **)
        malloc(mca_btl_openib_component.ib_num_btls *
               sizeof(struct mca_btl_base_module_t*));
    if(NULL == btls) {
        BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
        goto no_btls;
    }

    /* Copy the btl module structs into a contiguous array and fully
       initialize them */
    i = 0;
    while (NULL != (item = opal_list_remove_first(&btl_list))) {
        ib_selected = (mca_btl_base_selected_module_t*)item;
        openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module;

        /* Search for a CPC that can handle this port */
        ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
        /* If we get NOT_SUPPORTED, then no CPC was found for this
           port.  But that's not a fatal error -- just keep going;
           let's see if we find any usable openib modules or not. */
        if (OPAL_ERR_NOT_SUPPORTED == ret) {
            continue;
        } else if (OPAL_SUCCESS != ret) {
            /* All others *are* fatal.  Note that we already did a
               show_help in the lower layer */
            goto no_btls;
        }

        /* Warn (but continue) if the user requested a larger message
           size than the hardware supports. */
        if (mca_btl_openib_component.max_hw_msg_size > 0 &&
            (uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
            BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
                mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
        }

        mca_btl_openib_component.openib_btls[i] = openib_btl;
        OBJ_RELEASE(ib_selected);
        btls[i] = &openib_btl->super;
        if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
            goto no_btls;
        }
        ++i;
    }
    /* If we got nothing, then error out */
    if (0 == i) {
        goto no_btls;
    }
    /* Otherwise reset to the number of openib modules that we
       actually got */
    mca_btl_openib_component.ib_num_btls = i;

    btl_openib_modex_send();

    *num_btl_modules = mca_btl_openib_component.ib_num_btls;
    opal_ibv_free_device_list(ib_devs);
    if (NULL != mca_btl_openib_component.if_include_list) {
        opal_argv_free(mca_btl_openib_component.if_include_list);
        mca_btl_openib_component.if_include_list = NULL;
    }
    if (NULL != mca_btl_openib_component.if_exclude_list) {
        opal_argv_free(mca_btl_openib_component.if_exclude_list);
        mca_btl_openib_component.if_exclude_list = NULL;
    }

#if OPAL_CUDA_SUPPORT
    /* GPUDirect RDMA was requested but leave_pinned is off: that
       combination is rejected here (after module setup). */
    if (mca_btl_openib_component.cuda_want_gdr && (0 == opal_leave_pinned)) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "CUDA_gdr_and_nopinned", true,
                       opal_process_info.nodename);
        goto no_btls;
    }
#endif /* OPAL_CUDA_SUPPORT */

    mca_btl_openib_component.memory_registration_verbose = opal_output_open(NULL);
    opal_output_set_verbosity (mca_btl_openib_component.memory_registration_verbose,
                               mca_btl_openib_component.memory_registration_verbose_level);

    /* setup the fork warning message as we are sensitive
     * to memory corruption issues when fork is called
     */
    opal_warn_fork();
    return btls;

 no_btls:
    /* If we fail early enough in the setup, we just modex around that
       there are no openib BTL's in this process and return NULL. */

    /* NOTE(review): resources acquired above (init_data allocations,
       ib_devs, any modules still on btl_list) are not released on this
       path -- presumably reclaimed at component close; worth
       confirming. */
    mca_btl_openib_component.ib_num_btls = 0;
    btl_openib_modex_send();
    if (NULL != btls) {
        free(btls);
    }
    return NULL;
}
3021
3022 /*
3023 * Progress the no_credits_pending_frags lists on all qp's
3024 */
/*
 * Drain the per-QP, per-priority no_credits_pending_frags lists of an
 * endpoint after new send/RDMA credits have arrived.
 *
 * @param ep  endpoint whose pending lists should be progressed; its
 *            endpoint_lock is taken for the duration of the call
 * @return OPAL_SUCCESS, or the first non-BUSY error returned by
 *         mca_btl_openib_endpoint_post_send()
 */
static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep)
{
    int qp, pri, rc, len;
    opal_list_item_t *frag;

    OPAL_THREAD_LOCK(&ep->endpoint_lock);

    /* Traverse all QPs and all priorities */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        for (pri = 0; pri < 2; ++pri) {
            /* Note that entries in the no_credits_pending_frags list
               may be eager RDMA or send fragments.  So be sure to
               check that we have at least 1 RDMA or send credit.

               This loop needs a little explaining.  :-\

               In the body of the loop, we call _endpoint_post_send().
               The frag will either be successfully sent, or it will
               be [re]added to the no_credit_pending_frags list.  So
               if we keep trying to drain the no_credits_pending_frag
               list, we could end up in an infinite loop.  So instead,
               we get the initial length of the list and ensure to run
               through every entry at least once.  This attempts to
               send *every* frag once and catches the case where a
               frag may be on the RDMA list, but because of
               coalescing, is now too big for RDMA and defaults over
               to sending -- but then we're out of send credits, so it
               doesn't go.  But if we *do* still have some RDMA
               credits and there are RDMA frags on the list behind
               this now-too-big frag, they'll get a chance to go.

               Specifically, the condition in this for loop is as follows:

               - len > 0: ensure to go through all entries in the list once
               - the 2nd part of the conditional checks to see if we
                 have any credits at all.  Specifically, do we have
                 any RDMA credits or any send credits, *or* are we on
                 an SRQ, in which case we define that we *always* have
                 credits (because the hardware will continually
                 retransmit for us).
            */
            for (len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
                 len > 0 &&
                     (ep->eager_rdma_remote.tokens > 0 ||
                      ep->qps[qp].u.pp_qp.sd_credits > 0 ||
                      !BTL_OPENIB_QP_TYPE_PP(qp)); --len) {
                frag = opal_list_remove_first(&ep->qps[qp].no_credits_pending_frags[pri]);
                /* The list had at least `len` entries when we started,
                   so removal cannot come up empty here. */
                assert (NULL != frag);

                /* If _endpoint_post_send() fails because of
                   RESOURCE_BUSY, then the frag was re-added to the
                   no_credits_pending list.  Specifically: either the
                   frag was initially an RDMA frag, but there were no
                   RDMA credits so it fell through the trying to send,
                   but we had no send credits and therefore re-added
                   the frag to the no_credits list, or the frag was a
                   send frag initially (and the same sequence
                   occurred, starting at the send frag out-of-credits
                   scenario).  In this case, just continue and try the
                   rest of the frags in the list.

                   If it fails because of another error, return the
                   error upward. */
                rc = mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
                if (OPAL_UNLIKELY(OPAL_SUCCESS != rc &&
                                  OPAL_ERR_RESOURCE_BUSY != rc)) {
                    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
                    return rc;
                }
            }
        }
    }

    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
    return OPAL_SUCCESS;
}
3101
/*
 * Retry RDMA get and put fragments that were queued on an endpoint
 * because resources (send WQEs and, for gets, get tokens) were
 * exhausted on QP "qp".  Each list is walked at most its initial
 * length, so a fragment that fails and is put back is not retried
 * endlessly within a single pass.
 */
void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
                                                  const int qp)
{
    mca_btl_openib_module_t *btl = ep->endpoint_btl;
    opal_list_item_t *item;
    size_t n, pending;
    int rc;

    /* Gets need both a send WQE and a get token. */
    pending = opal_list_get_size(&ep->pending_get_frags);
    for (n = 0; n < pending; n++) {
        if (!(ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0)) {
            break;
        }
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        item = opal_list_remove_first(&(ep->pending_get_frags));
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (NULL == item) {
            break;
        }
        rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)btl, ep,
                                          to_get_frag(item));
        if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
            /* Still short on resources: put the frag back at the head
               and stop; a later completion will trigger another pass. */
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_prepend (&ep->pending_get_frags, item);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
            break;
        }
    }

    /* Puts only need a send WQE. */
    pending = opal_list_get_size(&ep->pending_put_frags);
    for (n = 0; n < pending; n++) {
        if (!(ep->qps[qp].qp->sd_wqe > 0)) {
            break;
        }
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        item = opal_list_remove_first(&(ep->pending_put_frags));
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (NULL == item) {
            break;
        }
        rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)btl, ep,
                                          to_put_frag(item));
        if (OPAL_ERR_OUT_OF_RESOURCE == rc) {
            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_prepend (&ep->pending_put_frags, item);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
            break;
        }
    }
}
3143
/*
 * Process one received fragment: byte-swap the header if needed,
 * deliver the payload to the registered active-message callback
 * (unless the fragment is a pure credit message), return/repost the
 * receive buffer, and apply any piggybacked send/RDMA credits.
 *
 * @param openib_btl  module the fragment arrived on
 * @param ep          endpoint (peer) that sent the fragment
 * @param frag        the received fragment
 * @param byte_len    total bytes received, including the openib header
 * @return OPAL_SUCCESS, or an error from CTS cleanup or from
 *         progressing the no-credits pending lists
 */
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
                                      mca_btl_openib_endpoint_t *ep,
                                      mca_btl_openib_recv_frag_t *frag,
                                      size_t byte_len)
{
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    mca_btl_openib_header_t *hdr = frag->hdr;
    /* rqp: QP index the fragment was received on; cqp: QP index the
       piggybacked credits apply to (may differ for eager RDMA). */
    int rqp = to_base_frag(frag)->base.order, cqp;
    uint16_t rcredits = 0, credits;
    bool is_credit_msg;

    /* Peer uses a different byte order: convert the header. */
    if(ep->nbo) {
        BTL_OPENIB_HEADER_NTOH(*hdr);
    }

    /* advance the segment address past the header and subtract from the
     * length.
     * NOTE(review): only the length is adjusted here; the segment
     * address is presumably already positioned past the header when
     * the fragment is set up -- confirm. */
    des->des_segments->seg_len = byte_len - sizeof(mca_btl_openib_header_t);

    if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
        /* call registered callback */
        mca_btl_active_message_callback_t* reg;

#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
        /* The COPY_ASYNC flag should not be set */
        assert(0 == (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC));
#endif /* OPAL_CUDA_SUPPORT */
        reg = mca_btl_base_active_message_trigger + hdr->tag;
        reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
        if (des->des_flags & MCA_BTL_DES_FLAGS_CUDA_COPY_ASYNC) {
            /* Since ASYNC flag is set, we know this descriptor is being used
             * for asynchronous copy and cannot be freed yet.  Therefore, set
             * up callback for PML to call when complete, add argument into
             * descriptor and return. */
            des->des_cbfunc = btl_openib_handle_incoming_completion;
            to_in_frag(des)->endpoint = ep;
            return OPAL_SUCCESS;
        }
#endif /* OPAL_CUDA_SUPPORT */
        if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
            /* For eager RDMA frags, bits 11..14 of the credits word
               carry the QP number the credits belong to; strip them
               (keep bit 15 and the low 11 bits). */
            cqp = (hdr->credits >> 11) & 0x0f;
            hdr->credits &= 0x87ff;
        } else {
            cqp = rqp;
        }
        if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
            rcredits = BTL_OPENIB_CREDITS(hdr->credits);
            hdr->credits = 0;
        }
    } else {
        /* Pure credit message: the payload is a credits header rather
           than user data. */
        mca_btl_openib_rdma_credits_header_t *chdr =
            (mca_btl_openib_rdma_credits_header_t *) des->des_segments->seg_addr.pval;
        if(ep->nbo) {
            BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
        }
        cqp = chdr->qpn;
        rcredits = chdr->rdma_credits;
    }

    credits = hdr->credits;

    if(hdr->cm_seen)
        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);

    /* Now return fragment.  Don't touch hdr after this point! */
    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
        /* Eager RDMA frag: mark it remote and advance the ring tail
           past any contiguous frags already released, crediting one
           slot for each. */
        mca_btl_openib_eager_rdma_local_t *erl = &ep->eager_rdma_local;
        OPAL_THREAD_LOCK(&erl->lock);
        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
        while(erl->tail != erl->head) {
            mca_btl_openib_recv_frag_t *tf;
            tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(ep, erl->tail);
            if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(tf))
                break;
            OPAL_THREAD_ADD32(&erl->credits, 1);
            MCA_BTL_OPENIB_RDMA_NEXT_INDEX(erl->tail);
        }
        OPAL_THREAD_UNLOCK(&erl->lock);
    } else {
        if (is_cts_message(frag)) {
            /* If this was a CTS, free it here (it was
               malloc'ed+ibv_reg_mr'ed -- so it should *not* be
               FRAG_RETURN'ed). */
            int rc = opal_btl_openib_connect_base_free_cts(ep);
            if (OPAL_SUCCESS != rc) {
                return rc;
            }
        } else {
            /* Otherwise, FRAG_RETURN it and repost if necessary */
            MCA_BTL_IB_FRAG_RETURN(frag);
            if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
                if (OPAL_UNLIKELY(is_credit_msg)) {
                    OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
                } else {
                    OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
                }
                mca_btl_openib_endpoint_post_rr(ep, cqp);
            } else {
                mca_btl_openib_module_t *btl = ep->endpoint_btl;
                OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
                mca_btl_openib_post_srr(btl, rqp);
            }
        }
    }

    /* Send credits may only arrive on per-peer QPs. */
    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);

    /* If we got any credits (RDMA or send), then try to progress all
       the no_credits_pending_frags lists */
    if (rcredits > 0) {
        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
    }
    if (credits > 0) {
        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
    }
    if (rcredits + credits > 0) {
        int rc;

        if (OPAL_SUCCESS !=
            (rc = progress_no_credits_pending_frags(ep))) {
            return rc;
        }
    }

    send_credits(ep, cqp);

    return OPAL_SUCCESS;
}
3273
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_RECV */
/**
 * Called by the PML when the copying of the data out of the fragment
 * is complete.
 *
 * This is the deferred second half of btl_openib_handle_incoming():
 * it returns the receive fragment, reposts receive buffers, and
 * applies the piggybacked credits that were left pending while the
 * asynchronous CUDA copy was in flight.
 *
 * @param btl     module the fragment arrived on
 * @param ep      ignored; the endpoint is recovered from the
 *                descriptor (it was stashed there by handle_incoming)
 * @param des     descriptor of the completed receive fragment
 * @param status  completion status from the PML (not examined here)
 */
static void btl_openib_handle_incoming_completion(mca_btl_base_module_t* btl,
                                                  mca_btl_base_endpoint_t *ep,
                                                  mca_btl_base_descriptor_t* des,
                                                  int status)
{
    mca_btl_openib_recv_frag_t *frag = (mca_btl_openib_recv_frag_t *)des;
    mca_btl_openib_header_t *hdr = frag->hdr;
    /* rqp: QP the frag was received on; cqp: QP the credits apply to */
    int rqp = to_base_frag(frag)->base.order, cqp;
    uint16_t rcredits = 0, credits;

    /* Recover the endpoint stashed in the descriptor by
       btl_openib_handle_incoming(); the ep argument is overwritten. */
    ep = to_in_frag (des)->endpoint;

    OPAL_OUTPUT((-1, "handle_incoming_complete frag=%p", (void *)des));

    if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
        /* Bits 11..14 of the credits word carry the credit QP number
           for eager RDMA frags; strip them from the header. */
        cqp = (hdr->credits >> 11) & 0x0f;
        hdr->credits &= 0x87ff;
    } else {
        cqp = rqp;
    }
    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        rcredits = BTL_OPENIB_CREDITS(hdr->credits);
        hdr->credits = 0;
    }

    credits = hdr->credits;

    if(hdr->cm_seen)
        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);

    /* We should not be here with eager, control, or credit messages */
    assert(openib_frag_type(frag) != MCA_BTL_OPENIB_FRAG_EAGER_RDMA);
    assert(0 == is_cts_message(frag));
    assert(0 == is_credit_message(frag));
    /* HACK - clear out flags.  Must be better way */
    des->des_flags = 0;
    /* Otherwise, FRAG_RETURN it and repost if necessary */
    MCA_BTL_IB_FRAG_RETURN(frag);
    if (BTL_OPENIB_QP_TYPE_PP(rqp)) {
        OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
        mca_btl_openib_endpoint_post_rr(ep, cqp);
    } else {
        /* NOTE(review): this local "btl" shadows the function's btl
           parameter; both refer to openib modules here but the
           shadowing is easy to misread. */
        mca_btl_openib_module_t *btl = ep->endpoint_btl;
        OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
        mca_btl_openib_post_srr(btl, rqp);
    }

    /* Send credits may only arrive on per-peer QPs. */
    assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);

    /* If we got any credits (RDMA or send), then try to progress all
       the no_credits_pending_frags lists */
    if (rcredits > 0) {
        OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
    }
    if (credits > 0) {
        OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
    }
    if (rcredits + credits > 0) {
        int rc;

        if (OPAL_SUCCESS !=
            (rc = progress_no_credits_pending_frags(ep))) {
            /* This is a fatal issue so call into PML and let it know. */
            mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
            openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
                                 NULL, NULL);
            return;
        }
    }

    send_credits(ep, cqp);

}
#endif /* OPAL_CUDA_SUPPORT */
3353
btl_openib_component_status_to_string(enum ibv_wc_status status)3354 static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
3355 {
3356 switch(status) {
3357 case IBV_WC_SUCCESS:
3358 return "SUCCESS";
3359 break;
3360 case IBV_WC_LOC_LEN_ERR:
3361 return "LOCAL LENGTH ERROR";
3362 break;
3363 case IBV_WC_LOC_QP_OP_ERR:
3364 return "LOCAL QP OPERATION ERROR";
3365 break;
3366 case IBV_WC_LOC_PROT_ERR:
3367 return "LOCAL PROTOCOL ERROR";
3368 break;
3369 case IBV_WC_WR_FLUSH_ERR:
3370 return "WORK REQUEST FLUSHED ERROR";
3371 break;
3372 case IBV_WC_MW_BIND_ERR:
3373 return "MEMORY WINDOW BIND ERROR";
3374 break;
3375 case IBV_WC_BAD_RESP_ERR:
3376 return "BAD RESPONSE ERROR";
3377 break;
3378 case IBV_WC_LOC_ACCESS_ERR:
3379 return "LOCAL ACCESS ERROR";
3380 break;
3381 case IBV_WC_REM_INV_REQ_ERR:
3382 return "INVALID REQUEST ERROR";
3383 break;
3384 case IBV_WC_REM_ACCESS_ERR:
3385 return "REMOTE ACCESS ERROR";
3386 break;
3387 case IBV_WC_REM_OP_ERR:
3388 return "REMOTE OPERATION ERROR";
3389 break;
3390 case IBV_WC_RETRY_EXC_ERR:
3391 return "RETRY EXCEEDED ERROR";
3392 break;
3393 case IBV_WC_RNR_RETRY_EXC_ERR:
3394 return "RECEIVER NOT READY RETRY EXCEEDED ERROR";
3395 break;
3396 case IBV_WC_LOC_RDD_VIOL_ERR:
3397 return "LOCAL RDD VIOLATION ERROR";
3398 break;
3399 case IBV_WC_REM_INV_RD_REQ_ERR:
3400 return "INVALID READ REQUEST ERROR";
3401 break;
3402 case IBV_WC_REM_ABORT_ERR:
3403 return "REMOTE ABORT ERROR";
3404 break;
3405 case IBV_WC_INV_EECN_ERR:
3406 return "INVALID EECN ERROR";
3407 break;
3408 case IBV_WC_INV_EEC_STATE_ERR:
3409 return "INVALID EEC STATE ERROR";
3410 break;
3411 case IBV_WC_FATAL_ERR:
3412 return "FATAL ERROR";
3413 break;
3414 case IBV_WC_RESP_TIMEOUT_ERR:
3415 return "RESPONSE TIMEOUT ERROR";
3416 break;
3417 case IBV_WC_GENERAL_ERR:
3418 return "GENERAL ERROR";
3419 break;
3420 default:
3421 return "STATUS UNDEFINED";
3422 break;
3423 }
3424 }
3425
3426 static void
progress_pending_frags_wqe(mca_btl_base_endpoint_t * ep,const int qpn)3427 progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn)
3428 {
3429 int ret;
3430 opal_list_item_t *frag;
3431 mca_btl_openib_qp_t *qp = ep->qps[qpn].qp;
3432
3433 OPAL_THREAD_LOCK(&ep->endpoint_lock);
3434 for(int i = 0; i < 2; i++) {
3435 while(qp->sd_wqe > 0) {
3436 mca_btl_base_endpoint_t *tmp_ep;
3437 frag = opal_list_remove_first(&ep->qps[qpn].no_wqe_pending_frags[i]);
3438 if(NULL == frag)
3439 break;
3440 assert(0 == frag->opal_list_item_refcount);
3441 tmp_ep = to_com_frag(frag)->endpoint;
3442 ret = mca_btl_openib_endpoint_post_send(tmp_ep, to_send_frag(frag));
3443 if (OPAL_SUCCESS != ret) {
3444 /* NTH: this handles retrying if we are out of credits but other errors are not
3445 * handled (maybe abort?). */
3446 if (OPAL_ERR_RESOURCE_BUSY != ret) {
3447 opal_list_prepend (&ep->qps[qpn].no_wqe_pending_frags[i], (opal_list_item_t *) frag);
3448 }
3449 break;
3450 }
3451 }
3452 }
3453 OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
3454 }
3455
/*
 * Retry fragments queued on a shared-receive-queue (SRQ/XRC) QP while
 * send credits remain.  Per-peer QPs keep their pending frags on the
 * endpoint instead, hence the assertion.
 */
static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl,
                                       const int qp)
{
    assert(BTL_OPENIB_QP_TYPE_SRQ(qp) || BTL_OPENIB_QP_TYPE_XRC(qp));

    for (int prio = 0; prio < 2; ++prio) {
        while (openib_btl->qps[qp].u.srq_qp.sd_credits > 0) {
            opal_list_item_t *item;

            /* Only the list removal is done under the module lock;
               the send itself happens without holding it. */
            OPAL_THREAD_LOCK(&openib_btl->ib_lock);
            item = opal_list_remove_first(
                    &openib_btl->qps[qp].u.srq_qp.pending_frags[prio]);
            OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);

            if (NULL == item) {
                break;
            }

            mca_btl_openib_endpoint_send(to_com_frag(item)->endpoint,
                                         to_send_frag(item));
        }
    }
}
3479
/* Printable names for the two completion queues, indexed by the
   high/low-priority CQ index (presumably used for debug/log output;
   the uses are further down in the file). */
static char *cq_name[] = {"HP CQ", "LP CQ"};
handle_wc(mca_btl_openib_device_t * device,const uint32_t cq,struct ibv_wc * wc)3481 static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
3482 struct ibv_wc *wc)
3483 {
3484 static int flush_err_printed[] = {0, 0};
3485 mca_btl_openib_com_frag_t* frag;
3486 mca_btl_base_descriptor_t *des;
3487 mca_btl_openib_endpoint_t* endpoint;
3488 mca_btl_openib_module_t *openib_btl = NULL;
3489 const opal_proc_t* remote_proc = NULL;
3490 int qp, btl_ownership;
3491 int n;
3492
3493 des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
3494 frag = to_com_frag(des);
3495
3496 /* For receive fragments "order" contains QP idx the fragment was posted
3497 * to. For send fragments "order" contains QP idx the fragment was send
3498 * through */
3499 qp = des->order;
3500
3501 if (IBV_WC_RECV == wc->opcode && (wc->wc_flags & IBV_WC_WITH_IMM)) {
3502 #if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
3503 wc->imm_data = ntohl(wc->imm_data);
3504 #endif
3505 frag->endpoint = (mca_btl_openib_endpoint_t*)
3506 opal_pointer_array_get_item(device->endpoints, wc->imm_data);
3507 }
3508
3509 endpoint = frag->endpoint;
3510
3511 assert (NULL != endpoint);
3512
3513 openib_btl = endpoint->endpoint_btl;
3514
3515 if(wc->status != IBV_WC_SUCCESS) {
3516 OPAL_OUTPUT((-1, "Got WC: ERROR"));
3517 goto error;
3518 }
3519
3520 /* Handle work completions */
3521 switch(wc->opcode) {
3522 case IBV_WC_RDMA_READ:
3523 case IBV_WC_COMP_SWAP:
3524 case IBV_WC_FETCH_ADD:
3525 OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE"));
3526
3527 OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
3528
3529 mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
3530
3531 /* check if atomic result needs to be byte swapped (mlx5) */
3532 if (openib_btl->atomic_ops_be && IBV_WC_RDMA_READ != wc->opcode) {
3533 *((int64_t *) frag->sg_entry.addr) = ntoh64 (*((int64_t *) frag->sg_entry.addr));
3534 }
3535
3536 get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
3537 get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
3538 OPAL_SUCCESS);
3539 /* fall through */
3540 case IBV_WC_RDMA_WRITE:
3541 if (MCA_BTL_OPENIB_FRAG_SEND_USER == openib_frag_type(des)) {
3542 mca_btl_openib_put_frag_t *put_frag = to_put_frag(des);
3543
3544 put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
3545 put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data,
3546 OPAL_SUCCESS);
3547 put_frag->cb.func = NULL;
3548 }
3549 /* fall through */
3550 case IBV_WC_SEND:
3551 OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND"));
3552 if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
3553 opal_list_item_t *i;
3554 while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
3555 btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
3556 to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
3557 &to_base_frag(i)->base, OPAL_SUCCESS);
3558 if( btl_ownership ) {
3559 mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
3560 }
3561 }
3562 }
3563 /* Process a completed send/put/get */
3564 btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
3565 if (des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
3566 des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_SUCCESS);
3567 }
3568 if( btl_ownership ) {
3569 mca_btl_openib_free(&openib_btl->super, des);
3570 }
3571
3572 /* return send wqe */
3573 qp_put_wqe(endpoint, qp);
3574
3575 /* return wqes that were sent before this frag */
3576 n = qp_frag_to_wqe(endpoint, qp, to_com_frag(des));
3577
3578 if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
3579 OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1+n);
3580
3581 /* new SRQ credit available. Try to progress pending frags*/
3582 progress_pending_frags_srq(openib_btl, qp);
3583 }
3584 /* new wqe or/and get token available. Try to progress pending frags */
3585 progress_pending_frags_wqe(endpoint, qp);
3586 mca_btl_openib_frag_progress_pending_put_get(endpoint, qp);
3587 break;
3588 case IBV_WC_RECV:
3589 OPAL_OUTPUT((-1, "Got WC: RDMA_RECV, qp %d, src qp %d, WR ID %" PRIx64,
3590 wc->qp_num, wc->src_qp, wc->wr_id));
3591
3592 /* Process a RECV */
3593 if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag),
3594 wc->byte_len) != OPAL_SUCCESS) {
3595 openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3596 NULL, NULL);
3597 break;
3598 }
3599
3600 /* decide if it is time to setup an eager rdma channel */
3601 if(!endpoint->eager_rdma_local.base.pval && endpoint->use_eager_rdma &&
3602 wc->byte_len < mca_btl_openib_component.eager_limit &&
3603 openib_btl->eager_rdma_channels <
3604 mca_btl_openib_component.max_eager_rdma &&
3605 OPAL_THREAD_ADD32(&endpoint->eager_recv_count, 1) ==
3606 mca_btl_openib_component.eager_rdma_threshold) {
3607 mca_btl_openib_endpoint_connect_eager_rdma(endpoint);
3608 }
3609 break;
3610 default:
3611 BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode));
3612 if(openib_btl)
3613 openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3614 NULL, NULL);
3615 break;
3616 }
3617
3618 return;
3619
3620 error:
3621 if(endpoint->endpoint_proc && endpoint->endpoint_proc->proc_opal)
3622 remote_proc = endpoint->endpoint_proc->proc_opal;
3623
3624 /* For iWARP, the TCP connection is tied to the QP once the QP is
3625 * in RTS. And destroying the QP is thus tied to connection
3626 * teardown for iWARP. To destroy the connection in iWARP you
3627 * must move the QP out of RTS, either into CLOSING for a nice
3628 * graceful close (e.g., via rdma_disconnect()), or to ERROR if
3629 * you want to be rude (e.g., just destroying the QP without
3630 * disconnecting first). In both cases, all pending non-completed
3631 * SQ and RQ WRs will automatically be flushed.
3632 */
3633 #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
3634 if (IBV_WC_WR_FLUSH_ERR == wc->status &&
3635 IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) {
3636 return;
3637 }
3638 #endif
3639
3640 if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) {
3641 BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
3642 "status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d",
3643 cq_name[cq], btl_openib_component_status_to_string(wc->status),
3644 wc->status, wc->wr_id,
3645 wc->opcode, wc->vendor_err, qp));
3646 }
3647
3648 if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
3649 IBV_WC_RETRY_EXC_ERR == wc->status) {
3650 const char *peer_hostname;
3651 peer_hostname = opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal);
3652 const char *device_name =
3653 ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
3654
3655 if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) {
3656 // The show_help checker script gets confused if the topic
3657 // is an inline logic check, so separate it into two calls
3658 // to show_help.
3659 if (BTL_OPENIB_QP_TYPE_PP(qp)) {
3660 opal_show_help("help-mpi-btl-openib.txt",
3661 "pp rnr retry exceeded",
3662 true,
3663 opal_process_info.nodename,
3664 device_name,
3665 peer_hostname);
3666 } else {
3667 opal_show_help("help-mpi-btl-openib.txt",
3668 "srq rnr retry exceeded",
3669 true,
3670 opal_process_info.nodename,
3671 device_name,
3672 peer_hostname);
3673 }
3674 } else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
3675 opal_show_help("help-mpi-btl-openib.txt",
3676 "pp retry exceeded", true,
3677 opal_process_info.nodename,
3678 device_name, peer_hostname);
3679 }
3680 }
3681
3682 if(openib_btl)
3683 openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3684 (struct opal_proc_t*)remote_proc, NULL);
3685 }
3686
/*
 * Poll the device's completion queues and dispatch each completed work
 * request to handle_wc().
 *
 * The device has two CQs, indexed 0 and 1; the one matching
 * BTL_OPENIB_HP_CQ is the high-priority CQ (presumably index 0, since it
 * is visited first -- TODO confirm against the ib_cq[] setup code).
 * Polling starts on cq 0 and only advances to the next CQ once the
 * current one drains, bounded by cq_poll_progress iterations on the
 * high-priority CQ per call.
 *
 * @param device  device whose CQs are polled
 * @param count   running completion-activity counter from the caller;
 *                incremented once per non-empty ibv_poll_cq() batch and
 *                returned (also returned unchanged on poll error)
 */
static int poll_device(mca_btl_openib_device_t* device, int count)
{
    int ne = 0, cq;
    uint32_t hp_iter = 0;
    struct ibv_wc wc[MCA_BTL_OPENIB_CQ_POLL_BATCH_DEFAULT];
    int i;

    /* Will be set to true below if anything completed on the HP CQ;
     * progress_one_device() reads this to bias future polling. */
    device->pollme = false;
    for(cq = 0; cq < 2 && hp_iter < mca_btl_openib_component.cq_poll_progress;)
    {
        ne = ibv_poll_cq(device->ib_cq[cq], mca_btl_openib_component.cq_poll_batch, wc);
        if(0 == ne) {
            /* don't check low prio cq if there was something in high prio cq,
             * but for each cq_poll_ratio hp cq polls poll lp cq once */
            if(count && device->hp_cq_polls)
                break;
            /* Current CQ is empty: move on to the next CQ and reset the
             * HP-polls-before-LP-poll budget. */
            cq++;
            device->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio;
            continue;
        }

        if(ne < 0)
            goto error;

        /* One more non-empty batch processed (note: counts batches, not
         * individual completions). */
        count++;

        if(BTL_OPENIB_HP_CQ == cq) {
            device->pollme = true;
            hp_iter++;
            device->hp_cq_polls--;
        }

        /* Dispatch every completion in this batch. */
        for (i = 0; i < ne; i++)
            handle_wc(device, cq, &wc[i]);
    }

    return count;
error:
    /* ibv_poll_cq() failed; report and return the progress made so far. */
    BTL_ERROR(("error polling %s with %d errno says %s", cq_name[cq], ne,
               strerror(errno)));
    return count;
}
3729
#if OPAL_ENABLE_PROGRESS_THREADS
/*
 * Body of the optional per-device progress thread.  Blocks on the device's
 * CQ completion channel, re-arms CQ notification, acknowledges the event,
 * and then drains the device via poll_device() until it reports no
 * activity.  Loops until device->progress is cleared.
 *
 * @param arg  opal_thread_t whose t_arg is the mca_btl_openib_device_t
 * @return     PTHREAD_CANCELED when device->progress goes false
 */
void* mca_btl_openib_progress_thread(opal_object_t* arg)
{
    mca_btl_openib_device_t *device = ((opal_thread_t *) arg)->t_arg;
    struct ibv_cq *event_cq;
    void *event_context;

    /* Run with cancellation enabled and asynchronous so the thread can be
     * killed at any point. */
    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

    opal_output(-1, "WARNING: the openib btl progress thread code *does not yet work*. Your run is likely to hang, crash, break the kitchen sink, and/or eat your cat. You have been warned.");

    while (device->progress) {
#if 0
        while(ompi_progress_threads()) {
            while(ompi_progress_threads())
                sched_yield();
            usleep(100); /* give app a chance to re-enter library */
        }
#endif

        /* Block until some CQ on this device signals a completion event. */
        if (ibv_get_cq_event(device->ib_channel, &event_cq, &event_context)) {
            BTL_ERROR(("Failed to get CQ event with error %s",
                       strerror(errno)));
        }
        /* Re-arm notification before polling so no completion is missed. */
        if (ibv_req_notify_cq(event_cq, 0)) {
            BTL_ERROR(("Couldn't request CQ notification with error %s",
                       strerror(errno)));
        }

        ibv_ack_cq_events(event_cq, 1);

        /* Drain the device completely. */
        while (poll_device(device, 0)) {
        }
    }

    return PTHREAD_CANCELED;
}
#endif
3769
/*
 * Make progress on a single device: first scan all of its eager-RDMA
 * receive rings for newly arrived fragments, then (conditionally) poll the
 * device's completion queues.
 *
 * @param device  device to progress
 * @return        number of fragments/completions processed this call
 */
static int progress_one_device(mca_btl_openib_device_t *device)
{
    int i, c, count = 0, ret;
    mca_btl_openib_recv_frag_t* frag;
    mca_btl_openib_endpoint_t* endpoint;
    uint32_t non_eager_rdma_endpoints = 0;

    c = device->eager_rdma_buffers_count;
    /* Endpoints without an eager-RDMA channel (plus a pending-poll hint
     * from poll_device) force a CQ poll below even if eager RDMA made
     * progress. */
    non_eager_rdma_endpoints += (device->non_eager_rdma_endpoints + device->pollme);

    /* Scan each endpoint's local eager-RDMA ring for an arrived fragment. */
    for(i = 0; i < c; i++) {
        endpoint = device->eager_rdma_buffers[i];

        if(!endpoint)
            continue;

        OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
        frag = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint,
                endpoint->eager_rdma_local.head);

        if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(frag)) {
            uint32_t size;
            mca_btl_openib_module_t *btl = endpoint->endpoint_btl;

            /* Ensure the footer contents written by the RDMA are visible
             * before we read the rest of the fragment. */
            opal_atomic_mb();

            /* Peer uses a different byte order: swap the footer fields. */
            if(endpoint->nbo) {
                BTL_OPENIB_FOOTER_NTOH(*frag->ftr);
            }
            size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
#if OPAL_ENABLE_DEBUG
            /* Debug-only ordering check on the eager RDMA sequence number. */
            if (frag->ftr->seq != endpoint->eager_rdma_local.seq)
                BTL_ERROR(("Eager RDMA wrong SEQ: received %d expected %d",
                           frag->ftr->seq,
                           endpoint->eager_rdma_local.seq));
            endpoint->eager_rdma_local.seq++;
#endif
            /* Advance the ring head before dropping the lock. */
            MCA_BTL_OPENIB_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.head);

            OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
            /* The footer sits at the end of the fragment; work backwards
             * (size + padding) to locate the header, then the payload. */
            frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
                    size - BTL_OPENIB_FTR_PADDING(size) + sizeof(mca_btl_openib_footer_t));
            to_base_frag(frag)->segment.seg_addr.pval =
                ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);

            ret = btl_openib_handle_incoming(btl, to_com_frag(frag)->endpoint,
                    frag, size - sizeof(mca_btl_openib_footer_t));
            if (ret != OPAL_SUCCESS) {
                btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL);
                return 0;
            }

            count++;
        } else
            OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
    }

    device->eager_rdma_polls--;

    /* Poll the CQs if eager RDMA found nothing, if some endpoints do not
     * use eager RDMA, or if the eager-RDMA poll budget is exhausted
     * (guarantees the CQs are polled at least every
     * eager_rdma_poll_ratio calls). */
    if(0 == count || non_eager_rdma_endpoints != 0 || !device->eager_rdma_polls) {
        count += poll_device(device, count);
        device->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio;
    }

    return count;
}
3836
3837 /*
3838 * IB component progress.
3839 */
btl_openib_component_progress(void)3840 static int btl_openib_component_progress(void)
3841 {
3842 int i;
3843 int count = 0;
3844
3845 if(OPAL_UNLIKELY(mca_btl_openib_component.use_async_event_thread &&
3846 mca_btl_openib_component.error_counter)) {
3847 goto error;
3848 }
3849
3850 for(i = 0; i < mca_btl_openib_component.devices_count; i++) {
3851 mca_btl_openib_device_t *device =
3852 (mca_btl_openib_device_t *) opal_pointer_array_get_item(&mca_btl_openib_component.devices, i);
3853 if (NULL != device) {
3854 count += progress_one_device(device);
3855 }
3856 }
3857
3858 #if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
3859 /* Check to see if there are any outstanding dtoh CUDA events that
3860 * have completed. If so, issue the PML callbacks on the fragments.
3861 * The only thing that gets completed here are asynchronous copies
3862 * so there is no need to free anything.
3863 */
3864 {
3865 int local_count = 0;
3866 mca_btl_base_descriptor_t *frag;
3867 while (local_count < 10 && (1 == progress_one_cuda_dtoh_event(&frag))) {
3868 OPAL_OUTPUT((-1, "btl_openib: event completed on frag=%p", (void *)frag));
3869 frag->des_cbfunc(NULL, NULL, frag, OPAL_SUCCESS);
3870 local_count++;
3871 }
3872 count += local_count;
3873 }
3874 if (count > 0) {
3875 OPAL_OUTPUT((-1, "btl_openib: DONE with openib progress, count=%d", count));
3876 }
3877 #endif /* OPAL_CUDA_SUPPORT */
3878
3879 return count;
3880
3881 error:
3882 /* Set the fatal counter to zero */
3883 mca_btl_openib_component.error_counter = 0;
3884 /* Lets find all error events */
3885 for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
3886 mca_btl_openib_module_t* openib_btl =
3887 mca_btl_openib_component.openib_btls[i];
3888 if(openib_btl->device->got_fatal_event) {
3889 openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
3890 NULL, NULL);
3891 }
3892 if(openib_btl->device->got_port_event) {
3893 /* These are non-fatal so just ignore it. */
3894 openib_btl->device->got_port_event = false;
3895 }
3896 }
3897 return count;
3898 }
3899
/*
 * Replenish receive buffers on the shared receive queue (SRQ) backing
 * queue pair index 'qp', refilling from the current posted count up to
 * rd_curr_num once the posted count drops to the low-water mark
 * (rd_low_local).
 *
 * Must not be called for per-peer (PP) QPs -- asserted below.
 *
 * @param openib_btl  module owning the SRQ
 * @param qp          QP index whose SRQ is replenished
 * @return            OPAL_SUCCESS, or OPAL_ERROR if posting or re-arming
 *                    the SRQ limit event fails
 */
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
{
    int rd_low_local = openib_btl->qps[qp].u.srq_qp.rd_low_local;
    int rd_curr_num = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
    int num_post, i, rc;
    struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;

    /* Only SRQ-based QPs have a shared receive queue to post to. */
    assert(!BTL_OPENIB_QP_TYPE_PP(qp));

    OPAL_THREAD_LOCK(&openib_btl->ib_lock);
    /* Still above the low-water mark: nothing to do. */
    if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low_local) {
        OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
        return OPAL_SUCCESS;
    }
    num_post = rd_curr_num - openib_btl->qps[qp].u.srq_qp.rd_posted;

    if (0 == num_post) {
        OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
        return OPAL_SUCCESS;
    }

    /* Take num_post fragments from the free list and chain their receive
     * work requests into one linked list for a single post call. */
    for(i = 0; i < num_post; i++) {
        opal_free_list_item_t* item;
        item = opal_free_list_wait (&openib_btl->device->qps[qp].recv_free);
        to_base_frag(item)->base.order = qp;
        to_com_frag(item)->endpoint = NULL;
        if(NULL == wr)
            wr = wr_list = &to_recv_frag(item)->rd_desc;
        else
            wr = wr->next = &to_recv_frag(item)->rd_desc;
    }

    /* Terminate the chain (num_post >= 1 here, so wr is non-NULL). */
    wr->next = NULL;

    rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
    if(OPAL_LIKELY(0 == rc)) {
        struct ibv_srq_attr srq_attr;

        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);

        /* If a previous SRQ-limit event fired, re-arm the limit so the
         * async thread is notified again when the SRQ runs low. */
        if(true == openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag) {
            srq_attr.max_wr = openib_btl->qps[qp].u.srq_qp.rd_curr_num;
            srq_attr.max_sge = 1;
            srq_attr.srq_limit = mca_btl_openib_component.qp_infos[qp].u.srq_qp.srq_limit;

            openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
            if(ibv_modify_srq(openib_btl->qps[qp].u.srq_qp.srq, &srq_attr, IBV_SRQ_LIMIT)) {
                BTL_ERROR(("Failed to request limit event for srq on %s. "
                           "Fatal error, stoping asynch event thread",
                           ibv_get_device_name(openib_btl->device->ib_dev)));

                OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
                return OPAL_ERROR;
            }
        }

        OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
        return OPAL_SUCCESS;
    }

    /* Post failed: bad_wr points at the first WR that was not consumed;
     * count how many were accepted before it, for the error message.
     * NOTE(review): the un-posted fragments are not returned to the free
     * list here -- presumably leaked on this path; TODO confirm. */
    for(i = 0; wr_list && wr_list != bad_wr; i++, wr_list = wr_list->next);

    BTL_ERROR(("error posting receive descriptors to shared receive "
               "queue %d (%d from %d)", qp, i, num_post));

    OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
    return OPAL_ERROR;
}
3968
3969
/* Bookkeeping for a callback deferred to the main event loop via
 * mca_btl_openib_run_in_main(); freed by mca_btl_openib_run_once_cb()
 * after the callback runs. */
struct mca_btl_openib_event_t {
    opal_event_t super;      /* event registered with opal_sync_event_base */
    void *(*fn)(void *);     /* function to invoke from the event loop */
    void *arg;               /* argument passed to fn */
    opal_event_t *event;     /* NOTE(review): never set or read in the
                              * visible code -- possibly vestigial; confirm
                              * before removing */
};

typedef struct mca_btl_openib_event_t mca_btl_openib_event_t;
3978
mca_btl_openib_run_once_cb(int fd,int flags,void * context)3979 static void *mca_btl_openib_run_once_cb (int fd, int flags, void *context)
3980 {
3981 mca_btl_openib_event_t *event = (mca_btl_openib_event_t *) context;
3982 void *ret;
3983
3984 ret = event->fn (event->arg);
3985 opal_event_del (&event->super);
3986 free (event);
3987 return ret;
3988 }
3989
mca_btl_openib_run_in_main(void * (* fn)(void *),void * arg)3990 int mca_btl_openib_run_in_main (void *(*fn)(void *), void *arg)
3991 {
3992 mca_btl_openib_event_t *event = malloc (sizeof (mca_btl_openib_event_t));
3993
3994 if (OPAL_UNLIKELY(NULL == event)) {
3995 return OPAL_ERR_OUT_OF_RESOURCE;
3996 }
3997
3998 event->fn = fn;
3999 event->arg = arg;
4000
4001 opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
4002 mca_btl_openib_run_once_cb, event);
4003
4004 opal_event_active (&event->super, OPAL_EV_READ, 1);
4005
4006 return OPAL_SUCCESS;
4007 }
4008