1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
4 * Copyright (c) 2007-2008 Chelsio, Inc. All rights reserved.
5 * Copyright (c) 2008 Mellanox Technologies. All rights reserved.
6 * Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
7 * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
8 * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights
9 * reserved.
10 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
11 * Copyright (c) 2014 The University of Tennessee and The University
12 * of Tennessee Research Foundation. All rights
13 * reserved.
14 *
15 * $COPYRIGHT$
16 *
17 * Additional copyrights may follow
18 *
19 * $HEADER$
20 */
21
22 #include "opal_config.h"
23
24 #include <rdma/rdma_cma.h>
25 #ifdef HAVE_UNISTD_H
26 #include <unistd.h>
27 #endif
28 #include <sys/types.h>
29 #ifdef HAVE_SYS_SOCKET_H
30 #include <sys/socket.h>
31 #endif
32 #ifdef HAVE_SYS_IOCTL_H
33 #include <sys/ioctl.h>
34 #endif
35 #ifdef HAVE_ARPA_INET_H
36 #include <arpa/inet.h>
37 #endif
38 #ifdef HAVE_NETINET_IN_H
39 #include <netinet/in.h>
40 #endif
41 #ifdef HAVE_NET_IF_H
42 #include <net/if.h>
43 #endif
44 #include <stdio.h>
45 #include <errno.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <ctype.h>
49 #ifdef HAVE_DIRENT_H
50 #include <dirent.h>
51 #endif
52 #include <stddef.h>
53
54 #include "opal/util/output.h"
55 #include "opal/util/error.h"
56 #include "opal/util/show_help.h"
57 #include "opal/util/proc.h"
58 #include "opal/runtime/opal_progress_threads.h"
59
60 #include "btl_openib_proc.h"
61 #include "btl_openib_endpoint.h"
62 #include "connect/connect.h"
63 #include "btl_openib_ip.h"
64 #include "btl_openib_ini.h"
65
66 #if BTL_OPENIB_RDMACM_IB_ADDR
67 #include <stdio.h>
68 #include <netinet/in.h>
69 #include <netinet/tcp.h>
70 #include <sys/types.h>
71 #include <rdma/rsocket.h>
72 #include <infiniband/ib.h>
73 #endif
74
75 #define mymin(a, b) ((a) < (b) ? (a) : (b))
76
/* Forward declarations for the CPC lifecycle callbacks registered in
   the component descriptor below */
static void rdmacm_component_register(void);
static int rdmacm_component_init(void);
static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl,
                                  opal_btl_openib_connect_base_module_t **cpc);
static int rdmacm_component_finalize(void);

/* Public descriptor for the "rdmacm" connect pseudo-component: its
   name plus the register/init/query/finalize callbacks */
opal_btl_openib_connect_base_component_t opal_btl_openib_connect_rdmacm = {
    "rdmacm",
    rdmacm_component_register,
    rdmacm_component_init,
    rdmacm_component_query,
    rdmacm_component_finalize
};
90
/*
 * A single instance of this data structure is shared between one
 * id_context_t for each BSRQ qp on an endpoint.
 */
typedef struct {
    opal_list_item_t super;
    /* The endpoint whose connection state this struct tracks */
    mca_btl_openib_endpoint_t *endpoint;
    /* The BTL module that owns the endpoint */
    mca_btl_openib_module_t *openib_btl;
    /* Dummy QP only used when we expect the connection to be
       rejected */
    struct ibv_cq *dummy_cq;
#if BTL_OPENIB_RDMACM_IB_ADDR
    /* AF_IB addressing: local GID and service ID (the AF_IB analogues
       of IP address and port) */
    union ibv_gid gid;
    uint64_t service_id;
#else
    /* IPoIB addressing: local IPv4 address and TCP port.  These are
       stored into sockaddr_in fields without conversion (see
       rdmacm_client_connect_one), so they are kept in network byte
       order. */
    uint32_t ipaddr;
    uint16_t tcp_port;
#endif
    /* server==false means that this proc initiated the connection;
       server==true means that this proc accepted the incoming
       connection.  Note that this may be different than the "one way"
       / i_initiate() direction -- it is possible for server==false
       and i_initiate() to return false; it means that this proc
       initially initiated the connection, but we expect it to be
       rejected. */
    bool server;

    /* Whether this contents struct has been saved on the client list
       or not */
    bool on_client_list;

    /* A list of all the id_context_t's that are using this
       rdmacm_contents_t */
    opal_list_t ids;
} rdmacm_contents_t;

static void rdmacm_contents_constructor(rdmacm_contents_t *contents);
static void rdmacm_contents_destructor(rdmacm_contents_t *contents);
OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t,
                   rdmacm_contents_constructor,
                   rdmacm_contents_destructor);
132
/* Message exchanged between peers via the modex: addressing and
   device-limit information the remote side needs to decide the
   connection direction and connect back. */
typedef struct {
    /* Device limits (max responder/initiator RDMA read+atomic depth) */
    int device_max_qp_rd_atom;
    int device_max_qp_init_rd_atom;
#if BTL_OPENIB_RDMACM_IB_ADDR
    /* AF_IB addressing: raw 16-byte GID + service ID */
    uint8_t gid[16];
    uint64_t service_id;
#else
    /* IPoIB addressing: IPv4 address + listening TCP port */
    uint32_t ipaddr;
    uint16_t tcp_port;
#endif
    /* Marker field: offsetof(modex_message_t, end) gives the real
       (unpadded) number of bytes to send -- see message_len */
    uint8_t end;
} modex_message_t;

/* CPC-private per-endpoint bookkeeping */
typedef struct {
    int rdmacm_counter;
} rdmacm_endpoint_local_cpc_data_t;
149
/*
 * There are one of these for each RDMA CM ID.  Because of BSRQ, there
 * can be multiple of these for one endpoint, so all the
 * id_context_t's on a single endpoint share a single
 * rdmacm_contents_t.
 */
typedef struct {
    opal_list_item_t super;
    /* Back-pointer to the shared per-endpoint state (ref-counted;
       released in the destructor) */
    rdmacm_contents_t *contents;
    mca_btl_openib_endpoint_t *endpoint;
    /* Which BSRQ QP this CM ID is for; 255 is the "not yet assigned"
       sentinel set by the constructor */
    uint8_t qpnum;
    /* Guards against running the disconnect path twice for this ID */
    bool already_disconnected;
    /* Retry counter, presumably checked against
       rdmacm_resolve_max_retry_count during route resolution --
       usage is outside this chunk */
    uint16_t route_retry_count;
    /* The librdmacm ID itself; rdma_destroy_id()'ed in the destructor */
    struct rdma_cm_id *id;
} id_context_t;

static void id_context_constructor(id_context_t *context);
static void id_context_destructor(id_context_t *context);
OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t,
                   id_context_constructor,
                   id_context_destructor);
171
/* Private data carried inside the rdma_cm CONNECT_REQUEST payload */
typedef struct {
#if BTL_OPENIB_RDMACM_IB_ADDR
    /*
     * According to the InfiniBand spec, "Consumer Private Data" begins
     * at the 36th byte and runs up to the 91st byte (so the limit is
     * 56 bytes), and the first 36 bytes are intended for the lib
     * RDMA CM header (sometimes not all of these bytes are used).
     * So we must take into account that in the AF_IB case the user
     * private data pointer points to the header and not to the
     * "Consumer Private Data".
     */
    uint8_t librdmacm_header[36];
    uint64_t rem_port;
#else
    uint16_t rem_port;
#endif
    /* Remote peer's endpoint index (read back in the connect-request
       handler) */
    uint32_t rem_index;
    /* Which BSRQ QP this connection request is for */
    uint8_t qpnum;
    /* Remote process name, used to look up the matching endpoint */
    opal_process_name_t rem_name;
} __opal_attribute_packed__ private_data_t;

#if !BTL_OPENIB_RDMACM_IB_ADDR
/* Used to send a specific show_help message from the service_thread
   to the main thread (because we can't call show_help from the
   service_thread) */
typedef struct {
    char device_name[32];
    uint32_t peer_ip_addr;
    uint32_t peer_tcp_port;
} cant_find_endpoint_context_t;
#endif
201
/* Global CPC state shared across all openib BTL modules */
static opal_list_t server_listener_list;
static opal_list_t client_list;
static opal_mutex_t client_list_lock;
static struct rdma_event_channel *event_channel = NULL;
/* MCA-settable knobs; values are validated/clamped in
   rdmacm_component_register() */
static int rdmacm_priority = 30;
static unsigned int rdmacm_port = 0;

#if !BTL_OPENIB_RDMACM_IB_ADDR
static uint32_t rdmacm_addr = 0;
#endif

static int rdmacm_resolve_timeout = 30000;
static int rdmacm_resolve_max_retry_count = 20;
static bool rdmacm_reject_causes_connect_error = false;
/* Synchronization for waiting on DISCONNECT callbacks */
static pthread_cond_t rdmacm_disconnect_cond;
static pthread_mutex_t rdmacm_disconnect_lock;
static volatile int disconnect_callbacks = 0;
static bool rdmacm_component_initialized = false;
/* Event base/event used to progress the rdma_cm event channel */
static opal_event_base_t *rdmacm_event_base = NULL;
static opal_event_t rdmacm_event;

/* Calculate the *real* length of the message (not aligned/rounded
   up) */
static int message_len = offsetof(modex_message_t, end);

/* Rejection reasons */
typedef enum {
    REJECT_WRONG_DIRECTION,
    REJECT_TRY_AGAIN
} reject_reason_t;
232
/*
 * OBJ constructor: put a freshly allocated id_context_t into a known
 * empty state.  qpnum 255 is an out-of-band sentinel meaning "no QP
 * assigned yet".
 */
static void id_context_constructor(id_context_t *context)
{
    context->contents = NULL;
    context->endpoint = NULL;
    context->id = NULL;
    context->qpnum = 255;
    context->route_retry_count = 0;
    context->already_disconnected = false;
}
242
/*
 * OBJ destructor: destroy the librdmacm ID (if any) and drop the
 * reference this context held on its shared rdmacm_contents_t.
 */
static void id_context_destructor(id_context_t *context)
{
    struct rdma_cm_id *cm_id = context->id;

    if (cm_id != NULL) {
        context->id = NULL;
        rdma_destroy_id(cm_id);
    }

    if (context->contents != NULL) {
        OBJ_RELEASE(context->contents);
    }
}
253
/*
 * OBJ constructor: zero every field of a new rdmacm_contents_t and
 * set up its (initially empty) list of id_context_t's.
 */
static void rdmacm_contents_constructor(rdmacm_contents_t *contents)
{
    contents->openib_btl = NULL;
    contents->endpoint = NULL;
    contents->dummy_cq = NULL;
#if BTL_OPENIB_RDMACM_IB_ADDR
    contents->service_id = 0;
#else
    contents->tcp_port = 0;
    contents->ipaddr = 0;
#endif
    contents->on_client_list = false;
    contents->server = false;
    OBJ_CONSTRUCT(&(contents->ids), opal_list_t);
}
269
static void rdmacm_contents_destructor(rdmacm_contents_t *contents)
{
    /* Tear down the list object itself; the id_context_t entries are
       removed and released by the code that empties the list */
    OBJ_DESTRUCT(&(contents->ids));
}
274
/*
 * Invoked by main thread
 *
 * Sets up any rdma_cm specific commandline params.  Each MCA variable
 * is registered and then its value is validated/clamped to a sane
 * range, falling back to the default on bad input.
 */
static void rdmacm_component_register(void)
{
    /* the priority is initialized in the declaration above */
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_priority",
                                           "The selection method priority for rdma_cm",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_priority);
    /* Clamp priority into the legal [0, 100] range */
    if (rdmacm_priority > 100) {
        rdmacm_priority = 100;
    } else if (rdmacm_priority < 0) {
        rdmacm_priority = 0;
    }

    rdmacm_port = 0;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_port",
                                           "The selection method port for rdma_cm",
                                           MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_port);
    /* A TCP port must fit in 16 bits */
    if (rdmacm_port & ~0xfffful) {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "illegal tcp port", true, (int) rdmacm_port);
        rdmacm_port = 0;
    }

    rdmacm_resolve_timeout = 30000;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_resolve_timeout",
                                           "The timeout (in miliseconds) for address and route resolution",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_resolve_timeout);
    /* Negative timeouts are nonsensical; restore the default */
    if (0 > rdmacm_resolve_timeout) {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "illegal timeout", true, rdmacm_resolve_timeout);
        rdmacm_resolve_timeout = 30000;
    }

    rdmacm_resolve_max_retry_count = 20;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_retry_count",
                                           "Maximum number of times rdmacm will retry route resolution",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_resolve_max_retry_count);
    /* Negative retry counts are nonsensical; restore the default */
    if (0 > rdmacm_resolve_max_retry_count) {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "illegal retry count", true, rdmacm_resolve_max_retry_count);
        rdmacm_resolve_max_retry_count = 20;
    }

    rdmacm_reject_causes_connect_error = false;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_reject_causes_connect_error",
                                           "The drivers for some devices are buggy such that an RDMA REJECT action may result in a CONNECT_ERROR event instead of a REJECTED event.  Setting this MCA parameter to true tells Open MPI to treat CONNECT_ERROR events on connections where a REJECT is expected as a REJECT (default: false)",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_reject_causes_connect_error);
}
347
/*
 * Helper function for when we are debugging: render an IPv4 address
 * as a dotted-quad string (byte order selected at compile time via
 * WORDS_BIGENDIAN) followed by the raw hex value.
 *
 * Returns a heap-allocated string the caller must free(), or NULL if
 * allocation fails.
 *
 * BUG FIX: the previous version malloc()'ed a 64-byte buffer and then
 * immediately overwrote the pointer with asprintf()'s own allocation,
 * leaking the first buffer on every call (and never checking the
 * asprintf return).  Formatting directly into the single buffer with
 * snprintf fixes both problems; 64 bytes comfortably exceeds the
 * longest possible rendering, "255.255.255.255 (0xffffffff)".
 */
static char *stringify(uint32_t addr)
{
    char *line = (char *) malloc(64);
    if (NULL == line) {
        return NULL;
    }
    snprintf(line, 64, "%u.%u.%u.%u (0x%x)",
#if defined(WORDS_BIGENDIAN)
             (addr >> 24),
             (addr >> 16) & 0xff,
             (addr >> 8) & 0xff,
             addr & 0xff,
#else
             addr & 0xff,
             (addr >> 8) & 0xff,
             (addr >> 16) & 0xff,
             (addr >> 24),
#endif
             addr);
    return line;
}
369
/*
 * Invoked by service thread
 *
 * This function traverses the list of endpoints associated with the
 * device and determines which of them the remote side is attempting
 * to connect to.  This is determined based on the local endpoint's
 * modex message received and the IP address and port associated with
 * the rdma_cm event id.
 *
 * Returns the matching endpoint, or NULL (after logging a BTL error)
 * if the remote proc or its endpoint cannot be found.
 */
static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents,
                                                       opal_process_name_t rem_name)
{
    mca_btl_openib_module_t *btl = contents->openib_btl;
    mca_btl_openib_endpoint_t *ep = NULL;
    opal_proc_t *opal_proc;

    /* Resolve the remote process name to a local opal_proc_t */
    opal_proc = opal_proc_for_name (rem_name);
    if (NULL == opal_proc) {
        BTL_ERROR(("could not get proc associated with remote peer %s",
                   opal_process_name_print (rem_name)));
        return NULL;
    }

    /* Ask the BTL module for the endpoint it holds for this proc */
    ep = mca_btl_openib_get_ep (&btl->super, opal_proc);
    if (NULL == ep) {
        BTL_ERROR(("could not find endpoint for peer %s",
                   opal_process_name_print (rem_name)));
    }

    return ep;
}
401
402 /*
403 * Returns max inlne size for qp #N
404 */
max_inline_size(int qp,mca_btl_openib_device_t * device)405 static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device)
406 {
407 if (mca_btl_openib_component.qp_infos[qp].size <= device->max_inline_data) {
408 /* If qp message size is smaller than max_inline_data,
409 * we should enable inline messages */
410 return mca_btl_openib_component.qp_infos[qp].size;
411 } else if (mca_btl_openib_component.rdma_qp == qp || 0 == qp) {
412 /* If qp message size is bigger that max_inline_data, we
413 * should enable inline messages only for RDMA QP (for PUT/GET
414 * fin messages) and for the first qp */
415 return device->max_inline_data;
416 }
417 /* Otherwise it is no reason for inline */
418 return 0;
419 }
420
421
422 /*
423 * Invoked by both main and service threads
424 */
rdmacm_setup_qp(rdmacm_contents_t * contents,mca_btl_openib_endpoint_t * endpoint,struct rdma_cm_id * id,int qpnum)425 static int rdmacm_setup_qp(rdmacm_contents_t *contents,
426 mca_btl_openib_endpoint_t *endpoint,
427 struct rdma_cm_id *id,
428 int qpnum)
429 {
430 struct ibv_qp_init_attr attr;
431 struct ibv_qp *qp;
432 struct ibv_srq *srq = NULL;
433 int credits = 0, reserved = 0, max_recv_wr, max_send_wr;
434 size_t req_inline;
435
436 if (qpnum == mca_btl_openib_component.credits_qp) {
437 int qp;
438
439 for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
440 if(BTL_OPENIB_QP_TYPE_PP(qp)) {
441 reserved += mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
442 }
443 }
444 credits = mca_btl_openib_component.num_qps;
445 }
446
447 if (BTL_OPENIB_QP_TYPE_PP(qpnum)) {
448 max_recv_wr = mca_btl_openib_component.qp_infos[qpnum].rd_num + reserved;
449 max_send_wr = mca_btl_openib_component.qp_infos[qpnum].rd_num + credits;
450 } else {
451 srq = endpoint->endpoint_btl->qps[qpnum].u.srq_qp.srq;
452 max_recv_wr = reserved;
453 max_send_wr = mca_btl_openib_component.qp_infos[qpnum].u.srq_qp.sd_max + credits;
454 }
455
456 memset(&attr, 0, sizeof(attr));
457 attr.qp_type = IBV_QPT_RC;
458 attr.send_cq = contents->openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
459 attr.recv_cq = contents->openib_btl->device->ib_cq[qp_cq_prio(qpnum)];
460 attr.srq = srq;
461 if(BTL_OPENIB_QP_TYPE_PP(qpnum)) {
462 /* Add one for the CTS receive frag that will be posted */
463 attr.cap.max_recv_wr = max_recv_wr + 1;
464 } else {
465 attr.cap.max_recv_wr = 0;
466 }
467 attr.cap.max_send_wr = max_send_wr;
468 attr.cap.max_inline_data = req_inline =
469 max_inline_size(qpnum, contents->openib_btl->device);
470 attr.cap.max_send_sge = 1;
471 attr.cap.max_recv_sge = 1; /* we do not use SG list */
472
473 {
474 /* JMS Temprary gross hack: we *must* use rdma_create_cp()
475 (vs. ibv_create_qp()) because strange things happen on IB
476 if we don't. However, rdma_create_cp() wants us to use
477 rdma_get_devices() (and therefore the pd that they have
478 allocated). In order to get v1.3 out the door, we're
479 bypassing this functionality - we're temporarily overriding
480 the device context cached on the ID with our own, so that
481 our pd will match. We need to fix this to properly get the
482 pd from the RDMA CM and use that, etc. */
483 struct ibv_context *temp = id->verbs;
484 id->verbs = contents->openib_btl->device->ib_pd->context;
485 if (0 != rdma_create_qp(id, contents->openib_btl->device->ib_pd,
486 &attr)) {
487 BTL_ERROR(("Failed to create qp with %d", qpnum));
488 goto out;
489 }
490 qp = id->qp;
491 id->verbs = temp;
492 }
493
494 endpoint->qps[qpnum].qp->lcl_qp = qp;
495 endpoint->qps[qpnum].credit_frag = NULL;
496 if (attr.cap.max_inline_data < req_inline) {
497 endpoint->qps[qpnum].ib_inline_max = attr.cap.max_inline_data;
498 opal_show_help("help-mpi-btl-openib-cpc-base.txt",
499 "inline truncated", true,
500 opal_process_info.nodename,
501 ibv_get_device_name(contents->openib_btl->device->ib_dev),
502 contents->openib_btl->port_num,
503 req_inline, attr.cap.max_inline_data);
504 } else {
505 endpoint->qps[qpnum].ib_inline_max = req_inline;
506 }
507 id->qp = qp;
508
509 return OPAL_SUCCESS;
510
511 out:
512 return OPAL_ERROR;
513 }
514
515
/*
 * Invoked by both main and service threads
 *
 * To avoid all kinds of nasty race conditions, we only allow
 * connections to be made in one direction.  So use a simple
 * (arbitrary) test to decide which direction is allowed to initiate
 * the connection: the process with the lower IP address wins.  If the
 * IP addresses are the same (i.e., the MPI procs are on the same
 * node), then the process with the lower TCP port wins.
 *
 * NOTE(review): the IP comparison below uses ">" even though the
 * comment above says "lower wins", and the addresses appear to be in
 * network byte order -- the test only needs to be *uniform* across
 * both peers, not literally "lower".  Confirm before "fixing" the
 * direction of the comparison.
 */
static bool i_initiate(uint64_t local_port, uint64_t remote_port,
#if BTL_OPENIB_RDMACM_IB_ADDR
                       union ibv_gid *local_gid, union ibv_gid *remote_gid)
{
#else
                       uint32_t local_ipaddr, uint32_t remote_ipaddr)
{
#if OPAL_ENABLE_DEBUG
    /* Dotted-quad strings for the debug output below */
    char *a = stringify(local_ipaddr);
    char *b = stringify(remote_ipaddr);
#endif
#endif

#if BTL_OPENIB_RDMACM_IB_ADDR
    /* AF_IB: lexicographic order on (subnet prefix, interface id, port) */
    if (local_gid->global.subnet_prefix < remote_gid->global.subnet_prefix ||
        (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix &&
         local_gid->global.interface_id < remote_gid->global.interface_id) ||
        (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix &&
         local_gid->global.interface_id == remote_gid->global.interface_id &&
#else
    /* IPoIB: order on (ip address, port) */
    if (local_ipaddr > remote_ipaddr ||
        (local_ipaddr == remote_ipaddr &&
#endif
         local_port < remote_port)) {
#if !BTL_OPENIB_RDMACM_IB_ADDR
        OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s",
                     a, b));
#if OPAL_ENABLE_DEBUG
        free(a);
        free(b);
#endif
#endif
        return true;
    }
#if !BTL_OPENIB_RDMACM_IB_ADDR
    OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s",
                 a, b));
#if OPAL_ENABLE_DEBUG
    free(a);
    free(b);
#endif
#endif
    return false;
}
570
571 #if BTL_OPENIB_RDMACM_IB_ADDR
/*
 * Build an AF_IB rdma_addrinfo, either for the listening (server)
 * side -- resolve only the passive source address -- or for the
 * connecting (client) side -- resolve the destination with the source
 * pinned to the local address just looked up.
 *
 * On success the caller owns *rdma_addr and must rdma_freeaddrinfo()
 * it.  Returns OPAL_SUCCESS or OPAL_ERROR.
 */
static int get_rdma_addr(char *src, char *dst,
                         struct rdma_addrinfo **rdma_addr,
                         int server)
{
    struct rdma_addrinfo hints, *sres, *dres;

    memset(&hints, 0, sizeof hints);
    hints.ai_family = AF_IB;
    hints.ai_port_space = RDMA_PS_TCP;
    hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY | RAI_PASSIVE;

    if (0 != rdma_getaddrinfo(src, NULL, &hints, &sres)) {
        return OPAL_ERROR;
    }

    if (server) {
        /* Listening side only needs the passive source address */
        *rdma_addr = sres;
        return OPAL_SUCCESS;
    }

    /* Client side: re-resolve for the destination, pinning the source
       to what we just looked up and clearing the PASSIVE flag */
    hints.ai_src_len = sres->ai_src_len;
    hints.ai_src_addr = sres->ai_src_addr;
    hints.ai_flags &= ~RAI_PASSIVE;

    if (0 != rdma_getaddrinfo(dst, NULL, &hints, &dres)) {
        rdma_freeaddrinfo(sres);
        return OPAL_ERROR;
    }

    rdma_freeaddrinfo(sres);
    *rdma_addr = dres;

    return OPAL_SUCCESS;
}
611 #endif
612
/*
 * Invoked by main thread
 *
 * Create one rdma_cm ID for BSRQ qp index "num" on this endpoint and
 * kick off asynchronous address resolution toward the peer; the
 * connection protocol continues in the event handler when the
 * RDMA_CM_EVENT_ADDR_RESOLVED event arrives.
 */
static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
                                     modex_message_t *message,
                                     int num)
{
    int rc;
    id_context_t *context;
#if BTL_OPENIB_RDMACM_IB_ADDR
    /* NOTE(review): 32 bytes is smaller than INET6_ADDRSTRLEN (46);
       inet_ntop(AF_INET6, ...) below can fail with ENOSPC for some
       GID values -- verify this buffer size is sufficient */
    char src_addr[32], dst_addr[32];
    struct rdma_addrinfo *rdma_addr;
#else
    struct sockaddr_in src_in, dest_in;

#if OPAL_ENABLE_DEBUG
    char *a, *b;
#endif
#endif

    /* We'll need to access some data in the event handler.  We can
     * encapsulate it in this data struct and attach it to the id being
     * created below.  The event->id will contain this same pointer.
     */
    context = OBJ_NEW(id_context_t);
    if (NULL == context) {
        BTL_ERROR(("malloc error"));
        goto out;
    }

    /* The context holds a reference on the shared per-endpoint state */
    context->contents = contents;
    OBJ_RETAIN(contents);
    context->qpnum = num;
    context->endpoint = contents->endpoint;

    rc = rdma_create_id(event_channel, &(context->id),
                        context, RDMA_PS_TCP);
    if (0 != rc) {
        BTL_ERROR(("Failed to create a rdma id with %d", rc));
        goto out1;
    }
#if !BTL_OPENIB_RDMACM_IB_ADDR
    /* Source address (we must specify this to ensure that the traffic
       goes out on the device+port that we expect it go out). */
    memset(&src_in, 0, sizeof(src_in));
    src_in.sin_family = AF_INET;
    src_in.sin_addr.s_addr = contents->ipaddr;
    src_in.sin_port = 0;

    /* Destination address */
    memset(&dest_in, 0, sizeof(dest_in));
    dest_in.sin_family = AF_INET;
    dest_in.sin_addr.s_addr = message->ipaddr;
    dest_in.sin_port = message->tcp_port;

    /* Once the route to the remote system is discovered, a
     * RDMA_CM_EVENT_ADDR_RESOLVED event will occur on the local event
     * handler.
     */
    OPAL_OUTPUT((-1, "MAIN Resolving id: from IP %s:%d to IP %s:%d",
                 a = stringify(contents->ipaddr),
                 contents->tcp_port,
                 b = stringify(message->ipaddr),
                 message->tcp_port));
#if OPAL_ENABLE_DEBUG
    free(a);
    free(b);
#endif
#endif
    /* This is odd and worth explaining: when we place the context on
       the ids list, we need to add an extra RETAIN to the context.
       The reason is because of a race condition.  Let's explain
       through a few cases:

       1. Normal termination: client side endpoint_finalize removes
          the context from the ids list, has its service thread call
          rdma_disconnect(), and then RELEASE.  A DISCONNECT event
          will occur on both sides; the client DISCONNECT will invoke
          RELEASE again on the context.  Note that the DISCONNECT
          event may occur *very* quickly on the client side, so the
          order of these two RELEASEs is not known.  The destructor
          will invoke rdma_destroy_id() -- we obviously can't have
          this happen before both actions complete.  Hence,
          refcounting (and the additional RETAIN) saves us.

          Note that the server side never had the context on the ids
          list, so it never had an extra RETAIN.  So the DISCONNECT on
          the server side will only invoke one RELEASE.

       2. Abnormal termination: if the server side terminates
          improperly (e.g., user's app segv's), then the kernel from
          the server side will send a DISCONNECT event to the client
          before the item has been removed from the ids list.  This
          will cause an assertion failure in debug builds (because
          we'll be trying to RELEASE an opal_list_item_t that is still
          on a list), and possibly other badness in optimized builds
          because we'll be traversing a freed opal_list_item_t in
          endpoint_finalize.  So the extra RETAIN here right when we
          put the item on the list prevents it from actually being
          released in the client until BOTH the endpoint_finalize
          occurs *and* the DISCONNECT event arrives.

       Asynchronous programming is fun!
     */
    OBJ_RETAIN(context);
    opal_list_append(&(contents->ids), &(context->super));
#if BTL_OPENIB_RDMACM_IB_ADDR
    /* Render the local and remote GIDs as strings for
       rdma_getaddrinfo() */
    if (NULL == inet_ntop(AF_INET6, contents->gid.raw,
                          src_addr, sizeof src_addr)) {
        BTL_ERROR(("local addr string creating fail"));
        goto out1;
    }

    if (NULL == inet_ntop(AF_INET6, message->gid,
                          dst_addr, sizeof dst_addr)) {
        BTL_ERROR(("remote addr string creating fail"));
        goto out1;
    }

    rc = get_rdma_addr(src_addr, dst_addr, &rdma_addr, 0);
    if (OPAL_SUCCESS != rc) {
        BTL_ERROR(("server: create rdma addr error"));
        goto out1;
    }

    /* Stamp the peer's service ID into the destination sockaddr_ib */
    ((struct sockaddr_ib *) (rdma_addr->ai_dst_addr))->sib_sid = message->service_id;
#endif
    rc = rdma_resolve_addr(context->id,
#if BTL_OPENIB_RDMACM_IB_ADDR
                           rdma_addr->ai_src_addr,
                           rdma_addr->ai_dst_addr,
#else
                           (struct sockaddr *) &src_in,
                           (struct sockaddr *) &dest_in,
#endif
                           rdmacm_resolve_timeout);
    if (0 != rc) {
        BTL_ERROR(("Failed to resolve the remote address with %d", rc));
#if BTL_OPENIB_RDMACM_IB_ADDR
        rdma_freeaddrinfo(rdma_addr);
#endif
        goto out1;
    }
#if BTL_OPENIB_RDMACM_IB_ADDR
    rdma_freeaddrinfo(rdma_addr);
#endif

    return OPAL_SUCCESS;

 out1:
    /* NOTE(review): on failure paths reached after the
       opal_list_append() above, the context is still on contents->ids
       with its extra RETAIN -- this single RELEASE leaves it on the
       list; confirm the caller's cleanup drains the list */
    OBJ_RELEASE(context);
 out:
    return OPAL_ERROR;
}
767
768 /*
769 * Invoked by main thread
770 *
771 * Connect method called by the upper layers to connect the local
772 * endpoint to the remote endpoint by creating QP(s) to connect the two.
773 * Already holding endpoint lock when this function is called.
774 */
775 static int rdmacm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc,
776 mca_btl_base_endpoint_t *endpoint)
777 {
778 rdmacm_contents_t *contents;
779 modex_message_t *message, *local_message;
780 int rc, qp;
781 opal_list_item_t *item;
782 #if !BTL_OPENIB_RDMACM_IB_ADDR
783 #if OPAL_ENABLE_DEBUG
784 char *a, *b;
785 #endif
786 #endif
787 /* Don't use the CPC to get the message, because this function is
788 invoked from the event_handler (to intitiate connections in the
789 Right direction), where we don't have the CPC, so it'll be
790 NULL. */
791 local_message =
792 (modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message;
793 message = (modex_message_t *)
794 endpoint->endpoint_remote_cpc_data->cbm_modex_message;
795 #if !BTL_OPENIB_RDMACM_IB_ADDR
796 OPAL_OUTPUT((-1, "Connecting from IP %s:%d to remote IP %s:%d ep state = %d",
797 a = stringify(local_message->ipaddr), local_message->tcp_port,
798 b = stringify(message->ipaddr), message->tcp_port, endpoint->endpoint_state));
799 #if OPAL_ENABLE_DEBUG
800 free(a);
801 free(b);
802 #endif
803 BTL_VERBOSE(("Connecting to remote ip addr = %x, port = %d ep state = %d",
804 message->ipaddr, message->tcp_port, endpoint->endpoint_state));
805 #endif
806 if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state ||
807 MCA_BTL_IB_CONNECTING == endpoint->endpoint_state ||
808 MCA_BTL_IB_CONNECT_ACK == endpoint->endpoint_state) {
809 return OPAL_SUCCESS;
810 }
811
812 /* Set the endpoint state to "connecting" (this function runs in
813 the main MPI thread; not the service thread, so we can set the
814 endpoint_state here). */
815 endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
816
817 contents = OBJ_NEW(rdmacm_contents_t);
818 if (NULL == contents) {
819 BTL_ERROR(("malloc of contents failed"));
820 rc = OPAL_ERR_OUT_OF_RESOURCE;
821 goto out;
822 }
823
824 contents->openib_btl = endpoint->endpoint_btl;
825 contents->endpoint = endpoint;
826 contents->server = false;
827 /* Populate the port information with the local port the server is
828 * listening on instead of the ephemerial port this client is
829 * connecting with. This port is used to determine which endpoint
830 * is being connected from, in the case where there are multiple
831 * listeners on the local system.
832 */
833 #if BTL_OPENIB_RDMACM_IB_ADDR
834 memcpy(contents->gid.raw, local_message->gid, sizeof(contents->gid));
835 contents->service_id = local_message->service_id;
836 #else
837 contents->ipaddr = local_message->ipaddr;
838 contents->tcp_port = local_message->tcp_port;
839 #endif
840
841 /* Are we the initiator? Or do we expect this connect request to
842 be rejected? */
843 endpoint->endpoint_initiator =
844 i_initiate(
845 #if BTL_OPENIB_RDMACM_IB_ADDR
846 contents->service_id, message->service_id,
847 &contents->gid, (union ibv_gid *) message->gid);
848 #else
849 contents->tcp_port, message->tcp_port,
850 contents->ipaddr, message->ipaddr);
851 #endif
852 OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s",
853 (void*) endpoint,
854 (void*) endpoint->endpoint_local_cpc,
855 endpoint->endpoint_initiator ? "am" : "am NOT",
856 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
857
858 /* If we're the initiator, then open all the QPs */
859 if (contents->endpoint->endpoint_initiator) {
860 /* Initiator needs a CTS frag (non-initiator will have a CTS
861 frag allocated later) */
862 if (OPAL_SUCCESS !=
863 (rc = opal_btl_openib_connect_base_alloc_cts(contents->endpoint))) {
864 BTL_ERROR(("Failed to alloc CTS frag"));
865 goto out;
866 }
867
868 for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
869 rc = rdmacm_client_connect_one(contents, message, qp);
870 if (OPAL_SUCCESS != rc) {
871 BTL_ERROR(("rdmacm_client_connect_one error (real QP %d)",
872 qp));
873 goto out;
874 }
875 }
876 }
877 /* Otherwise, only open 1 QP that we expect to be rejected */
878 else {
879 rc = rdmacm_client_connect_one(contents, message, 0);
880 if (OPAL_SUCCESS != rc) {
881 BTL_ERROR(("rdmacm_client_connect_one error (bogus QP)"));
882 goto out;
883 }
884 }
885
886 return OPAL_SUCCESS;
887
888 out:
889 while (NULL != (item = opal_list_remove_first (&contents->ids))) {
890 OBJ_RELEASE(item);
891 }
892
893 return rc;
894 }
895
896 #if !BTL_OPENIB_RDMACM_IB_ADDR
897 static void *show_help_cant_find_endpoint(void *context)
898 {
899 char *msg;
900 cant_find_endpoint_context_t *c =
901 (cant_find_endpoint_context_t*) context;
902
903 if (NULL != c) {
904 msg = stringify(c->peer_ip_addr);
905 opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
906 "could not find matching endpoint", true,
907 opal_process_info.nodename,
908 c->device_name,
909 c->peer_tcp_port);
910 free(msg);
911 } else {
912 opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
913 "could not find matching endpoint", true,
914 opal_process_info.nodename,
915 "<unknown>", "<unknown>", -1);
916 }
917 free(context);
918
919 /* Now kill it */
920 mca_btl_openib_endpoint_invoke_error(NULL);
921 return NULL;
922 }
923 #endif
924
/*
 * Invoked by service thread
 *
 * The server thread will handle the incoming connection requests and
 * allow them or reject them based on a unidirectional connection
 * method.  The connections are allowed based on the IP address and
 * port values.  This determination is arbitrary, but is uniform in
 * allowing the connections only in 1 direction.  If the connection in
 * the request is disallowed by this rule, then the server will
 * reject the connection and make its own in the proper direction.
 */
static int handle_connect_request(struct rdma_cm_event *event)
{
    /* Service-thread handler for RDMA_CM_EVENT_CONNECT_REQUEST.  Either
       accepts the incoming connection (creating the local QP and, for the
       credits QP, posting the CTS receive buffer) or rejects it when the
       i_initiate() tie-break says this side should be the one initiating.
       Returns OPAL_SUCCESS or OPAL_ERROR. */
    id_context_t *listener_context = (id_context_t*) event->id->context;
    id_context_t *new_context = NULL;
    rdmacm_contents_t *contents = listener_context->contents;
    mca_btl_openib_endpoint_t *endpoint;
    struct rdma_conn_param conn_param;
    opal_process_name_t rem_name;
    modex_message_t *message;
    private_data_t msg;
    int rc = -1, qpnum;
    uint32_t rem_index;
#if BTL_OPENIB_RDMACM_IB_ADDR
    uint64_t rem_port;
#else
    uint16_t rem_port;
#endif

    /* Unpack the private data the client attached to its connect
       request (filled in by finish_connect() on the other side) */
    qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum;
    rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port;
    rem_index = ((private_data_t *)event->param.conn.private_data)->rem_index;
    rem_name = ((private_data_t *)event->param.conn.private_data)->rem_name;

    /* Determine which endpoint the remote side is trying to connect
       to; use the listener's context->contents to figure it out */
    endpoint = rdmacm_find_endpoint(contents, rem_name);
    if (NULL == endpoint) {
#if !BTL_OPENIB_RDMACM_IB_ADDR
        struct sockaddr *peeraddr = rdma_get_peer_addr(event->id);
        cant_find_endpoint_context_t *c = (cant_find_endpoint_context_t *) calloc(1, sizeof(*c));
        if (NULL != c) {
            snprintf(c->device_name, sizeof(c->device_name) - 1,
                     "%s:%d",
                     ibv_get_device_name(contents->openib_btl->device->ib_dev),
                     contents->openib_btl->port_num);
            c->peer_ip_addr =
                ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
            c->peer_tcp_port = rdma_get_dst_port(event->id);
        }
        show_help_cant_find_endpoint (c);
#else
        BTL_ERROR(("Cannot find endpoint."));
#endif
        goto out;
    }

    message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
    /* Tie-break: exactly one side of each pair is allowed to be the
       initiator, decided by comparing ports and addresses/GIDs */
    endpoint->endpoint_initiator =
        i_initiate(
#if BTL_OPENIB_RDMACM_IB_ADDR
                   contents->service_id, rem_port,
                   &contents->gid, (union ibv_gid *) message->gid);
#else
                   contents->tcp_port, rem_port,
                   contents->ipaddr, message->ipaddr);
    BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d",
                 endpoint->endpoint_state, contents->ipaddr, message->ipaddr,
                 contents->tcp_port, rem_port));
#endif
    OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s",
                 (void*) endpoint,
                 (void*) endpoint->endpoint_local_cpc,
                 endpoint->endpoint_initiator ? "am" : "am NOT",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    /* If *we* are supposed to initiate, the peer connected in the wrong
       direction: reject it (with a recognizable reason code) and start
       our own connection instead */
    if (endpoint->endpoint_initiator) {
        reject_reason_t reason = REJECT_WRONG_DIRECTION;

        OPAL_OUTPUT((-1, "SERVICE Received a connect request from an endpoint in the wrong direction"));

        /* This will cause a event on the remote system. By passing in
         * a value in the second arg of rdma_reject, the remote side
         * can check for this to know if it was an intentional reject or
         * a reject based on an error.
         */
        rc = rdma_reject(event->id, &reason, sizeof(reject_reason_t));
        if (0 != rc) {
            BTL_ERROR(("rdma_reject failed %d", rc));
            goto out;
        }

        OPAL_OUTPUT((-1, "SERVICE Starting connection in other direction"));
        rdmacm_module_start_connect(NULL, endpoint);

        return OPAL_SUCCESS;
    }

    /* Set the endpoint_state to "CONNECTING".  This is running
       in the service thread, so we need to do a write barrier. */
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
    opal_atomic_wmb();

    endpoint->rem_info.rem_index = rem_index;

    /* Setup QP for new connection */
    BTL_VERBOSE(("ACCEPTING src port = %d, dst port = %d, qpnum = %d",
                 rdma_get_src_port(event->id), rdma_get_dst_port(event->id), qpnum));

    rc = rdmacm_setup_qp(contents, endpoint, event->id, qpnum);
    if (0 != rc) {
        BTL_ERROR(("rdmacm_setup_qp error %d", rc));
        goto out;
    }

    /* Post a single receive buffer on the smallest QP for the CTS
       protocol */
    if (mca_btl_openib_component.credits_qp == qpnum) {
        struct ibv_recv_wr *bad_wr, *wr;

        if (OPAL_SUCCESS !=
            opal_btl_openib_connect_base_alloc_cts(endpoint)) {
            BTL_ERROR(("Failed to alloc CTS frag"));
            goto out1;
        }
        wr = &(endpoint->endpoint_cts_frag.rd_desc);
        assert(NULL != wr);
        wr->next = NULL;

        if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp,
                               wr, &bad_wr)) {
            BTL_ERROR(("failed to post CTS recv buffer"));
            goto out1;
        }
        OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
                     (void*)((uintptr_t*) wr->sg_list[0].addr),
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                     qpnum,
                     endpoint->qps[qpnum].qp->lcl_qp->qp_num,
                     (void*)((uintptr_t*) wr->wr_id),
                     (void*)((uintptr_t*) wr->sg_list[0].addr),
                     wr->sg_list[0].length,
                     wr->sg_list[0].lkey));
    }

    /* Since the event id is already created (since we're the server),
       the context that was passed to us was the listen server's
       context -- which is no longer useful to us.  So allocate a new
       context and populate it just for this connection. */
    event->id->context = new_context = OBJ_NEW(id_context_t);
    if (NULL == new_context) {
        BTL_ERROR(("malloc error"));
        goto out1;
    }

    new_context->contents = contents;
    OBJ_RETAIN(contents);
    new_context->qpnum = qpnum;
    new_context->endpoint = endpoint;

    memset(&conn_param, 0, sizeof(conn_param));
    /* See rdma_connect(3) for a description of these 2 values.  We
       ensure to pass these values around via the modex so that we can
       compute the values properly. */
    conn_param.responder_resources =
        mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom,
              message->device_max_qp_init_rd_atom);
    conn_param.initiator_depth =
        mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom,
              message->device_max_qp_rd_atom);
    conn_param.retry_count = mca_btl_openib_component.ib_retry_count;
    conn_param.rnr_retry_count = BTL_OPENIB_QP_TYPE_PP(qpnum) ? 0 :
        mca_btl_openib_component.ib_rnr_retry;
    conn_param.srq = BTL_OPENIB_QP_TYPE_SRQ(qpnum);
    conn_param.private_data = &msg;
    conn_param.private_data_len = sizeof(private_data_t);

    /* Fill the private data being sent to the other side.
       NOTE(review): msg.rem_port is left unset here; the client only
       reads rem_index from the accept's private data (see
       rdmacm_connect_endpoint) -- confirm no other consumer */
    msg.qpnum = qpnum;
    msg.rem_index = endpoint->index;
    msg.rem_name = OPAL_PROC_MY_NAME;

    /* Accepting the connection will result in a
       RDMA_CM_EVENT_ESTABLISHED event on both the client and server
       side. */
    rc = rdma_accept(event->id, &conn_param);
    if (0 != rc) {
        BTL_ERROR(("rdma_accept error %d", rc));
        goto out2;
    }

    return OPAL_SUCCESS;

out2:
    OBJ_RELEASE(new_context);
out1:
    /* Tear down the QP created by rdmacm_setup_qp() above */
    ibv_destroy_qp(endpoint->qps[qpnum].qp->lcl_qp);
out:
    return OPAL_ERROR;
}
1124
1125 /*
1126 * Runs in service thread
1127 *
1128 * We call rdma_disconnect() here in the service thread so that there
1129 * is zero chance that the DISCONNECT event is delivered and executed
1130 * in the service thread while rdma_disconnect() is still running in
1131 * the main thread (which causes all manner of Bad Things to occur).
1132 */
static void *call_disconnect_callback(int fd, int flags, void *v)
{
    /* Event-thread callback: drain contents->ids, calling
       rdma_disconnect() exactly once per id (guarded by
       already_disconnected), then signal the main thread blocked in
       rdmacm_endpoint_finalize().  fd/flags are unused (opal event API
       signature); v is the rdmacm_contents_t being finalized. */
    rdmacm_contents_t *contents = (rdmacm_contents_t *) v;
#if OPAL_ENABLE_DEBUG
    void *tmp = NULL;
#endif
    id_context_t *context;
    opal_list_item_t *item;

    /* Hold the disconnect lock so the main thread's cond_wait loop sees
       a consistent ids list and a matched signal */
    pthread_mutex_lock (&rdmacm_disconnect_lock);
    while (NULL != (item = opal_list_remove_first(&contents->ids))) {
        context = (id_context_t *) item;

        OPAL_OUTPUT((-1, "RDMACM Event thread calling disconnect on ID %p",
                     (void*) context->id));

        if (!context->already_disconnected) {
#if OPAL_ENABLE_DEBUG
            /* Save the id pointer for logging: the release below may
               free the context */
            tmp = context->id;
#endif
            rdma_disconnect(context->id);
            context->already_disconnected = true;
        }

        OBJ_RELEASE(context);

        OPAL_OUTPUT((-1, "RDMACM Event thread disconnect on ID %p done",
                     (void*) tmp));
    }

    /* Tell the main thread that we're done */
    pthread_cond_signal(&rdmacm_disconnect_cond);
    pthread_mutex_unlock(&rdmacm_disconnect_lock);

    return NULL;
}
1169
1170 /*
1171 * Invoked by main thread
1172 *
1173 * Runs *while* the progress thread is running. We can't stop the
1174 * progress thread because this function may be invoked to kill a
1175 * specific endpoint that was the result of MPI-2 dynamics (i.e., this
1176 * is not during MPI_FINALIZE).
1177 */
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
{
    /* Main-thread teardown of one endpoint's RDMA CM state: find the
       matching client contents, schedule the rdma_disconnect() calls in
       the event thread (see call_disconnect_callback), and block until
       they have all completed. */
    rdmacm_contents_t *contents = NULL, *item;
    /* Stack-allocated one-shot event; this function does not return
       until the callback has drained contents->ids.
       NOTE(review): relies on the activated event being consumed before
       this frame unwinds -- confirm against libevent activation
       semantics. */
    opal_event_t event;

    BTL_VERBOSE(("Start disconnecting..."));
    OPAL_OUTPUT((-1, "MAIN Endpoint finalizing"));

    if (NULL == endpoint) {
        BTL_ERROR(("Attempting to shutdown a NULL endpoint"));
        return OPAL_SUCCESS;
    }

    /* Determine which rdmacm_contents_t correlates to the endpoint
     * we are shutting down.  By disconnecting instead of simply
     * destroying the QPs, we are shutting down in a more graceful way
     * thus preventing errors on the line.
     *
     * Need to lock because the client_list is accessed in both the
     * main thread and service thread.
     */
    opal_mutex_lock(&client_list_lock);
    OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) {
        if (endpoint == item->endpoint) {
            contents = item;
            opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
            contents->on_client_list = false;

            /* Fun race condition: we cannot call
               rdma_disconnect() in this thread, because
               if we do, there is a nonzero chance that the
               DISCONNECT event will be delivered and get executed
               in the rdcm event thread immediately.  If this all
               happens before rdma_disconnect() returns, all
               manner of Bad Things can/will occur.  So just
               invoke rdma_disconnect() in the rdmacm event thread
               where we guarantee that we won't be processing an
               event when it is called. */

            opal_event_set (rdmacm_event_base, &event, -1, OPAL_EV_READ,
                            call_disconnect_callback, contents);
            opal_event_active (&event, OPAL_EV_READ, 1);

            /* remove_item returns the item before the item removed,
               meaning that the for list is still safe */
            break;
        }
    }

    /* Flush writes to ensure we sync across threads */
    opal_atomic_wmb();
    opal_mutex_unlock(&client_list_lock);

    if (NULL != contents) {
        /* Now wait for all the disconnect callbacks to occur */
        pthread_mutex_lock(&rdmacm_disconnect_lock);
        while (opal_list_get_size (&contents->ids)) {
            pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
        }
        pthread_mutex_unlock(&rdmacm_disconnect_lock);
    }

    OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
    return OPAL_SUCCESS;
}
1243
1244 /*
1245 * Callback (from main thread) when the endpoint has been connected
1246 */
1247 static void *local_endpoint_cpc_complete(void *context)
1248 {
1249 mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context;
1250
1251 OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
1252 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
1253 OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
1254 mca_btl_openib_endpoint_cpc_complete(endpoint);
1255
1256 return NULL;
1257 }
1258
1259 /*
1260 * Runs in service thread
1261 */
static int rdmacm_connect_endpoint(id_context_t *context,
                                   struct rdma_cm_event *event)
{
    /* Service-thread handler for RDMA_CM_EVENT_ESTABLISHED (fires on
       both client and server sides, once per QP).  Counts established
       QPs and notifies the upper layer -- in the main thread -- only
       after the last one is up. */
    rdmacm_contents_t *contents = context->contents;
    rdmacm_endpoint_local_cpc_data_t *data;

    mca_btl_openib_endpoint_t *endpoint;
#if OPAL_ENABLE_DEBUG
#if !BTL_OPENIB_RDMACM_IB_ADDR
    modex_message_t *message;
#endif
#endif

    if (contents->server) {
        endpoint = context->endpoint;
        OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    } else {
        endpoint = contents->endpoint;
        /* Client side: the server sent its endpoint index in the
           private data of the accept */
        endpoint->rem_info.rem_index =
            ((private_data_t *)event->param.conn.private_data)->rem_index;

        if (!contents->on_client_list) {
            opal_mutex_lock(&client_list_lock);
            opal_list_append(&client_list, &(contents->super));
            /* Flush writes to ensure we sync across threads */
            opal_atomic_wmb();
            opal_mutex_unlock(&client_list_lock);
            contents->on_client_list = true;
        }
        OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    }
    if (NULL == endpoint) {
        BTL_ERROR(("Can't find endpoint"));
        return OPAL_ERR_NOT_FOUND;
    }
    data =
        (rdmacm_endpoint_local_cpc_data_t *)endpoint->endpoint_local_cpc_data;

    /* Only notify the upper layers after the last QP has been
       connected */
    if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) {
        BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), data->rdmacm_counter));
        OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), data->rdmacm_counter));
        return OPAL_SUCCESS;
    }

#if OPAL_ENABLE_DEBUG
#if !BTL_OPENIB_RDMACM_IB_ADDR
    message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
    BTL_VERBOSE(("%s connected!!! local %x remote %x state = %d",
                 contents->server?"server":"client",
                 contents->ipaddr,
                 message->ipaddr,
                 endpoint->endpoint_state));
#endif
#endif

    /* Ensure that all the writes back to the endpoint and associated
       data structures have completed */
    opal_atomic_wmb();
    /* Final CPC-complete notification must run in the main thread */
    mca_btl_openib_run_in_main (local_endpoint_cpc_complete, endpoint);

    return OPAL_SUCCESS;
}
1330
1331 /*
1332 * Runs in service thread
1333 */
1334 static int rdmacm_disconnected(id_context_t *context)
1335 {
1336 /* If this was a client thread, then it *may* still be listed in a
1337 contents->ids list. */
1338
1339 OPAL_OUTPUT((-1, "SERVICE Releasing context because of DISCONNECT: context %p, id %p",
1340 (void*) context, (void*) context->id));
1341 OBJ_RELEASE(context);
1342
1343 return OPAL_SUCCESS;
1344 }
1345
1346 /*
1347 * Runs in service thread
1348 */
1349 static int rdmacm_destroy_dummy_qp(id_context_t *context)
1350 {
1351 /* We need to check id pointer because of retransmitions.
1352 Maybe the reject was already done. */
1353
1354 if (NULL != context->id) {
1355 if (NULL != context->id->qp) {
1356 ibv_destroy_qp(context->id->qp);
1357 context->id->qp = NULL;
1358 }
1359 }
1360
1361 if (NULL != context->contents->dummy_cq) {
1362 ibv_destroy_cq(context->contents->dummy_cq);
1363 }
1364 /* This item was appended to the contents->ids list (the list will
1365 only have just this one item), so remove it before RELEASEing
1366 the item */
1367 opal_list_remove_first(&(context->contents->ids));
1368 OBJ_RELEASE(context);
1369
1370 return OPAL_SUCCESS;
1371 }
1372
1373 /*
1374 * Runs in service thread
1375 */
1376 static int rdmacm_rejected(id_context_t *context, struct rdma_cm_event *event)
1377 {
1378 if (NULL != event->param.conn.private_data) {
1379 /* Why were we rejected? */
1380 switch (*((reject_reason_t*) event->param.conn.private_data)) {
1381 case REJECT_WRONG_DIRECTION:
1382 OPAL_OUTPUT((-1, "SERVICE A good reject! for qp %d, id 0x%p",
1383 context->qpnum, (void*) context->id));
1384 rdmacm_destroy_dummy_qp(context);
1385 break;
1386
1387 default:
1388 /* Just so compilers won't complain */
1389 break;
1390 }
1391 }
1392
1393 return OPAL_SUCCESS;
1394 }
1395
1396 /*
1397 * Runs in service thread
1398 */
1399 static int resolve_route(id_context_t *context)
1400 {
1401 int rc;
1402
1403 /* Resolve the route to the remote system. Once established, the
1404 * local system will get a RDMA_CM_EVENT_ROUTE_RESOLVED event.
1405 */
1406 rc = rdma_resolve_route(context->id, rdmacm_resolve_timeout);
1407 if (0 != rc) {
1408 BTL_ERROR(("Failed to resolve the route with %d", rc));
1409 goto out;
1410 }
1411
1412 #if OPAL_ENABLE_DEBUG
1413 {
1414 char *a, *b;
1415 OPAL_OUTPUT((-1, "Resolved route ID %p (local addr %s, remote addr %s)",
1416 (void*) context->id,
1417 a = stringify(((struct sockaddr_in*) rdma_get_local_addr(context->id))->sin_addr.s_addr),
1418 b = stringify(((struct sockaddr_in*) rdma_get_peer_addr(context->id))->sin_addr.s_addr)));
1419 free(a);
1420 free(b);
1421 }
1422 #endif
1423
1424 return OPAL_SUCCESS;
1425
1426 out:
1427 return OPAL_ERROR;
1428 }
1429
1430 /*
1431 * Runs in service thread
1432 */
1433 static int create_dummy_cq(rdmacm_contents_t *contents,
1434 mca_btl_openib_module_t *openib_btl)
1435 {
1436 contents->dummy_cq =
1437 ibv_create_cq(openib_btl->device->ib_dev_context, 1, NULL, NULL, 0);
1438 if (NULL == contents->dummy_cq) {
1439 BTL_ERROR(("dummy_cq not created"));
1440 goto out;
1441 }
1442
1443 return OPAL_SUCCESS;
1444 out:
1445 return OPAL_ERROR;
1446 }
1447
1448 /*
1449 * Runs in service thread
1450 */
1451 static int create_dummy_qp(rdmacm_contents_t *contents,
1452 struct rdma_cm_id *id, int qpnum)
1453 {
1454 struct ibv_qp_init_attr attr;
1455
1456 memset(&attr, 0, sizeof(attr));
1457 attr.qp_type = IBV_QPT_RC;
1458 attr.send_cq = contents->dummy_cq;
1459 attr.recv_cq = contents->dummy_cq;
1460 attr.cap.max_recv_wr = 1;
1461 attr.cap.max_send_wr = 1;
1462 attr.cap.max_send_sge = 1;
1463 attr.cap.max_recv_sge = 1;
1464
1465 {
1466 /* JMS Temprary gross hack: we *must* use rdma_create_cp()
1467 (vs. ibv_create_qp()) because strange things happen on IB
1468 if we don't. However, rdma_create_cp() wants us to use
1469 rdma_get_devices() (and therefore the pd that they have
1470 allocated). In order to get v1.3 out the door, we're
1471 bypassing this functionality - we're temporarily overriding
1472 the device context cached on the ID with our own, so that
1473 our pd will match. We need to fix this to properly get the
1474 pd from the RDMA CM and use that, etc. */
1475 struct ibv_context *temp = id->verbs;
1476 id->verbs = contents->openib_btl->device->ib_pd->context;
1477 if (0 != rdma_create_qp(id, contents->openib_btl->device->ib_pd,
1478 &attr)) {
1479 BTL_ERROR(("Failed to create qp with %d", qpnum));
1480 goto out;
1481 }
1482 id->verbs = temp;
1483 }
1484 BTL_VERBOSE(("dummy qp created %d", qpnum));
1485
1486 return OPAL_SUCCESS;
1487
1488 out:
1489 return OPAL_ERROR;
1490 }
1491
1492 /*
1493 * Runs in service thread
1494 */
static int finish_connect(id_context_t *context)
{
    /* Service-thread continuation after ROUTE_RESOLVED: set up the local
       QP (a real one if we are the initiator, a dummy one otherwise),
       fill in the connection parameters and private data, and call
       rdma_connect() to fire a CONNECT_REQUEST at the peer. */
    rdmacm_contents_t *contents = context->contents;
    struct rdma_conn_param conn_param;
    private_data_t msg;
    int rc;
#if OPAL_ENABLE_DEBUG
#if !BTL_OPENIB_RDMACM_IB_ADDR
    struct sockaddr *peeraddr;
    uint32_t remoteipaddr;
    uint16_t remoteport;
#endif
#endif
    modex_message_t *message;

#if OPAL_ENABLE_DEBUG
#if !BTL_OPENIB_RDMACM_IB_ADDR
    peeraddr = rdma_get_peer_addr(context->id);
    remoteport = rdma_get_dst_port(context->id);
    remoteipaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
#endif
#endif

    message = (modex_message_t *)
        context->endpoint->endpoint_remote_cpc_data->cbm_modex_message;

    /* If we're the initiator, then setup the QP's and post the CTS
       message buffer */
    if (contents->endpoint->endpoint_initiator) {
        rc = rdmacm_setup_qp(contents, contents->endpoint,
                             context->id, context->qpnum);
        if (0 != rc) {
            BTL_ERROR(("rdmacm_setup_qp error %d", rc));
            goto out;
        }

        if (mca_btl_openib_component.credits_qp == context->qpnum) {
            /* Post a single receive buffer on the smallest QP for the CTS
               protocol */

            struct ibv_recv_wr *bad_wr, *wr;
            assert(NULL != contents->endpoint->endpoint_cts_frag.super.super.base.super.ptr);
            wr = &(contents->endpoint->endpoint_cts_frag.rd_desc);
            assert(NULL != wr);
            wr->next = NULL;

            if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp,
                                   wr, &bad_wr)) {
                BTL_ERROR(("failed to post CTS recv buffer"));
                goto out1;
            }
            OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
                         (void*)((uintptr_t*) wr->sg_list[0].addr),
                         wr->sg_list[0].length,
                         opal_get_proc_hostname(contents->endpoint->endpoint_proc->proc_opal),
                         context->qpnum,
                         contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
        }
    } else {
        /* If we are establishing a connection in the "wrong" direction,
         * setup a dummy CQ and QP and do NOT post any recvs on them.
         * Otherwise this will screwup the recv accounting and will
         * result in not posting recvs when you really really wanted to.
         * All of the dummy cq and qps will be cleaned up on the reject
         * event.
         */
        rc = create_dummy_cq(contents, contents->openib_btl);
        if (0 != rc) {
            BTL_ERROR(("create_dummy_cq error %d", rc));
            goto out;
        }

        rc = create_dummy_qp(contents, context->id, context->qpnum);
        if (0 != rc) {
            BTL_ERROR(("create_dummy_qp error %d", rc));
            goto out;
        }
    }

    memset(&conn_param, 0, sizeof(conn_param));
    /* See above comment about rdma_connect(3) and these two values. */
    conn_param.responder_resources =
        mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom,
              message->device_max_qp_init_rd_atom);
    conn_param.initiator_depth =
        mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom,
              message->device_max_qp_rd_atom);
    conn_param.flow_control = 0;
    conn_param.retry_count = mca_btl_openib_component.ib_retry_count;
    conn_param.rnr_retry_count = BTL_OPENIB_QP_TYPE_PP(context->qpnum) ? 0 :
        mca_btl_openib_component.ib_rnr_retry;
    conn_param.srq = BTL_OPENIB_QP_TYPE_SRQ(context->qpnum);
    conn_param.private_data = &msg;
    conn_param.private_data_len = sizeof(private_data_t);

    /* Private data sent with the connect request; the server side
       unpacks this in handle_connect_request() */
    msg.qpnum = context->qpnum;
    msg.rem_index = contents->endpoint->index;
    msg.rem_name = OPAL_PROC_MY_NAME;
#if BTL_OPENIB_RDMACM_IB_ADDR
    memset(msg.librdmacm_header, 0, sizeof(msg.librdmacm_header));
    msg.rem_port = contents->service_id;
#else
    msg.rem_port = contents->tcp_port;
    if (contents->endpoint->endpoint_initiator) {
#if OPAL_ENABLE_DEBUG
        char *a;
#endif
        /* OPAL_OUTPUT compiles away unless OPAL_ENABLE_DEBUG, which is
           why the unguarded references to a/remoteipaddr/remoteport
           below still build in non-debug configurations */
        OPAL_OUTPUT((-1, "Finish connect (I am initiator): sending from %s:%d, TCP port %d, qp index %d (num %d) to IP %s:%d",
                     ibv_get_device_name(contents->openib_btl->device->ib_dev),
                     contents->openib_btl->port_num,
                     contents->tcp_port,
                     context->qpnum,
                     contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num,
                     a = stringify(remoteipaddr), remoteport));
#if OPAL_ENABLE_DEBUG
        free(a);
#endif
    }
#endif

    /* Now all of the local setup has been done.  The remote system
       should now get a RDMA_CM_EVENT_CONNECT_REQUEST event to further
       the setup of the QP. */
    OPAL_OUTPUT((-1, "SERVICE in finish_connect; ep=%p (%p), I still %s the initiator to %s",
                 (void*) contents->endpoint,
                 (void*) contents->endpoint->endpoint_local_cpc,
                 contents->endpoint->endpoint_initiator ? "am" : "am NOT",
                 opal_get_proc_hostname(contents->endpoint->endpoint_proc->proc_opal)));
    rc = rdma_connect(context->id, &conn_param);
    if (0 != rc) {
        BTL_ERROR(("rdma_connect Failed with %d", rc));
        goto out1;
    }

    return OPAL_SUCCESS;

out1:
    ibv_destroy_qp(context->id->qp);
out:
    OBJ_RELEASE(contents);

    return OPAL_ERROR;
}
1638
1639 /*
1640 * Runs in main thread
1641 */
1642 static void *show_help_rdmacm_event_error (struct rdma_cm_event *event)
1643 {
1644 id_context_t *context = (id_context_t*) event->id->context;
1645
1646 if (RDMA_CM_EVENT_DEVICE_REMOVAL == event->event) {
1647 opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
1648 "rdma cm device removal", true,
1649 opal_process_info.nodename,
1650 ibv_get_device_name(event->id->verbs->device));
1651 } else {
1652 const char *device = "Unknown";
1653 if (NULL != event->id &&
1654 NULL != event->id->verbs &&
1655 NULL != event->id->verbs->device) {
1656 device = ibv_get_device_name(event->id->verbs->device);
1657 }
1658 opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
1659 "rdma cm event error", true,
1660 opal_process_info.nodename,
1661 device,
1662 rdma_event_str(event->event),
1663 opal_get_proc_hostname(context->endpoint->endpoint_proc->proc_opal));
1664 }
1665
1666 return NULL;
1667 }
1668
1669 /*
1670 * Runs in service thread
1671 */
static int event_handler(struct rdma_cm_event *event)
{
    /* Central service-thread dispatcher for all RDMA CM events; routes
       each event type to its handler above and returns that handler's
       status.  Any non-OPAL_SUCCESS return makes the caller
       (rdmacm_event_dispatch) invoke rdmamcm_event_error(). */
    id_context_t *context = (id_context_t*) event->id->context;
#if !BTL_OPENIB_RDMACM_IB_ADDR
    rdmacm_contents_t *contents;
    struct sockaddr *localaddr;
    uint32_t localipaddr;
#if OPAL_ENABLE_DEBUG
    struct sockaddr *peeraddr;
    uint32_t peeripaddr;
#endif
#endif
    int rc = -1;
    opal_btl_openib_ini_values_t ini;
    bool found;

    if (NULL == context) {
        /* NOTE(review): returns the raw -1 initializer here rather than
           an OPAL error code */
        return rc;
    }

#if !BTL_OPENIB_RDMACM_IB_ADDR
    contents = context->contents;

    localaddr = rdma_get_local_addr(event->id);
    localipaddr = ((struct sockaddr_in *)localaddr)->sin_addr.s_addr;
#if OPAL_ENABLE_DEBUG
    peeraddr = rdma_get_peer_addr(event->id);
    peeripaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
#endif

    BTL_VERBOSE(("%s event_handler -- %s, status = %d to %x",
                 contents->server?"server":"client",
                 rdma_event_str(event->event),
                 event->status,
                 peeripaddr));
#endif

    switch (event->event) {
    case RDMA_CM_EVENT_ADDR_RESOLVED:
        OPAL_OUTPUT((-1, "SERVICE Got ADDR_RESOLVED: ID %p", (void*) context->id));
        rc = resolve_route(context);
        break;

    case RDMA_CM_EVENT_ROUTE_RESOLVED:
        OPAL_OUTPUT((-1, "SERVICE Got ROUTE_RESOLVED: ID %p", (void*) context->id));
#if !BTL_OPENIB_RDMACM_IB_ADDR
        /* Remember our local IP address now that the CM has bound us */
        contents->ipaddr = localipaddr;
#endif
        rc = finish_connect(context);
        break;

    case RDMA_CM_EVENT_CONNECT_REQUEST:
        OPAL_OUTPUT((-1, "SERVICE Got CONNECT_REQUEST: ID %p, context %p",
                     (void*) event->id, (void*) context));
        rc = handle_connect_request(event);
        break;

    case RDMA_CM_EVENT_ESTABLISHED:
        OPAL_OUTPUT((-1, "SERVICE Got ESTABLISHED: %p", (void*) event->id));
        rc = rdmacm_connect_endpoint(context, event);
        break;

    case RDMA_CM_EVENT_DISCONNECTED:
        OPAL_OUTPUT((-1, "SERVICE Got DISCONNECTED: %p", (void*) event->id));
        rc = rdmacm_disconnected(context);
        break;

    case RDMA_CM_EVENT_REJECTED:
        OPAL_OUTPUT((-1, "SERVICE Got REJECTED: %p", (void*) event->id));
        rc = rdmacm_rejected(context, event);
        break;

    case RDMA_CM_EVENT_CONNECT_ERROR:
        /* Some adapters have broken REJECT behavior; the recipient
           gets a CONNECT_ERROR event instead of the expected REJECTED
           event.  So if we get a CONNECT_ERROR, see if it's on a
           connection that we're expecting a REJECT (i.e., we have a
           dummy_cq setup).  If it is, and if a) the MCA param
           btl_openib_connect_rdmacm_reject_causes_connect_error is
           true, or b) if rdmacm_reject_causes_connect_error set on
           the device INI values, then just treat this CONNECT_ERROR
           as if it were the REJECT. */
        if (NULL != context->contents->dummy_cq) {
            struct ibv_device_attr *attr =
                &(context->endpoint->endpoint_btl->device->ib_dev_attr);
            found = false;
            if (OPAL_SUCCESS == opal_btl_openib_ini_query(attr->vendor_id,
                                                          attr->vendor_part_id,
                                                          &ini) &&
                ini.rdmacm_reject_causes_connect_error) {
                found = true;
            }
            if (rdmacm_reject_causes_connect_error) {
                found = true;
            }

            if (found) {
                OPAL_OUTPUT((-1, "SERVICE Got CONNECT_ERROR, but ignored: %p", (void*) event->id));
                rc = rdmacm_destroy_dummy_qp(context);
                break;
            }
        }

        /* Otherwise, fall through and handle the error as normal */

    case RDMA_CM_EVENT_UNREACHABLE:
    case RDMA_CM_EVENT_CONNECT_RESPONSE:
    case RDMA_CM_EVENT_ADDR_ERROR:
    case RDMA_CM_EVENT_DEVICE_REMOVAL:
        show_help_rdmacm_event_error (event);
        rc = OPAL_ERROR;
        break;

    case RDMA_CM_EVENT_ROUTE_ERROR:
        /* Route lookup does not necessarily handle retries, and there
           appear to be cases where the subnet manager node can no
           longer handle incoming requests.  The rdma connection
           manager and lower level code doesn't handle retries, so we
           have to. */
        if (context->route_retry_count < rdmacm_resolve_max_retry_count) {
            context->route_retry_count++;
            rc = resolve_route(context);
            break;
        }
        show_help_rdmacm_event_error (event);
        rc = OPAL_ERROR;
        break;

    default:
        /* Unknown error */
        BTL_ERROR(("Unknown RDMA CM error event_handler: %s, status = %d",
                   rdma_event_str(event->event), event->status));
        rc = OPAL_ERROR;
        break;
    }

    return rc;
}
1810
1811 /*
1812 * Runs in event thread
1813 */
1814 static inline void rdmamcm_event_error(struct rdma_cm_event *event)
1815 {
1816 mca_btl_base_endpoint_t *endpoint = NULL;
1817
1818 if (event->id->context) {
1819 endpoint = ((id_context_t *)event->id->context)->contents->endpoint;
1820 }
1821
1822 mca_btl_openib_run_in_main (mca_btl_openib_endpoint_invoke_error,
1823 endpoint);
1824 }
1825
1826 /*
1827 * Runs in event thread
1828 */
1829 static void *rdmacm_event_dispatch(int fd, int flags, void *context)
1830 {
1831 struct rdma_cm_event *event, ecopy;
1832 void *data = NULL;
1833 int rc;
1834
1835 /* blocks until next cm_event */
1836 rc = rdma_get_cm_event(event_channel, &event);
1837 if (0 != rc) {
1838 BTL_ERROR(("rdma_get_cm_event error %d", rc));
1839 return NULL;
1840 }
1841
1842 /* If the incoming event is not acked in a sufficient amount of
1843 * time, there will be a timeout error and the connection will be
1844 * torndown. Also, the act of acking the event destroys the
1845 * included data in the event. In certain circumstances, the time
1846 * it takes to handle a incoming event could approach or exceed
1847 * this time. To prevent this from happening, we will copy the
1848 * event and all of its data, ack the event, and process the copy
1849 * of the event.
1850 */
1851 memcpy(&ecopy, event, sizeof(struct rdma_cm_event));
1852 if (event->param.conn.private_data_len > 0) {
1853 data = malloc(event->param.conn.private_data_len);
1854 if (NULL == data) {
1855 BTL_ERROR(("error mallocing memory"));
1856 return NULL;
1857 }
1858 memcpy(data, event->param.conn.private_data, event->param.conn.private_data_len);
1859 ecopy.param.conn.private_data = data;
1860 }
1861 rdma_ack_cm_event(event);
1862
1863 rc = event_handler(&ecopy);
1864 if (OPAL_SUCCESS != rc) {
1865 rdmamcm_event_error(&ecopy);
1866 }
1867
1868 if (NULL != data) {
1869 free(data);
1870 }
1871
1872 return NULL;
1873 }
1874
1875 /*
1876 * Runs in main thread
1877 *
1878 * CPC init function - Setup all globals here
1879 */
1880 static int rdmacm_init(mca_btl_openib_endpoint_t *endpoint)
1881 {
1882 void *data;
1883
1884 data = calloc(1, sizeof(rdmacm_endpoint_local_cpc_data_t));
1885 if (NULL == data) {
1886 BTL_ERROR(("malloc failed"));
1887 return OPAL_ERR_OUT_OF_RESOURCE;
1888 }
1889 endpoint->endpoint_local_cpc_data = data;
1890
1891 return OPAL_SUCCESS;
1892 }
1893
1894 #if !BTL_OPENIB_RDMACM_IB_ADDR
1895 static int ipaddrcheck(id_context_t *context,
1896 mca_btl_openib_module_t *openib_btl)
1897 {
1898 rdmacm_contents_t *server = context->contents;
1899 uint32_t ipaddr;
1900 bool already_exists = false;
1901 rdmacm_contents_t *contents;
1902 int server_tcp_port = rdma_get_src_port(context->id);
1903 char *str;
1904
1905 /* Look up the IP address of this device/port. This call should not be
1906 * necessary, as rdma_get_local_addr would be more correct in returning the
1907 * IP address given the cm_id (and not necessitate having to do a list look
1908 * up). Unfortunately, the subnet and IP address look up needs to match or
1909 * there could be a mismatch if IP Aliases are being used. For more
1910 * information on this, please read comment above
1911 * mca_btl_openib_get_ip_subnet_id in btl_openib_ip.c
1912 */
1913 ipaddr =
1914 mca_btl_openib_rdma_get_ipv4addr(openib_btl->device->ib_dev_context,
1915 openib_btl->port_num);
1916 if (0 == ipaddr) {
1917 BTL_VERBOSE(("*** Could not find IP address for %s:%d -- is there an IP address configured for this device?",
1918 ibv_get_device_name(openib_btl->device->ib_dev),
1919 openib_btl->port_num));
1920 return OPAL_ERR_NOT_FOUND;
1921 }
1922 str = stringify(ipaddr);
1923 BTL_VERBOSE(("Found device %s:%d = IP address %s:%d",
1924 ibv_get_device_name(openib_btl->device->ib_dev),
1925 openib_btl->port_num, str, server_tcp_port));
1926 free(str);
1927
1928 /* Ok, we found the IP address of this device/port. Have we
1929 already see this IP address/TCP port before? */
1930 OPAL_LIST_FOREACH(contents, &server_listener_list, rdmacm_contents_t) {
1931 BTL_VERBOSE(("paddr = %x, ipaddr addr = %x",
1932 contents->ipaddr, ipaddr));
1933 if (contents->ipaddr == ipaddr &&
1934 contents->tcp_port == server_tcp_port) {
1935 str = stringify(ipaddr);
1936 BTL_VERBOSE(("server already listening on %s:%d",
1937 str, server_tcp_port));
1938 free(str);
1939 already_exists = true;
1940 break;
1941 }
1942 }
1943
1944 /* If we haven't seen it before, save it */
1945 if (!already_exists) {
1946 str = stringify(ipaddr);
1947 BTL_VERBOSE(("creating new server to listen on %s:%d",
1948 str, server_tcp_port));
1949 free(str);
1950 server->ipaddr = ipaddr;
1951 server->tcp_port = server_tcp_port;
1952 }
1953
1954 return already_exists ? OPAL_ERROR : OPAL_SUCCESS;
1955 }
1956 #endif
1957
/*
 * Build the modex message advertised for this server/port and attach it
 * to the CPC module data.  The message carries the device's max RDMA
 * read/atomic depths plus the addressing info a peer needs to connect:
 * GID + service ID in AF_IB mode, IPv4 address + TCP port otherwise.
 * Returns OPAL_SUCCESS or OPAL_ERR_OUT_OF_RESOURCE.  The allocated
 * message is owned by data->cbm_modex_message after success.
 */
static int create_message(rdmacm_contents_t *server,
                          mca_btl_openib_module_t *openib_btl,
                          opal_btl_openib_connect_base_module_data_t *data)
{
    modex_message_t *message;
#if !BTL_OPENIB_RDMACM_IB_ADDR
#if OPAL_ENABLE_DEBUG
    /* only needed for the debug-output stringify below */
    char *a;
#endif
#endif

    message = (modex_message_t *) malloc(sizeof(modex_message_t));
    if (NULL == message) {
        BTL_ERROR(("malloc failed"));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Advertise the device's RDMA read/atomic limits so the peer can
       size its QP attributes compatibly */
    message->device_max_qp_rd_atom =
        openib_btl->device->ib_dev_attr.max_qp_rd_atom;
    message->device_max_qp_init_rd_atom =
        openib_btl->device->ib_dev_attr.max_qp_init_rd_atom;

#if BTL_OPENIB_RDMACM_IB_ADDR
    /* AF_IB addressing: peer connects via GID + SID */
    memcpy(message->gid, server->gid.raw, sizeof(server->gid));
    message->service_id = server->service_id;
#else
    /* IPoIB addressing: peer connects via IPv4 address + TCP port */
    message->ipaddr = server->ipaddr;
    message->tcp_port = server->tcp_port;

    /* OPAL_OUTPUT compiles away when !OPAL_ENABLE_DEBUG, so the
       stringify() assignment to 'a' (and its free) only exist in
       debug builds -- both sides of that pairing are guarded */
    OPAL_OUTPUT((-1, "Message IP address is %s, port %d",
                 a = stringify(message->ipaddr), message->tcp_port));
#if OPAL_ENABLE_DEBUG
    free(a);
#endif
#endif
    data->cbm_modex_message = message;
    /* message_len is a file-level constant (defined above this chunk);
       presumably sizeof(modex_message_t) -- TODO confirm at definition */
    data->cbm_modex_message_len = message_len;

    return OPAL_SUCCESS;
}
1998
1999 /*
2000 * Runs in main thread
2001 *
2002 * This function determines if the RDMACM is a possible cpc method and
2003 * sets it up accordingly.
2004 */
/*
 * Decide whether the rdmacm CPC can service this BTL module.  On
 * success, *cpc points to a newly-malloc'ed, fully-populated module,
 * a listening rdma_cm_id has been bound for this device/port, and the
 * server contents have been appended to server_listener_list.  On
 * failure, everything allocated here is torn down via the goto-cleanup
 * chain and an OPAL error code is returned.
 */
static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, opal_btl_openib_connect_base_module_t **cpc)
{
    int rc;

    id_context_t *context;
    rdmacm_contents_t *server = NULL;

#if BTL_OPENIB_RDMACM_IB_ADDR
    char rdmacm_addr_str[32];
    struct rdma_addrinfo *rdma_addr;
#else
    struct sockaddr_in sin;
#endif

    /* RDMACM is not supported for MPI_THREAD_MULTIPLE */
    if (opal_using_threads()) {
        BTL_VERBOSE(("rdmacm CPC is not supported with MPI_THREAD_MULTIPLE; skipped on %s:%d",
                     ibv_get_device_name(openib_btl->device->ib_dev),
                     openib_btl->port_num));
        rc = OPAL_ERR_NOT_SUPPORTED;
        goto out;
    }

    /* RDMACM is not supported if we have any XRC QPs */
    if (mca_btl_openib_component.num_xrc_qps > 0) {
        BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d",
                     ibv_get_device_name(openib_btl->device->ib_dev),
                     openib_btl->port_num));
        rc = OPAL_ERR_NOT_SUPPORTED;
        goto out;
    }
    /* QP 0 must be per-peer (see the cbm_uses_cts note below) */
    if (!BTL_OPENIB_QP_TYPE_PP(0)) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "rdmacm CPC only supported when the first QP is a PP QP; skipped");
        rc = OPAL_ERR_NOT_SUPPORTED;
        goto out;
    }

    BTL_VERBOSE(("rdmacm_component_query"));

    /* Allocate and populate the CPC module handed back to the caller */
    *cpc = (opal_btl_openib_connect_base_module_t *) malloc(sizeof(opal_btl_openib_connect_base_module_t));
    if (NULL == *cpc) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out;
    }

    (*cpc)->data.cbm_component = &opal_btl_openib_connect_rdmacm;
    (*cpc)->data.cbm_priority = rdmacm_priority;
    (*cpc)->data.cbm_modex_message = NULL;
    (*cpc)->data.cbm_modex_message_len = 0;
    (*cpc)->cbm_endpoint_init = rdmacm_init;
    (*cpc)->cbm_start_connect = rdmacm_module_start_connect;
    (*cpc)->cbm_endpoint_finalize = rdmacm_endpoint_finalize;
    (*cpc)->cbm_finalize = NULL;
    /* Setting uses_cts=true also guarantees that we'll only be
       selected if QP 0 is PP */
    (*cpc)->cbm_uses_cts = true;

    /* Start monitoring the fd associated with the cm_device */
    server = OBJ_NEW(rdmacm_contents_t);
    if (NULL == server) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out1;
    }
    server->server = true;
    server->openib_btl = openib_btl;

    context = OBJ_NEW(id_context_t);
    OPAL_OUTPUT((-1, "MAIN Server context: %p", (void*) context));
    if (NULL == context) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm CPC system error (malloc failed)");
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out3;
    }
    /* context holds its own reference on server; released with context */
    context->contents = server;
    OBJ_RETAIN(context->contents);
    opal_list_append(&(server->ids), &(context->super));
    context->qpnum = 0;

    rc = rdma_create_id(event_channel, &(context->id), context, RDMA_PS_TCP);
    if (0 != rc) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm CPC failed to create ID");
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out4;
    }
#if !BTL_OPENIB_RDMACM_IB_ADDR
    /* NOTE(review): rdmacm_port is stored straight into sin_port with no
       htons(); presumably it is already kept in network byte order where
       the MCA parameter is parsed -- verify at its definition */
    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = rdmacm_addr;
    sin.sin_port = (uint16_t) rdmacm_port;
#else
    /* AF_IB mode: derive a bindable address from the local port GID */
    rc = ibv_query_gid(openib_btl->device->ib_pd->context, openib_btl->port_num,
                       mca_btl_openib_component.gid_index, &server->gid);
    if (0 != rc) {
        BTL_ERROR(("local gid query failed"));
        goto out4;
    }

    /* The 16-byte GID formats exactly like an IPv6 address string */
    if (NULL == inet_ntop(AF_INET6, server->gid.raw,
                          rdmacm_addr_str, sizeof rdmacm_addr_str)) {
        BTL_ERROR(("local gaddr string creating fail"));
        goto out4;
    }

    rc = get_rdma_addr(rdmacm_addr_str, NULL, &rdma_addr, 1);
    if (OPAL_SUCCESS != rc) {
        BTL_ERROR(("server: create rdma addr error"));
        goto out4;
    }
#endif
    /* Bind the rdmacm server to the local IP address and an ephemerial
     * port or one specified by a comand arg.
     */
    rc = rdma_bind_addr(context->id,
#if BTL_OPENIB_RDMACM_IB_ADDR
                        rdma_addr->ai_src_addr);
#else
                        (struct sockaddr *)&sin);
#endif
    if (0 != rc) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm CPC unable to bind to address");
        rc = OPAL_ERR_UNREACH;
#if BTL_OPENIB_RDMACM_IB_ADDR
        rdma_freeaddrinfo(rdma_addr);
#endif
        goto out5;
    }
#if BTL_OPENIB_RDMACM_IB_ADDR
    /* Remember the SID the kernel assigned; it goes into the modex */
    server->service_id = ((struct sockaddr_ib *) (&context->id->route.addr.src_addr))->sib_sid;
    rdma_freeaddrinfo(rdma_addr);
#else
    /* Verify that the device has a valid IP address on it, or we
       cannot use the cpc */
    rc = ipaddrcheck(context, openib_btl);
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm IP address not found on port");
        rc = OPAL_ERR_NOT_SUPPORTED;
        goto out5;
    }
#endif
    /* Listen on the specified address/port with the rdmacm, limit the
       amount of incoming connections to 1024 */
    /* FIXME - 1024 should be (num of connectors *
       mca_btl_openib_component.num_qps) */
    rc = rdma_listen(context->id, 1024);
    if (0 != rc) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm CPC unable to listen");
        rc = OPAL_ERR_UNREACH;
        goto out5;
    }

    rc = create_message(server, openib_btl, &(*cpc)->data);
    if (0 != rc) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm CPC unable to create message");
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out5;
    }

    opal_list_append(&server_listener_list, &(server->super));

    opal_output_verbose(5, opal_btl_base_framework.framework_output,
                        "openib BTL: rdmacm CPC available for use on %s:%d",
                        ibv_get_device_name(openib_btl->device->ib_dev),
                        openib_btl->port_num);
    return OPAL_SUCCESS;

    /* Cleanup chain: each label undoes one more successful step;
       note the deliberate fall-through from out5 to out4. */
out5:
    /*
     * Since rdma_create_id() succeeded, we need "rdma_destroy_id(context->id)".
     * But don't do it here since it's part of out4:OBJ_RELEASE(context),
     * and we don't want to do it twice.
     */
out4:
    opal_list_remove_first(&(server->ids));
    OBJ_RELEASE(context);
out3:
    OBJ_RELEASE(server);
out1:
    free(*cpc);
out:
    if (OPAL_ERR_NOT_SUPPORTED == rc) {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped",
                            ibv_get_device_name(openib_btl->device->ib_dev),
                            openib_btl->port_num);
    } else {
        opal_output_verbose(5, opal_btl_base_framework.framework_output,
                            "openib BTL: rmacm CPC unavailable for use on %s:%d; fatal error %d (%s)",
                            ibv_get_device_name(openib_btl->device->ib_dev),
                            openib_btl->port_num, rc,
                            opal_strerror(rc));
    }
    return rc;
}
2205
2206 /*
2207 * Invoked by main thread
2208 *
2209 * Shutting down the whole thing.
2210 */
2211 static int rdmacm_component_finalize(void)
2212 {
2213 opal_list_item_t *item, *item2;
2214
2215 BTL_VERBOSE(("rdmacm_component_finalize"));
2216
2217 /* If we're just trolling through ompi_info, don't bother doing
2218 anything */
2219 if (!rdmacm_component_initialized) {
2220 return OPAL_SUCCESS;
2221 }
2222
2223 if (rdmacm_event_base) {
2224 opal_event_del (&rdmacm_event);
2225 opal_progress_thread_finalize (NULL);
2226 rdmacm_event_base = NULL;
2227 }
2228
2229 /* The event thread is no longer running; no need to lock access
2230 to the client_list */
2231 OPAL_LIST_DESTRUCT(&client_list);
2232
2233 /* For each of the items in the server list, there's only one item
2234 in the "ids" list -- the server listener. So explicitly
2235 destroy its RDMA ID context. */
2236 while (NULL != (item = opal_list_remove_first(&server_listener_list))) {
2237 rdmacm_contents_t *contents = (rdmacm_contents_t*) item;
2238 item2 = opal_list_remove_first(&(contents->ids));
2239 OBJ_RELEASE(item2);
2240 OBJ_RELEASE(item);
2241 }
2242 OBJ_DESTRUCT(&server_listener_list);
2243
2244 /* Now we're all done -- destroy the event channel */
2245 if (NULL != event_channel) {
2246 rdma_destroy_event_channel(event_channel);
2247 event_channel = NULL;
2248 }
2249
2250 mca_btl_openib_free_rdma_addr_list();
2251
2252 pthread_cond_destroy (&rdmacm_disconnect_cond);
2253 pthread_mutex_destroy (&rdmacm_disconnect_lock);
2254
2255 return OPAL_SUCCESS;
2256 }
2257
2258 #if BTL_OPENIB_RDMACM_IB_ADDR
2259 static int rdmacm_check_ibaddr_support(void)
2260 {
2261 int rsock;
2262 rsock = rsocket(AF_IB, SOCK_STREAM, 0);
2263 if (rsock < 0) {
2264 return OPAL_ERROR;
2265 }
2266
2267 rclose(rsock);
2268
2269 return OPAL_SUCCESS;
2270 }
2271 #endif
2272
2273 static int rdmacm_component_init(void)
2274 {
2275 int rc;
2276
2277 OBJ_CONSTRUCT(&server_listener_list, opal_list_t);
2278 OBJ_CONSTRUCT(&client_list, opal_list_t);
2279 OBJ_CONSTRUCT(&client_list_lock, opal_mutex_t);
2280
2281 #if !BTL_OPENIB_RDMACM_IB_ADDR
2282 rc = mca_btl_openib_build_rdma_addr_list();
2283 if (OPAL_SUCCESS != rc) {
2284 opal_output_verbose(5, opal_btl_base_framework.framework_output,
2285 "openib BTL: rdmacm CPC unable to find any valid IP address");
2286 return OPAL_ERR_NOT_SUPPORTED;
2287 }
2288 #else
2289 rc = rdmacm_check_ibaddr_support();
2290 if (OPAL_SUCCESS != rc) {
2291 opal_output_verbose(5, opal_btl_base_framework.framework_output,
2292 "There is no IB_AF addressing support by lib rdmacm");
2293 return OPAL_ERR_NOT_SUPPORTED;
2294 }
2295 #endif
2296
2297 event_channel = rdma_create_event_channel();
2298 if (NULL == event_channel) {
2299 opal_output_verbose(5, opal_btl_base_framework.framework_output,
2300 "openib BTL: rdmacm CPC failed to create channel");
2301 return OPAL_ERR_UNREACH;
2302 }
2303
2304 rdmacm_event_base = opal_progress_thread_init (NULL);
2305 if (NULL == rdmacm_event_base) {
2306 opal_output_verbose (5, opal_btl_base_framework.framework_output,
2307 "openib BTL: could not create rdmacm event thread");
2308 return OPAL_ERR_UNREACH;
2309 }
2310
2311 opal_event_set (rdmacm_event_base, &rdmacm_event, event_channel->fd,
2312 OPAL_EV_READ | OPAL_EV_PERSIST, rdmacm_event_dispatch, NULL);
2313
2314 opal_event_add (&rdmacm_event, 0);
2315
2316 pthread_cond_init (&rdmacm_disconnect_cond, NULL);
2317 pthread_mutex_init (&rdmacm_disconnect_lock, NULL);
2318
2319 rdmacm_component_initialized = true;
2320
2321 return OPAL_SUCCESS;
2322 }
2323