/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2007-2013 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007-2008 Chelsio, Inc. All rights reserved.
 * Copyright (c) 2008      Mellanox Technologies. All rights reserved.
 * Copyright (c) 2009      Sandia National Laboratories. All rights reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2012-2017 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
 * Copyright (c) 2014      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include <rdma/rdma_cma.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <sys/types.h>
#ifdef HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif
#ifdef HAVE_SYS_IOCTL_H
#include <sys/ioctl.h>
#endif
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_NET_IF_H
#include <net/if.h>
#endif
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif
#include <stddef.h>

#include "opal/util/output.h"
#include "opal/util/error.h"
#include "opal/util/show_help.h"
#include "opal/util/proc.h"
#include "opal/runtime/opal_progress_threads.h"

#include "btl_openib_proc.h"
#include "btl_openib_endpoint.h"
#include "connect/connect.h"
#include "btl_openib_ip.h"
#include "btl_openib_ini.h"

#if BTL_OPENIB_RDMACM_IB_ADDR
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/types.h>
#include <rdma/rsocket.h>
#include <infiniband/ib.h>
#endif

#define mymin(a, b) ((a) < (b) ? (a) : (b))

static void rdmacm_component_register(void);
static int rdmacm_component_init(void);
static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl,
                                  opal_btl_openib_connect_base_module_t **cpc);
static int rdmacm_component_finalize(void);

opal_btl_openib_connect_base_component_t opal_btl_openib_connect_rdmacm = {
    "rdmacm",
    rdmacm_component_register,
    rdmacm_component_init,
    rdmacm_component_query,
    rdmacm_component_finalize
};

/*
 * A single instance of this data structure is shared by all of the
 * id_context_t's (one per BSRQ qp) on an endpoint.
 */
typedef struct {
    opal_list_item_t super;
    mca_btl_openib_endpoint_t *endpoint;
    mca_btl_openib_module_t *openib_btl;
    /* Dummy QP only used when we expect the connection to be
       rejected */
    struct ibv_cq *dummy_cq;
#if BTL_OPENIB_RDMACM_IB_ADDR
    union ibv_gid gid;
    uint64_t service_id;
#else
    uint32_t ipaddr;
    uint16_t tcp_port;
#endif
    /* server==false means that this proc initiated the connection;
       server==true means that this proc accepted the incoming
       connection.  Note that this may be different than the "one way"
       / i_initiate() direction -- it is possible for server==false
       and i_initiate() to return false; it means that this proc
       initially initiated the connection, but we expect it to be
       rejected. */
    bool server;

    /* Whether this contents struct has been saved on the client list
       or not */
    bool on_client_list;

    /* A list of all the id_context_t's that are using this
       rdmacm_contents_t */
    opal_list_t ids;
} rdmacm_contents_t;

static void rdmacm_contents_constructor(rdmacm_contents_t *contents);
static void rdmacm_contents_destructor(rdmacm_contents_t *contents);
OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t,
                   rdmacm_contents_constructor,
                   rdmacm_contents_destructor);

typedef struct {
    int device_max_qp_rd_atom;
    int device_max_qp_init_rd_atom;
#if BTL_OPENIB_RDMACM_IB_ADDR
    uint8_t  gid[16];
    uint64_t service_id;
#else
    uint32_t ipaddr;
    uint16_t tcp_port;
#endif
    uint8_t end;
} modex_message_t;

typedef struct {
    int rdmacm_counter;
} rdmacm_endpoint_local_cpc_data_t;

/*
 * There is one of these for each RDMA CM ID.  Because of BSRQ, there
 * can be multiple of these for one endpoint, so all the
 * id_context_t's on a single endpoint share a single
 * rdmacm_contents_t.
 */
typedef struct {
    opal_list_item_t super;
    rdmacm_contents_t *contents;
    mca_btl_openib_endpoint_t *endpoint;
    uint8_t qpnum;
    bool already_disconnected;
    uint16_t route_retry_count;
    struct rdma_cm_id *id;
} id_context_t;

static void id_context_constructor(id_context_t *context);
static void id_context_destructor(id_context_t *context);
OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t,
                   id_context_constructor,
                   id_context_destructor);

typedef struct {
#if BTL_OPENIB_RDMACM_IB_ADDR
    /*
     * According to the InfiniBand spec, the "Consumer Private Data"
     * begins at the 36th byte and runs through the 91st byte (so the
     * limit is 56 bytes), and the first 36 bytes are reserved for the
     * librdmacm header (sometimes not all of these bytes are used).
     * So we must take into account that in the AF_IB case, the user
     * private data pointer points to the header and not directly to
     * the "Consumer Private Data".
     */
    uint8_t  librdmacm_header[36];
    uint64_t rem_port;
#else
    uint16_t rem_port;
#endif
    uint32_t rem_index;
    uint8_t qpnum;
    opal_process_name_t rem_name;
} __opal_attribute_packed__ private_data_t;
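
/* Illustrative sanity check (editor's sketch; not part of the original
   code): in the AF_IB case, the 36-byte librdmacm header plus the 56
   bytes of consumer private data give a hard ceiling of 92 bytes for
   this packed struct.  On a C11 compiler that could be asserted at
   compile time:

     _Static_assert(sizeof(private_data_t) <= 36 + 56,
                    "private_data_t exceeds the IB CM private data limit");
 */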

#if !BTL_OPENIB_RDMACM_IB_ADDR
/* Used to send a specific show_help message from the service_thread
   to the main thread (because we can't call show_help from the
   service_thread) */
typedef struct {
    char device_name[32];
    uint32_t peer_ip_addr;
    uint32_t peer_tcp_port;
} cant_find_endpoint_context_t;
#endif

static opal_list_t server_listener_list;
static opal_list_t client_list;
static opal_mutex_t client_list_lock;
static struct rdma_event_channel *event_channel = NULL;
static int rdmacm_priority = 30;
static unsigned int rdmacm_port = 0;

#if !BTL_OPENIB_RDMACM_IB_ADDR
static uint32_t rdmacm_addr = 0;
#endif

static int rdmacm_resolve_timeout = 30000;
static int rdmacm_resolve_max_retry_count = 20;
static bool rdmacm_reject_causes_connect_error = false;
static pthread_cond_t rdmacm_disconnect_cond;
static pthread_mutex_t rdmacm_disconnect_lock;
static volatile int disconnect_callbacks = 0;
static bool rdmacm_component_initialized = false;
static opal_event_base_t *rdmacm_event_base = NULL;
static opal_event_t rdmacm_event;

/* Calculate the *real* length of the message (not aligned/rounded
   up) */
static int message_len = offsetof(modex_message_t, end);
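
/* Worked example (editor's sketch; exact numbers depend on the build):
   the trailing "end" marker exists so that offsetof() yields the number
   of meaningful bytes rather than the padded sizeof().  In a typical
   non-AF_IB build the fields are two ints, a uint32_t, and a uint16_t,
   so message_len = 4 + 4 + 4 + 2 = 14 bytes, whereas
   sizeof(modex_message_t) would normally round up to 16. */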

/* Rejection reasons */
typedef enum {
    REJECT_WRONG_DIRECTION,
    REJECT_TRY_AGAIN
} reject_reason_t;

static void id_context_constructor(id_context_t *context)
{
    context->already_disconnected = false;
    context->id = NULL;
    context->contents = NULL;
    context->endpoint = NULL;
    context->qpnum = 255;
    context->route_retry_count = 0;
}

static void id_context_destructor(id_context_t *context)
{
    if (NULL != context->id) {
        rdma_destroy_id(context->id);
        context->id = NULL;
    }
    if (NULL != context->contents) {
        OBJ_RELEASE(context->contents);
    }
}

static void rdmacm_contents_constructor(rdmacm_contents_t *contents)
{
    contents->endpoint = NULL;
    contents->openib_btl = NULL;
    contents->dummy_cq = NULL;
#if BTL_OPENIB_RDMACM_IB_ADDR
    contents->service_id = 0;
#else
    contents->ipaddr = 0;
    contents->tcp_port = 0;
#endif
    contents->server = false;
    contents->on_client_list = false;
    OBJ_CONSTRUCT(&(contents->ids), opal_list_t);
}

static void rdmacm_contents_destructor(rdmacm_contents_t *contents)
{
    OBJ_DESTRUCT(&(contents->ids));
}

/*
 * Invoked by main thread
 *
 * Sets up any rdma_cm-specific command line params
 */
static void rdmacm_component_register(void)
{
    /* the priority is initialized in the declaration above */
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_priority",
                                           "The selection method priority for rdma_cm",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_priority);
    if (rdmacm_priority > 100) {
        rdmacm_priority = 100;
    } else if (rdmacm_priority < 0) {
        rdmacm_priority = 0;
    }

    rdmacm_port = 0;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_port",
                                           "The selection method port for rdma_cm",
                                           MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_port);
    if (rdmacm_port & ~0xfffful) {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "illegal tcp port", true, (int) rdmacm_port);
        rdmacm_port = 0;
    }
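
    /* For example (editor's note): a requested port of 70000 (0x11170)
       has bits set above 0xffff, so the mask test above rejects it and
       the port falls back to 0, i.e. "let the OS pick an ephemeral
       port". */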

    rdmacm_resolve_timeout = 30000;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_resolve_timeout",
                                           "The timeout (in milliseconds) for address and route resolution",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_resolve_timeout);
    if (0 > rdmacm_resolve_timeout) {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "illegal timeout", true, rdmacm_resolve_timeout);
        rdmacm_resolve_timeout = 30000;
    }

    rdmacm_resolve_max_retry_count = 20;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_retry_count",
                                           "Maximum number of times rdmacm will retry route resolution",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_resolve_max_retry_count);
    if (0 > rdmacm_resolve_max_retry_count) {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "illegal retry count", true, rdmacm_resolve_max_retry_count);
        rdmacm_resolve_max_retry_count = 20;
    }

    rdmacm_reject_causes_connect_error = false;
    (void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
                                           "connect_rdmacm_reject_causes_connect_error",
                                           "The drivers for some devices are buggy such that an RDMA REJECT action may result in a CONNECT_ERROR event instead of a REJECTED event.  Setting this MCA parameter to true tells Open MPI to treat CONNECT_ERROR events on connections where a REJECT is expected as a REJECT (default: false)",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &rdmacm_reject_causes_connect_error);
}

/*
 * Helper function for when we are debugging
 */
static char *stringify(uint32_t addr)
{
    char *line = NULL;

    /* asprintf() allocates the buffer for us */
    if (0 > asprintf(&line, "%d.%d.%d.%d (0x%x)",
#if defined(WORDS_BIGENDIAN)
                     (addr >> 24),
                     (addr >> 16) & 0xff,
                     (addr >> 8) & 0xff,
                     addr & 0xff,
#else
                     addr & 0xff,
                     (addr >> 8) & 0xff,
                     (addr >> 16) & 0xff,
                     (addr >> 24),
#endif
                     addr)) {
        return NULL;
    }
    return line;
}

/*
 * Invoked by service thread
 *
 * This function traverses the list of endpoints associated with the
 * device and determines which of them the remote side is attempting
 * to connect to.  This is determined based on the local endpoint's
 * modex message received and the IP address and port associated with
 * the rdma_cm event id
 */
static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *contents,
                                                       opal_process_name_t rem_name)
{
    mca_btl_openib_module_t *btl = contents->openib_btl;
    mca_btl_openib_endpoint_t *ep = NULL;
    opal_proc_t *opal_proc;

    opal_proc = opal_proc_for_name (rem_name);
    if (NULL == opal_proc) {
        BTL_ERROR(("could not get proc associated with remote peer %s",
                   opal_process_name_print (rem_name)));
        return NULL;
    }

    ep = mca_btl_openib_get_ep (&btl->super, opal_proc);
    if (NULL == ep) {
        BTL_ERROR(("could not find endpoint for peer %s",
                   opal_process_name_print (rem_name)));
    }

    return ep;
}

/*
 * Returns max inline size for qp #N
 */
static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device)
{
    if (mca_btl_openib_component.qp_infos[qp].size <= device->max_inline_data) {
        /* If the qp message size is smaller than max_inline_data,
         * we should enable inline messages */
        return mca_btl_openib_component.qp_infos[qp].size;
    } else if (mca_btl_openib_component.rdma_qp == qp || 0 == qp) {
        /* If the qp message size is bigger than max_inline_data, we
         * should enable inline messages only for the RDMA QP (for
         * PUT/GET fin messages) and for the first qp */
        return device->max_inline_data;
    }
    /* Otherwise there is no reason to use inline */
    return 0;
}
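
/* Worked example (editor's sketch with hypothetical numbers): if
   qp_infos[2].size == 4096 and the device reports max_inline_data ==
   128, then max_inline_size(2, device) returns 128 when qp 2 is the
   RDMA QP (or qp 0) and 0 otherwise; a QP whose message size is <= 128
   would instead get its own message size back. */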


/*
 * Invoked by both main and service threads
 */
static int rdmacm_setup_qp(rdmacm_contents_t *contents,
                           mca_btl_openib_endpoint_t *endpoint,
                           struct rdma_cm_id *id,
                           int qpnum)
{
    struct ibv_qp_init_attr attr;
    struct ibv_qp *qp;
    struct ibv_srq *srq = NULL;
    int credits = 0, reserved = 0, max_recv_wr, max_send_wr;
    size_t req_inline;

    if (qpnum == mca_btl_openib_component.credits_qp) {
        int qp;

        for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
            if(BTL_OPENIB_QP_TYPE_PP(qp)) {
                reserved += mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
            }
        }
        credits = mca_btl_openib_component.num_qps;
    }

    if (BTL_OPENIB_QP_TYPE_PP(qpnum)) {
        max_recv_wr = mca_btl_openib_component.qp_infos[qpnum].rd_num + reserved;
        max_send_wr = mca_btl_openib_component.qp_infos[qpnum].rd_num + credits;
    } else {
        srq = endpoint->endpoint_btl->qps[qpnum].u.srq_qp.srq;
        max_recv_wr = reserved;
        max_send_wr = mca_btl_openib_component.qp_infos[qpnum].u.srq_qp.sd_max + credits;
    }

    memset(&attr, 0, sizeof(attr));
    attr.qp_type = IBV_QPT_RC;
    attr.send_cq = contents->openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
    attr.recv_cq = contents->openib_btl->device->ib_cq[qp_cq_prio(qpnum)];
    attr.srq = srq;
    if(BTL_OPENIB_QP_TYPE_PP(qpnum)) {
        /* Add one for the CTS receive frag that will be posted */
        attr.cap.max_recv_wr = max_recv_wr + 1;
    } else {
        attr.cap.max_recv_wr = 0;
    }
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_inline_data = req_inline =
        max_inline_size(qpnum, contents->openib_btl->device);
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1; /* we do not use SG list */

    {
        /* JMS Temporary gross hack: we *must* use rdma_create_qp()
           (vs. ibv_create_qp()) because strange things happen on IB
           if we don't.  However, rdma_create_qp() wants us to use
           rdma_get_devices() (and therefore the pd that they have
           allocated).  In order to get v1.3 out the door, we're
           bypassing this functionality - we're temporarily overriding
           the device context cached on the ID with our own, so that
           our pd will match.  We need to fix this to properly get the
           pd from the RDMA CM and use that, etc. */
        struct ibv_context *temp = id->verbs;
        id->verbs = contents->openib_btl->device->ib_pd->context;
        if (0 != rdma_create_qp(id, contents->openib_btl->device->ib_pd,
                                &attr)) {
            BTL_ERROR(("Failed to create qp with %d", qpnum));
            goto out;
        }
        qp = id->qp;
        id->verbs = temp;
    }

    endpoint->qps[qpnum].qp->lcl_qp = qp;
    endpoint->qps[qpnum].credit_frag = NULL;
    if (attr.cap.max_inline_data < req_inline) {
        endpoint->qps[qpnum].ib_inline_max = attr.cap.max_inline_data;
        opal_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true,
                       opal_process_info.nodename,
                       ibv_get_device_name(contents->openib_btl->device->ib_dev),
                       contents->openib_btl->port_num,
                       req_inline, attr.cap.max_inline_data);
    } else {
        endpoint->qps[qpnum].ib_inline_max = req_inline;
    }
    id->qp = qp;

    return OPAL_SUCCESS;

out:
    return OPAL_ERROR;
}


/*
 * Invoked by both main and service threads
 *
 * To avoid all kinds of nasty race conditions, we only allow
 * connections to be made in one direction.  So use a simple
 * (arbitrary) test to decide which direction is allowed to initiate
 * the connection: the process with the higher IP address wins (the
 * comparison is on the raw 32-bit address values).  If the IP
 * addresses are the same (i.e., the MPI procs are on the same node),
 * then the process with the lower TCP port wins.
 */
static bool i_initiate(uint64_t local_port, uint64_t remote_port,
#if BTL_OPENIB_RDMACM_IB_ADDR
                       union ibv_gid *local_gid, union ibv_gid *remote_gid)
{
#else
                       uint32_t local_ipaddr, uint32_t remote_ipaddr)
{
#if OPAL_ENABLE_DEBUG
    char *a = stringify(local_ipaddr);
    char *b = stringify(remote_ipaddr);
#endif
#endif

#if BTL_OPENIB_RDMACM_IB_ADDR
    if (local_gid->global.subnet_prefix < remote_gid->global.subnet_prefix ||
        (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix &&
         local_gid->global.interface_id < remote_gid->global.interface_id) ||
        (local_gid->global.subnet_prefix == remote_gid->global.subnet_prefix &&
         local_gid->global.interface_id == remote_gid->global.interface_id &&
#else
    if (local_ipaddr > remote_ipaddr ||
        (local_ipaddr == remote_ipaddr &&
#endif
              local_port < remote_port)) {
#if !BTL_OPENIB_RDMACM_IB_ADDR
        OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s",
                     a, b));
#if OPAL_ENABLE_DEBUG
        free(a);
        free(b);
#endif
#endif
        return true;
    }
#if !BTL_OPENIB_RDMACM_IB_ADDR
    OPAL_OUTPUT((-1, "i_initiate (I lose): local ipaddr %s, remote ipaddr %s",
                 a, b));
#if OPAL_ENABLE_DEBUG
    free(a);
    free(b);
#endif
#endif
    return false;
}
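
/* Worked example of the direction rule above (editor's sketch with
   hypothetical IPv4 addresses): for peer A at 10.0.0.2:1024 and peer B
   at 10.0.0.1:2048, A's raw address value is larger, so i_initiate()
   returns true on A and false on B.  If both peers were on 10.0.0.1,
   the one with the lower TCP port (1024) would initiate instead. */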

#if BTL_OPENIB_RDMACM_IB_ADDR
static int get_rdma_addr(char *src, char *dst,
                         struct rdma_addrinfo **rdma_addr,
                         int server)
{
    int rc;
    struct rdma_addrinfo hints, *sres, *dres;

    memset(&hints, 0, sizeof hints);

    hints.ai_family = AF_IB;
    hints.ai_port_space = RDMA_PS_TCP;
    hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY | RAI_PASSIVE;

    rc = rdma_getaddrinfo(src, NULL, &hints, &sres);
    if (0 != rc) {
        return OPAL_ERROR;
    }

    if (server) {
        *rdma_addr = sres;
        return OPAL_SUCCESS;
    }

    hints.ai_src_len  = sres->ai_src_len;
    hints.ai_src_addr = sres->ai_src_addr;

    hints.ai_flags &= ~RAI_PASSIVE;

    rc = rdma_getaddrinfo(dst, NULL, &hints, &dres);
    if (0 != rc) {
        rdma_freeaddrinfo(sres);
        return OPAL_ERROR;
    }

    rdma_freeaddrinfo(sres);
    *rdma_addr = dres;

    return OPAL_SUCCESS;
}
#endif

/*
 * Invoked by main thread
 */
static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
                                     modex_message_t *message,
                                     int num)
{
    int rc;
    id_context_t *context;
#if BTL_OPENIB_RDMACM_IB_ADDR
    char src_addr[32], dst_addr[32];
    struct rdma_addrinfo *rdma_addr;
#else
    struct sockaddr_in src_in, dest_in;

#if OPAL_ENABLE_DEBUG
    char *a, *b;
#endif
#endif

    /* We'll need to access some data in the event handler.  We can
     * encapsulate it in this data struct and attach it to the id being
     * created below.  The event->id will contain this same pointer.
     */
    context = OBJ_NEW(id_context_t);
    if (NULL == context) {
        BTL_ERROR(("malloc error"));
        goto out;
    }

    context->contents = contents;
    OBJ_RETAIN(contents);
    context->qpnum = num;
    context->endpoint = contents->endpoint;

    rc = rdma_create_id(event_channel, &(context->id),
                        context, RDMA_PS_TCP);
    if (0 != rc) {
        BTL_ERROR(("Failed to create a rdma id with %d", rc));
        goto out1;
    }
#if !BTL_OPENIB_RDMACM_IB_ADDR
    /* Source address (we must specify this to ensure that the traffic
       goes out on the device+port that we expect it to go out on). */
    memset(&src_in, 0, sizeof(src_in));
    src_in.sin_family = AF_INET;
    src_in.sin_addr.s_addr = contents->ipaddr;
    src_in.sin_port = 0;

    /* Destination address */
    memset(&dest_in, 0, sizeof(dest_in));
    dest_in.sin_family = AF_INET;
    dest_in.sin_addr.s_addr = message->ipaddr;
    dest_in.sin_port = message->tcp_port;

    /* Once the route to the remote system is discovered, a
     * RDMA_CM_EVENT_ADDR_RESOLVED event will occur on the local event
     * handler.
     */
    OPAL_OUTPUT((-1, "MAIN Resolving id: from IP %s:%d to IP %s:%d",
                 a = stringify(contents->ipaddr),
                 contents->tcp_port,
                 b = stringify(message->ipaddr),
                 message->tcp_port));
#if OPAL_ENABLE_DEBUG
    free(a);
    free(b);
#endif
#endif
    /* This is odd and worth explaining: when we place the context on
       the ids list, we need to add an extra RETAIN to the context.
       The reason is because of a race condition.  Let's explain
       through a few cases:

       1. Normal termination: client side endpoint_finalize removes
          the context from the ids list, has its service thread call
          rdma_disconnect(), and then RELEASE.  A DISCONNECT event
          will occur on both sides; the client DISCONNECT will invoke
          RELEASE again on the context.  Note that the DISCONNECT
          event may occur *very* quickly on the client side, so the
          order of these two RELEASEs is not known.  The destructor
          will invoke rdma_destroy_id() -- we obviously can't have
          this happen before both actions complete.  Hence,
          refcounting (and the additional RETAIN) saves us.

          Note that the server side never had the context on the ids
          list, so it never had an extra RETAIN.  So the DISCONNECT on
          the server side will only invoke one RELEASE.

       2. Abnormal termination: if the server side terminates
          improperly (e.g., user's app segv's), then the kernel from
          the server side will send a DISCONNECT event to the client
          before the item has been removed from the ids list.  This
          will cause an assertion failure in debug builds (because
          we'll be trying to RELEASE an opal_list_item_t that is still
          on a list), and possibly other badness in optimized builds
          because we'll be traversing a freed opal_list_item_t in
          endpoint_finalize.  So the extra RETAIN here right when we
          put the item on the list prevents it from actually being
          released in the client until BOTH the endpoint_finalize
          occurs *and* the DISCONNECT event arrives.

       Asynchronous programming is fun!
     */
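
    /* Refcount lifecycle sketched by the comment above (editor's
       illustration of the normal client-side termination case):

         OBJ_NEW(context)                  refcount = 1
         OBJ_RETAIN (list append, below)   refcount = 2
         endpoint_finalize RELEASE         refcount = 1
         DISCONNECT event RELEASE          refcount = 0 -> destructor
                                           calls rdma_destroy_id()
     */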
    OBJ_RETAIN(context);
    opal_list_append(&(contents->ids), &(context->super));
#if BTL_OPENIB_RDMACM_IB_ADDR
    if (NULL == inet_ntop(AF_INET6, contents->gid.raw,
                          src_addr, sizeof src_addr)) {
        BTL_ERROR(("failed to create local addr string"));
        goto out1;
    }

    if (NULL == inet_ntop(AF_INET6, message->gid,
                          dst_addr, sizeof dst_addr)) {
        BTL_ERROR(("failed to create remote addr string"));
        goto out1;
    }

    rc = get_rdma_addr(src_addr, dst_addr, &rdma_addr, 0);
    if (OPAL_SUCCESS != rc) {
        BTL_ERROR(("client: create rdma addr error"));
        goto out1;
    }

    ((struct sockaddr_ib *) (rdma_addr->ai_dst_addr))->sib_sid = message->service_id;
#endif
    rc = rdma_resolve_addr(context->id,
#if BTL_OPENIB_RDMACM_IB_ADDR
                           rdma_addr->ai_src_addr,
                           rdma_addr->ai_dst_addr,
#else
                           (struct sockaddr *) &src_in,
                           (struct sockaddr *) &dest_in,
#endif
                           rdmacm_resolve_timeout);
    if (0 != rc) {
        BTL_ERROR(("Failed to resolve the remote address with %d", rc));
#if BTL_OPENIB_RDMACM_IB_ADDR
        rdma_freeaddrinfo(rdma_addr);
#endif
        goto out1;
    }
#if BTL_OPENIB_RDMACM_IB_ADDR
    rdma_freeaddrinfo(rdma_addr);
#endif

    return OPAL_SUCCESS;

out1:
    OBJ_RELEASE(context);
out:
    return OPAL_ERROR;
}

/*
 * Invoked by main thread
 *
 * Connect method called by the upper layers to connect the local
 * endpoint to the remote endpoint by creating QP(s) to connect the two.
 * Already holding endpoint lock when this function is called.
 */
static int rdmacm_module_start_connect(opal_btl_openib_connect_base_module_t *cpc,
                                       mca_btl_base_endpoint_t *endpoint)
{
    rdmacm_contents_t *contents;
    modex_message_t *message, *local_message;
    int rc, qp;
    opal_list_item_t *item;
#if !BTL_OPENIB_RDMACM_IB_ADDR
#if OPAL_ENABLE_DEBUG
    char *a, *b;
#endif
#endif
    /* Don't use the CPC to get the message, because this function is
       invoked from the event_handler (to initiate connections in the
       Right direction), where we don't have the CPC, so it'll be
       NULL. */
    local_message =
        (modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message;
    message = (modex_message_t *)
        endpoint->endpoint_remote_cpc_data->cbm_modex_message;
#if !BTL_OPENIB_RDMACM_IB_ADDR
    OPAL_OUTPUT((-1, "Connecting from IP %s:%d to remote IP %s:%d  ep state = %d",
                 a = stringify(local_message->ipaddr), local_message->tcp_port,
                 b = stringify(message->ipaddr), message->tcp_port, endpoint->endpoint_state));
#if OPAL_ENABLE_DEBUG
    free(a);
    free(b);
#endif
    BTL_VERBOSE(("Connecting to remote ip addr = %x, port = %d  ep state = %d",
                 message->ipaddr, message->tcp_port, endpoint->endpoint_state));
#endif
    if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state ||
        MCA_BTL_IB_CONNECTING == endpoint->endpoint_state ||
        MCA_BTL_IB_CONNECT_ACK == endpoint->endpoint_state) {
        return OPAL_SUCCESS;
    }

    /* Set the endpoint state to "connecting" (this function runs in
       the main MPI thread; not the service thread, so we can set the
       endpoint_state here). */
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;

    contents = OBJ_NEW(rdmacm_contents_t);
    if (NULL == contents) {
        BTL_ERROR(("malloc of contents failed"));
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto out;
    }

    contents->openib_btl = endpoint->endpoint_btl;
    contents->endpoint = endpoint;
    contents->server = false;
    /* Populate the port information with the local port the server is
     * listening on instead of the ephemeral port this client is
     * connecting with.  This port is used to determine which endpoint
     * is being connected from, in the case where there are multiple
     * listeners on the local system.
     */
#if BTL_OPENIB_RDMACM_IB_ADDR
    memcpy(contents->gid.raw, local_message->gid, sizeof(contents->gid));
    contents->service_id = local_message->service_id;
#else
    contents->ipaddr = local_message->ipaddr;
    contents->tcp_port = local_message->tcp_port;
#endif

    /* Are we the initiator?  Or do we expect this connect request to
       be rejected? */
    endpoint->endpoint_initiator =
        i_initiate(
#if BTL_OPENIB_RDMACM_IB_ADDR
                   contents->service_id, message->service_id,
                   &contents->gid, (union ibv_gid *) message->gid);
#else
                   contents->tcp_port, message->tcp_port,
                   contents->ipaddr, message->ipaddr);
#endif
    OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s",
                 (void*) endpoint,
                 (void*) endpoint->endpoint_local_cpc,
                 endpoint->endpoint_initiator ? "am" : "am NOT",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));

    /* If we're the initiator, then open all the QPs */
    if (contents->endpoint->endpoint_initiator) {
        /* Initiator needs a CTS frag (non-initiator will have a CTS
           frag allocated later) */
        if (OPAL_SUCCESS !=
            (rc = opal_btl_openib_connect_base_alloc_cts(contents->endpoint))) {
            BTL_ERROR(("Failed to alloc CTS frag"));
            goto out;
        }

        for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
            rc = rdmacm_client_connect_one(contents, message, qp);
            if (OPAL_SUCCESS != rc) {
                BTL_ERROR(("rdmacm_client_connect_one error (real QP %d)",
                           qp));
                goto out;
            }
        }
    }
    /* Otherwise, only open 1 QP that we expect to be rejected */
    else {
        rc = rdmacm_client_connect_one(contents, message, 0);
        if (OPAL_SUCCESS != rc) {
            BTL_ERROR(("rdmacm_client_connect_one error (bogus QP)"));
            goto out;
        }
    }

    return OPAL_SUCCESS;

out:
    /* Guard against the early failure path where contents was never
       allocated */
    if (NULL != contents) {
        while (NULL != (item = opal_list_remove_first (&contents->ids))) {
            OBJ_RELEASE(item);
        }
    }

    return rc;
}

#if !BTL_OPENIB_RDMACM_IB_ADDR
static void *show_help_cant_find_endpoint(void *context)
{
    char *msg;
    cant_find_endpoint_context_t *c =
        (cant_find_endpoint_context_t*) context;

    if (NULL != c) {
        msg = stringify(c->peer_ip_addr);
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "could not find matching endpoint", true,
                       opal_process_info.nodename,
                       c->device_name,
                       msg, c->peer_tcp_port);
        free(msg);
    } else {
        opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
                       "could not find matching endpoint", true,
                       opal_process_info.nodename,
                       "<unknown>", "<unknown>", -1);
    }
    free(context);

    /* Now kill it */
    mca_btl_openib_endpoint_invoke_error(NULL);
    return NULL;
}
#endif

/*
 * Invoked by service thread
 *
 * The server thread will handle the incoming connection requests and
 * allow them or reject them based on a unidirectional connection
 * method.  The connections are allowed based on the IP address and
 * port values.  This determination is arbitrary, but is uniform in
 * allowing the connections only in 1 direction.  If the connection in
 * the request is disallowed by this rule, then the server will
 * reject the connection and make its own in the proper direction.
 */
static int handle_connect_request(struct rdma_cm_event *event)
{
    id_context_t *listener_context = (id_context_t*) event->id->context;
    id_context_t *new_context = NULL;
    rdmacm_contents_t *contents = listener_context->contents;
    mca_btl_openib_endpoint_t *endpoint;
    struct rdma_conn_param conn_param;
    opal_process_name_t rem_name;
    modex_message_t *message;
    private_data_t msg;
    int rc = -1, qpnum;
    uint32_t rem_index;
#if BTL_OPENIB_RDMACM_IB_ADDR
    uint64_t rem_port;
#else
    uint16_t rem_port;
#endif

    qpnum = ((private_data_t *)event->param.conn.private_data)->qpnum;
    rem_port = ((private_data_t *)event->param.conn.private_data)->rem_port;
    rem_index = ((private_data_t *)event->param.conn.private_data)->rem_index;
    rem_name = ((private_data_t *)event->param.conn.private_data)->rem_name;

    /* Determine which endpoint the remote side is trying to connect
       to; use the listener's context->contents to figure it out */
    endpoint = rdmacm_find_endpoint(contents, rem_name);
    if (NULL == endpoint) {
#if !BTL_OPENIB_RDMACM_IB_ADDR
        struct sockaddr *peeraddr = rdma_get_peer_addr(event->id);
        cant_find_endpoint_context_t *c = (cant_find_endpoint_context_t *) calloc(1, sizeof(*c));
        if (NULL != c) {
            snprintf(c->device_name, sizeof(c->device_name) - 1,
                     "%s:%d",
                     ibv_get_device_name(contents->openib_btl->device->ib_dev),
                     contents->openib_btl->port_num);
            c->peer_ip_addr =
                ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
            c->peer_tcp_port = rdma_get_dst_port(event->id);
        }
        show_help_cant_find_endpoint (c);
#else
        BTL_ERROR(("Cannot find endpoint."));
#endif
        goto out;
    }

    message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
    endpoint->endpoint_initiator =
        i_initiate(
#if BTL_OPENIB_RDMACM_IB_ADDR
                  contents->service_id, rem_port,
                  &contents->gid, (union ibv_gid *) message->gid);
#else
                  contents->tcp_port, rem_port,
                  contents->ipaddr, message->ipaddr);
    BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d",
                  endpoint->endpoint_state, contents->ipaddr, message->ipaddr,
                  contents->tcp_port, rem_port));
#endif
    OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s",
                 (void*) endpoint,
                 (void*) endpoint->endpoint_local_cpc,
                 endpoint->endpoint_initiator ? "am" : "am NOT",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    if (endpoint->endpoint_initiator) {
        reject_reason_t reason = REJECT_WRONG_DIRECTION;

        OPAL_OUTPUT((-1, "SERVICE Received a connect request from an endpoint in the wrong direction"));

        /* This will cause an event on the remote system.  By passing in
         * a value in the second arg of rdma_reject, the remote side
         * can check for this to know if it was an intentional reject or
         * a reject based on an error.
         */
        rc = rdma_reject(event->id, &reason, sizeof(reject_reason_t));
        if (0 != rc) {
            BTL_ERROR(("rdma_reject failed %d", rc));
            goto out;
        }

        OPAL_OUTPUT((-1, "SERVICE Starting connection in other direction"));
        rdmacm_module_start_connect(NULL, endpoint);

        return OPAL_SUCCESS;
    }

    /* Set the endpoint_state to "CONNECTING".  This is running
       in the service thread, so we need to do a write barrier. */
    endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
    opal_atomic_wmb();

    endpoint->rem_info.rem_index = rem_index;

    /* Setup QP for new connection */
    BTL_VERBOSE(("ACCEPTING src port = %d, dst port = %d, qpnum = %d",
                 rdma_get_src_port(event->id), rdma_get_dst_port(event->id), qpnum));

    rc = rdmacm_setup_qp(contents, endpoint, event->id, qpnum);
    if (0 != rc) {
        BTL_ERROR(("rdmacm_setup_qp error %d", rc));
        goto out;
    }

    /* Post a single receive buffer on the smallest QP for the CTS
       protocol */
    if (mca_btl_openib_component.credits_qp == qpnum) {
        struct ibv_recv_wr *bad_wr, *wr;

        if (OPAL_SUCCESS !=
            opal_btl_openib_connect_base_alloc_cts(endpoint)) {
            BTL_ERROR(("Failed to alloc CTS frag"));
            goto out1;
        }
        wr = &(endpoint->endpoint_cts_frag.rd_desc);
        assert(NULL != wr);
        wr->next = NULL;

        if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp,
                               wr, &bad_wr)) {
            BTL_ERROR(("failed to post CTS recv buffer"));
            goto out1;
        }
        OPAL_OUTPUT((-1, "Posted CTS receiver buffer (%p) for peer %s, qp index %d (QP num %d), WR ID %p, SG addr %p, len %d, lkey %d",
                     (void*)((uintptr_t*) wr->sg_list[0].addr),
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                     qpnum,
                     endpoint->qps[qpnum].qp->lcl_qp->qp_num,
                     (void*)((uintptr_t*) wr->wr_id),
                     (void*)((uintptr_t*) wr->sg_list[0].addr),
                     wr->sg_list[0].length,
                     wr->sg_list[0].lkey));
    }

    /* Since the event id is already created (since we're the server),
       the context that was passed to us was the listen server's
       context -- which is no longer useful to us.  So allocate a new
       context and populate it just for this connection. */
    event->id->context = new_context = OBJ_NEW(id_context_t);
    if (NULL == new_context) {
        BTL_ERROR(("malloc error"));
        goto out1;
    }

    new_context->contents = contents;
    OBJ_RETAIN(contents);
    new_context->qpnum = qpnum;
    new_context->endpoint = endpoint;

    memset(&conn_param, 0, sizeof(conn_param));
    /* See rdma_connect(3) for a description of these 2 values.  We
       pass these values around via the modex so that we can compute
       them properly. */
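    /* Worked example (editor's sketch with hypothetical numbers): if
       our device reports max_qp_rd_atom = 16 while the peer's modex
       advertised device_max_qp_init_rd_atom = 4, then
       responder_resources = mymin(16, 4) = 4 -- both sides settle on
       the smaller of the two limits, and likewise for
       initiator_depth. */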
    conn_param.responder_resources =
        mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom,
              message->device_max_qp_init_rd_atom);
    conn_param.initiator_depth =
        mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom,
              message->device_max_qp_rd_atom);
    conn_param.retry_count = mca_btl_openib_component.ib_retry_count;
    conn_param.rnr_retry_count = BTL_OPENIB_QP_TYPE_PP(qpnum) ? 0 :
        mca_btl_openib_component.ib_rnr_retry;
    conn_param.srq = BTL_OPENIB_QP_TYPE_SRQ(qpnum);
    conn_param.private_data = &msg;
    conn_param.private_data_len = sizeof(private_data_t);

    /* Fill the private data being sent to the other side */
    msg.qpnum = qpnum;
    msg.rem_index = endpoint->index;
    msg.rem_name = OPAL_PROC_MY_NAME;

    /* Accepting the connection will result in a
       RDMA_CM_EVENT_ESTABLISHED event on both the client and server
       side. */
    rc = rdma_accept(event->id, &conn_param);
    if (0 != rc) {
        BTL_ERROR(("rdma_accept error %d", rc));
        goto out2;
    }

    return OPAL_SUCCESS;

out2:
    OBJ_RELEASE(new_context);
out1:
    ibv_destroy_qp(endpoint->qps[qpnum].qp->lcl_qp);
out:
    return OPAL_ERROR;
}

/*
 * Runs in service thread
 *
 * We call rdma_disconnect() here in the service thread so that there
 * is zero chance that the DISCONNECT event is delivered and executed
 * in the service thread while rdma_disconnect() is still running in
 * the main thread (which causes all manner of Bad Things to occur).
 */
static void *call_disconnect_callback(int fd, int flags, void *v)
{
    rdmacm_contents_t *contents = (rdmacm_contents_t *) v;
#if OPAL_ENABLE_DEBUG
    void *tmp = NULL;
#endif
    id_context_t *context;
    opal_list_item_t *item;

    pthread_mutex_lock (&rdmacm_disconnect_lock);
    while (NULL != (item = opal_list_remove_first(&contents->ids))) {
        context = (id_context_t *) item;

        OPAL_OUTPUT((-1, "RDMACM Event thread calling disconnect on ID %p",
                     (void*) context->id));

        if (!context->already_disconnected) {
#if OPAL_ENABLE_DEBUG
            tmp = context->id;
#endif
            rdma_disconnect(context->id);
            context->already_disconnected = true;
        }

        OBJ_RELEASE(context);

        OPAL_OUTPUT((-1, "RDMACM Event thread disconnect on ID %p done",
                     (void*) tmp));
    }

    /* Tell the main thread that we're done */
    pthread_cond_signal(&rdmacm_disconnect_cond);
    pthread_mutex_unlock(&rdmacm_disconnect_lock);

    return NULL;
}

/*
 * Invoked by main thread
 *
 * Runs *while* the progress thread is running.  We can't stop the
 * progress thread because this function may be invoked to kill a
 * specific endpoint that was the result of MPI-2 dynamics (i.e., this
 * is not during MPI_FINALIZE).
 */
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
{
    rdmacm_contents_t *contents = NULL, *item;
    opal_event_t event;

    BTL_VERBOSE(("Start disconnecting..."));
    OPAL_OUTPUT((-1, "MAIN Endpoint finalizing"));

    if (NULL == endpoint) {
        BTL_ERROR(("Attempting to shutdown a NULL endpoint"));
        return OPAL_SUCCESS;
    }

    /* Determine which rdmacm_contents_t correlates to the endpoint
     * we are shutting down.  By disconnecting instead of simply
     * destroying the QPs, we are shutting down in a more graceful way
     * thus preventing errors on the line.
     *
     * Need to lock because the client_list is accessed in both the
     * main thread and service thread.
     */
    opal_mutex_lock(&client_list_lock);
    OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) {
        if (endpoint == item->endpoint) {
            contents = item;
            opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
            contents->on_client_list = false;

            /* Fun race condition: we cannot call
               rdma_disconnect() in this thread, because
               if we do, there is a nonzero chance that the
               DISCONNECT event will be delivered and get executed
               in the rdmacm event thread immediately.  If this all
               happens before rdma_disconnect() returns, all
               manner of Bad Things can/will occur.  So just
               invoke rdma_disconnect() in the rdmacm event thread
               where we guarantee that we won't be processing an
               event when it is called. */

            opal_event_set (rdmacm_event_base, &event, -1, OPAL_EV_READ,
                            call_disconnect_callback, contents);
            opal_event_active (&event, OPAL_EV_READ, 1);

            /* opal_list_remove_item() returns the item before the one
               removed, so the foreach traversal remains safe */
            break;
        }
    }

    /* Flush writes to ensure we sync across threads */
    opal_atomic_wmb();
    opal_mutex_unlock(&client_list_lock);

    if (NULL != contents) {
        /* Now wait for all the disconnect callbacks to occur */
        pthread_mutex_lock(&rdmacm_disconnect_lock);
        while (opal_list_get_size (&contents->ids)) {
            pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
        }
        pthread_mutex_unlock(&rdmacm_disconnect_lock);
    }

    OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
    return OPAL_SUCCESS;
}

/*
 * Callback (from main thread) when the endpoint has been connected
 */
static void *local_endpoint_cpc_complete(void *context)
{
    mca_btl_openib_endpoint_t *endpoint = (mca_btl_openib_endpoint_t *)context;

    OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
    mca_btl_openib_endpoint_cpc_complete(endpoint);

    return NULL;
}

/*
 * Runs in service thread
 */
static int rdmacm_connect_endpoint(id_context_t *context,
                                   struct rdma_cm_event *event)
{
    rdmacm_contents_t *contents = context->contents;
    rdmacm_endpoint_local_cpc_data_t *data;

    mca_btl_openib_endpoint_t *endpoint;
#if OPAL_ENABLE_DEBUG
#if !BTL_OPENIB_RDMACM_IB_ADDR
    modex_message_t *message;
#endif
#endif

    if (contents->server) {
        endpoint = context->endpoint;
        OPAL_OUTPUT((-1, "SERVICE Server CPC complete to %s",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    } else {
        endpoint = contents->endpoint;
        endpoint->rem_info.rem_index =
            ((private_data_t *)event->param.conn.private_data)->rem_index;

        if (!contents->on_client_list) {
            opal_mutex_lock(&client_list_lock);
            opal_list_append(&client_list, &(contents->super));
            /* Flush writes to ensure we sync across threads */
            opal_atomic_wmb();
            opal_mutex_unlock(&client_list_lock);
            contents->on_client_list = true;
        }
        OPAL_OUTPUT((-1, "SERVICE Client CPC complete to %s",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
    }
    if (NULL == endpoint) {
        BTL_ERROR(("Can't find endpoint"));
        return OPAL_ERR_NOT_FOUND;
    }
    data =
        (rdmacm_endpoint_local_cpc_data_t *)endpoint->endpoint_local_cpc_data;

    /* Only notify the upper layers after the last QP has been
       connected */
    if (++data->rdmacm_counter < mca_btl_openib_component.num_qps) {
        BTL_VERBOSE(("%s to peer %s, count == %d", contents->server?"server":"client",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), data->rdmacm_counter));
        OPAL_OUTPUT((-1, "%s to peer %s, count == %d", contents->server?"server":"client",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), data->rdmacm_counter));
        return OPAL_SUCCESS;
    }

#if OPAL_ENABLE_DEBUG
#if !BTL_OPENIB_RDMACM_IB_ADDR
    message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
    BTL_VERBOSE(("%s connected!!! local %x remote %x state = %d",
                 contents->server?"server":"client",
                 contents->ipaddr,
                 message->ipaddr,
                 endpoint->endpoint_state));
#endif
#endif

    /* Ensure that all the writes back to the endpoint and associated
       data structures have completed */
    opal_atomic_wmb();
    mca_btl_openib_run_in_main (local_endpoint_cpc_complete, endpoint);

    return OPAL_SUCCESS;
}

/*
 * Runs in service thread
 */
static int rdmacm_disconnected(id_context_t *context)
{
    /* If this was a client thread, then it *may* still be listed in a
       contents->ids list. */

    OPAL_OUTPUT((-1, "SERVICE Releasing context because of DISCONNECT: context %p, id %p",
                 (void*) context, (void*) context->id));
    OBJ_RELEASE(context);

    return OPAL_SUCCESS;
}

/*
 * Runs in service thread
 */
static int rdmacm_destroy_dummy_qp(id_context_t *context)
{
    /* We need to check the id pointer because of retransmissions.
       Maybe the reject was already done. */

    if (NULL != context->id) {
        if (NULL != context->id->qp) {
            ibv_destroy_qp(context->id->qp);
            context->id->qp = NULL;
        }
    }

    if (NULL != context->contents->dummy_cq) {
        ibv_destroy_cq(context->contents->dummy_cq);
    }
    /* This item was appended to the contents->ids list (the list will
       only have just this one item), so remove it before RELEASEing
       the item */
    opal_list_remove_first(&(context->contents->ids));
    OBJ_RELEASE(context);

    return OPAL_SUCCESS;
}

/*
 * Runs in service thread
 */
static int rdmacm_rejected(id_context_t *context, struct rdma_cm_event *event)
{
    if (NULL != event->param.conn.private_data) {
        /* Why were we rejected? */
        switch (*((reject_reason_t*) event->param.conn.private_data)) {
        case REJECT_WRONG_DIRECTION:
            OPAL_OUTPUT((-1, "SERVICE A good reject! for qp %d, id 0x%p",
                         context->qpnum, (void*) context->id));
            rdmacm_destroy_dummy_qp(context);
            break;

        default:
            /* Just so compilers won't complain */
            break;
        }
    }

    return OPAL_SUCCESS;
}

/*
 * Runs in service thread
 */
static int resolve_route(id_context_t *context)
{
    int rc;

    /* Resolve the route to the remote system.  Once established, the
     * local system will get a RDMA_CM_EVENT_ROUTE_RESOLVED event.
     */
    rc = rdma_resolve_route(context->id, rdmacm_resolve_timeout);
    if (0 != rc) {
        BTL_ERROR(("Failed to resolve the route with %d", rc));
        goto out;
    }

#if OPAL_ENABLE_DEBUG
    {
        char *a, *b;
        OPAL_OUTPUT((-1, "Resolved route ID %p (local addr %s, remote addr %s)",
                     (void*) context->id,
                     a = stringify(((struct sockaddr_in*) rdma_get_local_addr(context->id))->sin_addr.s_addr),
                     b = stringify(((struct sockaddr_in*) rdma_get_peer_addr(context->id))->sin_addr.s_addr)));
        free(a);
        free(b);
    }
#endif

    return OPAL_SUCCESS;

out:
    return OPAL_ERROR;
}

/*
 * Runs in service thread
 */
static int create_dummy_cq(rdmacm_contents_t *contents,
                           mca_btl_openib_module_t *openib_btl)
{
    contents->dummy_cq =
        ibv_create_cq(openib_btl->device->ib_dev_context, 1, NULL, NULL, 0);
    if (NULL == contents->dummy_cq) {
        BTL_ERROR(("dummy_cq not created"));
        goto out;
    }

    return OPAL_SUCCESS;
out:
    return OPAL_ERROR;
}
1447 
1448 /*
1449  * Runs in service thread
1450  */
1451 static int create_dummy_qp(rdmacm_contents_t *contents,
1452                            struct rdma_cm_id *id, int qpnum)
1453 {
1454     struct ibv_qp_init_attr attr;
1455 
1456     memset(&attr, 0, sizeof(attr));
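    /* Bare-minimum RC QP attributes: this QP exists only so that the
       connect request looks valid to the peer; it is torn down when
       the expected REJECT arrives */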
1457     attr.qp_type = IBV_QPT_RC;
1458     attr.send_cq = contents->dummy_cq;
1459     attr.recv_cq = contents->dummy_cq;
1460     attr.cap.max_recv_wr = 1;
1461     attr.cap.max_send_wr = 1;
1462     attr.cap.max_send_sge = 1;
1463     attr.cap.max_recv_sge = 1;
1464 
1465     {
        /* JMS Temporary gross hack: we *must* use rdma_create_qp()
           (vs. ibv_create_qp()) because strange things happen on IB
           if we don't.  However, rdma_create_qp() wants us to use
           rdma_get_devices() (and therefore the pd that they have
           allocated).  In order to get v1.3 out the door, we're
           bypassing this functionality - we're temporarily overriding
           the device context cached on the ID with our own, so that
           our pd will match.  We need to fix this to properly get the
           pd from the RDMA CM and use that, etc. */
1475         struct ibv_context *temp = id->verbs;
1476         id->verbs = contents->openib_btl->device->ib_pd->context;
1477         if (0 != rdma_create_qp(id, contents->openib_btl->device->ib_pd,
1478                                 &attr)) {
            BTL_ERROR(("Failed to create dummy qp for qp %d", qpnum));
1480             goto out;
1481         }
1482         id->verbs = temp;
1483     }
1484     BTL_VERBOSE(("dummy qp created %d", qpnum));
1485 
1486     return OPAL_SUCCESS;
1487 
1488 out:
1489     return OPAL_ERROR;
1490 }
1491 
1492 /*
1493  * Runs in service thread
1494  */
1495 static int finish_connect(id_context_t *context)
1496 {
1497     rdmacm_contents_t *contents = context->contents;
1498     struct rdma_conn_param conn_param;
1499     private_data_t msg;
1500     int rc;
1501 #if OPAL_ENABLE_DEBUG
1502 #if !BTL_OPENIB_RDMACM_IB_ADDR
1503     struct sockaddr *peeraddr;
1504     uint32_t remoteipaddr;
1505     uint16_t remoteport;
1506 #endif
1507 #endif
1508     modex_message_t *message;
1509 
1510 #if OPAL_ENABLE_DEBUG
1511 #if !BTL_OPENIB_RDMACM_IB_ADDR
1512     peeraddr = rdma_get_peer_addr(context->id);
1513     remoteport = rdma_get_dst_port(context->id);
1514     remoteipaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
1515 #endif
1516 #endif
1517 
1518     message = (modex_message_t *)
1519         context->endpoint->endpoint_remote_cpc_data->cbm_modex_message;
1520 
1521     /* If we're the initiator, then setup the QP's and post the CTS
1522        message buffer */
1523     if (contents->endpoint->endpoint_initiator) {
1524         rc = rdmacm_setup_qp(contents, contents->endpoint,
1525                              context->id, context->qpnum);
1526         if (0 != rc) {
1527             BTL_ERROR(("rdmacm_setup_qp error %d", rc));
1528             goto out;
1529         }
1530 
1531         if (mca_btl_openib_component.credits_qp == context->qpnum) {
1532             /* Post a single receive buffer on the smallest QP for the CTS
1533                protocol */
1534 
1535             struct ibv_recv_wr *bad_wr, *wr;
1536             assert(NULL != contents->endpoint->endpoint_cts_frag.super.super.base.super.ptr);
1537             wr = &(contents->endpoint->endpoint_cts_frag.rd_desc);
1538             assert(NULL != wr);
1539             wr->next = NULL;
1540 
1541             if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp,
1542                                    wr, &bad_wr)) {
1543                 BTL_ERROR(("failed to post CTS recv buffer"));
1544                 goto out1;
1545             }
1546             OPAL_OUTPUT((-1, "Posted initiator CTS buffer (%p, length %d) for peer %s, qp index %d (QP num %d)",
1547                          (void*)((uintptr_t*) wr->sg_list[0].addr),
1548                          wr->sg_list[0].length,
1549                          opal_get_proc_hostname(contents->endpoint->endpoint_proc->proc_opal),
1550                          context->qpnum,
1551                          contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num));
1552         }
1553     } else {
        /* If we are establishing a connection in the "wrong" direction,
         * set up a dummy CQ and QP and do NOT post any recvs on them.
         * Otherwise this will screw up the recv accounting and will
         * result in not posting recvs when we really need to.  All of
         * the dummy cqs and qps will be cleaned up on the reject
         * event.
         */
1561         rc = create_dummy_cq(contents, contents->openib_btl);
1562         if (0 != rc) {
1563             BTL_ERROR(("create_dummy_cq error %d", rc));
1564             goto out;
1565         }
1566 
1567         rc = create_dummy_qp(contents, context->id, context->qpnum);
1568         if (0 != rc) {
1569             BTL_ERROR(("create_dummy_qp error %d", rc));
1570             goto out;
1571         }
1572     }
1573 
1574     memset(&conn_param, 0, sizeof(conn_param));
1575     /* See above comment about rdma_connect(3) and these two values. */
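    /* Neither side may request more outstanding RDMA reads/atomics
       than the other side can honor, so take the min of our local
       device limits and the limits the peer advertised in its modex
       message */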
1576     conn_param.responder_resources =
1577         mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom,
1578               message->device_max_qp_init_rd_atom);
1579     conn_param.initiator_depth =
1580         mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom,
1581               message->device_max_qp_rd_atom);
1582     conn_param.flow_control = 0;
1583     conn_param.retry_count = mca_btl_openib_component.ib_retry_count;
1584     conn_param.rnr_retry_count = BTL_OPENIB_QP_TYPE_PP(context->qpnum) ? 0 :
1585         mca_btl_openib_component.ib_rnr_retry;
1586     conn_param.srq = BTL_OPENIB_QP_TYPE_SRQ(context->qpnum);
1587     conn_param.private_data = &msg;
1588     conn_param.private_data_len = sizeof(private_data_t);
1589 
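    /* Fill in the private data carried inside the connect request;
       the remote side uses these fields to match this request to the
       right endpoint and QP */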
1590     msg.qpnum = context->qpnum;
1591     msg.rem_index = contents->endpoint->index;
1592     msg.rem_name = OPAL_PROC_MY_NAME;
1593 #if BTL_OPENIB_RDMACM_IB_ADDR
1594     memset(msg.librdmacm_header, 0, sizeof(msg.librdmacm_header));
1595     msg.rem_port = contents->service_id;
1596 #else
1597     msg.rem_port = contents->tcp_port;
1598     if (contents->endpoint->endpoint_initiator) {
1599 #if OPAL_ENABLE_DEBUG
1600         char *a;
1601 #endif
1602         OPAL_OUTPUT((-1, "Finish connect (I am initiator): sending from %s:%d, TCP port %d, qp index %d (num %d) to IP %s:%d",
1603                      ibv_get_device_name(contents->openib_btl->device->ib_dev),
1604                      contents->openib_btl->port_num,
1605                      contents->tcp_port,
1606                      context->qpnum,
1607                      contents->endpoint->qps[context->qpnum].qp->lcl_qp->qp_num,
1608                      a = stringify(remoteipaddr), remoteport));
1609 #if OPAL_ENABLE_DEBUG
1610         free(a);
1611 #endif
1612     }
1613 #endif
1614 
    /* Now all of the local setup has been done.  The remote system
       should now get an RDMA_CM_EVENT_CONNECT_REQUEST event to further
       the setup of the QP. */
1618     OPAL_OUTPUT((-1, "SERVICE in finish_connect; ep=%p (%p), I still %s the initiator to %s",
1619                  (void*) contents->endpoint,
1620                  (void*) contents->endpoint->endpoint_local_cpc,
1621                  contents->endpoint->endpoint_initiator ? "am" : "am NOT",
1622                  opal_get_proc_hostname(contents->endpoint->endpoint_proc->proc_opal)));
1623     rc = rdma_connect(context->id, &conn_param);
1624     if (0 != rc) {
        BTL_ERROR(("rdma_connect failed with %d", rc));
1626         goto out1;
1627     }
1628 
1629     return OPAL_SUCCESS;
1630 
1631 out1:
1632     ibv_destroy_qp(context->id->qp);
1633 out:
1634     OBJ_RELEASE(contents);
1635 
1636     return OPAL_ERROR;
1637 }
1638 
1639 /*
1640  * Runs in main thread
1641  */
1642 static void *show_help_rdmacm_event_error (struct rdma_cm_event *event)
1643 {
1644     id_context_t *context = (id_context_t*) event->id->context;
1645 
1646     if (RDMA_CM_EVENT_DEVICE_REMOVAL == event->event) {
1647         opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
1648                        "rdma cm device removal", true,
1649                        opal_process_info.nodename,
1650                        ibv_get_device_name(event->id->verbs->device));
1651     } else {
1652         const char *device = "Unknown";
1653         if (NULL != event->id &&
1654             NULL != event->id->verbs &&
1655             NULL != event->id->verbs->device) {
1656             device = ibv_get_device_name(event->id->verbs->device);
1657         }
1658         opal_show_help("help-mpi-btl-openib-cpc-rdmacm.txt",
1659                        "rdma cm event error", true,
1660                        opal_process_info.nodename,
1661                        device,
1662                        rdma_event_str(event->event),
1663                        opal_get_proc_hostname(context->endpoint->endpoint_proc->proc_opal));
1664     }
1665 
1666     return NULL;
1667 }
1668 
1669 /*
1670  * Runs in service thread
1671  */
1672 static int event_handler(struct rdma_cm_event *event)
1673 {
1674     id_context_t *context = (id_context_t*) event->id->context;
1675 #if !BTL_OPENIB_RDMACM_IB_ADDR
1676     rdmacm_contents_t *contents;
1677     struct sockaddr *localaddr;
1678     uint32_t localipaddr;
1679 #if OPAL_ENABLE_DEBUG
1680     struct sockaddr *peeraddr;
1681     uint32_t peeripaddr;
1682 #endif
1683 #endif
1684     int rc = -1;
1685     opal_btl_openib_ini_values_t ini;
1686     bool found;
1687 
1688     if (NULL == context) {
1689         return rc;
1690     }
1691 
1692 #if !BTL_OPENIB_RDMACM_IB_ADDR
1693     contents = context->contents;
1694 
1695     localaddr = rdma_get_local_addr(event->id);
1696     localipaddr = ((struct sockaddr_in *)localaddr)->sin_addr.s_addr;
1697 #if OPAL_ENABLE_DEBUG
1698     peeraddr = rdma_get_peer_addr(event->id);
1699     peeripaddr = ((struct sockaddr_in *)peeraddr)->sin_addr.s_addr;
1700 #endif
1701 
1702     BTL_VERBOSE(("%s event_handler -- %s, status = %d to %x",
1703                 contents->server?"server":"client",
1704                 rdma_event_str(event->event),
1705                 event->status,
1706                 peeripaddr));
1707 #endif
1708 
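    /* Each CM event advances the connection state machine one step;
       any non-success rc is reported back to the dispatcher, which
       then invokes the endpoint error handler */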
1709     switch (event->event) {
1710     case RDMA_CM_EVENT_ADDR_RESOLVED:
1711         OPAL_OUTPUT((-1, "SERVICE Got ADDR_RESOLVED: ID %p", (void*) context->id));
1712         rc = resolve_route(context);
1713         break;
1714 
1715     case RDMA_CM_EVENT_ROUTE_RESOLVED:
1716         OPAL_OUTPUT((-1, "SERVICE Got ROUTE_RESOLVED: ID %p", (void*) context->id));
1717 #if !BTL_OPENIB_RDMACM_IB_ADDR
1718         contents->ipaddr = localipaddr;
1719 #endif
1720         rc = finish_connect(context);
1721         break;
1722 
1723     case RDMA_CM_EVENT_CONNECT_REQUEST:
1724         OPAL_OUTPUT((-1, "SERVICE Got CONNECT_REQUEST: ID %p, context %p",
1725                      (void*) event->id, (void*) context));
1726         rc = handle_connect_request(event);
1727         break;
1728 
1729     case RDMA_CM_EVENT_ESTABLISHED:
1730         OPAL_OUTPUT((-1, "SERVICE Got ESTABLISHED: %p", (void*) event->id));
1731         rc = rdmacm_connect_endpoint(context, event);
1732         break;
1733 
1734     case RDMA_CM_EVENT_DISCONNECTED:
1735         OPAL_OUTPUT((-1, "SERVICE Got DISCONNECTED: %p", (void*) event->id));
1736         rc = rdmacm_disconnected(context);
1737         break;
1738 
1739     case RDMA_CM_EVENT_REJECTED:
1740         OPAL_OUTPUT((-1, "SERVICE Got REJECTED: %p", (void*) event->id));
1741         rc = rdmacm_rejected(context, event);
1742         break;
1743 
1744     case RDMA_CM_EVENT_CONNECT_ERROR:
        /* Some adapters have broken REJECT behavior; the recipient
           gets a CONNECT_ERROR event instead of the expected REJECTED
           event.  So if we get a CONNECT_ERROR, see if it arrived on a
           connection on which we're expecting a REJECT (i.e., we have
           a dummy_cq set up).  If it did, and if a) the MCA param
           btl_openib_connect_rdmacm_reject_causes_connect_error is
           true, or b) rdmacm_reject_causes_connect_error is set in
           the device INI values, then just treat this CONNECT_ERROR
           as if it were the REJECT. */
1754         if (NULL != context->contents->dummy_cq) {
1755             struct ibv_device_attr *attr =
1756                 &(context->endpoint->endpoint_btl->device->ib_dev_attr);
1757             found = false;
1758             if (OPAL_SUCCESS == opal_btl_openib_ini_query(attr->vendor_id,
1759                                                           attr->vendor_part_id,
1760                                                           &ini) &&
1761                 ini.rdmacm_reject_causes_connect_error) {
1762                 found = true;
1763             }
1764             if (rdmacm_reject_causes_connect_error) {
1765                 found = true;
1766             }
1767 
1768             if (found) {
1769                 OPAL_OUTPUT((-1, "SERVICE Got CONNECT_ERROR, but ignored: %p", (void*) event->id));
1770                 rc = rdmacm_destroy_dummy_qp(context);
1771                 break;
1772             }
1773         }
1774 
1775         /* Otherwise, fall through and handle the error as normal */
1776 
1777     case RDMA_CM_EVENT_UNREACHABLE:
1778     case RDMA_CM_EVENT_CONNECT_RESPONSE:
1779     case RDMA_CM_EVENT_ADDR_ERROR:
1780     case RDMA_CM_EVENT_DEVICE_REMOVAL:
1781         show_help_rdmacm_event_error (event);
1782         rc = OPAL_ERROR;
1783         break;
1784 
1785     case RDMA_CM_EVENT_ROUTE_ERROR:
        /* Route lookup does not necessarily handle retries, and there
           appear to be cases where the subnet manager node can no
           longer handle incoming requests.  The RDMA connection
           manager and lower-level code don't handle retries, so we
           have to. */
        if (context->route_retry_count < rdmacm_resolve_max_retry_count) {
            context->route_retry_count++;
            rc = resolve_route(context);
            break;
        }
1796         show_help_rdmacm_event_error (event);
1797         rc = OPAL_ERROR;
1798         break;
1799 
1800     default:
        /* Unknown event */
        BTL_ERROR(("event_handler: unknown RDMA CM event: %s, status = %d",
                   rdma_event_str(event->event), event->status));
1804         rc = OPAL_ERROR;
1805         break;
1806     }
1807 
1808     return rc;
1809 }
1810 
1811 /*
1812  * Runs in event thread
1813  */
1814 static inline void rdmamcm_event_error(struct rdma_cm_event *event)
1815 {
1816     mca_btl_base_endpoint_t *endpoint = NULL;
1817 
1818     if (event->id->context) {
1819         endpoint = ((id_context_t *)event->id->context)->contents->endpoint;
1820     }
1821 
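    /* Error callbacks must run in the main thread, so hand the
       endpoint off rather than invoking the error handler here in
       the event thread */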
1822     mca_btl_openib_run_in_main (mca_btl_openib_endpoint_invoke_error,
1823                                 endpoint);
1824 }
1825 
1826 /*
1827  * Runs in event thread
1828  */
1829 static void *rdmacm_event_dispatch(int fd, int flags, void *context)
1830 {
1831     struct rdma_cm_event *event, ecopy;
1832     void *data = NULL;
1833     int rc;
1834 
1835     /* blocks until next cm_event */
1836     rc = rdma_get_cm_event(event_channel, &event);
1837     if (0 != rc) {
1838         BTL_ERROR(("rdma_get_cm_event error %d", rc));
1839         return NULL;
1840     }
1841 
    /* If the incoming event is not acked in a sufficient amount of
     * time, there will be a timeout error and the connection will be
     * torn down.  Also, the act of acking the event destroys the
     * data included in the event.  In certain circumstances, the time
     * it takes to handle an incoming event could approach or exceed
     * this time.  To prevent this from happening, we copy the event
     * and all of its data, ack the event, and process the copy of
     * the event.
     */
1851     memcpy(&ecopy, event, sizeof(struct rdma_cm_event));
1852     if (event->param.conn.private_data_len > 0) {
        data = malloc(event->param.conn.private_data_len);
        if (NULL == data) {
            BTL_ERROR(("malloc failed"));
            /* Ack the event anyway so the CM does not stall waiting
               on it */
            rdma_ack_cm_event(event);
            return NULL;
        }
1858         memcpy(data, event->param.conn.private_data, event->param.conn.private_data_len);
1859         ecopy.param.conn.private_data = data;
1860     }
1861     rdma_ack_cm_event(event);
1862 
1863     rc = event_handler(&ecopy);
1864     if (OPAL_SUCCESS != rc) {
1865         rdmamcm_event_error(&ecopy);
1866     }
1867 
1868     if (NULL != data) {
1869         free(data);
1870     }
1871 
1872     return NULL;
1873 }
1874 
1875 /*
1876  * Runs in main thread
1877  *
 * CPC per-endpoint init function - allocate this endpoint's CPC data
1879  */
1880 static int rdmacm_init(mca_btl_openib_endpoint_t *endpoint)
1881 {
1882     void *data;
1883 
1884     data = calloc(1, sizeof(rdmacm_endpoint_local_cpc_data_t));
1885     if (NULL == data) {
        BTL_ERROR(("calloc failed"));
1887         return OPAL_ERR_OUT_OF_RESOURCE;
1888     }
1889     endpoint->endpoint_local_cpc_data = data;
1890 
1891     return OPAL_SUCCESS;
1892 }
1893 
1894 #if !BTL_OPENIB_RDMACM_IB_ADDR
1895 static int ipaddrcheck(id_context_t *context,
1896                        mca_btl_openib_module_t *openib_btl)
1897 {
1898     rdmacm_contents_t *server = context->contents;
1899     uint32_t ipaddr;
1900     bool already_exists = false;
1901     rdmacm_contents_t *contents;
1902     int server_tcp_port = rdma_get_src_port(context->id);
1903     char *str;
1904 
1905     /* Look up the IP address of this device/port.  This call should not be
1906      * necessary, as rdma_get_local_addr would be more correct in returning the
1907      * IP address given the cm_id (and not necessitate having to do a list look
1908      * up).  Unfortunately, the subnet and IP address look up needs to match or
1909      * there could be a mismatch if IP Aliases are being used.  For more
1910      * information on this, please read comment above
1911      * mca_btl_openib_get_ip_subnet_id in btl_openib_ip.c
1912      */
1913     ipaddr =
1914         mca_btl_openib_rdma_get_ipv4addr(openib_btl->device->ib_dev_context,
1915                                          openib_btl->port_num);
1916     if (0 == ipaddr) {
1917         BTL_VERBOSE(("*** Could not find IP address for %s:%d -- is there an IP address configured for this device?",
1918                      ibv_get_device_name(openib_btl->device->ib_dev),
1919                      openib_btl->port_num));
1920         return OPAL_ERR_NOT_FOUND;
1921     }
1922     str = stringify(ipaddr);
1923     BTL_VERBOSE(("Found device %s:%d = IP address %s:%d",
1924                  ibv_get_device_name(openib_btl->device->ib_dev),
1925                  openib_btl->port_num, str, server_tcp_port));
1926     free(str);
1927 
    /* Ok, we found the IP address of this device/port.  Have we
       already seen this IP address/TCP port before? */
1930     OPAL_LIST_FOREACH(contents, &server_listener_list, rdmacm_contents_t) {
        BTL_VERBOSE(("listener ipaddr = %x, candidate ipaddr = %x",
                     contents->ipaddr, ipaddr));
1933         if (contents->ipaddr == ipaddr &&
1934             contents->tcp_port == server_tcp_port) {
1935             str = stringify(ipaddr);
1936             BTL_VERBOSE(("server already listening on %s:%d",
1937                          str, server_tcp_port));
1938             free(str);
1939             already_exists = true;
1940             break;
1941         }
1942     }
1943 
1944     /* If we haven't seen it before, save it */
1945     if (!already_exists) {
1946         str = stringify(ipaddr);
1947         BTL_VERBOSE(("creating new server to listen on %s:%d",
1948                      str, server_tcp_port));
1949         free(str);
1950         server->ipaddr = ipaddr;
1951         server->tcp_port = server_tcp_port;
1952     }
1953 
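    /* Report failure if this address/port already has a listener so
       that the caller does not create a duplicate server */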
1954     return already_exists ? OPAL_ERROR : OPAL_SUCCESS;
1955 }
1956 #endif
1957 
1958 static int create_message(rdmacm_contents_t *server,
1959                           mca_btl_openib_module_t *openib_btl,
1960                           opal_btl_openib_connect_base_module_data_t *data)
1961 {
1962     modex_message_t *message;
1963 #if !BTL_OPENIB_RDMACM_IB_ADDR
1964 #if OPAL_ENABLE_DEBUG
1965     char *a;
1966 #endif
1967 #endif
1968 
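    /* The modex message advertises this port's connection information
       and RDMA read/atomic limits to remote peers */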
1969     message = (modex_message_t *) malloc(sizeof(modex_message_t));
1970     if (NULL == message) {
1971         BTL_ERROR(("malloc failed"));
1972         return OPAL_ERR_OUT_OF_RESOURCE;
1973     }
1974 
1975     message->device_max_qp_rd_atom =
1976         openib_btl->device->ib_dev_attr.max_qp_rd_atom;
1977     message->device_max_qp_init_rd_atom =
1978         openib_btl->device->ib_dev_attr.max_qp_init_rd_atom;
1979 
1980 #if BTL_OPENIB_RDMACM_IB_ADDR
1981     memcpy(message->gid, server->gid.raw, sizeof(server->gid));
1982     message->service_id = server->service_id;
1983 #else
1984     message->ipaddr = server->ipaddr;
1985     message->tcp_port = server->tcp_port;
1986 
1987     OPAL_OUTPUT((-1, "Message IP address is %s, port %d",
1988                  a = stringify(message->ipaddr), message->tcp_port));
1989 #if OPAL_ENABLE_DEBUG
1990     free(a);
1991 #endif
1992 #endif
1993     data->cbm_modex_message = message;
1994     data->cbm_modex_message_len = message_len;
1995 
1996     return OPAL_SUCCESS;
1997 }
1998 
1999 /*
2000  * Runs in main thread
2001  *
2002  * This function determines if the RDMACM is a possible cpc method and
2003  * sets it up accordingly.
2004  */
2005 static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, opal_btl_openib_connect_base_module_t **cpc)
2006 {
2007     int rc;
2008 
2009     id_context_t *context;
2010     rdmacm_contents_t *server = NULL;
2011 
2012 #if BTL_OPENIB_RDMACM_IB_ADDR
    /* inet_ntop() needs up to INET6_ADDRSTRLEN bytes for an IPv6
       string; 32 is not always enough */
    char rdmacm_addr_str[INET6_ADDRSTRLEN];
2014     struct rdma_addrinfo *rdma_addr;
2015 #else
2016     struct sockaddr_in sin;
2017 #endif
2018 
2019     /* RDMACM is not supported for MPI_THREAD_MULTIPLE */
2020     if (opal_using_threads()) {
2021         BTL_VERBOSE(("rdmacm CPC is not supported with MPI_THREAD_MULTIPLE; skipped on %s:%d",
2022                      ibv_get_device_name(openib_btl->device->ib_dev),
2023                      openib_btl->port_num));
2024         rc = OPAL_ERR_NOT_SUPPORTED;
2025         goto out;
2026     }
2027 
2028     /* RDMACM is not supported if we have any XRC QPs */
2029     if (mca_btl_openib_component.num_xrc_qps > 0) {
2030         BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d",
2031                      ibv_get_device_name(openib_btl->device->ib_dev),
2032                      openib_btl->port_num));
2033         rc = OPAL_ERR_NOT_SUPPORTED;
2034         goto out;
2035     }
2036     if (!BTL_OPENIB_QP_TYPE_PP(0)) {
2037         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2038                             "rdmacm CPC only supported when the first QP is a PP QP; skipped");
2039         rc = OPAL_ERR_NOT_SUPPORTED;
2040         goto out;
2041     }
2042 
2043     BTL_VERBOSE(("rdmacm_component_query"));
2044 
2045     *cpc = (opal_btl_openib_connect_base_module_t *) malloc(sizeof(opal_btl_openib_connect_base_module_t));
2046     if (NULL == *cpc) {
2047         rc = OPAL_ERR_OUT_OF_RESOURCE;
2048         goto out;
2049     }
2050 
2051     (*cpc)->data.cbm_component = &opal_btl_openib_connect_rdmacm;
2052     (*cpc)->data.cbm_priority = rdmacm_priority;
2053     (*cpc)->data.cbm_modex_message     = NULL;
2054     (*cpc)->data.cbm_modex_message_len = 0;
2055     (*cpc)->cbm_endpoint_init = rdmacm_init;
2056     (*cpc)->cbm_start_connect = rdmacm_module_start_connect;
2057     (*cpc)->cbm_endpoint_finalize = rdmacm_endpoint_finalize;
2058     (*cpc)->cbm_finalize = NULL;
2059     /* Setting uses_cts=true also guarantees that we'll only be
2060        selected if QP 0 is PP */
2061     (*cpc)->cbm_uses_cts = true;
2062 
    /* Create the server-side contents that will own this device/port's
       listener (the cm_device fd itself is monitored by the event
       thread set up in rdmacm_component_init) */
2064     server = OBJ_NEW(rdmacm_contents_t);
2065     if (NULL == server) {
2066         rc = OPAL_ERR_OUT_OF_RESOURCE;
2067         goto out1;
2068     }
2069     server->server = true;
2070     server->openib_btl = openib_btl;
2071 
2072     context = OBJ_NEW(id_context_t);
2073     OPAL_OUTPUT((-1, "MAIN Server context: %p", (void*) context));
2074     if (NULL == context) {
2075         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2076                             "openib BTL: rdmacm CPC system error (malloc failed)");
2077         rc = OPAL_ERR_OUT_OF_RESOURCE;
2078         goto out3;
2079     }
2080     context->contents = server;
2081     OBJ_RETAIN(context->contents);
2082     opal_list_append(&(server->ids), &(context->super));
2083     context->qpnum = 0;
2084 
2085     rc = rdma_create_id(event_channel, &(context->id), context, RDMA_PS_TCP);
2086     if (0 != rc) {
2087         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2088                             "openib BTL: rdmacm CPC failed to create ID");
2089         rc = OPAL_ERR_OUT_OF_RESOURCE;
2090         goto out4;
2091     }
2092 #if !BTL_OPENIB_RDMACM_IB_ADDR
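    /* Construct the IPv4 sockaddr to bind; rdmacm_addr and
       rdmacm_port presumably come from this CPC's MCA parameters
       (registered in rdmacm_component_register), with 0 meaning any
       address / an ephemeral port */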
2093     memset(&sin, 0, sizeof(sin));
2094     sin.sin_family = AF_INET;
2095     sin.sin_addr.s_addr = rdmacm_addr;
2096     sin.sin_port = (uint16_t) rdmacm_port;
2097 #else
2098     rc = ibv_query_gid(openib_btl->device->ib_pd->context, openib_btl->port_num,
2099                        mca_btl_openib_component.gid_index, &server->gid);
2100     if (0 != rc) {
2101         BTL_ERROR(("local gid query failed"));
2102         goto out4;
2103     }
2104 
    if (NULL == inet_ntop(AF_INET6, server->gid.raw,
                          rdmacm_addr_str, sizeof rdmacm_addr_str)) {
        BTL_ERROR(("failed to convert local gid to an address string"));
        goto out4;
    }
2110 
2111     rc = get_rdma_addr(rdmacm_addr_str, NULL, &rdma_addr, 1);
2112     if (OPAL_SUCCESS != rc) {
2113         BTL_ERROR(("server: create rdma addr error"));
2114         goto out4;
2115     }
2116 #endif
    /* Bind the rdmacm server to the local IP address and an ephemeral
     * port, or to one specified by a command-line arg.
     */
2120     rc = rdma_bind_addr(context->id,
2121 #if BTL_OPENIB_RDMACM_IB_ADDR
2122                         rdma_addr->ai_src_addr);
2123 #else
2124                        (struct sockaddr *)&sin);
2125 #endif
2126     if (0 != rc) {
2127         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2128                             "openib BTL: rdmacm CPC unable to bind to address");
2129         rc = OPAL_ERR_UNREACH;
2130 #if BTL_OPENIB_RDMACM_IB_ADDR
2131         rdma_freeaddrinfo(rdma_addr);
2132 #endif
2133         goto out5;
2134     }
2135 #if BTL_OPENIB_RDMACM_IB_ADDR
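    /* Remember the service ID that the RDMA CM actually bound to; it
       is advertised to peers in our modex message */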
2136     server->service_id = ((struct sockaddr_ib *) (&context->id->route.addr.src_addr))->sib_sid;
2137     rdma_freeaddrinfo(rdma_addr);
2138 #else
2139     /* Verify that the device has a valid IP address on it, or we
2140        cannot use the cpc */
2141     rc = ipaddrcheck(context, openib_btl);
2142     if (OPAL_SUCCESS != rc) {
2143         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2144                             "openib BTL: rdmacm IP address not found on port");
2145         rc = OPAL_ERR_NOT_SUPPORTED;
2146         goto out5;
2147     }
2148 #endif
    /* Listen on the specified address/port with the rdmacm; limit the
       number of pending incoming connections to 1024 */
    /* FIXME - 1024 should be (num of connectors *
       mca_btl_openib_component.num_qps) */
2153     rc = rdma_listen(context->id, 1024);
2154     if (0 != rc) {
2155         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2156                             "openib BTL: rdmacm CPC unable to listen");
2157         rc = OPAL_ERR_UNREACH;
2158         goto out5;
2159     }
2160 
2161     rc = create_message(server, openib_btl, &(*cpc)->data);
2162     if (0 != rc) {
2163         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2164                             "openib BTL: rdmacm CPC unable to create message");
2165         rc = OPAL_ERR_OUT_OF_RESOURCE;
2166         goto out5;
2167     }
2168 
2169     opal_list_append(&server_listener_list, &(server->super));
2170 
2171     opal_output_verbose(5, opal_btl_base_framework.framework_output,
2172                         "openib BTL: rdmacm CPC available for use on %s:%d",
2173                         ibv_get_device_name(openib_btl->device->ib_dev),
2174                         openib_btl->port_num);
2175     return OPAL_SUCCESS;
2176 
2177 out5:
2178     /*
2179      * Since rdma_create_id() succeeded, we need "rdma_destroy_id(context->id)".
2180      * But don't do it here since it's part of out4:OBJ_RELEASE(context),
2181      * and we don't want to do it twice.
2182      */
2183 out4:
2184     opal_list_remove_first(&(server->ids));
2185     OBJ_RELEASE(context);
2186 out3:
2187     OBJ_RELEASE(server);
2188 out1:
2189     free(*cpc);
2190 out:
2191     if (OPAL_ERR_NOT_SUPPORTED == rc) {
2192         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2193                             "openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped",
2194                             ibv_get_device_name(openib_btl->device->ib_dev),
2195                             openib_btl->port_num);
2196     } else {
2197         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2198                             "openib BTL: rmacm CPC unavailable for use on %s:%d; fatal error %d (%s)",
2199                             ibv_get_device_name(openib_btl->device->ib_dev),
2200                             openib_btl->port_num, rc,
2201                             opal_strerror(rc));
2202     }
2203     return rc;
2204 }
2205 
2206 /*
2207  * Invoked by main thread
2208  *
2209  * Shutting down the whole thing.
2210  */
2211 static int rdmacm_component_finalize(void)
2212 {
2213     opal_list_item_t *item, *item2;
2214 
2215     BTL_VERBOSE(("rdmacm_component_finalize"));
2216 
2217     /* If we're just trolling through ompi_info, don't bother doing
2218        anything */
2219     if (!rdmacm_component_initialized) {
2220         return OPAL_SUCCESS;
2221     }
2222 
2223     if (rdmacm_event_base) {
2224         opal_event_del (&rdmacm_event);
2225         opal_progress_thread_finalize (NULL);
2226         rdmacm_event_base = NULL;
2227     }
2228 
2229     /* The event thread is no longer running; no need to lock access
2230        to the client_list */
2231     OPAL_LIST_DESTRUCT(&client_list);
2232 
2233     /* For each of the items in the server list, there's only one item
2234        in the "ids" list -- the server listener.  So explicitly
2235        destroy its RDMA ID context. */
2236     while (NULL != (item = opal_list_remove_first(&server_listener_list))) {
2237         rdmacm_contents_t *contents = (rdmacm_contents_t*) item;
2238         item2 = opal_list_remove_first(&(contents->ids));
2239         OBJ_RELEASE(item2);
2240         OBJ_RELEASE(item);
2241     }
2242     OBJ_DESTRUCT(&server_listener_list);
2243 
2244     /* Now we're all done -- destroy the event channel */
2245     if (NULL != event_channel) {
2246         rdma_destroy_event_channel(event_channel);
2247         event_channel = NULL;
2248     }
2249 
2250     mca_btl_openib_free_rdma_addr_list();
2251 
2252     pthread_cond_destroy (&rdmacm_disconnect_cond);
2253     pthread_mutex_destroy (&rdmacm_disconnect_lock);
2254 
2255     return OPAL_SUCCESS;
2256 }
2257 
2258 #if BTL_OPENIB_RDMACM_IB_ADDR
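/*
 * Probe for AF_IB addressing support by trying to open an rsocket;
 * if librdmacm was built without AF_IB support, the call fails.
 */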
2259 static int rdmacm_check_ibaddr_support(void)
2260 {
2261     int rsock;
2262     rsock = rsocket(AF_IB, SOCK_STREAM, 0);
2263     if (rsock < 0) {
2264         return OPAL_ERROR;
2265     }
2266 
2267     rclose(rsock);
2268 
2269     return OPAL_SUCCESS;
2270 }
2271 #endif
2272 
2273 static int rdmacm_component_init(void)
2274 {
2275     int rc;
2276 
2277     OBJ_CONSTRUCT(&server_listener_list, opal_list_t);
2278     OBJ_CONSTRUCT(&client_list, opal_list_t);
2279     OBJ_CONSTRUCT(&client_list_lock, opal_mutex_t);
2280 
2281 #if !BTL_OPENIB_RDMACM_IB_ADDR
2282     rc = mca_btl_openib_build_rdma_addr_list();
2283     if (OPAL_SUCCESS != rc) {
2284         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2285                             "openib BTL: rdmacm CPC unable to find any valid IP address");
2286         return OPAL_ERR_NOT_SUPPORTED;
2287     }
2288 #else
2289     rc = rdmacm_check_ibaddr_support();
2290     if (OPAL_SUCCESS != rc) {
2291         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2292                             "There is no IB_AF addressing support by lib rdmacm");
2293         return OPAL_ERR_NOT_SUPPORTED;
2294     }
2295 #endif
2296 
2297     event_channel = rdma_create_event_channel();
2298     if (NULL == event_channel) {
2299         opal_output_verbose(5, opal_btl_base_framework.framework_output,
2300                             "openib BTL: rdmacm CPC failed to create channel");
2301         return OPAL_ERR_UNREACH;
2302     }
2303 
2304     rdmacm_event_base = opal_progress_thread_init (NULL);
2305     if (NULL == rdmacm_event_base) {
2306         opal_output_verbose (5, opal_btl_base_framework.framework_output,
2307                              "openib BTL: could not create rdmacm event thread");
2308         return OPAL_ERR_UNREACH;
2309     }
2310 
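    /* Have the progress thread call rdmacm_event_dispatch() whenever
       the CM event channel fd becomes readable */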
2311     opal_event_set (rdmacm_event_base, &rdmacm_event, event_channel->fd,
2312                     OPAL_EV_READ | OPAL_EV_PERSIST,  rdmacm_event_dispatch, NULL);
2313 
2314     opal_event_add (&rdmacm_event, 0);
2315 
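    /* Synchronization for disconnect teardown; the waiters live
       elsewhere in this file (see rdmacm_endpoint_finalize) */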
2316     pthread_cond_init (&rdmacm_disconnect_cond, NULL);
2317     pthread_mutex_init (&rdmacm_disconnect_lock, NULL);
2318 
2319     rdmacm_component_initialized = true;
2320 
2321     return OPAL_SUCCESS;
2322 }
2323