1 /*
2 * Copyright (c) 2013-2018 Intel Corporation, Inc. All rights reserved.
3 * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * BSD license below:
10 *
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
13 * conditions are met:
14 *
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer.
18 *
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 */
33
34 #ifndef FI_VERBS_H
35 #define FI_VERBS_H
36
37 #include "config.h"
38
39 #include <asm/types.h>
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <arpa/inet.h>
43 #include <netinet/in.h>
44 #include <poll.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <pthread.h>
51 #include <ofi_epoll.h>
52
53 #include <infiniband/ib.h>
54 #include <infiniband/verbs.h>
55 #include <rdma/rdma_cma.h>
56
57 #include <rdma/fabric.h>
58 #include <rdma/fi_cm.h>
59 #include <rdma/fi_domain.h>
60 #include <rdma/fi_endpoint.h>
61 #include <rdma/fi_rma.h>
62 #include <rdma/fi_errno.h>
63
64 #include "ofi.h"
65 #include "ofi_atomic.h"
66 #include "ofi_enosys.h"
67 #include <uthash.h>
68 #include "ofi_prov.h"
69 #include "ofi_list.h"
70 #include "ofi_signal.h"
71 #include "ofi_util.h"
72 #include "ofi_tree.h"
73 #include "ofi_indexer.h"
74
75 #include "ofi_verbs_priv.h"
76
77
78 #ifndef AF_IB
79 #define AF_IB 27
80 #endif
81
82 #ifndef RAI_FAMILY
83 #define RAI_FAMILY 0x00000008
84 #endif
85
86 #define VERBS_RESOLVE_TIMEOUT 2000 // ms
87
88 #define VERBS_PROV_NAME "verbs"
89
90 #define VERBS_DBG(subsys, ...) FI_DBG(&vrb_prov, subsys, __VA_ARGS__)
91 #define VERBS_INFO(subsys, ...) FI_INFO(&vrb_prov, subsys, __VA_ARGS__)
92 #define VERBS_INFO_ERRNO(subsys, fn, errno) VERBS_INFO(subsys, fn ": %s(%d)\n", \
93 strerror(errno), errno)
94 #define VERBS_WARN(subsys, ...) FI_WARN(&vrb_prov, subsys, __VA_ARGS__)
95
96
97 #define VERBS_INJECT_FLAGS(ep, len, flags) ((((flags) & FI_INJECT) || \
98 len <= (ep)->inject_limit) ? IBV_SEND_INLINE : 0)
99 #define VERBS_INJECT(ep, len) VERBS_INJECT_FLAGS(ep, len, (ep)->info->tx_attr->op_flags)
100
101 #define VERBS_COMP_FLAGS(ep, flags, context) \
102 (((ep)->util_ep.tx_op_flags | (flags)) & \
103 FI_COMPLETION ? context : VERBS_NO_COMP_FLAG)
104 #define VERBS_COMP(ep, context) \
105 VERBS_COMP_FLAGS((ep), (ep)->info->tx_attr->op_flags, context)
106
107 #define VERBS_WCE_CNT 1024
108 #define VERBS_WRE_CNT 1024
109
110 #define VERBS_DEF_CQ_SIZE 1024
111 #define VERBS_MR_IOV_LIMIT 1
112
113 #define VERBS_NO_COMP_FLAG ((uint64_t)-1)
114
115 #define VRB_CM_DATA_SIZE (56)
116 #define VERBS_CM_DATA_SIZE (VRB_CM_DATA_SIZE - \
117 sizeof(struct vrb_cm_data_hdr))
118
119 #define VRB_CM_REJ_CONSUMER_DEFINED 28
120 #define VRB_CM_REJ_SIDR_CONSUMER_DEFINED 2
121
122 #define VERBS_DGRAM_MSG_PREFIX_SIZE (40)
123
124 #define VRB_EP_TYPE(info) \
125 ((info && info->ep_attr) ? info->ep_attr->type : FI_EP_MSG)
126 #define VRB_EP_PROTO(info) \
127 (((info) && (info)->ep_attr) ? (info)->ep_attr->protocol : \
128 FI_PROTO_UNSPEC)
129
130 #define VRB_MEM_ALIGNMENT (64)
131 #define VRB_BUF_ALIGNMENT (4096) /* TODO: Page or MTU size */
132 #define VRB_POOL_BUF_CNT (100)
133
134 #define VERBS_ANY_DOMAIN "verbs_any_domain"
135 #define VERBS_ANY_FABRIC "verbs_any_fabric"
136
137 extern struct fi_provider vrb_prov;
138 extern struct util_prov vrb_util_prov;
139 extern struct dlist_entry verbs_devs;
140
/* Provider-wide configuration values, populated once at provider
 * initialization (presumably from FI_VERBS_* environment variables —
 * confirm against the .c initialization code). */
extern struct vrb_gl_data {
	int def_tx_size;		/* default TX queue size */
	int def_rx_size;		/* default RX queue size */
	int def_tx_iov_limit;		/* default TX scatter-gather limit */
	int def_rx_iov_limit;		/* default RX scatter-gather limit */
	int def_inline_size;		/* default inline-send threshold */
	int min_rnr_timer;		/* minimum RNR NAK timer value */
	int cqread_bunch_size;		/* CQ read batch size */
	int use_odp;			/* enable on-demand paging MRs */
	char *iface;			/* restrict to this network interface */
	int gid_idx;			/* GID table index to use */
	char *device_name;		/* restrict to this verbs device */

	/* RDM endpoint specific settings */
	struct {
		int buffer_num;
		int buffer_size;
		int rndv_seg_size;
		int thread_timeout;
		char *eager_send_opcode;
		char *cm_thread_affinity;
	} rdm;

	/* DGRAM endpoint specific settings */
	struct {
		int use_name_server;	/* enable the UD name server */
		int name_server_port;
	} dgram;

	/* MSG endpoint specific settings */
	struct {
		int prefer_xrc;		/* favor XRC transport for MSG EPs */
		char *xrcd_filename;	/* file backing the shared XRC domain */
	} msg;
} vrb_gl_data;
173
/* One rdma_addrinfo discovered for a device; linked into
 * verbs_dev_info::addrs. */
struct verbs_addr {
	struct dlist_entry entry;	/* link in verbs_dev_info::addrs */
	struct rdma_addrinfo *rai;
};
178
179 /*
180 * fields of Infiniband packet headers that are used to
181 * represent OFI EP address
182 * - LRH (Local Route Header) - Link Layer:
183 * - LID - destination Local Identifier
184 * - SL - Service Level
185 * - GRH (Global Route Header) - Network Layer:
186 * - GID - destination Global Identifier
187 * - BTH (Base Transport Header) - Transport Layer:
188 * - QPN - destination Queue Pair number
189 * - P_key - Partition Key
190 *
191 * Note: DON'T change the placement of the fields in the structure.
192 * The placement is to keep structure size = 256 bits (32 byte).
193 */
/* Address of a UD endpoint; see the header-field notes above.
 * Layout is fixed at 32 bytes - do not reorder or resize fields. */
struct ofi_ib_ud_ep_name {
	union ibv_gid	gid;		/* 64-bit GUID + 64-bit EUI - GRH */

	uint32_t	qpn;		/* BTH */

	uint16_t	lid;		/* LRH */
	uint16_t	pkey;		/* BTH */
	uint16_t	service;	/* for NS src addr, 0 means any */

	uint8_t	sl;			/* LRH */
	uint8_t	padding[5];		/* forced padding to 256 bits (32 byte) */
}; /* 256 bits */
206
207 #define VERBS_IB_UD_NS_ANY_SERVICE 0
208
209 static inline
vrb_dgram_ns_is_service_wildcard(void * svc)210 int vrb_dgram_ns_is_service_wildcard(void *svc)
211 {
212 return (*(int *)svc == VERBS_IB_UD_NS_ANY_SERVICE);
213 }
214
static inline
int vrb_dgram_ns_service_cmp(void *svc1, void *svc2)
{
	int a, b;

	/* A wildcard service compares equal to everything. */
	if (vrb_dgram_ns_is_service_wildcard(svc1) ||
	    vrb_dgram_ns_is_service_wildcard(svc2))
		return 0;

	a = *(int *) svc1;
	b = *(int *) svc2;
	if (a < b)
		return -1;
	return (a > b) ? 1 : 0;
}
225
/* A verbs device and the addresses discovered on it. */
struct verbs_dev_info {
	struct dlist_entry entry;	/* link in the global verbs_devs list */
	char *name;			/* device name */
	struct dlist_entry addrs;	/* list of struct verbs_addr */
};
231
232
/* Verbs fabric object: util_fabric plus the UD name server used by
 * DGRAM endpoints (see vrb_gl_data.dgram). */
struct vrb_fabric {
	struct util_fabric util_fabric;
	const struct fi_info *info;
	struct util_ns name_server;
};
238
239 int vrb_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
240 void *context);
241 int vrb_find_fabric(const struct fi_fabric_attr *attr);
242
/* A queued EQ event.  The payload follows the header and is viewed
 * either as inline bytes or as an event-entry pointer. */
struct vrb_eq_entry {
	struct dlist_entry item;	/* link in vrb_eq::list_head */
	uint32_t event;			/* fi_eq event code */
	size_t len;			/* payload length */
	union {
		struct fi_eq_entry *eq_entry;
		struct fi_eq_cm_entry *cm_entry;
		uint8_t data[0];	/* inline payload storage */
	};
};
253
254 typedef int (*vrb_trywait_func)(struct fid *fid);
255
256 /* An OFI indexer is used to maintain a unique connection request to
257 * endpoint mapping. The key is a 32-bit value (referred to as a
258 * connection tag) and is passed to the remote peer by the active side
259 * of a connection request. When the reciprocal XRC connection in the
260 * reverse direction is made, the key is passed back and used to map
261 * back to the original endpoint. A key is defined as a 32-bit value:
262 *
263 * SSSSSSSS:SSSSSSII:IIIIIIII:IIIIIIII
264 * |-- sequence -||--- unique key ---|
265 */
266 #define VERBS_CONN_TAG_INDEX_BITS 18
267 #define VERBS_CONN_TAG_INVALID 0xFFFFFFFF /* Key is not valid */
268
/* Event queue built over an RDMA CM event channel.  Delivers CM and
 * async events and holds the XRC connection-setup bookkeeping. */
struct vrb_eq {
	struct fid_eq eq_fid;
	struct vrb_fabric *fab;
	fastlock_t lock;		/* serializes EQ state; also guards xrc below */
	struct dlistfd_head list_head;	/* queued vrb_eq_entry events */
	struct rdma_event_channel *channel;
	uint64_t flags;
	struct fi_eq_err_entry err;	/* last error entry reported */

	ofi_epoll_t epollfd;		/* wait object backing store */
	enum fi_wait_obj wait_obj;

	struct {
		/* The connection key map is used during the XRC connection
		 * process to map an XRC reciprocal connection request back
		 * to the active endpoint that initiated the original
		 * connection request. It is protected with the eq::lock */
		struct ofi_key_idx conn_key_idx;
		struct indexer *conn_key_map;

		/* TODO: This is limiting and restricts applications to using
		 * a single listener per EQ. While sufficient for RXM we should
		 * consider using an internal PEP listener for handling the
		 * internally processed reciprocal connections. */
		uint16_t pep_port;

		/* SIDR request/responses are a two-way handshake; therefore,
		 * we maintain an RB tree of SIDR accept responses, so that if
		 * a response is lost, the subsequent retried request can be
		 * detected and the original accept response resent. Note, that
		 * rejected requests can be passed to RXM and will be rejected
		 * a second time. */
		struct ofi_rbmap sidr_conn_rbmap;
	} xrc;
};
304
305 int vrb_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
306 struct fid_eq **eq, void *context);
307 int vrb_eq_trywait(struct vrb_eq *eq);
308 void vrb_eq_remove_events(struct vrb_eq *eq, struct fid *fid);
309
310 int vrb_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
311 struct fid_av **av, void *context);
312
/* Passive endpoint (connection listener). */
struct vrb_pep {
	struct fid_pep pep_fid;
	struct vrb_eq *eq;		/* bound EQ for CM events */
	struct rdma_cm_id *id;		/* listening RDMA CM identifier */

	/* XRC uses SIDR based RDMA CM exchanges for setting up
	 * shared QP connections. This ID is bound to the same
	 * port number as "id", but the RDMA_PS_UDP port space. */
	struct rdma_cm_id *xrc_ps_udp_id;

	int backlog;			/* listen backlog depth */
	int bound;			/* nonzero once a source address is bound */
	size_t src_addrlen;
	struct fi_info *info;
};
328
329 struct fi_ops_cm *vrb_pep_ops_cm(struct vrb_pep *pep);
330
331
332 #if VERBS_HAVE_QUERY_EX
333 #define VRB_ACCESS_ON_DEMAND IBV_ACCESS_ON_DEMAND
334 #else
335 #define VRB_ACCESS_ON_DEMAND 0
336 #endif
337
/* Bits stored in vrb_domain::flags */
enum {
	VRB_USE_XRC = BIT(0),	/* MSG endpoints should use the XRC transport */
	VRB_USE_ODP = BIT(1),	/* register MRs with on-demand paging */
};
342
/* Verbs domain: a device context plus the protection domain in which
 * all of its MRs and QPs are created. */
struct vrb_domain {
	struct util_domain util_domain;
	struct ibv_context *verbs;	/* device context */
	struct ibv_pd *pd;		/* protection domain */

	enum fi_ep_type ep_type;
	struct fi_info *info;
	/* The EQ is utilized by verbs/MSG */
	struct vrb_eq *eq;
	uint64_t eq_flags;

	/* Indicates that MSG endpoints should use the XRC transport.
	 * TODO: Move selection of XRC/RC to endpoint info from domain */
	int flags;			/* VRB_USE_XRC / VRB_USE_ODP bits */
	struct {
		int xrcd_fd;		/* fd backing the shared XRC domain */
		struct ibv_xrcd *xrcd;

		/* The domain maintains a RBTree for mapping an endpoint
		 * destination addresses to physical XRC INI QP connected
		 * to that host. The map is protected using the EQ lock
		 * bound to the domain to avoid the need for additional
		 * locking. */
		struct ofi_rbmap *ini_conn_rbmap;
	} xrc;

	/* MR stuff */
	struct ofi_mr_cache cache;	/* memory registration cache */
};
372
373 struct vrb_cq;
374 typedef void (*vrb_cq_read_entry)(struct ibv_wc *wc, void *buf);
375
/* A saved work completion, allocated from vrb_cq::wce_pool and linked
 * on vrb_cq::saved_wc_list for deferred delivery. */
struct vrb_wc_entry {
	struct slist_entry entry;
	struct ibv_wc wc;
};
380
381 struct vrb_srq_ep;
/* Completion queue: util_cq plus the underlying ibv_cq and the state
 * needed for wait objects and deferred completion delivery. */
struct vrb_cq {
	struct util_cq util_cq;
	struct ibv_comp_channel *channel;	/* FD-based wait support */
	struct ibv_cq *cq;
	size_t entry_size;		/* bytes per app-visible CQ entry */
	uint64_t flags;
	enum fi_wait_obj wait_obj;
	enum fi_cq_wait_cond wait_cond;
	struct ibv_wc wc;		/* scratch work completion */
	int signal_fd[2];		/* signaling pair; see vrb_cq_signal() */
	vrb_cq_read_entry read_entry;	/* formats an ibv_wc into a CQ entry */
	struct slist saved_wc_list;	/* completions held for later delivery */
	ofi_atomic32_t nevents;
	struct ofi_bufpool *wce_pool;	/* pool of struct vrb_wc_entry */

	struct {
		/* The list of XRC SRQ contexts associated with this CQ */
		fastlock_t srq_list_lock;
		struct dlist_entry srq_list;
	} xrc;

	size_t credits;			/* remaining CQ slot credits */
	/* As a future optimization, we can use the app's context
	 * if they set FI_CONTEXT.
	 */
	struct ofi_bufpool *ctx_pool;
};
409
410 int vrb_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
411 struct fid_cq **cq, void *context);
412 int vrb_cq_trywait(struct vrb_cq *cq);
413
/* A registered memory region, exposed to the application as an fid_mr. */
struct vrb_mem_desc {
	struct fid_mr mr_fid;
	struct ibv_mr *mr;		/* underlying verbs registration */
	struct vrb_domain *domain;	/* domain the MR was registered in */
	size_t len;			/* length of the registered region */
	/* this field is used only by MR cache operations */
	struct ofi_mr_entry *entry;
};
422
423 extern struct fi_ops_mr vrb_mr_ops;
424 extern struct fi_ops_mr vrb_mr_cache_ops;
425
426 int vrb_mr_cache_add_region(struct ofi_mr_cache *cache,
427 struct ofi_mr_entry *entry);
428 void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache,
429 struct ofi_mr_entry *entry);
430
431 /*
432 * An XRC SRQ cannot be created until the associated RX CQ is known,
433 * maintain a list of validated pre-posted receives to post once
434 * the SRQ is created.
435 */
/* One receive buffered until the XRC SRQ exists; see the comment above. */
struct vrb_xrc_srx_prepost {
	struct slist_entry prepost_entry;	/* link in srq_ep prepost_list */
	void *buf;				/* receive buffer */
	void *desc;				/* memory descriptor */
	void *context;				/* application context */
	size_t len;				/* buffer length */
	fi_addr_t src_addr;
};
444
/* Shared receive context (SRQ) endpoint. */
struct vrb_srq_ep {
	struct fid_ep ep_fid;
	struct ibv_srq *srq;		/* NULL for XRC until EP enable */
	struct vrb_domain *domain;
	struct ofi_bufpool *ctx_pool;	/* per-receive context objects */
	fastlock_t ctx_lock;		/* protects ctx_pool usage */

	/* For XRC SRQ only */
	struct {
		/* XRC SRQ is not created until endpoint enable */
		fastlock_t prepost_lock;
		struct slist prepost_list;	/* vrb_xrc_srx_prepost entries */
		uint32_t max_recv_wr;
		uint32_t max_sge;
		uint32_t prepost_count;

		/* The RX CQ associated with this XRC SRQ. This field
		 * and the srq_entry should only be modified while holding
		 * the associted cq::xrc.srq_list_lock. */
		struct vrb_cq *cq;

		/* The CQ maintains a list of XRC SRQ associated with it */
		struct dlist_entry srq_entry;
	} xrc;
};
470
471 int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr,
472 struct fid_ep **rx_ep, void *context);
473
vrb_is_xrc(struct fi_info * info)474 static inline int vrb_is_xrc(struct fi_info *info)
475 {
476 return (VRB_EP_TYPE(info) == FI_EP_MSG) &&
477 (VRB_EP_PROTO(info) == FI_PROTO_RDMA_CM_IB_XRC);
478 }
479
480 int vrb_domain_xrc_init(struct vrb_domain *domain);
481 int vrb_domain_xrc_cleanup(struct vrb_domain *domain);
482
/* Connection state of a shared physical XRC INI QP
 * (see struct vrb_ini_shared_conn). */
enum vrb_ini_qp_state {
	VRB_INI_QP_UNCONNECTED,
	VRB_INI_QP_CONNECTING,
	VRB_INI_QP_CONNECTED
};
488
489 #define VRB_NO_INI_TGT_QPNUM 0
490 #define VRB_RECIP_CONN 1
491
492 /*
493 * An XRC transport INI QP connection can be shared within a process to
494 * communicate with all the ranks on the same remote node. This structure is
495 * only accessed during connection setup and tear down and should be
496 * done while holding the domain:eq:lock.
497 */
struct vrb_ini_shared_conn {
	/* To share, EP must have same remote peer host addr and TX CQ */
	struct sockaddr *peer_addr;
	struct vrb_cq *tx_cq;

	/* The physical INI/TGT QPN connection. Virtual connections to the
	 * same remote peer and TGT QPN will share this connection, with
	 * the remote end opening the specified XRC TGT QPN for sharing
	 * During the physical connection setup, phys_conn_id identifies
	 * the RDMA CM ID (and MSG_EP) associated with the operation. */
	enum vrb_ini_qp_state state;	/* see vrb_ini_qp_state */
	struct rdma_cm_id *phys_conn_id;
	struct ibv_qp *ini_qp;
	uint32_t tgt_qpn;

	/* EP waiting on or using this INI/TGT physical connection will be in
	 * one of these list and hold a reference to the shared connection. */
	struct dlist_entry pending_list;	/* EPs waiting for the connection */
	struct dlist_entry active_list;		/* EPs using the connection */
	ofi_atomic32_t ref_cnt;
};
519
/* Progress of an XRC endpoint through bidirectional connection setup. */
enum vrb_xrc_ep_conn_state {
	VRB_XRC_UNCONNECTED,
	VRB_XRC_ORIG_CONNECTING,	/* original-direction request in flight */
	VRB_XRC_ORIG_CONNECTED,
	VRB_XRC_RECIP_CONNECTING,	/* reciprocal-direction request in flight */
	VRB_XRC_CONNECTED,		/* both directions established */
	VRB_XRC_ERROR
};
528
529 /*
530 * The following XRC state is only required during XRC connection
531 * establishment and can be freed once bidirectional connectivity
532 * is established.
533 */
534 #define VRB_MAX_XRC_CONNECT_RETRIES 16
535
/* Transient per-EP XRC connection-setup state; see the comment above.
 * Freed once bidirectional connectivity is established. */
struct vrb_xrc_ep_conn_setup {
	int retry_count;		/* bounded by VRB_MAX_XRC_CONNECT_RETRIES */

	/* The connection tag is used to associate the reciprocal
	 * XRC INI/TGT QP connection request in the reverse direction
	 * with the original request. The tag is created by the
	 * original active side. */
	uint32_t conn_tag;
	uint32_t remote_conn_tag;

	/* Delivery of the FI_CONNECTED event is delayed until
	 * bidirectional connectivity is established. */
	size_t event_len;
	uint8_t event_data[VRB_CM_DATA_SIZE];

	/* Connection request may have to queue waiting for the
	 * physical XRC INI/TGT QP connection to complete. */
	int pending_recip;
	size_t pending_paramlen;
	uint8_t pending_param[VRB_CM_DATA_SIZE];
};
557
/* Base endpoint object common to MSG, DGRAM, and XRC endpoints. */
struct vrb_ep {
	struct util_ep util_ep;
	struct ibv_qp *ibv_qp;

	/* Protected by send CQ lock */
	size_t tx_credits;

	/* Addressing: MSG endpoints use an RDMA CM id; DGRAM endpoints
	 * use a UD address plus service number. */
	union {
		struct rdma_cm_id *id;
		struct {
			struct ofi_ib_ud_ep_name ep_name;
			int service;
		};
	};

	size_t inject_limit;	/* inline-send threshold; see VERBS_INJECT */

	struct vrb_eq *eq;
	struct vrb_srq_ep *srq_ep;	/* shared RX context, if any */
	struct fi_info *info;

	/* Preallocated work-request scratch storage. */
	struct {
		struct ibv_send_wr rma_wr;
		struct ibv_send_wr msg_wr;
		struct ibv_sge sge;
	} *wrs;
	size_t rx_cq_size;
	struct rdma_conn_param conn_param;
	struct vrb_cm_data_hdr *cm_hdr;
};
588
589
590 /* Must be cast-able to struct fi_context */
/* Must be cast-able to struct fi_context */
struct vrb_context {
	struct vrb_ep *ep;	/* EP the operation was posted on */
	struct vrb_srq_ep *srx;	/* SRQ the operation was posted to, if any */
	void *user_ctx;		/* application's original context pointer */
	uint32_t flags;
};
597
598
599 #define VERBS_XRC_EP_MAGIC 0x1F3D5B79
/* XRC endpoint: extends vrb_ep with target-side QP state and the
 * bookkeeping needed for bidirectional XRC connection setup. */
struct vrb_xrc_ep {
	/* Must be first */
	struct vrb_ep base_ep;

	/* XRC only fields */
	struct rdma_cm_id *tgt_id;	/* CM id for the target-side connection */
	struct ibv_qp *tgt_ibv_qp;	/* XRC TGT QP */
	enum vrb_xrc_ep_conn_state conn_state;
	bool recip_req_received;	/* reciprocal request already seen */
	uint32_t magic;			/* VERBS_XRC_EP_MAGIC sanity marker */
	uint32_t srqn;			/* local XRC SRQ number */
	uint32_t peer_srqn;		/* peer's XRC SRQ number */

	/* A reference is held to a shared physical XRC INI/TGT QP connecting
	 * to the destination node. */
	struct vrb_ini_shared_conn *ini_conn;
	struct dlist_entry ini_conn_entry;

	/* The following is used for resending lost SIDR accept response
	 * messages when a retransmit SIDR connect request is received. */
	void *accept_param_data;
	size_t accept_param_len;
	uint16_t remote_pep_port;
	bool recip_accept;
	struct ofi_rbnode *conn_map_node;

	/* The following state is allocated during XRC bidirectional setup and
	 * freed once the connection is established. */
	struct vrb_xrc_ep_conn_setup *conn_setup;
};
630
631 int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
632 struct fid_ep **ep, void *context);
633 int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
634 struct fid_pep **pep, void *context);
635 int vrb_create_ep(const struct fi_info *hints, enum rdma_port_space ps,
636 struct rdma_cm_id **id);
637 int vrb_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
638 struct fid_av **av_fid, void *context);
639 static inline
vrb_ep_to_domain(struct vrb_ep * ep)640 struct vrb_domain *vrb_ep_to_domain(struct vrb_ep *ep)
641 {
642 return container_of(ep->util_ep.domain, struct vrb_domain,
643 util_domain);
644 }
645
646 extern struct fi_ops_atomic vrb_msg_ep_atomic_ops;
647 extern struct fi_ops_atomic vrb_msg_xrc_ep_atomic_ops;
648 extern struct fi_ops_cm vrb_msg_ep_cm_ops;
649 extern struct fi_ops_cm vrb_msg_xrc_ep_cm_ops;
650 extern const struct fi_ops_msg vrb_msg_ep_msg_ops_ts;
651 extern const struct fi_ops_msg vrb_msg_ep_msg_ops;
652 extern const struct fi_ops_msg vrb_dgram_msg_ops_ts;
653 extern const struct fi_ops_msg vrb_dgram_msg_ops;
654 extern const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops;
655 extern const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops_ts;
656 extern const struct fi_ops_msg vrb_msg_srq_xrc_ep_msg_ops;
657 extern struct fi_ops_rma vrb_msg_ep_rma_ops_ts;
658 extern struct fi_ops_rma vrb_msg_ep_rma_ops;
659 extern struct fi_ops_rma vrb_msg_xrc_ep_rma_ops_ts;
660 extern struct fi_ops_rma vrb_msg_xrc_ep_rma_ops;
661
662 #define VRB_XRC_VERSION 2
663
/* XRC private data carried in RDMA CM connect/accept messages.  The
 * layout is exchanged between peers, so it must match on both sides;
 * see vrb_set_xrc_cm_data()/vrb_verify_xrc_cm_data(). */
struct vrb_xrc_cm_data {
	uint8_t version;	/* protocol version (VRB_XRC_VERSION) */
	uint8_t reciprocal;	/* nonzero for the reverse-direction request */
	uint16_t port;		/* sender's PEP port */
	uint32_t tgt_qpn;	/* target QPN to connect to */
	uint32_t srqn;		/* sender's XRC SRQ number */
	uint32_t conn_tag;	/* maps a reciprocal request to the original EP */
};
672
/* Decoded XRC connection parameters from a remote connection request. */
struct vrb_xrc_conn_info {
	uint32_t conn_tag;
	uint32_t is_reciprocal;		/* reverse-direction request flag */
	uint32_t ini_qpn;		/* remote initiator QPN */
	uint32_t tgt_qpn;		/* target QPN */
	uint32_t peer_srqn;		/* remote XRC SRQ number */
	uint16_t port;			/* remote PEP port */
	struct rdma_conn_param conn_param;
};
682
/* Connection request delivered to the application through an EQ
 * CM event. */
struct vrb_connreq {
	struct fid handle;	/* fid exposed in the CM event entry */
	struct rdma_cm_id *id;

	/* Support for XRC bidirectional connections, and
	 * non-RDMA CM managed QP. */
	int is_xrc;		/* nonzero when xrc below is valid */
	struct vrb_xrc_conn_info xrc;
};
692
/* Header prepended to application CM private data. */
struct vrb_cm_data_hdr {
	uint8_t size;	/* length of the application data that follows */
	char data[];	/* application-supplied CM data */
};
697
698 int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
699 void *param_data, size_t param_len);
700 void vrb_eq_remove_sidr_conn(struct vrb_xrc_ep *ep);
701 struct vrb_xrc_ep *vrb_eq_get_sidr_conn(struct vrb_eq *eq,
702 struct sockaddr *peer,
703 uint16_t pep_port, bool recip);
704
705 void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep,
706 struct ibv_qp_init_attr *attr);
707 int vrb_process_xrc_connreq(struct vrb_ep *ep,
708 struct vrb_connreq *connreq);
709
710 void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep);
711 void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep);
712 void vrb_eq_set_xrc_conn_tag(struct vrb_xrc_ep *ep);
713 void vrb_eq_clear_xrc_conn_tag(struct vrb_xrc_ep *ep);
714 struct vrb_xrc_ep *vrb_eq_xrc_conn_tag2ep(struct vrb_eq *eq,
715 uint32_t conn_tag);
716 void vrb_set_xrc_cm_data(struct vrb_xrc_cm_data *local, int reciprocal,
717 uint32_t conn_tag, uint16_t port, uint32_t tgt_qpn,
718 uint32_t srqn);
719 int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote,
720 int private_data_len);
721 int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
722 int reciprocal, void *param, size_t paramlen);
723 int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
724 void *param, size_t paramlen);
725 int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep,
726 struct vrb_connreq *connreq,
727 struct rdma_cm_id *id);
728 void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect);
729 void vrb_add_pending_ini_conn(struct vrb_xrc_ep *ep, int reciprocal,
730 void *conn_param, size_t conn_paramlen);
731 void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn);
732 int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
733 struct vrb_ini_shared_conn **ini_conn);
734 void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep);
735 int vrb_reserve_qpn(struct vrb_xrc_ep *ep, struct ibv_qp **qp);
736
737 void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data,
738 size_t len);
739 int vrb_ep_create_ini_qp(struct vrb_xrc_ep *ep, void *dst_addr,
740 uint32_t *peer_tgt_qpn);
741 void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t peer_tgt_qpn);
742 void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep);
743 int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn);
744 void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *qp);
745 int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep);
746
747 int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep);
748 int vrb_sockaddr_len(struct sockaddr *addr);
749
750
751 int vrb_init_info(const struct fi_info **all_infos);
752 int vrb_getinfo(uint32_t version, const char *node, const char *service,
753 uint64_t flags, const struct fi_info *hints,
754 struct fi_info **info);
755 const struct fi_info *vrb_get_verbs_info(const struct fi_info *ilist,
756 const char *domain_name);
757 int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags,
758 struct rdma_addrinfo *rai);
759 int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
760 const struct fi_info *hints, struct rdma_addrinfo **rai);
761 int vrb_get_matching_info(uint32_t version, const struct fi_info *hints,
762 struct fi_info **info, const struct fi_info *verbs_info,
763 uint8_t passive);
764 void vrb_alter_info(const struct fi_info *hints, struct fi_info *info);
765
/* Maps a domain-name suffix to the endpoint type and wire protocol it
 * selects (e.g. the XRC and DGRAM domain variants below). */
struct verbs_ep_domain {
	char *suffix;		/* domain-name suffix */
	enum fi_ep_type type;
	uint32_t protocol;
};
771
772 extern const struct verbs_ep_domain verbs_dgram_domain;
773 extern const struct verbs_ep_domain verbs_msg_xrc_domain;
774
775 int vrb_check_ep_attr(const struct fi_info *hints,
776 const struct fi_info *info);
777 int vrb_check_rx_attr(const struct fi_rx_attr *attr,
778 const struct fi_info *hints,
779 const struct fi_info *info);
780
vrb_cmp_xrc_domain_name(const char * domain_name,const char * rdma_name)781 static inline int vrb_cmp_xrc_domain_name(const char *domain_name,
782 const char *rdma_name)
783 {
784 size_t domain_len = strlen(domain_name);
785 size_t suffix_len = strlen(verbs_msg_xrc_domain.suffix);
786
787 return domain_len > suffix_len ? strncmp(domain_name, rdma_name,
788 domain_len - suffix_len) : -1;
789 }
790
791 int vrb_cq_signal(struct fid_cq *cq);
792
793 struct vrb_eq_entry *vrb_eq_alloc_entry(uint32_t event,
794 const void *buf, size_t len);
795 ssize_t vrb_eq_write_event(struct vrb_eq *eq, uint32_t event,
796 const void *buf, size_t len);
797
798 int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype,
799 enum fi_op op, struct fi_atomic_attr *attr,
800 uint64_t flags);
801 int vrb_set_rnr_timer(struct ibv_qp *qp);
802 void vrb_cleanup_cq(struct vrb_ep *cur_ep);
803 int vrb_find_max_inline(struct ibv_pd *pd, struct ibv_context *context,
804 enum ibv_qp_type qp_type);
805
/* Address vector for DGRAM endpoints. */
struct vrb_dgram_av {
	struct util_av util_av;
	struct dlist_entry av_entry_list;	/* list of vrb_dgram_av_entry */
};
810
/* One inserted DGRAM peer address with its verbs address handle. */
struct vrb_dgram_av_entry {
	struct dlist_entry list_entry;	/* link in vrb_dgram_av::av_entry_list */
	struct ofi_ib_ud_ep_name addr;	/* peer UD address */
	struct ibv_ah *ah;		/* address handle for sends to this peer */
};
816
817 static inline struct vrb_dgram_av_entry*
vrb_dgram_av_lookup_av_entry(fi_addr_t fi_addr)818 vrb_dgram_av_lookup_av_entry(fi_addr_t fi_addr)
819 {
820 return (struct vrb_dgram_av_entry *) (uintptr_t) fi_addr;
821 }
822
823 /* NOTE:
824 * When ibv_post_send/recv returns '-1' it means the following:
825 * Deal with non-compliant libibverbs drivers which set errno
826 * instead of directly returning the error value
827 */
vrb_convert_ret(int ret)828 static inline ssize_t vrb_convert_ret(int ret)
829 {
830 if (!ret)
831 return 0;
832 else if (ret == -ENOMEM || ret == ENOMEM)
833 return -FI_EAGAIN;
834 else if (ret == -1)
835 return (errno == ENOMEM) ? -FI_EAGAIN : -errno;
836 else
837 return -abs(ret);
838 }
839
840
841 int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc);
842 int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc);
843
844 #define vrb_init_sge(buf, len, desc) (struct ibv_sge) \
845 { .addr = (uintptr_t)buf, \
846 .length = (uint32_t)len, \
847 .lkey = (uint32_t)(uintptr_t)desc }
848
849 #define vrb_set_sge_iov(sg_list, iov, count, desc) \
850 ({ \
851 size_t i; \
852 sg_list = alloca(sizeof(*sg_list) * count); \
853 for (i = 0; i < count; i++) { \
854 sg_list[i] = vrb_init_sge( \
855 iov[i].iov_base, \
856 iov[i].iov_len, \
857 desc[i]); \
858 } \
859 })
860
861 #define vrb_set_sge_iov_count_len(sg_list, iov, count, desc, len) \
862 ({ \
863 size_t i; \
864 sg_list = alloca(sizeof(*sg_list) * count); \
865 for (i = 0; i < count; i++) { \
866 sg_list[i] = vrb_init_sge( \
867 iov[i].iov_base, \
868 iov[i].iov_len, \
869 desc[i]); \
870 len += iov[i].iov_len; \
871 } \
872 })
873
874 #define vrb_init_sge_inline(buf, len) vrb_init_sge(buf, len, NULL)
875
876 #define vrb_set_sge_iov_inline(sg_list, iov, count, len) \
877 ({ \
878 size_t i; \
879 sg_list = alloca(sizeof(*sg_list) * count); \
880 for (i = 0; i < count; i++) { \
881 sg_list[i] = vrb_init_sge_inline( \
882 iov[i].iov_base, \
883 iov[i].iov_len); \
884 len += iov[i].iov_len; \
885 } \
886 })
887
888 #define vrb_send_iov(ep, wr, iov, desc, count) \
889 vrb_send_iov_flags(ep, wr, iov, desc, count, \
890 (ep)->info->tx_attr->op_flags)
891
892 #define vrb_send_msg(ep, wr, msg, flags) \
893 vrb_send_iov_flags(ep, wr, (msg)->msg_iov, (msg)->desc, \
894 (msg)->iov_count, flags)
895
896
897 ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr);
898 ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr);
899
900 static inline ssize_t
vrb_send_buf(struct vrb_ep * ep,struct ibv_send_wr * wr,const void * buf,size_t len,void * desc)901 vrb_send_buf(struct vrb_ep *ep, struct ibv_send_wr *wr,
902 const void *buf, size_t len, void *desc)
903 {
904 struct ibv_sge sge = vrb_init_sge(buf, len, desc);
905
906 assert(wr->wr_id != VERBS_NO_COMP_FLAG);
907
908 wr->sg_list = &sge;
909 wr->num_sge = 1;
910
911 return vrb_post_send(ep, wr);
912 }
913
914 static inline ssize_t
vrb_send_buf_inline(struct vrb_ep * ep,struct ibv_send_wr * wr,const void * buf,size_t len)915 vrb_send_buf_inline(struct vrb_ep *ep, struct ibv_send_wr *wr,
916 const void *buf, size_t len)
917 {
918 struct ibv_sge sge = vrb_init_sge_inline(buf, len);
919
920 assert(wr->wr_id == VERBS_NO_COMP_FLAG);
921
922 wr->sg_list = &sge;
923 wr->num_sge = 1;
924
925 return vrb_post_send(ep, wr);
926 }
927
928 static inline ssize_t
vrb_send_iov_flags(struct vrb_ep * ep,struct ibv_send_wr * wr,const struct iovec * iov,void ** desc,int count,uint64_t flags)929 vrb_send_iov_flags(struct vrb_ep *ep, struct ibv_send_wr *wr,
930 const struct iovec *iov, void **desc, int count,
931 uint64_t flags)
932 {
933 size_t len = 0;
934
935 if (!desc)
936 vrb_set_sge_iov_inline(wr->sg_list, iov, count, len);
937 else
938 vrb_set_sge_iov_count_len(wr->sg_list, iov, count, desc, len);
939
940 wr->num_sge = count;
941 wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags);
942 wr->wr_id = VERBS_COMP_FLAGS(ep, flags, wr->wr_id);
943
944 if (flags & FI_FENCE)
945 wr->send_flags |= IBV_SEND_FENCE;
946
947 return vrb_post_send(ep, wr);
948 }
949
950 int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
951 const struct fi_info *hints, struct rdma_addrinfo **rai,
952 struct rdma_cm_id **id);
953
954 #endif /* FI_VERBS_H */
955