/*
 * Copyright (c) 2013-2018 Intel Corporation, Inc.  All rights reserved.
 * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef FI_VERBS_H
#define FI_VERBS_H

#include "config.h"

#include <asm/types.h>
#include <errno.h>
#include <fcntl.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <ofi_epoll.h>

#include <infiniband/ib.h>
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>
#include <rdma/fi_errno.h>

#include "ofi.h"
#include "ofi_atomic.h"
#include "ofi_enosys.h"
#include <uthash.h>
#include "ofi_prov.h"
#include "ofi_list.h"
#include "ofi_signal.h"
#include "ofi_util.h"
#include "ofi_tree.h"
#include "ofi_indexer.h"

#include "ofi_verbs_priv.h"


#ifndef AF_IB
#define AF_IB 27
#endif

#ifndef RAI_FAMILY
#define RAI_FAMILY              0x00000008
#endif

#define VERBS_RESOLVE_TIMEOUT 2000	/* ms */

#define VERBS_PROV_NAME "verbs"

#define VERBS_DBG(subsys, ...) FI_DBG(&vrb_prov, subsys, __VA_ARGS__)
#define VERBS_INFO(subsys, ...) FI_INFO(&vrb_prov, subsys, __VA_ARGS__)
#define VERBS_INFO_ERRNO(subsys, fn, errno) VERBS_INFO(subsys, fn ": %s(%d)\n",	\
		strerror(errno), errno)
#define VERBS_WARN(subsys, ...) FI_WARN(&vrb_prov, subsys, __VA_ARGS__)

#define VERBS_INJECT_FLAGS(ep, len, flags) ((((flags) & FI_INJECT) || \
		(len) <= (ep)->inject_limit) ? IBV_SEND_INLINE : 0)
#define VERBS_INJECT(ep, len) VERBS_INJECT_FLAGS(ep, len, (ep)->info->tx_attr->op_flags)

#define VERBS_COMP_FLAGS(ep, flags, context)		\
	(((ep)->util_ep.tx_op_flags | (flags)) &		\
	 FI_COMPLETION ? context : VERBS_NO_COMP_FLAG)
#define VERBS_COMP(ep, context)						\
	VERBS_COMP_FLAGS((ep), (ep)->info->tx_attr->op_flags, context)
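
/*
 * Illustrative usage (a sketch; 'wr' and 'ctx' are hypothetical
 * locals): for an endpoint whose inline limit is 64 bytes and whose
 * effective tx op_flags do not include FI_COMPLETION, a 32-byte send
 * is posted inline and its completion context is suppressed:
 *
 *	wr.send_flags = VERBS_INJECT(ep, 32);       yields IBV_SEND_INLINE
 *	wr.wr_id = VERBS_COMP(ep, (uintptr_t)ctx);  yields VERBS_NO_COMP_FLAG
 *
 * A wr_id of VERBS_NO_COMP_FLAG lets the provider recognize work
 * requests that should not produce a user-visible completion entry.
 */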

#define VERBS_WCE_CNT 1024
#define VERBS_WRE_CNT 1024

#define VERBS_DEF_CQ_SIZE 1024
#define VERBS_MR_IOV_LIMIT 1

#define VERBS_NO_COMP_FLAG	((uint64_t)-1)

#define VRB_CM_DATA_SIZE	(56)
#define VERBS_CM_DATA_SIZE	(VRB_CM_DATA_SIZE -		\
				 sizeof(struct vrb_cm_data_hdr))

#define VRB_CM_REJ_CONSUMER_DEFINED	28
#define VRB_CM_REJ_SIDR_CONSUMER_DEFINED	2

#define VERBS_DGRAM_MSG_PREFIX_SIZE	(40)

#define VRB_EP_TYPE(info)						\
	(((info) && (info)->ep_attr) ? (info)->ep_attr->type : FI_EP_MSG)
#define VRB_EP_PROTO(info)						\
	(((info) && (info)->ep_attr) ? (info)->ep_attr->protocol :	\
					FI_PROTO_UNSPEC)
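
/*
 * Note (illustrative): both macros tolerate a NULL info, e.g.
 * VRB_EP_TYPE(NULL) evaluates to FI_EP_MSG and VRB_EP_PROTO(NULL)
 * to FI_PROTO_UNSPEC, so callers need not guard their hints.
 */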

#define VRB_MEM_ALIGNMENT (64)
#define VRB_BUF_ALIGNMENT (4096) /* TODO: Page or MTU size */
#define VRB_POOL_BUF_CNT (100)

#define VERBS_ANY_DOMAIN "verbs_any_domain"
#define VERBS_ANY_FABRIC "verbs_any_fabric"

extern struct fi_provider vrb_prov;
extern struct util_prov vrb_util_prov;
extern struct dlist_entry verbs_devs;

extern struct vrb_gl_data {
	int	def_tx_size;
	int	def_rx_size;
	int	def_tx_iov_limit;
	int	def_rx_iov_limit;
	int	def_inline_size;
	int	min_rnr_timer;
	int	cqread_bunch_size;
	int	use_odp;
	char	*iface;
	int	gid_idx;
	char	*device_name;

	struct {
		int	buffer_num;
		int	buffer_size;
		int	rndv_seg_size;
		int	thread_timeout;
		char	*eager_send_opcode;
		char	*cm_thread_affinity;
	} rdm;

	struct {
		int	use_name_server;
		int	name_server_port;
	} dgram;

	struct {
		int	prefer_xrc;
		char	*xrcd_filename;
	} msg;
} vrb_gl_data;

struct verbs_addr {
	struct dlist_entry entry;
	struct rdma_addrinfo *rai;
};

/*
 * Fields of InfiniBand packet headers that are used to
 * represent an OFI EP address:
 * - LRH (Local Route Header) - Link Layer:
 *   - LID - destination Local Identifier
 *   - SL - Service Level
 * - GRH (Global Route Header) - Network Layer:
 *   - GID - destination Global Identifier
 * - BTH (Base Transport Header) - Transport Layer:
 *   - QPN - destination Queue Pair number
 *   - P_key - Partition Key
 *
 * Note: DON'T change the placement of the fields in the structure.
 *       The placement keeps the structure size at 256 bits (32 bytes).
 */
struct ofi_ib_ud_ep_name {
	union ibv_gid	gid;		/* 64-bit subnet prefix + 64-bit EUI-64 - GRH */

	uint32_t	qpn;		/* BTH */

	uint16_t	lid; 		/* LRH */
	uint16_t	pkey;		/* BTH */
	uint16_t	service;	/* for NS src addr, 0 means any */

	uint8_t 	sl;		/* LRH */
	uint8_t		padding[5];	/* forced padding to 256 bits (32 bytes) */
}; /* 256 bits */
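
/*
 * A minimal compile-time guard (sketch, assuming a C11 toolchain);
 * the layout above is only stable if the structure keeps its forced
 * 32-byte footprint:
 *
 *	_Static_assert(sizeof(struct ofi_ib_ud_ep_name) == 32,
 *		       "ofi_ib_ud_ep_name must stay 32 bytes");
 */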

#define VERBS_IB_UD_NS_ANY_SERVICE	0

static inline
int vrb_dgram_ns_is_service_wildcard(void *svc)
{
	return (*(int *)svc == VERBS_IB_UD_NS_ANY_SERVICE);
}

static inline
int vrb_dgram_ns_service_cmp(void *svc1, void *svc2)
{
	int service1 = *(int *)svc1, service2 = *(int *)svc2;

	if (vrb_dgram_ns_is_service_wildcard(svc1) ||
	    vrb_dgram_ns_is_service_wildcard(svc2))
		return 0;
	return (service1 < service2) ? -1 : (service1 > service2);
}
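
/*
 * Usage note (illustrative values): the comparator treats the
 * wildcard as equal to any service, so a name-server lookup keyed
 * with VERBS_IB_UD_NS_ANY_SERVICE matches every registered service:
 *
 *	int any = VERBS_IB_UD_NS_ANY_SERVICE, svc = 2049;
 *	assert(vrb_dgram_ns_service_cmp(&any, &svc) == 0);
 *	assert(vrb_dgram_ns_service_cmp(&svc, &any) == 0);
 */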

struct verbs_dev_info {
	struct dlist_entry entry;
	char *name;
	struct dlist_entry addrs;
};


struct vrb_fabric {
	struct util_fabric	util_fabric;
	const struct fi_info	*info;
	struct util_ns		name_server;
};

int vrb_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
		  void *context);
int vrb_find_fabric(const struct fi_fabric_attr *attr);

struct vrb_eq_entry {
	struct dlist_entry	item;
	uint32_t		event;
	size_t			len;
	union {
		struct fi_eq_entry 	*eq_entry;
		struct fi_eq_cm_entry	*cm_entry;
		uint8_t 		data[0];
	};
};

typedef int (*vrb_trywait_func)(struct fid *fid);

/* An OFI indexer is used to maintain a unique connection request to
 * endpoint mapping. The key is a 32-bit value (referred to as a
 * connection tag) and is passed to the remote peer by the active side
 * of a connection request. When the reciprocal XRC connection in the
 * reverse direction is made, the key is passed back and used to map
 * back to the original endpoint. A key is defined as a 32-bit value:
 *
 *     SSSSSSSS:SSSSSSII:IIIIIIII:IIIIIIII
 *     |-- sequence -||--- unique key ---|
 */
#define VERBS_CONN_TAG_INDEX_BITS	18
#define VERBS_CONN_TAG_INVALID		0xFFFFFFFF	/* Key is not valid */
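
/*
 * Sketch of the resulting tag split (the indexer internals live in
 * ofi_indexer.h; 'tag' is a hypothetical local): the low
 * VERBS_CONN_TAG_INDEX_BITS bits form the unique key that indexes
 * into conn_key_map, while the remaining high bits hold the sequence
 * portion shown in the diagram above:
 *
 *	uint32_t index = tag & ((1 << VERBS_CONN_TAG_INDEX_BITS) - 1);
 *	uint32_t seq   = tag >> VERBS_CONN_TAG_INDEX_BITS;
 */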

struct vrb_eq {
	struct fid_eq		eq_fid;
	struct vrb_fabric	*fab;
	fastlock_t		lock;
	struct dlistfd_head	list_head;
	struct rdma_event_channel *channel;
	uint64_t		flags;
	struct fi_eq_err_entry	err;

	ofi_epoll_t		epollfd;
	enum fi_wait_obj	wait_obj;

	struct {
		/* The connection key map is used during the XRC connection
		 * process to map an XRC reciprocal connection request back
		 * to the active endpoint that initiated the original
		 * connection request. It is protected with the eq::lock. */
		struct ofi_key_idx	conn_key_idx;
		struct indexer		*conn_key_map;

		/* TODO: This is limiting and restricts applications to using
		 * a single listener per EQ. While sufficient for RXM, we should
		 * consider using an internal PEP listener for handling the
		 * internally processed reciprocal connections. */
		uint16_t		pep_port;

		/* SIDR request/responses are a two-way handshake; therefore,
		 * we maintain an RB tree of SIDR accept responses, so that if
		 * a response is lost, the subsequent retried request can be
		 * detected and the original accept response resent. Note that
		 * rejected requests can be passed to RXM and will be rejected
		 * a second time. */
		struct ofi_rbmap	sidr_conn_rbmap;
	} xrc;
};

int vrb_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
		   struct fid_eq **eq, void *context);
int vrb_eq_trywait(struct vrb_eq *eq);
void vrb_eq_remove_events(struct vrb_eq *eq, struct fid *fid);

int vrb_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
		   struct fid_av **av, void *context);

struct vrb_pep {
	struct fid_pep		pep_fid;
	struct vrb_eq	*eq;
	struct rdma_cm_id	*id;

	/* XRC uses SIDR based RDMA CM exchanges for setting up
	 * shared QP connections. This ID is bound to the same
	 * port number as "id", but the RDMA_PS_UDP port space. */
	struct rdma_cm_id	*xrc_ps_udp_id;

	int			backlog;
	int			bound;
	size_t			src_addrlen;
	struct fi_info		*info;
};

struct fi_ops_cm *vrb_pep_ops_cm(struct vrb_pep *pep);


#if VERBS_HAVE_QUERY_EX
#define VRB_ACCESS_ON_DEMAND IBV_ACCESS_ON_DEMAND
#else
#define VRB_ACCESS_ON_DEMAND 0
#endif

enum {
	VRB_USE_XRC = BIT(0),
	VRB_USE_ODP = BIT(1),
};

struct vrb_domain {
	struct util_domain		util_domain;
	struct ibv_context		*verbs;
	struct ibv_pd			*pd;

	enum fi_ep_type			ep_type;
	struct fi_info			*info;
	/* The EQ is utilized by verbs/MSG */
	struct vrb_eq		*eq;
	uint64_t			eq_flags;

	/* Indicates that MSG endpoints should use the XRC transport.
	 * TODO: Move selection of XRC/RC to endpoint info from domain */
	int				flags;
	struct {
		int			xrcd_fd;
		struct ibv_xrcd		*xrcd;

		/* The domain maintains an RB tree that maps endpoint
		 * destination addresses to the physical XRC INI QP
		 * connected to that host. The map is protected using the
		 * EQ lock bound to the domain to avoid the need for
		 * additional locking. */
		struct ofi_rbmap	*ini_conn_rbmap;
	} xrc;

	/* MR stuff */
	struct ofi_mr_cache		cache;
};

struct vrb_cq;
typedef void (*vrb_cq_read_entry)(struct ibv_wc *wc, void *buf);

struct vrb_wc_entry {
	struct slist_entry	entry;
	struct ibv_wc		wc;
};

struct vrb_srq_ep;
struct vrb_cq {
	struct util_cq		util_cq;
	struct ibv_comp_channel	*channel;
	struct ibv_cq		*cq;
	size_t			entry_size;
	uint64_t		flags;
	enum fi_wait_obj	wait_obj;
	enum fi_cq_wait_cond	wait_cond;
	struct ibv_wc		wc;
	int			signal_fd[2];
	vrb_cq_read_entry	read_entry;
	struct slist		saved_wc_list;
	ofi_atomic32_t		nevents;
	struct ofi_bufpool	*wce_pool;

	struct {
		/* The list of XRC SRQ contexts associated with this CQ */
		fastlock_t		srq_list_lock;
		struct dlist_entry	srq_list;
	} xrc;

	size_t			credits;
	/* As a future optimization, we can use the app's context
	 * if they set FI_CONTEXT.
	 */
	struct ofi_bufpool	*ctx_pool;
};

int vrb_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
		   struct fid_cq **cq, void *context);
int vrb_cq_trywait(struct vrb_cq *cq);

struct vrb_mem_desc {
	struct fid_mr		mr_fid;
	struct ibv_mr		*mr;
	struct vrb_domain	*domain;
	size_t			len;
	/* this field is used only by MR cache operations */
	struct ofi_mr_entry	*entry;
};

extern struct fi_ops_mr vrb_mr_ops;
extern struct fi_ops_mr vrb_mr_cache_ops;

int vrb_mr_cache_add_region(struct ofi_mr_cache *cache,
			       struct ofi_mr_entry *entry);
void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache,
				   struct ofi_mr_entry *entry);

/*
 * An XRC SRQ cannot be created until the associated RX CQ is known,
 * so a list of validated pre-posted receives is maintained and posted
 * once the SRQ is created.
 */
struct vrb_xrc_srx_prepost {
	struct slist_entry	prepost_entry;
	void			*buf;
	void			*desc;
	void			*context;
	size_t			len;
	fi_addr_t		src_addr;
};

struct vrb_srq_ep {
	struct fid_ep		ep_fid;
	struct ibv_srq		*srq;
	struct vrb_domain	*domain;
	struct ofi_bufpool	*ctx_pool;
	fastlock_t		ctx_lock;

	/* For XRC SRQ only */
	struct {
		/* XRC SRQ is not created until endpoint enable */
		fastlock_t		prepost_lock;
		struct slist		prepost_list;
		uint32_t		max_recv_wr;
		uint32_t		max_sge;
		uint32_t		prepost_count;

		/* The RX CQ associated with this XRC SRQ. This field
		 * and the srq_entry should only be modified while holding
		 * the associated cq::xrc.srq_list_lock. */
		struct vrb_cq		*cq;

		/* The CQ maintains a list of XRC SRQs associated with it */
		struct dlist_entry	srq_entry;
	} xrc;
};

int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr,
		       struct fid_ep **rx_ep, void *context);

static inline int vrb_is_xrc(struct fi_info *info)
{
	return (VRB_EP_TYPE(info) == FI_EP_MSG) &&
	       (VRB_EP_PROTO(info) == FI_PROTO_RDMA_CM_IB_XRC);
}

int vrb_domain_xrc_init(struct vrb_domain *domain);
int vrb_domain_xrc_cleanup(struct vrb_domain *domain);

enum vrb_ini_qp_state {
	VRB_INI_QP_UNCONNECTED,
	VRB_INI_QP_CONNECTING,
	VRB_INI_QP_CONNECTED
};

#define VRB_NO_INI_TGT_QPNUM 0
#define VRB_RECIP_CONN	1

/*
 * An XRC transport INI QP connection can be shared within a process to
 * communicate with all the ranks on the same remote node. This structure is
 * only accessed during connection setup and tear down, which should be
 * done while holding the domain::eq::lock.
 */
struct vrb_ini_shared_conn {
	/* To share, EPs must have the same remote peer host addr and TX CQ */
	struct sockaddr			*peer_addr;
	struct vrb_cq		*tx_cq;

	/* The physical INI/TGT QPN connection. Virtual connections to the
	 * same remote peer and TGT QPN will share this connection, with
	 * the remote end opening the specified XRC TGT QPN for sharing.
	 * During the physical connection setup, phys_conn_id identifies
	 * the RDMA CM ID (and MSG_EP) associated with the operation. */
	enum vrb_ini_qp_state	state;
	struct rdma_cm_id		*phys_conn_id;
	struct ibv_qp			*ini_qp;
	uint32_t			tgt_qpn;

	/* EPs waiting on or using this INI/TGT physical connection will be in
	 * one of these lists and hold a reference to the shared connection. */
	struct dlist_entry		pending_list;
	struct dlist_entry		active_list;
	ofi_atomic32_t			ref_cnt;
};

enum vrb_xrc_ep_conn_state {
	VRB_XRC_UNCONNECTED,
	VRB_XRC_ORIG_CONNECTING,
	VRB_XRC_ORIG_CONNECTED,
	VRB_XRC_RECIP_CONNECTING,
	VRB_XRC_CONNECTED,
	VRB_XRC_ERROR
};

/*
 * The following XRC state is only required during XRC connection
 * establishment and can be freed once bidirectional connectivity
 * is established.
 */
#define VRB_MAX_XRC_CONNECT_RETRIES	16

struct vrb_xrc_ep_conn_setup {
	int				retry_count;

	/* The connection tag is used to associate the reciprocal
	 * XRC INI/TGT QP connection request in the reverse direction
	 * with the original request. The tag is created by the
	 * original active side. */
	uint32_t			conn_tag;
	uint32_t			remote_conn_tag;

	/* Delivery of the FI_CONNECTED event is delayed until
	 * bidirectional connectivity is established. */
	size_t				event_len;
	uint8_t				event_data[VRB_CM_DATA_SIZE];

	/* Connection request may have to queue waiting for the
	 * physical XRC INI/TGT QP connection to complete. */
	int				pending_recip;
	size_t				pending_paramlen;
	uint8_t				pending_param[VRB_CM_DATA_SIZE];
};

struct vrb_ep {
	struct util_ep			util_ep;
	struct ibv_qp			*ibv_qp;

	/* Protected by send CQ lock */
	size_t				tx_credits;

	union {
		struct rdma_cm_id		*id;
		struct {
			struct ofi_ib_ud_ep_name	ep_name;
			int				service;
		};
	};

	size_t				inject_limit;

	struct vrb_eq		*eq;
	struct vrb_srq_ep		*srq_ep;
	struct fi_info			*info;

	struct {
		struct ibv_send_wr	rma_wr;
		struct ibv_send_wr	msg_wr;
		struct ibv_sge		sge;
	} *wrs;
	size_t				rx_cq_size;
	struct rdma_conn_param		conn_param;
	struct vrb_cm_data_hdr	*cm_hdr;
};


/* Must be cast-able to struct fi_context */
struct vrb_context {
	struct vrb_ep			*ep;
	struct vrb_srq_ep		*srx;
	void				*user_ctx;
	uint32_t			flags;
};


#define VERBS_XRC_EP_MAGIC		0x1F3D5B79
struct vrb_xrc_ep {
	/* Must be first */
	struct vrb_ep		base_ep;

	/* XRC only fields */
	struct rdma_cm_id		*tgt_id;
	struct ibv_qp			*tgt_ibv_qp;
	enum vrb_xrc_ep_conn_state	conn_state;
	bool				recip_req_received;
	uint32_t			magic;
	uint32_t			srqn;
	uint32_t			peer_srqn;

	/* A reference is held to a shared physical XRC INI/TGT QP connecting
	 * to the destination node. */
	struct vrb_ini_shared_conn	*ini_conn;
	struct dlist_entry		ini_conn_entry;

	/* The following is used for resending lost SIDR accept response
	 * messages when a retransmit SIDR connect request is received. */
	void				*accept_param_data;
	size_t				accept_param_len;
	uint16_t			remote_pep_port;
	bool				recip_accept;
	struct ofi_rbnode		*conn_map_node;

	/* The following state is allocated during XRC bidirectional setup and
	 * freed once the connection is established. */
	struct vrb_xrc_ep_conn_setup	*conn_setup;
};

int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
		   struct fid_ep **ep, void *context);
int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
		      struct fid_pep **pep, void *context);
int vrb_create_ep(const struct fi_info *hints, enum rdma_port_space ps,
		     struct rdma_cm_id **id);
int vrb_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr,
			 struct fid_av **av_fid, void *context);
static inline
struct vrb_domain *vrb_ep_to_domain(struct vrb_ep *ep)
{
	return container_of(ep->util_ep.domain, struct vrb_domain,
			    util_domain);
}

extern struct fi_ops_atomic vrb_msg_ep_atomic_ops;
extern struct fi_ops_atomic vrb_msg_xrc_ep_atomic_ops;
extern struct fi_ops_cm vrb_msg_ep_cm_ops;
extern struct fi_ops_cm vrb_msg_xrc_ep_cm_ops;
extern const struct fi_ops_msg vrb_msg_ep_msg_ops_ts;
extern const struct fi_ops_msg vrb_msg_ep_msg_ops;
extern const struct fi_ops_msg vrb_dgram_msg_ops_ts;
extern const struct fi_ops_msg vrb_dgram_msg_ops;
extern const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops;
extern const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops_ts;
extern const struct fi_ops_msg vrb_msg_srq_xrc_ep_msg_ops;
extern struct fi_ops_rma vrb_msg_ep_rma_ops_ts;
extern struct fi_ops_rma vrb_msg_ep_rma_ops;
extern struct fi_ops_rma vrb_msg_xrc_ep_rma_ops_ts;
extern struct fi_ops_rma vrb_msg_xrc_ep_rma_ops;

#define VRB_XRC_VERSION	2

struct vrb_xrc_cm_data {
	uint8_t		version;
	uint8_t		reciprocal;
	uint16_t	port;
	uint32_t	tgt_qpn;
	uint32_t	srqn;
	uint32_t	conn_tag;
};

struct vrb_xrc_conn_info {
	uint32_t		conn_tag;
	uint32_t		is_reciprocal;
	uint32_t		ini_qpn;
	uint32_t		tgt_qpn;
	uint32_t		peer_srqn;
	uint16_t		port;
	struct rdma_conn_param	conn_param;
};

struct vrb_connreq {
	struct fid			handle;
	struct rdma_cm_id		*id;

	/* Support for XRC bidirectional connections and
	 * non-RDMA CM managed QPs. */
	int				is_xrc;
	struct vrb_xrc_conn_info	xrc;
};

struct vrb_cm_data_hdr {
	uint8_t	size;
	char	data[];
};

int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep,
			    void *param_data, size_t param_len);
void vrb_eq_remove_sidr_conn(struct vrb_xrc_ep *ep);
struct vrb_xrc_ep *vrb_eq_get_sidr_conn(struct vrb_eq *eq,
					      struct sockaddr *peer,
					      uint16_t pep_port, bool recip);

void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep,
			       struct ibv_qp_init_attr *attr);
int vrb_process_xrc_connreq(struct vrb_ep *ep,
			       struct vrb_connreq *connreq);

void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep);
void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep);
void vrb_eq_set_xrc_conn_tag(struct vrb_xrc_ep *ep);
void vrb_eq_clear_xrc_conn_tag(struct vrb_xrc_ep *ep);
struct vrb_xrc_ep *vrb_eq_xrc_conn_tag2ep(struct vrb_eq *eq,
						uint32_t conn_tag);
void vrb_set_xrc_cm_data(struct vrb_xrc_cm_data *local, int reciprocal,
			    uint32_t conn_tag, uint16_t port, uint32_t tgt_qpn,
			    uint32_t srqn);
int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote,
			      int private_data_len);
int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
		       int reciprocal, void *param, size_t paramlen);
int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
		      void *param, size_t paramlen);
int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep,
				    struct vrb_connreq *connreq,
				    struct rdma_cm_id *id);
void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect);
void vrb_add_pending_ini_conn(struct vrb_xrc_ep *ep, int reciprocal,
				 void *conn_param, size_t conn_paramlen);
void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn);
int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep,
			       struct vrb_ini_shared_conn **ini_conn);
void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep);
int vrb_reserve_qpn(struct vrb_xrc_ep *ep, struct ibv_qp **qp);

void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data,
			   size_t len);
int vrb_ep_create_ini_qp(struct vrb_xrc_ep *ep, void *dst_addr,
			    uint32_t *peer_tgt_qpn);
void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t peer_tgt_qpn);
void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep);
int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn);
void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *qp);
int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep);

int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep);
int vrb_sockaddr_len(struct sockaddr *addr);


int vrb_init_info(const struct fi_info **all_infos);
int vrb_getinfo(uint32_t version, const char *node, const char *service,
		   uint64_t flags, const struct fi_info *hints,
		   struct fi_info **info);
const struct fi_info *vrb_get_verbs_info(const struct fi_info *ilist,
					    const char *domain_name);
int vrb_fi_to_rai(const struct fi_info *fi, uint64_t flags,
		     struct rdma_addrinfo *rai);
int vrb_get_rdma_rai(const char *node, const char *service, uint64_t flags,
			const struct fi_info *hints, struct rdma_addrinfo **rai);
int vrb_get_matching_info(uint32_t version, const struct fi_info *hints,
			     struct fi_info **info, const struct fi_info *verbs_info,
			     uint8_t passive);
void vrb_alter_info(const struct fi_info *hints, struct fi_info *info);

struct verbs_ep_domain {
	char			*suffix;
	enum fi_ep_type		type;
	uint32_t		protocol;
};

extern const struct verbs_ep_domain verbs_dgram_domain;
extern const struct verbs_ep_domain verbs_msg_xrc_domain;

int vrb_check_ep_attr(const struct fi_info *hints,
			 const struct fi_info *info);
int vrb_check_rx_attr(const struct fi_rx_attr *attr,
			 const struct fi_info *hints,
			 const struct fi_info *info);

static inline int vrb_cmp_xrc_domain_name(const char *domain_name,
					     const char *rdma_name)
{
	size_t domain_len = strlen(domain_name);
	size_t suffix_len = strlen(verbs_msg_xrc_domain.suffix);

	return domain_len > suffix_len ? strncmp(domain_name, rdma_name,
						 domain_len - suffix_len) : -1;
}
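
/*
 * Example (assuming the XRC domain suffix is "-xrc", which is defined
 * outside this header): a domain name built from the device name plus
 * the suffix compares equal to the underlying rdma device name:
 *
 *	vrb_cmp_xrc_domain_name("mlx5_0-xrc", "mlx5_0") == 0
 *
 * Names no longer than the suffix can never match and yield -1.
 */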

int vrb_cq_signal(struct fid_cq *cq);

struct vrb_eq_entry *vrb_eq_alloc_entry(uint32_t event,
					      const void *buf, size_t len);
ssize_t vrb_eq_write_event(struct vrb_eq *eq, uint32_t event,
		const void *buf, size_t len);

int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype,
			enum fi_op op, struct fi_atomic_attr *attr,
			uint64_t flags);
int vrb_set_rnr_timer(struct ibv_qp *qp);
void vrb_cleanup_cq(struct vrb_ep *cur_ep);
int vrb_find_max_inline(struct ibv_pd *pd, struct ibv_context *context,
			   enum ibv_qp_type qp_type);

struct vrb_dgram_av {
	struct util_av util_av;
	struct dlist_entry av_entry_list;
};

struct vrb_dgram_av_entry {
	struct dlist_entry list_entry;
	struct ofi_ib_ud_ep_name addr;
	struct ibv_ah *ah;
};

static inline struct vrb_dgram_av_entry*
vrb_dgram_av_lookup_av_entry(fi_addr_t fi_addr)
{
	return (struct vrb_dgram_av_entry *) (uintptr_t) fi_addr;
}

/* NOTE: Deal with non-compliant libibverbs drivers that set errno
 * instead of directly returning the error value: a return of '-1'
 * from ibv_post_send/recv means the actual error is in errno.
 */
static inline ssize_t vrb_convert_ret(int ret)
{
	if (!ret)
		return 0;
	else if (ret == -ENOMEM || ret == ENOMEM)
		return -FI_EAGAIN;
	else if (ret == -1)
		return (errno == ENOMEM) ? -FI_EAGAIN : -errno;
	else
		return -abs(ret);
}
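
/*
 * For example (hypothetical call site): a compliant driver returning
 * ENOMEM and a non-compliant one returning -1 with errno == ENOMEM
 * both map to -FI_EAGAIN, so callers can retry uniformly:
 *
 *	ssize_t rc = vrb_convert_ret(ibv_post_send(qp, wr, &bad_wr));
 *	if (rc == -FI_EAGAIN)
 *		(progress the CQ and retry)
 */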

int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc);
int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc);

#define vrb_init_sge(buf, len, desc) (struct ibv_sge)		\
	{ .addr = (uintptr_t)buf,					\
	  .length = (uint32_t)len,					\
	  .lkey = (uint32_t)(uintptr_t)desc }

#define vrb_set_sge_iov(sg_list, iov, count, desc)	\
({							\
	size_t i;					\
	sg_list = alloca(sizeof(*sg_list) * count);	\
	for (i = 0; i < count; i++) {			\
		sg_list[i] = vrb_init_sge(		\
				iov[i].iov_base,	\
				iov[i].iov_len,		\
				desc[i]);		\
	}						\
})

#define vrb_set_sge_iov_count_len(sg_list, iov, count, desc, len)	\
({									\
	size_t i;							\
	sg_list = alloca(sizeof(*sg_list) * count);			\
	for (i = 0; i < count; i++) {					\
		sg_list[i] = vrb_init_sge(				\
				iov[i].iov_base,			\
				iov[i].iov_len,				\
				desc[i]);				\
		len += iov[i].iov_len;					\
	}								\
})

#define vrb_init_sge_inline(buf, len) vrb_init_sge(buf, len, NULL)

#define vrb_set_sge_iov_inline(sg_list, iov, count, len)	\
({								\
	size_t i;						\
	sg_list = alloca(sizeof(*sg_list) * count);		\
	for (i = 0; i < count; i++) {				\
		sg_list[i] = vrb_init_sge_inline(		\
					iov[i].iov_base,	\
					iov[i].iov_len);	\
		len += iov[i].iov_len;				\
	}							\
})
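
/*
 * Illustrative expansion (hypothetical locals): scatter a two-entry
 * iovec into SGEs allocated on the caller's stack via alloca() and
 * accumulate the total payload length:
 *
 *	struct ibv_sge *sgl;
 *	size_t total = 0;
 *	vrb_set_sge_iov_inline(sgl, iov, 2, total);
 *	wr.sg_list = sgl;
 *	wr.num_sge = 2;
 */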

#define vrb_send_iov(ep, wr, iov, desc, count)		\
	vrb_send_iov_flags(ep, wr, iov, desc, count,		\
			      (ep)->info->tx_attr->op_flags)

#define vrb_send_msg(ep, wr, msg, flags)				\
	vrb_send_iov_flags(ep, wr, (msg)->msg_iov, (msg)->desc,	\
			      (msg)->iov_count, flags)


ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr);
ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr);

static inline ssize_t
vrb_send_buf(struct vrb_ep *ep, struct ibv_send_wr *wr,
		const void *buf, size_t len, void *desc)
{
	struct ibv_sge sge = vrb_init_sge(buf, len, desc);

	assert(wr->wr_id != VERBS_NO_COMP_FLAG);

	wr->sg_list = &sge;
	wr->num_sge = 1;

	return vrb_post_send(ep, wr);
}

static inline ssize_t
vrb_send_buf_inline(struct vrb_ep *ep, struct ibv_send_wr *wr,
		       const void *buf, size_t len)
{
	struct ibv_sge sge = vrb_init_sge_inline(buf, len);

	assert(wr->wr_id == VERBS_NO_COMP_FLAG);

	wr->sg_list = &sge;
	wr->num_sge = 1;

	return vrb_post_send(ep, wr);
}

static inline ssize_t
vrb_send_iov_flags(struct vrb_ep *ep, struct ibv_send_wr *wr,
		      const struct iovec *iov, void **desc, int count,
		      uint64_t flags)
{
	size_t len = 0;

	if (!desc)
		vrb_set_sge_iov_inline(wr->sg_list, iov, count, len);
	else
		vrb_set_sge_iov_count_len(wr->sg_list, iov, count, desc, len);

	wr->num_sge = count;
	wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags);
	wr->wr_id = VERBS_COMP_FLAGS(ep, flags, wr->wr_id);

	if (flags & FI_FENCE)
		wr->send_flags |= IBV_SEND_FENCE;

	return vrb_post_send(ep, wr);
}
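
/*
 * Putting the send path together (sketch; 'ep', 'wr', and 'msg' are
 * assumed to be prepared by the caller): an fi_sendmsg() carrying
 * FI_INJECT arrives here as
 *
 *	vrb_send_msg(ep, &wr, msg, FI_INJECT);
 *
 * which builds the SGE list from msg->msg_iov, sets IBV_SEND_INLINE
 * through VERBS_INJECT_FLAGS(), selects the completion context via
 * VERBS_COMP_FLAGS(), and finally posts through vrb_post_send().
 */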

int vrb_get_rai_id(const char *node, const char *service, uint64_t flags,
		      const struct fi_info *hints, struct rdma_addrinfo **rai,
		      struct rdma_cm_id **id);

#endif /* FI_VERBS_H */