1 /*
2  * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 #ifndef _USDF_H_
37 #define _USDF_H_
38 
39 #include <sys/queue.h>
40 #include <pthread.h>
41 
42 #include <rdma/providers/fi_log.h>
43 #include <ofi_epoll.h>
44 
45 #include "usdf_progress.h"
46 #include "usd.h"
47 
48 
/*
 * Provider identity: the name string plus a major/minor pair that
 * FI_VERSION() combines into USDF_PROV_VERSION.
 */
#define USDF_PROV_NAME		"usnic"
#define USDF_MAJOR_VERS		1
#define USDF_MINOR_VERS		0
#define USDF_PROV_VERSION	FI_VERSION(USDF_MAJOR_VERS, USDF_MINOR_VERS)

/* Provider descriptor; only declared here, defined in another file. */
extern struct fi_provider usdf_ops;
55 
/*
 * Logging helpers.  USDF_<LEVEL>_SYS(area, ...) forwards to the matching
 * FI_<LEVEL>() macro, passing &usdf_ops as the provider and pasting the
 * given area name onto "FI_LOG_" (for example FABRIC -> FI_LOG_FABRIC).
 */
#define USDF_WARN_SYS(area, ...)	\
	FI_WARN(&usdf_ops, FI_LOG_ ## area, __VA_ARGS__)
#define USDF_TRACE_SYS(area, ...)	\
	FI_TRACE(&usdf_ops, FI_LOG_ ## area, __VA_ARGS__)
#define USDF_INFO_SYS(area, ...)	\
	FI_INFO(&usdf_ops, FI_LOG_ ## area, __VA_ARGS__)
#define USDF_DBG_SYS(area, ...)		\
	FI_DBG(&usdf_ops, FI_LOG_ ## area, __VA_ARGS__)

/* Short forms: same as above with the area fixed to FABRIC ("FI_LOG_FABRIC"). */
#define USDF_WARN(...)	USDF_WARN_SYS(FABRIC, __VA_ARGS__)
#define USDF_TRACE(...)	USDF_TRACE_SYS(FABRIC, __VA_ARGS__)
#define USDF_INFO(...)	USDF_INFO_SYS(FABRIC, __VA_ARGS__)
#define USDF_DBG(...)	USDF_DBG_SYS(FABRIC, __VA_ARGS__)
70 
/* NOTE(review): the unit of 64 is not visible in this header (the name
 * suggests a per-entry header buffer size) -- confirm against its users. */
#define USDF_HDR_BUF_ENTRY	64
/* Provider-specific endpoint capability, bit 63. */
#define USDF_EP_CAP_PIO		(1ULL << 63)

#define USDF_MAX_PEERS		(16 * 1024)

/* usdf event flags (bits 62 and 63; presumably stored in a 64-bit flags
 * word such as usdf_event.ue_flags -- confirm) */
#define USDF_EVENT_FLAG_ERROR		(1ULL << 62)
#define USDF_EVENT_FLAG_FREE_BUF	(1ULL << 63)

/* usdf domain capability: no loopback */
#define USDF_DOM_CAPS		(FI_REMOTE_COMM)

/* Memory-registration limits. */
#define USDF_MR_IOV_LIMIT	1
#define USDF_MR_CNT		(65535)
/* Length of the addr_str buffer in usdf_pep.pep_src_addr. */
#define USDF_ADDR_STR_LEN	(INET6_ADDRSTRLEN + 8)
86 
/*
 * TAILQ helpers that <sys/queue.h> does not provide.
 *
 * TAILQ_ON_LIST(e, field) is true when the element's tqe_prev pointer is
 * non-NULL.  TAILQ_REMOVE_MARK(q, e, field) unlinks the element with
 * TAILQ_REMOVE() and then sets tqe_prev to NULL so that a later
 * TAILQ_ON_LIST() on the same element is false.  For the check to be
 * meaningful, an element that was never inserted must start with
 * tqe_prev == NULL (e.g. zero-initialised).
 */
#define TAILQ_ON_LIST(e, field) ((e)->field.tqe_prev != NULL)

#define TAILQ_REMOVE_MARK(q, e, field)		\
	do {					\
		TAILQ_REMOVE(q, e, field);	\
		(e)->field.tqe_prev = NULL;	\
	} while (0)
97 
98 struct usdf_domain;
99 
100 struct usdf_dev_entry {
101 	struct usd_device *ue_dev;
102 	struct usd_device_attrs ue_dattr;
103 	int ue_dev_ok;
104 };
105 struct usdf_usnic_info {
106 	int uu_num_devs;
107 	struct usd_device_entry uu_devs[USD_MAX_DEVICES];
108 	struct usdf_dev_entry uu_info[USD_MAX_DEVICES];
109 };
110 extern struct usdf_usnic_info *__usdf_devinfo;
111 
112 struct usdf_fabric {
113 	struct fid_fabric   fab_fid;
114 	struct fi_fabric_attr fab_attr;
115 	struct usd_device_attrs *fab_dev_attrs;
116 	int fab_arp_sockfd;
117 	ofi_atomic32_t fab_refcnt;
118 	ofi_atomic32_t num_blocked_waiting;
119 	LIST_HEAD(,usdf_domain) fab_domain_list;
120 
121 	/* progression */
122 	pthread_t fab_thread;
123 	int fab_exit;
124 	ofi_epoll_t fab_epollfd;
125 	int fab_eventfd;
126 	struct usdf_poll_item fab_poll_item;
127 
128 	/* timer vars */
129 	uint32_t fab_active_timer_count;
130 	LIST_HEAD(usdf_timer_bucket, usdf_timer_entry) *fab_timer_buckets;
131 	uint64_t fab_cur_bucket_ms;
132 	uint32_t fab_cur_bucket;
133 	pthread_spinlock_t fab_timer_lock;
134 };
135 #define fab_ftou(FAB) container_of(FAB, struct usdf_fabric, fab_fid)
136 #define fab_utof(FP) (&(FP)->fab_fid)
137 #define fab_fidtou(FID) container_of(FID, struct usdf_fabric, fab_fid.fid)
138 
139 struct usdf_domain {
140 	struct fid_domain   dom_fid;
141 	struct usdf_fabric *dom_fabric;
142 	struct fi_info *dom_info;
143 	ofi_atomic32_t dom_refcnt;
144 	struct usdf_eq *dom_eq;
145 	struct usd_device   *dom_dev;
146 
147 	pthread_spinlock_t dom_progress_lock;
148 	TAILQ_HEAD(,usdf_tx) dom_tx_ready;
149 	TAILQ_HEAD(,usdf_cq_hard) dom_hcq_list;
150 
151 	/* used only by connected endpoints */
152 	struct usdf_ep **dom_peer_tab;
153 	uint32_t dom_next_peer;
154 
155 	LIST_ENTRY(usdf_domain) dom_link;
156 };
157 #define dom_ftou(FDOM) container_of(FDOM, struct usdf_domain, dom_fid)
158 #define dom_utof(DOM) (&(DOM)->dom_fid)
159 #define dom_fidtou(FID) container_of(FID, struct usdf_domain, dom_fid.fid)
160 
/* States of a passive endpoint (stored in usdf_pep.pep_state). */
enum usdf_pep_state {
	USDF_PEP_UNBOUND,
	USDF_PEP_BOUND,
	USDF_PEP_LISTENING,

	/*
	 * The socket of a "ROBBED" PEP has been stolen; fi_close() is the
	 * only operation that may still be called on it.
	 */
	USDF_PEP_ROBBED,
};
170 
171 struct usdf_pep {
172 	struct fid_pep pep_fid;
173 	ofi_atomic32_t pep_refcnt;
174 	struct usdf_fabric *pep_fabric;
175 	struct usdf_eq *pep_eq;
176 	int pep_sock;
177 	union {
178 		struct sockaddr_in sin;
179 		char addr_str[USDF_ADDR_STR_LEN];
180 	} pep_src_addr;
181 	enum usdf_pep_state pep_state;
182 	struct usdf_poll_item pep_pollitem;
183 	struct fi_info *pep_info;
184 
185 	pthread_spinlock_t pep_cr_lock;
186 	size_t pep_cr_max_data;
187 	uint32_t pep_backlog;
188 	uint32_t pep_cr_alloced;
189 	TAILQ_HEAD(,usdf_connreq) pep_cr_free;
190 	TAILQ_HEAD(,usdf_connreq) pep_cr_pending;
191 };
192 #define pep_ftou(FPEP) container_of(FPEP, struct usdf_pep, pep_fid)
193 #define pep_fidtou(FID) container_of(FID, struct usdf_pep, pep_fid.fid)
194 #define pep_utof(PEP) (&(PEP)->pep_fid)
195 #define pep_utofid(PEP) (&(PEP)->pep_fid.fid)
196 
197 struct usdf_tx {
198 	struct fid_stx tx_fid;
199 	ofi_atomic32_t tx_refcnt;
200 	struct usdf_domain *tx_domain;
201 	TAILQ_ENTRY(usdf_tx) tx_link;
202 
203 	struct fi_tx_attr tx_attr;
204 	struct usd_qp *tx_qp;
205 	void (*tx_progress)(struct usdf_tx *tx);
206 
207 	union {
208 		struct {
209 			struct usdf_cq_hard *tx_hcq;
210 
211 			uint8_t *tx_inject_bufs;
212 			struct usdf_msg_qe *tx_wqe_buf;
213 			TAILQ_HEAD(,usdf_msg_qe) tx_free_wqe;
214 			TAILQ_HEAD(,usdf_ep) tx_ep_ready;
215 			TAILQ_HEAD(,usdf_ep) tx_ep_have_acks;
216 			size_t tx_num_free_wqe;
217 		} msg;
218 		struct {
219 			struct usdf_cq_hard *tx_hcq;
220 
221 			ofi_atomic32_t tx_next_msg_id;
222 			struct usdf_rdm_qe *tx_wqe_buf;
223 			uint8_t *tx_inject_bufs;
224 			TAILQ_HEAD(,usdf_rdm_qe) tx_free_wqe;
225 			TAILQ_HEAD(,usdf_rdm_connection) tx_rdc_ready;
226 			TAILQ_HEAD(,usdf_rdm_connection) tx_rdc_have_acks;
227 			size_t tx_num_free_wqe;
228 		} rdm;
229 	} t;
230 };
231 #define tx_ftou(FEP) container_of(FEP, struct usdf_tx, tx_fid)
232 #define tx_fidtou(FID) container_of(FID, struct usdf_tx, tx_fid)
233 #define tx_utof(RX) (&(RX)->tx_fid)
234 #define tx_utofid(RX) (&(RX)->tx_fid.fid)
235 
236 struct usdf_rx {
237 	struct fid_ep rx_fid;
238 	ofi_atomic32_t rx_refcnt;
239 	struct usdf_domain *rx_domain;
240 
241 	struct fi_rx_attr rx_attr;
242 	struct usd_qp *rx_qp;
243 
244 	union {
245 		struct {
246 			struct usdf_cq_hard *rx_hcq;
247 
248 			uint8_t *rx_bufs;
249 			struct usdf_msg_qe *rx_rqe_buf;
250 			TAILQ_HEAD(,usdf_msg_qe) rx_free_rqe;
251 			TAILQ_HEAD(,usdf_msg_qe) rx_posted_rqe;
252 			size_t rx_num_free_rqe;
253 		} msg;
254 		struct {
255 			int rx_sock;
256 			struct usdf_cq_hard *rx_hcq;
257 			struct usdf_tx *rx_tx;
258 
259 			uint8_t *rx_bufs;
260 			struct usdf_rdm_qe *rx_rqe_buf;
261 			TAILQ_HEAD(,usdf_rdm_qe) rx_free_rqe;
262 			TAILQ_HEAD(,usdf_rdm_qe) rx_posted_rqe;
263 			size_t rx_num_free_rqe;
264 		} rdm;
265 	} r;
266 };
267 #define rx_ftou(FEP) container_of(FEP, struct usdf_rx, rx_fid)
268 #define rx_fidtou(FID) container_of(FID, struct usdf_rx, rx_fid)
269 #define rx_utof(RX) (&(RX)->rx_fid)
270 #define rx_utofid(RX) (&(RX)->rx_fid.fid)
271 
/* Endpoint flag bits.
 * NOTE(review): presumably stored in usdf_ep.flags -- confirm. */
enum {
	USDF_EP_ENABLED = 1 << 0,
};
275 
276 struct usdf_ep {
277 	struct fid_ep ep_fid;
278 	struct usdf_domain *ep_domain;
279 	ofi_atomic32_t ep_refcnt;
280 	uint64_t ep_caps;
281 	uint64_t ep_mode;
282 
283 	uint8_t ep_tx_dflt_signal_comp;
284 	uint8_t ep_rx_dflt_signal_comp;
285 
286 	uint8_t ep_tx_completion;
287 	uint8_t ep_rx_completion;
288 
289 	uint32_t flags;
290 
291 	uint32_t ep_wqe;	/* requested queue sizes */
292 	uint32_t ep_rqe;
293 
294 	struct usd_qp_attrs ep_qp_attrs;
295 
296 	struct usdf_eq *ep_eq;
297 
298 	struct usdf_tx *ep_tx;
299 	struct usdf_rx *ep_rx;
300 
301 	size_t max_msg_size;
302 
303 	union {
304 		struct {
305 			struct usd_qp *ep_qp;
306 			struct usdf_cq *ep_wcq;
307 			struct usdf_cq *ep_rcq;
308 
309 			int ep_sock;
310 			struct usdf_av *ep_av;
311 
312 			/* TODO: Remove in favor of accessing op flags through
313 			 * ep_tx and ep_rx. Update once tx/rx context support
314 			 * is added to dgram */
315 			uint64_t tx_op_flags;
316 			uint64_t rx_op_flags;
317 
318 			size_t tx_iov_limit;
319 			size_t rx_iov_limit;
320 
321 			void *ep_hdr_buf;
322 			struct usd_udp_hdr **ep_hdr_ptr;
323 		} dg;
324 		struct {
325 			struct usdf_connreq *ep_connreq;
326 			int ep_cm_sock;
327 			struct sockaddr_in ep_lcl_addr;
328 			struct usd_dest *ep_dest;
329 			uint32_t ep_rem_peer_id;
330 			uint32_t ep_lcl_peer_id;
331 
332 			TAILQ_HEAD(,usdf_msg_qe) ep_posted_wqe;
333 			TAILQ_HEAD(usdf_msg_qe_head ,usdf_msg_qe) ep_sent_wqe;
334 			uint32_t ep_fairness_credits;
335 			uint32_t ep_seq_credits;
336 			uint16_t ep_next_tx_seq;
337 			uint16_t ep_last_rx_ack;
338 			int ep_send_nak;
339 
340 			struct usdf_msg_qe *ep_cur_recv;
341 			uint16_t ep_next_rx_seq;
342 			TAILQ_ENTRY(usdf_ep) ep_ack_link;
343 
344 			struct usdf_timer_entry *ep_ack_timer;
345 
346 			TAILQ_ENTRY(usdf_ep) ep_link;
347 		} msg;
348 		struct {
349 			int ep_sock;
350 			struct usdf_av *ep_av;
351 
352 		} rdm;
353 	 } e;
354 };
355 #define ep_ftou(FEP) container_of(FEP, struct usdf_ep, ep_fid)
356 #define ep_fidtou(FID) container_of(FID, struct usdf_ep, ep_fid.fid)
357 #define ep_utof(EP) (&(EP)->ep_fid)
358 #define ep_utofid(EP) (&(EP)->ep_fid.fid)
359 
360 struct usdf_mr {
361 	struct fid_mr mr_fid;
362 	struct usd_mr *mr_mr;
363 };
364 
365 struct usdf_cq_hard {
366 	struct usdf_cq *cqh_cq;
367 	struct usd_cq *cqh_ucq;
368 	ofi_atomic32_t cqh_refcnt;
369 	void (*cqh_progress)(struct usdf_cq_hard *hcq);
370 	void (*cqh_post)(struct usdf_cq_hard *hcq, void *context, size_t len,
371 			int prov_errno, uint64_t flags);
372 	TAILQ_ENTRY(usdf_cq_hard) cqh_link;
373 	TAILQ_ENTRY(usdf_cq_hard) cqh_dom_link;
374 };
375 
/* One slot of a soft completion queue (usdf_cq.c.soft points at these). */
struct usdf_cq_soft_entry {
	void *cse_context;
	uint64_t cse_flags;
	size_t cse_len;
	void *cse_buf;
	uint64_t cse_data;
	int cse_prov_errno;
};
384 
385 struct usdf_cq {
386 	struct fid_cq cq_fid;
387 	ofi_atomic32_t cq_refcnt;
388 	struct usdf_domain *cq_domain;
389 	struct fi_cq_attr cq_attr;
390 	uint8_t cq_is_soft;
391 	uint8_t cq_waiting;
392 
393 	union {
394 		int fd;
395 		struct fi_mutex_cond mutex_cond;
396 	} object;
397 
398 	union {
399 		struct {
400 			struct usd_cq *cq_cq;
401 		} hard;
402 		struct {
403 			struct usdf_cq_soft_entry *cq_comps;
404 			struct usdf_cq_soft_entry *cq_end;
405 			struct usdf_cq_soft_entry *cq_head;
406 			struct usdf_cq_soft_entry *cq_tail;
407 			/* Last operation used to distinguish full vs empty. */
408 			uint8_t cq_last_op;
409 			TAILQ_HEAD(,usdf_cq_hard) cq_list;
410 		} soft;
411 	} c;
412 	struct usd_completion cq_comp;
413 	struct fi_ops_cq cq_ops;
414 };
415 
/* Soft-CQ operation codes.
 * NOTE(review): presumably the values kept in usdf_cq.c.soft.cq_last_op
 * -- confirm. */
enum {
	USDF_SOFT_CQ_READ,
	USDF_SOFT_CQ_WRITE,
};

/* pointer to cq_fid -> enclosing usdf_cq */
#define cq_ftou(FIDCQ) container_of(FIDCQ, struct usdf_cq, cq_fid)
/* pointer to cq_fid.fid -> enclosing usdf_cq */
#define cq_fidtou(FIDP) container_of(FIDP, struct usdf_cq, cq_fid.fid)
/* usdf_cq pointer -> address of its cq_fid */
#define cq_utof(UCQ) (&(UCQ)->cq_fid)
424 
425 struct usdf_err_data_entry {
426 	struct slist_entry entry;
427 	uint8_t seen;
428 	uint8_t err_data[0];
429 };
430 
/*
 * One event record; usdf_eq keeps pointers of this type for its ring
 * (eq_ev_ring, eq_ev_head, eq_ev_tail, eq_ev_end).
 */
struct usdf_event {
	uint32_t	ue_event;
	void		*ue_buf;
	size_t		ue_len;
	uint64_t	ue_flags;
};
437 
438 struct usdf_eq {
439 	struct fid_eq eq_fid;
440 	struct usdf_fabric *eq_fabric;
441 	ofi_atomic32_t eq_refcnt;
442 
443 	pthread_spinlock_t eq_lock;
444 
445 	struct fi_eq_err_entry *eq_ev_buf;
446 	struct usdf_event *eq_ev_ring;
447 	struct usdf_event *eq_ev_head;
448 	struct usdf_event *eq_ev_tail;
449 	struct usdf_event *eq_ev_end;
450 	int eq_ev_ring_size;
451 	ofi_atomic32_t eq_num_events;
452 
453 	/* various ways to wait */
454 	struct fi_eq_attr eq_attr;
455 	union {
456 		int eq_fd;
457 	};
458 
459 	struct slist eq_err_data;
460 	struct fi_ops_eq eq_ops_data;
461 };
462 #define eq_ftou(FEQ) container_of(FEQ, struct usdf_eq, eq_fid)
463 #define eq_fidtou(FID) container_of(FID, struct usdf_eq, eq_fid.fid)
464 #define eq_utof(EQ) (&(EQ)->eq_fid)
465 
/*
 * Prototypes
 */

/* EQ write that takes the provider's usdf_eq directly. */
ssize_t usdf_eq_write_internal(struct usdf_eq *eq, uint32_t event,
			       const void *buf, size_t len, uint64_t flags);

/* fi_ops_fabric */
int usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info,
		     struct fid_domain **domain, void *context);
int usdf_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
		 struct fid_eq **eq, void *context);
int usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info,
		  struct fid_pep **pep_p, void *context);
480 
/* fi_ops_domain */
int usdf_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr,
		 struct fid_cq **cq_o, void *context);
int usdf_endpoint_open(struct fid_domain *domain, struct fi_info *info,
		       struct fid_ep **ep, void *context);
int usdf_av_open(struct fid_domain *domain, struct fi_av_attr *attr,
		 struct fid_av **av_o, void *context);
int usdf_query_atomic(struct fid_domain *domain, enum fi_datatype datatype,
		      enum fi_op op, struct fi_atomic_attr *attr,
		      uint64_t flags);

/* Domain name functionality: both take an API version and the device
 * attributes; getname returns a string through *name, checkname tests a
 * caller-supplied hint. */
int usdf_domain_getname(uint32_t version, struct usd_device_attrs *dap,
			char **name);
bool usdf_domain_checkname(uint32_t version, struct usd_device_attrs *dap,
			   const char *hint);
496 
/* fi_ops_mr: register memory from a single buffer, an iovec array, or a
 * struct fi_mr_attr; the new region is returned through the fid_mr
 * out-parameter. */
int usdf_reg_mr(struct fid *fid, const void *buf, size_t len,
		uint64_t access, uint64_t offset, uint64_t requested_key,
		uint64_t flags, struct fid_mr **mr_o, void *context);
int usdf_regv_mr(struct fid *fid, const struct iovec *iov, size_t count,
		 uint64_t access, uint64_t offset, uint64_t requested_key,
		 uint64_t flags, struct fid_mr **mr, void *context);
int usdf_regattr(struct fid *fid, const struct fi_mr_attr *attr,
		 uint64_t flags, struct fid_mr **mr);
507 
/* Fake IBV provider */
void usdf_setup_fake_ibv_provider(void);

/* passive endpoint functions */
int usdf_pep_steal_socket(struct usdf_pep *pep, int *is_bound, int *sock_o);

/* Utility functions */
int usdf_catch_dom_attr(uint32_t version, const struct fi_info *hints,
			struct fi_domain_attr *dom_attr);
int usdf_catch_tx_attr(uint32_t version, const struct fi_tx_attr *tx_attr);
int usdf_catch_rx_attr(uint32_t version, const struct fi_rx_attr *rx_attr);

/* Address conversion helpers between a sockaddr_in and the format described
 * by the given fi_info.
 * NOTE(review): names suggest usdf_free_sin_if_needed() releases what
 * usdf_format_to_sin() returned -- confirm ownership in the definitions. */
struct sockaddr_in *usdf_format_to_sin(const struct fi_info *info,
				       const void *addr);
void *usdf_sin_to_format(const struct fi_info *info, void *addr, size_t *len);
void usdf_free_sin_if_needed(const struct fi_info *info,
			     struct sockaddr_in *sin);
522 
523 #endif /* _USDF_H_ */
524