/*
 * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <asm/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>
#include <rdma/fi_errno.h>
#include "ofi.h"
#include "ofi_enosys.h"
#include "ofi_util.h"

#include "usnic_direct.h"
#include "usd.h"
#include "usdf.h"
#include "usdf_endpoint.h"
#include "usdf_dgram.h"
#include "usdf_av.h"
#include "usdf_cq.h"

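/*
 * Enable a DGRAM endpoint (fi_enable).  Both a send and a receive CQ must
 * already be bound.  If the endpoint was opened with USDF_EP_CAP_PIO we
 * first try to create a PIO-backed UD queue pair; when PIO was not
 * requested or that creation fails, we fall back to a regular UD queue
 * pair sized by ep_wqe/ep_rqe.  Finally, a header buffer providing one
 * USDF_HDR_BUF_ENTRY per receive credit is allocated and registered.
 */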
static int
usdf_ep_dgram_enable(struct fid_ep *fep)
{
	struct usdf_ep *ep;
	struct usd_filter filt;
	struct usd_qp_impl *uqp;
	int ret;

	USDF_TRACE_SYS(EP_CTRL, "\n");

	ep = ep_ftou(fep);

	if (ep->e.dg.ep_wcq == NULL) {
		ret = -FI_EOPBADSTATE;
		goto fail;
	}
	if (ep->e.dg.ep_rcq == NULL) {
		ret = -FI_EOPBADSTATE;
		goto fail;
	}

	filt.uf_type = USD_FTY_UDP_SOCK;
	filt.uf_filter.uf_udp_sock.u_sock = ep->e.dg.ep_sock;

	if (ep->ep_caps & USDF_EP_CAP_PIO) {
		ret = usd_create_qp(ep->ep_domain->dom_dev,
				USD_QTR_UDP,
				USD_QTY_UD_PIO,
				ep->e.dg.ep_wcq->c.hard.cq_cq,
				ep->e.dg.ep_rcq->c.hard.cq_cq,
				127,	// XXX
				127,	// XXX
				&filt,
				&ep->e.dg.ep_qp);
	} else {
		ret = -FI_EAGAIN;
	}

	if (ret != 0) {
		ret = usd_create_qp(ep->ep_domain->dom_dev,
				USD_QTR_UDP,
				USD_QTY_UD,
				ep->e.dg.ep_wcq->c.hard.cq_cq,
				ep->e.dg.ep_rcq->c.hard.cq_cq,
				ep->ep_wqe,
				ep->ep_rqe,
				&filt,
				&ep->e.dg.ep_qp);
	}
	if (ret != 0) {
		goto fail;
	}
	ep->e.dg.ep_qp->uq_context = ep;

	/*
	 * Allocate a memory region big enough to hold a header for each
	 * RQ entry.
	 */
	uqp = to_qpi(ep->e.dg.ep_qp);
	ep->e.dg.ep_hdr_ptr = calloc(uqp->uq_rq.urq_num_entries,
			sizeof(ep->e.dg.ep_hdr_ptr[0]));
	if (ep->e.dg.ep_hdr_ptr == NULL) {
		ret = -FI_ENOMEM;
		goto fail;
	}

	ret = usd_alloc_mr(ep->ep_domain->dom_dev,
			usd_get_recv_credits(ep->e.dg.ep_qp) * USDF_HDR_BUF_ENTRY,
			&ep->e.dg.ep_hdr_buf);
	if (ret != 0) {
		goto fail;
	}

	ep->flags |= USDF_EP_ENABLED;

	return 0;

fail:
	free(ep->e.dg.ep_hdr_ptr);
	ep->e.dg.ep_hdr_ptr = NULL;

	if (ep->e.dg.ep_qp != NULL) {
		usd_destroy_qp(ep->e.dg.ep_qp);
		ep->e.dg.ep_qp = NULL;
	}
	return ret;
}

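/*
 * fi_ep_bind() handler for DGRAM endpoints.  Accepts an AV, an EQ, or a CQ
 * (bound for FI_SEND and/or FI_RECV); each resource may be bound only once.
 * FI_SELECTIVE_COMPLETION is honored on the send side but rejected on the
 * receive side.
 */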
static int
usdf_ep_dgram_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
{
	int ret;
	struct usdf_ep *ep;
	struct usdf_cq *cq;
	struct usdf_av *av;

	USDF_TRACE_SYS(EP_CTRL, "\n");

	/* Backward compatibility case for Open MPI.  We haven't been
	 * validating the flags until now, and older versions of Open MPI
	 * pass FI_RECV as an AV bind flag (a bug), so silently drop it for
	 * old API versions. */
	if (bfid->fclass == FI_CLASS_AV) {
		av = av_fidtou(bfid);
		if (av->av_domain->dom_info->fabric_attr->api_version <=
				FI_VERSION(1, 4) && (flags & FI_RECV))
			flags = flags & ~FI_RECV;
	}

	/* Check if the binding flags are valid. */
	ret = ofi_ep_bind_valid(&usdf_ops, bfid, flags);
	if (ret)
		return ret;

	ep = ep_fidtou(fid);

	switch (bfid->fclass) {

	case FI_CLASS_AV:
		if (ep->e.dg.ep_av != NULL) {
			return -FI_EINVAL;
		}

		av = av_fidtou(bfid);
		ep->e.dg.ep_av = av;
		ofi_atomic_inc32(&av->av_refcnt);
		break;

	case FI_CLASS_CQ:
		cq = cq_fidtou(bfid);

		/* actually, could look through CQ list for a hard
		 * CQ with function usd_poll_cq() and use that... XXX
		 */
		if (cq->cq_is_soft) {
			return -FI_EINVAL;
		}
		if (cq->c.hard.cq_cq == NULL) {
			ret = usdf_cq_create_cq(cq, &cq->c.hard.cq_cq, true);
			if (ret != 0) {
				return ret;
			}
		}

		if (flags & FI_SEND) {
			if (ep->e.dg.ep_wcq != NULL) {
				return -FI_EINVAL;
			}

			ep->ep_tx_dflt_signal_comp =
				(flags & FI_SELECTIVE_COMPLETION) ? 0 : 1;

			ep->ep_tx_completion = (ep->ep_tx_dflt_signal_comp ||
					(ep->e.dg.tx_op_flags & FI_COMPLETION));

			ep->e.dg.ep_wcq = cq;
			ofi_atomic_inc32(&cq->cq_refcnt);
		}

		if (flags & FI_RECV) {
			if (ep->e.dg.ep_rcq != NULL) {
				return -FI_EINVAL;
			}

			if (flags & FI_SELECTIVE_COMPLETION)
				return -FI_EOPNOTSUPP;

			ep->ep_rx_dflt_signal_comp =
				(flags & FI_SELECTIVE_COMPLETION) ? 0 : 1;

			ep->ep_rx_completion = (ep->ep_rx_dflt_signal_comp ||
					(ep->e.dg.rx_op_flags & FI_COMPLETION));

			ep->e.dg.ep_rcq = cq;
			ofi_atomic_inc32(&cq->cq_refcnt);
		}
		break;

	case FI_CLASS_EQ:
		if (ep->ep_eq != NULL) {
			return -FI_EINVAL;
		}
		ep->ep_eq = eq_fidtou(bfid);
		ofi_atomic_inc32(&ep->ep_eq->eq_refcnt);
		break;
	default:
		return -FI_EINVAL;
	}

	return 0;
}

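/*
 * Drop the reference this endpoint holds on a bound CQ.  For a soft CQ,
 * also drop the reference on the backing hard CQ that is progressed via
 * usdf_progress_hard_cq.
 */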
static void
usdf_ep_dgram_deref_cq(struct usdf_cq *cq)
{
	struct usdf_cq_hard *hcq;
	void (*rtn)(struct usdf_cq_hard *hcq);

	if (cq == NULL) {
		return;
	}
	ofi_atomic_dec32(&cq->cq_refcnt);

	rtn = usdf_progress_hard_cq;

	if (cq->cq_is_soft) {
		TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) {
			if (hcq->cqh_progress == rtn) {
				ofi_atomic_dec32(&hcq->cqh_refcnt);
				return;
			}
		}
	}
}

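/*
 * Close a DGRAM endpoint: free the per-RQ-entry header pointer array,
 * destroy the QP if one was created, release the references taken on the
 * domain, EQ, AV, and CQs, and close the backing UDP socket.  Fails with
 * -FI_EBUSY while the endpoint is still referenced.
 */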
static int
usdf_ep_dgram_close(fid_t fid)
{
	struct usdf_ep *ep;

	USDF_TRACE_SYS(EP_CTRL, "\n");

	ep = ep_fidtou(fid);

	if (ofi_atomic_get32(&ep->ep_refcnt) > 0) {
		return -FI_EBUSY;
	}

	free(ep->e.dg.ep_hdr_ptr);

	if (ep->e.dg.ep_qp != NULL) {
		usd_destroy_qp(ep->e.dg.ep_qp);
	}
	ofi_atomic_dec32(&ep->ep_domain->dom_refcnt);
	if (ep->ep_eq != NULL) {
		ofi_atomic_dec32(&ep->ep_eq->eq_refcnt);
	}

	if (ep->e.dg.ep_av)
		ofi_atomic_dec32(&ep->e.dg.ep_av->av_refcnt);

	usdf_ep_dgram_deref_cq(ep->e.dg.ep_wcq);
	usdf_ep_dgram_deref_cq(ep->e.dg.ep_rcq);

	if (ep->e.dg.ep_sock != -1) {
		close(ep->e.dg.ep_sock);
	}

	free(ep);
	return 0;
}

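/*
 * Two sets of endpoint/message ops are provided below: the "prefix"
 * variants are installed by usdf_ep_dgram_open() when the endpoint uses
 * FI_MSG_PREFIX, the plain variants otherwise.
 */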
static struct fi_ops_ep usdf_base_dgram_ops = {
	.size = sizeof(struct fi_ops_ep),
	.cancel = fi_no_cancel,
	.getopt = fi_no_getopt,
	.setopt = fi_no_setopt,
	.tx_ctx = fi_no_tx_ctx,
	.rx_ctx = fi_no_rx_ctx,
	.rx_size_left = usdf_dgram_rx_size_left,
	.tx_size_left = usdf_dgram_tx_size_left,
};

static struct fi_ops_ep usdf_base_dgram_prefix_ops = {
	.size = sizeof(struct fi_ops_ep),
	.cancel = fi_no_cancel,
	.getopt = fi_no_getopt,
	.setopt = fi_no_setopt,
	.tx_ctx = fi_no_tx_ctx,
	.rx_ctx = fi_no_rx_ctx,
	.rx_size_left = usdf_dgram_prefix_rx_size_left,
	.tx_size_left = usdf_dgram_prefix_tx_size_left,
};

static struct fi_ops_msg usdf_dgram_ops = {
	.size = sizeof(struct fi_ops_msg),
	.recv = usdf_dgram_recv,
	.recvv = usdf_dgram_recvv,
	.recvmsg = usdf_dgram_recvmsg,
	.send = usdf_dgram_send,
	.sendv = usdf_dgram_sendv,
	.sendmsg = usdf_dgram_sendmsg,
	.inject = usdf_dgram_inject,
	.senddata = fi_no_msg_senddata,
	.injectdata = fi_no_msg_injectdata,
};

static struct fi_ops_msg usdf_dgram_prefix_ops = {
	.size = sizeof(struct fi_ops_msg),
	.recv = usdf_dgram_prefix_recv,
	.recvv = usdf_dgram_prefix_recvv,
	.recvmsg = usdf_dgram_prefix_recvmsg,
	.send = usdf_dgram_prefix_send,
	.sendv = usdf_dgram_prefix_sendv,
	.sendmsg = usdf_dgram_prefix_sendmsg,
	.inject = usdf_dgram_prefix_inject,
	.senddata = fi_no_msg_senddata,
	.injectdata = fi_no_msg_injectdata,
};

static struct fi_ops_cm usdf_cm_dgram_ops = {
	.size = sizeof(struct fi_ops_cm),
	.setname = fi_no_setname,
	.getname = usdf_cm_dgram_getname,
	.getpeer = fi_no_getpeer,
	.connect = fi_no_connect,
	.listen = fi_no_listen,
	.accept = fi_no_accept,
	.reject = fi_no_reject,
	.shutdown = fi_no_shutdown,
	.join = fi_no_join,
};

static struct fi_ops_atomic usdf_dgram_atomic_ops = {
	.size = sizeof(struct fi_ops_atomic),
	.write = fi_no_atomic_write,
	.writev = fi_no_atomic_writev,
	.writemsg = fi_no_atomic_writemsg,
	.inject = fi_no_atomic_inject,
	.readwrite = fi_no_atomic_readwrite,
	.readwritev = fi_no_atomic_readwritev,
	.readwritemsg = fi_no_atomic_readwritemsg,
	.compwrite = fi_no_atomic_compwrite,
	.compwritev = fi_no_atomic_compwritev,
	.compwritemsg = fi_no_atomic_compwritemsg,
	.writevalid = fi_no_atomic_writevalid,
	.readwritevalid = fi_no_atomic_readwritevalid,
	.compwritevalid = fi_no_atomic_compwritevalid,
};

/*******************************************************************************
 * Default values for dgram attributes
 ******************************************************************************/
static const struct fi_tx_attr dgram_dflt_tx_attr = {
	.caps = USDF_DGRAM_CAPS,
	.mode = USDF_DGRAM_SUPP_MODE,
	.op_flags = 0,
	.msg_order = USDF_DGRAM_MSG_ORDER,
	.comp_order = USDF_DGRAM_COMP_ORDER,
	.inject_size = USDF_DGRAM_INJECT_SIZE,
	.iov_limit = USDF_DGRAM_IOV_LIMIT,
	.rma_iov_limit = USDF_DGRAM_RMA_IOV_LIMIT
};

static const struct fi_rx_attr dgram_dflt_rx_attr = {
	.caps = USDF_DGRAM_CAPS,
	.mode = USDF_DGRAM_SUPP_MODE,
	.op_flags = 0,
	.msg_order = USDF_DGRAM_MSG_ORDER,
	.comp_order = USDF_DGRAM_COMP_ORDER,
	.total_buffered_recv = 0,
	.iov_limit = USDF_DGRAM_IOV_LIMIT
};

static const struct fi_ep_attr dgram_dflt_ep_attr = {
	.type = FI_EP_DGRAM,
	.protocol = FI_PROTO_UDP,
	.msg_prefix_size = 0,
	.max_order_raw_size = 0,
	.max_order_war_size = 0,
	.max_order_waw_size = 0,
	.mem_tag_format = 0,
	.tx_ctx_cnt = 1,
	.rx_ctx_cnt = 1
};

static const struct fi_domain_attr dgram_dflt_domain_attr = {
	.caps = USDF_DOM_CAPS,
	.threading = FI_THREAD_ENDPOINT,
	.control_progress = FI_PROGRESS_AUTO,
	.data_progress = FI_PROGRESS_MANUAL,
	.resource_mgmt = FI_RM_DISABLED,
	.mr_mode = FI_MR_ALLOCATED | FI_MR_LOCAL | FI_MR_BASIC,
	.cntr_cnt = USDF_DGRAM_CNTR_CNT,
	.mr_iov_limit = USDF_DGRAM_MR_IOV_LIMIT,
	.mr_cnt = USDF_DGRAM_MR_CNT,
};

/*******************************************************************************
 * Fill functions for attributes
 ******************************************************************************/
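/*
 * Validate any fi_ep_attr hints against the dgram defaults and the device
 * MTU, and write the resulting attributes into fi->ep_attr.  Returns
 * -FI_ENODATA if a hint cannot be satisfied.
 */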
int usdf_dgram_fill_ep_attr(uint32_t version, const struct fi_info *hints,
		struct fi_info *fi, struct usd_device_attrs *dap)
{
	struct fi_ep_attr defaults;

	defaults = dgram_dflt_ep_attr;

	/* The ethernet header does not count against the MTU. */
	defaults.max_msg_size = dap->uda_mtu - sizeof(struct usd_udp_hdr);

	if (FI_VERSION_GE(version, FI_VERSION(1, 3)))
		defaults.max_msg_size += sizeof(struct ether_header);
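	/*
	 * Illustrative example (MTU value assumed, not queried): with
	 * uda_mtu == 1500, max_msg_size starts at
	 * 1500 - sizeof(struct usd_udp_hdr), and for API versions >= 1.3
	 * sizeof(struct ether_header) is added back per the comment above.
	 */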

	if (!hints || !hints->ep_attr)
		goto out;

	/* In prefix mode the max message size is the same as in non-prefix
	 * mode with the advertised header size added on top.
	 */

	if (hints->mode & FI_MSG_PREFIX) {
		defaults.msg_prefix_size = USDF_HDR_BUF_ENTRY;

		if (FI_VERSION_GE(version, FI_VERSION(1, 3)))
			defaults.max_msg_size += defaults.msg_prefix_size;
	}

	if (hints->ep_attr->max_msg_size > defaults.max_msg_size)
		return -FI_ENODATA;

	switch (hints->ep_attr->protocol) {
	case FI_PROTO_UNSPEC:
	case FI_PROTO_UDP:
		break;
	default:
		return -FI_ENODATA;
	}

	if (hints->ep_attr->tx_ctx_cnt > defaults.tx_ctx_cnt)
		return -FI_ENODATA;
	if (hints->ep_attr->rx_ctx_cnt > defaults.rx_ctx_cnt)
		return -FI_ENODATA;

	if (hints->ep_attr->max_order_raw_size > defaults.max_order_raw_size)
		return -FI_ENODATA;
	if (hints->ep_attr->max_order_war_size > defaults.max_order_war_size)
		return -FI_ENODATA;
	if (hints->ep_attr->max_order_waw_size > defaults.max_order_waw_size)
		return -FI_ENODATA;

out:
	*fi->ep_attr = defaults;

	return FI_SUCCESS;
}

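/*
 * Validate fi_domain_attr hints (threading, progress, resource management,
 * caps, AV type, MR mode, and MR count) against the dgram defaults and
 * write the result into fi->domain_attr.
 */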
int usdf_dgram_fill_dom_attr(uint32_t version, const struct fi_info *hints,
			     struct fi_info *fi, struct usd_device_attrs *dap)
{
	int ret;
	struct fi_domain_attr defaults;

	defaults = dgram_dflt_domain_attr;
	ret = usdf_domain_getname(version, dap, &defaults.name);
	if (ret < 0)
		return -FI_ENODATA;

	if (!hints || !hints->domain_attr)
		goto catch;

	switch (hints->domain_attr->threading) {
	case FI_THREAD_UNSPEC:
	case FI_THREAD_ENDPOINT:
		break;
	case FI_THREAD_FID:
	case FI_THREAD_COMPLETION:
	case FI_THREAD_DOMAIN:
		defaults.threading = hints->domain_attr->threading;
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->control_progress) {
	case FI_PROGRESS_UNSPEC:
	case FI_PROGRESS_AUTO:
		break;
	case FI_PROGRESS_MANUAL:
		defaults.control_progress =
			hints->domain_attr->control_progress;
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->data_progress) {
	case FI_PROGRESS_UNSPEC:
	case FI_PROGRESS_MANUAL:
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->resource_mgmt) {
	case FI_RM_UNSPEC:
	case FI_RM_DISABLED:
		break;
	default:
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->caps) {
	case 0:
	case FI_REMOTE_COMM:
		break;
	default:
		USDF_WARN_SYS(DOMAIN,
			"invalid domain capabilities\n");
		return -FI_ENODATA;
	}

	switch (hints->domain_attr->av_type) {
	case FI_AV_UNSPEC:
	case FI_AV_MAP:
		break;
	default:
		return -FI_ENODATA;
	}

	if (ofi_check_mr_mode(&usdf_ops, version, defaults.mr_mode, hints))
		return -FI_ENODATA;

	if (hints->domain_attr->mr_cnt) {
		if (hints->domain_attr->mr_cnt <= USDF_DGRAM_MR_CNT) {
			defaults.mr_cnt = hints->domain_attr->mr_cnt;
		} else {
			USDF_DBG_SYS(DOMAIN,
				     "mr_count exceeded provider limit\n");
			return -FI_ENODATA;
		}
	}

catch:
	/* catch the version change here. */
	ret = usdf_catch_dom_attr(version, hints, &defaults);
	if (ret)
		return ret;

	*fi->domain_attr = defaults;
	return FI_SUCCESS;
}

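/*
 * Validate fi_tx_attr hints and derive the advertised send queue size and
 * iov_limit from the device's send credit count, writing the result into
 * fi->tx_attr.
 */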
int usdf_dgram_fill_tx_attr(uint32_t version, const struct fi_info *hints,
			    struct fi_info *fi, struct usd_device_attrs *dap)
{
	int ret;
	struct fi_tx_attr defaults;
	size_t entries;

	defaults = dgram_dflt_tx_attr;

	defaults.size = dap->uda_max_send_credits / defaults.iov_limit;

	if (!hints || !hints->tx_attr)
		goto out;

	/* make sure we can support the capabilities that are requested */
	if (hints->tx_attr->caps & ~USDF_DGRAM_CAPS)
		return -FI_ENODATA;

	/* clear the mode bits the app doesn't support */
	if (hints->mode || hints->tx_attr->mode)
		defaults.mode &= (hints->mode | hints->tx_attr->mode);

	defaults.op_flags |= hints->tx_attr->op_flags;

	if ((hints->tx_attr->msg_order | USDF_DGRAM_MSG_ORDER) !=
			USDF_DGRAM_MSG_ORDER)
		return -FI_ENODATA;
	if ((hints->tx_attr->comp_order | USDF_DGRAM_COMP_ORDER) !=
			USDF_DGRAM_COMP_ORDER)
		return -FI_ENODATA;

	if (hints->tx_attr->inject_size > defaults.inject_size)
		return -FI_ENODATA;

	if (hints->tx_attr->iov_limit > USDF_DGRAM_MAX_SGE)
		return -FI_ENODATA;

	/* make sure the values for iov_limit and size are within appropriate
	 * bounds. if only one of the two was given, then set the other based
	 * on:
	 * 	max_credits = size * iov_limit;
	 */
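	/*
	 * Illustrative only (credit count assumed): with
	 * uda_max_send_credits == 256 and a requested iov_limit of 4, the
	 * default size becomes 256 / 4 == 64.
	 */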
	if (hints->tx_attr->iov_limit && hints->tx_attr->size) {
		defaults.size = hints->tx_attr->size;
		defaults.iov_limit = hints->tx_attr->iov_limit;
	} else if (hints->tx_attr->iov_limit) {
		defaults.iov_limit = hints->tx_attr->iov_limit;
		defaults.size =
			dap->uda_max_send_credits / defaults.iov_limit;
	} else if (hints->tx_attr->size) {
		defaults.size = hints->tx_attr->size;
		defaults.iov_limit =
			dap->uda_max_send_credits / defaults.size;
	}

	entries = defaults.size * defaults.iov_limit;
	if (entries > dap->uda_max_send_credits)
		return -FI_ENODATA;

	if (hints->tx_attr->rma_iov_limit > defaults.rma_iov_limit)
		return -FI_ENODATA;

out:
	/* Non-prefix mode requires an extra descriptor for the header. */
	if (!hints || !(hints->mode & FI_MSG_PREFIX))
		defaults.iov_limit -= 1;

	/* catch version changes here. */
	ret = usdf_catch_tx_attr(version, &defaults);
	if (ret)
		return ret;

	*fi->tx_attr = defaults;

	return FI_SUCCESS;
}

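/*
 * Receive-side counterpart of usdf_dgram_fill_tx_attr(): validate
 * fi_rx_attr hints against the dgram defaults and the device's receive
 * credit count, writing the result into fi->rx_attr.
 */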
int usdf_dgram_fill_rx_attr(uint32_t version, const struct fi_info *hints,
			    struct fi_info *fi, struct usd_device_attrs *dap)
{
	int ret;
	struct fi_rx_attr defaults;
	size_t entries;

	defaults = dgram_dflt_rx_attr;

	defaults.size = dap->uda_max_recv_credits / defaults.iov_limit;

	if (!hints || !hints->rx_attr)
		goto out;

	/* make sure we can support the capabilities that are requested */
	if (hints->rx_attr->caps & ~USDF_DGRAM_CAPS)
		return -FI_ENODATA;

	/* clear the mode bits the app doesn't support */
	if (hints->mode || hints->rx_attr->mode)
		defaults.mode &= (hints->mode | hints->rx_attr->mode);

	defaults.op_flags |= hints->rx_attr->op_flags;

	if ((hints->rx_attr->msg_order | USDF_DGRAM_MSG_ORDER) !=
			USDF_DGRAM_MSG_ORDER)
		return -FI_ENODATA;
	if ((hints->rx_attr->comp_order | USDF_DGRAM_COMP_ORDER) !=
			USDF_DGRAM_COMP_ORDER)
		return -FI_ENODATA;

	if (hints->rx_attr->total_buffered_recv >
			defaults.total_buffered_recv)
		return -FI_ENODATA;

	if (hints->rx_attr->iov_limit > USDF_DGRAM_MAX_SGE)
		return -FI_ENODATA;

	/* make sure the values for iov_limit and size are within appropriate
	 * bounds. if only one of the two was given, then set the other based
	 * on:
	 * 	max_credits = size * iov_limit;
	 */
	if (hints->rx_attr->iov_limit && hints->rx_attr->size) {
		defaults.size = hints->rx_attr->size;
		defaults.iov_limit = hints->rx_attr->iov_limit;
	} else if (hints->rx_attr->iov_limit) {
		defaults.iov_limit = hints->rx_attr->iov_limit;
		defaults.size =
			dap->uda_max_recv_credits / defaults.iov_limit;
	} else if (hints->rx_attr->size) {
		defaults.size = hints->rx_attr->size;
		defaults.iov_limit =
			dap->uda_max_recv_credits / defaults.size;
	}

	entries = defaults.size * defaults.iov_limit;
	if (entries > dap->uda_max_recv_credits)
		return -FI_ENODATA;

out:
	/* Non-prefix mode requires an extra descriptor for the header. */
	if (!hints || !(hints->mode & FI_MSG_PREFIX))
		defaults.iov_limit -= 1;

	/* catch version changes here. */
	ret = usdf_catch_rx_attr(version, &defaults);
	if (ret)
		return ret;

	*fi->rx_attr = defaults;

	return FI_SUCCESS;
}

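/*
 * fi_control() handler for DGRAM endpoints; only FI_ENABLE on an endpoint
 * fid is supported.
 */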
static int usdf_ep_dgram_control(struct fid *fid, int command, void *arg)
{
	struct fid_ep *ep;
	int ret;

	USDF_TRACE_SYS(EP_CTRL, "\n");

	switch (fid->fclass) {
	case FI_CLASS_EP:
		ep = container_of(fid, struct fid_ep, fid);
		switch (command) {
		case FI_ENABLE:
			ret = usdf_ep_dgram_enable(ep);
			break;
		default:
			ret = -FI_ENOSYS;
		}
		break;
	default:
		ret = -FI_ENOSYS;
	}

	return ret;
}

static struct fi_ops usdf_ep_dgram_ops = {
	.size = sizeof(struct fi_ops),
	.close = usdf_ep_dgram_close,
	.bind = usdf_ep_dgram_bind,
	.control = usdf_ep_dgram_control,
	.ops_open = fi_no_ops_open
};

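/*
 * Open a DGRAM endpoint.  The endpoint either steals the UDP socket from a
 * parent passive endpoint (info->handle) or creates and optionally binds
 * its own socket, then sizes its work queues from the tx/rx attributes and
 * installs the prefix or non-prefix ops tables based on FI_MSG_PREFIX.
 */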
int
usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info,
	    struct fid_ep **ep_o, void *context)
{
	struct usdf_domain *udp;
	struct usdf_ep *ep;
	int ret;
	struct usdf_pep *parent_pep;
	void *src_addr;
	int is_bound;
	size_t tx_size;
	size_t rx_size;

	USDF_TRACE_SYS(EP_CTRL, "\n");

	parent_pep = NULL;
	src_addr = NULL;

	if ((info->caps & ~USDF_DGRAM_CAPS) != 0) {
		return -FI_EBADF;
	}

	if (info->handle != NULL) {
		if (info->handle->fclass != FI_CLASS_PEP) {
			USDF_WARN_SYS(EP_CTRL,
				"\"handle\" should be a PEP (or NULL)\n");
			return -FI_EINVAL;
		}
		parent_pep = pep_fidtou(info->handle);
	}

	udp = dom_ftou(domain);

	ep = calloc(1, sizeof(*ep));
	if (ep == NULL) {
		return -FI_ENOMEM;
	}

	is_bound = 0;
	if (parent_pep != NULL) {
		ret = usdf_pep_steal_socket(parent_pep, &is_bound, &ep->e.dg.ep_sock);
		if (ret) {
			goto fail;
		}
	} else {
		ep->e.dg.ep_sock = socket(AF_INET, SOCK_DGRAM, 0);
		if (ep->e.dg.ep_sock == -1) {
			ret = -errno;
			goto fail;
		}
	}

	if (!is_bound) {
		if (info->src_addr != NULL)
			src_addr = usdf_format_to_sin(info, info->src_addr);

		if (src_addr != NULL) {
			ret = bind(ep->e.dg.ep_sock, src_addr,
				sizeof(struct sockaddr_in));
			if (ret == -1) {
				ret = -errno;
				goto fail;
			}
		}

		usdf_free_sin_if_needed(info, src_addr);
	}

	ep->ep_fid.fid.fclass = FI_CLASS_EP;
	ep->ep_fid.fid.context = context;
	ep->ep_fid.fid.ops = &usdf_ep_dgram_ops;
	ep->ep_fid.cm = &usdf_cm_dgram_ops;
	ep->ep_fid.atomic = &usdf_dgram_atomic_ops;
	ep->ep_domain = udp;
	ep->ep_caps = info->caps;
	ep->ep_mode = info->mode;

	ep->e.dg.tx_iov_limit = USDF_DGRAM_IOV_LIMIT;
	tx_size = udp->dom_fabric->fab_dev_attrs->uda_max_send_credits /
		ep->e.dg.tx_iov_limit;

	ep->e.dg.rx_iov_limit = USDF_DGRAM_IOV_LIMIT;
	rx_size = udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits /
		ep->e.dg.rx_iov_limit;

	/*
	 * TODO: Add better management of tx_attr/rx_attr to getinfo and dgram
	 * open.
	 */
	if (info->tx_attr) {
		ep->e.dg.tx_op_flags = info->tx_attr->op_flags;
		if (info->tx_attr->iov_limit)
			ep->e.dg.tx_iov_limit = info->tx_attr->iov_limit;
		if (info->tx_attr->size)
			tx_size = info->tx_attr->size;
	}

	if (info->rx_attr) {
		ep->e.dg.rx_op_flags = info->rx_attr->op_flags;
		if (info->rx_attr->iov_limit)
			ep->e.dg.rx_iov_limit = info->rx_attr->iov_limit;
		if (info->rx_attr->size)
			rx_size = info->rx_attr->size;
	}

	if (info->ep_attr)
		ep->max_msg_size = info->ep_attr->max_msg_size;

	if (ep->ep_mode & FI_MSG_PREFIX) {
		ep->ep_wqe = tx_size * ep->e.dg.tx_iov_limit;
		ep->ep_rqe = rx_size * ep->e.dg.rx_iov_limit;
	} else {
		ep->ep_wqe = tx_size * (ep->e.dg.tx_iov_limit + 1);
		ep->ep_rqe = rx_size * (ep->e.dg.rx_iov_limit + 1);
	}

	/* Check that the requested credit size is less than the max credit
	 * counts. If the fi_info struct was acquired from fi_getinfo then this
	 * will always be the case.
	 */
	if (ep->ep_wqe > udp->dom_fabric->fab_dev_attrs->uda_max_send_credits) {
		ret = -FI_EINVAL;
		goto fail;
	}
	if (ep->ep_rqe > udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits) {
		ret = -FI_EINVAL;
		goto fail;
	}

	if (ep->ep_mode & FI_MSG_PREFIX) {
		if (info->ep_attr == NULL) {
			ret = -FI_EBADF;
			goto fail;
		}

		ep->ep_fid.ops = &usdf_base_dgram_prefix_ops;
		info->ep_attr->msg_prefix_size = USDF_HDR_BUF_ENTRY;
		ep->ep_fid.msg = &usdf_dgram_prefix_ops;
	} else {
		ep->ep_fid.ops = &usdf_base_dgram_ops;
		ep->ep_fid.msg = &usdf_dgram_ops;
	}
	ofi_atomic_initialize32(&ep->ep_refcnt, 0);
	ofi_atomic_inc32(&udp->dom_refcnt);

	*ep_o = ep_utof(ep);
	return 0;

fail:
	if (ep != NULL) {
		if (ep->e.dg.ep_sock != -1) {
			close(ep->e.dg.ep_sock);
		}
		free(ep);
	}
	return ret;
}