1 /*
2  * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include "config.h"
38 
39 #include <arpa/inet.h>
40 #include <asm/types.h>
41 #include <assert.h>
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <netinet/in.h>
45 #include <ofi_epoll.h>
46 #include <sys/eventfd.h>
47 #include <sys/socket.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <stdbool.h>
51 #include <string.h>
52 #include <netdb.h>
53 #include <unistd.h>
54 
55 #include <rdma/fabric.h>
56 #include <rdma/fi_cm.h>
57 #include <rdma/fi_domain.h>
58 #include <rdma/fi_endpoint.h>
59 #include <rdma/fi_rma.h>
60 #include <rdma/fi_errno.h>
61 #include "ofi.h"
62 #include "ofi_enosys.h"
63 #include "ofi_prov.h"
64 
65 #include "usnic_direct.h"
66 #include "libnl_utils.h"
67 
68 #include "usdf.h"
69 #include "usdf_wait.h"
70 #include "fi_ext_usnic.h"
71 #include "usdf_progress.h"
72 #include "usdf_timer.h"
73 #include "usdf_dgram.h"
74 #include "usdf_cm.h"
75 
/* Cached usNIC device information shared by this file: allocated and filled
 * by usdf_get_devinfo() the first time usdf_getinfo() runs, then read by
 * usdf_getinfo() and usdf_fabric_open(). NULL until that first call. */
struct usdf_usnic_info *__usdf_devinfo;
77 
usdf_fabric_getname(uint32_t version,struct usd_device_attrs * dap,char ** name)78 static int usdf_fabric_getname(uint32_t version, struct usd_device_attrs *dap,
79 			       char **name)
80 {
81 	int ret = FI_SUCCESS;
82 	char *bufp = NULL;
83 	struct in_addr in;
84 	char *addrnetw;
85 
86 	if (FI_VERSION_GE(version, FI_VERSION(1, 4))) {
87 		in.s_addr = dap->uda_ipaddr_be & dap->uda_netmask_be;
88 		addrnetw = inet_ntoa(in);
89 		ret = asprintf(&bufp, "%s/%d", addrnetw, dap->uda_prefixlen);
90 		if (ret < 0) {
91 			USDF_DBG(
92 			    "asprintf failed while creating fabric name\n");
93 			ret = -ENOMEM;
94 		}
95 	} else {
96 		bufp = strdup(dap->uda_devname);
97 		if (!bufp) {
98 			USDF_DBG("strdup failed while creating fabric name\n");
99 			ret = -errno;
100 		}
101 	}
102 
103 	*name = bufp;
104 
105 	return ret;
106 }
107 
/*
 * Check whether a fabric-name hint matches the fabric name of a device.
 *
 * @param version Libfabric API version that selects the name format, or 0 to
 *                infer the format from the hint itself.
 * @param dap     Attributes of the device being checked.
 * @param hint    Fabric name to compare against; NULL never matches.
 *
 * @return true when the hint equals the device's fabric name, false
 *         otherwise (including when the reference name cannot be built).
 */
static bool usdf_fabric_checkname(uint32_t version,
				  struct usd_device_attrs *dap, const char *hint)
{
	char *reference = NULL;
	bool valid = false;
	int ret;

	/* A missing hint cannot name any fabric. Reject it before it reaches
	 * the "%s" format, strcmp() or strstr() below.
	 */
	if (hint == NULL)
		return false;

	USDF_DBG("checking devname: version=%d, devname='%s'\n", version, hint);

	if (version == 0) {
		/* The hint string itself is a kind of version check: before
		 * 1.4 the name was just the device name; in 1.4 and beyond
		 * the name is in CIDR notation and so contains a '/'.
		 */
		if (strstr(hint, "/"))
			return usdf_fabric_checkname(FI_VERSION(1, 4), dap,
						     hint);

		return usdf_fabric_checkname(FI_VERSION(1, 3), dap, hint);
	}

	ret = usdf_fabric_getname(version, dap, &reference);
	if (ret < 0 || reference == NULL)
		return false;

	if (strcmp(reference, hint) == 0) {
		valid = true;
	} else {
		USDF_DBG("hint %s failed to match %s\n", hint,
			 reference);
	}

	free(reference);
	return valid;
}
143 
usdf_validate_hints(uint32_t version,const struct fi_info * hints)144 static int usdf_validate_hints(uint32_t version, const struct fi_info *hints)
145 {
146 	struct fi_fabric_attr *fattrp;
147 	size_t size;
148 
149 	switch (hints->addr_format) {
150 	case FI_FORMAT_UNSPEC:
151 	case FI_SOCKADDR_IN:
152 		size = sizeof(struct sockaddr_in);
153 		break;
154 	case FI_SOCKADDR:
155 		size = sizeof(struct sockaddr);
156 		break;
157 	case FI_ADDR_STR:
158 		if (hints->src_addr != NULL &&
159 		    strlen((char *)hints->src_addr) > USDF_ADDR_STR_LEN)
160 			return -FI_ENODATA;
161 
162 		if (hints->dest_addr != NULL &&
163 		    strlen((char *)hints->dest_addr) > USDF_ADDR_STR_LEN)
164 			return -FI_ENODATA;
165 
166 		goto skip_sockaddr_size_check;
167 	default:
168 		return -FI_ENODATA;
169 	}
170 
171 	if (hints->src_addr != NULL && hints->src_addrlen < size) {
172 		return -FI_ENODATA;
173 	}
174 	if (hints->dest_addr != NULL && hints->dest_addrlen < size) {
175 		return -FI_ENODATA;
176 	}
177 
178 skip_sockaddr_size_check:
179 	if (hints->ep_attr != NULL) {
180 		switch (hints->ep_attr->protocol) {
181 		case FI_PROTO_UNSPEC:
182 		case FI_PROTO_UDP:
183 		case FI_PROTO_RUDP:
184 			break;
185 		default:
186 			return -FI_ENODATA;
187 		}
188 
189 		if (hints->ep_attr->auth_key || hints->ep_attr->auth_key_size) {
190 			USDF_WARN_SYS(EP_CTRL,
191 				"\"authorization key\" is not supported in this provider.\n");
192 			return -FI_ENODATA;
193 		}
194 	}
195 
196 	fattrp = hints->fabric_attr;
197 	if (fattrp != NULL) {
198 		if (fattrp->prov_version != 0 &&
199 		    fattrp->prov_version != USDF_PROV_VERSION) {
200 			return -FI_ENODATA;
201 		}
202 	}
203 	return FI_SUCCESS;
204 }
205 
206 static int
usdf_fill_sockaddr_info(struct fi_info * fi,struct sockaddr_in * src,struct sockaddr_in * dest,struct usd_device_attrs * dap)207 usdf_fill_sockaddr_info(struct fi_info *fi,
208 	struct sockaddr_in *src, struct sockaddr_in *dest,
209 	struct usd_device_attrs *dap)
210 {
211 	int ret;
212 	struct sockaddr_in *sin;
213 
214 	sin = calloc(1, sizeof(*sin));
215 	fi->src_addr = sin;
216 	if (sin == NULL) {
217 		ret = -FI_ENOMEM;
218 		return ret;
219 	}
220 	fi->src_addrlen = sizeof(struct sockaddr_in);
221 	sin->sin_family = AF_INET;
222 	sin->sin_addr.s_addr = dap->uda_ipaddr_be;
223 	if (src != NULL)
224 		sin->sin_port = src->sin_port;
225 
226 	/* copy in dest if specified */
227 	if (dest != NULL) {
228 		sin = calloc(1, sizeof(*sin));
229 		if (NULL == sin) {
230 			free(fi->src_addr);
231 			return -FI_ENOMEM;
232 		}
233 		*sin = *dest;
234 		fi->dest_addr = sin;
235 		fi->dest_addrlen = sizeof(*sin);
236 	}
237 	return FI_SUCCESS;
238 }
239 
240 static int
usdf_fill_straddr_info(struct fi_info * fi,char * src,char * dest,struct usd_device_attrs * dap)241 usdf_fill_straddr_info(struct fi_info *fi,
242 	char *src, char *dest, struct usd_device_attrs *dap)
243 {
244 	char *address_string;
245 	struct sockaddr_in *sin;
246 
247 	/* If NULL, we have to create the sockaddr_in
248 	 * and convert it to string format.
249 	 */
250 	if (src == NULL) {
251 		sin = calloc(1, sizeof(*sin));
252 		if (NULL == sin)
253 			return -FI_ENOMEM;
254 		sin->sin_family = AF_INET;
255 		sin->sin_addr.s_addr = dap->uda_ipaddr_be;
256 
257 		address_string = calloc(1, USDF_ADDR_STR_LEN);
258 		fi->src_addr = address_string;
259 		fi->src_addrlen = USDF_ADDR_STR_LEN;
260 
261 		usdf_addr_tostr(sin, fi->src_addr, &fi->src_addrlen);
262 		free(sin);
263 	} else {
264 	/* Otherwise, it is already in string format.
265 	 * Just copy it.
266 	 */
267 		address_string = strdup(src);
268 		if (NULL == address_string)
269 			return -FI_ENOMEM;
270 		fi->src_addr = address_string;
271 		fi->src_addrlen = strlen(address_string);
272 	}
273 
274 	/* Same goes for dest. */
275 	if (dest != NULL) {
276 		address_string = strdup(dest);
277 		fi->dest_addr = address_string;
278 		fi->dest_addrlen = strlen(address_string);
279 	}
280 
281 	return FI_SUCCESS;
282 }
283 static int
usdf_fill_addr_info(struct fi_info * fi,uint32_t addr_format,void * src,void * dest,struct usd_device_attrs * dap)284 usdf_fill_addr_info(struct fi_info *fi, uint32_t addr_format,
285 		void *src, void *dest, struct usd_device_attrs *dap)
286 {
287 	int ret;
288 
289 	if (addr_format != FI_FORMAT_UNSPEC) {
290 		fi->addr_format = addr_format;
291 	} else {
292 		fi->addr_format = FI_SOCKADDR_IN;
293 	}
294 
295 	switch (fi->addr_format) {
296 	case FI_SOCKADDR:
297 	case FI_SOCKADDR_IN:
298 		ret = usdf_fill_sockaddr_info(fi, src, dest, dap);
299 		if (ret != FI_SUCCESS)
300 			goto fail;
301 		break;
302 	case FI_ADDR_STR:
303 		ret = usdf_fill_straddr_info(fi, src, dest, dap);
304 		if (ret != FI_SUCCESS)
305 			goto fail;
306 		break;
307 	default:
308 		ret = -FI_ENODATA;
309 		goto fail;
310 	}
311 
312 	return 0;
313 
314 fail:
315 	return ret;		// fi_freeinfo() in caller frees all
316 }
317 
validate_modebits(uint32_t version,const struct fi_info * hints,uint64_t supported,uint64_t * mode_out)318 static int validate_modebits(uint32_t version, const struct fi_info *hints,
319 			       uint64_t supported, uint64_t *mode_out)
320 {
321 	uint64_t mode;
322 
323 	/* If there is no hints, return everything we supported. */
324 	if (!hints) {
325 		*mode_out = supported;
326 		return FI_SUCCESS;
327 	}
328 
329 	mode = hints->mode & supported;
330 
331 	/* Before version 1.5, FI_LOCAL_MR is a requirement. */
332 	if (FI_VERSION_LT(version, FI_VERSION(1, 5))) {
333 		if ((mode & FI_LOCAL_MR) == 0)
334 			return -FI_ENODATA;
335 	}
336 
337 	*mode_out = mode;
338 
339 	return FI_SUCCESS;
340 }
341 
usdf_alloc_fid_nic(struct fi_info * fi,struct usd_device_attrs * dap)342 static int usdf_alloc_fid_nic(struct fi_info *fi,
343 			struct usd_device_attrs *dap)
344 {
345 	int ret;
346 	struct fid_nic *nic = NULL;
347 	struct fi_device_attr *da = NULL;
348 	struct fi_link_attr *la = NULL;
349 
350 	nic = ofi_nic_dup(NULL);
351 	if (!nic)
352 		goto nomem;
353 
354 	da = nic->device_attr;
355 	da->name = strdup(dap->uda_devname);
356 	if (!da->name)
357 		goto nomem;
358 	ret = asprintf(&da->device_id, "%s (%s)",
359 		usd_devid_to_pid(dap->uda_vendor_id,
360 				dap->uda_device_id),
361 		usd_devid_to_nicname(dap->uda_vendor_id,
362 				dap->uda_device_id));
363 	if (ret < 0)
364 		goto nomem;
365 	ret = asprintf(&da->device_version, "0x%x", dap->uda_vendor_part_id);
366 	if (ret < 0)
367 		goto nomem;
368 	ret = asprintf(&da->vendor_id, "0x%x", dap->uda_vendor_id);
369 	if (ret < 0)
370 		goto nomem;
371 	da->driver = strdup("usnic_verbs");
372 	if (!da->driver)
373 		goto nomem;
374 	da->firmware = strdup(dap->uda_firmware);
375 	if (!da->firmware)
376 		goto nomem;
377 
378 	// usnic does not currently expose PCI bus information, so we
379 	// set the bus type to unknown.
380 	nic->bus_attr->bus_type = FI_BUS_UNKNOWN;
381 
382 	la = nic->link_attr;
383 
384 	socklen_t size = INET_ADDRSTRLEN;
385 	la->address = calloc(1, size);
386 	if (!la->address)
387 		goto nomem;
388 	inet_ntop(AF_INET, &dap->uda_ipaddr_be, la->address, size);
389 	la->mtu = dap->uda_mtu;
390 	la->speed = dap->uda_bandwidth;
391 	switch (dap->uda_link_state) {
392 	case USD_LINK_UP:
393 		la->state = FI_LINK_UP;
394 		break;
395 	case USD_LINK_DOWN:
396 		la->state = FI_LINK_DOWN;
397 		break;
398 	default:
399 		la->state = FI_LINK_UNKNOWN;
400 		break;
401 	}
402 	la->network_type = strdup("Ethernet");
403 	if (!la->network_type)
404 		goto nomem;
405 
406 	fi->nic = nic;
407 
408 	return FI_SUCCESS;
409 
410 nomem:
411 	if (nic)
412 		fi_close(&nic->fid);
413 	return -FI_ENOMEM;
414 }
415 
usdf_fill_info_dgram(uint32_t version,const struct fi_info * hints,void * src,void * dest,struct usd_device_attrs * dap,struct fi_info ** fi_first,struct fi_info ** fi_last)416 static int usdf_fill_info_dgram(
417 	uint32_t version,
418 	const struct fi_info *hints,
419 	void *src,
420 	void *dest,
421 	struct usd_device_attrs *dap,
422 	struct fi_info **fi_first,
423 	struct fi_info **fi_last)
424 {
425 	struct fi_info *fi;
426 	struct fi_fabric_attr *fattrp;
427 	uint32_t addr_format;
428 	int ret;
429 
430 	fi = fi_allocinfo();
431 	if (fi == NULL) {
432 		ret = -FI_ENOMEM;
433 		goto fail;
434 	}
435 
436 	fi->caps = USDF_DGRAM_CAPS;
437 
438 	ret = validate_modebits(version, hints,
439 				  USDF_DGRAM_SUPP_MODE, &fi->mode);
440 	if (ret)
441 		goto fail;
442 
443 	if (hints != NULL) {
444 		addr_format = hints->addr_format;
445 
446 		/* check that we are capable of what's requested */
447 		if ((hints->caps & ~USDF_DGRAM_CAPS) != 0) {
448 			ret = -FI_ENODATA;
449 			goto fail;
450 		}
451 
452 		fi->handle = hints->handle;
453 	} else {
454 		addr_format = FI_FORMAT_UNSPEC;
455 	}
456 	fi->ep_attr->type = FI_EP_DGRAM;
457 
458 	ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap);
459 	if (ret != 0) {
460 		goto fail;
461 	}
462 
463 	/* fabric attrs */
464 	fattrp = fi->fabric_attr;
465 	ret = usdf_fabric_getname(version, dap, &fattrp->name);
466 	if (ret < 0 || fattrp->name == NULL) {
467 		ret = -FI_ENOMEM;
468 		goto fail;
469 	}
470 
471 	if (fi->mode & FI_MSG_PREFIX) {
472 		if (FI_VERSION_GE(version, FI_VERSION(1, 1)))
473 			fi->ep_attr->msg_prefix_size = USDF_HDR_BUF_ENTRY;
474 		else
475 			fi->mode &= ~FI_MSG_PREFIX;
476 	}
477 
478 	ret = usdf_dgram_fill_ep_attr(version, hints, fi, dap);
479 	if (ret)
480 		goto fail;
481 
482 	ret = usdf_dgram_fill_dom_attr(version, hints, fi, dap);
483 	if (ret)
484 		goto fail;
485 
486 	ret = usdf_dgram_fill_tx_attr(version, hints, fi, dap);
487 	if (ret)
488 		goto fail;
489 
490 	ret = usdf_dgram_fill_rx_attr(version, hints, fi, dap);
491 	if (ret)
492 		goto fail;
493 
494 	ret = usdf_alloc_fid_nic(fi, dap);
495 	if (ret)
496 		goto fail;
497 
498 	/* add to tail of list */
499 	if (*fi_first == NULL) {
500 		*fi_first = fi;
501 	} else {
502 		(*fi_last)->next = fi;
503 	}
504 	*fi_last = fi;
505 
506 	return 0;
507 
508 fail:
509 	if (fi != NULL) {
510 		fi_freeinfo(fi);
511 	}
512 	return ret;
513 }
514 
515 static int
usdf_get_devinfo(void)516 usdf_get_devinfo(void)
517 {
518 	struct usdf_usnic_info *dp;
519 	struct usdf_dev_entry *dep;
520 	struct usd_open_params params;
521 	int ret;
522 	int d;
523 
524 	assert(__usdf_devinfo == NULL);
525 
526 	dp = calloc(1, sizeof(*dp));
527 	if (dp == NULL) {
528 		ret = -FI_ENOMEM;
529 		goto fail;
530 	}
531 	__usdf_devinfo = dp;
532 
533 	dp->uu_num_devs = USD_MAX_DEVICES;
534 	ret = usd_get_device_list(dp->uu_devs, &dp->uu_num_devs);
535 	if (ret != 0) {
536 		dp->uu_num_devs = 0;
537 		goto fail;
538 	}
539 
540 	for (d = 0; d < dp->uu_num_devs; ++d) {
541 		dep = &dp->uu_info[d];
542 
543 		memset(&params, 0, sizeof(params));
544 		params.flags = UOPF_SKIP_PD_ALLOC;
545 		params.cmd_fd = -1;
546 		params.context = NULL;
547 		ret = usd_open_with_params(dp->uu_devs[d].ude_devname,
548 						&params, &dep->ue_dev);
549 		if (ret != 0) {
550 			continue;
551 		}
552 
553 		ret = usd_get_device_attrs(dep->ue_dev, &dep->ue_dattr);
554 		if (ret != 0) {
555 			continue;
556 		}
557 
558 		dep->ue_dev_ok = 1;	/* this device is OK */
559 
560 		usd_close(dep->ue_dev);
561 		dep->ue_dev = NULL;
562 	}
563 	return 0;
564 
565 fail:
566 	return ret;
567 }
568 
569 static int
usdf_get_distance(struct usd_device_attrs * dap,uint32_t daddr_be,int * metric_o)570 usdf_get_distance(
571     struct usd_device_attrs *dap,
572     uint32_t daddr_be,
573     int *metric_o)
574 {
575     uint32_t nh_ip_addr;
576     int ret;
577 
578     USDF_TRACE("\n");
579 
580     ret = usnic_nl_rt_lookup(dap->uda_ipaddr_be, daddr_be,
581             dap->uda_ifindex, &nh_ip_addr);
582     if (ret != 0) {
583         *metric_o = -1;
584         ret = 0;
585     } else if (nh_ip_addr == 0) {
586         *metric_o = 0;
587     } else {
588         *metric_o = 1;
589     }
590 
591     return ret;
592 }
593 
594 /* Check all things related to a device. Make sure it's okay, the source address
595  * matches the requested address, the destination is reachable from the device,
596  * the device fabric name matches the requested fabric name, and the device
597  * domain name matches the requested domain name.
598  *
599  * @param version Libfabric API version used to verify the domain / fabric name.
600  * @param hints   Hints passed to fi_getinfo.
601  * @param src     Source address being requested.
602  * @param dest    Destination address to communicate with.
603  * @param dep     usNIC device entry being checked.
604  *
605  * @return true on success, false on failure. For debug logging can be enabled
606  *         to see why a device was disqualified.
607  */
usdf_check_device(uint32_t version,const struct fi_info * hints,void * src,void * dest,struct usdf_dev_entry * dep)608 static bool usdf_check_device(uint32_t version, const struct fi_info *hints,
609 			      void *src, void *dest,
610 			      struct usdf_dev_entry *dep)
611 {
612 	char dest_str[INET_ADDRSTRLEN];
613 	char src_str[INET_ADDRSTRLEN];
614 	char dev_str[INET_ADDRSTRLEN];
615 	struct usd_device_attrs *dap;
616 	struct sockaddr_in *sin;
617 	int reachable;
618 	int ret;
619 
620 	reachable = -1;
621 	dap = &dep->ue_dattr;
622 
623 	/* Skip the device if it has problems. */
624 	if (!dep->ue_dev_ok) {
625 		USDF_WARN_SYS(FABRIC, "skipping %s/%s device not ok\n",
626 			      dap->uda_devname, dap->uda_ifname);
627 		return false;
628 	}
629 
630 	/* If the given source address is not INADDR_ANY, compare against the
631 	 * device.
632 	 */
633 	if (src) {
634 		sin = usdf_format_to_sin(hints, src);
635 		if (sin->sin_addr.s_addr != INADDR_ANY) {
636 			if (sin->sin_addr.s_addr != dap->uda_ipaddr_be) {
637 				inet_ntop(AF_INET, &sin->sin_addr.s_addr,
638 					  src_str, sizeof(src_str));
639 				inet_ntop(AF_INET, &dap->uda_ipaddr_be,
640 					  dev_str, sizeof(dev_str));
641 				USDF_WARN_SYS(FABRIC,
642 					      "src addr<%s> != dev addr<%s>\n",
643 					      src_str, dev_str);
644 				goto fail;
645 			}
646 		}
647 
648 		usdf_free_sin_if_needed(hints, sin);
649 	}
650 
651 	/* Check that the given destination address is reachable from the
652 	 * interface.
653 	 */
654 	if (dest) {
655 		sin = usdf_format_to_sin(hints, dest);
656 		if (sin->sin_addr.s_addr != INADDR_ANY) {
657 			ret = usdf_get_distance(dap, sin->sin_addr.s_addr,
658 						&reachable);
659 			if (ret) {
660 				inet_ntop(AF_INET,
661 					  &sin->sin_addr.s_addr, dest_str,
662 					  sizeof(dest_str));
663 				USDF_WARN_SYS(FABRIC,
664 					      "get_distance failed @ %s\n",
665 					      dest_str);
666 				goto fail;
667 			}
668 		}
669 
670 		if (reachable == -1) {
671 			inet_ntop(AF_INET, &sin->sin_addr.s_addr, dest_str,
672 				  sizeof(dest_str));
673 			USDF_WARN_SYS(FABRIC,
674 				      "dest %s unreachable from %s/%s, skipping\n",
675 				      dest_str, dap->uda_devname,
676 				      dap->uda_ifname);
677 			goto fail;
678 		}
679 
680 		usdf_free_sin_if_needed(hints, sin);
681 	}
682 
683 	/* Checks that the fabric name is correct for the given interface. The
684 	 * fabric name contains the CIDR notation for the interface.
685 	 */
686 	if (hints && hints->fabric_attr && hints->fabric_attr->name) {
687 		if (!usdf_fabric_checkname(version, dap,
688 					  hints->fabric_attr->name))
689 			return false;
690 	}
691 
692 	/* Check that the domain name is correct for the given interface. The
693 	 * domain name is the device name.
694 	 */
695 	if (hints && hints->domain_attr && hints->domain_attr->name) {
696 		if (!usdf_domain_checkname(version, dap,
697 					   hints->domain_attr->name))
698 			return false;
699 	}
700 
701 	return true;
702 
703 fail:
704 	usdf_free_sin_if_needed(hints, sin);
705 
706 	return false;
707 }
708 
/*
 * Turn the node/service arguments of fi_getinfo into a source or destination
 * address in the format requested by the hints.
 *
 * With FI_ADDR_STR hints the node is itself an address string and no service
 * may be given; otherwise node and service are resolved with getaddrinfo().
 * The resulting address is stored in *src when FI_SOURCE is set in flags and
 * in *dest otherwise. When both node and service are NULL nothing is done.
 *
 * @param node    Node name or address string, or NULL.
 * @param service Service name or port, or NULL.
 * @param flags   fi_getinfo flags; only FI_SOURCE is examined.
 * @param src     Out: source address, written when FI_SOURCE is set.
 * @param dest    Out: destination address, written otherwise.
 * @param hints   Hints passed to fi_getinfo, or NULL.
 * @param ai      Out: getaddrinfo() result list when name resolution was
 *                used; the caller releases it with freeaddrinfo().
 *
 * @return FI_SUCCESS; -FI_EINVAL for an unusable FI_ADDR_STR request;
 *         -FI_ENODATA when name resolution fails.
 */
static int
usdf_handle_node_and_service(const char *node, const char *service,
		uint64_t flags, void **src, void **dest,
		const struct fi_info *hints, struct addrinfo **ai)
{
	struct addrinfo ai_hints;
	struct sockaddr_in *sin;
	int ret;

	if (node == NULL && service == NULL)
		return FI_SUCCESS;

	if (hints && hints->addr_format == FI_ADDR_STR) {
		/* FI_ADDR_STR can't have service param. */
		if (service)
			return -FI_EINVAL;

		/* NOTE(review): if usdf_format_to_sin() allocates the
		 * sockaddr for string addresses, it is not released in this
		 * function; confirm who owns it.
		 */
		sin = usdf_format_to_sin(hints, node);
		if (!sin)
			/* This could be invalid or no memory. */
			return -FI_EINVAL;
	} else {
		/* The result is used as a sockaddr_in below, so restrict the
		 * lookup to IPv4; an unrestricted lookup may list an IPv6
		 * address first.
		 */
		memset(&ai_hints, 0, sizeof(ai_hints));
		ai_hints.ai_family = AF_INET;

		ret = getaddrinfo(node, service, &ai_hints, ai);
		if (ret != 0) {
			USDF_DBG("getaddrinfo failed: %d: <%s>\n", ret,
				 gai_strerror(ret));
			/* EAI_* values are not fi error codes; the caller
			 * passes the result to fi_strerror(), so report a
			 * libfabric error instead.
			 */
			return -FI_ENODATA;
		}
		sin = (struct sockaddr_in *)(*ai)->ai_addr;
	}

	if (flags & FI_SOURCE)
		*src = usdf_sin_to_format(hints, sin, NULL);
	else
		*dest = usdf_sin_to_format(hints, sin, NULL);

	return FI_SUCCESS;
}
746 
747 static int
usdf_getinfo(uint32_t version,const char * node,const char * service,uint64_t flags,const struct fi_info * hints,struct fi_info ** info)748 usdf_getinfo(uint32_t version, const char *node, const char *service,
749 	       uint64_t flags, const struct fi_info *hints, struct fi_info **info)
750 {
751 	struct usdf_usnic_info *dp;
752 	struct usdf_dev_entry *dep;
753 	struct usd_device_attrs *dap;
754 	struct fi_info *fi_first;
755 	struct fi_info *fi_last;
756 	struct addrinfo *ai;
757 	void *src;
758 	void *dest;
759 	enum fi_ep_type ep_type;
760 	int d;
761 	int ret;
762 
763 	USDF_TRACE("\n");
764 
765 	fi_first = NULL;
766 	fi_last = NULL;
767 	ai = NULL;
768 	src = NULL;
769 	dest = NULL;
770 
771 	/*
772 	 * Get and cache usNIC device info
773 	 */
774 	if (__usdf_devinfo == NULL) {
775 		ret = usdf_get_devinfo();
776 		if (ret != 0) {
777 			USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n",
778 					ret, fi_strerror(-ret));
779 			if (ret == -FI_ENODEV)
780 				ret = -FI_ENODATA;
781 			goto fail;
782 		}
783 	}
784 	dp = __usdf_devinfo;
785 
786 	/* Check the hints up front and fail if they're invalid. */
787 	if (hints) {
788 		ret = usdf_validate_hints(version, hints);
789 		if (ret) {
790 			USDF_WARN_SYS(FABRIC, "hints failed to validate\n");
791 			goto fail;
792 		}
793 	}
794 
795 	/* Get the src and dest if user specified. */
796 	ret = usdf_handle_node_and_service(node, service, flags,
797 					   &src, &dest, hints, &ai);
798 	if (ret) {
799 		USDF_WARN_SYS(FABRIC, "failed to handle node and service.\n");
800 		goto fail;
801 	}
802 
803 	if (hints != NULL) {
804 		if (dest == NULL && hints->dest_addr != NULL)
805 			dest = hints->dest_addr;
806 		if (src == NULL && hints->src_addr != NULL)
807 			src = hints->src_addr;
808 	}
809 
810 	for (d = 0; d < dp->uu_num_devs; ++d) {
811 		dep = &dp->uu_info[d];
812 		dap = &dep->ue_dattr;
813 
814 		/* If the device has an issue or the hints don't match the
815 		 * device information, then skip.
816 		 */
817 		if (!usdf_check_device(version, hints, src, dest, dep))
818 			continue;
819 
820 		if (hints && hints->ep_attr)
821 			ep_type = hints->ep_attr->type;
822 		else
823 			ep_type = FI_EP_UNSPEC;
824 
825 		if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) {
826 			ret = usdf_fill_info_dgram(version, hints, src, dest,
827 					dap, &fi_first, &fi_last);
828 			if (ret != 0 && ret != -FI_ENODATA) {
829 				goto fail;
830 			}
831 		}
832 	}
833 
834 	if (fi_first != NULL) {
835 		*info = fi_first;
836 		ret = 0;
837 	} else {
838 		ret = -FI_ENODATA;
839 	}
840 
841 
842 fail:
843 	if (ai)
844 		freeaddrinfo(ai);
845 
846 	if (ret != 0) {
847 		fi_freeinfo(fi_first);
848 		USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret));
849 	}
850 
851 	return ret;
852 }
853 
854 static int
usdf_fabric_close(fid_t fid)855 usdf_fabric_close(fid_t fid)
856 {
857 	struct usdf_fabric *fp;
858 	int ret;
859 	void *rv;
860 
861 	USDF_TRACE("\n");
862 
863 	fp = fab_fidtou(fid);
864 	if (ofi_atomic_get32(&fp->fab_refcnt) > 0) {
865 		return -FI_EBUSY;
866 	}
867 	/* Tell progression thread to exit */
868 	fp->fab_exit = 1;
869 
870 	free(fp->fab_attr.name);
871 	free(fp->fab_attr.prov_name);
872 
873 	if (fp->fab_thread) {
874 		ret = usdf_fabric_wake_thread(fp);
875 		if (ret != 0) {
876 			return ret;
877 		}
878 		pthread_join(fp->fab_thread, &rv);
879 	}
880 	usdf_timer_deinit(fp);
881 	if (fp->fab_epollfd != OFI_EPOLL_INVALID) {
882 		ofi_epoll_close(fp->fab_epollfd);
883 	}
884 	if (fp->fab_eventfd != -1) {
885 		close(fp->fab_eventfd);
886 	}
887 	if (fp->fab_arp_sockfd != -1) {
888 		close(fp->fab_arp_sockfd);
889 	}
890 
891 	free(fp);
892 	return 0;
893 }
894 
/* Base fid operations installed on every fabric created by
 * usdf_fabric_open(): close and ops_open have provider implementations,
 * while bind and control are wired to fi_no_bind / fi_no_control. */
static struct fi_ops usdf_fi_ops = {
	.size = sizeof(struct fi_ops),
	.close = usdf_fabric_close,
	.bind = fi_no_bind,
	.control = fi_no_control,
	.ops_open = usdf_fabric_ops_open,
};
902 
/* Fabric-level operations installed on every fabric created by
 * usdf_fabric_open(): opening domains, passive endpoints, event queues and
 * wait sets, plus the trywait hook. */
static struct fi_ops_fabric usdf_ops_fabric = {
	.size = sizeof(struct fi_ops_fabric),
	.domain = usdf_domain_open,
	.passive_ep = usdf_pep_open,
	.eq_open = usdf_eq_open,
	.wait_open = usdf_wait_open,
	.trywait = usdf_trywait
};
911 
912 static int
usdf_fabric_open(struct fi_fabric_attr * fattrp,struct fid_fabric ** fabric,void * context)913 usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric,
914 	       void *context)
915 {
916 	struct fid_fabric *ff;
917 	struct usdf_fabric *fp;
918 	struct usdf_usnic_info *dp;
919 	struct usdf_dev_entry *dep;
920 	struct sockaddr_in sin;
921 	int ret;
922 	int d;
923 
924 	USDF_TRACE("\n");
925 
926 	/* Make sure this fabric exists */
927 	dp = __usdf_devinfo;
928 	for (d = 0; d < dp->uu_num_devs; ++d) {
929 		dep = &dp->uu_info[d];
930 		if (dep->ue_dev_ok &&
931 		    usdf_fabric_checkname(0, &(dep->ue_dattr), fattrp->name)) {
932 			break;
933 		}
934 	}
935 	if (d >= dp->uu_num_devs) {
936 		USDF_INFO("device \"%s\" does not exit, returning -FI_ENODEV\n",
937 				fattrp->name);
938 		return -FI_ENODEV;
939 	}
940 
941 	fp = calloc(1, sizeof(*fp));
942 	if (fp == NULL) {
943 		USDF_INFO("unable to allocate memory for fabric\n");
944 		return -FI_ENOMEM;
945 	}
946 	fp->fab_epollfd = OFI_EPOLL_INVALID;
947 	fp->fab_arp_sockfd = -1;
948 	LIST_INIT(&fp->fab_domain_list);
949 
950 	fp->fab_attr.fabric = fab_utof(fp);
951 	fp->fab_attr.name = strdup(fattrp->name);
952 	fp->fab_attr.prov_name = strdup(USDF_PROV_NAME);
953 	fp->fab_attr.prov_version = USDF_PROV_VERSION;
954 	if (fp->fab_attr.name == NULL ||
955 			fp->fab_attr.prov_name == NULL) {
956 		ret = -FI_ENOMEM;
957 		goto fail;
958 	}
959 
960 	fp->fab_fid.fid.fclass = FI_CLASS_FABRIC;
961 	fp->fab_fid.fid.context = context;
962 	fp->fab_fid.fid.ops = &usdf_fi_ops;
963 	fp->fab_fid.ops = &usdf_ops_fabric;
964 
965 	fp->fab_dev_attrs = &dep->ue_dattr;
966 
967 	ret = ofi_epoll_create(&fp->fab_epollfd);
968 	if (ret) {
969 		USDF_INFO("unable to allocate epoll fd\n");
970 		goto fail;
971 	}
972 
973 	fp->fab_eventfd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE);
974 	if (fp->fab_eventfd == -1) {
975 		ret = -errno;
976 		USDF_INFO("unable to allocate event fd\n");
977 		goto fail;
978 	}
979 	fp->fab_poll_item.pi_rtn = usdf_fabric_progression_cb;
980 	fp->fab_poll_item.pi_context = fp;
981 	ret = ofi_epoll_add(fp->fab_epollfd, fp->fab_eventfd, OFI_EPOLL_IN,
982 			    &fp->fab_poll_item);
983 	if (ret) {
984 		USDF_INFO("unable to EPOLL_CTL_ADD\n");
985 		goto fail;
986 	}
987 
988 	/* initialize timer subsystem */
989 	ret = usdf_timer_init(fp);
990 	if (ret != 0) {
991 		USDF_INFO("unable to initialize timer\n");
992 		goto fail;
993 	}
994 
995 	/* create and bind socket for ARP resolution */
996 	memset(&sin, 0, sizeof(sin));
997 	sin.sin_family = AF_INET;
998 	sin.sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be;
999 	fp->fab_arp_sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1000 	if (fp->fab_arp_sockfd == -1) {
1001 		USDF_INFO("unable to create socket\n");
1002 		goto fail;
1003 	}
1004 	ret = bind(fp->fab_arp_sockfd, (struct sockaddr *) &sin, sizeof(sin));
1005 	if (ret == -1) {
1006 		ret = -errno;
1007 		goto fail;
1008 	}
1009 
1010 	ofi_atomic_initialize32(&fp->fab_refcnt, 0);
1011 	ofi_atomic_initialize32(&fp->num_blocked_waiting, 0);
1012 
1013 	ret = pthread_create(&fp->fab_thread, NULL,
1014 			usdf_fabric_progression_thread, fp);
1015 	if (ret != 0) {
1016 		ret = -ret;
1017 		USDF_INFO("unable to create progress thread\n");
1018 		goto fail;
1019 	}
1020 
1021 	fattrp->fabric = fab_utof(fp);
1022 	fattrp->prov_version = USDF_PROV_VERSION;
1023 	*fabric = fab_utof(fp);
1024 	USDF_INFO("successfully opened %s/%s\n", fattrp->name,
1025 			fp->fab_dev_attrs->uda_ifname);
1026 	return 0;
1027 
1028 fail:
1029 	free(fp->fab_attr.name);
1030 	free(fp->fab_attr.prov_name);
1031 	ff = fab_utof(fp);
1032 	usdf_fabric_close(&ff->fid);
1033 	USDF_DBG("returning %d (%s)\n", ret, fi_strerror(-ret));
1034 	return ret;
1035 }
1036 
/* Provider cleanup hook (installed as usdf_ops.cleanup). It only emits a
 * trace message; nothing is released here. */
static void usdf_fini(void)
{
	USDF_TRACE("\n");
}
1041 
/* Provider descriptor returned by the USNIC_INI entry point: name and
 * version information plus the getinfo, fabric-open and cleanup callbacks
 * defined in this file. */
struct fi_provider usdf_ops = {
	.name = USDF_PROV_NAME,
	.version = USDF_PROV_VERSION,
	.fi_version = OFI_VERSION_LATEST,
	.getinfo = usdf_getinfo,
	.fabric = usdf_fabric_open,
	.cleanup =  usdf_fini
};
1050 
/* Provider initialisation entry point. USNIC_INI is a macro defined outside
 * this file that supplies the function header; the body optionally sets up
 * the fake verbs provider (build-time switch) and returns the provider
 * descriptor. */
USNIC_INI
{
#if USNIC_BUILD_FAKE_VERBS_DRIVER
	usdf_setup_fake_ibv_provider();
#endif
	return (&usdf_ops);
}
1058