1 /*
2 * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36
37 #include "config.h"
38
39 #include <arpa/inet.h>
40 #include <asm/types.h>
41 #include <assert.h>
42 #include <errno.h>
43 #include <fcntl.h>
44 #include <netinet/in.h>
45 #include <ofi_epoll.h>
46 #include <sys/eventfd.h>
47 #include <sys/socket.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <stdbool.h>
51 #include <string.h>
52 #include <netdb.h>
53 #include <unistd.h>
54
55 #include <rdma/fabric.h>
56 #include <rdma/fi_cm.h>
57 #include <rdma/fi_domain.h>
58 #include <rdma/fi_endpoint.h>
59 #include <rdma/fi_rma.h>
60 #include <rdma/fi_errno.h>
61 #include "ofi.h"
62 #include "ofi_enosys.h"
63 #include "ofi_prov.h"
64
65 #include "usnic_direct.h"
66 #include "libnl_utils.h"
67
68 #include "usdf.h"
69 #include "usdf_wait.h"
70 #include "fi_ext_usnic.h"
71 #include "usdf_progress.h"
72 #include "usdf_timer.h"
73 #include "usdf_dgram.h"
74 #include "usdf_cm.h"
75
/* Cached usNIC device information. Stays NULL until usdf_get_devinfo()
 * allocates and publishes it.
 */
struct usdf_usnic_info *__usdf_devinfo = NULL;
77
usdf_fabric_getname(uint32_t version,struct usd_device_attrs * dap,char ** name)78 static int usdf_fabric_getname(uint32_t version, struct usd_device_attrs *dap,
79 char **name)
80 {
81 int ret = FI_SUCCESS;
82 char *bufp = NULL;
83 struct in_addr in;
84 char *addrnetw;
85
86 if (FI_VERSION_GE(version, FI_VERSION(1, 4))) {
87 in.s_addr = dap->uda_ipaddr_be & dap->uda_netmask_be;
88 addrnetw = inet_ntoa(in);
89 ret = asprintf(&bufp, "%s/%d", addrnetw, dap->uda_prefixlen);
90 if (ret < 0) {
91 USDF_DBG(
92 "asprintf failed while creating fabric name\n");
93 ret = -ENOMEM;
94 }
95 } else {
96 bufp = strdup(dap->uda_devname);
97 if (!bufp) {
98 USDF_DBG("strdup failed while creating fabric name\n");
99 ret = -errno;
100 }
101 }
102
103 *name = bufp;
104
105 return ret;
106 }
107
/*
 * Check whether a fabric name hint matches the name of a device.
 *
 * @param version Libfabric API version that selects the naming scheme. When
 *                0, the scheme is deduced from the hint itself: a hint
 *                containing '/' is compared against the 1.4 name, anything
 *                else against the 1.3 name.
 * @param dap     Attributes of the device to compare against.
 * @param hint    Fabric name supplied by the caller; NULL never matches.
 *
 * @return true if the hint equals the device's fabric name, false otherwise
 * (including when the reference name cannot be built).
 */
static bool usdf_fabric_checkname(uint32_t version,
		struct usd_device_attrs *dap, const char *hint)
{
	char *reference;
	bool valid;

	/* Without a name there is nothing to compare (and nothing safe to
	 * pass to strstr()/strcmp() below).
	 */
	if (hint == NULL)
		return false;

	USDF_DBG("checking devname: version=%d, devname='%s'\n", version, hint);

	if (version == 0) {
		/* The hint string itself is a kind of version check: before
		 * 1.4 the name was just the device name, while in 1.4 and
		 * later the name is in CIDR notation.
		 */
		if (strstr(hint, "/"))
			return usdf_fabric_checkname(FI_VERSION(1, 4), dap,
						     hint);

		return usdf_fabric_checkname(FI_VERSION(1, 3), dap, hint);
	}

	if (usdf_fabric_getname(version, dap, &reference) < 0)
		return false;

	valid = (strcmp(reference, hint) == 0);
	if (!valid)
		USDF_DBG("hint %s failed to match %s\n", hint, reference);

	free(reference);
	return valid;
}
143
usdf_validate_hints(uint32_t version,const struct fi_info * hints)144 static int usdf_validate_hints(uint32_t version, const struct fi_info *hints)
145 {
146 struct fi_fabric_attr *fattrp;
147 size_t size;
148
149 switch (hints->addr_format) {
150 case FI_FORMAT_UNSPEC:
151 case FI_SOCKADDR_IN:
152 size = sizeof(struct sockaddr_in);
153 break;
154 case FI_SOCKADDR:
155 size = sizeof(struct sockaddr);
156 break;
157 case FI_ADDR_STR:
158 if (hints->src_addr != NULL &&
159 strlen((char *)hints->src_addr) > USDF_ADDR_STR_LEN)
160 return -FI_ENODATA;
161
162 if (hints->dest_addr != NULL &&
163 strlen((char *)hints->dest_addr) > USDF_ADDR_STR_LEN)
164 return -FI_ENODATA;
165
166 goto skip_sockaddr_size_check;
167 default:
168 return -FI_ENODATA;
169 }
170
171 if (hints->src_addr != NULL && hints->src_addrlen < size) {
172 return -FI_ENODATA;
173 }
174 if (hints->dest_addr != NULL && hints->dest_addrlen < size) {
175 return -FI_ENODATA;
176 }
177
178 skip_sockaddr_size_check:
179 if (hints->ep_attr != NULL) {
180 switch (hints->ep_attr->protocol) {
181 case FI_PROTO_UNSPEC:
182 case FI_PROTO_UDP:
183 case FI_PROTO_RUDP:
184 break;
185 default:
186 return -FI_ENODATA;
187 }
188
189 if (hints->ep_attr->auth_key || hints->ep_attr->auth_key_size) {
190 USDF_WARN_SYS(EP_CTRL,
191 "\"authorization key\" is not supported in this provider.\n");
192 return -FI_ENODATA;
193 }
194 }
195
196 fattrp = hints->fabric_attr;
197 if (fattrp != NULL) {
198 if (fattrp->prov_version != 0 &&
199 fattrp->prov_version != USDF_PROV_VERSION) {
200 return -FI_ENODATA;
201 }
202 }
203 return FI_SUCCESS;
204 }
205
206 static int
usdf_fill_sockaddr_info(struct fi_info * fi,struct sockaddr_in * src,struct sockaddr_in * dest,struct usd_device_attrs * dap)207 usdf_fill_sockaddr_info(struct fi_info *fi,
208 struct sockaddr_in *src, struct sockaddr_in *dest,
209 struct usd_device_attrs *dap)
210 {
211 int ret;
212 struct sockaddr_in *sin;
213
214 sin = calloc(1, sizeof(*sin));
215 fi->src_addr = sin;
216 if (sin == NULL) {
217 ret = -FI_ENOMEM;
218 return ret;
219 }
220 fi->src_addrlen = sizeof(struct sockaddr_in);
221 sin->sin_family = AF_INET;
222 sin->sin_addr.s_addr = dap->uda_ipaddr_be;
223 if (src != NULL)
224 sin->sin_port = src->sin_port;
225
226 /* copy in dest if specified */
227 if (dest != NULL) {
228 sin = calloc(1, sizeof(*sin));
229 if (NULL == sin) {
230 free(fi->src_addr);
231 return -FI_ENOMEM;
232 }
233 *sin = *dest;
234 fi->dest_addr = sin;
235 fi->dest_addrlen = sizeof(*sin);
236 }
237 return FI_SUCCESS;
238 }
239
240 static int
usdf_fill_straddr_info(struct fi_info * fi,char * src,char * dest,struct usd_device_attrs * dap)241 usdf_fill_straddr_info(struct fi_info *fi,
242 char *src, char *dest, struct usd_device_attrs *dap)
243 {
244 char *address_string;
245 struct sockaddr_in *sin;
246
247 /* If NULL, we have to create the sockaddr_in
248 * and convert it to string format.
249 */
250 if (src == NULL) {
251 sin = calloc(1, sizeof(*sin));
252 if (NULL == sin)
253 return -FI_ENOMEM;
254 sin->sin_family = AF_INET;
255 sin->sin_addr.s_addr = dap->uda_ipaddr_be;
256
257 address_string = calloc(1, USDF_ADDR_STR_LEN);
258 fi->src_addr = address_string;
259 fi->src_addrlen = USDF_ADDR_STR_LEN;
260
261 usdf_addr_tostr(sin, fi->src_addr, &fi->src_addrlen);
262 free(sin);
263 } else {
264 /* Otherwise, it is already in string format.
265 * Just copy it.
266 */
267 address_string = strdup(src);
268 if (NULL == address_string)
269 return -FI_ENOMEM;
270 fi->src_addr = address_string;
271 fi->src_addrlen = strlen(address_string);
272 }
273
274 /* Same goes for dest. */
275 if (dest != NULL) {
276 address_string = strdup(dest);
277 fi->dest_addr = address_string;
278 fi->dest_addrlen = strlen(address_string);
279 }
280
281 return FI_SUCCESS;
282 }
283 static int
usdf_fill_addr_info(struct fi_info * fi,uint32_t addr_format,void * src,void * dest,struct usd_device_attrs * dap)284 usdf_fill_addr_info(struct fi_info *fi, uint32_t addr_format,
285 void *src, void *dest, struct usd_device_attrs *dap)
286 {
287 int ret;
288
289 if (addr_format != FI_FORMAT_UNSPEC) {
290 fi->addr_format = addr_format;
291 } else {
292 fi->addr_format = FI_SOCKADDR_IN;
293 }
294
295 switch (fi->addr_format) {
296 case FI_SOCKADDR:
297 case FI_SOCKADDR_IN:
298 ret = usdf_fill_sockaddr_info(fi, src, dest, dap);
299 if (ret != FI_SUCCESS)
300 goto fail;
301 break;
302 case FI_ADDR_STR:
303 ret = usdf_fill_straddr_info(fi, src, dest, dap);
304 if (ret != FI_SUCCESS)
305 goto fail;
306 break;
307 default:
308 ret = -FI_ENODATA;
309 goto fail;
310 }
311
312 return 0;
313
314 fail:
315 return ret; // fi_freeinfo() in caller frees all
316 }
317
validate_modebits(uint32_t version,const struct fi_info * hints,uint64_t supported,uint64_t * mode_out)318 static int validate_modebits(uint32_t version, const struct fi_info *hints,
319 uint64_t supported, uint64_t *mode_out)
320 {
321 uint64_t mode;
322
323 /* If there is no hints, return everything we supported. */
324 if (!hints) {
325 *mode_out = supported;
326 return FI_SUCCESS;
327 }
328
329 mode = hints->mode & supported;
330
331 /* Before version 1.5, FI_LOCAL_MR is a requirement. */
332 if (FI_VERSION_LT(version, FI_VERSION(1, 5))) {
333 if ((mode & FI_LOCAL_MR) == 0)
334 return -FI_ENODATA;
335 }
336
337 *mode_out = mode;
338
339 return FI_SUCCESS;
340 }
341
usdf_alloc_fid_nic(struct fi_info * fi,struct usd_device_attrs * dap)342 static int usdf_alloc_fid_nic(struct fi_info *fi,
343 struct usd_device_attrs *dap)
344 {
345 int ret;
346 struct fid_nic *nic = NULL;
347 struct fi_device_attr *da = NULL;
348 struct fi_link_attr *la = NULL;
349
350 nic = ofi_nic_dup(NULL);
351 if (!nic)
352 goto nomem;
353
354 da = nic->device_attr;
355 da->name = strdup(dap->uda_devname);
356 if (!da->name)
357 goto nomem;
358 ret = asprintf(&da->device_id, "%s (%s)",
359 usd_devid_to_pid(dap->uda_vendor_id,
360 dap->uda_device_id),
361 usd_devid_to_nicname(dap->uda_vendor_id,
362 dap->uda_device_id));
363 if (ret < 0)
364 goto nomem;
365 ret = asprintf(&da->device_version, "0x%x", dap->uda_vendor_part_id);
366 if (ret < 0)
367 goto nomem;
368 ret = asprintf(&da->vendor_id, "0x%x", dap->uda_vendor_id);
369 if (ret < 0)
370 goto nomem;
371 da->driver = strdup("usnic_verbs");
372 if (!da->driver)
373 goto nomem;
374 da->firmware = strdup(dap->uda_firmware);
375 if (!da->firmware)
376 goto nomem;
377
378 // usnic does not currently expose PCI bus information, so we
379 // set the bus type to unknown.
380 nic->bus_attr->bus_type = FI_BUS_UNKNOWN;
381
382 la = nic->link_attr;
383
384 socklen_t size = INET_ADDRSTRLEN;
385 la->address = calloc(1, size);
386 if (!la->address)
387 goto nomem;
388 inet_ntop(AF_INET, &dap->uda_ipaddr_be, la->address, size);
389 la->mtu = dap->uda_mtu;
390 la->speed = dap->uda_bandwidth;
391 switch (dap->uda_link_state) {
392 case USD_LINK_UP:
393 la->state = FI_LINK_UP;
394 break;
395 case USD_LINK_DOWN:
396 la->state = FI_LINK_DOWN;
397 break;
398 default:
399 la->state = FI_LINK_UNKNOWN;
400 break;
401 }
402 la->network_type = strdup("Ethernet");
403 if (!la->network_type)
404 goto nomem;
405
406 fi->nic = nic;
407
408 return FI_SUCCESS;
409
410 nomem:
411 if (nic)
412 fi_close(&nic->fid);
413 return -FI_ENOMEM;
414 }
415
usdf_fill_info_dgram(uint32_t version,const struct fi_info * hints,void * src,void * dest,struct usd_device_attrs * dap,struct fi_info ** fi_first,struct fi_info ** fi_last)416 static int usdf_fill_info_dgram(
417 uint32_t version,
418 const struct fi_info *hints,
419 void *src,
420 void *dest,
421 struct usd_device_attrs *dap,
422 struct fi_info **fi_first,
423 struct fi_info **fi_last)
424 {
425 struct fi_info *fi;
426 struct fi_fabric_attr *fattrp;
427 uint32_t addr_format;
428 int ret;
429
430 fi = fi_allocinfo();
431 if (fi == NULL) {
432 ret = -FI_ENOMEM;
433 goto fail;
434 }
435
436 fi->caps = USDF_DGRAM_CAPS;
437
438 ret = validate_modebits(version, hints,
439 USDF_DGRAM_SUPP_MODE, &fi->mode);
440 if (ret)
441 goto fail;
442
443 if (hints != NULL) {
444 addr_format = hints->addr_format;
445
446 /* check that we are capable of what's requested */
447 if ((hints->caps & ~USDF_DGRAM_CAPS) != 0) {
448 ret = -FI_ENODATA;
449 goto fail;
450 }
451
452 fi->handle = hints->handle;
453 } else {
454 addr_format = FI_FORMAT_UNSPEC;
455 }
456 fi->ep_attr->type = FI_EP_DGRAM;
457
458 ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap);
459 if (ret != 0) {
460 goto fail;
461 }
462
463 /* fabric attrs */
464 fattrp = fi->fabric_attr;
465 ret = usdf_fabric_getname(version, dap, &fattrp->name);
466 if (ret < 0 || fattrp->name == NULL) {
467 ret = -FI_ENOMEM;
468 goto fail;
469 }
470
471 if (fi->mode & FI_MSG_PREFIX) {
472 if (FI_VERSION_GE(version, FI_VERSION(1, 1)))
473 fi->ep_attr->msg_prefix_size = USDF_HDR_BUF_ENTRY;
474 else
475 fi->mode &= ~FI_MSG_PREFIX;
476 }
477
478 ret = usdf_dgram_fill_ep_attr(version, hints, fi, dap);
479 if (ret)
480 goto fail;
481
482 ret = usdf_dgram_fill_dom_attr(version, hints, fi, dap);
483 if (ret)
484 goto fail;
485
486 ret = usdf_dgram_fill_tx_attr(version, hints, fi, dap);
487 if (ret)
488 goto fail;
489
490 ret = usdf_dgram_fill_rx_attr(version, hints, fi, dap);
491 if (ret)
492 goto fail;
493
494 ret = usdf_alloc_fid_nic(fi, dap);
495 if (ret)
496 goto fail;
497
498 /* add to tail of list */
499 if (*fi_first == NULL) {
500 *fi_first = fi;
501 } else {
502 (*fi_last)->next = fi;
503 }
504 *fi_last = fi;
505
506 return 0;
507
508 fail:
509 if (fi != NULL) {
510 fi_freeinfo(fi);
511 }
512 return ret;
513 }
514
515 static int
usdf_get_devinfo(void)516 usdf_get_devinfo(void)
517 {
518 struct usdf_usnic_info *dp;
519 struct usdf_dev_entry *dep;
520 struct usd_open_params params;
521 int ret;
522 int d;
523
524 assert(__usdf_devinfo == NULL);
525
526 dp = calloc(1, sizeof(*dp));
527 if (dp == NULL) {
528 ret = -FI_ENOMEM;
529 goto fail;
530 }
531 __usdf_devinfo = dp;
532
533 dp->uu_num_devs = USD_MAX_DEVICES;
534 ret = usd_get_device_list(dp->uu_devs, &dp->uu_num_devs);
535 if (ret != 0) {
536 dp->uu_num_devs = 0;
537 goto fail;
538 }
539
540 for (d = 0; d < dp->uu_num_devs; ++d) {
541 dep = &dp->uu_info[d];
542
543 memset(¶ms, 0, sizeof(params));
544 params.flags = UOPF_SKIP_PD_ALLOC;
545 params.cmd_fd = -1;
546 params.context = NULL;
547 ret = usd_open_with_params(dp->uu_devs[d].ude_devname,
548 ¶ms, &dep->ue_dev);
549 if (ret != 0) {
550 continue;
551 }
552
553 ret = usd_get_device_attrs(dep->ue_dev, &dep->ue_dattr);
554 if (ret != 0) {
555 continue;
556 }
557
558 dep->ue_dev_ok = 1; /* this device is OK */
559
560 usd_close(dep->ue_dev);
561 dep->ue_dev = NULL;
562 }
563 return 0;
564
565 fail:
566 return ret;
567 }
568
569 static int
usdf_get_distance(struct usd_device_attrs * dap,uint32_t daddr_be,int * metric_o)570 usdf_get_distance(
571 struct usd_device_attrs *dap,
572 uint32_t daddr_be,
573 int *metric_o)
574 {
575 uint32_t nh_ip_addr;
576 int ret;
577
578 USDF_TRACE("\n");
579
580 ret = usnic_nl_rt_lookup(dap->uda_ipaddr_be, daddr_be,
581 dap->uda_ifindex, &nh_ip_addr);
582 if (ret != 0) {
583 *metric_o = -1;
584 ret = 0;
585 } else if (nh_ip_addr == 0) {
586 *metric_o = 0;
587 } else {
588 *metric_o = 1;
589 }
590
591 return ret;
592 }
593
594 /* Check all things related to a device. Make sure it's okay, the source address
595 * matches the requested address, the destination is reachable from the device,
596 * the device fabric name matches the requested fabric name, and the device
597 * domain name matches the requested domain name.
598 *
599 * @param version Libfabric API version used to verify the domain / fabric name.
600 * @param hints Hints passed to fi_getinfo.
601 * @param src Source address being requested.
602 * @param dest Destination address to communicate with.
603 * @param dep usNIC device entry being checked.
604 *
605 * @return true on success, false on failure. For debug logging can be enabled
606 * to see why a device was disqualified.
607 */
usdf_check_device(uint32_t version,const struct fi_info * hints,void * src,void * dest,struct usdf_dev_entry * dep)608 static bool usdf_check_device(uint32_t version, const struct fi_info *hints,
609 void *src, void *dest,
610 struct usdf_dev_entry *dep)
611 {
612 char dest_str[INET_ADDRSTRLEN];
613 char src_str[INET_ADDRSTRLEN];
614 char dev_str[INET_ADDRSTRLEN];
615 struct usd_device_attrs *dap;
616 struct sockaddr_in *sin;
617 int reachable;
618 int ret;
619
620 reachable = -1;
621 dap = &dep->ue_dattr;
622
623 /* Skip the device if it has problems. */
624 if (!dep->ue_dev_ok) {
625 USDF_WARN_SYS(FABRIC, "skipping %s/%s device not ok\n",
626 dap->uda_devname, dap->uda_ifname);
627 return false;
628 }
629
630 /* If the given source address is not INADDR_ANY, compare against the
631 * device.
632 */
633 if (src) {
634 sin = usdf_format_to_sin(hints, src);
635 if (sin->sin_addr.s_addr != INADDR_ANY) {
636 if (sin->sin_addr.s_addr != dap->uda_ipaddr_be) {
637 inet_ntop(AF_INET, &sin->sin_addr.s_addr,
638 src_str, sizeof(src_str));
639 inet_ntop(AF_INET, &dap->uda_ipaddr_be,
640 dev_str, sizeof(dev_str));
641 USDF_WARN_SYS(FABRIC,
642 "src addr<%s> != dev addr<%s>\n",
643 src_str, dev_str);
644 goto fail;
645 }
646 }
647
648 usdf_free_sin_if_needed(hints, sin);
649 }
650
651 /* Check that the given destination address is reachable from the
652 * interface.
653 */
654 if (dest) {
655 sin = usdf_format_to_sin(hints, dest);
656 if (sin->sin_addr.s_addr != INADDR_ANY) {
657 ret = usdf_get_distance(dap, sin->sin_addr.s_addr,
658 &reachable);
659 if (ret) {
660 inet_ntop(AF_INET,
661 &sin->sin_addr.s_addr, dest_str,
662 sizeof(dest_str));
663 USDF_WARN_SYS(FABRIC,
664 "get_distance failed @ %s\n",
665 dest_str);
666 goto fail;
667 }
668 }
669
670 if (reachable == -1) {
671 inet_ntop(AF_INET, &sin->sin_addr.s_addr, dest_str,
672 sizeof(dest_str));
673 USDF_WARN_SYS(FABRIC,
674 "dest %s unreachable from %s/%s, skipping\n",
675 dest_str, dap->uda_devname,
676 dap->uda_ifname);
677 goto fail;
678 }
679
680 usdf_free_sin_if_needed(hints, sin);
681 }
682
683 /* Checks that the fabric name is correct for the given interface. The
684 * fabric name contains the CIDR notation for the interface.
685 */
686 if (hints && hints->fabric_attr && hints->fabric_attr->name) {
687 if (!usdf_fabric_checkname(version, dap,
688 hints->fabric_attr->name))
689 return false;
690 }
691
692 /* Check that the domain name is correct for the given interface. The
693 * domain name is the device name.
694 */
695 if (hints && hints->domain_attr && hints->domain_attr->name) {
696 if (!usdf_domain_checkname(version, dap,
697 hints->domain_attr->name))
698 return false;
699 }
700
701 return true;
702
703 fail:
704 usdf_free_sin_if_needed(hints, sin);
705
706 return false;
707 }
708
/*
 * Turn the node/service arguments of fi_getinfo into a source or
 * destination address in the format requested by the hints.
 *
 * With FI_ADDR_STR hints, node is parsed as an address string and a service
 * is not allowed. Otherwise node/service are resolved with getaddrinfo(),
 * restricted to IPv4 because the result is used as a sockaddr_in.
 *
 * @param node    Node name or address string, may be NULL.
 * @param service Service name or port, may be NULL.
 * @param flags   fi_getinfo flags; FI_SOURCE selects *src as the output,
 *                otherwise *dest is set.
 * @param src     Receives the source address when FI_SOURCE is set.
 * @param dest    Receives the destination address otherwise.
 * @param hints   Hints passed to fi_getinfo, or NULL.
 * @param ai      Receives the getaddrinfo() result, if one was produced;
 *                the caller releases it with freeaddrinfo().
 *
 * @return FI_SUCCESS on success (including when neither node nor service is
 * given), -FI_EINVAL for an invalid FI_ADDR_STR request, or the non-zero
 * code returned by getaddrinfo().
 * NOTE(review): the getaddrinfo() code is an EAI_* value, not an fi_errno;
 * callers format it with fi_strerror() - confirm whether it should be mapped.
 */
static int
usdf_handle_node_and_service(const char *node, const char *service,
		uint64_t flags, void **src, void **dest,
		const struct fi_info *hints, struct addrinfo **ai)
{
	struct addrinfo ai_hints;
	struct sockaddr_in *sin;
	int ret;

	if (node == NULL && service == NULL)
		return FI_SUCCESS;

	if (hints && hints->addr_format == FI_ADDR_STR) {
		/* FI_ADDR_STR can't have service param. */
		if (service)
			return -FI_EINVAL;

		sin = usdf_format_to_sin(hints, node);

		if (!sin)
			/* This could be invalid or no memory. */
			return -FI_EINVAL;
	} else {
		/* Only IPv4 results are usable: the first result is cast to
		 * sockaddr_in below, which would misread an IPv6 sockaddr.
		 */
		memset(&ai_hints, 0, sizeof(ai_hints));
		ai_hints.ai_family = AF_INET;

		ret = getaddrinfo(node, service, &ai_hints, ai);
		if (ret != 0) {
			USDF_DBG("getaddrinfo failed: %d: <%s>\n", ret,
				 gai_strerror(ret));
			return ret;
		}
		sin = (struct sockaddr_in *)(*ai)->ai_addr;
	}

	if (flags & FI_SOURCE)
		*src = usdf_sin_to_format(hints, sin, NULL);
	else
		*dest = usdf_sin_to_format(hints, sin, NULL);

	return FI_SUCCESS;
}
746
747 static int
usdf_getinfo(uint32_t version,const char * node,const char * service,uint64_t flags,const struct fi_info * hints,struct fi_info ** info)748 usdf_getinfo(uint32_t version, const char *node, const char *service,
749 uint64_t flags, const struct fi_info *hints, struct fi_info **info)
750 {
751 struct usdf_usnic_info *dp;
752 struct usdf_dev_entry *dep;
753 struct usd_device_attrs *dap;
754 struct fi_info *fi_first;
755 struct fi_info *fi_last;
756 struct addrinfo *ai;
757 void *src;
758 void *dest;
759 enum fi_ep_type ep_type;
760 int d;
761 int ret;
762
763 USDF_TRACE("\n");
764
765 fi_first = NULL;
766 fi_last = NULL;
767 ai = NULL;
768 src = NULL;
769 dest = NULL;
770
771 /*
772 * Get and cache usNIC device info
773 */
774 if (__usdf_devinfo == NULL) {
775 ret = usdf_get_devinfo();
776 if (ret != 0) {
777 USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n",
778 ret, fi_strerror(-ret));
779 if (ret == -FI_ENODEV)
780 ret = -FI_ENODATA;
781 goto fail;
782 }
783 }
784 dp = __usdf_devinfo;
785
786 /* Check the hints up front and fail if they're invalid. */
787 if (hints) {
788 ret = usdf_validate_hints(version, hints);
789 if (ret) {
790 USDF_WARN_SYS(FABRIC, "hints failed to validate\n");
791 goto fail;
792 }
793 }
794
795 /* Get the src and dest if user specified. */
796 ret = usdf_handle_node_and_service(node, service, flags,
797 &src, &dest, hints, &ai);
798 if (ret) {
799 USDF_WARN_SYS(FABRIC, "failed to handle node and service.\n");
800 goto fail;
801 }
802
803 if (hints != NULL) {
804 if (dest == NULL && hints->dest_addr != NULL)
805 dest = hints->dest_addr;
806 if (src == NULL && hints->src_addr != NULL)
807 src = hints->src_addr;
808 }
809
810 for (d = 0; d < dp->uu_num_devs; ++d) {
811 dep = &dp->uu_info[d];
812 dap = &dep->ue_dattr;
813
814 /* If the device has an issue or the hints don't match the
815 * device information, then skip.
816 */
817 if (!usdf_check_device(version, hints, src, dest, dep))
818 continue;
819
820 if (hints && hints->ep_attr)
821 ep_type = hints->ep_attr->type;
822 else
823 ep_type = FI_EP_UNSPEC;
824
825 if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) {
826 ret = usdf_fill_info_dgram(version, hints, src, dest,
827 dap, &fi_first, &fi_last);
828 if (ret != 0 && ret != -FI_ENODATA) {
829 goto fail;
830 }
831 }
832 }
833
834 if (fi_first != NULL) {
835 *info = fi_first;
836 ret = 0;
837 } else {
838 ret = -FI_ENODATA;
839 }
840
841
842 fail:
843 if (ai)
844 freeaddrinfo(ai);
845
846 if (ret != 0) {
847 fi_freeinfo(fi_first);
848 USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret));
849 }
850
851 return ret;
852 }
853
854 static int
usdf_fabric_close(fid_t fid)855 usdf_fabric_close(fid_t fid)
856 {
857 struct usdf_fabric *fp;
858 int ret;
859 void *rv;
860
861 USDF_TRACE("\n");
862
863 fp = fab_fidtou(fid);
864 if (ofi_atomic_get32(&fp->fab_refcnt) > 0) {
865 return -FI_EBUSY;
866 }
867 /* Tell progression thread to exit */
868 fp->fab_exit = 1;
869
870 free(fp->fab_attr.name);
871 free(fp->fab_attr.prov_name);
872
873 if (fp->fab_thread) {
874 ret = usdf_fabric_wake_thread(fp);
875 if (ret != 0) {
876 return ret;
877 }
878 pthread_join(fp->fab_thread, &rv);
879 }
880 usdf_timer_deinit(fp);
881 if (fp->fab_epollfd != OFI_EPOLL_INVALID) {
882 ofi_epoll_close(fp->fab_epollfd);
883 }
884 if (fp->fab_eventfd != -1) {
885 close(fp->fab_eventfd);
886 }
887 if (fp->fab_arp_sockfd != -1) {
888 close(fp->fab_arp_sockfd);
889 }
890
891 free(fp);
892 return 0;
893 }
894
895 static struct fi_ops usdf_fi_ops = {
896 .size = sizeof(struct fi_ops),
897 .close = usdf_fabric_close,
898 .bind = fi_no_bind,
899 .control = fi_no_control,
900 .ops_open = usdf_fabric_ops_open,
901 };
902
903 static struct fi_ops_fabric usdf_ops_fabric = {
904 .size = sizeof(struct fi_ops_fabric),
905 .domain = usdf_domain_open,
906 .passive_ep = usdf_pep_open,
907 .eq_open = usdf_eq_open,
908 .wait_open = usdf_wait_open,
909 .trywait = usdf_trywait
910 };
911
912 static int
usdf_fabric_open(struct fi_fabric_attr * fattrp,struct fid_fabric ** fabric,void * context)913 usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric,
914 void *context)
915 {
916 struct fid_fabric *ff;
917 struct usdf_fabric *fp;
918 struct usdf_usnic_info *dp;
919 struct usdf_dev_entry *dep;
920 struct sockaddr_in sin;
921 int ret;
922 int d;
923
924 USDF_TRACE("\n");
925
926 /* Make sure this fabric exists */
927 dp = __usdf_devinfo;
928 for (d = 0; d < dp->uu_num_devs; ++d) {
929 dep = &dp->uu_info[d];
930 if (dep->ue_dev_ok &&
931 usdf_fabric_checkname(0, &(dep->ue_dattr), fattrp->name)) {
932 break;
933 }
934 }
935 if (d >= dp->uu_num_devs) {
936 USDF_INFO("device \"%s\" does not exit, returning -FI_ENODEV\n",
937 fattrp->name);
938 return -FI_ENODEV;
939 }
940
941 fp = calloc(1, sizeof(*fp));
942 if (fp == NULL) {
943 USDF_INFO("unable to allocate memory for fabric\n");
944 return -FI_ENOMEM;
945 }
946 fp->fab_epollfd = OFI_EPOLL_INVALID;
947 fp->fab_arp_sockfd = -1;
948 LIST_INIT(&fp->fab_domain_list);
949
950 fp->fab_attr.fabric = fab_utof(fp);
951 fp->fab_attr.name = strdup(fattrp->name);
952 fp->fab_attr.prov_name = strdup(USDF_PROV_NAME);
953 fp->fab_attr.prov_version = USDF_PROV_VERSION;
954 if (fp->fab_attr.name == NULL ||
955 fp->fab_attr.prov_name == NULL) {
956 ret = -FI_ENOMEM;
957 goto fail;
958 }
959
960 fp->fab_fid.fid.fclass = FI_CLASS_FABRIC;
961 fp->fab_fid.fid.context = context;
962 fp->fab_fid.fid.ops = &usdf_fi_ops;
963 fp->fab_fid.ops = &usdf_ops_fabric;
964
965 fp->fab_dev_attrs = &dep->ue_dattr;
966
967 ret = ofi_epoll_create(&fp->fab_epollfd);
968 if (ret) {
969 USDF_INFO("unable to allocate epoll fd\n");
970 goto fail;
971 }
972
973 fp->fab_eventfd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE);
974 if (fp->fab_eventfd == -1) {
975 ret = -errno;
976 USDF_INFO("unable to allocate event fd\n");
977 goto fail;
978 }
979 fp->fab_poll_item.pi_rtn = usdf_fabric_progression_cb;
980 fp->fab_poll_item.pi_context = fp;
981 ret = ofi_epoll_add(fp->fab_epollfd, fp->fab_eventfd, OFI_EPOLL_IN,
982 &fp->fab_poll_item);
983 if (ret) {
984 USDF_INFO("unable to EPOLL_CTL_ADD\n");
985 goto fail;
986 }
987
988 /* initialize timer subsystem */
989 ret = usdf_timer_init(fp);
990 if (ret != 0) {
991 USDF_INFO("unable to initialize timer\n");
992 goto fail;
993 }
994
995 /* create and bind socket for ARP resolution */
996 memset(&sin, 0, sizeof(sin));
997 sin.sin_family = AF_INET;
998 sin.sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be;
999 fp->fab_arp_sockfd = socket(AF_INET, SOCK_DGRAM, 0);
1000 if (fp->fab_arp_sockfd == -1) {
1001 USDF_INFO("unable to create socket\n");
1002 goto fail;
1003 }
1004 ret = bind(fp->fab_arp_sockfd, (struct sockaddr *) &sin, sizeof(sin));
1005 if (ret == -1) {
1006 ret = -errno;
1007 goto fail;
1008 }
1009
1010 ofi_atomic_initialize32(&fp->fab_refcnt, 0);
1011 ofi_atomic_initialize32(&fp->num_blocked_waiting, 0);
1012
1013 ret = pthread_create(&fp->fab_thread, NULL,
1014 usdf_fabric_progression_thread, fp);
1015 if (ret != 0) {
1016 ret = -ret;
1017 USDF_INFO("unable to create progress thread\n");
1018 goto fail;
1019 }
1020
1021 fattrp->fabric = fab_utof(fp);
1022 fattrp->prov_version = USDF_PROV_VERSION;
1023 *fabric = fab_utof(fp);
1024 USDF_INFO("successfully opened %s/%s\n", fattrp->name,
1025 fp->fab_dev_attrs->uda_ifname);
1026 return 0;
1027
1028 fail:
1029 free(fp->fab_attr.name);
1030 free(fp->fab_attr.prov_name);
1031 ff = fab_utof(fp);
1032 usdf_fabric_close(&ff->fid);
1033 USDF_DBG("returning %d (%s)\n", ret, fi_strerror(-ret));
1034 return ret;
1035 }
1036
/* Provider cleanup hook; it only emits a trace message. */
static void
usdf_fini(void)
{
	USDF_TRACE("\n");
}
1041
1042 struct fi_provider usdf_ops = {
1043 .name = USDF_PROV_NAME,
1044 .version = USDF_PROV_VERSION,
1045 .fi_version = OFI_VERSION_LATEST,
1046 .getinfo = usdf_getinfo,
1047 .fabric = usdf_fabric_open,
1048 .cleanup = usdf_fini
1049 };
1050
1051 USNIC_INI
1052 {
1053 #if USNIC_BUILD_FAKE_VERBS_DRIVER
1054 usdf_setup_fake_ibv_provider();
1055 #endif
1056 return (&usdf_ops);
1057 }
1058