1 /*- 2 * Copyright (c) 2020 Mellanox Technologies. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 #include "opt_inet.h" 27 #include "opt_inet6.h" 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/devctl.h> 35 #include <sys/eventhandler.h> 36 #include <sys/kernel.h> 37 #include <sys/mbuf.h> 38 #include <sys/module.h> 39 #include <sys/socket.h> 40 #include <sys/sysctl.h> 41 42 #include <net/bpf.h> 43 #include <net/ethernet.h> 44 #include <net/infiniband.h> 45 #include <net/if.h> 46 #include <net/if_var.h> 47 #include <net/if_private.h> 48 #include <net/if_dl.h> 49 #include <net/if_media.h> 50 #include <net/if_lagg.h> 51 #include <net/if_llatbl.h> 52 #include <net/if_types.h> 53 #include <net/netisr.h> 54 #include <net/route.h> 55 #include <netinet/if_ether.h> 56 #include <netinet/in.h> 57 #include <netinet/ip6.h> 58 #include <netinet6/in6_var.h> 59 #include <netinet6/nd6.h> 60 61 #include <security/mac/mac_framework.h> 62 63 /* if_lagg(4) support */ 64 struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); 65 66 #ifdef INET 67 static inline void 68 infiniband_ipv4_multicast_map(uint32_t addr, 69 const uint8_t *broadcast, uint8_t *buf) 70 { 71 uint8_t scope; 72 73 addr = ntohl(addr); 74 scope = broadcast[5] & 0xF; 75 76 buf[0] = 0; 77 buf[1] = 0xff; 78 buf[2] = 0xff; 79 buf[3] = 0xff; 80 buf[4] = 0xff; 81 buf[5] = 0x10 | scope; 82 buf[6] = 0x40; 83 buf[7] = 0x1b; 84 buf[8] = broadcast[8]; 85 buf[9] = broadcast[9]; 86 buf[10] = 0; 87 buf[11] = 0; 88 buf[12] = 0; 89 buf[13] = 0; 90 buf[14] = 0; 91 buf[15] = 0; 92 buf[16] = (addr >> 24) & 0xff; 93 buf[17] = (addr >> 16) & 0xff; 94 buf[18] = (addr >> 8) & 0xff; 95 buf[19] = addr & 0xff; 96 } 97 #endif 98 99 #ifdef INET6 100 static inline void 101 infiniband_ipv6_multicast_map(const struct in6_addr *addr, 102 const uint8_t *broadcast, uint8_t *buf) 103 { 104 uint8_t scope; 105 106 scope = broadcast[5] & 0xF; 107 108 buf[0] = 0; 109 buf[1] = 0xff; 110 buf[2] = 0xff; 111 buf[3] = 0xff; 112 buf[4] = 0xff; 113 buf[5] = 0x10 | scope; 114 buf[6] = 0x60; 115 buf[7] = 0x1b; 116 buf[8] = broadcast[8]; 117 buf[9] = broadcast[9]; 118 memcpy(&buf[10], &addr->s6_addr[6], 10); 119 } 120 #endif 121 122 /* 123 * This is for clients that have an infiniband_header in the mbuf. 124 */ 125 void 126 infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) 127 { 128 struct infiniband_header *ibh; 129 struct ether_header eh; 130 131 if (!bpf_peers_present(ifp->if_bpf)) 132 return; 133 134 M_ASSERTVALID(mb); 135 if (mb->m_len < sizeof(*ibh)) 136 return; 137 138 ibh = mtod(mb, struct infiniband_header *); 139 eh.ether_type = ibh->ib_protocol; 140 memset(eh.ether_shost, 0, ETHER_ADDR_LEN); 141 memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN); 142 mb->m_data += sizeof(*ibh); 143 mb->m_len -= sizeof(*ibh); 144 mb->m_pkthdr.len -= sizeof(*ibh); 145 bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); 146 mb->m_data -= sizeof(*ibh); 147 mb->m_len += sizeof(*ibh); 148 mb->m_pkthdr.len += sizeof(*ibh); 149 } 150 151 static void 152 update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) 153 { 154 int csum_flags = 0; 155 156 if (src->m_pkthdr.csum_flags & CSUM_IP) 157 csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); 158 if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 159 csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); 160 if (src->m_pkthdr.csum_flags & CSUM_SCTP) 161 csum_flags |= CSUM_SCTP_VALID; 162 dst->m_pkthdr.csum_flags |= csum_flags; 163 if (csum_flags & CSUM_DATA_VALID) 164 dst->m_pkthdr.csum_data = 0xffff; 165 } 166 167 /* 168 * Handle link-layer encapsulation requests. 169 */ 170 static int 171 infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req) 172 { 173 struct infiniband_header *ih; 174 struct arphdr *ah; 175 uint16_t etype; 176 const uint8_t *lladdr; 177 178 if (req->rtype != IFENCAP_LL) 179 return (EOPNOTSUPP); 180 181 if (req->bufsize < INFINIBAND_HDR_LEN) 182 return (ENOMEM); 183 184 ih = (struct infiniband_header *)req->buf; 185 lladdr = req->lladdr; 186 req->lladdr_off = 0; 187 188 switch (req->family) { 189 case AF_INET: 190 etype = htons(ETHERTYPE_IP); 191 break; 192 case AF_INET6: 193 etype = htons(ETHERTYPE_IPV6); 194 break; 195 case AF_ARP: 196 ah = (struct arphdr *)req->hdata; 197 ah->ar_hrd = htons(ARPHRD_INFINIBAND); 198 199 switch (ntohs(ah->ar_op)) { 200 case ARPOP_REVREQUEST: 201 case ARPOP_REVREPLY: 202 etype = htons(ETHERTYPE_REVARP); 203 break; 204 case ARPOP_REQUEST: 205 case ARPOP_REPLY: 206 default: 207 etype = htons(ETHERTYPE_ARP); 208 break; 209 } 210 211 if (req->flags & IFENCAP_FLAG_BROADCAST) 212 lladdr = ifp->if_broadcastaddr; 213 break; 214 default: 215 return (EAFNOSUPPORT); 216 } 217 218 ih->ib_protocol = etype; 219 ih->ib_reserved = 0; 220 memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN); 221 req->bufsize = sizeof(struct infiniband_header); 222 223 return (0); 224 } 225 226 static int 227 infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m, 228 const struct sockaddr *dst, struct route *ro, uint8_t *phdr, 229 uint32_t *pflags, struct llentry **plle) 230 { 231 #if defined(INET) || defined(INET6) 232 struct infiniband_header *ih = (struct infiniband_header *)phdr; 233 #endif 234 uint32_t lleflags = 0; 235 int error = 0; 236 237 if (plle) 238 *plle = NULL; 239 240 switch (dst->sa_family) { 241 #ifdef INET 242 case AF_INET: 243 if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { 244 error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); 245 } else { 246 if (m->m_flags & M_BCAST) { 247 memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr, 248 INFINIBAND_ADDR_LEN); 249 } else { 250 infiniband_ipv4_multicast_map( 251 ((const struct sockaddr_in *)dst)->sin_addr.s_addr, 252 ifp->if_broadcastaddr, ih->ib_hwaddr); 253 } 254 ih->ib_protocol = htons(ETHERTYPE_IP); 255 ih->ib_reserved = 0; 256 } 257 break; 258 #endif 259 #ifdef INET6 260 case AF_INET6: 261 if ((m->m_flags & M_MCAST) == 0) { 262 int af = RO_GET_FAMILY(ro, dst); 263 error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr, 264 &lleflags, plle); 265 } else { 266 infiniband_ipv6_multicast_map( 267 &((const struct sockaddr_in6 *)dst)->sin6_addr, 268 ifp->if_broadcastaddr, ih->ib_hwaddr); 269 ih->ib_protocol = htons(ETHERTYPE_IPV6); 270 ih->ib_reserved = 0; 271 } 272 break; 273 #endif 274 default: 275 if_printf(ifp, "can't handle af%d\n", dst->sa_family); 276 if (m != NULL) 277 m_freem(m); 278 return (EAFNOSUPPORT); 279 } 280 281 if (error == EHOSTDOWN) { 282 if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) 283 error = EHOSTUNREACH; 284 } 285 286 if (error != 0) 287 return (error); 288 289 *pflags = RT_MAY_LOOP; 290 if (lleflags & LLE_IFADDR) 291 *pflags |= RT_L2_ME; 292 293 return (0); 294 } 295 296 /* 297 * Infiniband output routine. 298 */ 299 static int 300 infiniband_output(struct ifnet *ifp, struct mbuf *m, 301 const struct sockaddr *dst, struct route *ro) 302 { 303 uint8_t linkhdr[INFINIBAND_HDR_LEN]; 304 uint8_t *phdr; 305 struct llentry *lle = NULL; 306 struct infiniband_header *ih; 307 int error = 0; 308 int hlen; /* link layer header length */ 309 uint32_t pflags; 310 bool addref; 311 312 NET_EPOCH_ASSERT(); 313 314 addref = false; 315 phdr = NULL; 316 pflags = 0; 317 if (ro != NULL) { 318 /* XXX BPF uses ro_prepend */ 319 if (ro->ro_prepend != NULL) { 320 phdr = ro->ro_prepend; 321 hlen = ro->ro_plen; 322 } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { 323 if ((ro->ro_flags & RT_LLE_CACHE) != 0) { 324 lle = ro->ro_lle; 325 if (lle != NULL && 326 (lle->la_flags & LLE_VALID) == 0) { 327 LLE_FREE(lle); 328 lle = NULL; /* redundant */ 329 ro->ro_lle = NULL; 330 } 331 if (lle == NULL) { 332 /* if we lookup, keep cache */ 333 addref = 1; 334 } else 335 /* 336 * Notify LLE code that 337 * the entry was used 338 * by datapath. 339 */ 340 llentry_provide_feedback(lle); 341 } 342 if (lle != NULL) { 343 phdr = lle->r_linkdata; 344 hlen = lle->r_hdrlen; 345 pflags = lle->r_flags; 346 } 347 } 348 } 349 350 #ifdef MAC 351 error = mac_ifnet_check_transmit(ifp, m); 352 if (error) 353 goto bad; 354 #endif 355 356 M_PROFILE(m); 357 if (ifp->if_flags & IFF_MONITOR) { 358 error = ENETDOWN; 359 goto bad; 360 } 361 if (!((ifp->if_flags & IFF_UP) && 362 (ifp->if_drv_flags & IFF_DRV_RUNNING))) { 363 error = ENETDOWN; 364 goto bad; 365 } 366 367 if (phdr == NULL) { 368 /* No prepend data supplied. Try to calculate ourselves. */ 369 phdr = linkhdr; 370 hlen = INFINIBAND_HDR_LEN; 371 error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags, 372 addref ? &lle : NULL); 373 if (addref && lle != NULL) 374 ro->ro_lle = lle; 375 if (error != 0) 376 return (error == EWOULDBLOCK ? 0 : error); 377 } 378 379 if ((pflags & RT_L2_ME) != 0) { 380 update_mbuf_csumflags(m, m); 381 return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0)); 382 } 383 384 /* 385 * Add local infiniband header. If no space in first mbuf, 386 * allocate another. 387 */ 388 M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); 389 if (m == NULL) { 390 error = ENOBUFS; 391 goto bad; 392 } 393 if ((pflags & RT_HAS_HEADER) == 0) { 394 ih = mtod(m, struct infiniband_header *); 395 memcpy(ih, phdr, hlen); 396 } 397 398 /* 399 * Queue message on interface, update output statistics if 400 * successful, and start output if interface not yet active. 401 */ 402 return (ifp->if_transmit(ifp, m)); 403 bad: 404 if (m != NULL) 405 m_freem(m); 406 return (error); 407 } 408 409 /* 410 * Process a received Infiniband packet. 411 */ 412 static void 413 infiniband_input(struct ifnet *ifp, struct mbuf *m) 414 { 415 struct infiniband_header *ibh; 416 struct epoch_tracker et; 417 int isr; 418 bool needs_epoch; 419 420 needs_epoch = (ifp->if_flags & IFF_KNOWSEPOCH) == 0; 421 422 CURVNET_SET_QUIET(ifp->if_vnet); 423 if (__predict_false(needs_epoch)) 424 NET_EPOCH_ENTER(et); 425 426 if ((ifp->if_flags & IFF_UP) == 0) { 427 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); 428 m_freem(m); 429 goto done; 430 } 431 432 ibh = mtod(m, struct infiniband_header *); 433 434 /* 435 * Reset layer specific mbuf flags to avoid confusing upper 436 * layers: 437 */ 438 m->m_flags &= ~M_VLANTAG; 439 m_clrprotoflags(m); 440 441 if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) { 442 if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr, 443 ifp->if_addrlen) == 0) 444 m->m_flags |= M_BCAST; 445 else 446 m->m_flags |= M_MCAST; 447 if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); 448 } 449 450 /* Let BPF have it before we strip the header. */ 451 infiniband_bpf_mtap(ifp, m); 452 453 /* Allow monitor mode to claim this frame, after stats are updated. */ 454 if (ifp->if_flags & IFF_MONITOR) { 455 m_freem(m); 456 goto done; 457 } 458 459 /* Direct packet to correct FIB based on interface config. */ 460 M_SETFIB(m, ifp->if_fib); 461 462 /* Handle input from a lagg<N> port */ 463 if (ifp->if_type == IFT_INFINIBANDLAG) { 464 KASSERT(lagg_input_infiniband_p != NULL, 465 ("%s: if_lagg not loaded!", __func__)); 466 m = (*lagg_input_infiniband_p)(ifp, m); 467 if (__predict_false(m == NULL)) 468 goto done; 469 ifp = m->m_pkthdr.rcvif; 470 } 471 472 /* 473 * Dispatch frame to upper layer. 474 */ 475 switch (ibh->ib_protocol) { 476 #ifdef INET 477 case htons(ETHERTYPE_IP): 478 isr = NETISR_IP; 479 break; 480 481 case htons(ETHERTYPE_ARP): 482 if (ifp->if_flags & IFF_NOARP) { 483 /* Discard packet if ARP is disabled on interface */ 484 m_freem(m); 485 goto done; 486 } 487 isr = NETISR_ARP; 488 break; 489 #endif 490 #ifdef INET6 491 case htons(ETHERTYPE_IPV6): 492 isr = NETISR_IPV6; 493 break; 494 #endif 495 default: 496 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); 497 m_freem(m); 498 goto done; 499 } 500 501 /* Strip off the Infiniband header. */ 502 m_adj(m, INFINIBAND_HDR_LEN); 503 504 #ifdef MAC 505 /* 506 * Tag the mbuf with an appropriate MAC label before any other 507 * consumers can get to it. 508 */ 509 mac_ifnet_create_mbuf(ifp, m); 510 #endif 511 /* Allow monitor mode to claim this frame, after stats are updated. */ 512 netisr_dispatch(isr, m); 513 done: 514 if (__predict_false(needs_epoch)) 515 NET_EPOCH_EXIT(et); 516 CURVNET_RESTORE(); 517 } 518 519 static int 520 infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, 521 struct sockaddr *sa) 522 { 523 struct sockaddr_dl *sdl; 524 #ifdef INET 525 struct sockaddr_in *sin; 526 #endif 527 #ifdef INET6 528 struct sockaddr_in6 *sin6; 529 #endif 530 uint8_t *e_addr; 531 532 switch (sa->sa_family) { 533 case AF_LINK: 534 /* 535 * No mapping needed. Just check that it's a valid MC address. 536 */ 537 sdl = (struct sockaddr_dl *)sa; 538 e_addr = LLADDR(sdl); 539 if (!INFINIBAND_IS_MULTICAST(e_addr)) 540 return (EADDRNOTAVAIL); 541 *llsa = NULL; 542 return 0; 543 544 #ifdef INET 545 case AF_INET: 546 sin = (struct sockaddr_in *)sa; 547 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 548 return (EADDRNOTAVAIL); 549 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); 550 sdl->sdl_alen = INFINIBAND_ADDR_LEN; 551 e_addr = LLADDR(sdl); 552 infiniband_ipv4_multicast_map( 553 sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); 554 *llsa = (struct sockaddr *)sdl; 555 return (0); 556 #endif 557 #ifdef INET6 558 case AF_INET6: 559 sin6 = (struct sockaddr_in6 *)sa; 560 /* 561 * An IP6 address of 0 means listen to all of the 562 * multicast address used for IP6. This has no meaning 563 * in infiniband. 564 */ 565 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) 566 return (EADDRNOTAVAIL); 567 if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) 568 return (EADDRNOTAVAIL); 569 sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); 570 sdl->sdl_alen = INFINIBAND_ADDR_LEN; 571 e_addr = LLADDR(sdl); 572 infiniband_ipv6_multicast_map( 573 &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); 574 *llsa = (struct sockaddr *)sdl; 575 return (0); 576 #endif 577 default: 578 return (EAFNOSUPPORT); 579 } 580 } 581 582 void 583 infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) 584 { 585 struct sockaddr_dl *sdl; 586 struct ifaddr *ifa; 587 int i; 588 589 ifp->if_addrlen = INFINIBAND_ADDR_LEN; 590 ifp->if_hdrlen = INFINIBAND_HDR_LEN; 591 ifp->if_mtu = INFINIBAND_MTU; 592 if_attach(ifp); 593 ifp->if_output = infiniband_output; 594 ifp->if_input = infiniband_input; 595 ifp->if_resolvemulti = infiniband_resolvemulti; 596 ifp->if_requestencap = infiniband_requestencap; 597 598 if (ifp->if_baudrate == 0) 599 ifp->if_baudrate = IF_Gbps(10); /* default value */ 600 if (llb != NULL) 601 ifp->if_broadcastaddr = llb; 602 603 ifa = ifp->if_addr; 604 KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); 605 sdl = (struct sockaddr_dl *)ifa->ifa_addr; 606 sdl->sdl_type = IFT_INFINIBAND; 607 sdl->sdl_alen = ifp->if_addrlen; 608 609 if (lla != NULL) { 610 memcpy(LLADDR(sdl), lla, ifp->if_addrlen); 611 612 if (ifp->if_hw_addr != NULL) 613 memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen); 614 } else { 615 lla = LLADDR(sdl); 616 } 617 618 /* Attach ethernet compatible network device */ 619 bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); 620 621 /* Announce Infiniband MAC address if non-zero. */ 622 for (i = 0; i < ifp->if_addrlen; i++) 623 if (lla[i] != 0) 624 break; 625 if (i != ifp->if_addrlen) 626 if_printf(ifp, "Infiniband address: %20D\n", lla, ":"); 627 628 /* Add necessary bits are setup; announce it now. */ 629 EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp); 630 631 if (IS_DEFAULT_VNET(curvnet)) 632 devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL); 633 } 634 635 /* 636 * Perform common duties while detaching an Infiniband interface 637 */ 638 void 639 infiniband_ifdetach(struct ifnet *ifp) 640 { 641 bpfdetach(ifp); 642 if_detach(ifp); 643 } 644 645 static int 646 infiniband_modevent(module_t mod, int type, void *data) 647 { 648 switch (type) { 649 case MOD_LOAD: 650 case MOD_UNLOAD: 651 return (0); 652 default: 653 return (EOPNOTSUPP); 654 } 655 } 656 657 static moduledata_t infiniband_mod = { 658 .name = "if_infiniband", 659 .evhand = &infiniband_modevent, 660 }; 661 662 DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); 663 MODULE_VERSION(if_infiniband, 1); 664