xref: /openbsd/sys/net/if_vxlan.c (revision 09467b48)
1 /*	$OpenBSD: if_vxlan.c,v 1.80 2020/07/28 09:52:32 mvs Exp $	*/
2 
3 /*
4  * Copyright (c) 2013 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bpfilter.h"
20 #include "vxlan.h"
21 #include "vlan.h"
22 #include "pf.h"
23 #include "bridge.h"
24 
25 #include <sys/param.h>
26 #include <sys/systm.h>
27 #include <sys/mbuf.h>
28 #include <sys/socket.h>
29 #include <sys/sockio.h>
30 #include <sys/ioctl.h>
31 
32 #include <net/if.h>
33 #include <net/if_var.h>
34 #include <net/if_media.h>
35 #include <net/route.h>
36 
37 #if NBPFILTER > 0
38 #include <net/bpf.h>
39 #endif
40 
41 #include <netinet/in.h>
42 #include <netinet/in_var.h>
43 #include <netinet/if_ether.h>
44 #include <netinet/ip.h>
45 #include <netinet/ip_var.h>
46 #include <netinet/udp.h>
47 #include <netinet/udp_var.h>
48 #include <netinet/in_pcb.h>
49 
50 #if NPF > 0
51 #include <net/pfvar.h>
52 #endif
53 
54 #if NBRIDGE > 0
55 #include <net/if_bridge.h>
56 #endif
57 
58 #include <net/if_vxlan.h>
59 
60 struct vxlan_softc {
61 	struct arpcom		 sc_ac;
62 	struct ifmedia		 sc_media;
63 
64 	struct ip_moptions	 sc_imo;
65 	struct task		 sc_atask;
66 	struct task		 sc_ltask;
67 	struct task		 sc_dtask;
68 
69 	struct sockaddr_storage	 sc_src;
70 	struct sockaddr_storage	 sc_dst;
71 	in_port_t		 sc_dstport;
72 	u_int			 sc_rdomain;
73 	int64_t			 sc_vnetid;
74 	uint16_t		 sc_df;
75 	u_int8_t		 sc_ttl;
76 	int			 sc_txhprio;
77 
78 	struct task		 sc_sendtask;
79 
80 	LIST_ENTRY(vxlan_softc)	 sc_entry;
81 };
82 
83 void	 vxlanattach(int);
84 int	 vxlanioctl(struct ifnet *, u_long, caddr_t);
85 void	 vxlanstart(struct ifnet *);
86 int	 vxlan_clone_create(struct if_clone *, int);
87 int	 vxlan_clone_destroy(struct ifnet *);
88 void	 vxlan_multicast_cleanup(struct ifnet *);
89 int	 vxlan_multicast_join(struct ifnet *, struct sockaddr *,
90 	    struct sockaddr *);
91 int	 vxlan_media_change(struct ifnet *);
92 void	 vxlan_media_status(struct ifnet *, struct ifmediareq *);
93 int	 vxlan_config(struct ifnet *, struct sockaddr *, struct sockaddr *);
94 int	 vxlan_output(struct ifnet *, struct mbuf *);
95 void	 vxlan_addr_change(void *);
96 void	 vxlan_if_change(void *);
97 void	 vxlan_link_change(void *);
98 void	 vxlan_send_dispatch(void *);
99 
100 int	 vxlan_sockaddr_cmp(struct sockaddr *, struct sockaddr *);
101 uint16_t vxlan_sockaddr_port(struct sockaddr *);
102 
103 struct if_clone	vxlan_cloner =
104     IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);
105 
106 int	 vxlan_enable = 0;
107 u_long	 vxlan_tagmask;
108 
109 #define VXLAN_TAGHASHSIZE		 32
110 #define VXLAN_TAGHASH(tag)		 ((unsigned int)tag & vxlan_tagmask)
111 LIST_HEAD(vxlan_taghash, vxlan_softc)	*vxlan_tagh, vxlan_any;
112 
113 void
114 vxlanattach(int count)
115 {
116 	/* Regular vxlan interfaces with a VNI */
117 	if ((vxlan_tagh = hashinit(VXLAN_TAGHASHSIZE, M_DEVBUF, M_NOWAIT,
118 	    &vxlan_tagmask)) == NULL)
119 		panic("vxlanattach: hashinit");
120 
121 	/* multipoint-to-multipoint interfaces that accept any VNI */
122 	LIST_INIT(&vxlan_any);
123 
124 	if_clone_attach(&vxlan_cloner);
125 }
126 
127 int
128 vxlan_clone_create(struct if_clone *ifc, int unit)
129 {
130 	struct ifnet		*ifp;
131 	struct vxlan_softc	*sc;
132 
133 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
134 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
135 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
136 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
137 	sc->sc_dstport = htons(VXLAN_PORT);
138 	sc->sc_vnetid = VXLAN_VNI_UNSET;
139 	sc->sc_txhprio = IFQ_TOS2PRIO(IPTOS_PREC_ROUTINE); /* 0 */
140 	sc->sc_df = htons(0);
141 	task_set(&sc->sc_atask, vxlan_addr_change, sc);
142 	task_set(&sc->sc_ltask, vxlan_link_change, sc);
143 	task_set(&sc->sc_dtask, vxlan_if_change, sc);
144 	task_set(&sc->sc_sendtask, vxlan_send_dispatch, sc);
145 
146 	ifp = &sc->sc_ac.ac_if;
147 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "vxlan%d", unit);
148 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
149 	ether_fakeaddr(ifp);
150 
151 	ifp->if_softc = sc;
152 	ifp->if_ioctl = vxlanioctl;
153 	ifp->if_start = vxlanstart;
154 	ifq_set_maxlen(&ifp->if_snd, IFQ_MAXLEN);
155 
156 	ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
157 	ifp->if_capabilities = IFCAP_VLAN_MTU;
158 	ifp->if_xflags = IFXF_CLONED;
159 
160 	ifmedia_init(&sc->sc_media, 0, vxlan_media_change,
161 	    vxlan_media_status);
162 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
163 	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
164 
165 	if_counters_alloc(ifp);
166 	if_attach(ifp);
167 	ether_ifattach(ifp);
168 
169 #if 0
170 	/*
171 	 * Instead of using a decreased MTU of 1450 bytes, prefer
172 	 * to use the default Ethernet-size MTU of 1500 bytes and to
173 	 * increase the MTU of the outer transport interfaces to
174 	 * at least 1550 bytes. The following is disabled by default.
175 	 */
176 	ifp->if_mtu = ETHERMTU - sizeof(struct ether_header);
177 	ifp->if_mtu -= sizeof(struct vxlanudphdr) + sizeof(struct ipovly);
178 #endif
179 
180 	LIST_INSERT_HEAD(&vxlan_tagh[VXLAN_TAGHASH(0)], sc, sc_entry);
181 	vxlan_enable++;
182 
183 	return (0);
184 }
185 
186 int
187 vxlan_clone_destroy(struct ifnet *ifp)
188 {
189 	struct vxlan_softc	*sc = ifp->if_softc;
190 
191 	NET_LOCK();
192 	vxlan_multicast_cleanup(ifp);
193 	NET_UNLOCK();
194 
195 	vxlan_enable--;
196 	LIST_REMOVE(sc, sc_entry);
197 
198 	ifmedia_delete_instance(&sc->sc_media, IFM_INST_ANY);
199 	ether_ifdetach(ifp);
200 	if_detach(ifp);
201 
202 	if (!task_del(net_tq(ifp->if_index), &sc->sc_sendtask))
203 		taskq_barrier(net_tq(ifp->if_index));
204 
205 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
206 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
207 	free(sc, M_DEVBUF, sizeof(*sc));
208 
209 	return (0);
210 }
211 
212 void
213 vxlan_multicast_cleanup(struct ifnet *ifp)
214 {
215 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
216 	struct ip_moptions	*imo = &sc->sc_imo;
217 	struct ifnet		*mifp;
218 
219 	mifp = if_get(imo->imo_ifidx);
220 	if (mifp != NULL) {
221 		if_addrhook_del(mifp, &sc->sc_atask);
222 		if_linkstatehook_del(mifp, &sc->sc_ltask);
223 		if_detachhook_del(mifp, &sc->sc_dtask);
224 
225 		if_put(mifp);
226 	}
227 
228 	if (imo->imo_num_memberships > 0) {
229 		in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
230 		imo->imo_ifidx = 0;
231 	}
232 }
233 
234 int
235 vxlan_multicast_join(struct ifnet *ifp, struct sockaddr *src,
236     struct sockaddr *dst)
237 {
238 	struct vxlan_softc	*sc = ifp->if_softc;
239 	struct ip_moptions	*imo = &sc->sc_imo;
240 	struct sockaddr_in	*src4, *dst4;
241 #ifdef INET6
242 	struct sockaddr_in6	*dst6;
243 #endif /* INET6 */
244 	struct ifaddr		*ifa;
245 	struct ifnet		*mifp;
246 
247 	switch (dst->sa_family) {
248 	case AF_INET:
249 		dst4 = satosin(dst);
250 		if (!IN_MULTICAST(dst4->sin_addr.s_addr))
251 			return (0);
252 		break;
253 #ifdef INET6
254 	case AF_INET6:
255 		dst6 = satosin6(dst);
256 		if (!IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr))
257 			return (0);
258 
259 		/* Multicast mode is currently not supported for IPv6 */
260 		return (EAFNOSUPPORT);
261 #endif /* INET6 */
262 	default:
263 		return (EAFNOSUPPORT);
264 	}
265 
266 	src4 = satosin(src);
267 	dst4 = satosin(dst);
268 
269 	if (src4->sin_addr.s_addr == INADDR_ANY ||
270 	    IN_MULTICAST(src4->sin_addr.s_addr))
271 		return (EINVAL);
272 	if ((ifa = ifa_ifwithaddr(src, sc->sc_rdomain)) == NULL ||
273 	    (mifp = ifa->ifa_ifp) == NULL ||
274 	    (mifp->if_flags & IFF_MULTICAST) == 0)
275 		return (EADDRNOTAVAIL);
276 
277 	if ((imo->imo_membership[0] =
278 	    in_addmulti(&dst4->sin_addr, mifp)) == NULL)
279 		return (ENOBUFS);
280 
281 	imo->imo_num_memberships++;
282 	imo->imo_ifidx = mifp->if_index;
283 	if (sc->sc_ttl > 0)
284 		imo->imo_ttl = sc->sc_ttl;
285 	else
286 		imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL;
287 	imo->imo_loop = 0;
288 
289 	/*
290 	 * Use interface hooks to track any changes on the interface
291 	 * that is used to send out the tunnel traffic as multicast.
292 	 */
293 	if_addrhook_add(mifp, &sc->sc_atask);
294 	if_linkstatehook_add(mifp, &sc->sc_ltask);
295 	if_detachhook_add(mifp, &sc->sc_dtask);
296 
297 	return (0);
298 }
299 
300 void
301 vxlanstart(struct ifnet *ifp)
302 {
303 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
304 
305 	task_add(net_tq(ifp->if_index), &sc->sc_sendtask);
306 }
307 
308 void
309 vxlan_send_dispatch(void *xsc)
310 {
311 	struct vxlan_softc	*sc = xsc;
312 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
313 	struct mbuf		*m;
314 	struct mbuf_list	 ml;
315 
316 	ml_init(&ml);
317 	for (;;) {
318 		m = ifq_dequeue(&ifp->if_snd);
319 		if (m == NULL)
320 			break;
321 
322 #if NBPFILTER > 0
323 		if (ifp->if_bpf)
324 			bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
325 #endif
326 
327 		ml_enqueue(&ml, m);
328 	}
329 
330 	if (ml_empty(&ml))
331 		return;
332 
333 	NET_LOCK();
334 	while ((m = ml_dequeue(&ml)) != NULL) {
335 		vxlan_output(ifp, m);
336 	}
337 	NET_UNLOCK();
338 }
339 
340 
341 int
342 vxlan_config(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
343 {
344 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
345 	int			 reset = 0, error, af;
346 	socklen_t		 slen;
347 	in_port_t		 port;
348 	struct vxlan_taghash	*tagh;
349 
350 	if (src != NULL && dst != NULL) {
351 		if ((af = src->sa_family) != dst->sa_family)
352 			return (EAFNOSUPPORT);
353 	} else {
354 		/* Reset current configuration */
355 		af = sc->sc_src.ss_family;
356 		src = sstosa(&sc->sc_src);
357 		dst = sstosa(&sc->sc_dst);
358 		reset = 1;
359 	}
360 
361 	switch (af) {
362 	case AF_INET:
363 		slen = sizeof(struct sockaddr_in);
364 		break;
365 #ifdef INET6
366 	case AF_INET6:
367 		slen = sizeof(struct sockaddr_in6);
368 		break;
369 #endif /* INET6 */
370 	default:
371 		return (EAFNOSUPPORT);
372 	}
373 
374 	if (src->sa_len != slen || dst->sa_len != slen)
375 		return (EINVAL);
376 
377 	vxlan_multicast_cleanup(ifp);
378 
379 	/* returns without error if multicast is not configured */
380 	if ((error = vxlan_multicast_join(ifp, src, dst)) != 0)
381 		return (error);
382 
383 	if ((port = vxlan_sockaddr_port(dst)) != 0)
384 		sc->sc_dstport = port;
385 
386 	if (!reset) {
387 		bzero(&sc->sc_src, sizeof(sc->sc_src));
388 		bzero(&sc->sc_dst, sizeof(sc->sc_dst));
389 		memcpy(&sc->sc_src, src, src->sa_len);
390 		memcpy(&sc->sc_dst, dst, dst->sa_len);
391 	}
392 
393 	if (sc->sc_vnetid == VXLAN_VNI_ANY) {
394 		/*
395 		 * If the interface accepts any VNI, put it into a separate
396 		 * list that is not part of the main hash.
397 		 */
398 		tagh = &vxlan_any;
399 	} else
400 		tagh = &vxlan_tagh[VXLAN_TAGHASH(sc->sc_vnetid)];
401 
402 	LIST_REMOVE(sc, sc_entry);
403 	LIST_INSERT_HEAD(tagh, sc, sc_entry);
404 
405 	return (0);
406 }
407 
408 int
409 vxlanioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
410 {
411 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
412 	struct ifreq		*ifr = (struct ifreq *)data;
413 	struct if_laddrreq	*lifr = (struct if_laddrreq *)data;
414 	int			 error = 0;
415 
416 	switch (cmd) {
417 	case SIOCSIFADDR:
418 		ifp->if_flags |= IFF_UP;
419 		/* FALLTHROUGH */
420 
421 	case SIOCSIFFLAGS:
422 		if (ifp->if_flags & IFF_UP) {
423 			ifp->if_flags |= IFF_RUNNING;
424 		} else {
425 			ifp->if_flags &= ~IFF_RUNNING;
426 		}
427 		break;
428 
429 	case SIOCADDMULTI:
430 	case SIOCDELMULTI:
431 		break;
432 
433 	case SIOCGIFMEDIA:
434 	case SIOCSIFMEDIA:
435 		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
436 		break;
437 
438 	case SIOCSLIFPHYADDR:
439 		error = vxlan_config(ifp,
440 		    sstosa(&lifr->addr),
441 		    sstosa(&lifr->dstaddr));
442 		break;
443 
444 	case SIOCDIFPHYADDR:
445 		vxlan_multicast_cleanup(ifp);
446 		bzero(&sc->sc_src, sizeof(sc->sc_src));
447 		bzero(&sc->sc_dst, sizeof(sc->sc_dst));
448 		sc->sc_dstport = htons(VXLAN_PORT);
449 		break;
450 
451 	case SIOCGLIFPHYADDR:
452 		if (sc->sc_dst.ss_family == AF_UNSPEC) {
453 			error = EADDRNOTAVAIL;
454 			break;
455 		}
456 		bzero(&lifr->addr, sizeof(lifr->addr));
457 		bzero(&lifr->dstaddr, sizeof(lifr->dstaddr));
458 		memcpy(&lifr->addr, &sc->sc_src, sc->sc_src.ss_len);
459 		memcpy(&lifr->dstaddr, &sc->sc_dst, sc->sc_dst.ss_len);
460 		break;
461 
462 	case SIOCSLIFPHYRTABLE:
463 		if (ifr->ifr_rdomainid < 0 ||
464 		    ifr->ifr_rdomainid > RT_TABLEID_MAX ||
465 		    !rtable_exists(ifr->ifr_rdomainid)) {
466 			error = EINVAL;
467 			break;
468 		}
469 		sc->sc_rdomain = ifr->ifr_rdomainid;
470 		(void)vxlan_config(ifp, NULL, NULL);
471 		break;
472 
473 	case SIOCGLIFPHYRTABLE:
474 		ifr->ifr_rdomainid = sc->sc_rdomain;
475 		break;
476 
477 	case SIOCSLIFPHYTTL:
478 		if (ifr->ifr_ttl < 0 || ifr->ifr_ttl > 0xff) {
479 			error = EINVAL;
480 			break;
481 		}
482 		if (sc->sc_ttl == (u_int8_t)ifr->ifr_ttl)
483 			break;
484 		sc->sc_ttl = (u_int8_t)(ifr->ifr_ttl);
485 		(void)vxlan_config(ifp, NULL, NULL);
486 		break;
487 
488 	case SIOCGLIFPHYTTL:
489 		ifr->ifr_ttl = (int)sc->sc_ttl;
490 		break;
491 
492 	case SIOCSLIFPHYDF:
493 		/* commit */
494 		sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
495 		break;
496 	case SIOCGLIFPHYDF:
497 		ifr->ifr_df = sc->sc_df ? 1 : 0;
498 		break;
499 
500 	case SIOCSTXHPRIO:
501 		if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET)
502 			; /* fall through */
503 		else if (ifr->ifr_hdrprio < IF_HDRPRIO_MIN ||
504 		    ifr->ifr_hdrprio > IF_HDRPRIO_MAX) {
505 			error = EINVAL;
506 			break;
507 		}
508 
509 		sc->sc_txhprio = ifr->ifr_hdrprio;
510 		break;
511 	case SIOCGTXHPRIO:
512 		ifr->ifr_hdrprio = sc->sc_txhprio;
513 		break;
514 
515 	case SIOCSVNETID:
516 		if (sc->sc_vnetid == ifr->ifr_vnetid)
517 			break;
518 
519 		if ((ifr->ifr_vnetid != VXLAN_VNI_ANY) &&
520 		    (ifr->ifr_vnetid > VXLAN_VNI_MAX ||
521 		     ifr->ifr_vnetid < VXLAN_VNI_MIN)) {
522 			error = EINVAL;
523 			break;
524 		}
525 
526 		sc->sc_vnetid = (int)ifr->ifr_vnetid;
527 		(void)vxlan_config(ifp, NULL, NULL);
528 		break;
529 
530 	case SIOCGVNETID:
531 		if ((sc->sc_vnetid != VXLAN_VNI_ANY) &&
532 		    (sc->sc_vnetid > VXLAN_VNI_MAX ||
533 		     sc->sc_vnetid < VXLAN_VNI_MIN)) {
534 			error = EADDRNOTAVAIL;
535 			break;
536 		}
537 
538 		ifr->ifr_vnetid = sc->sc_vnetid;
539 		break;
540 
541 	case SIOCDVNETID:
542 		sc->sc_vnetid = VXLAN_VNI_UNSET;
543 		(void)vxlan_config(ifp, NULL, NULL);
544 		break;
545 
546 	default:
547 		error = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
548 		break;
549 	}
550 
551 	return (error);
552 }
553 
/*
 * ifmedia change callback.  Media changes are accepted but have no
 * effect; always returns 0.
 */
int
vxlan_media_change(struct ifnet *ifp)
{
	int	error = 0;

	return (error);
}
559 
560 void
561 vxlan_media_status(struct ifnet *ifp, struct ifmediareq *imr)
562 {
563 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
564 }
565 
566 int
567 vxlan_sockaddr_cmp(struct sockaddr *srcsa, struct sockaddr *dstsa)
568 {
569 	struct sockaddr_in	*src4, *dst4;
570 #ifdef INET6
571 	struct sockaddr_in6	*src6, *dst6;
572 #endif /* INET6 */
573 
574 	if (srcsa->sa_family != dstsa->sa_family)
575 		return (1);
576 
577 	switch (dstsa->sa_family) {
578 	case AF_INET:
579 		src4 = satosin(srcsa);
580 		dst4 = satosin(dstsa);
581 		if (src4->sin_addr.s_addr == dst4->sin_addr.s_addr)
582 			return (0);
583 		break;
584 #ifdef INET6
585 	case AF_INET6:
586 		src6 = satosin6(srcsa);
587 		dst6 = satosin6(dstsa);
588 		if (IN6_ARE_ADDR_EQUAL(&src6->sin6_addr, &dst6->sin6_addr) &&
589 		    src6->sin6_scope_id == dst6->sin6_scope_id)
590 			return (0);
591 		break;
592 #endif /* INET6 */
593 	}
594 
595 	return (1);
596 }
597 
598 uint16_t
599 vxlan_sockaddr_port(struct sockaddr *sa)
600 {
601 	struct sockaddr_in	*sin4;
602 #ifdef INET6
603 	struct sockaddr_in6	*sin6;
604 #endif /* INET6 */
605 
606 	switch (sa->sa_family) {
607 	case AF_INET:
608 		sin4 = satosin(sa);
609 		return (sin4->sin_port);
610 #ifdef INET6
611 	case AF_INET6:
612 		sin6 = satosin6(sa);
613 		return (sin6->sin6_port);
614 #endif /* INET6 */
615 	default:
616 		break;
617 	}
618 
619 	return (0);
620 }
621 
622 int
623 vxlan_lookup(struct mbuf *m, struct udphdr *uh, int iphlen,
624     struct sockaddr *srcsa, struct sockaddr *dstsa)
625 {
626 	struct vxlan_softc	*sc = NULL, *sc_cand = NULL;
627 	struct vxlan_header	 v;
628 	int			 vni;
629 	struct ifnet		*ifp;
630 	int			 skip;
631 #if NBRIDGE > 0
632 	struct bridge_tunneltag	*brtag;
633 #endif
634 	struct mbuf		*n;
635 	int			 off;
636 
637 	/* XXX Should verify the UDP port first before copying the packet */
638 	skip = iphlen + sizeof(*uh);
639 	if (m->m_pkthdr.len - skip < sizeof(v))
640 		return (0);
641 	m_copydata(m, skip, sizeof(v), (caddr_t)&v);
642 	skip += sizeof(v);
643 
644 	if (v.vxlan_flags & htonl(VXLAN_RESERVED1) ||
645 	    v.vxlan_id & htonl(VXLAN_RESERVED2))
646 		return (0);
647 
648 	vni = ntohl(v.vxlan_id) >> VXLAN_VNI_S;
649 	if ((v.vxlan_flags & htonl(VXLAN_FLAGS_VNI)) == 0) {
650 		if (vni != 0)
651 			return (0);
652 
653 		vni = VXLAN_VNI_UNSET;
654 	}
655 
656 	NET_ASSERT_LOCKED();
657 	/* First search for a vxlan(4) interface with the packet's VNI */
658 	LIST_FOREACH(sc, &vxlan_tagh[VXLAN_TAGHASH(vni)], sc_entry) {
659 		if ((uh->uh_dport == sc->sc_dstport) &&
660 		    vni == sc->sc_vnetid &&
661 		    sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid)) {
662 			sc_cand = sc;
663 			if (vxlan_sockaddr_cmp(srcsa, sstosa(&sc->sc_dst)) == 0)
664 				goto found;
665 		}
666 	}
667 
668 	/*
669 	 * Now loop through all the vxlan(4) interfaces that are configured
670 	 * to accept any VNI and operating in multipoint-to-multipoint mode
671 	 * that is used in combination with bridge(4) or switch(4).
672 	 * If a vxlan(4) interface has been found for the packet's VNI, this
673 	 * code is not reached as the other interface is more specific.
674 	 */
675 	LIST_FOREACH(sc, &vxlan_any, sc_entry) {
676 		if ((uh->uh_dport == sc->sc_dstport) &&
677 		    (sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid))) {
678 			sc_cand = sc;
679 			goto found;
680 		}
681 	}
682 
683 	if (sc_cand) {
684 		sc = sc_cand;
685 		goto found;
686 	}
687 
688 	/* not found */
689 	return (0);
690 
691  found:
692 	if (m->m_pkthdr.len < skip + sizeof(struct ether_header)) {
693 		m_freem(m);
694 		return (EINVAL);
695 	}
696 
697 	m_adj(m, skip);
698 	ifp = &sc->sc_ac.ac_if;
699 
700 #if NBRIDGE > 0
701 	/* Store the tunnel src/dst IP and vni for the bridge or switch */
702 	if ((ifp->if_bridgeidx != 0 || ifp->if_switchport != NULL) &&
703 	    srcsa->sa_family != AF_UNSPEC &&
704 	    ((brtag = bridge_tunneltag(m)) != NULL)) {
705 		memcpy(&brtag->brtag_peer.sa, srcsa, srcsa->sa_len);
706 		memcpy(&brtag->brtag_local.sa, dstsa, dstsa->sa_len);
707 		brtag->brtag_id = vni;
708 	}
709 #endif
710 
711 	m->m_flags &= ~(M_BCAST|M_MCAST);
712 
713 #if NPF > 0
714 	pf_pkt_addr_changed(m);
715 #endif
716 	if ((m->m_len < sizeof(struct ether_header)) &&
717 	    (m = m_pullup(m, sizeof(struct ether_header))) == NULL)
718 		return (ENOBUFS);
719 
720 	n = m_getptr(m, sizeof(struct ether_header), &off);
721 	if (n == NULL) {
722 		m_freem(m);
723 		return (EINVAL);
724 	}
725 	if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
726 		n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
727 		/* Dispose of the original mbuf chain */
728 		m_freem(m);
729 		if (n == NULL)
730 			return (ENOBUFS);
731 		m = n;
732 	}
733 
734 	if_vinput(ifp, m);
735 
736 	/* success */
737 	return (1);
738 }
739 
740 struct mbuf *
741 vxlan_encap4(struct ifnet *ifp, struct mbuf *m,
742     struct sockaddr *src, struct sockaddr *dst)
743 {
744 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
745 	struct ip		*ip;
746 
747 	/*
748 	 * Remove multicast and broadcast flags or encapsulated packet
749 	 * ends up as multicast or broadcast packet.
750 	 */
751 	m->m_flags &= ~(M_BCAST|M_MCAST);
752 
753 	M_PREPEND(m, sizeof(*ip), M_DONTWAIT);
754 	if (m == NULL)
755 		return (NULL);
756 
757 	ip = mtod(m, struct ip *);
758 	ip->ip_v = IPVERSION;
759 	ip->ip_hl = sizeof(struct ip) >> 2;
760 	ip->ip_id = htons(ip_randomid());
761 	ip->ip_off = sc->sc_df;
762 	ip->ip_p = IPPROTO_UDP;
763 	ip->ip_tos = IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ?
764 	    m->m_pkthdr.pf.prio : sc->sc_txhprio);
765 	ip->ip_len = htons(m->m_pkthdr.len);
766 
767 	ip->ip_src = satosin(src)->sin_addr;
768 	ip->ip_dst = satosin(dst)->sin_addr;
769 
770 	if (sc->sc_ttl > 0)
771 		ip->ip_ttl = sc->sc_ttl;
772 	else
773 		ip->ip_ttl = IPDEFTTL;
774 
775 	return (m);
776 }
777 
#ifdef INET6
/*
 * Prepend the outer IPv6 header to m and request a UDP checksum.
 *
 * src and dst must be AF_INET6 sockaddrs.  Returns the new head of
 * the chain, or NULL if the packet was dropped: no room for the
 * header, a scope could not be embedded, or no source address could
 * be selected for an unspecified src.
 */
struct mbuf *
vxlan_encap6(struct ifnet *ifp, struct mbuf *m,
    struct sockaddr *src, struct sockaddr *dst)
{
	struct vxlan_softc	*sc = ifp->if_softc;
	struct ip6_hdr		*ip6h;
	struct in6_addr		*srcaddr;
	uint32_t		 flowinfo;

	/*
	 * Remove multicast and broadcast flags or encapsulated packet
	 * ends up as multicast or broadcast packet.
	 */
	m->m_flags &= ~(M_BCAST|M_MCAST);

	M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT);
	if (m == NULL)
		return (NULL);

	/* The priority ends up in the traffic class bits. */
	flowinfo = (uint32_t)IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ?
	    m->m_pkthdr.pf.prio : sc->sc_txhprio) << 20;

	ip6h = mtod(m, struct ip6_hdr *);
	/* Write the flow word first, then fix up the version bits. */
	ip6h->ip6_flow = htonl(flowinfo);
	ip6h->ip6_vfc &= ~IPV6_VERSION_MASK;
	ip6h->ip6_vfc |= IPV6_VERSION;
	ip6h->ip6_nxt = IPPROTO_UDP;
	ip6h->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
	if (in6_embedscope(&ip6h->ip6_src, satosin6(src), NULL) != 0)
		goto drop;
	if (in6_embedscope(&ip6h->ip6_dst, satosin6(dst), NULL) != 0)
		goto drop;

	/* A configured hop limit of 0 selects the default. */
	ip6h->ip6_hlim = (sc->sc_ttl > 0) ? sc->sc_ttl : ip6_defhlim;

	/* Pick a source address if the tunnel source is unspecified. */
	if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) {
		if (in6_selectsrc(&srcaddr, satosin6(dst), NULL,
		    sc->sc_rdomain) != 0)
			goto drop;

		ip6h->ip6_src = *srcaddr;
	}

	if (sc->sc_df)
		m->m_pkthdr.csum_flags |= M_IPV6_DF_OUT;

	/*
	 * The UDP checksum of VXLAN packets should be set to zero,
	 * but the IPv6 UDP checksum is not optional.  There is an RFC 6539
	 * to relax the IPv6 UDP checksum requirement for tunnels, but it
	 * is currently not supported by most implementations.
	 */
	m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT;

	return (m);

drop:
	m_freem(m);
	return (NULL);
}
#endif /* INET6 */
843 
844 int
845 vxlan_output(struct ifnet *ifp, struct mbuf *m)
846 {
847 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
848 	struct vxlanudphdr	*vu;
849 	struct sockaddr		*src, *dst;
850 #if NBRIDGE > 0
851 	struct bridge_tunneltag	*brtag;
852 #endif
853 	int			 error, af;
854 	uint32_t		 tag;
855 	struct mbuf		*m0;
856 
857 	/* VXLAN header, needs new mbuf because of alignment issues */
858 	MGET(m0, M_DONTWAIT, m->m_type);
859 	if (m0 == NULL) {
860 		ifp->if_oerrors++;
861 		return (ENOBUFS);
862 	}
863 	M_MOVE_PKTHDR(m0, m);
864 	m0->m_next = m;
865 	m = m0;
866 	m_align(m, sizeof(*vu));
867 	m->m_len = sizeof(*vu);
868 	m->m_pkthdr.len += sizeof(*vu);
869 
870 	src = sstosa(&sc->sc_src);
871 	dst = sstosa(&sc->sc_dst);
872 	af = src->sa_family;
873 
874 	vu = mtod(m, struct vxlanudphdr *);
875 	vu->vu_u.uh_sport = sc->sc_dstport;
876 	vu->vu_u.uh_dport = sc->sc_dstport;
877 	vu->vu_u.uh_ulen = htons(m->m_pkthdr.len);
878 	vu->vu_u.uh_sum = 0;
879 	tag = sc->sc_vnetid;
880 
881 #if NBRIDGE > 0
882 	if ((brtag = bridge_tunnel(m)) != NULL) {
883 		dst = &brtag->brtag_peer.sa;
884 
885 		/* If accepting any VNI, source ip address is from brtag */
886 		if (sc->sc_vnetid == VXLAN_VNI_ANY) {
887 			src = &brtag->brtag_local.sa;
888 			tag = (uint32_t)brtag->brtag_id;
889 			af = src->sa_family;
890 		}
891 
892 		if (dst->sa_family != af) {
893 			ifp->if_oerrors++;
894 			m_freem(m);
895 			return (EINVAL);
896 		}
897 	} else
898 #endif
899 	if (sc->sc_vnetid == VXLAN_VNI_ANY) {
900 		/*
901 		 * If accepting any VNI, build the vxlan header only by
902 		 * bridge_tunneltag or drop packet if the tag does not exist.
903 		 */
904 		ifp->if_oerrors++;
905 		m_freem(m);
906 		return (ENETUNREACH);
907 	}
908 
909 	if (sc->sc_vnetid != VXLAN_VNI_UNSET) {
910 		vu->vu_v.vxlan_flags = htonl(VXLAN_FLAGS_VNI);
911 		vu->vu_v.vxlan_id = htonl(tag << VXLAN_VNI_S);
912 	} else {
913 		vu->vu_v.vxlan_flags = htonl(0);
914 		vu->vu_v.vxlan_id = htonl(0);
915 	}
916 
917 	switch (af) {
918 	case AF_INET:
919 		m = vxlan_encap4(ifp, m, src, dst);
920 		break;
921 #ifdef INET6
922 	case AF_INET6:
923 		m = vxlan_encap6(ifp, m, src, dst);
924 		break;
925 #endif /* INET6 */
926 	default:
927 		m_freem(m);
928 		m = NULL;
929 	}
930 
931 	if (m == NULL) {
932 		ifp->if_oerrors++;
933 		return (ENOBUFS);
934 	}
935 
936 #if NBRIDGE > 0
937 	if (brtag != NULL)
938 		bridge_tunneluntag(m);
939 #endif
940 
941 	m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
942 
943 #if NPF > 0
944 	pf_pkt_addr_changed(m);
945 #endif
946 
947 	switch (af) {
948 	case AF_INET:
949 		error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
950 		    &sc->sc_imo, NULL, 0);
951 		break;
952 #ifdef INET6
953 	case AF_INET6:
954 		error = ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL);
955 		break;
956 #endif /* INET6 */
957 	default:
958 		m_freem(m);
959 		error = EAFNOSUPPORT;
960 	}
961 
962 	if (error)
963 		ifp->if_oerrors++;
964 
965 	return (error);
966 }
967 
968 void
969 vxlan_addr_change(void *arg)
970 {
971 	struct vxlan_softc	*sc = arg;
972 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
973 	int			 error;
974 
975 	/*
976 	 * Reset the configuration after resume or any possible address
977 	 * configuration changes.
978 	 */
979 	if ((error = vxlan_config(ifp, NULL, NULL))) {
980 		/*
981 		 * The source address of the tunnel can temporarily disappear,
982 		 * after a link state change when running the DHCP client,
983 		 * so keep it configured.
984 		 */
985 	}
986 }
987 
988 void
989 vxlan_if_change(void *arg)
990 {
991 	struct vxlan_softc	*sc = arg;
992 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
993 
994 	/*
995 	 * Reset the configuration after the parent interface disappeared.
996 	 */
997 	vxlan_multicast_cleanup(ifp);
998 	memset(&sc->sc_src, 0, sizeof(sc->sc_src));
999 	memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
1000 	sc->sc_dstport = htons(VXLAN_PORT);
1001 }
1002 
1003 void
1004 vxlan_link_change(void *arg)
1005 {
1006 	struct vxlan_softc	*sc = arg;
1007 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
1008 
1009 	/*
1010 	 * The machine might have lost its multicast associations after
1011 	 * link state changes.  This fixes a problem with VMware after
1012 	 * suspend/resume of the host or guest.
1013 	 */
1014 	(void)vxlan_config(ifp, NULL, NULL);
1015 }
1016