xref: /openbsd/sys/net/if_vxlan.c (revision d415bd75)
1 /*	$OpenBSD: if_vxlan.c,v 1.94 2023/10/27 20:56:48 jan Exp $ */
2 
3 /*
4  * Copyright (c) 2021 David Gwynne <dlg@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bpfilter.h"
20 #include "pf.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/kernel.h>
25 #include <sys/mbuf.h>
26 #include <sys/socket.h>
27 #include <sys/ioctl.h>
28 #include <sys/timeout.h>
29 #include <sys/pool.h>
30 #include <sys/tree.h>
31 #include <sys/refcnt.h>
32 #include <sys/smr.h>
33 
34 #include <sys/socketvar.h>
35 
36 #include <net/if.h>
37 #include <net/if_var.h>
38 #include <net/if_dl.h>
39 #include <net/if_media.h>
40 #include <net/if_types.h>
41 #include <net/route.h>
42 #include <net/rtable.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/in_var.h>
46 #include <netinet/if_ether.h>
47 #include <netinet/ip.h>
48 #include <netinet/udp.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/ip_var.h>
51 
52 #ifdef INET6
53 #include <netinet/ip6.h>
54 #include <netinet6/ip6_var.h>
55 #include <netinet6/in6_var.h>
56 #endif
57 
58 /* for bridge stuff */
59 #include <net/if_bridge.h>
60 #include <net/if_etherbridge.h>
61 
62 #if NBPFILTER > 0
63 #include <net/bpf.h>
64 #endif
65 
66 /*
67  * The protocol.
68  */
69 
70 #define VXLANMTU		1492
71 #define VXLAN_PORT		4789
72 
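/*
 * VXLAN packets are UDP datagrams (port 4789 by default) carrying an
 * 8 byte header followed by the inner Ethernet frame.  Only the I flag
 * is used here; when it is set, the 24-bit VNI in the upper bits of
 * vxlan_id is valid.  Both header words are kept in network byte order.
 */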
73 struct vxlan_header {
74 	uint32_t		vxlan_flags;
75 #define VXLAN_F_I			(1U << 27)
76 	uint32_t		vxlan_id;
77 #define VXLAN_VNI_SHIFT			8
78 #define VXLAN_VNI_MASK			(0xffffffU << VXLAN_VNI_SHIFT)
79 };
80 
81 #define VXLAN_VNI_MAX			0x00ffffffU
82 #define VXLAN_VNI_MIN			0x00000000U
83 
84 /*
85  * The driver.
86  */
87 
88 union vxlan_addr {
89 	struct in_addr		in4;
90 	struct in6_addr		in6;
91 };
92 
93 struct vxlan_softc;
94 
95 struct vxlan_peer {
96 	RBT_ENTRY(vxlan_peer)	 p_entry;
97 
98 	struct vxlan_header	 p_header;
99 	union vxlan_addr	 p_addr;
100 
101 	struct vxlan_softc	*p_sc;
102 };
103 
104 RBT_HEAD(vxlan_peers, vxlan_peer);
105 
106 struct vxlan_tep {
107 	TAILQ_ENTRY(vxlan_tep)	 vt_entry;
108 
109 	sa_family_t		 vt_af;
110 	unsigned int		 vt_rdomain;
111 	union vxlan_addr	 vt_addr;
112 #define vt_addr4 vt_addr.in4
113 #define vt_addr6 vt_addr.in6
114 	in_port_t		 vt_port;
115 
116 	struct socket		*vt_so;
117 
118 	struct mutex		 vt_mtx;
119 	struct vxlan_peers	 vt_peers;
120 };
121 
122 TAILQ_HEAD(vxlan_teps, vxlan_tep);
123 
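/*
 * The tunnel mode is derived from the addresses configured via
 * vxlan_set_tunnel():
 *
 *	local + unicast destination	-> VXLAN_TMODE_P2P
 *	local + multicast destination	-> VXLAN_TMODE_LEARNING
 *	local address only		-> VXLAN_TMODE_ENDPOINT, with
 *					   static entries added through
 *					   the SIOCBRDGSADDR ioctl
 */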
124 enum vxlan_tunnel_mode {
125 	VXLAN_TMODE_UNSET,
126 	VXLAN_TMODE_P2P,	 /* unicast destination, no learning */
127 	VXLAN_TMODE_LEARNING,	 /* multicast destination, learning */
128 	VXLAN_TMODE_ENDPOINT,	 /* unset destination, no learning */
129 };
130 
131 struct vxlan_softc {
132 	struct arpcom		 sc_ac;
133 	struct etherbridge	 sc_eb;
134 
135 	unsigned int		 sc_rdomain;
136 	sa_family_t		 sc_af;
137 	union vxlan_addr	 sc_src;
138 	union vxlan_addr	 sc_dst;
139 	in_port_t		 sc_port;
140 	struct vxlan_header	 sc_header;
141 	unsigned int		 sc_if_index0;
142 
143 	struct task		 sc_dtask;
144 	void			*sc_inmulti;
145 
146 	enum vxlan_tunnel_mode	 sc_mode;
147 	struct vxlan_peer	*sc_ucast_peer;
148 	struct vxlan_peer	*sc_mcast_peer;
149 	struct refcnt		 sc_refs;
150 
151 	uint16_t		 sc_df;
152 	int			 sc_ttl;
153 	int			 sc_txhprio;
154 	int			 sc_rxhprio;
155 
156 	struct task		 sc_send_task;
157 };
158 
159 void		vxlanattach(int);
160 
161 static int	vxlan_clone_create(struct if_clone *, int);
162 static int	vxlan_clone_destroy(struct ifnet *);
163 
164 static int	vxlan_output(struct ifnet *, struct mbuf *,
165 		    struct sockaddr *, struct rtentry *);
166 static int	vxlan_enqueue(struct ifnet *, struct mbuf *);
167 static void	vxlan_start(struct ifqueue *);
168 static void	vxlan_send(void *);
169 
170 static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
171 static int	vxlan_up(struct vxlan_softc *);
172 static int	vxlan_down(struct vxlan_softc *);
173 static int	vxlan_addmulti(struct vxlan_softc *, struct ifnet *);
174 static void	vxlan_delmulti(struct vxlan_softc *);
175 
176 static struct mbuf *
177 		vxlan_input(void *, struct mbuf *,
178 		    struct ip *, struct ip6_hdr *, void *, int);
179 
180 static int	vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *);
181 static int	vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *);
182 static int	vxlan_set_tunnel(struct vxlan_softc *,
183 		    const struct if_laddrreq *);
184 static int	vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *);
185 static int	vxlan_del_tunnel(struct vxlan_softc *);
186 static int	vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *);
187 static int	vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *);
188 static int	vxlan_del_vnetid(struct vxlan_softc *);
189 static int	vxlan_set_parent(struct vxlan_softc *,
190 		    const struct if_parent *);
191 static int	vxlan_get_parent(struct vxlan_softc *, struct if_parent *);
192 static int	vxlan_del_parent(struct vxlan_softc *);
193 
194 static int	vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *);
195 static int	vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *);
196 
197 static void	vxlan_detach_hook(void *);
198 
199 static struct if_clone vxlan_cloner =
200     IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);
201 
202 static int	 vxlan_eb_port_eq(void *, void *, void *);
203 static void	*vxlan_eb_port_take(void *, void *);
204 static void	 vxlan_eb_port_rele(void *, void *);
205 static size_t	 vxlan_eb_port_ifname(void *, char *, size_t, void *);
206 static void	 vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *);
207 
208 static const struct etherbridge_ops vxlan_etherbridge_ops = {
209 	vxlan_eb_port_eq,
210 	vxlan_eb_port_take,
211 	vxlan_eb_port_rele,
212 	vxlan_eb_port_ifname,
213 	vxlan_eb_port_sa,
214 };
215 
216 static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps");
217 static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps);
218 static struct pool vxlan_endpoint_pool;
219 
220 static inline int	vxlan_peer_cmp(const struct vxlan_peer *,
221 			    const struct vxlan_peer *);
222 
223 RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
224 
225 void
226 vxlanattach(int count)
227 {
228 	if_clone_attach(&vxlan_cloner);
229 }
230 
231 static int
232 vxlan_clone_create(struct if_clone *ifc, int unit)
233 {
234 	struct vxlan_softc *sc;
235 	struct ifnet *ifp;
236 	int error;
237 
238 	if (vxlan_endpoint_pool.pr_size == 0) {
239 		pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr),
240 		    0, IPL_SOFTNET, 0, "vxlanep", NULL);
241 	}
242 
243 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
244 	if (sc == NULL)
245 		return (ENOMEM);
246 
247 	ifp = &sc->sc_ac.ac_if;
248 
249 	snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d",
250 	    ifc->ifc_name, unit);
251 
252 	error = etherbridge_init(&sc->sc_eb, ifp->if_xname,
253 	    &vxlan_etherbridge_ops, sc);
254 	if (error != 0) {
255 		free(sc, M_DEVBUF, sizeof(*sc));
256 		return (error);
257 	}
258 
259 	sc->sc_af = AF_UNSPEC;
260 	sc->sc_txhprio = 0;
261 	sc->sc_rxhprio = IF_HDRPRIO_OUTER;
262 	sc->sc_df = 0;
263 	sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL;
264 
265 	task_set(&sc->sc_dtask, vxlan_detach_hook, sc);
266 	refcnt_init(&sc->sc_refs);
267 	task_set(&sc->sc_send_task, vxlan_send, sc);
268 
269 	ifp->if_softc = sc;
270 	ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
271 	ifp->if_ioctl = vxlan_ioctl;
272 	ifp->if_output = vxlan_output;
273 	ifp->if_enqueue = vxlan_enqueue;
274 	ifp->if_qstart = vxlan_start;
275 	ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
276 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
277 	ether_fakeaddr(ifp);
278 
279 	if_counters_alloc(ifp);
280 	if_attach(ifp);
281 	ether_ifattach(ifp);
282 
283 	return (0);
284 }
285 
286 static int
287 vxlan_clone_destroy(struct ifnet *ifp)
288 {
289 	struct vxlan_softc *sc = ifp->if_softc;
290 
291 	NET_LOCK();
292 	if (ISSET(ifp->if_flags, IFF_RUNNING))
293 		vxlan_down(sc);
294 	NET_UNLOCK();
295 
296 	ether_ifdetach(ifp);
297 	if_detach(ifp);
298 
299 	etherbridge_destroy(&sc->sc_eb);
300 
301 	refcnt_finalize(&sc->sc_refs, "vxlanfini");
302 
303 	free(sc, M_DEVBUF, sizeof(*sc));
304 
305 	return (0);
306 }
307 
308 static struct vxlan_softc *
309 vxlan_take(struct vxlan_softc *sc)
310 {
311 	refcnt_take(&sc->sc_refs);
312 	return (sc);
313 }
314 
315 static void
316 vxlan_rele(struct vxlan_softc *sc)
317 {
318 	refcnt_rele_wake(&sc->sc_refs);
319 }
320 
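/*
 * vxlan_encap() picks the remote tunnel endpoint for a frame (the
 * configured destination in P2P mode, otherwise a learnt entry from
 * the etherbridge table; unknown destinations are flooded to the
 * multicast group in learning mode and dropped in endpoint mode),
 * prepends the VXLAN and UDP headers, and tags the packet with
 * PACKET_TAG_GRE so vxlan_output() can detect a tunnel looping back
 * over its own interface.  The outer IP/IPv6 header is added by the
 * ip_encap callback supplied by vxlan_send().
 */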
321 static struct mbuf *
322 vxlan_encap(struct vxlan_softc *sc, struct mbuf *m,
323     struct mbuf *(ip_encap)(struct vxlan_softc *sc, struct mbuf *,
324     const union vxlan_addr *, uint8_t))
325 {
326 	struct ifnet *ifp = &sc->sc_ac.ac_if;
327 	struct m_tag *mtag;
328 	struct mbuf *m0;
329 	union vxlan_addr gateway;
330 	const union vxlan_addr *endpoint;
331 	struct vxlan_header *vh;
332 	struct udphdr *uh;
333 	int prio;
334 	uint8_t tos;
335 
336 	if (sc->sc_mode == VXLAN_TMODE_UNSET)
337 		goto drop;
338 
339 	if (sc->sc_mode == VXLAN_TMODE_P2P)
340 		endpoint = &sc->sc_dst;
341 	else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */
342 		struct ether_header *eh = mtod(m, struct ether_header *);
343 
344 		smr_read_enter();
345 		endpoint = etherbridge_resolve_ea(&sc->sc_eb,
346 		    (struct ether_addr *)eh->ether_dhost);
347 		if (endpoint != NULL) {
348 			gateway = *endpoint;
349 			endpoint = &gateway;
350 		}
351 		smr_read_leave();
352 
353 		if (endpoint == NULL) {
354 			if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
355 				goto drop;
356 
357 			/* "flood" to unknown destinations */
358 			endpoint = &sc->sc_dst;
359 		}
360 	}
361 
362 	/* force prepend mbuf because of payload alignment */
363 	m0 = m_get(M_DONTWAIT, m->m_type);
364 	if (m0 == NULL)
365 		goto drop;
366 
367 	m_align(m0, 0);
368 	m0->m_len = 0;
369 
370 	M_MOVE_PKTHDR(m0, m);
371 	m0->m_next = m;
372 
373 	m = m_prepend(m0, sizeof(*vh), M_DONTWAIT);
374 	if (m == NULL)
375 		return (NULL);
376 
377 	vh = mtod(m, struct vxlan_header *);
378 	*vh = sc->sc_header;
379 
380 	m = m_prepend(m, sizeof(*uh), M_DONTWAIT);
381 	if (m == NULL)
382 		return (NULL);
383 
384 	uh = mtod(m, struct udphdr *);
385 	uh->uh_sport = sc->sc_port; /* XXX */
386 	uh->uh_dport = sc->sc_port;
387 	htobem16(&uh->uh_ulen, m->m_pkthdr.len);
388 	uh->uh_sum = htons(0);
389 
390 	SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
391 
392 	mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT);
393 	if (mtag == NULL)
394 		goto drop;
395 
396 	*(int *)(mtag + 1) = ifp->if_index;
397 	m_tag_prepend(m, mtag);
398 
399 	prio = sc->sc_txhprio;
400 	if (prio == IF_HDRPRIO_PACKET)
401 		prio = m->m_pkthdr.pf.prio;
402 	tos = IFQ_PRIO2TOS(prio);
403 
404 	CLR(m->m_flags, M_BCAST|M_MCAST);
405 	m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
406 
407 #if NPF > 0
408 	pf_pkt_addr_changed(m);
409 #endif
410 
411 	return ((*ip_encap)(sc, m, endpoint, tos));
412 drop:
413 	m_freem(m);
414 	return (NULL);
415 }
416 
417 static struct mbuf *
418 vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m,
419     const union vxlan_addr *endpoint, uint8_t tos)
420 {
421 	struct ip *ip;
422 
423 	m = m_prepend(m, sizeof(*ip), M_DONTWAIT);
424 	if (m == NULL)
425 		return (NULL);
426 
427 	ip = mtod(m, struct ip *);
428 	ip->ip_v = IPVERSION;
429 	ip->ip_hl = sizeof(*ip) >> 2;
430 	ip->ip_off = sc->sc_df;
431 	ip->ip_tos = tos;
432 	ip->ip_len = htons(m->m_pkthdr.len);
433 	ip->ip_ttl = sc->sc_ttl;
434 	ip->ip_p = IPPROTO_UDP;
435 	ip->ip_src = sc->sc_src.in4;
436 	ip->ip_dst = endpoint->in4;
437 
438 	return (m);
439 }
440 
441 #ifdef INET6
442 static struct mbuf *
443 vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m,
444     const union vxlan_addr *endpoint, uint8_t tos)
445 {
446 	struct ip6_hdr *ip6;
447 	int len = m->m_pkthdr.len;
448 
449 	m = m_prepend(m, sizeof(*ip6), M_DONTWAIT);
450 	if (m == NULL)
451 		return (NULL);
452 
453 	ip6 = mtod(m, struct ip6_hdr *);
454 	ip6->ip6_flow = ISSET(m->m_pkthdr.csum_flags, M_FLOWID) ?
455 	    htonl(m->m_pkthdr.ph_flowid) : 0;
456 	ip6->ip6_vfc |= IPV6_VERSION;
457 	ip6->ip6_flow |= htonl((uint32_t)tos << 20);
458 	ip6->ip6_plen = htons(len);
459 	ip6->ip6_nxt = IPPROTO_UDP;
460 	ip6->ip6_hlim = sc->sc_ttl;
461 	ip6->ip6_src = sc->sc_src.in6;
462 	ip6->ip6_dst = endpoint->in6;
463 
464 	if (sc->sc_df)
465 		SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
466 
467 	return (m);
468 }
469 #endif /* INET6 */
470 
471 static int
472 vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
473     struct rtentry *rt)
474 {
475 	struct m_tag *mtag;
476 
477 	mtag = NULL;
478 	while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) {
479 		if (*(int *)(mtag + 1) == ifp->if_index) {
480 			m_freem(m);
481 			return (EIO);
482 		}
483 	}
484 
485 	return (ether_output(ifp, m, dst, rt));
486 }
487 
488 static int
489 vxlan_enqueue(struct ifnet *ifp, struct mbuf *m)
490 {
491 	struct vxlan_softc *sc = ifp->if_softc;
492 	struct ifqueue *ifq = &ifp->if_snd;
493 
494 	if (ifq_enqueue(ifq, m) != 0)
495 		return (ENOBUFS);
496 
497 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
498 
499 	return (0);
500 }
501 
502 static void
503 vxlan_start(struct ifqueue *ifq)
504 {
505 	struct ifnet *ifp = ifq->ifq_if;
506 	struct vxlan_softc *sc = ifp->if_softc;
507 
508 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
509 }
510 
511 static uint64_t
512 vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml)
513 {
514 	struct ip_moptions imo;
515 	struct mbuf *m;
516 	uint64_t oerrors = 0;
517 
518 	imo.imo_ifidx = sc->sc_if_index0;
519 	imo.imo_ttl = sc->sc_ttl;
520 	imo.imo_loop = 0;
521 
522 	NET_LOCK();
523 	while ((m = ml_dequeue(ml)) != NULL) {
524 		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0)
525 			oerrors++;
526 	}
527 	NET_UNLOCK();
528 
529 	return (oerrors);
530 }
531 
532 #ifdef INET6
533 static uint64_t
534 vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml)
535 {
536 	struct ip6_moptions im6o;
537 	struct mbuf *m;
538 	uint64_t oerrors = 0;
539 
540 	im6o.im6o_ifidx = sc->sc_if_index0;
541 	im6o.im6o_hlim = sc->sc_ttl;
542 	im6o.im6o_loop = 0;
543 
544 	NET_LOCK();
545 	while ((m = ml_dequeue(ml)) != NULL) {
546 		if (ip6_output(m, NULL, NULL, 0, &im6o, NULL) != 0)
547 			oerrors++;
548 	}
549 	NET_UNLOCK();
550 
551 	return (oerrors);
552 }
553 #endif /* INET6 */
554 
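/*
 * Transmit is deferred to sc_send_task on the softnet taskq:
 * if_enqueue/if_qstart only schedule the task, and vxlan_send()
 * drains if_snd, encapsulates each frame, and hands the resulting
 * list to ip_output()/ip6_output() under NET_LOCK.
 */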
555 static void
556 vxlan_send(void *arg)
557 {
558 	struct vxlan_softc *sc = arg;
559 	struct ifnet *ifp = &sc->sc_ac.ac_if;
560 	struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *,
561 	    const union vxlan_addr *, uint8_t);
562 	uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *);
563 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
564 	struct mbuf *m;
565 	uint64_t oerrors;
566 
567 	if (!ISSET(ifp->if_flags, IFF_RUNNING))
568 		return;
569 
570 	switch (sc->sc_af) {
571 	case AF_INET:
572 		ip_encap = vxlan_encap_ipv4;
573 		ip_send = vxlan_send_ipv4;
574 		break;
575 #ifdef INET6
576 	case AF_INET6:
577 		ip_encap = vxlan_encap_ipv6;
578 		ip_send = vxlan_send_ipv6;
579 		break;
580 #endif
581 	default:
582 		unhandled_af(sc->sc_af);
583 		/* NOTREACHED */
584 	}
585 
586 	while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) {
587 #if NBPFILTER > 0
588 		caddr_t if_bpf = READ_ONCE(ifp->if_bpf);
589 		if (if_bpf != NULL)
590 			bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT);
591 #endif
592 		m = vxlan_encap(sc, m, ip_encap);
593 		if (m == NULL)
594 			continue;
595 
596 		ml_enqueue(&ml, m);
597 	}
598 
599 	oerrors = (*ip_send)(sc, &ml);
600 
601 	counters_add(ifp->if_counters, ifc_oerrors, oerrors);
602 }
603 
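/*
 * vxlan_input() runs as the UDP socket upcall for a tunnel endpoint.
 * The peer is looked up by source address and VNI; if that fails, a
 * second lookup with a wildcard (all zeros) address matches interfaces
 * in learning or endpoint mode.  With link0 set, datagrams whose
 * source port differs from the configured port are dropped.  The outer
 * headers are stripped, the inner frame is realigned if needed, the
 * source MAC is learnt in learning mode, and the frame is injected
 * with if_vinput().
 */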
604 static struct mbuf *
605 vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6,
606     void *uhp, int hlen)
607 {
608 	struct vxlan_tep *vt = arg;
609 	union vxlan_addr addr;
610 	struct vxlan_peer key, *p;
611 	struct udphdr *uh;
612 	struct vxlan_header *vh;
613 	struct ether_header *eh;
614 	int vhlen = hlen + sizeof(*vh);
615 	struct mbuf *n;
616 	int off;
617 	in_port_t port;
618 	struct vxlan_softc *sc = NULL;
619 	struct ifnet *ifp;
620 	int rxhprio;
621 	uint8_t tos;
622 
623 	if (m->m_pkthdr.len < vhlen)
624 		goto drop;
625 
626 	uh = uhp;
627 	port = uh->uh_sport;
628 
629 	if (ip != NULL) {
630 		memset(&addr, 0, sizeof(addr));
631 		addr.in4 = ip->ip_src;
632 		tos = ip->ip_tos;
633 	}
634 #ifdef INET6
635 	else {
636 		addr.in6 = ip6->ip6_src;
637 		tos = bemtoh32(&ip6->ip6_flow) >> 20;
638 	}
639 #endif
640 
641 	if (m->m_len < vhlen) {
642 		m = m_pullup(m, vhlen);
643 		if (m == NULL)
644 			return (NULL);
645 	}
646 
647 	/* can't use ip/ip6/uh after this */
648 
649 	vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen);
650 
651 	memset(&key, 0, sizeof(key));
652 	key.p_addr = addr;
653 	key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I);
654 	key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK);
655 
656 	mtx_enter(&vt->vt_mtx);
657 	p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
658 	if (p == NULL) {
659 		memset(&key.p_addr, 0, sizeof(key.p_addr));
660 		p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
661 	}
662 	if (p != NULL)
663 		sc = vxlan_take(p->p_sc);
664 	mtx_leave(&vt->vt_mtx);
665 
666 	if (sc == NULL)
667 		goto drop;
668 
669 	ifp = &sc->sc_ac.ac_if;
670 	if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port)
671 		goto rele_drop;
672 
673 	m_adj(m, vhlen);
674 
675 	if (m->m_pkthdr.len < sizeof(*eh))
676 		goto rele_drop;
677 
678 	if (m->m_len < sizeof(*eh)) {
679 		m = m_pullup(m, sizeof(*eh));
680 		if (m == NULL)
681 			goto rele;
682 	}
683 
684 	n = m_getptr(m, sizeof(*eh), &off);
685 	if (n == NULL)
686 		goto rele_drop;
687 
688 	if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
689 		n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
690 		m_freem(m);
691 		if (n == NULL)
692 			goto rele;
693 		m = n;
694 	}
695 
696 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
697 		eh = mtod(m, struct ether_header *);
698 		etherbridge_map_ea(&sc->sc_eb, &addr,
699 		    (struct ether_addr *)eh->ether_shost);
700 	}
701 
702 	rxhprio = sc->sc_rxhprio;
703 	switch (rxhprio) {
704 	case IF_HDRPRIO_PACKET:
705 		/* nop */
706 		break;
707 	case IF_HDRPRIO_OUTER:
708 		m->m_pkthdr.pf.prio = IFQ_TOS2PRIO(tos);
709 		break;
710 	default:
711 		m->m_pkthdr.pf.prio = rxhprio;
712 		break;
713 	}
714 
715 	if_vinput(ifp, m);
716 rele:
717 	vxlan_rele(sc);
718 	return (NULL);
719 
720 rele_drop:
721 	vxlan_rele(sc);
722 drop:
723 	m_freem(m);
724 	return (NULL);
725 }
726 
727 static int
728 vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
729 {
730 	struct vxlan_softc *sc = ifp->if_softc;
731 	struct ifreq *ifr = (struct ifreq *)data;
732 	struct ifbrparam *bparam = (struct ifbrparam *)data;
733 	int error = 0;
734 
735 	switch (cmd) {
736 	case SIOCSIFADDR:
737 		break;
738 	case SIOCSIFFLAGS:
739 		if (ISSET(ifp->if_flags, IFF_UP)) {
740 			if (!ISSET(ifp->if_flags, IFF_RUNNING))
741 				error = vxlan_up(sc);
742 			else
743 				error = 0;
744 		} else {
745 			if (ISSET(ifp->if_flags, IFF_RUNNING))
746 				error = vxlan_down(sc);
747 		}
748 		break;
749 
750 	case SIOCSLIFPHYRTABLE:
751 		error = vxlan_set_rdomain(sc, ifr);
752 		break;
753 	case SIOCGLIFPHYRTABLE:
754 		error = vxlan_get_rdomain(sc, ifr);
755 		break;
756 
757 	case SIOCSLIFPHYADDR:
758 		error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data);
759 		break;
760 	case SIOCGLIFPHYADDR:
761 		error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data);
762 		break;
763 	case SIOCDIFPHYADDR:
764 		error = vxlan_del_tunnel(sc);
765 		break;
766 
767 	case SIOCSVNETID:
768 		error = vxlan_set_vnetid(sc, ifr);
769 		break;
770 	case SIOCGVNETID:
771 		error = vxlan_get_vnetid(sc, ifr);
772 		break;
773 	case SIOCDVNETID:
774 		error = vxlan_del_vnetid(sc);
775 		break;
776 
777 	case SIOCSIFPARENT:
778 		error = vxlan_set_parent(sc, (struct if_parent *)data);
779 		break;
780 	case SIOCGIFPARENT:
781 		error = vxlan_get_parent(sc, (struct if_parent *)data);
782 		break;
783 	case SIOCDIFPARENT:
784 		error = vxlan_del_parent(sc);
785 		break;
786 
787 	case SIOCSTXHPRIO:
788 		error = if_txhprio_l2_check(ifr->ifr_hdrprio);
789 		if (error != 0)
790 			break;
791 
792 		sc->sc_txhprio = ifr->ifr_hdrprio;
793 		break;
794 	case SIOCGTXHPRIO:
795 		ifr->ifr_hdrprio = sc->sc_txhprio;
796 		break;
797 
798 	case SIOCSRXHPRIO:
799 		error = if_rxhprio_l2_check(ifr->ifr_hdrprio);
800 		if (error != 0)
801 			break;
802 
803 		sc->sc_rxhprio = ifr->ifr_hdrprio;
804 		break;
805 	case SIOCGRXHPRIO:
806 		ifr->ifr_hdrprio = sc->sc_rxhprio;
807 		break;
808 
809 	case SIOCSLIFPHYDF:
810 		/* commit */
811 		sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
812 		break;
813 	case SIOCGLIFPHYDF:
814 		ifr->ifr_df = sc->sc_df ? 1 : 0;
815 		break;
816 
817 	case SIOCSLIFPHYTTL:
818 		if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) {
819 			error = EINVAL;
820 			break;
821 		}
822 
823 		/* commit */
824 		sc->sc_ttl = (uint8_t)ifr->ifr_ttl;
825 		break;
826 	case SIOCGLIFPHYTTL:
827 		ifr->ifr_ttl = (int)sc->sc_ttl;
828 		break;
829 
830 	case SIOCBRDGSCACHE:
831 		error = etherbridge_set_max(&sc->sc_eb, bparam);
832 		break;
833 	case SIOCBRDGGCACHE:
834 		error = etherbridge_get_max(&sc->sc_eb, bparam);
835 		break;
836 	case SIOCBRDGSTO:
837 		error = etherbridge_set_tmo(&sc->sc_eb, bparam);
838 		break;
839 	case SIOCBRDGGTO:
840 		error = etherbridge_get_tmo(&sc->sc_eb, bparam);
841 		break;
842 
843 	case SIOCBRDGRTS:
844 		error = etherbridge_rtfind(&sc->sc_eb,
845 		    (struct ifbaconf *)data);
846 		break;
847 	case SIOCBRDGFLUSH:
848 		etherbridge_flush(&sc->sc_eb,
849 		    ((struct ifbreq *)data)->ifbr_ifsflags);
850 		break;
851 	case SIOCBRDGSADDR:
852 		error = vxlan_add_addr(sc, (struct ifbareq *)data);
853 		break;
854 	case SIOCBRDGDADDR:
855 		error = vxlan_del_addr(sc, (struct ifbareq *)data);
856 		break;
857 
858 	case SIOCADDMULTI:
859 	case SIOCDELMULTI:
860 		/* no hardware to program */
861 		break;
862 
863 	default:
864 		error = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
865 		break;
866 	}
867 
868 	if (error == ENETRESET) {
869 		/* no hardware to program */
870 		error = 0;
871 	}
872 
873 	return (error);
874 }
875 
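/*
 * A vxlan_tep wraps the UDP socket bound to a local (af, rdomain,
 * address, port) tuple.  Interfaces sharing the same tuple share the
 * socket; their peers hang off vt_peers keyed by VNI and remote
 * address.  The global vxlan_teps list is protected by vxlan_lock,
 * the peer tree by vt_mtx.
 */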
876 static struct vxlan_tep *
877 vxlan_tep_get(struct vxlan_softc *sc, const union vxlan_addr *addr)
878 {
879 	struct vxlan_tep *vt;
880 
881 	TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) {
882 		if (sc->sc_af == vt->vt_af &&
883 		    sc->sc_rdomain == vt->vt_rdomain &&
884 		    memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 &&
885 		    sc->sc_port == vt->vt_port)
886 			return (vt);
887 	}
888 
889 	return (NULL);
890 }
891 
892 static int
893 vxlan_tep_add_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
894     struct vxlan_peer *p)
895 {
896 	struct mbuf m;
897 	struct vxlan_tep *vt;
898 	struct socket *so;
899 	struct sockaddr_in *sin;
900 #ifdef INET6
901 	struct sockaddr_in6 *sin6;
902 #endif
903 	int error;
904 
905 	vt = vxlan_tep_get(sc, addr);
906 	if (vt != NULL) {
907 		struct vxlan_peer *op;
908 
909 		mtx_enter(&vt->vt_mtx);
910 		op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
911 		mtx_leave(&vt->vt_mtx);
912 
913 		if (op != NULL)
914 			return (EADDRINUSE);
915 
916 		return (0);
917 	}
918 
919 	vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO);
920 	if (vt == NULL)
921 		return (ENOMEM);
922 
923 	vt->vt_af = sc->sc_af;
924 	vt->vt_rdomain = sc->sc_rdomain;
925 	vt->vt_addr = *addr;
926 	vt->vt_port = sc->sc_port;
927 
928 	mtx_init(&vt->vt_mtx, IPL_SOFTNET);
929 	RBT_INIT(vxlan_peers, &vt->vt_peers);
930 	RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
931 
932 	error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP);
933 	if (error != 0)
934 		goto free;
935 
936 	solock(so);
937 	sotoinpcb(so)->inp_upcall = vxlan_input;
938 	sotoinpcb(so)->inp_upcall_arg = vt;
939 	sounlock(so);
940 
941 	m_inithdr(&m);
942 	m.m_len = sizeof(vt->vt_rdomain);
943 	*mtod(&m, unsigned int *) = vt->vt_rdomain;
944 	error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m);
945 	if (error != 0)
946 		goto close;
947 
948 	m_inithdr(&m);
949 	switch (vt->vt_af) {
950 	case AF_INET:
951 		sin = mtod(&m, struct sockaddr_in *);
952 		memset(sin, 0, sizeof(*sin));
953 		sin->sin_len = sizeof(*sin);
954 		sin->sin_family = AF_INET;
955 		sin->sin_addr = addr->in4;
956 		sin->sin_port = vt->vt_port;
957 
958 		m.m_len = sizeof(*sin);
959 		break;
960 
961 #ifdef INET6
962 	case AF_INET6:
963 		sin6 = mtod(&m, struct sockaddr_in6 *);
		memset(sin6, 0, sizeof(*sin6));
964 		sin6->sin6_len = sizeof(*sin6);
965 		sin6->sin6_family = AF_INET6;
966 		in6_recoverscope(sin6, &addr->in6);
967 		sin6->sin6_port = sc->sc_port;
968 
969 		m.m_len = sizeof(*sin6);
970 		break;
971 #endif
972 	default:
973 		unhandled_af(vt->vt_af);
974 	}
975 
976 	solock(so);
977 	error = sobind(so, &m, curproc);
978 	sounlock(so);
979 	if (error != 0)
980 		goto close;
981 
982 	rw_assert_wrlock(&vxlan_lock);
983 	TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry);
984 
985 	vt->vt_so = so;
986 
987 	return (0);
988 
989 close:
990 	soclose(so, MSG_DONTWAIT);
991 free:
992 	free(vt, M_DEVBUF, sizeof(*vt));
993 	return (error);
994 }
995 
996 static void
997 vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
998     struct vxlan_peer *p)
999 {
1000 	struct vxlan_tep *vt;
1001 	int empty;
1002 
1003 	vt = vxlan_tep_get(sc, addr);
1004 	if (vt == NULL)
1005 		panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc);
1006 
1007 	mtx_enter(&vt->vt_mtx);
1008 	RBT_REMOVE(vxlan_peers, &vt->vt_peers, p);
1009 	empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers);
1010 	mtx_leave(&vt->vt_mtx);
1011 
1012 	if (!empty)
1013 		return;
1014 
1015 	rw_assert_wrlock(&vxlan_lock);
1016 	TAILQ_REMOVE(&vxlan_teps, vt, vt_entry);
1017 
1018 	soclose(vt->vt_so, MSG_DONTWAIT);
1019 	free(vt, M_DEVBUF, sizeof(*vt));
1020 }
1021 
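/*
 * vxlan_tep_up() registers this interface with the shared tunnel
 * endpoints: a unicast peer on the socket bound to the local source
 * address (keyed to the configured destination in P2P mode, wildcard
 * otherwise), and in learning mode a second peer on a socket bound to
 * the multicast group so flooded frames are delivered too.  The group
 * membership itself is handled by vxlan_addmulti().
 */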
1022 static int
1023 vxlan_tep_up(struct vxlan_softc *sc)
1024 {
1025 	struct vxlan_peer *up, *mp;
1026 	int error;
1027 
1028 	up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO);
1029 	if (up == NULL)
1030 		return (ENOMEM);
1031 
1032 	if (sc->sc_mode == VXLAN_TMODE_P2P)
1033 		up->p_addr = sc->sc_dst;
1034 	up->p_header = sc->sc_header;
1035 	up->p_sc = vxlan_take(sc);
1036 
1037 	error = vxlan_tep_add_addr(sc, &sc->sc_src, up);
1038 	if (error != 0)
1039 		goto freeup;
1040 
1041 	sc->sc_ucast_peer = up;
1042 
1043 	if (sc->sc_mode != VXLAN_TMODE_LEARNING)
1044 		return (0);
1045 
1046 	mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO);
1047 	if (mp == NULL) {
1048 		error = ENOMEM;
1049 		goto delup;
1050 	}
1051 
1052 	/* addr is multicast, leave it as 0s */
1053 	mp->p_header = sc->sc_header;
1054 	mp->p_sc = vxlan_take(sc);
1055 
1056 	/* destination address is a multicast group we want to join */
1057 	error = vxlan_tep_add_addr(sc, &sc->sc_dst, mp);
1058 	if (error != 0)
1059 		goto freemp;
1060 
1061 	sc->sc_mcast_peer = mp;
1062 
1063 	return (0);
1064 
1065 freemp:
1066 	vxlan_rele(mp->p_sc);
1067 	free(mp, M_DEVBUF, sizeof(*mp));
1068 delup:
1069 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1070 freeup:
1071 	vxlan_rele(up->p_sc);
1072 	free(up, M_DEVBUF, sizeof(*up));
1073 	return (error);
1074 }
1075 
1076 static void
1077 vxlan_tep_down(struct vxlan_softc *sc)
1078 {
1079 	struct vxlan_peer *up = sc->sc_ucast_peer;
1080 
1081 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1082 		struct vxlan_peer *mp = sc->sc_mcast_peer;
1083 		vxlan_tep_del_addr(sc, &sc->sc_dst, mp);
1084 		vxlan_rele(mp->p_sc);
1085 		free(mp, M_DEVBUF, sizeof(*mp));
1086 	}
1087 
1088 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1089 	vxlan_rele(up->p_sc);
1090 	free(up, M_DEVBUF, sizeof(*up));
1091 }
1092 
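/*
 * vxlan_up() temporarily drops the net lock so it can sleep taking
 * vxlan_lock, then re-checks IFF_RUNNING in case another thread won
 * the race.  In learning mode the multicast group is joined on the
 * parent interface and a detach hook is registered so the tunnel is
 * torn down if the parent goes away.
 */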
1093 static int
1094 vxlan_up(struct vxlan_softc *sc)
1095 {
1096 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1097 	struct ifnet *ifp0 = NULL;
1098 	int error;
1099 
1100 	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
1101 	NET_ASSERT_LOCKED();
1102 
1103 	if (sc->sc_af == AF_UNSPEC)
1104 		return (EDESTADDRREQ);
1105 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1106 
1107 	NET_UNLOCK();
1108 
1109 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1110 	if (error != 0)
1111 		goto netlock;
1112 
1113 	NET_LOCK();
1114 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1115 		/* something else beat us */
1116 		rw_exit(&vxlan_lock);
1117 		return (0);
1118 	}
1119 	NET_UNLOCK();
1120 
1121 	if (sc->sc_mode != VXLAN_TMODE_P2P) {
1122 		error = etherbridge_up(&sc->sc_eb);
1123 		if (error != 0)
1124 			goto unlock;
1125 	}
1126 
1127 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1128 		ifp0 = if_get(sc->sc_if_index0);
1129 		if (ifp0 == NULL) {
1130 			error = ENXIO;
1131 			goto down;
1132 		}
1133 
1134 		/* check again if multicast will work on top of the parent */
1135 		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1136 			error = EPROTONOSUPPORT;
1137 			goto put;
1138 		}
1139 
1140 		error = vxlan_addmulti(sc, ifp0);
1141 		if (error != 0)
1142 			goto put;
1143 
1144 		/* tear down the tunnel if the parent is detached */
1145 		if_detachhook_add(ifp0, &sc->sc_dtask);
1146 	} else {
1147 		if (sc->sc_if_index0 != 0) {
1148 			error = EPROTONOSUPPORT;
1149 			goto down;
1150 		}
1151 	}
1152 
1153 	error = vxlan_tep_up(sc);
1154 	if (error != 0)
1155 		goto del;
1156 
1157 	if_put(ifp0);
1158 
1159 	NET_LOCK();
1160 	SET(ifp->if_flags, IFF_RUNNING);
1161 	rw_exit(&vxlan_lock);
1162 
1163 	return (0);
1164 
1165 del:
1166 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1167 		if (ifp0 != NULL)
1168 			if_detachhook_del(ifp0, &sc->sc_dtask);
1169 		vxlan_delmulti(sc);
1170 	}
1171 put:
1172 	if_put(ifp0);
1173 down:
1174 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1175 		etherbridge_down(&sc->sc_eb);
1176 unlock:
1177 	rw_exit(&vxlan_lock);
1178 netlock:
1179 	NET_LOCK();
1180 
1181 	return (error);
1182 }
1183 
1184 static int
1185 vxlan_down(struct vxlan_softc *sc)
1186 {
1187 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1188 	struct ifnet *ifp0;
1189 	int error;
1190 
1191 	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1192 	NET_UNLOCK();
1193 
1194 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1195 	if (error != 0) {
1196 		NET_LOCK();
1197 		return (error);
1198 	}
1199 
1200 	NET_LOCK();
1201 	if (!ISSET(ifp->if_flags, IFF_RUNNING)) {
1202 		/* something else beat us */
1203 		rw_exit(&vxlan_lock);
1204 		return (0);
1205 	}
1206 	NET_UNLOCK();
1207 
1208 	vxlan_tep_down(sc);
1209 
1210 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1211 		vxlan_delmulti(sc);
1212 		ifp0 = if_get(sc->sc_if_index0);
1213 		if (ifp0 != NULL) {
1214 			if_detachhook_del(ifp0, &sc->sc_dtask);
1215 		}
1216 		if_put(ifp0);
1217 	}
1218 
1219 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1220 		etherbridge_down(&sc->sc_eb);
1221 
1222 	taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task);
1223 	NET_LOCK();
1224 	CLR(ifp->if_flags, IFF_RUNNING);
1225 	rw_exit(&vxlan_lock);
1226 
1227 	return (0);
1228 }
1229 
1230 static int
1231 vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0)
1232 {
1233 	int error = 0;
1234 
1235 	NET_LOCK();
1236 
1237 	switch (sc->sc_af) {
1238 	case AF_INET:
1239 		sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0);
1240 		if (sc->sc_inmulti == NULL)
1241 			error = EADDRNOTAVAIL;
1242 		break;
1243 #ifdef INET6
1244 	case AF_INET6:
1245 		sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error);
1246 		break;
1247 #endif
1248 	default:
1249 		unhandled_af(sc->sc_af);
1250 	}
1251 
1252 	NET_UNLOCK();
1253 
1254 	return (error);
1255 }
1256 
1257 static void
1258 vxlan_delmulti(struct vxlan_softc *sc)
1259 {
1260 	NET_LOCK();
1261 
1262 	switch (sc->sc_af) {
1263 	case AF_INET:
1264 		in_delmulti(sc->sc_inmulti);
1265 		break;
1266 #ifdef INET6
1267 	case AF_INET6:
1268 		in6_delmulti(sc->sc_inmulti);
1269 		break;
1270 #endif
1271 	default:
1272 		unhandled_af(sc->sc_af);
1273 	}
1274 
1275 	sc->sc_inmulti = NULL; /* keep it tidy */
1276 
1277 	NET_UNLOCK();
1278 }
1279 
1280 static int
1281 vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr)
1282 {
1283 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1284 
1285 	if (ifr->ifr_rdomainid < 0 ||
1286 	    ifr->ifr_rdomainid > RT_TABLEID_MAX)
1287 		return (EINVAL);
1288 	if (!rtable_exists(ifr->ifr_rdomainid))
1289 		return (EADDRNOTAVAIL);
1290 
1291 	if (sc->sc_rdomain == ifr->ifr_rdomainid)
1292 		return (0);
1293 
1294 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1295 		return (EBUSY);
1296 
1297 	/* commit */
1298 	sc->sc_rdomain = ifr->ifr_rdomainid;
1299 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1300 
1301 	return (0);
1302 }
1303 
1304 static int
1305 vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr)
1306 {
1307 	ifr->ifr_rdomainid = sc->sc_rdomain;
1308 
1309 	return (0);
1310 }
1311 
1312 static int
1313 vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req)
1314 {
1315 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1316 	struct sockaddr *src = (struct sockaddr *)&req->addr;
1317 	struct sockaddr *dst = (struct sockaddr *)&req->dstaddr;
1318 	struct sockaddr_in *src4, *dst4;
1319 #ifdef INET6
1320 	struct sockaddr_in6 *src6, *dst6;
1321 	int error;
1322 #endif
1323 	union vxlan_addr saddr, daddr;
1324 	unsigned int mode = VXLAN_TMODE_ENDPOINT;
1325 	in_port_t port = htons(VXLAN_PORT);
1326 
1327 	memset(&saddr, 0, sizeof(saddr));
1328 	memset(&daddr, 0, sizeof(daddr));
1329 
1330 	/* validate */
1331 	switch (src->sa_family) {
1332 	case AF_INET:
1333 		src4 = (struct sockaddr_in *)src;
1334 		if (in_nullhost(src4->sin_addr) ||
1335 		    IN_MULTICAST(src4->sin_addr.s_addr))
1336 			return (EINVAL);
1337 
1338 		if (src4->sin_port != htons(0))
1339 			port = src4->sin_port;
1340 
1341 		if (dst->sa_family != AF_UNSPEC) {
1342 			if (dst->sa_family != AF_INET)
1343 				return (EINVAL);
1344 
1345 			dst4 = (struct sockaddr_in *)dst;
1346 			if (in_nullhost(dst4->sin_addr))
1347 				return (EINVAL);
1348 
1349 			/* all good */
1350 			mode = IN_MULTICAST(dst4->sin_addr.s_addr) ?
1351 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1352 			daddr.in4 = dst4->sin_addr;
1353 		}
1354 
1355 		saddr.in4 = src4->sin_addr;
1356 		break;
1357 
1358 #ifdef INET6
1359 	case AF_INET6:
1360 		src6 = (struct sockaddr_in6 *)src;
1361 		if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) ||
1362 		    IN6_IS_ADDR_MULTICAST(&src6->sin6_addr))
1363 			return (EINVAL);
1364 
1365 		if (src6->sin6_port != htons(0))
1366 			port = src6->sin6_port;
1367 
1368 		if (dst->sa_family != AF_UNSPEC) {
1369 			if (dst->sa_family != AF_INET6)
1370 				return (EINVAL);
1371 
1372 			dst6 = (struct sockaddr_in6 *)dst;
1373 			if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr))
1374 				return (EINVAL);
1375 
1376 			if (src6->sin6_scope_id != dst6->sin6_scope_id)
1377 				return (EINVAL);
1378 
1379 			/* all good */
1380 			mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ?
1381 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1382 			error = in6_embedscope(&daddr.in6, dst6, NULL);
1383 			if (error != 0)
1384 				return (error);
1385 		}
1386 
1387 		error = in6_embedscope(&saddr.in6, src6, NULL);
1388 		if (error != 0)
1389 			return (error);
1390 
1391 		break;
1392 #endif
1393 	default:
1394 		return (EAFNOSUPPORT);
1395 	}
1396 
1397 	if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 &&
1398 	    memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 &&
1399 	    sc->sc_port == port)
1400 		return (0);
1401 
1402 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1403 		return (EBUSY);
1404 
1405 	/* commit */
1406 	sc->sc_af = src->sa_family;
1407 	sc->sc_src = saddr;
1408 	sc->sc_dst = daddr;
1409 	sc->sc_port = port;
1410 	sc->sc_mode = mode;
1411 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1412 
1413 	return (0);
1414 }
1415 
1416 static int
1417 vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req)
1418 {
1419 	struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr;
1420 	struct sockaddr_in *sin;
1421 #ifdef INET6
1422 	struct sockaddr_in6 *sin6;
1423 #endif
1424 
1425 	if (sc->sc_af == AF_UNSPEC)
1426 		return (EADDRNOTAVAIL);
1427 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1428 
1429 	memset(&req->addr, 0, sizeof(req->addr));
1430 	memset(&req->dstaddr, 0, sizeof(req->dstaddr));
1431 
1432 	/* default to endpoint */
1433 	dstaddr->sa_len = 2;
1434 	dstaddr->sa_family = AF_UNSPEC;
1435 
1436 	switch (sc->sc_af) {
1437 	case AF_INET:
1438 		sin = (struct sockaddr_in *)&req->addr;
1439 		sin->sin_len = sizeof(*sin);
1440 		sin->sin_family = AF_INET;
1441 		sin->sin_addr = sc->sc_src.in4;
1442 		sin->sin_port = sc->sc_port;
1443 
1444 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1445 			break;
1446 
1447 		sin = (struct sockaddr_in *)&req->dstaddr;
1448 		sin->sin_len = sizeof(*sin);
1449 		sin->sin_family = AF_INET;
1450 		sin->sin_addr = sc->sc_dst.in4;
1451 		break;
1452 
1453 #ifdef INET6
1454 	case AF_INET6:
1455 		sin6 = (struct sockaddr_in6 *)&req->addr;
1456 		sin6->sin6_len = sizeof(*sin6);
1457 		sin6->sin6_family = AF_INET6;
1458 		in6_recoverscope(sin6, &sc->sc_src.in6);
1459 		sin6->sin6_port = sc->sc_port;
1460 
1461 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1462 			break;
1463 
1464 		sin6 = (struct sockaddr_in6 *)&req->dstaddr;
1465 		sin6->sin6_len = sizeof(*sin6);
1466 		sin6->sin6_family = AF_INET6;
1467 		in6_recoverscope(sin6, &sc->sc_dst.in6);
1468 		break;
1469 #endif
1470 	default:
1471 		unhandled_af(sc->sc_af);
1472 	}
1473 
1474 	return (0);
1475 }
1476 
1477 static int
1478 vxlan_del_tunnel(struct vxlan_softc *sc)
1479 {
1480 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1481 
1482 	if (sc->sc_af == AF_UNSPEC)
1483 		return (0);
1484 
1485 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1486 		return (EBUSY);
1487 
1488 	/* commit */
1489 	sc->sc_af = AF_UNSPEC;
1490 	memset(&sc->sc_src, 0, sizeof(sc->sc_src));
1491 	memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
1492 	sc->sc_port = htons(0);
1493 	sc->sc_mode = VXLAN_TMODE_UNSET;
1494 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1495 
1496 	return (0);
1497 }
1498 
1499 static int
1500 vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr)
1501 {
1502 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1503 	uint32_t vni;
1504 
1505 	if (ifr->ifr_vnetid < VXLAN_VNI_MIN ||
1506 	    ifr->ifr_vnetid > VXLAN_VNI_MAX)
1507 		return (EINVAL);
1508 
1509 	vni = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT);
1510 	if (ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)) &&
1511 	    sc->sc_header.vxlan_id == vni)
1512 		return (0);
1513 
1514 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1515 		return (EBUSY);
1516 
1517 	/* commit */
1518 	SET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1519 	sc->sc_header.vxlan_id = vni;
1520 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1521 
1522 	return (0);
1523 }
1524 
1525 static int
1526 vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr)
1527 {
1528 	uint32_t vni;
1529 
1530 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1531 		return (EADDRNOTAVAIL);
1532 
1533 	vni = ntohl(sc->sc_header.vxlan_id);
1534 	vni &= VXLAN_VNI_MASK;
1535 	vni >>= VXLAN_VNI_SHIFT;
1536 
1537 	ifr->ifr_vnetid = vni;
1538 
1539 	return (0);
1540 }
1541 
1542 static int
1543 vxlan_del_vnetid(struct vxlan_softc *sc)
1544 {
1545 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1546 
1547 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1548 		return (0);
1549 
1550 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1551 		return (EBUSY);
1552 
1553 	/* commit */
1554 	CLR(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1555 	sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT);
1556 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1557 
1558 	return (0);
1559 }
1560 
1561 static int
1562 vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p)
1563 {
1564 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1565 	struct ifnet *ifp0;
1566 	int error = 0;
1567 
1568 	ifp0 = if_unit(p->ifp_parent);
1569 	if (ifp0 == NULL)
1570 		return (ENXIO);
1571 
1572 	if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1573 		error = ENXIO;
1574 		goto put;
1575 	}
1576 
1577 	if (sc->sc_if_index0 == ifp0->if_index)
1578 		goto put;
1579 
1580 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1581 		error = EBUSY;
1582 		goto put;
1583 	}
1584 
1585 	ifsetlro(ifp0, 0);
1586 
1587 	/* commit */
1588 	sc->sc_if_index0 = ifp0->if_index;
1589 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1590 
1591 put:
1592 	if_put(ifp0);
1593 	return (error);
1594 }
1595 
1596 static int
1597 vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p)
1598 {
1599 	struct ifnet *ifp0;
1600 	int error = 0;
1601 
1602 	ifp0 = if_get(sc->sc_if_index0);
1603 	if (ifp0 == NULL)
1604 		error = EADDRNOTAVAIL;
1605 	else
1606 		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
1607 	if_put(ifp0);
1608 
1609 	return (error);
1610 }
1611 
1612 static int
1613 vxlan_del_parent(struct vxlan_softc *sc)
1614 {
1615 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1616 
1617 	if (sc->sc_if_index0 == 0)
1618 		return (0);
1619 
1620 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1621 		return (EBUSY);
1622 
1623 	/* commit */
1624 	sc->sc_if_index0 = 0;
1625 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1626 
1627 	return (0);
1628 }
1629 
1630 static int
1631 vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1632 {
1633 	struct sockaddr_in *sin;
1634 #ifdef INET6
1635 	struct sockaddr_in6 *sin6;
1636 	struct sockaddr_in6 src6 = {
1637 		.sin6_len = sizeof(src6),
1638 		.sin6_family = AF_UNSPEC,
1639 	};
1640 	int error;
1641 #endif
1642 	union vxlan_addr endpoint;
1643 	unsigned int type;
1644 
1645 	switch (sc->sc_mode) {
1646 	case VXLAN_TMODE_UNSET:
1647 		return (ENOPROTOOPT);
1648 	case VXLAN_TMODE_P2P:
1649 		return (EPROTONOSUPPORT);
1650 	default:
1651 		break;
1652 	}
1653 
1654 	/* ignore ifba_ifsname */
1655 
1656 	if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK))
1657 		return (EINVAL);
1658 	switch (ifba->ifba_flags & IFBAF_TYPEMASK) {
1659 	case IFBAF_DYNAMIC:
1660 		type = EBE_DYNAMIC;
1661 		break;
1662 	case IFBAF_STATIC:
1663 		type = EBE_STATIC;
1664 		break;
1665 	default:
1666 		return (EINVAL);
1667 	}
1668 
1669 	memset(&endpoint, 0, sizeof(endpoint));
1670 
1671 	if (ifba->ifba_dstsa.ss_family != sc->sc_af)
1672 		return (EAFNOSUPPORT);
1673 	switch (ifba->ifba_dstsa.ss_family) {
1674 	case AF_INET:
1675 		sin = (struct sockaddr_in *)&ifba->ifba_dstsa;
1676 		if (in_nullhost(sin->sin_addr) ||
1677 		    IN_MULTICAST(sin->sin_addr.s_addr))
1678 			return (EADDRNOTAVAIL);
1679 
1680 		if (sin->sin_port != htons(0))
1681 			return (EADDRNOTAVAIL);
1682 
1683 		endpoint.in4 = sin->sin_addr;
1684 		break;
1685 
1686 #ifdef INET6
1687 	case AF_INET6:
1688 		sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa;
1689 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
1690 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
1691 			return (EADDRNOTAVAIL);
1692 
1693 		in6_recoverscope(&src6, &sc->sc_src.in6);
1694 		if (src6.sin6_scope_id != sin6->sin6_scope_id)
1695 			return (EADDRNOTAVAIL);
1696 
1697 		if (sin6->sin6_port != htons(0))
1698 			return (EADDRNOTAVAIL);
1699 
1700 		error = in6_embedscope(&endpoint.in6, sin6, NULL);
1701 		if (error != 0)
1702 			return (error);
1703 
1704 		break;
1705 #endif
1706 	default: /* AF_UNSPEC */
1707 		return (EADDRNOTAVAIL);
1708 	}
1709 
1710 	return (etherbridge_add_addr(&sc->sc_eb, &endpoint,
1711 	    &ifba->ifba_dst, type));
1712 }
1713 
1714 static int
1715 vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1716 {
1717 	return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst));
1718 }
1719 
1720 void
1721 vxlan_detach_hook(void *arg)
1722 {
1723 	struct vxlan_softc *sc = arg;
1724 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1725 
1726 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1727 		vxlan_down(sc);
1728 		CLR(ifp->if_flags, IFF_UP);
1729 	}
1730 
1731 	sc->sc_if_index0 = 0;
1732 }
1733 
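/*
 * etherbridge glue: the "port" attached to each learnt MAC address is
 * a union vxlan_addr naming the remote tunnel endpoint, copied in and
 * out of vxlan_endpoint_pool.
 */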
1734 static int
1735 vxlan_eb_port_eq(void *arg, void *a, void *b)
1736 {
1737 	const union vxlan_addr *va = a, *vb = b;
1738 	size_t i;
1739 
1740 	for (i = 0; i < nitems(va->in6.s6_addr32); i++) {
1741 		if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i])
1742 			return (0);
1743 	}
1744 
1745 	return (1);
1746 }
1747 
1748 static void *
1749 vxlan_eb_port_take(void *arg, void *port)
1750 {
1751 	union vxlan_addr *endpoint;
1752 
1753 	endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT);
1754 	if (endpoint == NULL)
1755 		return (NULL);
1756 
1757 	*endpoint = *(union vxlan_addr *)port;
1758 
1759 	return (endpoint);
1760 }
1761 
1762 static void
1763 vxlan_eb_port_rele(void *arg, void *port)
1764 {
1765 	union vxlan_addr *endpoint = port;
1766 
1767 	pool_put(&vxlan_endpoint_pool, endpoint);
1768 }
1769 
1770 static size_t
1771 vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port)
1772 {
1773 	struct vxlan_softc *sc = arg;
1774 
1775 	return (strlcpy(dst, sc->sc_ac.ac_if.if_xname, len));
1776 }
1777 
1778 static void
1779 vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port)
1780 {
1781 	struct vxlan_softc *sc = arg;
1782 	union vxlan_addr *endpoint = port;
1783 
1784 	switch (sc->sc_af) {
1785 	case AF_INET: {
1786 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
1787 
1788 		sin->sin_len = sizeof(*sin);
1789 		sin->sin_family = AF_INET;
1790 		sin->sin_addr = endpoint->in4;
1791 		break;
1792 	}
1793 #ifdef INET6
1794 	case AF_INET6: {
1795 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
1796 
1797 		sin6->sin6_len = sizeof(*sin6);
1798 		sin6->sin6_family = AF_INET6;
1799 		in6_recoverscope(sin6, &endpoint->in6);
1800 		break;
1801 	}
1802 #endif /* INET6 */
1803 	default:
1804 		unhandled_af(sc->sc_af);
1805 	}
1806 }
1807 
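/*
 * Total order over (VNI, flags, remote address) for the peer tree.
 * Addresses are compared as full 128-bit values; IPv4 addresses
 * occupy the first 32 bits of the union with the rest zeroed.
 */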
1808 static inline int
1809 vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp)
1810 {
1811 	size_t i;
1812 
1813 	if (ap->p_header.vxlan_id > bp->p_header.vxlan_id)
1814 		return (1);
1815 	if (ap->p_header.vxlan_id < bp->p_header.vxlan_id)
1816 		return (-1);
1817 	if (ap->p_header.vxlan_flags > bp->p_header.vxlan_flags)
1818 		return (1);
1819 	if (ap->p_header.vxlan_flags < bp->p_header.vxlan_flags)
1820 		return (-1);
1821 
1822 	for (i = 0; i < nitems(ap->p_addr.in6.s6_addr32); i++) {
1823 		if (ap->p_addr.in6.s6_addr32[i] >
1824 		    bp->p_addr.in6.s6_addr32[i])
1825 			return (1);
1826 		if (ap->p_addr.in6.s6_addr32[i] <
1827 		    bp->p_addr.in6.s6_addr32[i])
1828 			return (-1);
1829 	}
1830 
1831 	return (0);
1832 }
1833 
1834 RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
1835