xref: /netbsd/sys/netinet/ip_encap.c (revision bde10826)
1 /*	$NetBSD: ip_encap.c,v 1.77 2022/12/07 08:33:02 knakahara Exp $	*/
2 /*	$KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 /*
33  * My grandfather said that there's a devil inside tunnelling technology...
34  *
35  * We have surprisingly many protocols that want packets with IP protocol
36  * #4 or #41.  Here's a list of protocols that want protocol #41:
37  *	RFC1933 configured tunnel
38  *	RFC1933 automatic tunnel
39  *	RFC2401 IPsec tunnel
40  *	RFC2473 IPv6 generic packet tunnelling
41  *	RFC2529 6over4 tunnel
42  *	RFC3056 6to4 tunnel
43  *	isatap tunnel
44  *	mobile-ip6 (uses RFC2473)
45  * Here's a list of protocols that want protocol #4:
46  *	RFC1853 IPv4-in-IPv4 tunnelling
47  *	RFC2003 IPv4 encapsulation within IPv4
48  *	RFC2344 reverse tunnelling for mobile-ip4
49  *	RFC2401 IPsec tunnel
50  * Well, what can I say.  They impose different en/decapsulation mechanisms
51  * from each other, so they need separate protocol handlers.  The only one
52  * we can easily determine by protocol # is IPsec, which always has an
53  * AH/ESP/IPComp header right after the outer IP header.
54  *
55  * So, clearly good old protosw does not work for protocol #4 and #41.
56  * The code will let you match protocol via src/dst address pair.
57  */
58 /* XXX is M_NETADDR correct? */
59 
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.77 2022/12/07 08:33:02 knakahara Exp $");
62 
63 #ifdef _KERNEL_OPT
64 #include "opt_mrouting.h"
65 #include "opt_inet.h"
66 #include "opt_net_mpsafe.h"
67 #endif
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/socket.h>
72 #include <sys/socketvar.h> /* for softnet_lock */
73 #include <sys/sockio.h>
74 #include <sys/mbuf.h>
75 #include <sys/errno.h>
76 #include <sys/queue.h>
77 #include <sys/kmem.h>
78 #include <sys/mutex.h>
79 #include <sys/condvar.h>
80 #include <sys/psref.h>
81 #include <sys/pslist.h>
82 #include <sys/thmap.h>
83 
84 #include <net/if.h>
85 
86 #include <netinet/in.h>
87 #include <netinet/in_systm.h>
88 #include <netinet/ip.h>
89 #include <netinet/ip_var.h>
90 #include <netinet/ip_encap.h>
91 #ifdef MROUTING
92 #include <netinet/ip_mroute.h>
93 #endif /* MROUTING */
94 
95 #ifdef INET6
96 #include <netinet/ip6.h>
97 #include <netinet6/ip6_var.h>
98 #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */
99 #include <netinet6/in6_var.h>
100 #include <netinet6/in6_pcb.h>
101 #include <netinet/icmp6.h>
102 #endif
103 
104 #ifdef NET_MPSAFE
105 #define ENCAP_MPSAFE	1
106 #endif
107 
108 enum direction { INBOUND, OUTBOUND };
109 
110 #ifdef INET
111 static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
112     struct psref *);
113 #endif
114 #ifdef INET6
115 static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
116     struct psref *);
117 #endif
118 static int encap_add(struct encaptab *);
119 static int encap_remove(struct encaptab *);
120 static void encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
121 static void encap_key_init(struct encap_key *, const struct sockaddr *,
122     const struct sockaddr *);
123 static void encap_key_inc(struct encap_key *);
124 
125 /*
126  * In encap[46]_lookup(), ep->func can sleep(e.g. rtalloc1) while walking
127  * encap_table. So, it cannot use pserialize_read_enter()
128  */
129 static struct {
130 	struct pslist_head	list;
131 	pserialize_t		psz;
132 	struct psref_class	*elem_class; /* for the element of et_list */
133 } encaptab  __cacheline_aligned = {
134 	.list = PSLIST_INITIALIZER,
135 };
136 #define encap_table encaptab.list
137 
138 static struct {
139 	kmutex_t	lock;
140 	kcondvar_t	cv;
141 	struct lwp	*busy;
142 } encap_whole __cacheline_aligned;
143 
144 static thmap_t *encap_map[2];	/* 0 for AF_INET, 1 for AF_INET6 */
145 
146 static bool encap_initialized = false;
147 /*
148  * must be done before other encap interfaces initialization.
149  */
150 void
encapinit(void)151 encapinit(void)
152 {
153 
154 	if (encap_initialized)
155 		return;
156 
157 	encaptab.psz = pserialize_create();
158 	encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET);
159 
160 	mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE);
161 	cv_init(&encap_whole.cv, "ip_encap cv");
162 	encap_whole.busy = NULL;
163 
164 	encap_initialized = true;
165 }
166 
167 void
encap_init(void)168 encap_init(void)
169 {
170 	static int initialized = 0;
171 
172 	if (initialized)
173 		return;
174 	initialized++;
175 #if 0
176 	/*
177 	 * we cannot use LIST_INIT() here, since drivers may want to call
178 	 * encap_attach(), on driver attach.  encap_init() will be called
179 	 * on AF_INET{,6} initialization, which happens after driver
180 	 * initialization - using LIST_INIT() here can nuke encap_attach()
181 	 * from drivers.
182 	 */
183 	PSLIST_INIT(&encap_table);
184 #endif
185 
186 	encap_map[0] = thmap_create(0, NULL, THMAP_NOCOPY);
187 #ifdef INET6
188 	encap_map[1] = thmap_create(0, NULL, THMAP_NOCOPY);
189 #endif
190 }
191 
192 #ifdef INET
193 static struct encaptab *
encap4_lookup(struct mbuf * m,int off,int proto,enum direction dir,struct psref * match_psref)194 encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
195     struct psref *match_psref)
196 {
197 	struct ip *ip;
198 	struct ip_pack4 pack;
199 	struct encaptab *ep, *match;
200 	int prio, matchprio;
201 	int s;
202 	thmap_t *emap = encap_map[0];
203 	struct encap_key key;
204 
205 	KASSERT(m->m_len >= sizeof(*ip));
206 
207 	ip = mtod(m, struct ip *);
208 
209 	memset(&pack, 0, sizeof(pack));
210 	pack.p.sp_len = sizeof(pack);
211 	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
212 	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
213 	if (dir == INBOUND) {
214 		pack.mine.sin_addr = ip->ip_dst;
215 		pack.yours.sin_addr = ip->ip_src;
216 	} else {
217 		pack.mine.sin_addr = ip->ip_src;
218 		pack.yours.sin_addr = ip->ip_dst;
219 	}
220 
221 	match = NULL;
222 	matchprio = 0;
223 
224 	s = pserialize_read_enter();
225 
226 	encap_key_init(&key, sintosa(&pack.mine), sintosa(&pack.yours));
227 	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
228 		struct psref elem_psref;
229 
230 		KASSERT(ep->af == AF_INET);
231 
232 		if (ep->proto >= 0 && ep->proto != proto) {
233 			encap_key_inc(&key);
234 			continue;
235 		}
236 
237 		psref_acquire(&elem_psref, &ep->psref,
238 		    encaptab.elem_class);
239 		if (ep->func) {
240 			pserialize_read_exit(s);
241 			prio = (*ep->func)(m, off, proto, ep->arg);
242 			s = pserialize_read_enter();
243 		} else {
244 			prio = pack.mine.sin_len + pack.yours.sin_len;
245 		}
246 
247 		if (prio <= 0) {
248 			psref_release(&elem_psref, &ep->psref,
249 			    encaptab.elem_class);
250 			encap_key_inc(&key);
251 			continue;
252 		}
253 		if (prio > matchprio) {
254 			/* release last matched ep */
255 			if (match != NULL)
256 				psref_release(match_psref, &match->psref,
257 				    encaptab.elem_class);
258 
259 			psref_copy(match_psref, &elem_psref,
260 			    encaptab.elem_class);
261 			matchprio = prio;
262 			match = ep;
263 		}
264 
265 		psref_release(&elem_psref, &ep->psref,
266 		    encaptab.elem_class);
267 		encap_key_inc(&key);
268 	}
269 
270 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
271 		struct psref elem_psref;
272 
273 		if (ep->af != AF_INET)
274 			continue;
275 		if (ep->proto >= 0 && ep->proto != proto)
276 			continue;
277 
278 		psref_acquire(&elem_psref, &ep->psref,
279 		    encaptab.elem_class);
280 		pserialize_read_exit(s);
281 		/* ep->func is sleepable. e.g. rtalloc1 */
282 		prio = (*ep->func)(m, off, proto, ep->arg);
283 		s = pserialize_read_enter();
284 
285 		/*
286 		 * We prioritize the matches by using bit length of the
287 		 * matches.  user-supplied matching function
288 		 * should return the bit length of the matches (for example,
289 		 * if both src/dst are matched for IPv4, 64 should be returned).
290 		 * 0 or negative return value means "it did not match".
291 		 *
292 		 * We need to loop through all the possible candidates
293 		 * to get the best match - the search takes O(n) for
294 		 * n attachments (i.e. interfaces).
295 		 */
296 		if (prio <= 0) {
297 			psref_release(&elem_psref, &ep->psref,
298 			    encaptab.elem_class);
299 			continue;
300 		}
301 		if (prio > matchprio) {
302 			/* release last matched ep */
303 			if (match != NULL)
304 				psref_release(match_psref, &match->psref,
305 				    encaptab.elem_class);
306 
307 			psref_copy(match_psref, &elem_psref,
308 			    encaptab.elem_class);
309 			matchprio = prio;
310 			match = ep;
311 		}
312 		KASSERTMSG((match == NULL) || psref_held(&match->psref,
313 			encaptab.elem_class),
314 		    "current match = %p, but not hold its psref", match);
315 
316 		psref_release(&elem_psref, &ep->psref,
317 		    encaptab.elem_class);
318 	}
319 	pserialize_read_exit(s);
320 
321 	return match;
322 }
323 
324 void
encap4_input(struct mbuf * m,int off,int proto)325 encap4_input(struct mbuf *m, int off, int proto)
326 {
327 	const struct encapsw *esw;
328 	struct encaptab *match;
329 	struct psref match_psref;
330 
331 	match = encap4_lookup(m, off, proto, INBOUND, &match_psref);
332 	if (match) {
333 		/* found a match, "match" has the best one */
334 		esw = match->esw;
335 		if (esw && esw->encapsw4.pr_input) {
336 			(*esw->encapsw4.pr_input)(m, off, proto, match->arg);
337 			psref_release(&match_psref, &match->psref,
338 			    encaptab.elem_class);
339 		} else {
340 			psref_release(&match_psref, &match->psref,
341 			    encaptab.elem_class);
342 			m_freem(m);
343 		}
344 		return;
345 	}
346 
347 	/* last resort: inject to raw socket */
348 	SOFTNET_LOCK_IF_NET_MPSAFE();
349 	rip_input(m, off, proto);
350 	SOFTNET_UNLOCK_IF_NET_MPSAFE();
351 }
352 #endif
353 
354 #ifdef INET6
355 static struct encaptab *
encap6_lookup(struct mbuf * m,int off,int proto,enum direction dir,struct psref * match_psref)356 encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
357     struct psref *match_psref)
358 {
359 	struct ip6_hdr *ip6;
360 	struct ip_pack6 pack;
361 	int prio, matchprio;
362 	int s;
363 	struct encaptab *ep, *match;
364 	thmap_t *emap = encap_map[1];
365 	struct encap_key key;
366 
367 	KASSERT(m->m_len >= sizeof(*ip6));
368 
369 	ip6 = mtod(m, struct ip6_hdr *);
370 
371 	memset(&pack, 0, sizeof(pack));
372 	pack.p.sp_len = sizeof(pack);
373 	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
374 	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
375 	if (dir == INBOUND) {
376 		pack.mine.sin6_addr = ip6->ip6_dst;
377 		pack.yours.sin6_addr = ip6->ip6_src;
378 	} else {
379 		pack.mine.sin6_addr = ip6->ip6_src;
380 		pack.yours.sin6_addr = ip6->ip6_dst;
381 	}
382 
383 	match = NULL;
384 	matchprio = 0;
385 
386 	s = pserialize_read_enter();
387 
388 	encap_key_init(&key, sin6tosa(&pack.mine), sin6tosa(&pack.yours));
389 	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
390 		struct psref elem_psref;
391 
392 		KASSERT(ep->af == AF_INET6);
393 
394 		if (ep->proto >= 0 && ep->proto != proto) {
395 			encap_key_inc(&key);
396 			continue;
397 		}
398 
399 		psref_acquire(&elem_psref, &ep->psref,
400 		    encaptab.elem_class);
401 		if (ep->func) {
402 			pserialize_read_exit(s);
403 			prio = (*ep->func)(m, off, proto, ep->arg);
404 			s = pserialize_read_enter();
405 		} else {
406 			prio = pack.mine.sin6_len + pack.yours.sin6_len;
407 		}
408 
409 		if (prio <= 0) {
410 			psref_release(&elem_psref, &ep->psref,
411 			    encaptab.elem_class);
412 			encap_key_inc(&key);
413 			continue;
414 		}
415 		if (prio > matchprio) {
416 			/* release last matched ep */
417 			if (match != NULL)
418 				psref_release(match_psref, &match->psref,
419 				    encaptab.elem_class);
420 
421 			psref_copy(match_psref, &elem_psref,
422 			    encaptab.elem_class);
423 			matchprio = prio;
424 			match = ep;
425 		}
426 		psref_release(&elem_psref, &ep->psref,
427 		    encaptab.elem_class);
428 		encap_key_inc(&key);
429 	}
430 
431 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
432 		struct psref elem_psref;
433 
434 		if (ep->af != AF_INET6)
435 			continue;
436 		if (ep->proto >= 0 && ep->proto != proto)
437 			continue;
438 
439 		psref_acquire(&elem_psref, &ep->psref,
440 		    encaptab.elem_class);
441 
442 		pserialize_read_exit(s);
443 		/* ep->func is sleepable. e.g. rtalloc1 */
444 		prio = (*ep->func)(m, off, proto, ep->arg);
445 		s = pserialize_read_enter();
446 
447 		/* see encap4_lookup() for issues here */
448 		if (prio <= 0) {
449 			psref_release(&elem_psref, &ep->psref,
450 			    encaptab.elem_class);
451 			continue;
452 		}
453 		if (prio > matchprio) {
454 			/* release last matched ep */
455 			if (match != NULL)
456 				psref_release(match_psref, &match->psref,
457 				    encaptab.elem_class);
458 
459 			psref_copy(match_psref, &elem_psref,
460 			    encaptab.elem_class);
461 			matchprio = prio;
462 			match = ep;
463 		}
464 		KASSERTMSG((match == NULL) || psref_held(&match->psref,
465 			encaptab.elem_class),
466 		    "current match = %p, but not hold its psref", match);
467 
468 		psref_release(&elem_psref, &ep->psref,
469 		    encaptab.elem_class);
470 	}
471 	pserialize_read_exit(s);
472 
473 	return match;
474 }
475 
476 int
encap6_input(struct mbuf ** mp,int * offp,int proto)477 encap6_input(struct mbuf **mp, int *offp, int proto)
478 {
479 	struct mbuf *m = *mp;
480 	const struct encapsw *esw;
481 	struct encaptab *match;
482 	struct psref match_psref;
483 	int rv;
484 
485 	match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref);
486 
487 	if (match) {
488 		/* found a match */
489 		esw = match->esw;
490 		if (esw && esw->encapsw6.pr_input) {
491 			int ret;
492 			ret = (*esw->encapsw6.pr_input)(mp, offp, proto,
493 			    match->arg);
494 			psref_release(&match_psref, &match->psref,
495 			    encaptab.elem_class);
496 			return ret;
497 		} else {
498 			psref_release(&match_psref, &match->psref,
499 			    encaptab.elem_class);
500 			m_freem(m);
501 			return IPPROTO_DONE;
502 		}
503 	}
504 
505 	/* last resort: inject to raw socket */
506 	SOFTNET_LOCK_IF_NET_MPSAFE();
507 	rv = rip6_input(mp, offp, proto);
508 	SOFTNET_UNLOCK_IF_NET_MPSAFE();
509 	return rv;
510 }
511 #endif
512 
513 static int
encap_add(struct encaptab * ep)514 encap_add(struct encaptab *ep)
515 {
516 
517 	KASSERT(encap_lock_held());
518 
519 	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);
520 
521 	return 0;
522 }
523 
524 static int
encap_remove(struct encaptab * ep)525 encap_remove(struct encaptab *ep)
526 {
527 	int error = 0;
528 
529 	KASSERT(encap_lock_held());
530 
531 	PSLIST_WRITER_REMOVE(ep, chain);
532 
533 	return error;
534 }
535 
536 static void
encap_afcheck(int af,const struct sockaddr * sp,const struct sockaddr * dp)537 encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp)
538 {
539 
540 	KASSERT(sp != NULL && dp != NULL);
541 	KASSERT(sp->sa_len == dp->sa_len);
542 	KASSERT(af == sp->sa_family && af == dp->sa_family);
543 
544 	socklen_t len __diagused = sockaddr_getsize_by_family(af);
545 	KASSERT(len != 0 && len == sp->sa_len && len == dp->sa_len);
546 }
547 
548 const struct encaptab *
encap_attach_func(int af,int proto,encap_priofunc_t * func,const struct encapsw * esw,void * arg)549 encap_attach_func(int af, int proto,
550     encap_priofunc_t *func,
551     const struct encapsw *esw, void *arg)
552 {
553 	struct encaptab *ep;
554 	int error;
555 #ifndef ENCAP_MPSAFE
556 	int s;
557 
558 	s = splsoftnet();
559 #endif
560 
561 	ASSERT_SLEEPABLE();
562 
563 	/* sanity check on args */
564 	KASSERT(func != NULL);
565 	KASSERT(af == AF_INET
566 #ifdef INET6
567 	    || af == AF_INET6
568 #endif
569 	);
570 
571 	ep = kmem_alloc(sizeof(*ep), KM_SLEEP);
572 	memset(ep, 0, sizeof(*ep));
573 
574 	ep->af = af;
575 	ep->proto = proto;
576 	ep->func = func;
577 	ep->esw = esw;
578 	ep->arg = arg;
579 	psref_target_init(&ep->psref, encaptab.elem_class);
580 
581 	error = encap_add(ep);
582 	if (error)
583 		goto gc;
584 
585 	error = 0;
586 #ifndef ENCAP_MPSAFE
587 	splx(s);
588 #endif
589 	return ep;
590 
591 gc:
592 	kmem_free(ep, sizeof(*ep));
593 #ifndef ENCAP_MPSAFE
594 	splx(s);
595 #endif
596 	return NULL;
597 }
598 
599 static void
encap_key_init(struct encap_key * key,const struct sockaddr * local,const struct sockaddr * remote)600 encap_key_init(struct encap_key *key,
601     const struct sockaddr *local, const struct sockaddr *remote)
602 {
603 
604 	memset(key, 0, sizeof(*key));
605 
606 	sockaddr_copy(&key->local_sa, sizeof(key->local_u), local);
607 	sockaddr_copy(&key->remote_sa, sizeof(key->remote_u), remote);
608 }
609 
610 static void
encap_key_inc(struct encap_key * key)611 encap_key_inc(struct encap_key *key)
612 {
613 
614 	(key->seq)++;
615 }
616 
617 static void
encap_key_dec(struct encap_key * key)618 encap_key_dec(struct encap_key *key)
619 {
620 
621 	(key->seq)--;
622 }
623 
624 static void
encap_key_copy(struct encap_key * dst,const struct encap_key * src)625 encap_key_copy(struct encap_key *dst, const struct encap_key *src)
626 {
627 
628 	memset(dst, 0, sizeof(*dst));
629 	*dst = *src;
630 }
631 
632 /*
633  * src is always my side, and dst is always remote side.
634  * Return value will be necessary as input (cookie) for encap_detach().
635  */
636 const struct encaptab *
encap_attach_addr(int af,int proto,const struct sockaddr * src,const struct sockaddr * dst,encap_priofunc_t * func,const struct encapsw * esw,void * arg)637 encap_attach_addr(int af, int proto,
638     const struct sockaddr *src, const struct sockaddr *dst,
639     encap_priofunc_t *func,
640     const struct encapsw *esw, void *arg)
641 {
642 	struct encaptab *ep;
643 	size_t l;
644 	thmap_t *emap;
645 	void *retep;
646 	struct ip_pack4 *pack4;
647 #ifdef INET6
648 	struct ip_pack6 *pack6;
649 #endif
650 
651 	ASSERT_SLEEPABLE();
652 
653 	encap_afcheck(af, src, dst);
654 
655 	switch (af) {
656 	case AF_INET:
657 		l = sizeof(*pack4);
658 		emap = encap_map[0];
659 		break;
660 #ifdef INET6
661 	case AF_INET6:
662 		l = sizeof(*pack6);
663 		emap = encap_map[1];
664 		break;
665 #endif
666 	default:
667 		return NULL;
668 	}
669 
670 	ep = kmem_zalloc(sizeof(*ep), KM_SLEEP);
671 	ep->addrpack = kmem_zalloc(l, KM_SLEEP);
672 	ep->addrpack->sa_len = l & 0xff;
673 	ep->af = af;
674 	ep->proto = proto;
675 	ep->flag = IP_ENCAP_ADDR_ENABLE;
676 	switch (af) {
677 	case AF_INET:
678 		pack4 = (struct ip_pack4 *)ep->addrpack;
679 		ep->src = (struct sockaddr *)&pack4->mine;
680 		ep->dst = (struct sockaddr *)&pack4->yours;
681 		break;
682 #ifdef INET6
683 	case AF_INET6:
684 		pack6 = (struct ip_pack6 *)ep->addrpack;
685 		ep->src = (struct sockaddr *)&pack6->mine;
686 		ep->dst = (struct sockaddr *)&pack6->yours;
687 		break;
688 #endif
689 	}
690 	memcpy(ep->src, src, src->sa_len);
691 	memcpy(ep->dst, dst, dst->sa_len);
692 	ep->esw = esw;
693 	ep->arg = arg;
694 	ep->func = func;
695 	psref_target_init(&ep->psref, encaptab.elem_class);
696 
697 	encap_key_init(&ep->key, src, dst);
698 	while ((retep = thmap_put(emap, &ep->key, sizeof(ep->key), ep)) != ep)
699 		encap_key_inc(&ep->key);
700 	return ep;
701 }
702 
703 
704 /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */
705 
706 #ifdef INET6
707 void *
encap6_ctlinput(int cmd,const struct sockaddr * sa,void * d0)708 encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0)
709 {
710 	void *d = d0;
711 	struct ip6_hdr *ip6;
712 	struct mbuf *m;
713 	int off;
714 	struct ip6ctlparam *ip6cp = NULL;
715 	int nxt;
716 	int s;
717 	struct encaptab *ep;
718 	const struct encapsw *esw;
719 
720 	if (sa->sa_family != AF_INET6 ||
721 	    sa->sa_len != sizeof(struct sockaddr_in6))
722 		return NULL;
723 
724 	if ((unsigned)cmd >= PRC_NCMDS)
725 		return NULL;
726 	if (cmd == PRC_HOSTDEAD)
727 		d = NULL;
728 	else if (cmd == PRC_MSGSIZE)
729 		; /* special code is present, see below */
730 	else if (inet6ctlerrmap[cmd] == 0)
731 		return NULL;
732 
733 	/* if the parameter is from icmp6, decode it. */
734 	if (d != NULL) {
735 		ip6cp = (struct ip6ctlparam *)d;
736 		m = ip6cp->ip6c_m;
737 		ip6 = ip6cp->ip6c_ip6;
738 		off = ip6cp->ip6c_off;
739 		nxt = ip6cp->ip6c_nxt;
740 
741 		if (ip6 && cmd == PRC_MSGSIZE) {
742 			int valid = 0;
743 			struct encaptab *match;
744 			struct psref elem_psref;
745 
746 			/*
747 		 	* Check to see if we have a valid encap configuration.
748 		 	*/
749 			match = encap6_lookup(m, off, nxt, OUTBOUND,
750 			    &elem_psref);
751 			if (match) {
752 				valid++;
753 				psref_release(&elem_psref, &match->psref,
754 				    encaptab.elem_class);
755 			}
756 
757 			/*
758 		 	* Depending on the value of "valid" and routing table
759 		 	* size (mtudisc_{hi,lo}wat), we will:
760 		 	* - recalcurate the new MTU and create the
761 		 	*   corresponding routing entry, or
762 		 	* - ignore the MTU change notification.
763 		 	*/
764 			icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
765 		}
766 	} else {
767 		m = NULL;
768 		ip6 = NULL;
769 		nxt = -1;
770 	}
771 
772 	/* inform all listeners */
773 
774 	s = pserialize_read_enter();
775 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
776 		struct psref elem_psref;
777 
778 		if (ep->af != AF_INET6)
779 			continue;
780 		if (ep->proto >= 0 && ep->proto != nxt)
781 			continue;
782 
783 		/* should optimize by looking at address pairs */
784 
785 		/* XXX need to pass ep->arg or ep itself to listeners */
786 		psref_acquire(&elem_psref, &ep->psref,
787 		    encaptab.elem_class);
788 		esw = ep->esw;
789 		if (esw && esw->encapsw6.pr_ctlinput) {
790 			pserialize_read_exit(s);
791 			/* pr_ctlinput is sleepable. e.g. rtcache_free */
792 			(*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg);
793 			s = pserialize_read_enter();
794 		}
795 		psref_release(&elem_psref, &ep->psref,
796 		    encaptab.elem_class);
797 	}
798 	pserialize_read_exit(s);
799 
800 	rip6_ctlinput(cmd, sa, d0);
801 	return NULL;
802 }
803 #endif
804 
805 static int
encap_detach_addr(const struct encaptab * ep)806 encap_detach_addr(const struct encaptab *ep)
807 {
808 	thmap_t *emap;
809 	struct encaptab *retep;
810 	struct encaptab *target;
811 	void *thgc;
812 	struct encap_key key;
813 
814 	KASSERT(encap_lock_held());
815 	KASSERT(ep->flag & IP_ENCAP_ADDR_ENABLE);
816 
817 	switch (ep->af) {
818 	case AF_INET:
819 		emap = encap_map[0];
820 		break;
821 #ifdef INET6
822 	case AF_INET6:
823 		emap = encap_map[1];
824 		break;
825 #endif
826 	default:
827 		return EINVAL;
828 	}
829 
830 	retep = thmap_del(emap, &ep->key, sizeof(ep->key));
831 	if (retep != ep) {
832 		return ENOENT;
833 	}
834 	target = retep;
835 
836 	/*
837 	 * To keep continuity, decrement seq after detached encaptab.
838 	 */
839 	encap_key_copy(&key, &ep->key);
840 	encap_key_inc(&key);
841 	while ((retep = thmap_del(emap, &key, sizeof(key))) != NULL) {
842 		void *pp;
843 
844 		encap_key_dec(&retep->key);
845 		pp = thmap_put(emap, &retep->key, sizeof(retep->key), retep);
846 		KASSERT(retep == pp);
847 
848 		encap_key_inc(&key);
849 	}
850 
851 	thgc = thmap_stage_gc(emap);
852 	pserialize_perform(encaptab.psz);
853 	thmap_gc(emap, thgc);
854 	psref_target_destroy(&target->psref, encaptab.elem_class);
855 	kmem_free(target->addrpack, target->addrpack->sa_len);
856 	kmem_free(target, sizeof(*target));
857 
858 	return 0;
859 }
860 
861 int
encap_detach(const struct encaptab * cookie)862 encap_detach(const struct encaptab *cookie)
863 {
864 	const struct encaptab *ep = cookie;
865 	struct encaptab *p;
866 	int error;
867 
868 	KASSERT(encap_lock_held());
869 
870 	if (ep->flag & IP_ENCAP_ADDR_ENABLE)
871 		return encap_detach_addr(ep);
872 
873 	PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) {
874 		if (p == ep) {
875 			error = encap_remove(p);
876 			if (error)
877 				return error;
878 			else
879 				break;
880 		}
881 	}
882 	if (p == NULL)
883 		return ENOENT;
884 
885 	pserialize_perform(encaptab.psz);
886 	psref_target_destroy(&p->psref,
887 	    encaptab.elem_class);
888 	kmem_free(p, sizeof(*p));
889 
890 	return 0;
891 }
892 
893 int
encap_lock_enter(void)894 encap_lock_enter(void)
895 {
896 	int error;
897 
898 	mutex_enter(&encap_whole.lock);
899 	while (encap_whole.busy != NULL) {
900 		error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock);
901 		if (error) {
902 			mutex_exit(&encap_whole.lock);
903 			return error;
904 		}
905 	}
906 	KASSERT(encap_whole.busy == NULL);
907 	encap_whole.busy = curlwp;
908 	mutex_exit(&encap_whole.lock);
909 
910 	return 0;
911 }
912 
913 void
encap_lock_exit(void)914 encap_lock_exit(void)
915 {
916 
917 	mutex_enter(&encap_whole.lock);
918 	KASSERT(encap_whole.busy == curlwp);
919 	encap_whole.busy = NULL;
920 	cv_broadcast(&encap_whole.cv);
921 	mutex_exit(&encap_whole.lock);
922 }
923 
924 bool
encap_lock_held(void)925 encap_lock_held(void)
926 {
927 
928 	return (encap_whole.busy == curlwp);
929 }
930