xref: /illumos-gate/usr/src/uts/common/inet/iptun/iptun.c (revision 3b860eee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * iptun - IP Tunneling Driver
28  *
29  * This module is a GLDv3 driver that implements virtual datalinks over IP
30  * (a.k.a, IP tunneling).  The datalinks are managed through a dld ioctl
31  * interface (see iptun_ctl.c), and registered with GLDv3 using
32  * mac_register().  It implements the logic for various forms of IP (IPv4 or
33  * IPv6) encapsulation within IP (IPv4 or IPv6) by interacting with the ip
34  * module below it.  Each virtual IP tunnel datalink has a conn_t associated
35  * with it representing the "outer" IP connection.
36  *
37  * The module implements the following locking semantics:
38  *
39  * Lookups and deletions in iptun_hash are synchronized using iptun_hash_lock.
40  * See comments above iptun_hash_lock for details.
41  *
42  * No locks are ever held while calling up to GLDv3.  The general architecture
43  * of GLDv3 requires this, as the mac perimeter (essentially a lock) for a
44  * given link will be held while making downcalls (iptun_m_*() callbacks).
45  * Because we need to hold locks while handling downcalls, holding these locks
46  * while issuing upcalls results in deadlock scenarios.  See the block comment
47  * above iptun_task_cb() for details on how we safely issue upcalls without
48  * holding any locks.
49  *
 * The contents of each iptun_t are protected by its iptun_lock mutex, which is
 * entered in iptun_enter() (called by iptun_enter_by_linkid()), and exited in
 * iptun_exit().
53  *
54  * See comments in iptun_delete() and iptun_free() for details on how the
55  * iptun_t is deleted safely.
56  */
57 
58 #include <sys/types.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/modhash.h>
62 #include <sys/list.h>
63 #include <sys/strsun.h>
64 #include <sys/file.h>
65 #include <sys/systm.h>
66 #include <sys/tihdr.h>
67 #include <sys/param.h>
68 #include <sys/mac_provider.h>
69 #include <sys/mac_ipv4.h>
70 #include <sys/mac_ipv6.h>
71 #include <sys/mac_6to4.h>
72 #include <sys/tsol/tnet.h>
73 #include <sys/sunldi.h>
74 #include <netinet/in.h>
75 #include <netinet/ip6.h>
76 #include <inet/ip.h>
77 #include <inet/ip_ire.h>
78 #include <inet/ipsec_impl.h>
79 #include <inet/iptun.h>
80 #include "iptun_impl.h"
81 
/*
 * Do the tunnel type and address family match?  IPv4 and 6to4 tunnels take
 * AF_INET addresses; IPv6 tunnels take AF_INET6 addresses.
 *
 * Both arguments are fully parenthesized so that callers may pass arbitrary
 * expressions.  Note that each argument is evaluated more than once, so
 * arguments must not have side effects.
 */
#define	IPTUN_ADDR_MATCH(iptun_type, family)				\
	((((iptun_type) == IPTUN_TYPE_IPV4) && ((family) == AF_INET)) ||	\
	(((iptun_type) == IPTUN_TYPE_IPV6) && ((family) == AF_INET6)) ||	\
	(((iptun_type) == IPTUN_TYPE_6TO4) && ((family) == AF_INET)))
87 
/*
 * Convert a lookup key (the datalink ID in iptun_enter_by_linkid()) to the
 * key type expected by the mod_hash_*() functions.
 */
#define	IPTUN_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* Path of the iptunq device node (its consumer is not in this part of the file). */
#define	IPTUNQ_DEV	"/dev/iptunq"

/*
 * Per-IP-version tunnel link MTU bounds; these populate iptun_type_table.
 * The maximum values leave room for the outer IP header (and, for IPv6, the
 * encapsulation limit option header as well).
 */
#define	IPTUN_MIN_IPV4_MTU	576		/* ip.h still uses 68 (!) */
#define	IPTUN_MIN_IPV6_MTU	IPV6_MIN_MTU
#define	IPTUN_MAX_IPV4_MTU	(IP_MAXPACKET - sizeof (ipha_t))
#define	IPTUN_MAX_IPV6_MTU	(IP_MAXPACKET - sizeof (ip6_t) -	\
				    sizeof (iptun_encaplim_t))

/* Valid range for the hoplimit link property (see iptun_m_setprop()). */
#define	IPTUN_MIN_HOPLIMIT	1
#define	IPTUN_MAX_HOPLIMIT	UINT8_MAX

/* Valid range for the encaplimit link property (see iptun_m_setprop()). */
#define	IPTUN_MIN_ENCAPLIMIT	0
#define	IPTUN_MAX_ENCAPLIMIT	UINT8_MAX

/*
 * The ipsec_req_t preference bits examined by iptun_set_sec_simple() to
 * decide whether a request is a "clear-all" request.
 */
#define	IPTUN_IPSEC_REQ_MASK	(IPSEC_PREF_REQUIRED | IPSEC_PREF_NEVER)
105 
/*
 * Template for the encapsulation limit option header.  iptun_headergen()
 * copies this into the IPv6 header template and then overwrites the
 * encapsulation limit value with the tunnel's configured limit.
 * NOTE(review): the field-by-field layout is defined by iptun_encaplim_t,
 * which is declared outside this file -- confirm against that definition.
 */
static iptun_encaplim_t	iptun_encaplim_init = {
	{ IPPROTO_NONE, 0 },
	IP6OPT_TUNNEL_LIMIT,
	1,
	IPTUN_DEFAULT_ENCAPLIMIT,	/* filled in with actual value later */
	IP6OPT_PADN,
	1,
	0
};
115 
/*
 * Table containing per-iptun-type information.  Each row gives, in order:
 * the tunnel type, the MAC plugin identifier, the outer IP version, the
 * output function, the minimum and maximum link MTU, and whether the tunnel
 * type has a remote address.  The table is terminated by an
 * IPTUN_TYPE_UNKNOWN row, which is what iptun_gettypeinfo() returns when no
 * other row matches.
 */
static iptun_typeinfo_t	iptun_type_table[] = {
	{ IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION, ip_output,
	    IPTUN_MIN_IPV4_MTU,	IPTUN_MAX_IPV4_MTU,	B_TRUE },
	{ IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION, ip_output_v6,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV6_MTU,	B_TRUE },
	{ IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION, ip_output,
	    IPTUN_MIN_IPV4_MTU,	IPTUN_MAX_IPV4_MTU,	B_FALSE },
	{ IPTUN_TYPE_UNKNOWN, NULL, 0, NULL, 0, 0, B_FALSE }
};
126 
127 /*
128  * iptun_hash is an iptun_t lookup table by link ID protected by
129  * iptun_hash_lock.  While the hash table's integrity is maintained via
130  * internal locking in the mod_hash_*() functions, we need additional locking
131  * so that an iptun_t cannot be deleted after a hash lookup has returned an
132  * iptun_t and before iptun_lock has been entered.  As such, we use
133  * iptun_hash_lock when doing lookups and removals from iptun_hash.
134  */
mod_hash_t	*iptun_hash;
static kmutex_t	iptun_hash_lock;

static uint_t	iptun_tunnelcount;	/* total for all stacks */
/* NOTE(review): presumably the cache iptun_t's are allocated from -- confirm. */
kmem_cache_t	*iptun_cache;
/* Task queue on which iptun_task_dispatch() schedules iptun_task_cb(). */
ddi_taskq_t 	*iptun_taskq;
141 
/*
 * The kinds of work that can be deferred to iptun_taskq via
 * iptun_task_dispatch() and later carried out by iptun_task_cb().
 */
typedef enum {
	IPTUN_TASK_PMTU_UPDATE,	/* obtain new destination path-MTU */
	IPTUN_TASK_MTU_UPDATE,	/* tell mac about new tunnel link MTU */
	IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */
	IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */
	IPTUN_TASK_LINK_UPDATE,	/* tell mac about new link state */
	IPTUN_TASK_PDATA_UPDATE	/* tell mac about updated plugin data */
} iptun_task_t;
150 
/*
 * Argument handed to iptun_task_cb().  It is allocated by
 * iptun_task_dispatch() and freed by iptun_task_cb().  The tunnel is
 * identified by link ID rather than by pointer, so the callback looks the
 * tunnel up again and simply returns if it no longer exists.
 */
typedef struct iptun_task_data_s {
	iptun_task_t	itd_task;	/* which task to perform */
	datalink_id_t	itd_linkid;	/* link ID of the target tunnel */
} iptun_task_data_t;
155 
/* Forward declarations for functions used before they are defined. */
static void iptun_task_dispatch(iptun_t *, iptun_task_t);
static int iptun_enter(iptun_t *);
static void iptun_exit(iptun_t *);
static void iptun_headergen(iptun_t *, boolean_t);
static void iptun_drop_pkt(mblk_t *, uint64_t *);
static void iptun_input(void *, mblk_t *, void *);
static void iptun_output(iptun_t *, mblk_t *);
static uint32_t iptun_get_maxmtu(iptun_t *, uint32_t);
static uint32_t iptun_update_mtu(iptun_t *, uint32_t);
static uint32_t iptun_get_dst_pmtu(iptun_t *);
static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);

static mac_callbacks_t iptun_m_callbacks;
169 
170 static int
171 iptun_m_getstat(void *arg, uint_t stat, uint64_t *val)
172 {
173 	iptun_t	*iptun = arg;
174 	int	err = 0;
175 
176 	switch (stat) {
177 	case MAC_STAT_IERRORS:
178 		*val = iptun->iptun_ierrors;
179 		break;
180 	case MAC_STAT_OERRORS:
181 		*val = iptun->iptun_oerrors;
182 		break;
183 	case MAC_STAT_RBYTES:
184 		*val = iptun->iptun_rbytes;
185 		break;
186 	case MAC_STAT_IPACKETS:
187 		*val = iptun->iptun_ipackets;
188 		break;
189 	case MAC_STAT_OBYTES:
190 		*val = iptun->iptun_obytes;
191 		break;
192 	case MAC_STAT_OPACKETS:
193 		*val = iptun->iptun_opackets;
194 		break;
195 	case MAC_STAT_NORCVBUF:
196 		*val = iptun->iptun_norcvbuf;
197 		break;
198 	case MAC_STAT_NOXMTBUF:
199 		*val = iptun->iptun_noxmtbuf;
200 		break;
201 	default:
202 		err = ENOTSUP;
203 	}
204 
205 	return (err);
206 }
207 
208 static int
209 iptun_m_start(void *arg)
210 {
211 	iptun_t	*iptun = arg;
212 	int	err;
213 
214 	if ((err = iptun_enter(iptun)) == 0) {
215 		iptun->iptun_flags |= IPTUN_MAC_STARTED;
216 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
217 		iptun_exit(iptun);
218 	}
219 	return (err);
220 }
221 
222 static void
223 iptun_m_stop(void *arg)
224 {
225 	iptun_t *iptun = arg;
226 
227 	if (iptun_enter(iptun) == 0) {
228 		iptun->iptun_flags &= ~IPTUN_MAC_STARTED;
229 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
230 		iptun_exit(iptun);
231 	}
232 }
233 
234 /*
235  * iptun_m_setpromisc() does nothing and always succeeds.  This is because a
236  * tunnel data-link only ever receives packets that are destined exclusively
237  * for the local address of the tunnel.
238  */
239 /* ARGSUSED */
240 static int
241 iptun_m_setpromisc(void *arg, boolean_t on)
242 {
243 	return (0);
244 }
245 
246 /* ARGSUSED */
247 static int
248 iptun_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
249 {
250 	return (ENOTSUP);
251 }
252 
253 /*
254  * iptun_m_unicst() sets the local address.
255  */
256 /* ARGSUSED */
257 static int
258 iptun_m_unicst(void *arg, const uint8_t *addrp)
259 {
260 	iptun_t			*iptun = arg;
261 	int			err;
262 	struct sockaddr_storage	ss;
263 	struct sockaddr_in	*sin;
264 	struct sockaddr_in6	*sin6;
265 
266 	if ((err = iptun_enter(iptun)) == 0) {
267 		switch (iptun->iptun_typeinfo->iti_ipvers) {
268 		case IPV4_VERSION:
269 			sin = (struct sockaddr_in *)&ss;
270 			sin->sin_family = AF_INET;
271 			bcopy(addrp, &sin->sin_addr, sizeof (in_addr_t));
272 			break;
273 		case IPV6_VERSION:
274 			sin6 = (struct sockaddr_in6 *)&ss;
275 			sin6->sin6_family = AF_INET6;
276 			bcopy(addrp, &sin6->sin6_addr, sizeof (in6_addr_t));
277 			break;
278 		default:
279 			ASSERT(0);
280 		}
281 		err = iptun_setladdr(iptun, &ss);
282 		iptun_exit(iptun);
283 	}
284 	return (err);
285 }
286 
287 static mblk_t *
288 iptun_m_tx(void *arg, mblk_t *mpchain)
289 {
290 	mblk_t	*mp, *nmp;
291 	iptun_t	*iptun = arg;
292 
293 	if (!IS_IPTUN_RUNNING(iptun)) {
294 		iptun_drop_pkt(mpchain, &iptun->iptun_noxmtbuf);
295 		return (NULL);
296 	}
297 
298 	/*
299 	 * Request the destination's path MTU information regularly in case
300 	 * path MTU has increased.
301 	 */
302 	if (IPTUN_PMTU_TOO_OLD(iptun))
303 		iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE);
304 
305 	for (mp = mpchain; mp != NULL; mp = nmp) {
306 		nmp = mp->b_next;
307 		mp->b_next = NULL;
308 		iptun_output(iptun, mp);
309 	}
310 
311 	return (NULL);
312 }
313 
314 /* ARGSUSED */
315 static int
316 iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
317     uint_t pr_valsize, const void *pr_val)
318 {
319 	iptun_t		*iptun = barg;
320 	uint32_t	value = *(uint32_t *)pr_val;
321 	int		err;
322 
323 	/*
324 	 * We need to enter this iptun_t since we'll be modifying the outer
325 	 * header.
326 	 */
327 	if ((err = iptun_enter(iptun)) != 0)
328 		return (err);
329 
330 	switch (pr_num) {
331 	case MAC_PROP_IPTUN_HOPLIMIT:
332 		if (value < IPTUN_MIN_HOPLIMIT || value > IPTUN_MAX_HOPLIMIT) {
333 			err = EINVAL;
334 			break;
335 		}
336 		if (value != iptun->iptun_hoplimit) {
337 			iptun->iptun_hoplimit = (uint8_t)value;
338 			iptun_headergen(iptun, B_TRUE);
339 		}
340 		break;
341 	case MAC_PROP_IPTUN_ENCAPLIMIT:
342 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6 ||
343 		    value > IPTUN_MAX_ENCAPLIMIT) {
344 			err = EINVAL;
345 			break;
346 		}
347 		if (value != iptun->iptun_encaplimit) {
348 			iptun->iptun_encaplimit = (uint8_t)value;
349 			iptun_headergen(iptun, B_TRUE);
350 		}
351 		break;
352 	case MAC_PROP_MTU: {
353 		uint32_t maxmtu = iptun_get_maxmtu(iptun, 0);
354 
355 		if (value < iptun->iptun_typeinfo->iti_minmtu ||
356 		    value > maxmtu) {
357 			err = EINVAL;
358 			break;
359 		}
360 		iptun->iptun_flags |= IPTUN_FIXED_MTU;
361 		if (value != iptun->iptun_mtu) {
362 			iptun->iptun_mtu = value;
363 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
364 		}
365 		break;
366 	}
367 	default:
368 		err = EINVAL;
369 	}
370 	iptun_exit(iptun);
371 	return (err);
372 }
373 
374 /* ARGSUSED */
375 static int
376 iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
377     uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
378 {
379 	iptun_t			*iptun = barg;
380 	mac_propval_range_t	range;
381 	boolean_t		is_default = (pr_flags & MAC_PROP_DEFAULT);
382 	boolean_t		is_possible = (pr_flags & MAC_PROP_POSSIBLE);
383 	int			err;
384 
385 	if ((err = iptun_enter(iptun)) != 0)
386 		return (err);
387 
388 	if ((pr_flags & ~(MAC_PROP_DEFAULT | MAC_PROP_POSSIBLE)) != 0) {
389 		err = ENOTSUP;
390 		goto done;
391 	}
392 	if (is_default && is_possible) {
393 		err = EINVAL;
394 		goto done;
395 	}
396 
397 	*perm = MAC_PROP_PERM_RW;
398 
399 	if (is_possible) {
400 		if (pr_valsize < sizeof (mac_propval_range_t)) {
401 			err = EINVAL;
402 			goto done;
403 		}
404 		range.mpr_count = 1;
405 		range.mpr_type = MAC_PROPVAL_UINT32;
406 	} else if (pr_valsize < sizeof (uint32_t)) {
407 		err = EINVAL;
408 		goto done;
409 	}
410 
411 	switch (pr_num) {
412 	case MAC_PROP_IPTUN_HOPLIMIT:
413 		if (is_possible) {
414 			range.range_uint32[0].mpur_min = IPTUN_MIN_HOPLIMIT;
415 			range.range_uint32[0].mpur_max = IPTUN_MAX_HOPLIMIT;
416 		} else if (is_default) {
417 			*(uint32_t *)pr_val = IPTUN_DEFAULT_HOPLIMIT;
418 		} else {
419 			*(uint32_t *)pr_val = iptun->iptun_hoplimit;
420 		}
421 		break;
422 	case MAC_PROP_IPTUN_ENCAPLIMIT:
423 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6) {
424 			err = ENOTSUP;
425 			goto done;
426 		}
427 		if (is_possible) {
428 			range.range_uint32[0].mpur_min = IPTUN_MIN_ENCAPLIMIT;
429 			range.range_uint32[0].mpur_max = IPTUN_MAX_ENCAPLIMIT;
430 		} else if (is_default) {
431 			*(uint32_t *)pr_val = IPTUN_DEFAULT_ENCAPLIMIT;
432 		} else {
433 			*(uint32_t *)pr_val = iptun->iptun_encaplimit;
434 		}
435 		break;
436 	case MAC_PROP_MTU: {
437 		uint32_t maxmtu = iptun_get_maxmtu(iptun, 0);
438 
439 		if (is_possible) {
440 			range.range_uint32[0].mpur_min =
441 			    iptun->iptun_typeinfo->iti_minmtu;
442 			range.range_uint32[0].mpur_max = maxmtu;
443 		} else {
444 			/*
445 			 * The MAC module knows the current value and should
446 			 * never call us for it.  There is also no default
447 			 * MTU, as by default, it is a dynamic property.
448 			 */
449 			err = ENOTSUP;
450 			goto done;
451 		}
452 		break;
453 	}
454 	default:
455 		err = EINVAL;
456 		goto done;
457 	}
458 	if (is_possible)
459 		bcopy(&range, pr_val, sizeof (range));
460 done:
461 	iptun_exit(iptun);
462 	return (err);
463 }
464 
465 uint_t
466 iptun_count(void)
467 {
468 	return (iptun_tunnelcount);
469 }
470 
471 /*
472  * Enter an iptun_t exclusively.  This is essentially just a mutex, but we
473  * don't allow iptun_enter() to succeed on a tunnel if it's in the process of
474  * being deleted.
475  */
476 static int
477 iptun_enter(iptun_t *iptun)
478 {
479 	mutex_enter(&iptun->iptun_lock);
480 	while (iptun->iptun_flags & IPTUN_DELETE_PENDING)
481 		cv_wait(&iptun->iptun_enter_cv, &iptun->iptun_lock);
482 	if (iptun->iptun_flags & IPTUN_CONDEMNED) {
483 		mutex_exit(&iptun->iptun_lock);
484 		return (ENOENT);
485 	}
486 	return (0);
487 }
488 
489 /*
490  * Exit the tunnel entered in iptun_enter().
491  */
492 static void
493 iptun_exit(iptun_t *iptun)
494 {
495 	mutex_exit(&iptun->iptun_lock);
496 }
497 
498 /*
499  * Enter the IP tunnel instance by datalink ID.
500  */
501 static int
502 iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
503 {
504 	int err;
505 
506 	mutex_enter(&iptun_hash_lock);
507 	if (mod_hash_find(iptun_hash, IPTUN_HASH_KEY(linkid),
508 	    (mod_hash_val_t *)iptun) == 0)
509 		err = iptun_enter(*iptun);
510 	else
511 		err = ENOENT;
512 	if (err != 0)
513 		*iptun = NULL;
514 	mutex_exit(&iptun_hash_lock);
515 	return (err);
516 }
517 
518 /*
519  * Handle tasks that were deferred through the iptun_taskq.  These fall into
520  * two categories:
521  *
522  * 1. Tasks that were defered because we didn't want to spend time doing them
523  * while in the data path.  Only IPTUN_TASK_PMTU_UPDATE falls into this
524  * category.
525  *
526  * 2. Tasks that were defered because they require calling up to the mac
527  * module, and we can't call up to the mac module while holding locks.
528  *
529  * Handling 1 is easy; we just lookup the iptun_t, perform the task, exit the
530  * tunnel, and we're done.
531  *
532  * Handling 2 is tricky to get right without introducing race conditions and
533  * deadlocks with the mac module, as we cannot issue an upcall while in the
534  * iptun_t.  The reason is that upcalls may try and enter the mac perimeter,
535  * while iptun callbacks (such as iptun_m_setprop()) called from the mac
536  * module will already have the perimeter held, and will then try and enter
537  * the iptun_t.  You can see the lock ordering problem with this; this will
538  * deadlock.
539  *
540  * The safe way to do this is to enter the iptun_t in question and copy the
541  * information we need out of it so that we can exit it and know that the
542  * information being passed up to the upcalls won't be subject to modification
543  * by other threads.  The problem now is that we need to exit it prior to
544  * issuing the upcall, but once we do this, a thread could come along and
545  * delete the iptun_t and thus the mac handle required to issue the upcall.
546  * To prevent this, we set the IPTUN_UPCALL_PENDING flag prior to exiting the
547  * iptun_t.  This flag is the condition associated with iptun_upcall_cv, which
548  * iptun_delete() will cv_wait() on.  When the upcall completes, we clear
549  * IPTUN_UPCALL_PENDING and cv_signal() any potentially waiting
550  * iptun_delete().  We can thus still safely use iptun->iptun_mh after having
551  * exited the iptun_t.
552  */
553 static void
554 iptun_task_cb(void *arg)
555 {
556 	iptun_task_data_t	*itd = arg;
557 	iptun_task_t		task = itd->itd_task;
558 	datalink_id_t		linkid = itd->itd_linkid;
559 	iptun_t			*iptun;
560 	uint32_t		mtu;
561 	iptun_addr_t		addr;
562 	link_state_t		linkstate;
563 	size_t			header_size;
564 	iptun_header_t		header;
565 
566 	kmem_free(itd, sizeof (*itd));
567 
568 	/*
569 	 * Note that if the lookup fails, it's because the tunnel was deleted
570 	 * between the time the task was dispatched and now.  That isn't an
571 	 * error.
572 	 */
573 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
574 		return;
575 
576 	if (task == IPTUN_TASK_PMTU_UPDATE) {
577 		(void) iptun_update_mtu(iptun, 0);
578 		iptun_exit(iptun);
579 		return;
580 	}
581 
582 	iptun->iptun_flags |= IPTUN_UPCALL_PENDING;
583 
584 	switch (task) {
585 	case IPTUN_TASK_MTU_UPDATE:
586 		mtu = iptun->iptun_mtu;
587 		break;
588 	case IPTUN_TASK_LADDR_UPDATE:
589 		addr = iptun->iptun_laddr;
590 		break;
591 	case IPTUN_TASK_RADDR_UPDATE:
592 		addr = iptun->iptun_raddr;
593 		break;
594 	case IPTUN_TASK_LINK_UPDATE:
595 		linkstate = IS_IPTUN_RUNNING(iptun) ?
596 		    LINK_STATE_UP : LINK_STATE_DOWN;
597 		break;
598 	case IPTUN_TASK_PDATA_UPDATE:
599 		header_size = iptun->iptun_header_size;
600 		header = iptun->iptun_header;
601 		break;
602 	default:
603 		ASSERT(0);
604 	}
605 
606 	iptun_exit(iptun);
607 
608 	switch (task) {
609 	case IPTUN_TASK_MTU_UPDATE:
610 		(void) mac_maxsdu_update(iptun->iptun_mh, mtu);
611 		break;
612 	case IPTUN_TASK_LADDR_UPDATE:
613 		mac_unicst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
614 		break;
615 	case IPTUN_TASK_RADDR_UPDATE:
616 		mac_dst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
617 		break;
618 	case IPTUN_TASK_LINK_UPDATE:
619 		mac_link_update(iptun->iptun_mh, linkstate);
620 		break;
621 	case IPTUN_TASK_PDATA_UPDATE:
622 		if (mac_pdata_update(iptun->iptun_mh,
623 		    header_size == 0 ? NULL : &header, header_size) != 0)
624 			atomic_inc_64(&iptun->iptun_taskq_fail);
625 		break;
626 	}
627 
628 	mutex_enter(&iptun->iptun_lock);
629 	iptun->iptun_flags &= ~IPTUN_UPCALL_PENDING;
630 	cv_signal(&iptun->iptun_upcall_cv);
631 	mutex_exit(&iptun->iptun_lock);
632 }
633 
634 static void
635 iptun_task_dispatch(iptun_t *iptun, iptun_task_t iptun_task)
636 {
637 	iptun_task_data_t *itd;
638 
639 	itd = kmem_alloc(sizeof (*itd), KM_NOSLEEP);
640 	if (itd == NULL) {
641 		atomic_inc_64(&iptun->iptun_taskq_fail);
642 		return;
643 	}
644 	itd->itd_task = iptun_task;
645 	itd->itd_linkid = iptun->iptun_linkid;
646 	if (ddi_taskq_dispatch(iptun_taskq, iptun_task_cb, itd, DDI_NOSLEEP)) {
647 		atomic_inc_64(&iptun->iptun_taskq_fail);
648 		kmem_free(itd, sizeof (*itd));
649 	}
650 }
651 
652 /*
653  * Convert an iptun_addr_t to sockaddr_storage.
654  */
655 static void
656 iptun_getaddr(iptun_addr_t *iptun_addr, struct sockaddr_storage *ss)
657 {
658 	struct sockaddr_in	*sin;
659 	struct sockaddr_in6	*sin6;
660 
661 	bzero(ss, sizeof (*ss));
662 	switch (iptun_addr->ia_family) {
663 	case AF_INET:
664 		sin = (struct sockaddr_in *)ss;
665 		sin->sin_addr.s_addr = iptun_addr->ia_addr.iau_addr4;
666 		break;
667 	case AF_INET6:
668 		sin6 = (struct sockaddr_in6 *)ss;
669 		sin6->sin6_addr = iptun_addr->ia_addr.iau_addr6;
670 		break;
671 	default:
672 		ASSERT(0);
673 	}
674 	ss->ss_family = iptun_addr->ia_family;
675 }
676 
677 /*
678  * General purpose function to set an IP tunnel source or destination address.
679  */
680 static int
681 iptun_setaddr(iptun_type_t iptun_type, iptun_addr_t *iptun_addr,
682     const struct sockaddr_storage *ss)
683 {
684 	if (!IPTUN_ADDR_MATCH(iptun_type, ss->ss_family))
685 		return (EINVAL);
686 
687 	switch (ss->ss_family) {
688 	case AF_INET: {
689 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
690 
691 		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
692 		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
693 		    CLASSD(sin->sin_addr.s_addr)) {
694 			return (EADDRNOTAVAIL);
695 		}
696 		iptun_addr->ia_addr.iau_addr4 = sin->sin_addr.s_addr;
697 		break;
698 	}
699 	case AF_INET6: {
700 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
701 
702 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
703 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
704 		    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
705 			return (EADDRNOTAVAIL);
706 		}
707 		iptun_addr->ia_addr.iau_addr6 = sin6->sin6_addr;
708 		break;
709 	}
710 	default:
711 		return (EAFNOSUPPORT);
712 	}
713 	iptun_addr->ia_family = ss->ss_family;
714 	return (0);
715 }
716 
717 static int
718 iptun_setladdr(iptun_t *iptun, const struct sockaddr_storage *laddr)
719 {
720 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
721 	    &iptun->iptun_laddr, laddr));
722 }
723 
724 static int
725 iptun_setraddr(iptun_t *iptun, const struct sockaddr_storage *raddr)
726 {
727 	if (!(iptun->iptun_typeinfo->iti_hasraddr))
728 		return (EINVAL);
729 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
730 	    &iptun->iptun_raddr, raddr));
731 }
732 
733 static boolean_t
734 iptun_canbind(iptun_t *iptun)
735 {
736 	/*
737 	 * A tunnel may bind when its source address has been set, and if its
738 	 * tunnel type requires one, also its destination address.
739 	 */
740 	return ((iptun->iptun_flags & IPTUN_LADDR) &&
741 	    ((iptun->iptun_flags & IPTUN_RADDR) ||
742 	    !(iptun->iptun_typeinfo->iti_hasraddr)));
743 }
744 
745 static int
746 iptun_bind(iptun_t *iptun)
747 {
748 	conn_t	*connp = iptun->iptun_connp;
749 	int	err;
750 
751 	ASSERT(iptun_canbind(iptun));
752 
753 	switch (iptun->iptun_typeinfo->iti_type) {
754 	case IPTUN_TYPE_IPV4:
755 		/*
756 		 * When we set a tunnel's destination address, we do not care
757 		 * if the destination is reachable.  Transient routing issues
758 		 * should not inhibit the creation of a tunnel interface, for
759 		 * example.  For that reason, we pass in B_FALSE for the
760 		 * verify_dst argument of ip_proto_bind_connected_v4() (and
761 		 * similarly for IPv6 tunnels below).
762 		 */
763 		err = ip_proto_bind_connected_v4(connp, NULL, IPPROTO_ENCAP,
764 		    &iptun->iptun_laddr4, 0, iptun->iptun_raddr4, 0, B_TRUE,
765 		    B_FALSE, iptun->iptun_cred);
766 		break;
767 	case IPTUN_TYPE_IPV6:
768 		err = ip_proto_bind_connected_v6(connp, NULL, IPPROTO_IPV6,
769 		    &iptun->iptun_laddr6, 0, &iptun->iptun_raddr6, NULL, 0,
770 		    B_TRUE, B_FALSE, iptun->iptun_cred);
771 		break;
772 	case IPTUN_TYPE_6TO4:
773 		err = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_IPV6,
774 		    iptun->iptun_laddr4, 0, B_TRUE);
775 		break;
776 	}
777 
778 	if (err == 0) {
779 		iptun->iptun_flags |= IPTUN_BOUND;
780 
781 		/*
782 		 * Now that we're bound with ip below us, this is a good time
783 		 * to initialize the destination path MTU and to re-calculate
784 		 * the tunnel's link MTU.
785 		 */
786 		(void) iptun_update_mtu(iptun, 0);
787 
788 		if (IS_IPTUN_RUNNING(iptun))
789 			iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
790 	}
791 	return (err);
792 }
793 
794 static void
795 iptun_unbind(iptun_t *iptun)
796 {
797 	ASSERT(iptun->iptun_flags & IPTUN_BOUND);
798 	ASSERT(mutex_owned(&iptun->iptun_lock) ||
799 	    (iptun->iptun_flags & IPTUN_CONDEMNED));
800 	ip_unbind(iptun->iptun_connp);
801 	iptun->iptun_flags &= ~IPTUN_BOUND;
802 	if (!(iptun->iptun_flags & IPTUN_CONDEMNED))
803 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
804 }
805 
806 /*
807  * Re-generate the template data-link header for a given IP tunnel given the
808  * tunnel's current parameters.
809  */
810 static void
811 iptun_headergen(iptun_t *iptun, boolean_t update_mac)
812 {
813 	switch (iptun->iptun_typeinfo->iti_ipvers) {
814 	case IPV4_VERSION:
815 		/*
816 		 * We only need to use a custom IP header if the administrator
817 		 * has supplied a non-default hoplimit.
818 		 */
819 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT) {
820 			iptun->iptun_header_size = 0;
821 			break;
822 		}
823 		iptun->iptun_header_size = sizeof (ipha_t);
824 		iptun->iptun_header4.ipha_version_and_hdr_length =
825 		    IP_SIMPLE_HDR_VERSION;
826 		iptun->iptun_header4.ipha_fragment_offset_and_flags =
827 		    htons(IPH_DF);
828 		iptun->iptun_header4.ipha_ttl = iptun->iptun_hoplimit;
829 		break;
830 	case IPV6_VERSION: {
831 		ip6_t	*ip6hp = &iptun->iptun_header6.it6h_ip6h;
832 
833 		/*
834 		 * We only need to use a custom IPv6 header if either the
835 		 * administrator has supplied a non-default hoplimit, or we
836 		 * need to include an encapsulation limit option in the outer
837 		 * header.
838 		 */
839 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT &&
840 		    iptun->iptun_encaplimit == 0) {
841 			iptun->iptun_header_size = 0;
842 			break;
843 		}
844 
845 		(void) memset(ip6hp, 0, sizeof (*ip6hp));
846 		if (iptun->iptun_encaplimit == 0) {
847 			iptun->iptun_header_size = sizeof (ip6_t);
848 			ip6hp->ip6_nxt = IPPROTO_NONE;
849 		} else {
850 			iptun_encaplim_t	*iel;
851 
852 			iptun->iptun_header_size = sizeof (iptun_ipv6hdrs_t);
853 			/*
854 			 * The mac_ipv6 plugin requires ip6_plen to be in host
855 			 * byte order and reflect the extension headers
856 			 * present in the template.  The actual network byte
857 			 * order ip6_plen will be set on a per-packet basis on
858 			 * transmit.
859 			 */
860 			ip6hp->ip6_plen = sizeof (*iel);
861 			ip6hp->ip6_nxt = IPPROTO_DSTOPTS;
862 			iel = &iptun->iptun_header6.it6h_encaplim;
863 			*iel = iptun_encaplim_init;
864 			iel->iel_telopt.ip6ot_encap_limit =
865 			    iptun->iptun_encaplimit;
866 		}
867 
868 		ip6hp->ip6_hlim = iptun->iptun_hoplimit;
869 		break;
870 	}
871 	}
872 
873 	if (update_mac)
874 		iptun_task_dispatch(iptun, IPTUN_TASK_PDATA_UPDATE);
875 }
876 
877 /*
878  * Insert inbound and outbound IPv4 and IPv6 policy into the given policy
879  * head.
880  */
881 static boolean_t
882 iptun_insert_simple_policies(ipsec_policy_head_t *ph, ipsec_act_t *actp,
883     uint_t n, netstack_t *ns)
884 {
885 	int f = IPSEC_AF_V4;
886 
887 	if (!ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) ||
888 	    !ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns))
889 		return (B_FALSE);
890 
891 	f = IPSEC_AF_V6;
892 	return (ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) &&
893 	    ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns));
894 }
895 
896 /*
897  * Used to set IPsec policy when policy is set through the IPTUN_CREATE or
898  * IPTUN_MODIFY ioctls.
899  */
900 static int
901 iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
902 {
903 	int		rc = 0;
904 	uint_t		nact;
905 	ipsec_act_t	*actp = NULL;
906 	boolean_t	clear_all, old_policy = B_FALSE;
907 	ipsec_tun_pol_t	*itp;
908 	char		name[MAXLINKNAMELEN];
909 	uint64_t	gen;
910 	netstack_t	*ns = iptun->iptun_ns;
911 
912 	/* Can't specify self-encap on a tunnel. */
913 	if (ipsr->ipsr_self_encap_req != 0)
914 		return (EINVAL);
915 
916 	/*
917 	 * If it's a "clear-all" entry, unset the security flags and resume
918 	 * normal cleartext (or inherit-from-global) policy.
919 	 */
920 	clear_all = ((ipsr->ipsr_ah_req & IPTUN_IPSEC_REQ_MASK) == 0 &&
921 	    (ipsr->ipsr_esp_req & IPTUN_IPSEC_REQ_MASK) == 0);
922 
923 	ASSERT(mutex_owned(&iptun->iptun_lock));
924 	itp = iptun->iptun_itp;
925 	if (itp == NULL) {
926 		if (clear_all)
927 			goto bail;
928 		if ((rc = dls_mgmt_get_linkinfo(iptun->iptun_linkid, name, NULL,
929 		    NULL, NULL)) != 0)
930 			goto bail;
931 		ASSERT(name[0] != '\0');
932 		if ((itp = create_tunnel_policy(name, &rc, &gen, ns)) == NULL)
933 			goto bail;
934 		iptun->iptun_itp = itp;
935 	}
936 
937 	/* Allocate the actvec now, before holding itp or polhead locks. */
938 	ipsec_actvec_from_req(ipsr, &actp, &nact, ns);
939 	if (actp == NULL) {
940 		rc = ENOMEM;
941 		goto bail;
942 	}
943 
944 	/*
945 	 * Just write on the active polhead.  Save the primary/secondary stuff
946 	 * for spdsock operations.
947 	 *
948 	 * Mutex because we need to write to the polhead AND flags atomically.
949 	 * Other threads will acquire the polhead lock as a reader if the
950 	 * (unprotected) flag is set.
951 	 */
952 	mutex_enter(&itp->itp_lock);
953 	if (itp->itp_flags & ITPF_P_TUNNEL) {
954 		/* Oops, we lost a race.  Let's get out of here. */
955 		rc = EBUSY;
956 		goto mutex_bail;
957 	}
958 	old_policy = ((itp->itp_flags & ITPF_P_ACTIVE) != 0);
959 
960 	if (old_policy) {
961 		ITPF_CLONE(itp->itp_flags);
962 		rc = ipsec_copy_polhead(itp->itp_policy, itp->itp_inactive, ns);
963 		if (rc != 0) {
964 			/* inactive has already been cleared. */
965 			itp->itp_flags &= ~ITPF_IFLAGS;
966 			goto mutex_bail;
967 		}
968 		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
969 		ipsec_polhead_flush(itp->itp_policy, ns);
970 	} else {
971 		/* Else assume itp->itp_policy is already flushed. */
972 		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
973 	}
974 
975 	if (clear_all) {
976 		ASSERT(avl_numnodes(&itp->itp_policy->iph_rulebyid) == 0);
977 		itp->itp_flags &= ~ITPF_PFLAGS;
978 		rw_exit(&itp->itp_policy->iph_lock);
979 		old_policy = B_FALSE;	/* Clear out the inactive one too. */
980 		goto recover_bail;
981 	}
982 
983 	if (iptun_insert_simple_policies(itp->itp_policy, actp, nact, ns)) {
984 		rw_exit(&itp->itp_policy->iph_lock);
985 		/*
986 		 * Adjust MTU and make sure the DL side knows what's up.
987 		 */
988 		itp->itp_flags = ITPF_P_ACTIVE;
989 		(void) iptun_update_mtu(iptun, 0);
990 		old_policy = B_FALSE;	/* Blank out inactive - we succeeded */
991 	} else {
992 		rw_exit(&itp->itp_policy->iph_lock);
993 		rc = ENOMEM;
994 	}
995 
996 recover_bail:
997 	if (old_policy) {
998 		/* Recover policy in in active polhead. */
999 		ipsec_swap_policy(itp->itp_policy, itp->itp_inactive, ns);
1000 		ITPF_SWAP(itp->itp_flags);
1001 	}
1002 
1003 	/* Clear policy in inactive polhead. */
1004 	itp->itp_flags &= ~ITPF_IFLAGS;
1005 	rw_enter(&itp->itp_inactive->iph_lock, RW_WRITER);
1006 	ipsec_polhead_flush(itp->itp_inactive, ns);
1007 	rw_exit(&itp->itp_inactive->iph_lock);
1008 
1009 mutex_bail:
1010 	mutex_exit(&itp->itp_lock);
1011 
1012 bail:
1013 	if (actp != NULL)
1014 		ipsec_actvec_free(actp, nact);
1015 
1016 	return (rc);
1017 }
1018 
1019 static iptun_typeinfo_t *
1020 iptun_gettypeinfo(iptun_type_t type)
1021 {
1022 	int i;
1023 
1024 	for (i = 0; iptun_type_table[i].iti_type != IPTUN_TYPE_UNKNOWN; i++) {
1025 		if (iptun_type_table[i].iti_type == type)
1026 			break;
1027 	}
1028 	return (&iptun_type_table[i]);
1029 }
1030 
1031 /*
1032  * Set the parameters included in ik on the tunnel iptun.  Parameters that can
1033  * only be set at creation time are set in iptun_create().
1034  */
1035 static int
1036 iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik)
1037 {
1038 	int		err = 0;
1039 	netstack_t	*ns = iptun->iptun_ns;
1040 	iptun_addr_t	orig_laddr, orig_raddr;
1041 	uint_t		orig_flags = iptun->iptun_flags;
1042 
1043 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR) {
1044 		if (orig_flags & IPTUN_LADDR)
1045 			orig_laddr = iptun->iptun_laddr;
1046 		if ((err = iptun_setladdr(iptun, &ik->iptun_kparam_laddr)) != 0)
1047 			return (err);
1048 		iptun->iptun_flags |= IPTUN_LADDR;
1049 	}
1050 
1051 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) {
1052 		if (orig_flags & IPTUN_RADDR)
1053 			orig_raddr = iptun->iptun_raddr;
1054 		if ((err = iptun_setraddr(iptun, &ik->iptun_kparam_raddr)) != 0)
1055 			goto done;
1056 		iptun->iptun_flags |= IPTUN_RADDR;
1057 	}
1058 
1059 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) {
1060 		/*
1061 		 * Set IPsec policy originating from the ifconfig(1M) command
1062 		 * line.  This is traditionally called "simple" policy because
1063 		 * the ipsec_req_t (iptun_kparam_secinfo) can only describe a
1064 		 * simple policy of "do ESP on everything" and/or "do AH on
1065 		 * everything" (as opposed to the rich policy that can be
1066 		 * defined with ipsecconf(1M)).
1067 		 */
1068 		if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
1069 			/*
1070 			 * Can't set security properties for automatic
1071 			 * tunnels.
1072 			 */
1073 			err = EINVAL;
1074 			goto done;
1075 		}
1076 
1077 		if (!ipsec_loaded(ns->netstack_ipsec)) {
1078 			/* If IPsec can be loaded, try and load it now. */
1079 			if (ipsec_failed(ns->netstack_ipsec)) {
1080 				err = EPROTONOSUPPORT;
1081 				goto done;
1082 			}
1083 			ipsec_loader_loadnow(ns->netstack_ipsec);
1084 			/*
1085 			 * ipsec_loader_loadnow() returns while IPsec is
1086 			 * loaded asynchronously.  While a method exists to
1087 			 * wait for IPsec to load (ipsec_loader_wait()), it
1088 			 * requires use of a STREAMS queue to do a qwait().
1089 			 * We're not in STREAMS context here, and so we can't
1090 			 * use it.  This is not a problem in practice because
1091 			 * in the vast majority of cases, key management and
1092 			 * global policy will have loaded before any tunnels
1093 			 * are plumbed, and so IPsec will already have been
1094 			 * loaded.
1095 			 */
1096 			err = EAGAIN;
1097 			goto done;
1098 		}
1099 
1100 		err = iptun_set_sec_simple(iptun, &ik->iptun_kparam_secinfo);
1101 		if (err == 0) {
1102 			iptun->iptun_flags |= IPTUN_SIMPLE_POLICY;
1103 			iptun->iptun_simple_policy = ik->iptun_kparam_secinfo;
1104 		}
1105 	}
1106 done:
1107 	if (err != 0) {
1108 		/* Restore original source and destination. */
1109 		if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR &&
1110 		    (orig_flags & IPTUN_LADDR))
1111 			iptun->iptun_laddr = orig_laddr;
1112 		if ((ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) &&
1113 		    (orig_flags & IPTUN_RADDR))
1114 			iptun->iptun_raddr = orig_raddr;
1115 		iptun->iptun_flags = orig_flags;
1116 	}
1117 	return (err);
1118 }
1119 
1120 static int
1121 iptun_register(iptun_t *iptun)
1122 {
1123 	mac_register_t	*mac;
1124 	int		err;
1125 
1126 	ASSERT(!(iptun->iptun_flags & IPTUN_MAC_REGISTERED));
1127 
1128 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
1129 		return (EINVAL);
1130 
1131 	mac->m_type_ident = iptun->iptun_typeinfo->iti_ident;
1132 	mac->m_driver = iptun;
1133 	mac->m_dip = iptun_dip;
1134 	mac->m_instance = (uint_t)-1;
1135 	mac->m_src_addr = (uint8_t *)&iptun->iptun_laddr.ia_addr;
1136 	mac->m_dst_addr = iptun->iptun_typeinfo->iti_hasraddr ?
1137 	    (uint8_t *)&iptun->iptun_raddr.ia_addr : NULL;
1138 	mac->m_callbacks = &iptun_m_callbacks;
1139 	mac->m_min_sdu = iptun->iptun_typeinfo->iti_minmtu;
1140 	mac->m_max_sdu = iptun->iptun_mtu;
1141 	if (iptun->iptun_header_size != 0) {
1142 		mac->m_pdata = &iptun->iptun_header;
1143 		mac->m_pdata_size = iptun->iptun_header_size;
1144 	}
1145 	if ((err = mac_register(mac, &iptun->iptun_mh)) == 0)
1146 		iptun->iptun_flags |= IPTUN_MAC_REGISTERED;
1147 	mac_free(mac);
1148 	return (err);
1149 }
1150 
1151 static int
1152 iptun_unregister(iptun_t *iptun)
1153 {
1154 	int err;
1155 
1156 	ASSERT(iptun->iptun_flags & IPTUN_MAC_REGISTERED);
1157 	if ((err = mac_unregister(iptun->iptun_mh)) == 0)
1158 		iptun->iptun_flags &= ~IPTUN_MAC_REGISTERED;
1159 	return (err);
1160 }
1161 
1162 static conn_t *
1163 iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
1164 {
1165 	conn_t *connp;
1166 
1167 	if ((connp = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ns)) == NULL)
1168 		return (NULL);
1169 
1170 	connp->conn_flags |= IPCL_IPTUN;
1171 	connp->conn_iptun = iptun;
1172 	connp->conn_recv = iptun_input;
1173 	connp->conn_rq = ns->netstack_iptun->iptuns_g_q;
1174 	connp->conn_wq = WR(connp->conn_rq);
1175 	/*
1176 	 * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
1177 	 * for all other conn_t's.
1178 	 *
1179 	 * Note that there's an important distinction between iptun_zoneid and
1180 	 * conn_zoneid.  The conn_zoneid is set to GLOBAL_ZONEID in non-global
1181 	 * exclusive stack zones to make the ip module believe that the
1182 	 * non-global zone is actually a global zone.  Therefore, when
1183 	 * interacting with the ip module, we must always use conn_zoneid.
1184 	 */
1185 	connp->conn_zoneid = (ns->netstack_stackid == GLOBAL_NETSTACKID) ?
1186 	    crgetzoneid(credp) : GLOBAL_ZONEID;
1187 	connp->conn_cred = credp;
1188 	/* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
1189 	crhold(connp->conn_cred);
1190 
1191 	connp->conn_send = iptun->iptun_typeinfo->iti_txfunc;
1192 	connp->conn_af_isv6 = iptun->iptun_typeinfo->iti_ipvers == IPV6_VERSION;
1193 	ASSERT(connp->conn_ref == 1);
1194 
1195 	mutex_enter(&connp->conn_lock);
1196 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1197 	mutex_exit(&connp->conn_lock);
1198 	return (connp);
1199 }
1200 
/*
 * Dispose of a tunnel's conn_t.  The conn is quiesced with ip_quiesce_conn(),
 * its back-pointer to the iptun_t is cleared, and the reference held on it is
 * dropped.
 */
static void
iptun_conn_destroy(conn_t *connp)
{
	ip_quiesce_conn(connp);
	connp->conn_iptun = NULL;
	/* The reference dropped below is expected to be the only one left. */
	ASSERT(connp->conn_ref == 1);
	CONN_DEC_REF(connp);
}
1209 
1210 static int
1211 iptun_create_g_q(iptun_stack_t *iptuns, cred_t *credp)
1212 {
1213 	int	err;
1214 	conn_t	*connp;
1215 
1216 	ASSERT(iptuns->iptuns_g_q == NULL);
1217 	/*
1218 	 * The global queue for this stack is set when iptunq_open() calls
1219 	 * iptun_set_g_q().
1220 	 */
1221 	err = ldi_open_by_name(IPTUNQ_DEV, FWRITE|FREAD, credp,
1222 	    &iptuns->iptuns_g_q_lh, iptun_ldi_ident);
1223 	if (err == 0) {
1224 		connp = iptuns->iptuns_g_q->q_ptr;
1225 		connp->conn_recv = iptun_input;
1226 	}
1227 	return (err);
1228 }
1229 
1230 static iptun_t *
1231 iptun_alloc(void)
1232 {
1233 	iptun_t *iptun;
1234 
1235 	if ((iptun = kmem_cache_alloc(iptun_cache, KM_NOSLEEP)) != NULL) {
1236 		bzero(iptun, sizeof (*iptun));
1237 		atomic_inc_32(&iptun_tunnelcount);
1238 	}
1239 	return (iptun);
1240 }
1241 
/*
 * Tear down and free a condemned iptun_t.  Each piece of state is undone only
 * if the corresponding flag or pointer shows that it was set up, so this can
 * be used on a partially constructed tunnel as well as on a fully created
 * one.  The caller must have set IPTUN_CONDEMNED.
 */
static void
iptun_free(iptun_t *iptun)
{
	ASSERT(iptun->iptun_flags & IPTUN_CONDEMNED);

	if (iptun->iptun_flags & IPTUN_HASH_INSERTED) {
		iptun_stack_t	*iptuns = iptun->iptun_iptuns;

		/* Remove from the global hash, then from the per-stack list. */
		mutex_enter(&iptun_hash_lock);
		VERIFY(mod_hash_remove(iptun_hash,
		    IPTUN_HASH_KEY(iptun->iptun_linkid),
		    (mod_hash_val_t *)&iptun) == 0);
		mutex_exit(&iptun_hash_lock);
		iptun->iptun_flags &= ~IPTUN_HASH_INSERTED;
		mutex_enter(&iptuns->iptuns_lock);
		list_remove(&iptuns->iptuns_iptunlist, iptun);
		mutex_exit(&iptuns->iptuns_lock);
	}

	if (iptun->iptun_flags & IPTUN_BOUND)
		iptun_unbind(iptun);

	/*
	 * After iptun_unregister(), there will be no threads executing a
	 * downcall from the mac module, including in the tx datapath.
	 */
	if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
		VERIFY(iptun_unregister(iptun) == 0);

	if (iptun->iptun_itp != NULL) {
		/*
		 * Remove from the AVL tree, AND release the reference iptun_t
		 * itself holds on the ITP.
		 */
		itp_unlink(iptun->iptun_itp, iptun->iptun_ns);
		ITP_REFRELE(iptun->iptun_itp, iptun->iptun_ns);
		iptun->iptun_itp = NULL;
		iptun->iptun_flags &= ~IPTUN_SIMPLE_POLICY;
	}

	/*
	 * After ipcl_conn_destroy(), there will be no threads executing an
	 * upcall from ip (i.e., iptun_input()), and it is then safe to free
	 * the iptun_t.
	 */
	if (iptun->iptun_connp != NULL) {
		iptun_conn_destroy(iptun->iptun_connp);
		iptun->iptun_connp = NULL;
	}

	/* Drop the netstack and credential references held by the iptun_t. */
	netstack_rele(iptun->iptun_ns);
	iptun->iptun_ns = NULL;
	crfree(iptun->iptun_cred);
	iptun->iptun_cred = NULL;

	/* Return the structure to the cache and undo iptun_alloc()'s count. */
	kmem_cache_free(iptun_cache, iptun);
	atomic_dec_32(&iptun_tunnelcount);
}
1300 
/*
 * Create a new IP tunnel datalink described by ik on behalf of credp.
 *
 * ik must carry the tunnel type (IPTUN_KPARAM_TYPE) and the linkid to use;
 * any other parameters present are applied through iptun_setparams().
 *
 * Returns 0 on success or an errno value.  On failure after the iptun_t has
 * been allocated, the devnet node is destroyed if it had been created and the
 * iptun_t is released through iptun_free().
 */
int
iptun_create(iptun_kparams_t *ik, cred_t *credp)
{
	iptun_t		*iptun = NULL;
	int		err = 0, mherr;
	char		linkname[MAXLINKNAMELEN];
	ipsec_tun_pol_t	*itp;
	netstack_t	*ns = NULL;
	iptun_stack_t	*iptuns;
	datalink_id_t	tmpid;
	zoneid_t	zoneid = crgetzoneid(credp);
	boolean_t	link_created = B_FALSE;

	/* The tunnel type is mandatory */
	if (!(ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE))
		return (EINVAL);

	/*
	 * Is the linkid that the caller wishes to associate with this new
	 * tunnel assigned to this zone?
	 */
	if (zone_check_datalink(&zoneid, ik->iptun_kparam_linkid) != 0) {
		if (zoneid != GLOBAL_ZONEID)
			return (EINVAL);
	} else if (zoneid == GLOBAL_ZONEID) {
		return (EINVAL);
	}

	/*
	 * Make sure that we're not trying to create a tunnel that has already
	 * been created.
	 */
	if (iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun) == 0) {
		iptun_exit(iptun);
		/* Not ours: clear it so the cleanup at done: leaves it alone. */
		iptun = NULL;
		err = EEXIST;
		goto done;
	}

	ns = netstack_find_by_cred(credp);
	iptuns = ns->netstack_iptun;

	/*
	 * Before we create any tunnel, we need to ensure that the default
	 * STREAMS queue (used to satisfy the ip module's requirement for one)
	 * is created.  We only do this once per stack.  The stream is closed
	 * when the stack is destroyed in iptun_stack_fni().
	 */
	mutex_enter(&iptuns->iptuns_lock);
	if (iptuns->iptuns_g_q == NULL)
		err = iptun_create_g_q(iptuns, zone_kcred());
	mutex_exit(&iptuns->iptuns_lock);
	if (err != 0)
		goto done;

	if ((iptun = iptun_alloc()) == NULL) {
		err = ENOMEM;
		goto done;
	}

	/*
	 * From here on the iptun_t owns the netstack reference and a hold on
	 * credp; both are released by iptun_free().
	 */
	iptun->iptun_linkid = ik->iptun_kparam_linkid;
	iptun->iptun_zoneid = zoneid;
	crhold(credp);
	iptun->iptun_cred = credp;
	iptun->iptun_ns = ns;

	iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_UNKNOWN) {
		err = EINVAL;
		goto done;
	}

	if (ik->iptun_kparam_flags & IPTUN_KPARAM_IMPLICIT)
		iptun->iptun_flags |= IPTUN_IMPLICIT;

	if ((err = iptun_setparams(iptun, ik)) != 0)
		goto done;

	iptun->iptun_hoplimit = IPTUN_DEFAULT_HOPLIMIT;
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_IPV6)
		iptun->iptun_encaplimit = IPTUN_DEFAULT_ENCAPLIMIT;

	iptun_headergen(iptun, B_FALSE);

	iptun->iptun_connp = iptun_conn_create(iptun, ns, credp);
	if (iptun->iptun_connp == NULL) {
		err = ENOMEM;
		goto done;
	}

	/* Start out at the type's maximum MTU. */
	iptun->iptun_mtu = iptun->iptun_typeinfo->iti_maxmtu;
	iptun->iptun_dpmtu = iptun->iptun_mtu;

	/*
	 * Find an ITP based on linkname.  If we have parms already set via
	 * the iptun_setparams() call above, it may have created an ITP for
	 * us.  We always try get_tunnel_policy() for DEBUG correctness
	 * checks, and we may wish to refactor this to only check when
	 * iptun_itp is NULL.
	 */
	if ((err = dls_mgmt_get_linkinfo(iptun->iptun_linkid, linkname, NULL,
	    NULL, NULL)) != 0)
		goto done;
	if ((itp = get_tunnel_policy(linkname, ns)) != NULL)
		iptun->iptun_itp = itp;

	/*
	 * See if we have the necessary IP addresses assigned to this tunnel
	 * to try and bind them with ip underneath us.  If we're not ready to
	 * bind yet, then we'll defer the bind operation until the addresses
	 * are modified.
	 */
	if (iptun_canbind(iptun) && ((err = iptun_bind(iptun)) != 0))
		goto done;

	if ((err = iptun_register(iptun)) != 0)
		goto done;

	err = dls_devnet_create(iptun->iptun_mh, iptun->iptun_linkid,
	    iptun->iptun_zoneid);
	if (err != 0)
		goto done;
	/* Remember that the devnet node must be destroyed on failure. */
	link_created = B_TRUE;

	/*
	 * We hash by link-id as that is the key used by all other iptun
	 * interfaces (modify, delete, etc.).
	 */
	if ((mherr = mod_hash_insert(iptun_hash,
	    IPTUN_HASH_KEY(iptun->iptun_linkid), (mod_hash_val_t)iptun)) == 0) {
		mutex_enter(&iptuns->iptuns_lock);
		list_insert_head(&iptuns->iptuns_iptunlist, iptun);
		mutex_exit(&iptuns->iptuns_lock);
		iptun->iptun_flags |= IPTUN_HASH_INSERTED;
	} else if (mherr == MH_ERR_NOMEM) {
		err = ENOMEM;
	} else if (mherr == MH_ERR_DUPLICATE) {
		err = EEXIST;
	} else {
		err = EINVAL;
	}

done:
	/*
	 * With no iptun_t to take ownership of it, the netstack reference
	 * (if one was obtained) is released here; otherwise iptun_free()
	 * releases it on the failure path below.
	 */
	if (iptun == NULL && ns != NULL)
		netstack_rele(ns);
	if (err != 0 && iptun != NULL) {
		if (link_created) {
			(void) dls_devnet_destroy(iptun->iptun_mh, &tmpid,
			    B_TRUE);
		}
		/* iptun_free() asserts that the tunnel is condemned. */
		iptun->iptun_flags |= IPTUN_CONDEMNED;
		iptun_free(iptun);
	}
	return (err);
}
1456 
/*
 * Delete the tunnel identified by linkid on behalf of credp.
 *
 * Returns 0 once the tunnel has been freed.  Otherwise returns the error from
 * iptun_enter_by_linkid(), EACCES if the tunnel belongs to a zone other than
 * the caller's, or the error from dls_devnet_destroy() or mac_disable(), in
 * which case the tunnel is left in place.
 */
int
iptun_delete(datalink_id_t linkid, cred_t *credp)
{
	int	err;
	iptun_t	*iptun = NULL;

	if ((err = iptun_enter_by_linkid(linkid, &iptun)) != 0)
		return (err);

	/* One cannot delete a tunnel that belongs to another zone. */
	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
		iptun_exit(iptun);
		return (EACCES);
	}

	/*
	 * We need to exit iptun in order to issue calls up the stack such as
	 * dls_devnet_destroy().  If we call up while still in iptun, deadlock
	 * with calls coming down the stack is possible.  We prevent other
	 * threads from entering this iptun after we've exited it by setting
	 * the IPTUN_DELETE_PENDING flag.  This will cause callers of
	 * iptun_enter() to block waiting on iptun_enter_cv.  The assumption
	 * here is that the functions we're calling while IPTUN_DELETE_PENDING
	 * is set don't result in an iptun_enter() call, as that would result
	 * in deadlock.
	 */
	iptun->iptun_flags |= IPTUN_DELETE_PENDING;

	/* Wait for any pending upcall to the mac module to complete. */
	while (iptun->iptun_flags & IPTUN_UPCALL_PENDING)
		cv_wait(&iptun->iptun_upcall_cv, &iptun->iptun_lock);

	iptun_exit(iptun);

	if ((err = dls_devnet_destroy(iptun->iptun_mh, &linkid, B_TRUE)) == 0) {
		/*
		 * mac_disable() will fail with EBUSY if there are references
		 * to the iptun MAC.  If there are none, then mac_disable()
		 * will assure that none can be acquired until the MAC is
		 * unregistered.
		 *
		 * XXX CR 6791335 prevents us from calling mac_disable() prior
		 * to dls_devnet_destroy(), so we unfortunately need to
		 * attempt to re-create the devnet node if mac_disable()
		 * fails.
		 */
		if ((err = mac_disable(iptun->iptun_mh)) != 0) {
			(void) dls_devnet_create(iptun->iptun_mh, linkid,
			    iptun->iptun_zoneid);
		}
	}

	/*
	 * Now that we know the fate of this iptun_t, we need to clear
	 * IPTUN_DELETE_PENDING, and set IPTUN_CONDEMNED if the iptun_t is
	 * slated to be freed.  Either way, we need to signal the threads
	 * waiting in iptun_enter() so that they can either fail if
	 * IPTUN_CONDEMNED is set, or continue if it's not.
	 */
	mutex_enter(&iptun->iptun_lock);
	iptun->iptun_flags &= ~IPTUN_DELETE_PENDING;
	if (err == 0)
		iptun->iptun_flags |= IPTUN_CONDEMNED;
	cv_broadcast(&iptun->iptun_enter_cv);
	mutex_exit(&iptun->iptun_lock);

	/*
	 * Note that there is no danger in calling iptun_free() after having
	 * dropped the iptun_lock since callers of iptun_enter() at this point
	 * are doing so from iptun_enter_by_linkid() (mac_disable() got rid of
	 * threads entering from mac callbacks which call iptun_enter()
	 * directly) which holds iptun_hash_lock, and iptun_free() grabs this
	 * lock in order to remove the iptun_t from the hash table.
	 */
	if (err == 0)
		iptun_free(iptun);

	return (err);
}
1536 
1537 int
1538 iptun_modify(const iptun_kparams_t *ik, cred_t *credp)
1539 {
1540 	iptun_t		*iptun;
1541 	boolean_t	laddr_change = B_FALSE, raddr_change = B_FALSE;
1542 	int		err;
1543 
1544 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1545 		return (err);
1546 
1547 	/* One cannot modify a tunnel that belongs to another zone. */
1548 	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1549 		err = EACCES;
1550 		goto done;
1551 	}
1552 
1553 	/* The tunnel type cannot be changed */
1554 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE) {
1555 		err = EINVAL;
1556 		goto done;
1557 	}
1558 
1559 	if ((err = iptun_setparams(iptun, ik)) != 0)
1560 		goto done;
1561 	iptun_headergen(iptun, B_FALSE);
1562 
1563 	/*
1564 	 * If any of the tunnel's addresses has been modified and the tunnel
1565 	 * has the necessary addresses assigned to it, we need to try to bind
1566 	 * with ip underneath us.  If we're not ready to bind yet, then we'll
1567 	 * try again when the addresses are modified later.
1568 	 */
1569 	laddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR);
1570 	raddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR);
1571 	if (laddr_change || raddr_change) {
1572 		if (iptun->iptun_flags & IPTUN_BOUND)
1573 			iptun_unbind(iptun);
1574 		if (iptun_canbind(iptun) && (err = iptun_bind(iptun)) != 0) {
1575 			if (laddr_change)
1576 				iptun->iptun_flags &= ~IPTUN_LADDR;
1577 			if (raddr_change)
1578 				iptun->iptun_flags &= ~IPTUN_RADDR;
1579 			goto done;
1580 		}
1581 	}
1582 
1583 	if (laddr_change)
1584 		iptun_task_dispatch(iptun, IPTUN_TASK_LADDR_UPDATE);
1585 	if (raddr_change)
1586 		iptun_task_dispatch(iptun, IPTUN_TASK_RADDR_UPDATE);
1587 
1588 done:
1589 	iptun_exit(iptun);
1590 	return (err);
1591 }
1592 
1593 /* Given an IP tunnel's datalink id, fill in its parameters. */
1594 int
1595 iptun_info(iptun_kparams_t *ik, cred_t *credp)
1596 {
1597 	iptun_t	*iptun;
1598 	int	err;
1599 
1600 	/* Is the tunnel link visible from the caller's zone? */
1601 	if (!dls_devnet_islinkvisible(ik->iptun_kparam_linkid,
1602 	    crgetzoneid(credp)))
1603 		return (ENOENT);
1604 
1605 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1606 		return (err);
1607 
1608 	bzero(ik, sizeof (iptun_kparams_t));
1609 
1610 	ik->iptun_kparam_linkid = iptun->iptun_linkid;
1611 	ik->iptun_kparam_type = iptun->iptun_typeinfo->iti_type;
1612 	ik->iptun_kparam_flags |= IPTUN_KPARAM_TYPE;
1613 
1614 	if (iptun->iptun_flags & IPTUN_LADDR) {
1615 		iptun_getaddr(&iptun->iptun_laddr, &ik->iptun_kparam_laddr);
1616 		ik->iptun_kparam_flags |= IPTUN_KPARAM_LADDR;
1617 	}
1618 	if (iptun->iptun_flags & IPTUN_RADDR) {
1619 		iptun_getaddr(&iptun->iptun_raddr, &ik->iptun_kparam_raddr);
1620 		ik->iptun_kparam_flags |= IPTUN_KPARAM_RADDR;
1621 	}
1622 
1623 	if (iptun->iptun_flags & IPTUN_IMPLICIT)
1624 		ik->iptun_kparam_flags |= IPTUN_KPARAM_IMPLICIT;
1625 
1626 	if (iptun->iptun_itp != NULL) {
1627 		mutex_enter(&iptun->iptun_itp->itp_lock);
1628 		if (iptun->iptun_itp->itp_flags & ITPF_P_ACTIVE) {
1629 			ik->iptun_kparam_flags |= IPTUN_KPARAM_IPSECPOL;
1630 			if (iptun->iptun_flags & IPTUN_SIMPLE_POLICY) {
1631 				ik->iptun_kparam_flags |= IPTUN_KPARAM_SECINFO;
1632 				ik->iptun_kparam_secinfo =
1633 				    iptun->iptun_simple_policy;
1634 			}
1635 		}
1636 		mutex_exit(&iptun->iptun_itp->itp_lock);
1637 	}
1638 
1639 done:
1640 	iptun_exit(iptun);
1641 	return (err);
1642 }
1643 
1644 int
1645 iptun_set_6to4relay(netstack_t *ns, ipaddr_t relay_addr)
1646 {
1647 	if (relay_addr == INADDR_BROADCAST || CLASSD(relay_addr))
1648 		return (EADDRNOTAVAIL);
1649 	ns->netstack_iptun->iptuns_relay_rtr_addr = relay_addr;
1650 	return (0);
1651 }
1652 
1653 void
1654 iptun_get_6to4relay(netstack_t *ns, ipaddr_t *relay_addr)
1655 {
1656 	*relay_addr = ns->netstack_iptun->iptuns_relay_rtr_addr;
1657 }
1658 
1659 void
1660 iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
1661 {
1662 	iptun_t	*iptun;
1663 
1664 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
1665 		return;
1666 	if (iptun->iptun_itp != itp) {
1667 		ASSERT(iptun->iptun_itp == NULL);
1668 		ITP_REFHOLD(itp);
1669 		iptun->iptun_itp = itp;
1670 		/* IPsec policy means IPsec overhead, which means lower MTU. */
1671 		(void) iptun_update_mtu(iptun, 0);
1672 	}
1673 	iptun_exit(iptun);
1674 }
1675 
1676 /*
1677  * Obtain the path MTU to the tunnel destination.
1678  */
1679 static uint32_t
1680 iptun_get_dst_pmtu(iptun_t *iptun)
1681 {
1682 	ire_t		*ire = NULL;
1683 	ip_stack_t	*ipst = iptun->iptun_ns->netstack_ip;
1684 	uint32_t	pmtu = 0;
1685 
1686 	/*
1687 	 * We only obtain the destination IRE for tunnels that have a remote
1688 	 * tunnel address.
1689 	 */
1690 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1691 		return (0);
1692 
1693 	switch (iptun->iptun_typeinfo->iti_ipvers) {
1694 	case IPV4_VERSION:
1695 		ire = ire_route_lookup(iptun->iptun_raddr4, INADDR_ANY,
1696 		    INADDR_ANY, 0, NULL, NULL, iptun->iptun_connp->conn_zoneid,
1697 		    NULL, (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
1698 		break;
1699 	case IPV6_VERSION:
1700 		ire = ire_route_lookup_v6(&iptun->iptun_raddr6, NULL, NULL, 0,
1701 		    NULL, NULL, iptun->iptun_connp->conn_zoneid, NULL,
1702 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
1703 		break;
1704 	}
1705 
1706 	if (ire != NULL) {
1707 		pmtu = ire->ire_max_frag;
1708 		ire_refrele(ire);
1709 	}
1710 	return (pmtu);
1711 }
1712 
1713 /*
1714  * Returns the max of old_ovhd and the overhead associated with pol.
1715  */
1716 static uint32_t
1717 iptun_max_policy_overhead(ipsec_policy_t *pol, uint32_t old_ovhd)
1718 {
1719 	uint32_t new_ovhd = old_ovhd;
1720 
1721 	while (pol != NULL) {
1722 		new_ovhd = max(new_ovhd,
1723 		    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1724 		pol = pol->ipsp_hash.hash_next;
1725 	}
1726 	return (new_ovhd);
1727 }
1728 
1729 static uint32_t
1730 iptun_get_ipsec_overhead(iptun_t *iptun)
1731 {
1732 	ipsec_policy_root_t	*ipr;
1733 	ipsec_policy_head_t	*iph;
1734 	ipsec_policy_t		*pol;
1735 	ipsec_selector_t	sel;
1736 	int			i;
1737 	uint32_t		ipsec_ovhd = 0;
1738 	ipsec_tun_pol_t		*itp = iptun->iptun_itp;
1739 	netstack_t		*ns = iptun->iptun_ns;
1740 
1741 	if (itp == NULL || !(itp->itp_flags & ITPF_P_ACTIVE)) {
1742 		/*
1743 		 * Consult global policy, just in case.  This will only work
1744 		 * if we have both source and destination addresses to work
1745 		 * with.
1746 		 */
1747 		if ((iptun->iptun_flags & (IPTUN_LADDR|IPTUN_RADDR)) !=
1748 		    (IPTUN_LADDR|IPTUN_RADDR))
1749 			return (0);
1750 
1751 		iph = ipsec_system_policy(ns);
1752 		bzero(&sel, sizeof (sel));
1753 		sel.ips_isv4 =
1754 		    (iptun->iptun_typeinfo->iti_ipvers == IPV4_VERSION);
1755 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1756 		case IPV4_VERSION:
1757 			sel.ips_local_addr_v4 = iptun->iptun_laddr4;
1758 			sel.ips_remote_addr_v4 = iptun->iptun_raddr4;
1759 			break;
1760 		case IPV6_VERSION:
1761 			sel.ips_local_addr_v6 = iptun->iptun_laddr6;
1762 			sel.ips_remote_addr_v6 = iptun->iptun_raddr6;
1763 			break;
1764 		}
1765 		/* Check for both IPv4 and IPv6. */
1766 		sel.ips_protocol = IPPROTO_ENCAP;
1767 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1768 		    &sel, ns);
1769 		if (pol != NULL) {
1770 			ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
1771 			IPPOL_REFRELE(pol, ns);
1772 		}
1773 		sel.ips_protocol = IPPROTO_IPV6;
1774 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1775 		    &sel, ns);
1776 		if (pol != NULL) {
1777 			ipsec_ovhd = max(ipsec_ovhd,
1778 			    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1779 			IPPOL_REFRELE(pol, ns);
1780 		}
1781 		IPPH_REFRELE(iph, ns);
1782 	} else {
1783 		/*
1784 		 * Look through all of the possible IPsec actions for the
1785 		 * tunnel, and find the largest potential IPsec overhead.
1786 		 */
1787 		iph = itp->itp_policy;
1788 		rw_enter(&iph->iph_lock, RW_READER);
1789 		ipr = &(iph->iph_root[IPSEC_TYPE_OUTBOUND]);
1790 		ipsec_ovhd = iptun_max_policy_overhead(
1791 		    ipr->ipr_nonhash[IPSEC_AF_V4], 0);
1792 		ipsec_ovhd = iptun_max_policy_overhead(
1793 		    ipr->ipr_nonhash[IPSEC_AF_V6], ipsec_ovhd);
1794 		for (i = 0; i < ipr->ipr_nchains; i++) {
1795 			ipsec_ovhd = iptun_max_policy_overhead(
1796 			    ipr->ipr_hash[i].hash_head, ipsec_ovhd);
1797 		}
1798 		rw_exit(&iph->iph_lock);
1799 	}
1800 
1801 	return (ipsec_ovhd);
1802 }
1803 
1804 /*
1805  * Calculate and return the maximum possible MTU for the given tunnel.
1806  */
1807 static uint32_t
1808 iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu)
1809 {
1810 	size_t		header_size, ipsec_overhead;
1811 	uint32_t	maxmtu, pmtu;
1812 
1813 	/*
1814 	 * Start with the path-MTU to the remote address, which is either
1815 	 * provided as the new_pmtu argument, or obtained using
1816 	 * iptun_get_dst_pmtu().
1817 	 */
1818 	if (new_pmtu != 0) {
1819 		if (iptun->iptun_flags & IPTUN_RADDR) {
1820 			iptun->iptun_dpmtu = new_pmtu;
1821 			iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt();
1822 		}
1823 		pmtu = new_pmtu;
1824 	} else if (iptun->iptun_flags & IPTUN_RADDR) {
1825 		if ((pmtu = iptun_get_dst_pmtu(iptun)) == 0) {
1826 			/*
1827 			 * We weren't able to obtain the path-MTU of the
1828 			 * destination.  Use the previous value.
1829 			 */
1830 			pmtu = iptun->iptun_dpmtu;
1831 		} else {
1832 			iptun->iptun_dpmtu = pmtu;
1833 			iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt();
1834 		}
1835 	} else {
1836 		/*
1837 		 * We have no path-MTU information to go on, use the maximum
1838 		 * possible value.
1839 		 */
1840 		pmtu = iptun->iptun_typeinfo->iti_maxmtu;
1841 	}
1842 
1843 	/*
1844 	 * Now calculate tunneling overhead and subtract that from the
1845 	 * path-MTU information obtained above.
1846 	 */
1847 	if (iptun->iptun_header_size != 0) {
1848 		header_size = iptun->iptun_header_size;
1849 	} else {
1850 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1851 		case IPV4_VERSION:
1852 			header_size = sizeof (ipha_t);
1853 			if (is_system_labeled())
1854 				header_size += IP_MAX_OPT_LENGTH;
1855 			break;
1856 		case IPV6_VERSION:
1857 			header_size = sizeof (iptun_ipv6hdrs_t);
1858 			break;
1859 		}
1860 	}
1861 
1862 	ipsec_overhead = iptun_get_ipsec_overhead(iptun);
1863 
1864 	maxmtu = pmtu - (header_size + ipsec_overhead);
1865 	return (max(maxmtu, iptun->iptun_typeinfo->iti_minmtu));
1866 }
1867 
1868 /*
1869  * Re-calculate the tunnel's MTU and notify the MAC layer of any change in
1870  * MTU.  The new_pmtu argument is the new path MTU to the tunnel destination
1871  * to be used in the tunnel MTU calculation.  Passing in 0 for new_pmtu causes
1872  * the path MTU to be dynamically updated using iptun_update_pmtu().
1873  *
1874  * If the calculated tunnel MTU is different than its previous value, then we
1875  * notify the MAC layer above us of this change using mac_maxsdu_update().
1876  */
1877 static uint32_t
1878 iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu)
1879 {
1880 	uint32_t newmtu;
1881 
1882 	/*
1883 	 * We return the current MTU without updating it if it was pegged to a
1884 	 * static value using the MAC_PROP_MTU link property.
1885 	 */
1886 	if (iptun->iptun_flags & IPTUN_FIXED_MTU)
1887 		return (iptun->iptun_mtu);
1888 
1889 	/* If the MTU isn't fixed, then use the maximum possible value. */
1890 	newmtu = iptun_get_maxmtu(iptun, new_pmtu);
1891 
1892 	/*
1893 	 * We only dynamically adjust the tunnel MTU for tunnels with
1894 	 * destinations because dynamic MTU calculations are based on the
1895 	 * destination path-MTU.
1896 	 */
1897 	if ((iptun->iptun_flags & IPTUN_RADDR) && newmtu != iptun->iptun_mtu) {
1898 		iptun->iptun_mtu = newmtu;
1899 		if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
1900 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
1901 	}
1902 
1903 	return (newmtu);
1904 }
1905 
1906 /*
1907  * Frees a packet or packet chain and bumps stat for each freed packet.
1908  */
1909 static void
1910 iptun_drop_pkt(mblk_t *mp, uint64_t *stat)
1911 {
1912 	mblk_t *pktmp;
1913 
1914 	for (pktmp = mp; pktmp != NULL; pktmp = mp) {
1915 		mp = mp->b_next;
1916 		pktmp->b_next = NULL;
1917 		if (stat != NULL)
1918 			atomic_inc_64(stat);
1919 		freemsg(pktmp);
1920 	}
1921 }
1922 
1923 /*
1924  * Allocate and return a new mblk to hold an IP and ICMP header, and chain the
1925  * original packet to its b_cont.  Returns NULL on failure.
1926  */
1927 static mblk_t *
1928 iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
1929 {
1930 	mblk_t *icmperr_mp;
1931 
1932 	if ((icmperr_mp = allocb_tmpl(hdrs_size, orig_pkt)) != NULL) {
1933 		icmperr_mp->b_wptr += hdrs_size;
1934 		/* tack on the offending packet */
1935 		icmperr_mp->b_cont = orig_pkt;
1936 	}
1937 	return (icmperr_mp);
1938 }
1939 
1940 /*
1941  * Transmit an ICMP error.  mp->b_rptr points at the packet to be included in
1942  * the ICMP error.
1943  */
1944 static void
1945 iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp)
1946 {
1947 	size_t	orig_pktsize, hdrs_size;
1948 	mblk_t	*icmperr_mp;
1949 	ipha_t	*new_ipha;
1950 	icmph_t	*new_icmp;
1951 
1952 	orig_pktsize = msgdsize(mp);
1953 	hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
1954 	if ((icmperr_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
1955 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
1956 		return;
1957 	}
1958 
1959 	new_ipha = (ipha_t *)icmperr_mp->b_rptr;
1960 	new_icmp = (icmph_t *)(new_ipha + 1);
1961 
1962 	new_ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
1963 	new_ipha->ipha_type_of_service = 0;
1964 	new_ipha->ipha_ident = 0;
1965 	new_ipha->ipha_fragment_offset_and_flags = 0;
1966 	new_ipha->ipha_ttl = orig_ipha->ipha_ttl;
1967 	new_ipha->ipha_protocol = IPPROTO_ICMP;
1968 	new_ipha->ipha_src = orig_ipha->ipha_dst;
1969 	new_ipha->ipha_dst = orig_ipha->ipha_src;
1970 	new_ipha->ipha_hdr_checksum = 0; /* will be computed by ip */
1971 	new_ipha->ipha_length = htons(hdrs_size + orig_pktsize);
1972 
1973 	*new_icmp = *icmp;
1974 	new_icmp->icmph_checksum = 0;
1975 	new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);
1976 
1977 	ip_output(iptun->iptun_connp, icmperr_mp, iptun->iptun_connp->conn_wq,
1978 	    IP_WPUT);
1979 }
1980 
1981 static void
1982 iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp)
1983 {
1984 	size_t	orig_pktsize, hdrs_size;
1985 	mblk_t	*icmp6err_mp;
1986 	ip6_t	*new_ip6h;
1987 	icmp6_t	*new_icmp6;
1988 
1989 	orig_pktsize = msgdsize(mp);
1990 	hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
1991 	if ((icmp6err_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
1992 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
1993 		return;
1994 	}
1995 
1996 	new_ip6h = (ip6_t *)icmp6err_mp->b_rptr;
1997 	new_icmp6 = (icmp6_t *)(new_ip6h + 1);
1998 
1999 	new_ip6h->ip6_vcf = orig_ip6h->ip6_vcf;
2000 	new_ip6h->ip6_plen = htons(sizeof (icmp6_t) + orig_pktsize);
2001 	new_ip6h->ip6_hops = orig_ip6h->ip6_hops;
2002 	new_ip6h->ip6_nxt = IPPROTO_ICMPV6;
2003 	new_ip6h->ip6_src = orig_ip6h->ip6_dst;
2004 	new_ip6h->ip6_dst = orig_ip6h->ip6_src;
2005 
2006 	*new_icmp6 = *icmp6;
2007 	/* The checksum is calculated in ip_wput_ire_v6(). */
2008 	new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;
2009 
2010 	ip_output_v6(iptun->iptun_connp, icmp6err_mp,
2011 	    iptun->iptun_connp->conn_wq, IP_WPUT);
2012 }
2013 
2014 static void
2015 iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
2016     uint8_t type, uint8_t code)
2017 {
2018 	icmph_t icmp;
2019 
2020 	bzero(&icmp, sizeof (icmp));
2021 	icmp.icmph_type = type;
2022 	icmp.icmph_code = code;
2023 
2024 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp);
2025 }
2026 
2027 static void
2028 iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
2029     mblk_t *mp)
2030 {
2031 	icmph_t	icmp;
2032 
2033 	icmp.icmph_type = ICMP_DEST_UNREACHABLE;
2034 	icmp.icmph_code = ICMP_FRAGMENTATION_NEEDED;
2035 	icmp.icmph_du_zero = 0;
2036 	icmp.icmph_du_mtu = htons(newmtu);
2037 
2038 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp);
2039 }
2040 
2041 static void
2042 iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
2043     uint8_t type, uint8_t code, uint32_t offset)
2044 {
2045 	icmp6_t icmp6;
2046 
2047 	bzero(&icmp6, sizeof (icmp6));
2048 	icmp6.icmp6_type = type;
2049 	icmp6.icmp6_code = code;
2050 	if (type == ICMP6_PARAM_PROB)
2051 		icmp6.icmp6_pptr = htonl(offset);
2052 
2053 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp);
2054 }
2055 
2056 static void
2057 iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
2058     mblk_t *mp)
2059 {
2060 	icmp6_t icmp6;
2061 
2062 	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
2063 	icmp6.icmp6_code = 0;
2064 	icmp6.icmp6_mtu = htonl(newmtu);
2065 
2066 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp);
2067 }
2068 
2069 /*
2070  * Determines if the packet pointed to by ipha or ip6h is an ICMP error.  The
2071  * mp argument is only used to do bounds checking.
2072  */
2073 static boolean_t
2074 is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2075 {
2076 	uint16_t hlen;
2077 
2078 	if (ipha != NULL) {
2079 		icmph_t	*icmph;
2080 
2081 		ASSERT(ip6h == NULL);
2082 		if (ipha->ipha_protocol != IPPROTO_ICMP)
2083 			return (B_FALSE);
2084 
2085 		hlen = IPH_HDR_LENGTH(ipha);
2086 		icmph = (icmph_t *)((uint8_t *)ipha + hlen);
2087 		return (ICMP_IS_ERROR(icmph->icmph_type) ||
2088 		    icmph->icmph_type == ICMP_REDIRECT);
2089 	} else {
2090 		icmp6_t	*icmp6;
2091 		uint8_t	*nexthdrp;
2092 
2093 		ASSERT(ip6h != NULL);
2094 		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hlen, &nexthdrp) ||
2095 		    *nexthdrp != IPPROTO_ICMPV6) {
2096 			return (B_FALSE);
2097 		}
2098 
2099 		icmp6 = (icmp6_t *)((uint8_t *)ip6h + hlen);
2100 		return (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
2101 		    icmp6->icmp6_type == ND_REDIRECT);
2102 	}
2103 }
2104 
2105 /*
2106  * Find inner and outer IP headers from a tunneled packet as setup for calls
2107  * into ipsec_tun_{in,out}bound().
2108  */
2109 static size_t
2110 iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6,
2111     ip6_t **inner6)
2112 {
2113 	ipha_t	*ipha;
2114 	size_t	outer_hlen;
2115 	size_t	first_mblkl = MBLKL(mp);
2116 	mblk_t	*inner_mp;
2117 
2118 	/*
2119 	 * Don't bother handling packets that don't have a full IP header in
2120 	 * the fist mblk.  For the input path, the ip module ensures that this
2121 	 * won't happen, and on the output path, the IP tunneling MAC-type
2122 	 * plugins ensure that this also won't happen.
2123 	 */
2124 	if (first_mblkl < sizeof (ipha_t))
2125 		return (0);
2126 	ipha = (ipha_t *)(mp->b_rptr);
2127 	switch (IPH_HDR_VERSION(ipha)) {
2128 	case IPV4_VERSION:
2129 		*outer4 = ipha;
2130 		*outer6 = NULL;
2131 		outer_hlen = IPH_HDR_LENGTH(ipha);
2132 		break;
2133 	case IPV6_VERSION:
2134 		*outer4 = NULL;
2135 		*outer6 = (ip6_t *)ipha;
2136 		outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
2137 		break;
2138 	default:
2139 		return (0);
2140 	}
2141 
2142 	if (first_mblkl < outer_hlen ||
2143 	    (first_mblkl == outer_hlen && mp->b_cont == NULL))
2144 		return (0);
2145 
2146 	/*
2147 	 * We don't bother doing a pullup here since the outer header will
2148 	 * just get stripped off soon on input anyway.  We just want to ensure
2149 	 * that the inner* pointer points to a full header.
2150 	 */
2151 	if (first_mblkl == outer_hlen) {
2152 		inner_mp = mp->b_cont;
2153 		ipha = (ipha_t *)inner_mp->b_rptr;
2154 	} else {
2155 		inner_mp = mp;
2156 		ipha = (ipha_t *)(mp->b_rptr + outer_hlen);
2157 	}
2158 	switch (IPH_HDR_VERSION(ipha)) {
2159 	case IPV4_VERSION:
2160 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ipha_t))
2161 			return (0);
2162 		*inner4 = ipha;
2163 		*inner6 = NULL;
2164 		break;
2165 	case IPV6_VERSION:
2166 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ip6_t))
2167 			return (0);
2168 		*inner4 = NULL;
2169 		*inner6 = (ip6_t *)ipha;
2170 		break;
2171 	default:
2172 		return (0);
2173 	}
2174 
2175 	return (outer_hlen);
2176 }
2177 
2178 /*
2179  * Received ICMP error in response to an X over IPv4 packet that we
2180  * transmitted.
2181  *
2182  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2183  * the following:
2184  *
2185  * [IPv4(0)][ICMPv4][IPv4(1)][IPv4(2)][ULP]
2186  *
2187  *	or
2188  *
2189  * [IPv4(0)][ICMPv4][IPv4(1)][IPv6][ULP]
2190  *
2191  * And "outer4" will get set to IPv4(1), and inner[46] will correspond to
2192  * whatever the very-inner packet is (IPv4(2) or IPv6).
2193  */
2194 static void
2195 iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
2196     icmph_t *icmph)
2197 {
2198 	uint8_t	*orig;
2199 	ipha_t	*outer4, *inner4;
2200 	ip6_t	*outer6, *inner6;
2201 	int	outer_hlen;
2202 	uint8_t	type, code;
2203 
2204 	/*
2205 	 * Change the db_type to M_DATA because subsequent operations assume
2206 	 * the ICMP packet is M_DATA again (i.e. calls to msgdsize()).
2207 	 */
2208 	data_mp->b_datap->db_type = M_DATA;
2209 
2210 	ASSERT(data_mp->b_cont == NULL);
2211 	/*
2212 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2213 	 * find headers in the ICMP packet payload.
2214 	 */
2215 	orig = data_mp->b_rptr;
2216 	data_mp->b_rptr = (uint8_t *)(icmph + 1);
2217 	/*
2218 	 * The ip module ensures that ICMP errors contain at least the
2219 	 * original IP header (otherwise, the error would never have made it
2220 	 * here).
2221 	 */
2222 	ASSERT(MBLKL(data_mp) >= 0);
2223 	outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6,
2224 	    &inner6);
2225 	ASSERT(outer6 == NULL);
2226 	data_mp->b_rptr = orig;
2227 	if (outer_hlen == 0) {
2228 		iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp),
2229 		    &iptun->iptun_ierrors);
2230 		return;
2231 	}
2232 
2233 	/* Only ICMP errors due to tunneled packets should reach here. */
2234 	ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
2235 	    outer4->ipha_protocol == IPPROTO_IPV6);
2236 
2237 	/* ipsec_tun_inbound() always frees ipsec_mp. */
2238 	if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp,
2239 	    inner4, inner6, outer4, outer6, -outer_hlen,
2240 	    iptun->iptun_ns)) {
2241 		/* Callee did all of the freeing. */
2242 		atomic_inc_64(&iptun->iptun_ierrors);
2243 		return;
2244 	}
2245 	/* We should never see reassembled fragment here. */
2246 	ASSERT(data_mp->b_next == NULL);
2247 
2248 	data_mp->b_rptr = (uint8_t *)outer4 + outer_hlen;
2249 
2250 	/*
2251 	 * If the original packet being transmitted was itself an ICMP error,
2252 	 * then drop this packet.  We don't want to generate an ICMP error in
2253 	 * response to an ICMP error.
2254 	 */
2255 	if (is_icmp_error(data_mp, inner4, inner6)) {
2256 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2257 		return;
2258 	}
2259 
2260 	switch (icmph->icmph_type) {
2261 	case ICMP_DEST_UNREACHABLE:
2262 		type = (inner4 != NULL ? icmph->icmph_type : ICMP6_DST_UNREACH);
2263 		switch (icmph->icmph_code) {
2264 		case ICMP_FRAGMENTATION_NEEDED: {
2265 			uint32_t newmtu;
2266 
2267 			/*
2268 			 * We reconcile this with the fact that the tunnel may
2269 			 * also have IPsec policy by letting iptun_update_mtu
2270 			 * take care of it.
2271 			 */
2272 			newmtu =
2273 			    iptun_update_mtu(iptun, ntohs(icmph->icmph_du_mtu));
2274 
2275 			if (inner4 != NULL) {
2276 				iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2277 				    data_mp);
2278 			} else {
2279 				iptun_icmp_toobig_v6(iptun, newmtu, inner6,
2280 				    data_mp);
2281 			}
2282 			return;
2283 		}
2284 		case ICMP_DEST_NET_UNREACH_ADMIN:
2285 		case ICMP_DEST_HOST_UNREACH_ADMIN:
2286 			code = (inner4 != NULL ? ICMP_DEST_NET_UNREACH_ADMIN :
2287 			    ICMP6_DST_UNREACH_ADMIN);
2288 			break;
2289 		default:
2290 			code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2291 			    ICMP6_DST_UNREACH_ADDR);
2292 			break;
2293 		}
2294 		break;
2295 	case ICMP_TIME_EXCEEDED:
2296 		if (inner6 != NULL) {
2297 			type = ICMP6_TIME_EXCEEDED;
2298 			code = 0;
2299 		} /* else we're already set. */
2300 		break;
2301 	case ICMP_PARAM_PROBLEM:
2302 		/*
2303 		 * This is a problem with the outer header we transmitted.
2304 		 * Treat this as an output error.
2305 		 */
2306 		iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2307 		return;
2308 	default:
2309 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2310 		return;
2311 	}
2312 
2313 	if (inner4 != NULL)
2314 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code);
2315 	else
2316 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0);
2317 }
2318 
2319 /*
2320  * Return B_TRUE if the IPv6 packet pointed to by ip6h contains a Tunnel
2321  * Encapsulation Limit destination option.  If there is one, set encaplim_ptr
2322  * to point to the option value.
2323  */
2324 static boolean_t
2325 iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
2326 {
2327 	ip6_pkt_t	pkt;
2328 	uint8_t		*endptr;
2329 	ip6_dest_t	*destp;
2330 	struct ip6_opt	*optp;
2331 
2332 	pkt.ipp_fields = 0; /* must be initialized */
2333 	(void) ip_find_hdr_v6(mp, ip6h, &pkt, NULL);
2334 	if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
2335 		destp = pkt.ipp_dstopts;
2336 	} else if ((pkt.ipp_fields & IPPF_RTDSTOPTS) != 0) {
2337 		destp = pkt.ipp_rtdstopts;
2338 	} else {
2339 		return (B_FALSE);
2340 	}
2341 
2342 	endptr = (uint8_t *)destp + 8 * (destp->ip6d_len + 1);
2343 	optp = (struct ip6_opt *)(destp + 1);
2344 	while (endptr - (uint8_t *)optp > sizeof (*optp)) {
2345 		if (optp->ip6o_type == IP6OPT_TUNNEL_LIMIT) {
2346 			if ((uint8_t *)(optp + 1) >= endptr)
2347 				return (B_FALSE);
2348 			*encaplim_ptr = (uint8_t *)&optp[1];
2349 			return (B_TRUE);
2350 		}
2351 		optp = (struct ip6_opt *)((uint8_t *)optp + optp->ip6o_len + 2);
2352 	}
2353 	return (B_FALSE);
2354 }
2355 
2356 /*
2357  * Received ICMPv6 error in response to an X over IPv6 packet that we
2358  * transmitted.
2359  *
2360  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2361  * the following:
2362  *
2363  * [IPv6(0)][ICMPv6][IPv6(1)][IPv4][ULP]
2364  *
2365  *	or
2366  *
2367  * [IPv6(0)][ICMPv6][IPv6(1)][IPv6(2)][ULP]
2368  *
2369  * And "outer6" will get set to IPv6(1), and inner[46] will correspond to
2370  * whatever the very-inner packet is (IPv4 or IPv6(2)).
2371  */
2372 static void
2373 iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
2374     icmp6_t *icmp6h)
2375 {
2376 	uint8_t	*orig;
2377 	ipha_t	*outer4, *inner4;
2378 	ip6_t	*outer6, *inner6;
2379 	int	outer_hlen;
2380 	uint8_t	type, code;
2381 
2382 	/*
2383 	 * Change the db_type to M_DATA because subsequent operations assume
2384 	 * the ICMP packet is M_DATA again (i.e. calls to msgdsize().)
2385 	 */
2386 	data_mp->b_datap->db_type = M_DATA;
2387 
2388 	ASSERT(data_mp->b_cont == NULL);
2389 
2390 	/*
2391 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2392 	 * find IP headers in the ICMP packet payload.
2393 	 */
2394 	orig = data_mp->b_rptr;
2395 	data_mp->b_rptr = (uint8_t *)(icmp6h + 1);
2396 	/*
2397 	 * The ip module ensures that ICMP errors contain at least the
2398 	 * original IP header (otherwise, the error would never have made it
2399 	 * here).
2400 	 */
2401 	ASSERT(MBLKL(data_mp) >= 0);
2402 	outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6,
2403 	    &inner6);
2404 	ASSERT(outer4 == NULL);
2405 	data_mp->b_rptr = orig;	/* Restore r_ptr */
2406 	if (outer_hlen == 0) {
2407 		iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp),
2408 		    &iptun->iptun_ierrors);
2409 		return;
2410 	}
2411 
2412 	if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp,
2413 	    inner4, inner6, outer4, outer6, -outer_hlen,
2414 	    iptun->iptun_ns)) {
2415 		/* Callee did all of the freeing. */
2416 		atomic_inc_64(&iptun->iptun_ierrors);
2417 		return;
2418 	}
2419 	/* We should never see reassembled fragment here. */
2420 	ASSERT(data_mp->b_next == NULL);
2421 
2422 	data_mp->b_rptr = (uint8_t *)outer6 + outer_hlen;
2423 
2424 	/*
2425 	 * If the original packet being transmitted was itself an ICMP error,
2426 	 * then drop this packet.  We don't want to generate an ICMP error in
2427 	 * response to an ICMP error.
2428 	 */
2429 	if (is_icmp_error(data_mp, inner4, inner6)) {
2430 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2431 		return;
2432 	}
2433 
2434 	switch (icmp6h->icmp6_type) {
2435 	case ICMP6_PARAM_PROB: {
2436 		uint8_t *encaplim_ptr;
2437 
2438 		/*
2439 		 * If the ICMPv6 error points to a valid Tunnel Encapsulation
2440 		 * Limit option and the limit value is 0, then fall through
2441 		 * and send a host unreachable message.  Otherwise, treat the
2442 		 * error as an output error, as there must have been a problem
2443 		 * with a packet we sent.
2444 		 */
2445 		if (!iptun_find_encaplimit(data_mp, outer6, &encaplim_ptr) ||
2446 		    (icmp6h->icmp6_pptr !=
2447 		    ((ptrdiff_t)encaplim_ptr - (ptrdiff_t)outer6)) ||
2448 		    *encaplim_ptr != 0) {
2449 			iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2450 			return;
2451 		}
2452 		/* FALLTHRU */
2453 	}
2454 	case ICMP6_TIME_EXCEEDED:
2455 	case ICMP6_DST_UNREACH:
2456 		type = (inner4 != NULL ? ICMP_DEST_UNREACHABLE :
2457 		    ICMP6_DST_UNREACH);
2458 		code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2459 		    ICMP6_DST_UNREACH_ADDR);
2460 		break;
2461 	case ICMP6_PACKET_TOO_BIG: {
2462 		uint32_t newmtu;
2463 
2464 		/*
2465 		 * We reconcile this with the fact that the tunnel may also
2466 		 * have IPsec policy by letting iptun_update_mtu take care of
2467 		 * it.
2468 		 */
2469 		newmtu = iptun_update_mtu(iptun, ntohl(icmp6h->icmp6_mtu));
2470 
2471 		if (inner4 != NULL) {
2472 			iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2473 			    data_mp);
2474 		} else {
2475 			iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp);
2476 		}
2477 		return;
2478 	}
2479 	default:
2480 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2481 		return;
2482 	}
2483 
2484 	if (inner4 != NULL)
2485 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code);
2486 	else
2487 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0);
2488 }
2489 
2490 static void
2491 iptun_input_icmp(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp)
2492 {
2493 	mblk_t	*tmpmp;
2494 	size_t	hlen;
2495 
2496 	if (data_mp->b_cont != NULL) {
2497 		/*
2498 		 * Since ICMP error processing necessitates access to bits
2499 		 * that are within the ICMP error payload (the original packet
2500 		 * that caused the error), pull everything up into a single
2501 		 * block for convenience.
2502 		 */
2503 		data_mp->b_datap->db_type = M_DATA;
2504 		if ((tmpmp = msgpullup(data_mp, -1)) == NULL) {
2505 			iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp),
2506 			    &iptun->iptun_norcvbuf);
2507 			return;
2508 		}
2509 		freemsg(data_mp);
2510 		data_mp = tmpmp;
2511 		if (ipsec_mp != NULL)
2512 			ipsec_mp->b_cont = data_mp;
2513 	}
2514 
2515 	switch (iptun->iptun_typeinfo->iti_ipvers) {
2516 	case IPV4_VERSION:
2517 		/*
2518 		 * The outer IP header coming up from IP is always ipha_t
2519 		 * alligned (otherwise, we would have crashed in ip).
2520 		 */
2521 		hlen = IPH_HDR_LENGTH((ipha_t *)data_mp->b_rptr);
2522 		iptun_input_icmp_v4(iptun, ipsec_mp, data_mp,
2523 		    (icmph_t *)(data_mp->b_rptr + hlen));
2524 		break;
2525 	case IPV6_VERSION:
2526 		hlen = ip_hdr_length_v6(data_mp, (ip6_t *)data_mp->b_rptr);
2527 		iptun_input_icmp_v6(iptun, ipsec_mp, data_mp,
2528 		    (icmp6_t *)(data_mp->b_rptr + hlen));
2529 		break;
2530 	}
2531 }
2532 
2533 static boolean_t
2534 iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2535 {
2536 	ipaddr_t v4addr;
2537 
2538 	/*
2539 	 * It's possible that someone sent us an IPv4-in-IPv4 packet with the
2540 	 * IPv4 address of a 6to4 tunnel as the destination.
2541 	 */
2542 	if (inner6 == NULL)
2543 		return (B_FALSE);
2544 
2545 	/*
2546 	 * Make sure that the IPv6 destination is within the site that this
2547 	 * 6to4 tunnel is routing for.  We don't want people bouncing random
2548 	 * tunneled IPv6 packets through this 6to4 router.
2549 	 */
2550 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&v4addr);
2551 	if (outer4->ipha_dst != v4addr)
2552 		return (B_FALSE);
2553 
2554 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_src)) {
2555 		/*
2556 		 * Section 9 of RFC 3056 (security considerations) suggests
2557 		 * that when a packet is from a 6to4 site (i.e., it's not a
2558 		 * global address being forwarded froma relay router), make
2559 		 * sure that the packet was tunneled by that site's 6to4
2560 		 * router.
2561 		 */
2562 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2563 		if (outer4->ipha_src != v4addr)
2564 			return (B_FALSE);
2565 	} else {
2566 		/*
2567 		 * Only accept packets from a relay router if we've configured
2568 		 * outbound relay router functionality.
2569 		 */
2570 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2571 			return (B_FALSE);
2572 	}
2573 
2574 	return (B_TRUE);
2575 }
2576 
2577 /*
2578  * Input function for everything that comes up from the ip module below us.
2579  * This is called directly from the ip module via connp->conn_recv().
2580  *
2581  * There are two kinds of packets that can arrive here: (1) IP-in-IP tunneled
2582  * packets and (2) ICMP errors containing IP-in-IP packets transmitted by us.
2583  * They have the following structure:
2584  *
2585  * 1) M_DATA
2586  * 2) M_CTL[->M_DATA]
2587  *
2588  * (2) Is an M_CTL optionally followed by M_DATA, where the M_CTL block is the
2589  * start of the actual ICMP packet (it doesn't contain any special control
2590  * information).
2591  *
2592  * Either (1) or (2) can be IPsec-protected, in which case an M_CTL block
2593  * containing an ipsec_in_t will have been prepended to either (1) or (2),
2594  * making a total of four combinations of possible mblk chains:
2595  *
2596  * A) (1)
2597  * B) (2)
2598  * C) M_CTL(ipsec_in_t)->(1)
2599  * D) M_CTL(ipsec_in_t)->(2)
2600  */
2601 /* ARGSUSED */
2602 static void
2603 iptun_input(void *arg, mblk_t *mp, void *arg2)
2604 {
2605 	conn_t	*connp = arg;
2606 	iptun_t	*iptun = connp->conn_iptun;
2607 	int	outer_hlen;
2608 	ipha_t	*outer4, *inner4;
2609 	ip6_t	*outer6, *inner6;
2610 	mblk_t	*data_mp = mp;
2611 
2612 	ASSERT(IPCL_IS_IPTUN(connp));
2613 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
2614 
2615 	if (DB_TYPE(mp) == M_CTL) {
2616 		if (((ipsec_in_t *)(mp->b_rptr))->ipsec_in_type != IPSEC_IN) {
2617 			iptun_input_icmp(iptun, NULL, mp);
2618 			return;
2619 		}
2620 
2621 		data_mp = mp->b_cont;
2622 		if (DB_TYPE(data_mp) == M_CTL) {
2623 			/* Protected ICMP packet. */
2624 			iptun_input_icmp(iptun, mp, data_mp);
2625 			return;
2626 		}
2627 	}
2628 
2629 	/*
2630 	 * Request the destination's path MTU information regularly in case
2631 	 * path MTU has increased.
2632 	 */
2633 	if (IPTUN_PMTU_TOO_OLD(iptun))
2634 		iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE);
2635 
2636 	if ((outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6,
2637 	    &inner6)) == 0)
2638 		goto drop;
2639 
2640 	/*
2641 	 * If the system is labeled, we call tsol_check_dest() on the packet
2642 	 * destination (our local tunnel address) to ensure that the packet as
2643 	 * labeled should be allowed to be sent to us.  We don't need to call
2644 	 * the more involved tsol_receive_local() since the tunnel link itself
2645 	 * cannot be assigned to shared-stack non-global zones.
2646 	 */
2647 	if (is_system_labeled()) {
2648 		cred_t *msg_cred;
2649 
2650 		if ((msg_cred = msg_getcred(data_mp, NULL)) == NULL)
2651 			goto drop;
2652 		if (tsol_check_dest(msg_cred, (outer4 != NULL ?
2653 		    (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
2654 		    (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
2655 		    CONN_MAC_DEFAULT, NULL) != 0)
2656 			goto drop;
2657 	}
2658 
2659 	if (!ipsec_tun_inbound((mp == data_mp ? NULL : mp), &data_mp,
2660 	    iptun->iptun_itp, inner4, inner6, outer4, outer6, outer_hlen,
2661 	    iptun->iptun_ns)) {
2662 		/* Callee did all of the freeing. */
2663 		return;
2664 	}
2665 	mp = data_mp;
2666 
2667 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
2668 	    !iptun_in_6to4_ok(iptun, outer4, inner6))
2669 		goto drop;
2670 
2671 	/*
2672 	 * We need to statistically account for each packet individually, so
2673 	 * we might as well split up any b_next chains here.
2674 	 */
2675 	do {
2676 		mp = data_mp->b_next;
2677 		data_mp->b_next = NULL;
2678 
2679 		atomic_inc_64(&iptun->iptun_ipackets);
2680 		atomic_add_64(&iptun->iptun_rbytes, msgdsize(data_mp));
2681 		mac_rx(iptun->iptun_mh, NULL, data_mp);
2682 
2683 		data_mp = mp;
2684 	} while (data_mp != NULL);
2685 	return;
2686 drop:
2687 	iptun_drop_pkt(mp, &iptun->iptun_ierrors);
2688 }
2689 
2690 /*
2691  * Do 6to4-specific header-processing on output.  Return B_TRUE if the packet
2692  * was processed without issue, or B_FALSE if the packet had issues and should
2693  * be dropped.
2694  */
2695 static boolean_t
2696 iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2697 {
2698 	ipaddr_t v4addr;
2699 
2700 	/*
2701 	 * IPv6 source must be a 6to4 address.  This is because a conscious
2702 	 * decision was made to not allow a Solaris system to be used as a
2703 	 * relay router (for security reasons) when 6to4 was initially
2704 	 * integrated.  If this decision is ever reversed, the following check
2705 	 * can be removed.
2706 	 */
2707 	if (!IN6_IS_ADDR_6TO4(&inner6->ip6_src))
2708 		return (B_FALSE);
2709 
2710 	/*
2711 	 * RFC3056 mandates that the IPv4 source MUST be set to the IPv4
2712 	 * portion of the 6to4 IPv6 source address.  In other words, make sure
2713 	 * that we're tunneling packets from our own 6to4 site.
2714 	 */
2715 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2716 	if (outer4->ipha_src != v4addr)
2717 		return (B_FALSE);
2718 
2719 	/*
2720 	 * Automatically set the destination of the outer IPv4 header as
2721 	 * described in RFC3056.  There are two possibilities:
2722 	 *
2723 	 * a. If the IPv6 destination is a 6to4 address, set the IPv4 address
2724 	 *    to the IPv4 portion of the 6to4 address.
2725 	 * b. If the IPv6 destination is a native IPv6 address, set the IPv4
2726 	 *    destination to the address of a relay router.
2727 	 *
2728 	 * Design Note: b shouldn't be necessary here, and this is a flaw in
2729 	 * the design of the 6to4relay command.  Instead of setting a 6to4
2730 	 * relay address in this module via an ioctl, the 6to4relay command
2731 	 * could simply add a IPv6 route for native IPv6 addresses (such as a
2732 	 * default route) in the forwarding table that uses a 6to4 destination
2733 	 * as its next hop, and the IPv4 portion of that address could be a
2734 	 * 6to4 relay address.  In order for this to work, IP would have to
2735 	 * resolve the next hop address, which would necessitate a link-layer
2736 	 * address resolver for 6to4 links, which doesn't exist today.
2737 	 *
2738 	 * In fact, if a resolver existed for 6to4 links, then setting the
2739 	 * IPv4 destination in the outer header could be done as part of
2740 	 * link-layer address resolution and fast-path header generation, and
2741 	 * not here.
2742 	 */
2743 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_dst)) {
2744 		/* destination is a 6to4 router */
2745 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
2746 		    (struct in_addr *)&outer4->ipha_dst);
2747 	} else {
2748 		/*
2749 		 * The destination is a native IPv6 address.  If output to a
2750 		 * relay-router is enabled, use the relay-router's IPv4
2751 		 * address as the destination.
2752 		 */
2753 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2754 			return (B_FALSE);
2755 		outer4->ipha_dst = iptun->iptun_iptuns->iptuns_relay_rtr_addr;
2756 	}
2757 
2758 	/*
2759 	 * If the outer source and destination are equal, this means that the
2760 	 * 6to4 router somehow forwarded an IPv6 packet destined for its own
2761 	 * 6to4 site to its 6to4 tunnel interface, which will result in this
2762 	 * packet infinitely bouncing between ip and iptun.
2763 	 */
2764 	return (outer4->ipha_src != outer4->ipha_dst);
2765 }
2766 
2767 /*
2768  * Process output packets with outer IPv4 headers.  Frees mp and bumps stat on
2769  * error.
2770  */
2771 static mblk_t *
2772 iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
2773     ipha_t *inner4, ip6_t *inner6)
2774 {
2775 	uint8_t	*innerptr = (inner4 != NULL ?
2776 	    (uint8_t *)inner4 : (uint8_t *)inner6);
2777 	size_t	minmtu = (inner4 != NULL ?
2778 	    IPTUN_MIN_IPV4_MTU : IPTUN_MIN_IPV6_MTU);
2779 
2780 	if (inner4 != NULL) {
2781 		ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
2782 		/*
2783 		 * Copy the tos from the inner IPv4 header. We mask off ECN
2784 		 * bits (bits 6 and 7) because there is currently no
2785 		 * tunnel-tunnel communication to determine if both sides
2786 		 * support ECN.  We opt for the safe choice: don't copy the
2787 		 * ECN bits when doing encapsulation.
2788 		 */
2789 		outer4->ipha_type_of_service =
2790 		    inner4->ipha_type_of_service & ~0x03;
2791 	} else {
2792 		ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
2793 		    inner6 != NULL);
2794 
2795 		if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
2796 		    !iptun_out_process_6to4(iptun, outer4, inner6)) {
2797 			iptun_drop_pkt(mp, &iptun->iptun_oerrors);
2798 			return (NULL);
2799 		}
2800 	}
2801 
2802 	/*
2803 	 * As described in section 3.2.2 of RFC4213, if the packet payload is
2804 	 * less than or equal to the minimum MTU size, then we need to allow
2805 	 * IPv4 to fragment the packet.  The reason is that even if we end up
2806 	 * receiving an ICMP frag-needed, the interface above this tunnel
2807 	 * won't be allowed to drop its MTU as a result, since the packet was
2808 	 * already smaller than the smallest allowable MTU for that interface.
2809 	 */
2810 	if (mp->b_wptr - innerptr <= minmtu)
2811 		outer4->ipha_fragment_offset_and_flags = 0;
2812 
2813 	outer4->ipha_length = htons(msgdsize(mp));
2814 
2815 	return (mp);
2816 }
2817 
2818 /*
2819  * Insert an encapsulation limit destination option in the packet provided.
2820  * Always consumes the mp argument and returns a new mblk pointer.
2821  */
2822 static mblk_t *
2823 iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
2824     uint8_t limit)
2825 {
2826 	mblk_t			*newmp;
2827 	iptun_ipv6hdrs_t	*newouter6;
2828 
2829 	ASSERT(outer6->ip6_nxt == IPPROTO_IPV6);
2830 	ASSERT(mp->b_cont == NULL);
2831 
2832 	mp->b_rptr += sizeof (ip6_t);
2833 	newmp = allocb_tmpl(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), mp);
2834 	if (newmp == NULL) {
2835 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2836 		return (NULL);
2837 	}
2838 	newmp->b_wptr += sizeof (iptun_ipv6hdrs_t);
2839 	/* Copy the payload (Starting with the inner IPv6 header). */
2840 	bcopy(mp->b_rptr, newmp->b_wptr, MBLKL(mp));
2841 	newmp->b_wptr += MBLKL(mp);
2842 	newouter6 = (iptun_ipv6hdrs_t *)newmp->b_rptr;
2843 	/* Now copy the outer IPv6 header. */
2844 	bcopy(outer6, &newouter6->it6h_ip6h, sizeof (ip6_t));
2845 	newouter6->it6h_ip6h.ip6_nxt = IPPROTO_DSTOPTS;
2846 	newouter6->it6h_encaplim = iptun_encaplim_init;
2847 	newouter6->it6h_encaplim.iel_destopt.ip6d_nxt = outer6->ip6_nxt;
2848 	newouter6->it6h_encaplim.iel_telopt.ip6ot_encap_limit = limit;
2849 
2850 	/*
2851 	 * The payload length will be set at the end of
2852 	 * iptun_out_process_ipv6().
2853 	 */
2854 
2855 	freemsg(mp);
2856 	return (newmp);
2857 }
2858 
2859 /*
2860  * Process output packets with outer IPv6 headers.  Frees mp and bumps stats
2861  * on error.
2862  */
2863 static mblk_t *
2864 iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6)
2865 {
2866 	uint8_t		*limit, *configlimit;
2867 	uint32_t	offset;
2868 	iptun_ipv6hdrs_t *v6hdrs;
2869 
2870 	if (inner6 != NULL && iptun_find_encaplimit(mp, inner6, &limit)) {
2871 		/*
2872 		 * The inner packet is an IPv6 packet which itself contains an
2873 		 * encapsulation limit option.  The limit variable points to
2874 		 * the value in the embedded option.  Process the
2875 		 * encapsulation limit option as specified in RFC 2473.
2876 		 *
2877 		 * If limit is 0, then we've exceeded the limit and we need to
2878 		 * send back an ICMPv6 parameter problem message.
2879 		 *
2880 		 * If limit is > 0, then we decrement it by 1 and make sure
2881 		 * that the encapsulation limit option in the outer header
2882 		 * reflects that (adding an option if one isn't already
2883 		 * there).
2884 		 */
2885 		ASSERT(limit > mp->b_rptr && limit < mp->b_wptr);
2886 		if (*limit == 0) {
2887 			mp->b_rptr = (uint8_t *)inner6;
2888 			offset = limit - mp->b_rptr;
2889 			iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
2890 			    0, offset);
2891 			atomic_inc_64(&iptun->iptun_noxmtbuf);
2892 			return (NULL);
2893 		}
2894 
2895 		/*
2896 		 * The outer header requires an encapsulation limit option.
2897 		 * If there isn't one already, add one.
2898 		 */
2899 		if (iptun->iptun_encaplimit == 0) {
2900 			if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
2901 			    (*limit - 1))) == NULL)
2902 				return (NULL);
2903 		} else {
2904 			/*
2905 			 * There is an existing encapsulation limit option in
2906 			 * the outer header.  If the inner encapsulation limit
2907 			 * is less than the configured encapsulation limit,
2908 			 * update the outer encapsulation limit to reflect
2909 			 * this lesser value.
2910 			 */
2911 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
2912 			configlimit =
2913 			    &v6hdrs->it6h_encaplim.iel_telopt.ip6ot_encap_limit;
2914 			if ((*limit - 1) < *configlimit)
2915 				*configlimit = (*limit - 1);
2916 		}
2917 	}
2918 
2919 	outer6->ip6_plen = htons(msgdsize(mp) - sizeof (ip6_t));
2920 	return (mp);
2921 }
2922 
2923 /*
2924  * The IP tunneling MAC-type plugins have already done most of the header
2925  * processing and validity checks.  We are simply responsible for multiplexing
2926  * down to the ip module below us.
2927  */
2928 static void
2929 iptun_output(iptun_t *iptun, mblk_t *mp)
2930 {
2931 	conn_t	*connp = iptun->iptun_connp;
2932 	int	outer_hlen;
2933 	mblk_t	*newmp;
2934 	ipha_t	*outer4, *inner4;
2935 	ip6_t	*outer6, *inner6;
2936 	ipsec_tun_pol_t	*itp = iptun->iptun_itp;
2937 
2938 	ASSERT(mp->b_datap->db_type == M_DATA);
2939 
2940 	if (mp->b_cont != NULL) {
2941 		if ((newmp = msgpullup(mp, -1)) == NULL) {
2942 			iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2943 			return;
2944 		}
2945 		freemsg(mp);
2946 		mp = newmp;
2947 	}
2948 
2949 	outer_hlen = iptun_find_headers(mp, &outer4, &inner4, &outer6, &inner6);
2950 	if (outer_hlen == 0) {
2951 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
2952 		return;
2953 	}
2954 
2955 	/* Perform header processing. */
2956 	if (outer4 != NULL)
2957 		mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6);
2958 	else
2959 		mp = iptun_out_process_ipv6(iptun, mp, outer6, inner6);
2960 	if (mp == NULL)
2961 		return;
2962 
2963 	/*
2964 	 * Let's hope the compiler optimizes this with "branch taken".
2965 	 */
2966 	if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
2967 		if ((mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
2968 		    outer6, outer_hlen)) == NULL) {
2969 			/* ipsec_tun_outbound() frees mp on error. */
2970 			atomic_inc_64(&iptun->iptun_oerrors);
2971 			return;
2972 		}
2973 		/*
2974 		 * ipsec_tun_outbound() returns a chain of tunneled IP
2975 		 * fragments linked with b_next (or a single message if the
2976 		 * tunneled packet wasn't a fragment).  Each message in the
2977 		 * chain is prepended by an IPSEC_OUT M_CTL block with
2978 		 * instructions for outbound IPsec processing.
2979 		 */
2980 		for (newmp = mp; newmp != NULL; newmp = mp) {
2981 			ASSERT(newmp->b_datap->db_type == M_CTL);
2982 			atomic_inc_64(&iptun->iptun_opackets);
2983 			atomic_add_64(&iptun->iptun_obytes,
2984 			    msgdsize(newmp->b_cont));
2985 			mp = mp->b_next;
2986 			newmp->b_next = NULL;
2987 			connp->conn_send(connp, newmp, connp->conn_wq, IP_WPUT);
2988 		}
2989 	} else {
2990 		/*
2991 		 * The ip module will potentially apply global policy to the
2992 		 * packet in its output path if there's no active tunnel
2993 		 * policy.
2994 		 */
2995 		atomic_inc_64(&iptun->iptun_opackets);
2996 		atomic_add_64(&iptun->iptun_obytes, msgdsize(mp));
2997 		connp->conn_send(connp, mp, connp->conn_wq, IP_WPUT);
2998 	}
2999 }
3000 
/*
 * Note that setting and clearing the queue via iptun_{set,clear}_g_q() is
 * serialized via iptuns_lock and iptunq_open(), so we must never be in a
 * situation where iptun_set_g_q() is called if the queue has already been
 * set, or iptun_clear_g_q() is called if it has not (hence the ASSERT()s).
 */
3007 void
3008 iptun_set_g_q(netstack_t *ns, queue_t *q)
3009 {
3010 	ASSERT(ns->netstack_iptun->iptuns_g_q == NULL);
3011 	ns->netstack_iptun->iptuns_g_q = q;
3012 }
3013 
3014 void
3015 iptun_clear_g_q(netstack_t *ns)
3016 {
3017 	ASSERT(ns->netstack_iptun->iptuns_g_q != NULL);
3018 	ns->netstack_iptun->iptuns_g_q = NULL;
3019 }
3020 
3021 static mac_callbacks_t iptun_m_callbacks = {
3022 	.mc_callbacks	= (MC_SETPROP | MC_GETPROP),
3023 	.mc_getstat	= iptun_m_getstat,
3024 	.mc_start	= iptun_m_start,
3025 	.mc_stop	= iptun_m_stop,
3026 	.mc_setpromisc	= iptun_m_setpromisc,
3027 	.mc_multicst	= iptun_m_multicst,
3028 	.mc_unicst	= iptun_m_unicst,
3029 	.mc_tx		= iptun_m_tx,
3030 	.mc_setprop	= iptun_m_setprop,
3031 	.mc_getprop	= iptun_m_getprop
3032 };
3033