xref: /illumos-gate/usr/src/uts/common/inet/ip/icmp.c (revision 3b860eee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/stropts.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/timod.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/kmem.h>
41 #include <sys/policy.h>
42 #include <sys/priv.h>
43 #include <sys/zone.h>
44 #include <sys/time.h>
45 
46 #include <sys/sockio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/isa_defs.h>
50 #include <sys/suntpi.h>
51 #include <sys/xti_inet.h>
52 #include <sys/netstack.h>
53 
54 #include <net/route.h>
55 #include <net/if.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/ip6.h>
59 #include <netinet/icmp6.h>
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip6.h>
63 #include <inet/proto_set.h>
64 #include <inet/nd.h>
65 #include <inet/optcom.h>
66 #include <inet/snmpcom.h>
67 #include <inet/kstatcom.h>
68 #include <inet/rawip_impl.h>
69 
70 #include <netinet/ip_mroute.h>
71 #include <inet/tcp.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 #include <inet/ipclassifier.h>
75 
76 #include <sys/tsol/label.h>
77 #include <sys/tsol/tnet.h>
78 
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 
82 #include <inet/ip_impl.h>
83 #include <sys/disp.h>
84 
85 /*
86  * Synchronization notes:
87  *
88  * RAWIP is MT and uses the usual kernel synchronization primitives. There is
89  * one lock, icmp_rwlock. We also use conn_lock when updating things
90  * which affect the IP classifier lookup.
91  * The lock order is icmp_rwlock -> conn_lock.
92  *
93  * The icmp_rwlock:
94  * This protects most of the other fields in the icmp_t. The exact list of
95  * fields which are protected by each of the above locks is documented in
96  * the icmp_t structure definition.
97  *
98  * Plumbing notes:
99  * ICMP is always a device driver. For compatibility with mibopen() code
100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
101  * dummy module.
102  */
103 
104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
105 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
106 static int	icmp_bind_proto(conn_t *connp);
107 static int	icmp_build_hdrs(icmp_t *icmp);
108 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
109 static int	icmp_close(queue_t *q, int flags);
110 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
111 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
112 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
113 		    int sys_error);
114 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
115 		    t_scalar_t t_error, int sys_error);
116 static void	icmp_icmp_error(conn_t *connp, mblk_t *mp);
117 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
118 static void	icmp_info_req(queue_t *q, mblk_t *mp);
119 static void	icmp_input(void *, mblk_t *, void *);
120 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
121 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
122 		    cred_t *credp);
123 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
124 		    cred_t *credp);
125 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
126 		    int *errorp, void *thisdg_attrs);
127 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
128 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
129 		    int level, int name, uint_t inlen,
130 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
131 		    void *thisdg_attrs, cred_t *cr);
132 int		icmp_opt_get(conn_t *connp, int level, int name,
133 		    uchar_t *ptr);
134 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
135 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
136 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
137 		    caddr_t cp, cred_t *cr);
138 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
139 		    uchar_t *ptr, int len);
140 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
141 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
142 static int	icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst);
143 static void	icmp_wput(queue_t *q, mblk_t *mp);
144 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
145 static int	raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
146 		    sin6_t *sin6, ip6_pkt_t *ipp);
147 static int	raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
148 		    ipaddr_t v4dst, ip4_pkt_t *pktinfop);
149 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
150 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
151 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
152 static void	icmp_ulp_recv(conn_t *, mblk_t *);
153 
154 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
155 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
156 
157 static void	*rawip_kstat_init(netstackid_t stackid);
158 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
159 static int	rawip_kstat_update(kstat_t *kp, int rw);
160 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
161 static int	rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
162 		    uint_t *salenp);
163 static int	rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
164 		    uint_t *salenp);
165 
166 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
167 		    socklen_t *, cred_t *);
168 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
169 		    socklen_t *, cred_t *);
170 
171 static struct module_info icmp_mod_info =  {
172 	5707, "icmp", 1, INFPSZ, 512, 128
173 };
174 
175 /*
176  * Entry points for ICMP as a device.
177  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
178  */
179 static struct qinit icmprinitv4 = {
180 	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
181 };
182 
183 static struct qinit icmprinitv6 = {
184 	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
185 };
186 
187 static struct qinit icmpwinit = {
188 	(pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
189 };
190 
191 /* ICMP entry point during fallback */
192 static struct qinit icmp_fallback_sock_winit = {
193 	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
194 };
195 
196 /* For AF_INET aka /dev/icmp */
197 struct streamtab icmpinfov4 = {
198 	&icmprinitv4, &icmpwinit
199 };
200 
201 /* For AF_INET6 aka /dev/icmp6 */
202 struct streamtab icmpinfov6 = {
203 	&icmprinitv6, &icmpwinit
204 };
205 
/*
 * Zero-filled socket addresses (static storage, never written in this
 * part of the file).  Used to reset a sin_t/sin6_t by structure assignment
 * and as the "any" address when binding only to the protocol.
 */
static sin_t	sin_null;	/* Zero IPv4 address for quick clears */
static sin6_t	sin6_null;	/* Zero IPv6 address for quick clears */
208 
/*
 * Default structure copied into T_INFO_ACK messages.
 * Fields marked "filled in later" / "not initialized here" are left as
 * placeholders in this template.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
223 
/*
 * Table of ND variables supported by icmp.  These are loaded into is_nd
 * when the stack instance is created.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Each row is { minimum, maximum, default value, ND variable name }.
 */
static icmpparam_t	icmp_param_arr[] = {
	/* min	max	value	name */
	{ 0,	128,	32,	"icmp_wroff_extra" },
	{ 1,	255,	255,	"icmp_ipv4_ttl" },
	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
	{ 0,	1,	1,	"icmp_bsd_compat" },
	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
};
/*
 * Shorthand for the current value of each tunable.  The index used by each
 * macro must stay in step with the row order of icmp_param_arr above.
 * NOTE(review): is_param_arr is presumably the per-stack copy of this
 * table - confirm against the stack structure definition.
 */
#define	is_wroff_extra			is_param_arr[0].icmp_param_value
#define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
#define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
#define	is_bsd_compat			is_param_arr[3].icmp_param_value
#define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
#define	is_xmit_lowat			is_param_arr[5].icmp_param_value
#define	is_recv_hiwat			is_param_arr[6].icmp_param_value
#define	is_max_buf			is_param_arr[7].icmp_param_value
248 
249 static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
250 static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
251     socklen_t len, cred_t *cr);
252 static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
253 
254 /*
255  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
256  * passed to icmp_wput.
257  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
258  * protocol type placed in the message following the address. A T_BIND_ACK
259  * message is returned by ip_bind_v4/v6.
260  */
261 static void
262 icmp_tpi_bind(queue_t *q, mblk_t *mp)
263 {
264 	int	error;
265 	struct sockaddr *sa;
266 	struct T_bind_req *tbr;
267 	socklen_t	len;
268 	sin_t	*sin;
269 	sin6_t	*sin6;
270 	icmp_t		*icmp;
271 	conn_t	*connp = Q_TO_CONN(q);
272 	mblk_t *mp1;
273 	cred_t *cr;
274 
275 	/*
276 	 * All Solaris components should pass a db_credp
277 	 * for this TPI message, hence we ASSERT.
278 	 * But in case there is some other M_PROTO that looks
279 	 * like a TPI message sent by some other kernel
280 	 * component, we check and return an error.
281 	 */
282 	cr = msg_getcred(mp, NULL);
283 	ASSERT(cr != NULL);
284 	if (cr == NULL) {
285 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
286 		return;
287 	}
288 
289 	icmp = connp->conn_icmp;
290 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
291 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
292 		    "icmp_bind: bad req, len %u",
293 		    (uint_t)(mp->b_wptr - mp->b_rptr));
294 		icmp_err_ack(q, mp, TPROTO, 0);
295 		return;
296 	}
297 
298 	if (icmp->icmp_state != TS_UNBND) {
299 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
300 		    "icmp_bind: bad state, %d", icmp->icmp_state);
301 		icmp_err_ack(q, mp, TOUTSTATE, 0);
302 		return;
303 	}
304 
305 	/*
306 	 * Reallocate the message to make sure we have enough room for an
307 	 * address and the protocol type.
308 	 */
309 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
310 	if (!mp1) {
311 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
312 		return;
313 	}
314 	mp = mp1;
315 
316 	/* Reset the message type in preparation for shipping it back. */
317 	DB_TYPE(mp) = M_PCPROTO;
318 	tbr = (struct T_bind_req *)mp->b_rptr;
319 	len = tbr->ADDR_length;
320 	switch (len) {
321 	case 0:	/* request for a generic port */
322 		tbr->ADDR_offset = sizeof (struct T_bind_req);
323 		if (icmp->icmp_family == AF_INET) {
324 			tbr->ADDR_length = sizeof (sin_t);
325 			sin = (sin_t *)&tbr[1];
326 			*sin = sin_null;
327 			sin->sin_family = AF_INET;
328 			mp->b_wptr = (uchar_t *)&sin[1];
329 			sa = (struct sockaddr *)sin;
330 			len = sizeof (sin_t);
331 		} else {
332 			ASSERT(icmp->icmp_family == AF_INET6);
333 			tbr->ADDR_length = sizeof (sin6_t);
334 			sin6 = (sin6_t *)&tbr[1];
335 			*sin6 = sin6_null;
336 			sin6->sin6_family = AF_INET6;
337 			mp->b_wptr = (uchar_t *)&sin6[1];
338 			sa = (struct sockaddr *)sin6;
339 			len = sizeof (sin6_t);
340 		}
341 		break;
342 
343 	case sizeof (sin_t):	/* Complete IPv4 address */
344 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
345 		    sizeof (sin_t));
346 		break;
347 
348 	case sizeof (sin6_t):	/* Complete IPv6 address */
349 		sa = (struct sockaddr *)mi_offset_param(mp,
350 		    tbr->ADDR_offset, sizeof (sin6_t));
351 		break;
352 
353 	default:
354 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
355 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
356 		icmp_err_ack(q, mp, TBADADDR, 0);
357 		return;
358 	}
359 
360 	error = rawip_do_bind(connp, sa, len);
361 done:
362 	ASSERT(mp->b_cont == NULL);
363 	if (error != 0) {
364 		if (error > 0) {
365 			icmp_err_ack(q, mp, TSYSERR, error);
366 		} else {
367 			icmp_err_ack(q, mp, -error, 0);
368 		}
369 	} else {
370 		tbr->PRIM_type = T_BIND_ACK;
371 		qreply(q, mp);
372 	}
373 }
374 
375 static int
376 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
377 {
378 	sin_t		*sin;
379 	sin6_t		*sin6;
380 	icmp_t		*icmp;
381 	int		error = 0;
382 	mblk_t		*ire_mp;
383 
384 
385 	icmp = connp->conn_icmp;
386 
387 	if (sa == NULL || !OK_32PTR((char *)sa)) {
388 		return (EINVAL);
389 	}
390 
391 	/*
392 	 * The state must be TS_UNBND. TPI mandates that users must send
393 	 * TPI primitives only 1 at a time and wait for the response before
394 	 * sending the next primitive.
395 	 */
396 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
397 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
398 		error = -TOUTSTATE;
399 		goto done;
400 	}
401 
402 	ASSERT(len != 0);
403 	switch (len) {
404 	case sizeof (sin_t):    /* Complete IPv4 address */
405 		sin = (sin_t *)sa;
406 		if (sin->sin_family != AF_INET ||
407 		    icmp->icmp_family != AF_INET) {
408 			/* TSYSERR, EAFNOSUPPORT */
409 			error = EAFNOSUPPORT;
410 			goto done;
411 		}
412 		break;
413 	case sizeof (sin6_t): /* Complete IPv6 address */
414 		sin6 = (sin6_t *)sa;
415 		if (sin6->sin6_family != AF_INET6 ||
416 		    icmp->icmp_family != AF_INET6) {
417 			/* TSYSERR, EAFNOSUPPORT */
418 			error = EAFNOSUPPORT;
419 			goto done;
420 		}
421 		/* No support for mapped addresses on raw sockets */
422 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
423 			/* TSYSERR, EADDRNOTAVAIL */
424 			error = EADDRNOTAVAIL;
425 			goto done;
426 		}
427 		break;
428 
429 	default:
430 		/* TBADADDR */
431 		error = EADDRNOTAVAIL;
432 		goto done;
433 	}
434 
435 	icmp->icmp_pending_op = T_BIND_REQ;
436 	icmp->icmp_state = TS_IDLE;
437 
438 	/*
439 	 * Copy the source address into our icmp structure.  This address
440 	 * may still be zero; if so, ip will fill in the correct address
441 	 * each time an outbound packet is passed to it.
442 	 * If we are binding to a broadcast or multicast address then
443 	 * rawip_post_ip_bind_connect will clear the source address.
444 	 */
445 
446 	if (icmp->icmp_family == AF_INET) {
447 		ASSERT(sin != NULL);
448 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
449 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
450 		    &icmp->icmp_v6src);
451 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
452 		    icmp->icmp_ip_snd_options_len;
453 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
454 	} else {
455 		int error;
456 
457 		ASSERT(sin6 != NULL);
458 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
459 		icmp->icmp_v6src = sin6->sin6_addr;
460 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
461 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
462 
463 		/* Rebuild the header template */
464 		error = icmp_build_hdrs(icmp);
465 		if (error != 0) {
466 			icmp->icmp_pending_op = -1;
467 			/*
468 			 * TSYSERR
469 			 */
470 			goto done;
471 		}
472 	}
473 
474 	ire_mp = NULL;
475 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
476 		/*
477 		 * request an IRE if src not 0 (INADDR_ANY)
478 		 */
479 		ire_mp = allocb(sizeof (ire_t), BPRI_HI);
480 		if (ire_mp == NULL) {
481 			icmp->icmp_pending_op = -1;
482 			error = ENOMEM;
483 			goto done;
484 		}
485 		DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
486 	}
487 done:
488 	rw_exit(&icmp->icmp_rwlock);
489 	if (error != 0)
490 		return (error);
491 
492 	if (icmp->icmp_family == AF_INET6) {
493 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
494 		    &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
495 	} else {
496 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
497 		    sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
498 	}
499 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
500 	return (error);
501 }
502 
503 static void
504 rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
505 {
506 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
507 	if (icmp->icmp_state == TS_UNBND) {
508 		/*
509 		 * not yet bound - bind sent by icmp_bind_proto.
510 		 */
511 		rw_exit(&icmp->icmp_rwlock);
512 		return;
513 	}
514 	ASSERT(icmp->icmp_pending_op != -1);
515 	icmp->icmp_pending_op = -1;
516 
517 	if (error != 0) {
518 		if (icmp->icmp_state == TS_DATA_XFER) {
519 			/* Connect failed */
520 			/* Revert back to the bound source */
521 			icmp->icmp_v6src = icmp->icmp_bound_v6src;
522 			icmp->icmp_state = TS_IDLE;
523 			if (icmp->icmp_family == AF_INET6)
524 				(void) icmp_build_hdrs(icmp);
525 		} else {
526 			V6_SET_ZERO(icmp->icmp_v6src);
527 			V6_SET_ZERO(icmp->icmp_bound_v6src);
528 			icmp->icmp_state = TS_UNBND;
529 			if (icmp->icmp_family == AF_INET6)
530 				(void) icmp_build_hdrs(icmp);
531 		}
532 	} else {
533 		if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
534 			ire_t *ire;
535 
536 			ire = (ire_t *)ire_mp->b_rptr;
537 			/*
538 			 * If a broadcast/multicast address was bound set
539 			 * the source address to 0.
540 			 * This ensures no datagrams with broadcast address
541 			 * as source address are emitted (which would violate
542 			 * RFC1122 - Hosts requirements)
543 			 * Note: we get IRE_BROADCAST for IPv6
544 			 * to "mark" a multicast local address.
545 			 */
546 
547 
548 			if (ire->ire_type == IRE_BROADCAST &&
549 			    icmp->icmp_state != TS_DATA_XFER) {
550 				/*
551 				 * This was just a local bind to a
552 				 * MC/broadcast addr
553 				 */
554 				V6_SET_ZERO(icmp->icmp_v6src);
555 				if (icmp->icmp_family == AF_INET6)
556 					(void) icmp_build_hdrs(icmp);
557 			}
558 		}
559 
560 	}
561 	rw_exit(&icmp->icmp_rwlock);
562 	if (ire_mp != NULL)
563 		freeb(ire_mp);
564 }
565 
566 /*
567  * Send message to IP to just bind to the protocol.
568  */
569 static int
570 icmp_bind_proto(conn_t *connp)
571 {
572 	icmp_t	*icmp;
573 	int	error;
574 
575 	icmp = connp->conn_icmp;
576 
577 	if (icmp->icmp_family == AF_INET6)
578 		error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
579 		    &sin6_null.sin6_addr, 0, B_TRUE);
580 	else
581 		error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
582 		    sin_null.sin_addr.s_addr, 0, B_TRUE);
583 
584 	rawip_post_ip_bind_connect(icmp, NULL, error);
585 	return (error);
586 }
587 
588 static void
589 icmp_tpi_connect(queue_t *q, mblk_t *mp)
590 {
591 	conn_t	*connp = Q_TO_CONN(q);
592 	struct T_conn_req	*tcr;
593 	icmp_t	*icmp;
594 	struct sockaddr *sa;
595 	socklen_t len;
596 	int error;
597 	cred_t *cr;
598 
599 	/*
600 	 * All Solaris components should pass a db_credp
601 	 * for this TPI message, hence we ASSERT.
602 	 * But in case there is some other M_PROTO that looks
603 	 * like a TPI message sent by some other kernel
604 	 * component, we check and return an error.
605 	 */
606 	cr = msg_getcred(mp, NULL);
607 	ASSERT(cr != NULL);
608 	if (cr == NULL) {
609 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
610 		return;
611 	}
612 
613 	icmp = connp->conn_icmp;
614 	tcr = (struct T_conn_req *)mp->b_rptr;
615 	/* Sanity checks */
616 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
617 		icmp_err_ack(q, mp, TPROTO, 0);
618 		return;
619 	}
620 
621 	if (tcr->OPT_length != 0) {
622 		icmp_err_ack(q, mp, TBADOPT, 0);
623 		return;
624 	}
625 
626 	len = tcr->DEST_length;
627 
628 	switch (len) {
629 	default:
630 		icmp_err_ack(q, mp, TBADADDR, 0);
631 		return;
632 	case sizeof (sin_t):
633 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
634 		    sizeof (sin_t));
635 		break;
636 	case sizeof (sin6_t):
637 		sa = (struct sockaddr *)mi_offset_param(mp,
638 		    tcr->DEST_offset, sizeof (sin6_t));
639 		break;
640 	}
641 
642 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
643 	if (error != 0) {
644 		icmp_err_ack(q, mp, TSYSERR, error);
645 		return;
646 	}
647 
648 	error = rawip_do_connect(connp, sa, len, cr);
649 	if (error != 0) {
650 		if (error < 0) {
651 			icmp_err_ack(q, mp, -error, 0);
652 		} else {
653 			icmp_err_ack(q, mp, 0, error);
654 		}
655 	} else {
656 		mblk_t *mp1;
657 
658 		/*
659 		 * We have to send a connection confirmation to
660 		 * keep TLI happy.
661 		 */
662 		if (icmp->icmp_family == AF_INET) {
663 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
664 			    sizeof (sin_t), NULL, 0);
665 		} else {
666 			ASSERT(icmp->icmp_family == AF_INET6);
667 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
668 			    sizeof (sin6_t), NULL, 0);
669 		}
670 		if (mp1 == NULL) {
671 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
672 			return;
673 		}
674 
675 		/*
676 		 * Send ok_ack for T_CONN_REQ
677 		 */
678 		mp = mi_tpi_ok_ack_alloc(mp);
679 		if (mp == NULL) {
680 			/* Unable to reuse the T_CONN_REQ for the ack. */
681 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
682 			return;
683 		}
684 		putnext(connp->conn_rq, mp);
685 		putnext(connp->conn_rq, mp1);
686 	}
687 }
688 
689 static int
690 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
691     cred_t *cr)
692 {
693 	icmp_t	*icmp;
694 	sin_t	*sin;
695 	sin6_t	*sin6;
696 	mblk_t  *ire_mp;
697 	int	error;
698 	ipaddr_t	v4dst;
699 	in6_addr_t	v6dst;
700 
701 	icmp = connp->conn_icmp;
702 
703 	if (sa == NULL || !OK_32PTR((char *)sa)) {
704 		return (EINVAL);
705 	}
706 
707 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
708 	if (ire_mp == NULL)
709 		return (ENOMEM);
710 	DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
711 
712 
713 	ASSERT(sa != NULL && len != 0);
714 
715 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
716 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
717 		rw_exit(&icmp->icmp_rwlock);
718 		freeb(ire_mp);
719 		return (-TOUTSTATE);
720 	}
721 
722 	switch (len) {
723 	case sizeof (sin_t):
724 		sin = (sin_t *)sa;
725 
726 		ASSERT(icmp->icmp_family == AF_INET);
727 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
728 
729 		v4dst = sin->sin_addr.s_addr;
730 		/*
731 		 * Interpret a zero destination to mean loopback.
732 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
733 		 * generate the T_CONN_CON.
734 		 */
735 		if (v4dst == INADDR_ANY) {
736 			v4dst = htonl(INADDR_LOOPBACK);
737 		}
738 
739 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
740 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
741 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
742 		    icmp->icmp_ip_snd_options_len;
743 		icmp->icmp_v6dst.sin6_addr = v6dst;
744 		icmp->icmp_v6dst.sin6_family = AF_INET6;
745 		icmp->icmp_v6dst.sin6_flowinfo = 0;
746 		icmp->icmp_v6dst.sin6_port = 0;
747 
748 		/*
749 		 * If the destination address is multicast and
750 		 * an outgoing multicast interface has been set,
751 		 * use the address of that interface as our
752 		 * source address if no source address has been set.
753 		 */
754 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
755 		    CLASSD(v4dst) &&
756 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
757 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
758 			    &icmp->icmp_v6src);
759 		}
760 		break;
761 	case sizeof (sin6_t):
762 		sin6 = (sin6_t *)sa;
763 
764 		/* No support for mapped addresses on raw sockets */
765 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
766 			rw_exit(&icmp->icmp_rwlock);
767 			freeb(ire_mp);
768 			return (EADDRNOTAVAIL);
769 		}
770 
771 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
772 		ASSERT(icmp->icmp_family == AF_INET6);
773 
774 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
775 
776 		icmp->icmp_v6dst = *sin6;
777 		icmp->icmp_v6dst.sin6_port = 0;
778 
779 		/*
780 		 * Interpret a zero destination to mean loopback.
781 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
782 		 * generate the T_CONN_CON.
783 		 */
784 		if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
785 			icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
786 		}
787 		/*
788 		 * If the destination address is multicast and
789 		 * an outgoing multicast interface has been set,
790 		 * then the ip bind logic will pick the correct source
791 		 * address (i.e. matching the outgoing multicast interface).
792 		 */
793 		break;
794 	}
795 
796 	icmp->icmp_pending_op = T_CONN_REQ;
797 
798 	if (icmp->icmp_state == TS_DATA_XFER) {
799 		/* Already connected - clear out state */
800 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
801 		icmp->icmp_state = TS_IDLE;
802 	}
803 
804 	icmp->icmp_state = TS_DATA_XFER;
805 	rw_exit(&icmp->icmp_rwlock);
806 
807 	if (icmp->icmp_family == AF_INET6) {
808 		error = ip_proto_bind_connected_v6(connp, &ire_mp,
809 		    icmp->icmp_proto, &icmp->icmp_v6src, 0,
810 		    &icmp->icmp_v6dst.sin6_addr,
811 		    NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr);
812 	} else {
813 		error = ip_proto_bind_connected_v4(connp, &ire_mp,
814 		    icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
815 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
816 		    B_TRUE, B_TRUE, cr);
817 	}
818 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
819 	return (error);
820 }
821 
822 static void
823 icmp_close_free(conn_t *connp)
824 {
825 	icmp_t *icmp = connp->conn_icmp;
826 
827 	/* If there are any options associated with the stream, free them. */
828 	if (icmp->icmp_ip_snd_options != NULL) {
829 		mi_free((char *)icmp->icmp_ip_snd_options);
830 		icmp->icmp_ip_snd_options = NULL;
831 		icmp->icmp_ip_snd_options_len = 0;
832 	}
833 
834 	if (icmp->icmp_filter != NULL) {
835 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
836 		icmp->icmp_filter = NULL;
837 	}
838 
839 	/* Free memory associated with sticky options */
840 	if (icmp->icmp_sticky_hdrs_len != 0) {
841 		kmem_free(icmp->icmp_sticky_hdrs,
842 		    icmp->icmp_sticky_hdrs_len);
843 		icmp->icmp_sticky_hdrs = NULL;
844 		icmp->icmp_sticky_hdrs_len = 0;
845 	}
846 
847 	if (icmp->icmp_last_cred != NULL) {
848 		crfree(icmp->icmp_last_cred);
849 		icmp->icmp_last_cred = NULL;
850 	}
851 
852 	if (icmp->icmp_effective_cred != NULL) {
853 		crfree(icmp->icmp_effective_cred);
854 		icmp->icmp_effective_cred = NULL;
855 	}
856 
857 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
858 
859 	/*
860 	 * Clear any fields which the kmem_cache constructor clears.
861 	 * Only icmp_connp needs to be preserved.
862 	 * TBD: We should make this more efficient to avoid clearing
863 	 * everything.
864 	 */
865 	ASSERT(icmp->icmp_connp == connp);
866 	bzero(icmp, sizeof (icmp_t));
867 	icmp->icmp_connp = connp;
868 }
869 
870 static int
871 rawip_do_close(conn_t *connp)
872 {
873 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
874 
875 	ip_quiesce_conn(connp);
876 
877 	if (!IPCL_IS_NONSTR(connp)) {
878 		qprocsoff(connp->conn_rq);
879 	}
880 
881 	ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
882 	    connp->conn_icmp->icmp_fallback_queue_tail == NULL);
883 	icmp_close_free(connp);
884 
885 	/*
886 	 * Now we are truly single threaded on this stream, and can
887 	 * delete the things hanging off the connp, and finally the connp.
888 	 * We removed this connp from the fanout list, it cannot be
889 	 * accessed thru the fanouts, and we already waited for the
890 	 * conn_ref to drop to 0. We are already in close, so
891 	 * there cannot be any other thread from the top. qprocsoff
892 	 * has completed, and service has completed or won't run in
893 	 * future.
894 	 */
895 	ASSERT(connp->conn_ref == 1);
896 
897 	if (!IPCL_IS_NONSTR(connp)) {
898 		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
899 	} else {
900 		ip_free_helper_stream(connp);
901 	}
902 
903 	connp->conn_ref--;
904 	ipcl_conn_destroy(connp);
905 
906 	return (0);
907 }
908 
909 static int
910 icmp_close(queue_t *q, int flags)
911 {
912 	conn_t  *connp;
913 
914 	if (flags & SO_FALLBACK) {
915 		/*
916 		 * stream is being closed while in fallback
917 		 * simply free the resources that were allocated
918 		 */
919 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
920 		qprocsoff(q);
921 		goto done;
922 	}
923 
924 	connp = Q_TO_CONN(q);
925 	(void) rawip_do_close(connp);
926 done:
927 	q->q_ptr = WR(q)->q_ptr = NULL;
928 	return (0);
929 }
930 
931 /*
932  * This routine handles each T_DISCON_REQ message passed to icmp
933  * as an indicating that ICMP is no longer connected. This results
934  * in sending a T_BIND_REQ to IP to restore the binding to just
935  * the local address.
936  *
937  * The disconnect completes in rawip_post_ip_bind_connect.
938  */
939 static int
940 icmp_do_disconnect(conn_t *connp)
941 {
942 	icmp_t	*icmp;
943 	mblk_t	*ire_mp;
944 	int error;
945 
946 	icmp = connp->conn_icmp;
947 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
948 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
949 		rw_exit(&icmp->icmp_rwlock);
950 		return (-TOUTSTATE);
951 	}
952 	icmp->icmp_pending_op = T_DISCON_REQ;
953 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
954 	icmp->icmp_state = TS_IDLE;
955 
956 
957 	if (icmp->icmp_family == AF_INET6) {
958 		/* Rebuild the header template */
959 		error = icmp_build_hdrs(icmp);
960 		if (error != 0) {
961 			icmp->icmp_pending_op = -1;
962 			rw_exit(&icmp->icmp_rwlock);
963 			return (error);
964 		}
965 	}
966 
967 	rw_exit(&icmp->icmp_rwlock);
968 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
969 	if (ire_mp == NULL) {
970 		return (ENOMEM);
971 	}
972 
973 	if (icmp->icmp_family == AF_INET6) {
974 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
975 		    &icmp->icmp_bound_v6src, 0, B_TRUE);
976 	} else {
977 
978 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
979 		    V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
980 	}
981 
982 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
983 
984 	return (error);
985 }
986 
987 static void
988 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
989 {
990 	conn_t	*connp = Q_TO_CONN(q);
991 	int	error;
992 
993 	/*
994 	 * Allocate the largest primitive we need to send back
995 	 * T_error_ack is > than T_ok_ack
996 	 */
997 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
998 	if (mp == NULL) {
999 		/* Unable to reuse the T_DISCON_REQ for the ack. */
1000 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1001 		return;
1002 	}
1003 
1004 	error = icmp_do_disconnect(connp);
1005 
1006 	if (error != 0) {
1007 		if (error > 0) {
1008 			icmp_err_ack(q, mp, 0, error);
1009 		} else {
1010 			icmp_err_ack(q, mp, -error, 0);
1011 		}
1012 	} else {
1013 		mp = mi_tpi_ok_ack_alloc(mp);
1014 		ASSERT(mp != NULL);
1015 		qreply(q, mp);
1016 	}
1017 
1018 }
1019 
1020 static int
1021 icmp_disconnect(conn_t *connp)
1022 {
1023 	int	error;
1024 	icmp_t	*icmp = connp->conn_icmp;
1025 
1026 	icmp->icmp_dgram_errind = B_FALSE;
1027 
1028 	error = icmp_do_disconnect(connp);
1029 
1030 	if (error < 0)
1031 		error = proto_tlitosyserr(-error);
1032 	return (error);
1033 }
1034 
1035 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1036 static void
1037 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1038 {
1039 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1040 		qreply(q, mp);
1041 }
1042 
1043 /* Shorthand to generate and send TPI error acks to our client */
1044 static void
1045 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1046     t_scalar_t t_error, int sys_error)
1047 {
1048 	struct T_error_ack	*teackp;
1049 
1050 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1051 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1052 		teackp = (struct T_error_ack *)mp->b_rptr;
1053 		teackp->ERROR_prim = primitive;
1054 		teackp->TLI_error = t_error;
1055 		teackp->UNIX_error = sys_error;
1056 		qreply(q, mp);
1057 	}
1058 }
1059 
1060 /*
1061  * icmp_icmp_error is called by icmp_input to process ICMP
1062  * messages passed up by IP.
1063  * Generates the appropriate permanent (non-transient) errors.
1064  * Assumes that IP has pulled up everything up to and including
1065  * the ICMP header.
1066  */
1067 static void
1068 icmp_icmp_error(conn_t *connp, mblk_t *mp)
1069 {
1070 	icmph_t *icmph;
1071 	ipha_t	*ipha;
1072 	int	iph_hdr_length;
1073 	sin_t	sin;
1074 	mblk_t	*mp1;
1075 	int	error = 0;
1076 	icmp_t	*icmp = connp->conn_icmp;
1077 
1078 	ipha = (ipha_t *)mp->b_rptr;
1079 
1080 	ASSERT(OK_32PTR(mp->b_rptr));
1081 
1082 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1083 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1084 		icmp_icmp_error_ipv6(connp, mp);
1085 		return;
1086 	}
1087 
1088 	/*
1089 	 * icmp does not support v4 mapped addresses
1090 	 * so we can never be here for a V6 socket
1091 	 * i.e. icmp_family == AF_INET6
1092 	 */
1093 	ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
1094 	    (icmp->icmp_family == AF_INET));
1095 
1096 	ASSERT(icmp->icmp_family == AF_INET);
1097 
1098 	/* Skip past the outer IP and ICMP headers */
1099 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1100 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
1101 	ipha = (ipha_t *)&icmph[1];
1102 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1103 
1104 	switch (icmph->icmph_type) {
1105 	case ICMP_DEST_UNREACHABLE:
1106 		switch (icmph->icmph_code) {
1107 		case ICMP_FRAGMENTATION_NEEDED:
1108 			/*
1109 			 * IP has already adjusted the path MTU.
1110 			 */
1111 			break;
1112 		case ICMP_PORT_UNREACHABLE:
1113 		case ICMP_PROTOCOL_UNREACHABLE:
1114 			error = ECONNREFUSED;
1115 			break;
1116 		default:
1117 			/* Transient errors */
1118 			break;
1119 		}
1120 		break;
1121 	default:
1122 		/* Transient errors */
1123 		break;
1124 	}
1125 	if (error == 0) {
1126 		freemsg(mp);
1127 		return;
1128 	}
1129 
1130 	/*
1131 	 * Deliver T_UDERROR_IND when the application has asked for it.
1132 	 * The socket layer enables this automatically when connected.
1133 	 */
1134 	if (!icmp->icmp_dgram_errind) {
1135 		freemsg(mp);
1136 		return;
1137 	}
1138 
1139 	sin = sin_null;
1140 	sin.sin_family = AF_INET;
1141 	sin.sin_addr.s_addr = ipha->ipha_dst;
1142 
1143 	if (IPCL_IS_NONSTR(connp)) {
1144 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1145 		if (icmp->icmp_state == TS_DATA_XFER) {
1146 			if (sin.sin_addr.s_addr ==
1147 			    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
1148 				rw_exit(&icmp->icmp_rwlock);
1149 				(*connp->conn_upcalls->su_set_error)
1150 				    (connp->conn_upper_handle, error);
1151 				goto done;
1152 			}
1153 		} else {
1154 			icmp->icmp_delayed_error = error;
1155 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1156 		}
1157 		rw_exit(&icmp->icmp_rwlock);
1158 	} else {
1159 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
1160 		    0, error);
1161 		if (mp1 != NULL)
1162 			putnext(connp->conn_rq, mp1);
1163 	}
1164 done:
1165 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1166 	freemsg(mp);
1167 }
1168 
1169 /*
1170  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
1171  * for IPv6 packets.
1172  * Send permanent (non-transient) errors upstream.
1173  * Assumes that IP has pulled up all the extension headers as well
1174  * as the ICMPv6 header.
1175  */
1176 static void
1177 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
1178 {
1179 	icmp6_t		*icmp6;
1180 	ip6_t		*ip6h, *outer_ip6h;
1181 	uint16_t	iph_hdr_length;
1182 	uint8_t		*nexthdrp;
1183 	sin6_t		sin6;
1184 	mblk_t		*mp1;
1185 	int		error = 0;
1186 	icmp_t		*icmp = connp->conn_icmp;
1187 
1188 	outer_ip6h = (ip6_t *)mp->b_rptr;
1189 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1190 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1191 	else
1192 		iph_hdr_length = IPV6_HDR_LEN;
1193 
1194 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1195 	ip6h = (ip6_t *)&icmp6[1];
1196 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1197 		freemsg(mp);
1198 		return;
1199 	}
1200 
1201 	switch (icmp6->icmp6_type) {
1202 	case ICMP6_DST_UNREACH:
1203 		switch (icmp6->icmp6_code) {
1204 		case ICMP6_DST_UNREACH_NOPORT:
1205 			error = ECONNREFUSED;
1206 			break;
1207 		case ICMP6_DST_UNREACH_ADMIN:
1208 		case ICMP6_DST_UNREACH_NOROUTE:
1209 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1210 		case ICMP6_DST_UNREACH_ADDR:
1211 			/* Transient errors */
1212 			break;
1213 		default:
1214 			break;
1215 		}
1216 		break;
1217 	case ICMP6_PACKET_TOO_BIG: {
1218 		struct T_unitdata_ind	*tudi;
1219 		struct T_opthdr		*toh;
1220 		size_t			udi_size;
1221 		mblk_t			*newmp;
1222 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1223 		    sizeof (struct ip6_mtuinfo);
1224 		sin6_t			*sin6;
1225 		struct ip6_mtuinfo	*mtuinfo;
1226 
1227 		/*
1228 		 * If the application has requested to receive path mtu
1229 		 * information, send up an empty message containing an
1230 		 * IPV6_PATHMTU ancillary data item.
1231 		 */
1232 		if (!icmp->icmp_ipv6_recvpathmtu)
1233 			break;
1234 
1235 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1236 		    opt_length;
1237 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1238 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1239 			break;
1240 		}
1241 
1242 		/*
1243 		 * newmp->b_cont is left to NULL on purpose.  This is an
1244 		 * empty message containing only ancillary data.
1245 		 */
1246 		newmp->b_datap->db_type = M_PROTO;
1247 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1248 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1249 		tudi->PRIM_type = T_UNITDATA_IND;
1250 		tudi->SRC_length = sizeof (sin6_t);
1251 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1252 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1253 		tudi->OPT_length = opt_length;
1254 
1255 		sin6 = (sin6_t *)&tudi[1];
1256 		bzero(sin6, sizeof (sin6_t));
1257 		sin6->sin6_family = AF_INET6;
1258 		sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
1259 
1260 		toh = (struct T_opthdr *)&sin6[1];
1261 		toh->level = IPPROTO_IPV6;
1262 		toh->name = IPV6_PATHMTU;
1263 		toh->len = opt_length;
1264 		toh->status = 0;
1265 
1266 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1267 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1268 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1269 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1270 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1271 		/*
1272 		 * We've consumed everything we need from the original
1273 		 * message.  Free it, then send our empty message.
1274 		 */
1275 		freemsg(mp);
1276 		icmp_ulp_recv(connp, newmp);
1277 
1278 		return;
1279 	}
1280 	case ICMP6_TIME_EXCEEDED:
1281 		/* Transient errors */
1282 		break;
1283 	case ICMP6_PARAM_PROB:
1284 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1285 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1286 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1287 		    (uchar_t *)nexthdrp) {
1288 			error = ECONNREFUSED;
1289 			break;
1290 		}
1291 		break;
1292 	}
1293 	if (error == 0) {
1294 		freemsg(mp);
1295 		return;
1296 	}
1297 
1298 	/*
1299 	 * Deliver T_UDERROR_IND when the application has asked for it.
1300 	 * The socket layer enables this automatically when connected.
1301 	 */
1302 	if (!icmp->icmp_dgram_errind) {
1303 		freemsg(mp);
1304 		return;
1305 	}
1306 
1307 	sin6 = sin6_null;
1308 	sin6.sin6_family = AF_INET6;
1309 	sin6.sin6_addr = ip6h->ip6_dst;
1310 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1311 
1312 	if (IPCL_IS_NONSTR(connp)) {
1313 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1314 		if (icmp->icmp_state == TS_DATA_XFER) {
1315 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1316 			    &icmp->icmp_v6dst.sin6_addr)) {
1317 				rw_exit(&icmp->icmp_rwlock);
1318 				(*connp->conn_upcalls->su_set_error)
1319 				    (connp->conn_upper_handle, error);
1320 				goto done;
1321 			}
1322 		} else {
1323 			icmp->icmp_delayed_error = error;
1324 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1325 		}
1326 		rw_exit(&icmp->icmp_rwlock);
1327 	} else {
1328 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1329 		    NULL, 0, error);
1330 		if (mp1 != NULL)
1331 			putnext(connp->conn_rq, mp1);
1332 	}
1333 done:
1334 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1335 	freemsg(mp);
1336 }
1337 
1338 /*
1339  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1340  * The local address is filled in if endpoint is bound. The remote address
1341  * is filled in if remote address has been precified ("connected endpoint")
1342  * (The concept of connected CLTS sockets is alien to published TPI
1343  *  but we support it anyway).
1344  */
1345 static void
1346 icmp_addr_req(queue_t *q, mblk_t *mp)
1347 {
1348 	icmp_t	*icmp = Q_TO_ICMP(q);
1349 	mblk_t	*ackmp;
1350 	struct T_addr_ack *taa;
1351 
1352 	/* Make it large enough for worst case */
1353 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1354 	    2 * sizeof (sin6_t), 1);
1355 	if (ackmp == NULL) {
1356 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1357 		return;
1358 	}
1359 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1360 
1361 	bzero(taa, sizeof (struct T_addr_ack));
1362 	ackmp->b_wptr = (uchar_t *)&taa[1];
1363 
1364 	taa->PRIM_type = T_ADDR_ACK;
1365 	ackmp->b_datap->db_type = M_PCPROTO;
1366 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1367 	/*
1368 	 * Note: Following code assumes 32 bit alignment of basic
1369 	 * data structures like sin_t and struct T_addr_ack.
1370 	 */
1371 	if (icmp->icmp_state != TS_UNBND) {
1372 		/*
1373 		 * Fill in local address
1374 		 */
1375 		taa->LOCADDR_offset = sizeof (*taa);
1376 		if (icmp->icmp_family == AF_INET) {
1377 			sin_t	*sin;
1378 
1379 			taa->LOCADDR_length = sizeof (sin_t);
1380 			sin = (sin_t *)&taa[1];
1381 			/* Fill zeroes and then intialize non-zero fields */
1382 			*sin = sin_null;
1383 			sin->sin_family = AF_INET;
1384 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
1385 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1386 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
1387 				    sin->sin_addr.s_addr);
1388 			} else {
1389 				/*
1390 				 * INADDR_ANY
1391 				 * icmp_v6src is not set, we might be bound to
1392 				 * broadcast/multicast. Use icmp_bound_v6src as
1393 				 * local address instead (that could
1394 				 * also still be INADDR_ANY)
1395 				 */
1396 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
1397 				    sin->sin_addr.s_addr);
1398 			}
1399 			ackmp->b_wptr = (uchar_t *)&sin[1];
1400 		} else {
1401 			sin6_t	*sin6;
1402 
1403 			ASSERT(icmp->icmp_family == AF_INET6);
1404 			taa->LOCADDR_length = sizeof (sin6_t);
1405 			sin6 = (sin6_t *)&taa[1];
1406 			/* Fill zeroes and then intialize non-zero fields */
1407 			*sin6 = sin6_null;
1408 			sin6->sin6_family = AF_INET6;
1409 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1410 				sin6->sin6_addr = icmp->icmp_v6src;
1411 			} else {
1412 				/*
1413 				 * UNSPECIFIED
1414 				 * icmp_v6src is not set, we might be bound to
1415 				 * broadcast/multicast. Use icmp_bound_v6src as
1416 				 * local address instead (that could
1417 				 * also still be UNSPECIFIED)
1418 				 */
1419 				sin6->sin6_addr = icmp->icmp_bound_v6src;
1420 			}
1421 			ackmp->b_wptr = (uchar_t *)&sin6[1];
1422 		}
1423 	}
1424 	rw_exit(&icmp->icmp_rwlock);
1425 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1426 	qreply(q, ackmp);
1427 }
1428 
1429 static void
1430 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1431 {
1432 	*tap = icmp_g_t_info_ack;
1433 
1434 	if (icmp->icmp_family == AF_INET6)
1435 		tap->ADDR_size = sizeof (sin6_t);
1436 	else
1437 		tap->ADDR_size = sizeof (sin_t);
1438 	tap->CURRENT_state = icmp->icmp_state;
1439 	tap->OPT_size = icmp_max_optsize;
1440 }
1441 
1442 static void
1443 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1444     t_uscalar_t cap_bits1)
1445 {
1446 	tcap->CAP_bits1 = 0;
1447 
1448 	if (cap_bits1 & TC1_INFO) {
1449 		icmp_copy_info(&tcap->INFO_ack, icmp);
1450 		tcap->CAP_bits1 |= TC1_INFO;
1451 	}
1452 }
1453 
1454 /*
1455  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1456  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1457  * icmp_g_t_info_ack.  The current state of the stream is copied from
1458  * icmp_state.
1459  */
1460 static void
1461 icmp_capability_req(queue_t *q, mblk_t *mp)
1462 {
1463 	icmp_t			*icmp = Q_TO_ICMP(q);
1464 	t_uscalar_t		cap_bits1;
1465 	struct T_capability_ack	*tcap;
1466 
1467 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1468 
1469 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1470 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1471 	if (!mp)
1472 		return;
1473 
1474 	tcap = (struct T_capability_ack *)mp->b_rptr;
1475 
1476 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1477 
1478 	qreply(q, mp);
1479 }
1480 
1481 /*
1482  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1483  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1484  * The current state of the stream is copied from icmp_state.
1485  */
1486 static void
1487 icmp_info_req(queue_t *q, mblk_t *mp)
1488 {
1489 	icmp_t	*icmp = Q_TO_ICMP(q);
1490 
1491 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1492 	    T_INFO_ACK);
1493 	if (!mp)
1494 		return;
1495 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1496 	qreply(q, mp);
1497 }
1498 
1499 /* For /dev/icmp aka AF_INET open */
1500 static int
1501 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1502     int family)
1503 {
1504 	conn_t *connp;
1505 	dev_t	conn_dev;
1506 	icmp_stack_t *is;
1507 	int	error;
1508 
1509 	conn_dev = NULL;
1510 
1511 	/* If the stream is already open, return immediately. */
1512 	if (q->q_ptr != NULL)
1513 		return (0);
1514 
1515 	if (sflag == MODOPEN)
1516 		return (EINVAL);
1517 
1518 	/*
1519 	 * Since ICMP is not used so heavily, allocating from the small
1520 	 * arena should be sufficient.
1521 	 */
1522 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1523 		return (EBUSY);
1524 	}
1525 
1526 	if (flag & SO_FALLBACK) {
1527 		/*
1528 		 * Non streams socket needs a stream to fallback to
1529 		 */
1530 		RD(q)->q_ptr = (void *)conn_dev;
1531 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1532 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1533 		qprocson(q);
1534 		return (0);
1535 	}
1536 
1537 	connp = icmp_open(family, credp, &error, KM_SLEEP);
1538 	if (connp == NULL) {
1539 		ASSERT(error != NULL);
1540 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1541 		return (error);
1542 	}
1543 
1544 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1545 	connp->conn_dev = conn_dev;
1546 	connp->conn_minor_arena = ip_minor_arena_sa;
1547 
1548 	is = connp->conn_icmp->icmp_is;
1549 
1550 	/*
1551 	 * Initialize the icmp_t structure for this stream.
1552 	 */
1553 	q->q_ptr = connp;
1554 	WR(q)->q_ptr = connp;
1555 	connp->conn_rq = q;
1556 	connp->conn_wq = WR(q);
1557 
1558 	if (connp->conn_icmp->icmp_family == AF_INET6) {
1559 		/* Build initial header template for transmit */
1560 		rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
1561 		if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
1562 			rw_exit(&connp->conn_icmp->icmp_rwlock);
1563 			inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1564 			ipcl_conn_destroy(connp);
1565 			return (error);
1566 		}
1567 		rw_exit(&connp->conn_icmp->icmp_rwlock);
1568 	}
1569 
1570 
1571 	q->q_hiwat = is->is_recv_hiwat;
1572 	WR(q)->q_hiwat = is->is_xmit_hiwat;
1573 	WR(q)->q_lowat = is->is_xmit_lowat;
1574 
1575 	qprocson(q);
1576 
1577 	/* Set the Stream head write offset. */
1578 	(void) proto_set_tx_wroff(q, connp,
1579 	    connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
1580 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
1581 
1582 	mutex_enter(&connp->conn_lock);
1583 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1584 	mutex_exit(&connp->conn_lock);
1585 
1586 	return (0);
1587 }
1588 
1589 /* For /dev/icmp4 aka AF_INET open */
1590 static int
1591 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1592 {
1593 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1594 }
1595 
1596 /* For /dev/icmp6 aka AF_INET6 open */
1597 static int
1598 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1599 {
1600 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1601 }
1602 
1603 /*
1604  * This is the open routine for icmp.  It allocates a icmp_t structure for
1605  * the stream and, on the first open of the module, creates an ND table.
1606  */
1607 /* ARGSUSED */
1608 static conn_t *
1609 icmp_open(int family, cred_t *credp, int *err, int flags)
1610 {
1611 	icmp_t	*icmp;
1612 	conn_t *connp;
1613 	zoneid_t zoneid;
1614 	netstack_t *ns;
1615 	icmp_stack_t *is;
1616 	boolean_t isv6 = B_FALSE;
1617 
1618 	*err = secpolicy_net_icmpaccess(credp);
1619 	if (*err != 0)
1620 		return (NULL);
1621 
1622 	if (family == AF_INET6)
1623 		isv6 = B_TRUE;
1624 	ns = netstack_find_by_cred(credp);
1625 	ASSERT(ns != NULL);
1626 	is = ns->netstack_icmp;
1627 	ASSERT(is != NULL);
1628 
1629 	/*
1630 	 * For exclusive stacks we set the zoneid to zero
1631 	 * to make ICMP operate as if in the global zone.
1632 	 */
1633 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1634 		zoneid = GLOBAL_ZONEID;
1635 	else
1636 		zoneid = crgetzoneid(credp);
1637 
1638 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1639 
1640 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1641 	icmp = connp->conn_icmp;
1642 	icmp->icmp_v6dst = sin6_null;
1643 
1644 	/*
1645 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1646 	 * done by netstack_find_by_cred()
1647 	 */
1648 	netstack_rele(ns);
1649 
1650 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1651 	ASSERT(connp->conn_ulp == IPPROTO_ICMP);
1652 	ASSERT(connp->conn_icmp == icmp);
1653 	ASSERT(icmp->icmp_connp == connp);
1654 
1655 	/* Set the initial state of the stream and the privilege status. */
1656 	icmp->icmp_state = TS_UNBND;
1657 	if (isv6) {
1658 		icmp->icmp_ipversion = IPV6_VERSION;
1659 		icmp->icmp_family = AF_INET6;
1660 		connp->conn_ulp = IPPROTO_ICMPV6;
1661 		/* May be changed by a SO_PROTOTYPE socket option. */
1662 		icmp->icmp_proto = IPPROTO_ICMPV6;
1663 		icmp->icmp_checksum_off = 2;	/* Offset for icmp6_cksum */
1664 		icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
1665 		icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
1666 		connp->conn_af_isv6 = B_TRUE;
1667 	} else {
1668 		icmp->icmp_ipversion = IPV4_VERSION;
1669 		icmp->icmp_family = AF_INET;
1670 		/* May be changed by a SO_PROTOTYPE socket option. */
1671 		icmp->icmp_proto = IPPROTO_ICMP;
1672 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
1673 		icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
1674 		connp->conn_af_isv6 = B_FALSE;
1675 	}
1676 	icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1677 	icmp->icmp_pending_op = -1;
1678 	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1679 	connp->conn_zoneid = zoneid;
1680 
1681 	/*
1682 	 * If the caller has the process-wide flag set, then default to MAC
1683 	 * exempt mode.  This allows read-down to unlabeled hosts.
1684 	 */
1685 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1686 		connp->conn_mac_mode = CONN_MAC_AWARE;
1687 
1688 	connp->conn_ulp_labeled = is_system_labeled();
1689 
1690 	icmp->icmp_is = is;
1691 
1692 	connp->conn_recv = icmp_input;
1693 	crhold(credp);
1694 	connp->conn_cred = credp;
1695 
1696 	rw_exit(&icmp->icmp_rwlock);
1697 
1698 	connp->conn_flow_cntrld = B_FALSE;
1699 	return (connp);
1700 }
1701 
1702 /*
1703  * Which ICMP options OK to set through T_UNITDATA_REQ...
1704  */
1705 /* ARGSUSED */
1706 static boolean_t
1707 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1708 {
1709 	return (B_TRUE);
1710 }
1711 
1712 /*
1713  * This routine gets default values of certain options whose default
1714  * values are maintained by protcol specific code
1715  */
1716 /* ARGSUSED */
1717 int
1718 icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
1719 {
1720 	icmp_t *icmp = Q_TO_ICMP(q);
1721 	icmp_stack_t *is = icmp->icmp_is;
1722 	int *i1 = (int *)ptr;
1723 
1724 	switch (level) {
1725 	case IPPROTO_IP:
1726 		switch (name) {
1727 		case IP_MULTICAST_TTL:
1728 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1729 			return (sizeof (uchar_t));
1730 		case IP_MULTICAST_LOOP:
1731 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1732 			return (sizeof (uchar_t));
1733 		}
1734 		break;
1735 	case IPPROTO_IPV6:
1736 		switch (name) {
1737 		case IPV6_MULTICAST_HOPS:
1738 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1739 			return (sizeof (int));
1740 		case IPV6_MULTICAST_LOOP:
1741 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1742 			return (sizeof (int));
1743 		case IPV6_UNICAST_HOPS:
1744 			*i1 = is->is_ipv6_hoplimit;
1745 			return (sizeof (int));
1746 		}
1747 		break;
1748 	case IPPROTO_ICMPV6:
1749 		switch (name) {
1750 		case ICMP6_FILTER:
1751 			/* Make it look like "pass all" */
1752 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1753 			return (sizeof (icmp6_filter_t));
1754 		}
1755 		break;
1756 	}
1757 	return (-1);
1758 }
1759 
1760 /*
1761  * This routine retrieves the current status of socket options.
1762  * It returns the size of the option retrieved.
1763  */
1764 int
1765 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1766 {
1767 	icmp_t		*icmp = connp->conn_icmp;
1768 	icmp_stack_t	*is = icmp->icmp_is;
1769 	int		*i1 = (int *)ptr;
1770 	ip6_pkt_t	*ipp = &icmp->icmp_sticky_ipp;
1771 	int		ret = 0;
1772 
1773 	ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
1774 	switch (level) {
1775 	case SOL_SOCKET:
1776 		switch (name) {
1777 		case SO_DEBUG:
1778 			*i1 = icmp->icmp_debug;
1779 			break;
1780 		case SO_TYPE:
1781 			*i1 = SOCK_RAW;
1782 			break;
1783 		case SO_PROTOTYPE:
1784 			*i1 = icmp->icmp_proto;
1785 			break;
1786 		case SO_REUSEADDR:
1787 			*i1 = icmp->icmp_reuseaddr;
1788 			break;
1789 
1790 		/*
1791 		 * The following three items are available here,
1792 		 * but are only meaningful to IP.
1793 		 */
1794 		case SO_DONTROUTE:
1795 			*i1 = icmp->icmp_dontroute;
1796 			break;
1797 		case SO_USELOOPBACK:
1798 			*i1 = icmp->icmp_useloopback;
1799 			break;
1800 		case SO_BROADCAST:
1801 			*i1 = icmp->icmp_broadcast;
1802 			break;
1803 
1804 		case SO_SNDBUF:
1805 			ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
1806 			*i1 = icmp->icmp_xmit_hiwat;
1807 			break;
1808 		case SO_RCVBUF:
1809 			ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
1810 			*i1 = icmp->icmp_recv_hiwat;
1811 			break;
1812 		case SO_DGRAM_ERRIND:
1813 			*i1 = icmp->icmp_dgram_errind;
1814 			break;
1815 		case SO_TIMESTAMP:
1816 			*i1 = icmp->icmp_timestamp;
1817 			break;
1818 		case SO_MAC_EXEMPT:
1819 			*i1 = (connp->conn_mac_mode == CONN_MAC_AWARE);
1820 			break;
1821 		case SO_MAC_IMPLICIT:
1822 			*i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT);
1823 			break;
1824 		case SO_DOMAIN:
1825 			*i1 = icmp->icmp_family;
1826 			break;
1827 
1828 		/*
1829 		 * Following four not meaningful for icmp
1830 		 * Action is same as "default" to which we fallthrough
1831 		 * so we keep them in comments.
1832 		 * case SO_LINGER:
1833 		 * case SO_KEEPALIVE:
1834 		 * case SO_OOBINLINE:
1835 		 * case SO_ALLZONES:
1836 		 */
1837 		default:
1838 			ret = -1;
1839 			goto done;
1840 		}
1841 		break;
1842 	case IPPROTO_IP:
1843 		/*
1844 		 * Only allow IPv4 option processing on IPv4 sockets.
1845 		 */
1846 		if (icmp->icmp_family != AF_INET) {
1847 			ret = -1;
1848 			goto done;
1849 		}
1850 
1851 		switch (name) {
1852 		case IP_OPTIONS:
1853 		case T_IP_OPTIONS:
1854 			/* Options are passed up with each packet */
1855 			ret = 0;
1856 			goto done;
1857 		case IP_HDRINCL:
1858 			*i1 = (int)icmp->icmp_hdrincl;
1859 			break;
1860 		case IP_TOS:
1861 		case T_IP_TOS:
1862 			*i1 = (int)icmp->icmp_type_of_service;
1863 			break;
1864 		case IP_TTL:
1865 			*i1 = (int)icmp->icmp_ttl;
1866 			break;
1867 		case IP_MULTICAST_IF:
1868 			/* 0 address if not set */
1869 			*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
1870 			ret = sizeof (ipaddr_t);
1871 			goto done;
1872 		case IP_MULTICAST_TTL:
1873 			*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
1874 			ret = sizeof (uchar_t);
1875 			goto done;
1876 		case IP_MULTICAST_LOOP:
1877 			*ptr = connp->conn_multicast_loop;
1878 			ret = sizeof (uint8_t);
1879 			goto done;
1880 		case IP_BOUND_IF:
1881 			/* Zero if not set */
1882 			*i1 = icmp->icmp_bound_if;
1883 			break;	/* goto sizeof (int) option return */
1884 		case IP_UNSPEC_SRC:
1885 			*ptr = icmp->icmp_unspec_source;
1886 			break;	/* goto sizeof (int) option return */
1887 		case IP_RECVIF:
1888 			*ptr = icmp->icmp_recvif;
1889 			break;	/* goto sizeof (int) option return */
1890 		case IP_BROADCAST_TTL:
1891 			*(uchar_t *)ptr = connp->conn_broadcast_ttl;
1892 			return (sizeof (uchar_t));
1893 		case IP_RECVPKTINFO:
1894 			/*
1895 			 * This also handles IP_PKTINFO.
1896 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
1897 			 * Differentiation is based on the size of the argument
1898 			 * passed in.
1899 			 * This option is handled in IP which will return an
1900 			 * error for IP_PKTINFO as it's not supported as a
1901 			 * sticky option.
1902 			 */
1903 			ret = -EINVAL;
1904 			goto done;
1905 		/*
1906 		 * Cannot "get" the value of following options
1907 		 * at this level. Action is same as "default" to
1908 		 * which we fallthrough so we keep them in comments.
1909 		 *
1910 		 * case IP_ADD_MEMBERSHIP:
1911 		 * case IP_DROP_MEMBERSHIP:
1912 		 * case IP_BLOCK_SOURCE:
1913 		 * case IP_UNBLOCK_SOURCE:
1914 		 * case IP_ADD_SOURCE_MEMBERSHIP:
1915 		 * case IP_DROP_SOURCE_MEMBERSHIP:
1916 		 * case MCAST_JOIN_GROUP:
1917 		 * case MCAST_LEAVE_GROUP:
1918 		 * case MCAST_BLOCK_SOURCE:
1919 		 * case MCAST_UNBLOCK_SOURCE:
1920 		 * case MCAST_JOIN_SOURCE_GROUP:
1921 		 * case MCAST_LEAVE_SOURCE_GROUP:
1922 		 * case MRT_INIT:
1923 		 * case MRT_DONE:
1924 		 * case MRT_ADD_VIF:
1925 		 * case MRT_DEL_VIF:
1926 		 * case MRT_ADD_MFC:
1927 		 * case MRT_DEL_MFC:
1928 		 * case MRT_VERSION:
1929 		 * case MRT_ASSERT:
1930 		 * case IP_SEC_OPT:
1931 		 * case IP_NEXTHOP:
1932 		 */
1933 		default:
1934 			ret = -1;
1935 			goto done;
1936 		}
1937 		break;
1938 	case IPPROTO_IPV6:
1939 		/*
1940 		 * Only allow IPv6 option processing on native IPv6 sockets.
1941 		 */
1942 		if (icmp->icmp_family != AF_INET6) {
1943 			ret = -1;
1944 			goto done;
1945 		}
1946 		switch (name) {
1947 		case IPV6_UNICAST_HOPS:
1948 			*i1 = (unsigned int)icmp->icmp_ttl;
1949 			break;
1950 		case IPV6_MULTICAST_IF:
1951 			/* 0 index if not set */
1952 			*i1 = icmp->icmp_multicast_if_index;
1953 			break;
1954 		case IPV6_MULTICAST_HOPS:
1955 			*i1 = icmp->icmp_multicast_ttl;
1956 			break;
1957 		case IPV6_MULTICAST_LOOP:
1958 			*i1 = connp->conn_multicast_loop;
1959 			break;
1960 		case IPV6_BOUND_IF:
1961 			/* Zero if not set */
1962 			*i1 = icmp->icmp_bound_if;
1963 			break;
1964 		case IPV6_UNSPEC_SRC:
1965 			*i1 = icmp->icmp_unspec_source;
1966 			break;
1967 		case IPV6_CHECKSUM:
1968 			/*
1969 			 * Return offset or -1 if no checksum offset.
1970 			 * Does not apply to IPPROTO_ICMPV6
1971 			 */
1972 			if (icmp->icmp_proto == IPPROTO_ICMPV6) {
1973 				ret = -1;
1974 				goto done;
1975 			}
1976 
1977 			if (icmp->icmp_raw_checksum) {
1978 				*i1 = icmp->icmp_checksum_off;
1979 			} else {
1980 				*i1 = -1;
1981 			}
1982 			break;
1983 		case IPV6_JOIN_GROUP:
1984 		case IPV6_LEAVE_GROUP:
1985 		case MCAST_JOIN_GROUP:
1986 		case MCAST_LEAVE_GROUP:
1987 		case MCAST_BLOCK_SOURCE:
1988 		case MCAST_UNBLOCK_SOURCE:
1989 		case MCAST_JOIN_SOURCE_GROUP:
1990 		case MCAST_LEAVE_SOURCE_GROUP:
1991 			/* cannot "get" the value for these */
1992 			ret = -1;
1993 			goto done;
1994 		case IPV6_RECVPKTINFO:
1995 			*i1 = icmp->icmp_ip_recvpktinfo;
1996 			break;
1997 		case IPV6_RECVTCLASS:
1998 			*i1 = icmp->icmp_ipv6_recvtclass;
1999 			break;
2000 		case IPV6_RECVPATHMTU:
2001 			*i1 = icmp->icmp_ipv6_recvpathmtu;
2002 			break;
2003 		case IPV6_V6ONLY:
2004 			*i1 = 1;
2005 			break;
2006 		case IPV6_RECVHOPLIMIT:
2007 			*i1 = icmp->icmp_ipv6_recvhoplimit;
2008 			break;
2009 		case IPV6_RECVHOPOPTS:
2010 			*i1 = icmp->icmp_ipv6_recvhopopts;
2011 			break;
2012 		case IPV6_RECVDSTOPTS:
2013 			*i1 = icmp->icmp_ipv6_recvdstopts;
2014 			break;
2015 		case _OLD_IPV6_RECVDSTOPTS:
2016 			*i1 = icmp->icmp_old_ipv6_recvdstopts;
2017 			break;
2018 		case IPV6_RECVRTHDRDSTOPTS:
2019 			*i1 = icmp->icmp_ipv6_recvrtdstopts;
2020 			break;
2021 		case IPV6_RECVRTHDR:
2022 			*i1 = icmp->icmp_ipv6_recvrthdr;
2023 			break;
2024 		case IPV6_PKTINFO: {
2025 			/* XXX assumes that caller has room for max size! */
2026 			struct in6_pktinfo *pkti;
2027 
2028 			pkti = (struct in6_pktinfo *)ptr;
2029 			if (ipp->ipp_fields & IPPF_IFINDEX)
2030 				pkti->ipi6_ifindex = ipp->ipp_ifindex;
2031 			else
2032 				pkti->ipi6_ifindex = 0;
2033 			if (ipp->ipp_fields & IPPF_ADDR)
2034 				pkti->ipi6_addr = ipp->ipp_addr;
2035 			else
2036 				pkti->ipi6_addr = ipv6_all_zeros;
2037 			ret = sizeof (struct in6_pktinfo);
2038 			goto done;
2039 		}
2040 		case IPV6_NEXTHOP: {
2041 			sin6_t *sin6 = (sin6_t *)ptr;
2042 
2043 			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
2044 				return (0);
2045 			*sin6 = sin6_null;
2046 			sin6->sin6_family = AF_INET6;
2047 			sin6->sin6_addr = ipp->ipp_nexthop;
2048 			ret = (sizeof (sin6_t));
2049 			goto done;
2050 		}
2051 		case IPV6_HOPOPTS:
2052 			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
2053 				return (0);
2054 			if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
2055 				return (0);
2056 			bcopy((char *)ipp->ipp_hopopts +
2057 			    icmp->icmp_label_len_v6, ptr,
2058 			    ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2059 			if (icmp->icmp_label_len_v6 > 0) {
2060 				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
2061 				ptr[1] = (ipp->ipp_hopoptslen -
2062 				    icmp->icmp_label_len_v6 + 7) / 8 - 1;
2063 			}
2064 			ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2065 			goto done;
2066 		case IPV6_RTHDRDSTOPTS:
2067 			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
2068 				return (0);
2069 			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
2070 			ret = ipp->ipp_rtdstoptslen;
2071 			goto done;
2072 		case IPV6_RTHDR:
2073 			if (!(ipp->ipp_fields & IPPF_RTHDR))
2074 				return (0);
2075 			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
2076 			ret = ipp->ipp_rthdrlen;
2077 			goto done;
2078 		case IPV6_DSTOPTS:
2079 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2080 				ret = 0;
2081 				goto done;
2082 			}
2083 			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
2084 			ret = ipp->ipp_dstoptslen;
2085 			goto done;
2086 		case IPV6_PATHMTU:
2087 			if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
2088 				ret = 0;
2089 			} else {
2090 				ret = ip_fill_mtuinfo(
2091 				    &icmp->icmp_v6dst.sin6_addr, 0,
2092 				    (struct ip6_mtuinfo *)ptr,
2093 				    is->is_netstack);
2094 			}
2095 			goto done;
2096 		case IPV6_TCLASS:
2097 			if (ipp->ipp_fields & IPPF_TCLASS)
2098 				*i1 = ipp->ipp_tclass;
2099 			else
2100 				*i1 = IPV6_FLOW_TCLASS(
2101 				    IPV6_DEFAULT_VERS_AND_FLOW);
2102 			break;
2103 		default:
2104 			ret = -1;
2105 			goto done;
2106 		}
2107 		break;
2108 	case IPPROTO_ICMPV6:
2109 		/*
2110 		 * Only allow IPv6 option processing on native IPv6 sockets.
2111 		 */
2112 		if (icmp->icmp_family != AF_INET6) {
2113 			ret = -1;
2114 		}
2115 
2116 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
2117 			ret = -1;
2118 		}
2119 
2120 		switch (name) {
2121 		case ICMP6_FILTER:
2122 			if (icmp->icmp_filter == NULL) {
2123 				/* Make it look like "pass all" */
2124 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
2125 			} else {
2126 				(void) bcopy(icmp->icmp_filter, ptr,
2127 				    sizeof (icmp6_filter_t));
2128 			}
2129 			ret = sizeof (icmp6_filter_t);
2130 			goto done;
2131 		default:
2132 			ret = -1;
2133 			goto done;
2134 		}
2135 	default:
2136 		ret = -1;
2137 		goto done;
2138 	}
2139 	ret = sizeof (int);
2140 done:
2141 	return (ret);
2142 }
2143 
2144 /*
2145  * This routine retrieves the current status of socket options.
2146  * It returns the size of the option retrieved.
2147  */
2148 int
2149 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
2150 {
2151 	conn_t  *connp = Q_TO_CONN(q);
2152 	icmp_t	*icmp = connp->conn_icmp;
2153 	int 	err;
2154 
2155 	rw_enter(&icmp->icmp_rwlock, RW_READER);
2156 	err = icmp_opt_get(connp, level, name, ptr);
2157 	rw_exit(&icmp->icmp_rwlock);
2158 	return (err);
2159 }
2160 
2161 int
2162 icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
2163     uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
2164     void *thisdg_attrs, boolean_t checkonly)
2165 {
2166 
2167 	int	*i1 = (int *)invalp;
2168 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
2169 	icmp_t *icmp = connp->conn_icmp;
2170 	icmp_stack_t *is = icmp->icmp_is;
2171 	int	error;
2172 
2173 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
2174 	/*
2175 	 * For fixed length options, no sanity check
2176 	 * of passed in length is done. It is assumed *_optcom_req()
2177 	 * routines do the right thing.
2178 	 */
2179 	switch (level) {
2180 	case SOL_SOCKET:
2181 		switch (name) {
2182 		case SO_DEBUG:
2183 			if (!checkonly)
2184 				icmp->icmp_debug = onoff;
2185 			break;
2186 		case SO_PROTOTYPE:
2187 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2188 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2189 			    secpolicy_net_rawaccess(cr) != 0) {
2190 				*outlenp = 0;
2191 				return (EACCES);
2192 			}
2193 			/* Can't use IPPROTO_RAW with IPv6 */
2194 			if ((*i1 & 0xFF) == IPPROTO_RAW &&
2195 			    icmp->icmp_family == AF_INET6) {
2196 				*outlenp = 0;
2197 				return (EPROTONOSUPPORT);
2198 			}
2199 			if (checkonly) {
2200 				/* T_CHECK case */
2201 				*(int *)outvalp = (*i1 & 0xFF);
2202 				break;
2203 			}
2204 			icmp->icmp_proto = *i1 & 0xFF;
2205 			if ((icmp->icmp_proto == IPPROTO_RAW ||
2206 			    icmp->icmp_proto == IPPROTO_IGMP) &&
2207 			    icmp->icmp_family == AF_INET)
2208 				icmp->icmp_hdrincl = 1;
2209 			else
2210 				icmp->icmp_hdrincl = 0;
2211 
2212 			if (icmp->icmp_family == AF_INET6 &&
2213 			    icmp->icmp_proto == IPPROTO_ICMPV6) {
2214 				/* Set offset for icmp6_cksum */
2215 				icmp->icmp_raw_checksum = 0;
2216 				icmp->icmp_checksum_off = 2;
2217 			}
2218 			if (icmp->icmp_proto == IPPROTO_UDP ||
2219 			    icmp->icmp_proto == IPPROTO_TCP ||
2220 			    icmp->icmp_proto == IPPROTO_SCTP) {
2221 				icmp->icmp_no_tp_cksum = 1;
2222 				icmp->icmp_sticky_ipp.ipp_fields |=
2223 				    IPPF_NO_CKSUM;
2224 			} else {
2225 				icmp->icmp_no_tp_cksum = 0;
2226 				icmp->icmp_sticky_ipp.ipp_fields &=
2227 				    ~IPPF_NO_CKSUM;
2228 			}
2229 
2230 			if (icmp->icmp_filter != NULL &&
2231 			    icmp->icmp_proto != IPPROTO_ICMPV6) {
2232 				kmem_free(icmp->icmp_filter,
2233 				    sizeof (icmp6_filter_t));
2234 				icmp->icmp_filter = NULL;
2235 			}
2236 
2237 			/* Rebuild the header template */
2238 			error = icmp_build_hdrs(icmp);
2239 			if (error != 0) {
2240 				*outlenp = 0;
2241 				return (error);
2242 			}
2243 
2244 			/*
2245 			 * For SCTP, we don't use icmp_bind_proto() for
2246 			 * raw socket binding.  Note that we do not need
2247 			 * to set *outlenp.
2248 			 * FIXME: how does SCTP work?
2249 			 */
2250 			if (icmp->icmp_proto == IPPROTO_SCTP)
2251 				return (0);
2252 
2253 			*outlenp = sizeof (int);
2254 			*(int *)outvalp = *i1 & 0xFF;
2255 
2256 			/* Drop lock across the bind operation */
2257 			rw_exit(&icmp->icmp_rwlock);
2258 			(void) icmp_bind_proto(connp);
2259 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2260 			return (0);
2261 		case SO_REUSEADDR:
2262 			if (!checkonly) {
2263 				icmp->icmp_reuseaddr = onoff;
2264 				PASS_OPT_TO_IP(connp);
2265 			}
2266 			break;
2267 
2268 		/*
2269 		 * The following three items are available here,
2270 		 * but are only meaningful to IP.
2271 		 */
2272 		case SO_DONTROUTE:
2273 			if (!checkonly) {
2274 				icmp->icmp_dontroute = onoff;
2275 				PASS_OPT_TO_IP(connp);
2276 			}
2277 			break;
2278 		case SO_USELOOPBACK:
2279 			if (!checkonly) {
2280 				icmp->icmp_useloopback = onoff;
2281 				PASS_OPT_TO_IP(connp);
2282 			}
2283 			break;
2284 		case SO_BROADCAST:
2285 			if (!checkonly) {
2286 				icmp->icmp_broadcast = onoff;
2287 				PASS_OPT_TO_IP(connp);
2288 			}
2289 			break;
2290 
2291 		case SO_SNDBUF:
2292 			if (*i1 > is->is_max_buf) {
2293 				*outlenp = 0;
2294 				return (ENOBUFS);
2295 			}
2296 			if (!checkonly) {
2297 				if (!IPCL_IS_NONSTR(connp)) {
2298 					connp->conn_wq->q_hiwat = *i1;
2299 				}
2300 				icmp->icmp_xmit_hiwat = *i1;
2301 			}
2302 			break;
2303 		case SO_RCVBUF:
2304 			if (*i1 > is->is_max_buf) {
2305 				*outlenp = 0;
2306 				return (ENOBUFS);
2307 			}
2308 			if (!checkonly) {
2309 				icmp->icmp_recv_hiwat = *i1;
2310 				rw_exit(&icmp->icmp_rwlock);
2311 				(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2312 				    *i1);
2313 				rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2314 			}
2315 			break;
2316 		case SO_DGRAM_ERRIND:
2317 			if (!checkonly)
2318 				icmp->icmp_dgram_errind = onoff;
2319 			break;
2320 		case SO_ALLZONES:
2321 			/*
2322 			 * "soft" error (negative)
2323 			 * option not handled at this level
2324 			 * Note: Do not modify *outlenp
2325 			 */
2326 			return (-EINVAL);
2327 		case SO_TIMESTAMP:
2328 			if (!checkonly) {
2329 				icmp->icmp_timestamp = onoff;
2330 			}
2331 			break;
2332 		case SO_MAC_EXEMPT:
2333 			/*
2334 			 * "soft" error (negative)
2335 			 * option not handled at this level
2336 			 * Note: Do not modify *outlenp
2337 			 */
2338 			return (-EINVAL);
2339 		case SO_RCVTIMEO:
2340 		case SO_SNDTIMEO:
2341 			/*
2342 			 * Pass these two options in order for third part
2343 			 * protocol usage. Here just return directly.
2344 			 */
2345 			return (0);
2346 		/*
2347 		 * Following three not meaningful for icmp
2348 		 * Action is same as "default" so we keep them
2349 		 * in comments.
2350 		 * case SO_LINGER:
2351 		 * case SO_KEEPALIVE:
2352 		 * case SO_OOBINLINE:
2353 		 */
2354 		default:
2355 			*outlenp = 0;
2356 			return (EINVAL);
2357 		}
2358 		break;
2359 	case IPPROTO_IP:
2360 		/*
2361 		 * Only allow IPv4 option processing on IPv4 sockets.
2362 		 */
2363 		if (icmp->icmp_family != AF_INET) {
2364 			*outlenp = 0;
2365 			return (ENOPROTOOPT);
2366 		}
2367 		switch (name) {
2368 		case IP_OPTIONS:
2369 		case T_IP_OPTIONS:
2370 			/* Save options for use by IP. */
2371 			if ((inlen & 0x3) ||
2372 			    inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
2373 				*outlenp = 0;
2374 				return (EINVAL);
2375 			}
2376 			if (checkonly)
2377 				break;
2378 
2379 			if (!tsol_option_set(&icmp->icmp_ip_snd_options,
2380 			    &icmp->icmp_ip_snd_options_len,
2381 			    icmp->icmp_label_len, invalp, inlen)) {
2382 				*outlenp = 0;
2383 				return (ENOMEM);
2384 			}
2385 
2386 			icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
2387 			    icmp->icmp_ip_snd_options_len;
2388 			rw_exit(&icmp->icmp_rwlock);
2389 			(void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
2390 			    RD(connp->conn_rq), connp,
2391 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
2392 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2393 			break;
2394 		case IP_HDRINCL:
2395 			if (!checkonly)
2396 				icmp->icmp_hdrincl = onoff;
2397 			break;
2398 		case IP_TOS:
2399 		case T_IP_TOS:
2400 			if (!checkonly) {
2401 				icmp->icmp_type_of_service = (uint8_t)*i1;
2402 			}
2403 			break;
2404 		case IP_TTL:
2405 			if (!checkonly) {
2406 				icmp->icmp_ttl = (uint8_t)*i1;
2407 			}
2408 			break;
2409 		case IP_MULTICAST_IF:
2410 			/*
2411 			 * TODO should check OPTMGMT reply and undo this if
2412 			 * there is an error.
2413 			 */
2414 			if (!checkonly) {
2415 				icmp->icmp_multicast_if_addr = *i1;
2416 				PASS_OPT_TO_IP(connp);
2417 			}
2418 			break;
2419 		case IP_MULTICAST_TTL:
2420 			if (!checkonly)
2421 				icmp->icmp_multicast_ttl = *invalp;
2422 			break;
2423 		case IP_MULTICAST_LOOP:
2424 			if (!checkonly) {
2425 				connp->conn_multicast_loop =
2426 				    (*invalp == 0) ? 0 : 1;
2427 				PASS_OPT_TO_IP(connp);
2428 			}
2429 			break;
2430 		case IP_BOUND_IF:
2431 			if (!checkonly) {
2432 				icmp->icmp_bound_if = *i1;
2433 				PASS_OPT_TO_IP(connp);
2434 			}
2435 			break;
2436 		case IP_UNSPEC_SRC:
2437 			if (!checkonly) {
2438 				icmp->icmp_unspec_source = onoff;
2439 				PASS_OPT_TO_IP(connp);
2440 			}
2441 			break;
2442 		case IP_BROADCAST_TTL:
2443 			if (!checkonly)
2444 				connp->conn_broadcast_ttl = *invalp;
2445 			break;
2446 		case IP_RECVIF:
2447 			if (!checkonly) {
2448 				icmp->icmp_recvif = onoff;
2449 			}
2450 			/*
2451 			 * pass to ip
2452 			 */
2453 			return (-EINVAL);
2454 		case IP_PKTINFO: {
2455 			/*
2456 			 * This also handles IP_RECVPKTINFO.
2457 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
2458 			 * Differentiation is based on the size of the argument
2459 			 * passed in.
2460 			 */
2461 			struct in_pktinfo *pktinfop;
2462 			ip4_pkt_t *attr_pktinfop;
2463 
2464 			if (checkonly)
2465 				break;
2466 
2467 			if (inlen == sizeof (int)) {
2468 				/*
2469 				 * This is IP_RECVPKTINFO option.
2470 				 * Keep a local copy of wether this option is
2471 				 * set or not and pass it down to IP for
2472 				 * processing.
2473 				 */
2474 				icmp->icmp_ip_recvpktinfo = onoff;
2475 				return (-EINVAL);
2476 			}
2477 
2478 
2479 			if (inlen != sizeof (struct in_pktinfo)) {
2480 				return (EINVAL);
2481 			}
2482 
2483 			if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
2484 			    == NULL) {
2485 				/*
2486 				 * sticky option is not supported
2487 				 */
2488 				return (EINVAL);
2489 			}
2490 
2491 			pktinfop = (struct in_pktinfo *)invalp;
2492 
2493 			/*
2494 			 * Atleast one of the values should be specified
2495 			 */
2496 			if (pktinfop->ipi_ifindex == 0 &&
2497 			    pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
2498 				return (EINVAL);
2499 			}
2500 
2501 			attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
2502 			attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
2503 		}
2504 			break;
2505 		case IP_ADD_MEMBERSHIP:
2506 		case IP_DROP_MEMBERSHIP:
2507 		case IP_BLOCK_SOURCE:
2508 		case IP_UNBLOCK_SOURCE:
2509 		case IP_ADD_SOURCE_MEMBERSHIP:
2510 		case IP_DROP_SOURCE_MEMBERSHIP:
2511 		case MCAST_JOIN_GROUP:
2512 		case MCAST_LEAVE_GROUP:
2513 		case MCAST_BLOCK_SOURCE:
2514 		case MCAST_UNBLOCK_SOURCE:
2515 		case MCAST_JOIN_SOURCE_GROUP:
2516 		case MCAST_LEAVE_SOURCE_GROUP:
2517 		case MRT_INIT:
2518 		case MRT_DONE:
2519 		case MRT_ADD_VIF:
2520 		case MRT_DEL_VIF:
2521 		case MRT_ADD_MFC:
2522 		case MRT_DEL_MFC:
2523 		case MRT_VERSION:
2524 		case MRT_ASSERT:
2525 		case IP_SEC_OPT:
2526 		case IP_NEXTHOP:
2527 			/*
2528 			 * "soft" error (negative)
2529 			 * option not handled at this level
2530 			 * Note: Do not modify *outlenp
2531 			 */
2532 			return (-EINVAL);
2533 		default:
2534 			*outlenp = 0;
2535 			return (EINVAL);
2536 		}
2537 		break;
2538 	case IPPROTO_IPV6: {
2539 		ip6_pkt_t		*ipp;
2540 		boolean_t		sticky;
2541 
2542 		if (icmp->icmp_family != AF_INET6) {
2543 			*outlenp = 0;
2544 			return (ENOPROTOOPT);
2545 		}
2546 		/*
2547 		 * Deal with both sticky options and ancillary data
2548 		 */
2549 		if (thisdg_attrs == NULL) {
2550 			/* sticky options, or none */
2551 			ipp = &icmp->icmp_sticky_ipp;
2552 			sticky = B_TRUE;
2553 		} else {
2554 			/* ancillary data */
2555 			ipp = (ip6_pkt_t *)thisdg_attrs;
2556 			sticky = B_FALSE;
2557 		}
2558 
2559 		switch (name) {
2560 		case IPV6_MULTICAST_IF:
2561 			if (!checkonly) {
2562 				icmp->icmp_multicast_if_index = *i1;
2563 				PASS_OPT_TO_IP(connp);
2564 			}
2565 			break;
2566 		case IPV6_UNICAST_HOPS:
2567 			/* -1 means use default */
2568 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2569 				*outlenp = 0;
2570 				return (EINVAL);
2571 			}
2572 			if (!checkonly) {
2573 				if (*i1 == -1) {
2574 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2575 					    is->is_ipv6_hoplimit;
2576 					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
2577 					/* Pass modified value to IP. */
2578 					*i1 = ipp->ipp_hoplimit;
2579 				} else {
2580 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2581 					    (uint8_t)*i1;
2582 					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
2583 				}
2584 				/* Rebuild the header template */
2585 				error = icmp_build_hdrs(icmp);
2586 				if (error != 0) {
2587 					*outlenp = 0;
2588 					return (error);
2589 				}
2590 			}
2591 			break;
2592 		case IPV6_MULTICAST_HOPS:
2593 			/* -1 means use default */
2594 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2595 				*outlenp = 0;
2596 				return (EINVAL);
2597 			}
2598 			if (!checkonly) {
2599 				if (*i1 == -1) {
2600 					icmp->icmp_multicast_ttl =
2601 					    ipp->ipp_multicast_hops =
2602 					    IP_DEFAULT_MULTICAST_TTL;
2603 					ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
2604 					/* Pass modified value to IP. */
2605 					*i1 = icmp->icmp_multicast_ttl;
2606 				} else {
2607 					icmp->icmp_multicast_ttl =
2608 					    ipp->ipp_multicast_hops =
2609 					    (uint8_t)*i1;
2610 					ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
2611 				}
2612 			}
2613 			break;
2614 		case IPV6_MULTICAST_LOOP:
2615 			if (*i1 != 0 && *i1 != 1) {
2616 				*outlenp = 0;
2617 				return (EINVAL);
2618 			}
2619 			if (!checkonly) {
2620 				connp->conn_multicast_loop = *i1;
2621 				PASS_OPT_TO_IP(connp);
2622 			}
2623 			break;
2624 		case IPV6_CHECKSUM:
2625 			/*
2626 			 * Integer offset into the user data of where the
2627 			 * checksum is located.
2628 			 * Offset of -1 disables option.
2629 			 * Does not apply to IPPROTO_ICMPV6.
2630 			 */
2631 			if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
2632 				*outlenp = 0;
2633 				return (EINVAL);
2634 			}
2635 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2636 				/* Negative or not 16 bit aligned offset */
2637 				*outlenp = 0;
2638 				return (EINVAL);
2639 			}
2640 			if (checkonly)
2641 				break;
2642 
2643 			if (*i1 == -1) {
2644 				icmp->icmp_raw_checksum = 0;
2645 				ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
2646 			} else {
2647 				icmp->icmp_raw_checksum = 1;
2648 				icmp->icmp_checksum_off = *i1;
2649 				ipp->ipp_fields |= IPPF_RAW_CKSUM;
2650 			}
2651 			/* Rebuild the header template */
2652 			error = icmp_build_hdrs(icmp);
2653 			if (error != 0) {
2654 				*outlenp = 0;
2655 				return (error);
2656 			}
2657 			break;
2658 		case IPV6_JOIN_GROUP:
2659 		case IPV6_LEAVE_GROUP:
2660 		case MCAST_JOIN_GROUP:
2661 		case MCAST_LEAVE_GROUP:
2662 		case MCAST_BLOCK_SOURCE:
2663 		case MCAST_UNBLOCK_SOURCE:
2664 		case MCAST_JOIN_SOURCE_GROUP:
2665 		case MCAST_LEAVE_SOURCE_GROUP:
2666 			/*
2667 			 * "soft" error (negative)
2668 			 * option not handled at this level
2669 			 * Note: Do not modify *outlenp
2670 			 */
2671 			return (-EINVAL);
2672 		case IPV6_BOUND_IF:
2673 			if (!checkonly) {
2674 				icmp->icmp_bound_if = *i1;
2675 				PASS_OPT_TO_IP(connp);
2676 			}
2677 			break;
2678 		case IPV6_UNSPEC_SRC:
2679 			if (!checkonly) {
2680 				icmp->icmp_unspec_source = onoff;
2681 				PASS_OPT_TO_IP(connp);
2682 			}
2683 			break;
2684 		case IPV6_RECVTCLASS:
2685 			if (!checkonly) {
2686 				icmp->icmp_ipv6_recvtclass = onoff;
2687 				PASS_OPT_TO_IP(connp);
2688 			}
2689 			break;
2690 		/*
2691 		 * Set boolean switches for ancillary data delivery
2692 		 */
2693 		case IPV6_RECVPKTINFO:
2694 			if (!checkonly) {
2695 				icmp->icmp_ip_recvpktinfo = onoff;
2696 				PASS_OPT_TO_IP(connp);
2697 			}
2698 			break;
2699 		case IPV6_RECVPATHMTU:
2700 			if (!checkonly) {
2701 				icmp->icmp_ipv6_recvpathmtu = onoff;
2702 				PASS_OPT_TO_IP(connp);
2703 			}
2704 			break;
2705 		case IPV6_RECVHOPLIMIT:
2706 			if (!checkonly) {
2707 				icmp->icmp_ipv6_recvhoplimit = onoff;
2708 				PASS_OPT_TO_IP(connp);
2709 			}
2710 			break;
2711 		case IPV6_RECVHOPOPTS:
2712 			if (!checkonly) {
2713 				icmp->icmp_ipv6_recvhopopts = onoff;
2714 				PASS_OPT_TO_IP(connp);
2715 			}
2716 			break;
2717 		case IPV6_RECVDSTOPTS:
2718 			if (!checkonly) {
2719 				icmp->icmp_ipv6_recvdstopts = onoff;
2720 				PASS_OPT_TO_IP(connp);
2721 			}
2722 			break;
2723 		case _OLD_IPV6_RECVDSTOPTS:
2724 			if (!checkonly)
2725 				icmp->icmp_old_ipv6_recvdstopts = onoff;
2726 			break;
2727 		case IPV6_RECVRTHDRDSTOPTS:
2728 			if (!checkonly) {
2729 				icmp->icmp_ipv6_recvrtdstopts = onoff;
2730 				PASS_OPT_TO_IP(connp);
2731 			}
2732 			break;
2733 		case IPV6_RECVRTHDR:
2734 			if (!checkonly) {
2735 				icmp->icmp_ipv6_recvrthdr = onoff;
2736 				PASS_OPT_TO_IP(connp);
2737 			}
2738 			break;
2739 		/*
2740 		 * Set sticky options or ancillary data.
2741 		 * If sticky options, (re)build any extension headers
2742 		 * that might be needed as a result.
2743 		 */
2744 		case IPV6_PKTINFO:
2745 			/*
2746 			 * The source address and ifindex are verified
2747 			 * in ip_opt_set(). For ancillary data the
2748 			 * source address is checked in ip_wput_v6.
2749 			 */
2750 			if (inlen != 0 && inlen !=
2751 			    sizeof (struct in6_pktinfo)) {
2752 				return (EINVAL);
2753 			}
2754 			if (checkonly)
2755 				break;
2756 
2757 			if (inlen == 0) {
2758 				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
2759 				ipp->ipp_sticky_ignored |=
2760 				    (IPPF_IFINDEX|IPPF_ADDR);
2761 			} else {
2762 				struct in6_pktinfo *pkti;
2763 
2764 				pkti = (struct in6_pktinfo *)invalp;
2765 				ipp->ipp_ifindex = pkti->ipi6_ifindex;
2766 				ipp->ipp_addr = pkti->ipi6_addr;
2767 				if (ipp->ipp_ifindex != 0)
2768 					ipp->ipp_fields |= IPPF_IFINDEX;
2769 				else
2770 					ipp->ipp_fields &= ~IPPF_IFINDEX;
2771 				if (!IN6_IS_ADDR_UNSPECIFIED(
2772 				    &ipp->ipp_addr))
2773 					ipp->ipp_fields |= IPPF_ADDR;
2774 				else
2775 					ipp->ipp_fields &= ~IPPF_ADDR;
2776 			}
2777 			if (sticky) {
2778 				error = icmp_build_hdrs(icmp);
2779 				if (error != 0)
2780 					return (error);
2781 				PASS_OPT_TO_IP(connp);
2782 			}
2783 			break;
2784 		case IPV6_HOPLIMIT:
2785 			/* This option can only be used as ancillary data. */
2786 			if (sticky)
2787 				return (EINVAL);
2788 			if (inlen != 0 && inlen != sizeof (int))
2789 				return (EINVAL);
2790 			if (checkonly)
2791 				break;
2792 
2793 			if (inlen == 0) {
2794 				ipp->ipp_fields &= ~IPPF_HOPLIMIT;
2795 				ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
2796 			} else {
2797 				if (*i1 > 255 || *i1 < -1)
2798 					return (EINVAL);
2799 				if (*i1 == -1)
2800 					ipp->ipp_hoplimit =
2801 					    is->is_ipv6_hoplimit;
2802 				else
2803 					ipp->ipp_hoplimit = *i1;
2804 				ipp->ipp_fields |= IPPF_HOPLIMIT;
2805 			}
2806 			break;
2807 		case IPV6_TCLASS:
2808 			/*
2809 			 * IPV6_RECVTCLASS accepts -1 as use kernel default
2810 			 * and [0, 255] as the actualy traffic class.
2811 			 */
2812 			if (inlen != 0 && inlen != sizeof (int)) {
2813 				return (EINVAL);
2814 			}
2815 			if (checkonly)
2816 				break;
2817 
2818 			if (inlen == 0) {
2819 				ipp->ipp_fields &= ~IPPF_TCLASS;
2820 				ipp->ipp_sticky_ignored |= IPPF_TCLASS;
2821 			} else {
2822 				if (*i1 >= 256 || *i1 < -1)
2823 					return (EINVAL);
2824 				if (*i1 == -1) {
2825 					ipp->ipp_tclass =
2826 					    IPV6_FLOW_TCLASS(
2827 					    IPV6_DEFAULT_VERS_AND_FLOW);
2828 				} else {
2829 					ipp->ipp_tclass = *i1;
2830 				}
2831 				ipp->ipp_fields |= IPPF_TCLASS;
2832 			}
2833 			if (sticky) {
2834 				error = icmp_build_hdrs(icmp);
2835 				if (error != 0)
2836 					return (error);
2837 			}
2838 			break;
2839 		case IPV6_NEXTHOP:
2840 			/*
2841 			 * IP will verify that the nexthop is reachable
2842 			 * and fail for sticky options.
2843 			 */
2844 			if (inlen != 0 && inlen != sizeof (sin6_t)) {
2845 				return (EINVAL);
2846 			}
2847 			if (checkonly)
2848 				break;
2849 
2850 			if (inlen == 0) {
2851 				ipp->ipp_fields &= ~IPPF_NEXTHOP;
2852 				ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
2853 			} else {
2854 				sin6_t *sin6 = (sin6_t *)invalp;
2855 
2856 				if (sin6->sin6_family != AF_INET6) {
2857 					return (EAFNOSUPPORT);
2858 				}
2859 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
2860 					return (EADDRNOTAVAIL);
2861 				}
2862 				ipp->ipp_nexthop = sin6->sin6_addr;
2863 				if (!IN6_IS_ADDR_UNSPECIFIED(
2864 				    &ipp->ipp_nexthop))
2865 					ipp->ipp_fields |= IPPF_NEXTHOP;
2866 				else
2867 					ipp->ipp_fields &= ~IPPF_NEXTHOP;
2868 			}
2869 			if (sticky) {
2870 				error = icmp_build_hdrs(icmp);
2871 				if (error != 0)
2872 					return (error);
2873 				PASS_OPT_TO_IP(connp);
2874 			}
2875 			break;
2876 		case IPV6_HOPOPTS: {
2877 			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
2878 			/*
2879 			 * Sanity checks - minimum size, size a multiple of
2880 			 * eight bytes, and matching size passed in.
2881 			 */
2882 			if (inlen != 0 &&
2883 			    inlen != (8 * (hopts->ip6h_len + 1))) {
2884 				return (EINVAL);
2885 			}
2886 
2887 			if (checkonly)
2888 				break;
2889 			error = optcom_pkt_set(invalp, inlen, sticky,
2890 			    (uchar_t **)&ipp->ipp_hopopts,
2891 			    &ipp->ipp_hopoptslen,
2892 			    sticky ? icmp->icmp_label_len_v6 : 0);
2893 			if (error != 0)
2894 				return (error);
2895 			if (ipp->ipp_hopoptslen == 0) {
2896 				ipp->ipp_fields &= ~IPPF_HOPOPTS;
2897 				ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
2898 			} else {
2899 				ipp->ipp_fields |= IPPF_HOPOPTS;
2900 			}
2901 			if (sticky) {
2902 				error = icmp_build_hdrs(icmp);
2903 				if (error != 0)
2904 					return (error);
2905 			}
2906 			break;
2907 		}
2908 		case IPV6_RTHDRDSTOPTS: {
2909 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2910 
2911 			/*
2912 			 * Sanity checks - minimum size, size a multiple of
2913 			 * eight bytes, and matching size passed in.
2914 			 */
2915 			if (inlen != 0 &&
2916 			    inlen != (8 * (dopts->ip6d_len + 1)))
2917 				return (EINVAL);
2918 
2919 			if (checkonly)
2920 				break;
2921 
2922 			if (inlen == 0) {
2923 				if (sticky &&
2924 				    (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
2925 					kmem_free(ipp->ipp_rtdstopts,
2926 					    ipp->ipp_rtdstoptslen);
2927 					ipp->ipp_rtdstopts = NULL;
2928 					ipp->ipp_rtdstoptslen = 0;
2929 				}
2930 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
2931 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
2932 			} else {
2933 				error = optcom_pkt_set(invalp, inlen, sticky,
2934 				    (uchar_t **)&ipp->ipp_rtdstopts,
2935 				    &ipp->ipp_rtdstoptslen, 0);
2936 				if (error != 0)
2937 					return (error);
2938 				ipp->ipp_fields |= IPPF_RTDSTOPTS;
2939 			}
2940 			if (sticky) {
2941 				error = icmp_build_hdrs(icmp);
2942 				if (error != 0)
2943 					return (error);
2944 			}
2945 			break;
2946 		}
2947 		case IPV6_DSTOPTS: {
2948 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2949 
2950 			/*
2951 			 * Sanity checks - minimum size, size a multiple of
2952 			 * eight bytes, and matching size passed in.
2953 			 */
2954 			if (inlen != 0 &&
2955 			    inlen != (8 * (dopts->ip6d_len + 1)))
2956 				return (EINVAL);
2957 
2958 			if (checkonly)
2959 				break;
2960 
2961 			if (inlen == 0) {
2962 				if (sticky &&
2963 				    (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
2964 					kmem_free(ipp->ipp_dstopts,
2965 					    ipp->ipp_dstoptslen);
2966 					ipp->ipp_dstopts = NULL;
2967 					ipp->ipp_dstoptslen = 0;
2968 				}
2969 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2970 				ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
2971 			} else {
2972 				error = optcom_pkt_set(invalp, inlen, sticky,
2973 				    (uchar_t **)&ipp->ipp_dstopts,
2974 				    &ipp->ipp_dstoptslen, 0);
2975 				if (error != 0)
2976 					return (error);
2977 				ipp->ipp_fields |= IPPF_DSTOPTS;
2978 			}
2979 			if (sticky) {
2980 				error = icmp_build_hdrs(icmp);
2981 				if (error != 0)
2982 					return (error);
2983 			}
2984 			break;
2985 		}
2986 		case IPV6_RTHDR: {
2987 			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
2988 
2989 			/*
2990 			 * Sanity checks - minimum size, size a multiple of
2991 			 * eight bytes, and matching size passed in.
2992 			 */
2993 			if (inlen != 0 &&
2994 			    inlen != (8 * (rt->ip6r_len + 1)))
2995 				return (EINVAL);
2996 
2997 			if (checkonly)
2998 				break;
2999 
3000 			if (inlen == 0) {
3001 				if (sticky &&
3002 				    (ipp->ipp_fields & IPPF_RTHDR) != 0) {
3003 					kmem_free(ipp->ipp_rthdr,
3004 					    ipp->ipp_rthdrlen);
3005 					ipp->ipp_rthdr = NULL;
3006 					ipp->ipp_rthdrlen = 0;
3007 				}
3008 				ipp->ipp_fields &= ~IPPF_RTHDR;
3009 				ipp->ipp_sticky_ignored |= IPPF_RTHDR;
3010 			} else {
3011 				error = optcom_pkt_set(invalp, inlen, sticky,
3012 				    (uchar_t **)&ipp->ipp_rthdr,
3013 				    &ipp->ipp_rthdrlen, 0);
3014 				if (error != 0)
3015 					return (error);
3016 				ipp->ipp_fields |= IPPF_RTHDR;
3017 			}
3018 			if (sticky) {
3019 				error = icmp_build_hdrs(icmp);
3020 				if (error != 0)
3021 					return (error);
3022 			}
3023 			break;
3024 		}
3025 
3026 		case IPV6_DONTFRAG:
3027 			if (checkonly)
3028 				break;
3029 
3030 			if (onoff) {
3031 				ipp->ipp_fields |= IPPF_DONTFRAG;
3032 			} else {
3033 				ipp->ipp_fields &= ~IPPF_DONTFRAG;
3034 			}
3035 			break;
3036 
3037 		case IPV6_USE_MIN_MTU:
3038 			if (inlen != sizeof (int))
3039 				return (EINVAL);
3040 
3041 			if (*i1 < -1 || *i1 > 1)
3042 				return (EINVAL);
3043 
3044 			if (checkonly)
3045 				break;
3046 
3047 			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
3048 			ipp->ipp_use_min_mtu = *i1;
3049 			break;
3050 
3051 		/*
3052 		 * This option can't be set.  Its only returned via
3053 		 * getsockopt() or ancillary data.
3054 		 */
3055 		case IPV6_PATHMTU:
3056 			return (EINVAL);
3057 
3058 		case IPV6_SEC_OPT:
3059 		case IPV6_SRC_PREFERENCES:
3060 		case IPV6_V6ONLY:
3061 			/* Handled at IP level */
3062 			return (-EINVAL);
3063 		default:
3064 			*outlenp = 0;
3065 			return (EINVAL);
3066 		}
3067 		break;
3068 	}		/* end IPPROTO_IPV6 */
3069 
3070 	case IPPROTO_ICMPV6:
3071 		/*
3072 		 * Only allow IPv6 option processing on IPv6 sockets.
3073 		 */
3074 		if (icmp->icmp_family != AF_INET6) {
3075 			*outlenp = 0;
3076 			return (ENOPROTOOPT);
3077 		}
3078 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
3079 			*outlenp = 0;
3080 			return (ENOPROTOOPT);
3081 		}
3082 		switch (name) {
3083 		case ICMP6_FILTER:
3084 			if (!checkonly) {
3085 				if ((inlen != 0) &&
3086 				    (inlen != sizeof (icmp6_filter_t)))
3087 					return (EINVAL);
3088 
3089 				if (inlen == 0) {
3090 					if (icmp->icmp_filter != NULL) {
3091 						kmem_free(icmp->icmp_filter,
3092 						    sizeof (icmp6_filter_t));
3093 						icmp->icmp_filter = NULL;
3094 					}
3095 				} else {
3096 					if (icmp->icmp_filter == NULL) {
3097 						icmp->icmp_filter = kmem_alloc(
3098 						    sizeof (icmp6_filter_t),
3099 						    KM_NOSLEEP);
3100 						if (icmp->icmp_filter == NULL) {
3101 							*outlenp = 0;
3102 							return (ENOBUFS);
3103 						}
3104 					}
3105 					(void) bcopy(invalp, icmp->icmp_filter,
3106 					    inlen);
3107 				}
3108 			}
3109 			break;
3110 
3111 		default:
3112 			*outlenp = 0;
3113 			return (EINVAL);
3114 		}
3115 		break;
3116 	default:
3117 		*outlenp = 0;
3118 		return (EINVAL);
3119 	}
3120 	/*
3121 	 * Common case of OK return with outval same as inval.
3122 	 */
3123 	if (invalp != outvalp) {
3124 		/* don't trust bcopy for identical src/dst */
3125 		(void) bcopy(invalp, outvalp, inlen);
3126 	}
3127 	*outlenp = inlen;
3128 	return (0);
3129 }
3130 
3131 /* This routine sets socket options. */
3132 /* ARGSUSED */
3133 int
3134 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
3135     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3136     void *thisdg_attrs, cred_t *cr)
3137 {
3138 	boolean_t checkonly;
3139 	int	error;
3140 
3141 	error = 0;
3142 	switch (optset_context) {
3143 	case SETFN_OPTCOM_CHECKONLY:
3144 		checkonly = B_TRUE;
3145 		/*
3146 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
3147 		 * inlen != 0 implies value supplied and
3148 		 * 	we have to "pretend" to set it.
3149 		 * inlen == 0 implies that there is no
3150 		 * 	value part in T_CHECK request and just validation
3151 		 * done elsewhere should be enough, we just return here.
3152 		 */
3153 		if (inlen == 0) {
3154 			*outlenp = 0;
3155 			error = 0;
3156 			goto done;
3157 		}
3158 		break;
3159 	case SETFN_OPTCOM_NEGOTIATE:
3160 		checkonly = B_FALSE;
3161 		break;
3162 	case SETFN_UD_NEGOTIATE:
3163 	case SETFN_CONN_NEGOTIATE:
3164 		checkonly = B_FALSE;
3165 		/*
3166 		 * Negotiating local and "association-related" options
3167 		 * through T_UNITDATA_REQ.
3168 		 *
3169 		 * Following routine can filter out ones we do not
3170 		 * want to be "set" this way.
3171 		 */
3172 		if (!icmp_opt_allow_udr_set(level, name)) {
3173 			*outlenp = 0;
3174 			error = EINVAL;
3175 			goto done;
3176 		}
3177 		break;
3178 	default:
3179 		/*
3180 		 * We should never get here
3181 		 */
3182 		*outlenp = 0;
3183 		error = EINVAL;
3184 		goto done;
3185 	}
3186 
3187 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
3188 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
3189 	error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
3190 	    outvalp, cr, thisdg_attrs, checkonly);
3191 
3192 done:
3193 	return (error);
3194 }
3195 
3196 /* This routine sets socket options. */
3197 /* ARGSUSED */
3198 int
3199 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
3200     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3201     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
3202 {
3203 	conn_t	*connp =  Q_TO_CONN(q);
3204 	icmp_t	*icmp;
3205 	int error;
3206 
3207 	icmp = connp->conn_icmp;
3208 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3209 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
3210 	    outlenp, outvalp, thisdg_attrs, cr);
3211 	rw_exit(&icmp->icmp_rwlock);
3212 	return (error);
3213 }
3214 
3215 /*
3216  * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
3217  * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
3218  * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
3219  * headers.
3220  * Returns failure if can't allocate memory.
3221  */
3222 static int
3223 icmp_build_hdrs(icmp_t *icmp)
3224 {
3225 	icmp_stack_t *is = icmp->icmp_is;
3226 	uchar_t	*hdrs;
3227 	uint_t	hdrs_len;
3228 	ip6_t	*ip6h;
3229 	ip6i_t	*ip6i;
3230 	ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
3231 
3232 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
3233 	hdrs_len = ip_total_hdrs_len_v6(ipp);
3234 	ASSERT(hdrs_len != 0);
3235 	if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
3236 		/* Need to reallocate */
3237 		if (hdrs_len != 0) {
3238 			hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
3239 			if (hdrs == NULL)
3240 				return (ENOMEM);
3241 		} else {
3242 			hdrs = NULL;
3243 		}
3244 		if (icmp->icmp_sticky_hdrs_len != 0) {
3245 			kmem_free(icmp->icmp_sticky_hdrs,
3246 			    icmp->icmp_sticky_hdrs_len);
3247 		}
3248 		icmp->icmp_sticky_hdrs = hdrs;
3249 		icmp->icmp_sticky_hdrs_len = hdrs_len;
3250 	}
3251 	ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
3252 	    icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
3253 
3254 	/* Set header fields not in ipp */
3255 	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
3256 		ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
3257 		ip6h = (ip6_t *)&ip6i[1];
3258 
3259 		if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
3260 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
3261 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
3262 		}
3263 		if (ipp->ipp_fields & IPPF_NO_CKSUM) {
3264 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
3265 		}
3266 	} else {
3267 		ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
3268 	}
3269 
3270 	if (!(ipp->ipp_fields & IPPF_ADDR))
3271 		ip6h->ip6_src = icmp->icmp_v6src;
3272 
3273 	/* Try to get everything in a single mblk */
3274 	if (hdrs_len > icmp->icmp_max_hdr_len) {
3275 		icmp->icmp_max_hdr_len = hdrs_len;
3276 		rw_exit(&icmp->icmp_rwlock);
3277 		(void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
3278 		    icmp->icmp_connp,
3279 		    icmp->icmp_max_hdr_len + is->is_wroff_extra);
3280 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3281 	}
3282 	return (0);
3283 }
3284 
3285 /*
3286  * This routine retrieves the value of an ND variable in a icmpparam_t
3287  * structure.  It is called through nd_getset when a user reads the
3288  * variable.
3289  */
3290 /* ARGSUSED */
3291 static int
3292 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
3293 {
3294 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3295 
3296 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
3297 	return (0);
3298 }
3299 
3300 /*
3301  * Walk through the param array specified registering each element with the
3302  * named dispatch (ND) handler.
3303  */
3304 static boolean_t
3305 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
3306 {
3307 	for (; cnt-- > 0; icmppa++) {
3308 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
3309 			if (!nd_load(ndp, icmppa->icmp_param_name,
3310 			    icmp_param_get, icmp_param_set,
3311 			    (caddr_t)icmppa)) {
3312 				nd_free(ndp);
3313 				return (B_FALSE);
3314 			}
3315 		}
3316 	}
3317 	return (B_TRUE);
3318 }
3319 
3320 /* This routine sets an ND variable in a icmpparam_t structure. */
3321 /* ARGSUSED */
3322 static int
3323 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
3324 {
3325 	long		new_value;
3326 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3327 
3328 	/*
3329 	 * Fail the request if the new value does not lie within the
3330 	 * required bounds.
3331 	 */
3332 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
3333 	    new_value < icmppa->icmp_param_min ||
3334 	    new_value > icmppa->icmp_param_max) {
3335 		return (EINVAL);
3336 	}
3337 	/* Set the new value */
3338 	icmppa->icmp_param_value = new_value;
3339 	return (0);
3340 }
3341 
3342 static mblk_t *
3343 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
3344 {
3345 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
3346 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
3347 		/*
3348 		 * fallback has started but messages have not been moved yet
3349 		 */
3350 		if (icmp->icmp_fallback_queue_head == NULL) {
3351 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
3352 			icmp->icmp_fallback_queue_head = mp;
3353 			icmp->icmp_fallback_queue_tail = mp;
3354 		} else {
3355 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
3356 			icmp->icmp_fallback_queue_tail->b_next = mp;
3357 			icmp->icmp_fallback_queue_tail = mp;
3358 		}
3359 		return (NULL);
3360 	} else {
3361 		/*
3362 		 * Fallback completed, let the caller putnext() the mblk.
3363 		 */
3364 		return (mp);
3365 	}
3366 }
3367 
3368 /*
3369  * Deliver data to ULP. In case we have a socket, and it's falling back to
3370  * TPI, then we'll queue the mp for later processing.
3371  */
3372 static void
3373 icmp_ulp_recv(conn_t *connp, mblk_t *mp)
3374 {
3375 
3376 	if (IPCL_IS_NONSTR(connp)) {
3377 		icmp_t *icmp = connp->conn_icmp;
3378 		int error;
3379 
3380 		if ((*connp->conn_upcalls->su_recv)
3381 		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
3382 		    NULL) < 0) {
3383 			mutex_enter(&icmp->icmp_recv_lock);
3384 			if (error == ENOSPC) {
3385 				/*
3386 				 * let's confirm while holding the lock
3387 				 */
3388 				if ((*connp->conn_upcalls->su_recv)
3389 				    (connp->conn_upper_handle, NULL, 0, 0,
3390 				    &error, NULL) < 0) {
3391 					ASSERT(error == ENOSPC);
3392 					if (error == ENOSPC) {
3393 						connp->conn_flow_cntrld =
3394 						    B_TRUE;
3395 					}
3396 				}
3397 				mutex_exit(&icmp->icmp_recv_lock);
3398 			} else {
3399 				ASSERT(error == EOPNOTSUPP);
3400 				mp = icmp_queue_fallback(icmp, mp);
3401 				mutex_exit(&icmp->icmp_recv_lock);
3402 				if (mp != NULL)
3403 					putnext(connp->conn_rq, mp);
3404 			}
3405 		}
3406 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
3407 	} else {
3408 		putnext(connp->conn_rq, mp);
3409 	}
3410 }
3411 
/*
 * Inbound path for raw IP / ICMP endpoints.
 *
 * arg1 is the conn_t of the receiving endpoint, mp the message handed up by
 * IP; arg2 is unused.  The routine:
 *   - peels off a leading M_CTL: an IPSEC_IN block is discarded, an
 *     ip_pktinfo_t block is remembered as options_mp/pinfo, and anything
 *     else is treated as an ICMP error and passed to icmp_icmp_error();
 *   - drops packets that are misaligned or shorter than an IPv4 header;
 *   - for AF_INET endpoints builds a T_UNITDATA_IND carrying a sin_t and
 *     the requested IP_RECVIF / SCM_TIMESTAMP / IP_PKTINFO options, leaving
 *     the full IP header attached to the data;
 *   - for AF_INET6 endpoints strips an ip6i_t if present, locates the
 *     extension headers, optionally applies the ICMPv6 filter or verifies
 *     the raw checksum, skips the IPv6 headers and builds a T_UNITDATA_IND
 *     carrying a sin6_t and the requested IPv6 ancillary options;
 *   - finally delivers the message through icmp_ulp_recv().
 * On every error path the message (and options_mp, if still held) is freed
 * here; nothing is returned to the caller.
 */
3412 /*ARGSUSED2*/
3413 static void
3414 icmp_input(void *arg1, mblk_t *mp, void *arg2)
3415 {
3416 	conn_t *connp = (conn_t *)arg1;
3417 	struct T_unitdata_ind	*tudi;
3418 	uchar_t			*rptr;
3419 	icmp_t			*icmp;
3420 	icmp_stack_t		*is;
3421 	sin_t			*sin;
3422 	sin6_t			*sin6;
3423 	ip6_t			*ip6h;
3424 	ip6i_t			*ip6i;
3425 	mblk_t			*mp1;
3426 	int			hdr_len;
3427 	ipha_t			*ipha;
3428 	int			udi_size;	/* Size of T_unitdata_ind */
3429 	uint_t			ipvers;
3430 	ip6_pkt_t		ipp;
3431 	uint8_t			nexthdr;
3432 	ip_pktinfo_t		*pinfo = NULL;
3433 	mblk_t			*options_mp = NULL;
3434 	uint_t			icmp_opt = 0;
3435 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
3436 	uint_t			hopstrip;
3437 
3438 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
3439 
3440 	icmp = connp->conn_icmp;
3441 	is = icmp->icmp_is;
3442 	rptr = mp->b_rptr;
3443 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
3444 	ASSERT(OK_32PTR(rptr));
3445 
3446 	/*
3447 	 * IP should have prepended the options data in an M_CTL.
3448 	 * Check the M_CTL "type" to make sure we are not here because of
3449 	 * a valid ICMP message.
3450 	 */
3451 	if (DB_TYPE(mp) == M_CTL) {
3452 		/*
3453 		 * FIXME: does IP still do this?
3454 		 * IP sends up the IPSEC_IN message for handling IPSEC
3455 		 * policy at the TCP level. We don't need it here.
3456 		 */
3457 		if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
3458 			mp1 = mp->b_cont;
3459 			freeb(mp);
3460 			mp = mp1;
3461 			rptr = mp->b_rptr;
3462 		} else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
3463 		    ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
3464 		    IN_PKTINFO) {
3465 			/*
3466 			 * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
3467 			 * has been prepended to the packet by IP. We need to
3468 			 * extract the mblk and adjust the rptr.  options_mp
3469 			 * stays linked to the data and is freed with freeb().
3470 			 */
3471 			pinfo = (ip_pktinfo_t *)mp->b_rptr;
3472 			options_mp = mp;
3473 			mp = mp->b_cont;
3474 			rptr = mp->b_rptr;
3475 		} else {
3476 			/*
3477 			 * ICMP messages.
3478 			 */
3479 			icmp_icmp_error(connp, mp);
3480 			return;
3481 		}
3482 	}
3483 
3484 	/*
3485 	 * Discard message if it is misaligned or smaller than the IP header.
3486 	 */
3487 	if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
3488 		freemsg(mp);
3489 		if (options_mp != NULL)
3490 			freeb(options_mp);
3491 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3492 		return;
3493 	}
3494 	ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
3495 
3496 	/* Handle M_DATA messages containing IP packets messages */
3497 	if (ipvers == IPV4_VERSION) {
3498 		/*
3499 		 * Special case where IP attaches
3500 		 * the IRE needs to be handled so that we don't send up
3501 		 * IRE to the user land.
3502 		 */
3503 		ipha = (ipha_t *)rptr;
3504 		hdr_len = IPH_HDR_LENGTH(ipha);
3505 
		/*
		 * NOTE(review): only sizeof (ipha_t) bytes were verified
		 * above; confirm IP guarantees the TCP header is present in
		 * this mblk before th_flags is read.
		 */
3506 		if (ipha->ipha_protocol == IPPROTO_TCP) {
3507 			tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3508 
3509 			if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
3510 			    TH_SYN) && mp->b_cont != NULL) {
3511 				mp1 = mp->b_cont;
3512 				if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3513 					freeb(mp1);
3514 					mp->b_cont = NULL;
3515 				}
3516 			}
3517 		}
		/*
		 * With is_bsd_compat set, ipha_length is rewritten to exclude
		 * the IP header length before the packet is passed up.
		 */
3518 		if (is->is_bsd_compat) {
3519 			ushort_t len;
3520 			len = ntohs(ipha->ipha_length);
3521 
3522 			if (mp->b_datap->db_ref > 1) {
3523 				/*
3524 				 * Allocate a new IP header so that we can
3525 				 * modify ipha_length.
3526 				 */
3527 				mblk_t	*mp1;
3528 
3529 				mp1 = allocb(hdr_len, BPRI_MED);
3530 				if (!mp1) {
3531 					freemsg(mp);
3532 					if (options_mp != NULL)
3533 						freeb(options_mp);
3534 					BUMP_MIB(&is->is_rawip_mib,
3535 					    rawipInErrors);
3536 					return;
3537 				}
3538 				bcopy(rptr, mp1->b_rptr, hdr_len);
3539 				mp->b_rptr = rptr + hdr_len;
3540 				rptr = mp1->b_rptr;
3541 				ipha = (ipha_t *)rptr;
3542 				mp1->b_cont = mp;
3543 				mp1->b_wptr = rptr + hdr_len;
3544 				mp = mp1;
3545 			}
3546 			len -= hdr_len;
3547 			ipha->ipha_length = htons(len);
3548 		}
3549 	}
3550 
3551 	/*
3552 	 * This is the inbound data path.  Packets are passed upstream as
3553 	 * T_UNITDATA_IND messages with full IP headers still attached.
3554 	 */
3555 	if (icmp->icmp_family == AF_INET) {
3556 		ASSERT(ipvers == IPV4_VERSION);
3557 		udi_size =  sizeof (struct T_unitdata_ind) + sizeof (sin_t);
3558 		if (icmp->icmp_recvif && (pinfo != NULL) &&
3559 		    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3560 			udi_size += sizeof (struct T_opthdr) +
3561 			    sizeof (uint_t);
3562 		}
3563 
3564 		if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3565 		    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3566 			udi_size += sizeof (struct T_opthdr) +
3567 			    sizeof (struct in_pktinfo);
3568 		}
3569 
3570 		/*
3571 		 * If SO_TIMESTAMP is set allocate the appropriate sized
3572 		 * buffer. Since gethrestime() expects a pointer aligned
3573 		 * argument, we allocate space necessary for extra
3574 		 * alignment (even though it might not be used).
3575 		 */
3576 		if (icmp->icmp_timestamp) {
3577 			udi_size += sizeof (struct T_opthdr) +
3578 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3579 		}
3580 		mp1 = allocb(udi_size, BPRI_MED);
3581 		if (mp1 == NULL) {
3582 			freemsg(mp);
3583 			if (options_mp != NULL)
3584 				freeb(options_mp);
3585 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3586 			return;
3587 		}
3588 		mp1->b_cont = mp;
3589 		mp = mp1;
3590 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
3591 		mp->b_datap->db_type = M_PROTO;
3592 		mp->b_wptr = (uchar_t *)tudi + udi_size;
3593 		tudi->PRIM_type = T_UNITDATA_IND;
3594 		tudi->SRC_length = sizeof (sin_t);
3595 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3596 		sin = (sin_t *)&tudi[1];
3597 		*sin = sin_null;
3598 		sin->sin_family = AF_INET;
3599 		sin->sin_addr.s_addr = ipha->ipha_src;
3600 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
3601 		    sizeof (sin_t);
		/* From here on udi_size counts only the option bytes left */
3602 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
3603 		tudi->OPT_length = udi_size;
3604 
3605 		/*
3606 		 * Fill in whichever ancillary options were sized above.
3607 		 */
3608 		if (udi_size != 0) {
3609 			char *dstopt;
3610 
			/*
			 * NOTE(review): the option flags are re-read from
			 * icmp_t here rather than latched when udi_size was
			 * computed; confirm they cannot change in between.
			 */
3611 			dstopt = (char *)&sin[1];
3612 			if (icmp->icmp_recvif && (pinfo != NULL) &&
3613 			    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3614 
3615 				struct T_opthdr *toh;
3616 				uint_t		*dstptr;
3617 
3618 				toh = (struct T_opthdr *)dstopt;
3619 				toh->level = IPPROTO_IP;
3620 				toh->name = IP_RECVIF;
3621 				toh->len = sizeof (struct T_opthdr) +
3622 				    sizeof (uint_t);
3623 				toh->status = 0;
3624 				dstopt += sizeof (struct T_opthdr);
3625 				dstptr = (uint_t *)dstopt;
3626 				*dstptr = pinfo->ip_pkt_ifindex;
3627 				dstopt += sizeof (uint_t);
3628 				udi_size -= toh->len;
3629 			}
3630 			if (icmp->icmp_timestamp) {
3631 				struct	T_opthdr *toh;
3632 
3633 				toh = (struct T_opthdr *)dstopt;
3634 				toh->level = SOL_SOCKET;
3635 				toh->name = SCM_TIMESTAMP;
3636 				toh->len = sizeof (struct T_opthdr) +
3637 				    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3638 				toh->status = 0;
3639 				dstopt += sizeof (struct T_opthdr);
3640 				/* Align for gethrestime() */
3641 				dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
3642 				    sizeof (intptr_t));
3643 				gethrestime((timestruc_t *)dstopt);
3644 				dstopt = (char *)toh + toh->len;
3645 				udi_size -= toh->len;
3646 			}
3647 			if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3648 			    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3649 				struct	T_opthdr *toh;
3650 				struct	in_pktinfo *pktinfop;
3651 
3652 				toh = (struct T_opthdr *)dstopt;
3653 				toh->level = IPPROTO_IP;
3654 				toh->name = IP_PKTINFO;
3655 				toh->len = sizeof (struct T_opthdr) +
3656 				    sizeof (in_pktinfo_t);
3657 				toh->status = 0;
3658 				dstopt += sizeof (struct T_opthdr);
3659 				pktinfop = (struct in_pktinfo *)dstopt;
3660 				pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
3661 				pktinfop->ipi_spec_dst =
3662 				    pinfo->ip_pkt_match_addr;
3663 
3664 				pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
3665 
3666 				dstopt += sizeof (struct in_pktinfo);
3667 				udi_size -= toh->len;
3668 			}
3669 
3670 			/* Consumed all of allocated space */
3671 			ASSERT(udi_size == 0);
3672 		}
3673 
3674 		if (options_mp != NULL)
3675 			freeb(options_mp);
3676 
3677 		BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3678 		goto deliver;
3679 	}
3680 
3681 	/*
3682 	 * We don't need options_mp in the IPv6 path.
3683 	 */
3684 	if (options_mp != NULL) {
3685 		freeb(options_mp);
3686 		options_mp = NULL;
3687 	}
3688 
3689 	/*
3690 	 * Discard message if it is smaller than the IPv6 header
3691 	 * or if the header is malformed.
3692 	 */
3693 	if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
3694 	    IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
3695 	    icmp->icmp_family != AF_INET6) {
3696 		freemsg(mp);
3697 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3698 		return;
3699 	}
3700 
3701 	/* Initialize */
3702 	ipp.ipp_fields = 0;
3703 	hopstrip = 0;
3704 
3705 	ip6h = (ip6_t *)rptr;
3706 	/*
3707 	 * Call on ip_find_hdr_v6 which gets the total hdr len
3708 	 * as well as individual lengths of ext hdrs (and ptrs to
3709 	 * them).
3710 	 */
3711 	if (ip6h->ip6_nxt != icmp->icmp_proto) {
3712 		/* Look for ifindex information */
3713 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3714 			ip6i = (ip6i_t *)ip6h;
3715 			if (ip6i->ip6i_flags & IP6I_IFINDEX) {
3716 				ASSERT(ip6i->ip6i_ifindex != 0);
3717 				ipp.ipp_fields |= IPPF_IFINDEX;
3718 				ipp.ipp_ifindex = ip6i->ip6i_ifindex;
3719 			}
			/* Step over the ip6i_t; drop its mblk if now empty */
3720 			rptr = (uchar_t *)&ip6i[1];
3721 			mp->b_rptr = rptr;
3722 			if (rptr == mp->b_wptr) {
3723 				mp1 = mp->b_cont;
3724 				freeb(mp);
3725 				mp = mp1;
3726 				rptr = mp->b_rptr;
3727 			}
3728 			ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
3729 			ip6h = (ip6_t *)rptr;
3730 		}
3731 		hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
3732 
3733 		/*
3734 		 * We need to lie a bit to the user because users inside
3735 		 * labeled compartments should not see their own labels.  We
3736 		 * assume that in all other respects IP has checked the label,
3737 		 * and that the label is always first among the options.  (If
3738 		 * it's not first, then this code won't see it, and the option
3739 		 * will be passed along to the user.)
3740 		 *
3741 		 * If we had multilevel ICMP sockets, then the following code
3742 		 * should be skipped for them to allow the user to see the
3743 		 * label.
3744 		 *
3745 		 * Alignment restrictions in the definition of IP options
3746 		 * (namely, the requirement that the 4-octet DOI goes on a
3747 		 * 4-octet boundary) mean that we know exactly where the option
3748 		 * should start, but we're lenient for other hosts.
3749 		 *
3750 		 * Note that there are no multilevel ICMP or raw IP sockets
3751 		 * yet, thus nobody ever sees the IP6OPT_LS option.
3752 		 */
3753 		if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
3754 		    ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
3755 			const uchar_t *ucp =
3756 			    (const uchar_t *)ipp.ipp_hopopts + 2;
3757 			int remlen = ipp.ipp_hopoptslen - 2;
3758 
			/*
			 * Skip leading padding; if the label option is found,
			 * hopstrip becomes the number of leading bytes to
			 * hide, rounded up to a multiple of 8.
			 */
3759 			while (remlen > 0) {
3760 				if (*ucp == IP6OPT_PAD1) {
3761 					remlen--;
3762 					ucp++;
3763 				} else if (*ucp == IP6OPT_PADN) {
3764 					remlen -= ucp[1] + 2;
3765 					ucp += ucp[1] + 2;
3766 				} else if (*ucp == ip6opt_ls) {
3767 					hopstrip = (ucp -
3768 					    (const uchar_t *)ipp.ipp_hopopts) +
3769 					    ucp[1] + 2;
3770 					hopstrip = (hopstrip + 7) & ~7;
3771 					break;
3772 				} else {
3773 					/* label option must be first */
3774 					break;
3775 				}
3776 			}
3777 		}
3778 	} else {
3779 		hdr_len = IPV6_HDR_LEN;
3780 		ip6i = NULL;
3781 		nexthdr = ip6h->ip6_nxt;
3782 	}
3783 	/*
3784 	 * One special case where IP attaches the IRE needs to
3785 	 * be handled so that we don't send up IRE to the user land.
3786 	 */
3787 	if (nexthdr == IPPROTO_TCP) {
3788 		tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3789 
3790 		if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
3791 		    mp->b_cont != NULL) {
3792 			mp1 = mp->b_cont;
3793 			if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3794 				freeb(mp1);
3795 				mp->b_cont = NULL;
3796 			}
3797 		}
3798 	}
3799 	/*
3800 	 * Check a filter for ICMPv6 types if needed.
3801 	 * Verify raw checksums if needed.
3802 	 * Note: as written, the checksum is verified only when no filter
3803 	 * is installed (it lives in the else branch of the filter test).
3804 	 */
3802 	if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
3803 		if (icmp->icmp_filter != NULL) {
3804 			int type;
3805 
3806 			/* Assumes that IP has done the pullupmsg */
3807 			type = mp->b_rptr[hdr_len];
3808 
3809 			ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
3810 			if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
3811 				freemsg(mp);
3812 				return;
3813 			}
3814 		} else {
3815 			/* Checksum */
3816 			uint16_t	*up;
3817 			uint32_t	sum;
3818 			int		remlen;
3819 
			/*
			 * up[0..15] spans ip6_src followed by ip6_dst; they
			 * are summed with the protocol and payload length
			 * before IP_CSUM covers the data from hdr_len on.
			 */
3820 			up = (uint16_t *)&ip6h->ip6_src;
3821 
3822 			remlen = msgdsize(mp) - hdr_len;
3823 			sum = htons(icmp->icmp_proto + remlen)
3824 			    + up[0] + up[1] + up[2] + up[3]
3825 			    + up[4] + up[5] + up[6] + up[7]
3826 			    + up[8] + up[9] + up[10] + up[11]
3827 			    + up[12] + up[13] + up[14] + up[15];
3828 			sum = (sum & 0xffff) + (sum >> 16);
3829 			sum = IP_CSUM(mp, hdr_len, sum);
3830 			if (sum != 0) {
3831 				/* IPv6 RAW checksum failed */
3832 				ip0dbg(("icmp_rput: RAW checksum "
3833 				    "failed %x\n", sum));
3834 				freemsg(mp);
3835 				BUMP_MIB(&is->is_rawip_mib,
3836 				    rawipInCksumErrs);
3837 				return;
3838 			}
3839 		}
3840 	}
3841 	/* Skip all the IPv6 headers per API */
3842 	mp->b_rptr += hdr_len;
3843 
3844 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3845 
3846 	/*
3847 	 * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
3848 	 * maintain state information, instead of relying on icmp_t
3849 	 * structure, since there aren't any locks protecting these members
3850 	 * and there is a window where there might be a race between a
3851 	 * thread setting options on the write side and a thread reading
3852 	 * these options on the read side.
3853 	 */
3854 	if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
3855 	    IPPF_RTHDR|IPPF_IFINDEX)) {
3856 		if (icmp->icmp_ipv6_recvhopopts &&
3857 		    (ipp.ipp_fields & IPPF_HOPOPTS) &&
3858 		    ipp.ipp_hopoptslen > hopstrip) {
3859 			udi_size += sizeof (struct T_opthdr) +
3860 			    ipp.ipp_hopoptslen - hopstrip;
3861 			icmp_opt |= IPPF_HOPOPTS;
3862 		}
3863 		if ((icmp->icmp_ipv6_recvdstopts ||
3864 		    icmp->icmp_old_ipv6_recvdstopts) &&
3865 		    (ipp.ipp_fields & IPPF_DSTOPTS)) {
3866 			udi_size += sizeof (struct T_opthdr) +
3867 			    ipp.ipp_dstoptslen;
3868 			icmp_opt |= IPPF_DSTOPTS;
3869 		}
3870 		if (((icmp->icmp_ipv6_recvdstopts &&
3871 		    icmp->icmp_ipv6_recvrthdr &&
3872 		    (ipp.ipp_fields & IPPF_RTHDR)) ||
3873 		    icmp->icmp_ipv6_recvrtdstopts) &&
3874 		    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
3875 			udi_size += sizeof (struct T_opthdr) +
3876 			    ipp.ipp_rtdstoptslen;
3877 			icmp_opt |= IPPF_RTDSTOPTS;
3878 		}
3879 		if (icmp->icmp_ipv6_recvrthdr &&
3880 		    (ipp.ipp_fields & IPPF_RTHDR)) {
3881 			udi_size += sizeof (struct T_opthdr) +
3882 			    ipp.ipp_rthdrlen;
3883 			icmp_opt |= IPPF_RTHDR;
3884 		}
3885 		if (icmp->icmp_ip_recvpktinfo &&
3886 		    (ipp.ipp_fields & IPPF_IFINDEX)) {
3887 			udi_size += sizeof (struct T_opthdr) +
3888 			    sizeof (struct in6_pktinfo);
3889 			icmp_opt |= IPPF_IFINDEX;
3890 		}
3891 	}
3892 	if (icmp->icmp_ipv6_recvhoplimit) {
3893 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3894 		icmp_ipv6_recvhoplimit = B_TRUE;
3895 	}
3896 
	/*
	 * NOTE(review): unlike the options above, icmp_ipv6_recvtclass and
	 * icmp_timestamp are not latched into locals; both are read again
	 * when the options are filled in below.
	 */
3897 	if (icmp->icmp_ipv6_recvtclass)
3898 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3899 
3900 	/*
3901 	 * If SO_TIMESTAMP is set allocate the appropriate sized
3902 	 * buffer. Since gethrestime() expects a pointer aligned
3903 	 * argument, we allocate space necessary for extra
3904 	 * alignment (even though it might not be used).
3905 	 */
3906 	if (icmp->icmp_timestamp) {
3907 		udi_size += sizeof (struct T_opthdr) +
3908 		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3909 	}
3910 
3911 	mp1 = allocb(udi_size, BPRI_MED);
3912 	if (mp1 == NULL) {
3913 		freemsg(mp);
3914 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3915 		return;
3916 	}
3917 	mp1->b_cont = mp;
3918 	mp = mp1;
3919 	mp->b_datap->db_type = M_PROTO;
3920 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
3921 	mp->b_wptr = (uchar_t *)tudi + udi_size;
3922 	tudi->PRIM_type = T_UNITDATA_IND;
3923 	tudi->SRC_length = sizeof (sin6_t);
3924 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3925 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
	/* From here on udi_size counts only the option bytes left */
3926 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
3927 	tudi->OPT_length = udi_size;
3928 	sin6 = (sin6_t *)&tudi[1];
3929 	sin6->sin6_port = 0;
3930 	sin6->sin6_family = AF_INET6;
3931 
3932 	sin6->sin6_addr = ip6h->ip6_src;
3933 	/* No sin6_flowinfo per API */
3934 	sin6->sin6_flowinfo = 0;
3935 	/* For link-scope source pass up scope id */
3936 	if ((ipp.ipp_fields & IPPF_IFINDEX) &&
3937 	    IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
3938 		sin6->sin6_scope_id = ipp.ipp_ifindex;
3939 	else
3940 		sin6->sin6_scope_id = 0;
3941 
3942 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
3943 	    icmp->icmp_zoneid, is->is_netstack);
3944 
3945 	if (udi_size != 0) {
3946 		uchar_t *dstopt;
3947 
3948 		dstopt = (uchar_t *)&sin6[1];
3949 		if (icmp_opt & IPPF_IFINDEX) {
3950 			struct T_opthdr *toh;
3951 			struct in6_pktinfo *pkti;
3952 
3953 			toh = (struct T_opthdr *)dstopt;
3954 			toh->level = IPPROTO_IPV6;
3955 			toh->name = IPV6_PKTINFO;
3956 			toh->len = sizeof (struct T_opthdr) +
3957 			    sizeof (*pkti);
3958 			toh->status = 0;
3959 			dstopt += sizeof (struct T_opthdr);
3960 			pkti = (struct in6_pktinfo *)dstopt;
3961 			pkti->ipi6_addr = ip6h->ip6_dst;
3962 			pkti->ipi6_ifindex = ipp.ipp_ifindex;
3963 			dstopt += sizeof (*pkti);
3964 			udi_size -= toh->len;
3965 		}
3966 		if (icmp_ipv6_recvhoplimit) {
3967 			struct T_opthdr *toh;
3968 
3969 			toh = (struct T_opthdr *)dstopt;
3970 			toh->level = IPPROTO_IPV6;
3971 			toh->name = IPV6_HOPLIMIT;
3972 			toh->len = sizeof (struct T_opthdr) +
3973 			    sizeof (uint_t);
3974 			toh->status = 0;
3975 			dstopt += sizeof (struct T_opthdr);
3976 			*(uint_t *)dstopt = ip6h->ip6_hops;
3977 			dstopt += sizeof (uint_t);
3978 			udi_size -= toh->len;
3979 		}
3980 		if (icmp->icmp_ipv6_recvtclass) {
3981 			struct T_opthdr *toh;
3982 
3983 			toh = (struct T_opthdr *)dstopt;
3984 			toh->level = IPPROTO_IPV6;
3985 			toh->name = IPV6_TCLASS;
3986 			toh->len = sizeof (struct T_opthdr) +
3987 			    sizeof (uint_t);
3988 			toh->status = 0;
3989 			dstopt += sizeof (struct T_opthdr);
3990 			*(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
3991 			dstopt += sizeof (uint_t);
3992 			udi_size -= toh->len;
3993 		}
3994 		if (icmp->icmp_timestamp) {
3995 			struct  T_opthdr *toh;
3996 
3997 			toh = (struct T_opthdr *)dstopt;
3998 			toh->level = SOL_SOCKET;
3999 			toh->name = SCM_TIMESTAMP;
4000 			toh->len = sizeof (struct T_opthdr) +
4001 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
4002 			toh->status = 0;
4003 			dstopt += sizeof (struct T_opthdr);
4004 			/* Align for gethrestime() */
4005 			dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
4006 			    sizeof (intptr_t));
4007 			gethrestime((timestruc_t *)dstopt);
4008 			dstopt = (uchar_t *)toh + toh->len;
4009 			udi_size -= toh->len;
4010 		}
4011 
4012 		if (icmp_opt & IPPF_HOPOPTS) {
4013 			struct T_opthdr *toh;
4014 
4015 			toh = (struct T_opthdr *)dstopt;
4016 			toh->level = IPPROTO_IPV6;
4017 			toh->name = IPV6_HOPOPTS;
4018 			toh->len = sizeof (struct T_opthdr) +
4019 			    ipp.ipp_hopoptslen - hopstrip;
4020 			toh->status = 0;
4021 			dstopt += sizeof (struct T_opthdr);
4022 			bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
4023 			    ipp.ipp_hopoptslen - hopstrip);
4024 			if (hopstrip > 0) {
4025 				/* copy next header value and fake length */
4026 				dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
4027 				dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
4028 				    hopstrip / 8;
4029 			}
4030 			dstopt += ipp.ipp_hopoptslen - hopstrip;
4031 			udi_size -= toh->len;
4032 		}
4033 		if (icmp_opt & IPPF_RTDSTOPTS) {
4034 			struct T_opthdr *toh;
4035 
4036 			toh = (struct T_opthdr *)dstopt;
4037 			toh->level = IPPROTO_IPV6;
4038 			toh->name = IPV6_DSTOPTS;
4039 			toh->len = sizeof (struct T_opthdr) +
4040 			    ipp.ipp_rtdstoptslen;
4041 			toh->status = 0;
4042 			dstopt += sizeof (struct T_opthdr);
4043 			bcopy(ipp.ipp_rtdstopts, dstopt,
4044 			    ipp.ipp_rtdstoptslen);
4045 			dstopt += ipp.ipp_rtdstoptslen;
4046 			udi_size -= toh->len;
4047 		}
4048 		if (icmp_opt & IPPF_RTHDR) {
4049 			struct T_opthdr *toh;
4050 
4051 			toh = (struct T_opthdr *)dstopt;
4052 			toh->level = IPPROTO_IPV6;
4053 			toh->name = IPV6_RTHDR;
4054 			toh->len = sizeof (struct T_opthdr) +
4055 			    ipp.ipp_rthdrlen;
4056 			toh->status = 0;
4057 			dstopt += sizeof (struct T_opthdr);
4058 			bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
4059 			dstopt += ipp.ipp_rthdrlen;
4060 			udi_size -= toh->len;
4061 		}
4062 		if (icmp_opt & IPPF_DSTOPTS) {
4063 			struct T_opthdr *toh;
4064 
4065 			toh = (struct T_opthdr *)dstopt;
4066 			toh->level = IPPROTO_IPV6;
4067 			toh->name = IPV6_DSTOPTS;
4068 			toh->len = sizeof (struct T_opthdr) +
4069 			    ipp.ipp_dstoptslen;
4070 			toh->status = 0;
4071 			dstopt += sizeof (struct T_opthdr);
4072 			bcopy(ipp.ipp_dstopts, dstopt,
4073 			    ipp.ipp_dstoptslen);
4074 			dstopt += ipp.ipp_dstoptslen;
4075 			udi_size -= toh->len;
4076 		}
4077 		/* Consumed all of allocated space */
4078 		ASSERT(udi_size == 0);
4079 	}
4080 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
4081 
4082 deliver:
4083 	icmp_ulp_recv(connp, mp);
4084 
4085 }
4086 
4087 /*
4088  * return SNMP stuff in buffer in mpdata
4089  */
4090 mblk_t *
4091 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
4092 {
4093 	mblk_t			*mpdata;
4094 	struct opthdr		*optp;
4095 	conn_t			*connp = Q_TO_CONN(q);
4096 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
4097 	mblk_t			*mp2ctl;
4098 
4099 	/*
4100 	 * make a copy of the original message
4101 	 */
4102 	mp2ctl = copymsg(mpctl);
4103 
4104 	if (mpctl == NULL ||
4105 	    (mpdata = mpctl->b_cont) == NULL) {
4106 		freemsg(mpctl);
4107 		freemsg(mp2ctl);
4108 		return (0);
4109 	}
4110 
4111 	/* fixed length structure for IPv4 and IPv6 counters */
4112 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
4113 	optp->level = EXPER_RAWIP;
4114 	optp->name = 0;
4115 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
4116 	    sizeof (is->is_rawip_mib));
4117 	optp->len = msgdsize(mpdata);
4118 	qreply(q, mpctl);
4119 
4120 	return (mp2ctl);
4121 }
4122 
4123 /*
4124  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
4125  * TODO:  If this ever actually tries to set anything, it needs to be
4126  * to do the appropriate locking.
4127  */
4128 /* ARGSUSED */
4129 int
4130 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
4131     uchar_t *ptr, int len)
4132 {
4133 	switch (level) {
4134 	case EXPER_RAWIP:
4135 		return (0);
4136 	default:
4137 		return (1);
4138 	}
4139 }
4140 
4141 /*
4142  * This routine creates a T_UDERROR_IND message and passes it upstream.
4143  * The address and options are copied from the T_UNITDATA_REQ message
4144  * passed in mp.  This message is freed.
4145  */
4146 static void
4147 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
4148 {
4149 	mblk_t	*mp1;
4150 	uchar_t	*rptr = mp->b_rptr;
4151 	struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
4152 
4153 	mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
4154 	    tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
4155 	    tudr->OPT_length, err);
4156 	if (mp1)
4157 		qreply(q, mp1);
4158 	freemsg(mp);
4159 }
4160 
4161 
4162 static int
4163 rawip_do_unbind(conn_t *connp)
4164 {
4165 	icmp_t *icmp = connp->conn_icmp;
4166 
4167 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4168 	/* If a bind has not been done, we can't unbind. */
4169 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
4170 		rw_exit(&icmp->icmp_rwlock);
4171 		return (-TOUTSTATE);
4172 	}
4173 	icmp->icmp_pending_op = T_UNBIND_REQ;
4174 	rw_exit(&icmp->icmp_rwlock);
4175 
4176 	/*
4177 	 * Call ip to unbind
4178 	 */
4179 
4180 	ip_unbind(connp);
4181 
4182 	/*
4183 	 * Once we're unbound from IP, the pending operation may be cleared
4184 	 * here.
4185 	 */
4186 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4187 	V6_SET_ZERO(icmp->icmp_v6src);
4188 	V6_SET_ZERO(icmp->icmp_bound_v6src);
4189 	icmp->icmp_pending_op = -1;
4190 	icmp->icmp_state = TS_UNBND;
4191 	if (icmp->icmp_family == AF_INET6)
4192 		(void) icmp_build_hdrs(icmp);
4193 	rw_exit(&icmp->icmp_rwlock);
4194 	return (0);
4195 }
4196 
4197 /*
4198  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
4199  * After some error checking, the message is passed downstream to ip.
4200  */
4201 static void
4202 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
4203 {
4204 	conn_t	*connp = Q_TO_CONN(q);
4205 	int	error;
4206 
4207 	ASSERT(mp->b_cont == NULL);
4208 	error = rawip_do_unbind(connp);
4209 	if (error) {
4210 		if (error < 0) {
4211 			icmp_err_ack(q, mp, -error, 0);
4212 		} else {
4213 			icmp_err_ack(q, mp, 0, error);
4214 		}
4215 		return;
4216 	}
4217 
4218 	/*
4219 	 * Convert mp into a T_OK_ACK
4220 	 */
4221 
4222 	mp = mi_tpi_ok_ack_alloc(mp);
4223 
4224 	/*
4225 	 * should not happen in practice... T_OK_ACK is smaller than the
4226 	 * original message.
4227 	 */
4228 	ASSERT(mp != NULL);
4229 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
4230 	qreply(q, mp);
4231 }
4232 
4233 
4234 /*
4235  * Process IPv4 packets that already include an IP header.
4236  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
4237  * IPPROTO_IGMP).
4238  */
4239 static int
4240 icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
4241     ip4_pkt_t *pktinfop)
4242 {
4243 	icmp_stack_t *is = icmp->icmp_is;
4244 	ipha_t	*ipha;
4245 	int	ip_hdr_length;
4246 	int	tp_hdr_len;
4247 	int	error;
4248 	uchar_t	ip_snd_opt[IP_MAX_OPT_LENGTH];
4249 	uint32_t ip_snd_opt_len = 0;
4250 	mblk_t	*mp1;
4251 	uint_t	pkt_len;
4252 	ip_opt_info_t optinfo;
4253 	pid_t	cpid;
4254 	cred_t	*cr;
4255 
4256 	rw_enter(&icmp->icmp_rwlock, RW_READER);
4257 
4258 	optinfo.ip_opt_flags = 0;
4259 	optinfo.ip_opt_ill_index = 0;
4260 	ipha = (ipha_t *)mp->b_rptr;
4261 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4262 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
4263 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4264 			ASSERT(icmp != NULL);
4265 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4266 			freemsg(mp);
4267 			rw_exit(&icmp->icmp_rwlock);
4268 			return (0);
4269 		}
4270 		ipha = (ipha_t *)mp->b_rptr;
4271 	}
4272 	ipha->ipha_version_and_hdr_length =
4273 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
4274 
4275 	/*
4276 	 * Check if our saved options are valid; update if not.
4277 	 * TSOL Note: Since we are not in WRITER mode, ICMP packets
4278 	 * to different destination may require different labels,
4279 	 * or worse, ICMP packets to same IP address may require
4280 	 * different labels due to use of shared all-zones address.
4281 	 * We use conn_lock to ensure that lastdst, ip_snd_options,
4282 	 * and ip_snd_options_len are consistent for the current
4283 	 * destination and are updated atomically.
4284 	 */
4285 	mutex_enter(&connp->conn_lock);
4286 	if (is_system_labeled()) {
4287 		/*
4288 		 * Recompute the Trusted Extensions security label if
4289 		 * we're not going to the same destination as last
4290 		 * time or the cred attached to the received mblk
4291 		 * changed.
4292 		 */
4293 		cr = msg_getcred(mp, &cpid);
4294 		if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4295 		    V4_PART_OF_V6(icmp->icmp_v6lastdst) != ipha->ipha_dst ||
4296 		    cr != icmp->icmp_last_cred) {
4297 			error = icmp_update_label(icmp, mp, ipha->ipha_dst);
4298 			if (error != 0) {
4299 				mutex_exit(&connp->conn_lock);
4300 				rw_exit(&icmp->icmp_rwlock);
4301 				return (error);
4302 			}
4303 		}
4304 		/*
4305 		 * Apply credentials with modified security label if they
4306 		 * exist. icmp_update_label() may have generated these
4307 		 * credentials for packets to unlabeled remote nodes.
4308 		 */
4309 		if (icmp->icmp_effective_cred != NULL)
4310 			mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
4311 	}
4312 
4313 	if (icmp->icmp_ip_snd_options_len > 0) {
4314 		ip_snd_opt_len = icmp->icmp_ip_snd_options_len;
4315 		bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
4316 	}
4317 	mutex_exit(&connp->conn_lock);
4318 
4319 	/*
4320 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4321 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4322 	 * tell IP that the application has sent a complete IP header and not
4323 	 * to compute the transport checksum nor change the DF flag.
4324 	 */
4325 	ipha->ipha_ident = IP_HDR_INCLUDED;
4326 	ipha->ipha_hdr_checksum = 0;
4327 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
4328 	/* Insert options if any */
4329 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4330 		/*
4331 		 * Put the IP header plus any transport header that is
4332 		 * checksumed by ip_wput into the first mblk. (ip_wput assumes
4333 		 * that at least the checksum field is in the first mblk.)
4334 		 */
4335 		switch (ipha->ipha_protocol) {
4336 		case IPPROTO_UDP:
4337 			tp_hdr_len = 8;
4338 			break;
4339 		case IPPROTO_TCP:
4340 			tp_hdr_len = 20;
4341 			break;
4342 		default:
4343 			tp_hdr_len = 0;
4344 			break;
4345 		}
4346 		/*
4347 		 * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
4348 		 * tp_hdr_len bytes will be in a single mblk.
4349 		 */
4350 		if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
4351 		    tp_hdr_len)) {
4352 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
4353 			    tp_hdr_len)) {
4354 				BUMP_MIB(&is->is_rawip_mib,
4355 				    rawipOutErrors);
4356 				freemsg(mp);
4357 				rw_exit(&icmp->icmp_rwlock);
4358 				return (0);
4359 			}
4360 			ipha = (ipha_t *)mp->b_rptr;
4361 		}
4362 
4363 		/*
4364 		 * if the length is larger then the max allowed IP packet,
4365 		 * then send an error and abort the processing.
4366 		 */
4367 		pkt_len = ntohs(ipha->ipha_length)
4368 		    + ip_snd_opt_len;
4369 		if (pkt_len > IP_MAXPACKET) {
4370 			rw_exit(&icmp->icmp_rwlock);
4371 			return (EMSGSIZE);
4372 		}
4373 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
4374 		    tp_hdr_len, BPRI_LO))) {
4375 			rw_exit(&icmp->icmp_rwlock);
4376 			return (ENOMEM);
4377 		}
4378 		mp1->b_rptr += is->is_wroff_extra;
4379 		mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
4380 
4381 		ipha->ipha_length = htons((uint16_t)pkt_len);
4382 		bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
4383 
4384 		/* Copy transport header if any */
4385 		bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
4386 		mp1->b_wptr += tp_hdr_len;
4387 
4388 		/* Add options */
4389 		ipha = (ipha_t *)mp1->b_rptr;
4390 		bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len);
4391 
4392 		/* Drop IP header and transport header from original */
4393 		(void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
4394 
4395 		mp1->b_cont = mp;
4396 		mp = mp1;
4397 		/*
4398 		 * Massage source route putting first source
4399 		 * route in ipha_dst.
4400 		 */
4401 		(void) ip_massage_options(ipha, is->is_netstack);
4402 	}
4403 
4404 	if (pktinfop != NULL) {
4405 		/*
4406 		 * Over write the source address provided in the header
4407 		 */
4408 		if (pktinfop->ip4_addr != INADDR_ANY) {
4409 			ipha->ipha_src = pktinfop->ip4_addr;
4410 			optinfo.ip_opt_flags = IP_VERIFY_SRC;
4411 		}
4412 
4413 		if (pktinfop->ip4_ill_index != 0) {
4414 			optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4415 		}
4416 	}
4417 
4418 	rw_exit(&icmp->icmp_rwlock);
4419 
4420 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4421 	return (0);
4422 }
4423 
4424 static int
4425 icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
4426 {
4427 	int err;
4428 	uchar_t opt_storage[IP_MAX_OPT_LENGTH];
4429 	icmp_stack_t		*is = icmp->icmp_is;
4430 	conn_t			*connp = icmp->icmp_connp;
4431 	cred_t	*cred;
4432 	cred_t	*msg_cred;
4433 	cred_t	*effective_cred;
4434 
4435 	/*
4436 	 * All Solaris components should pass a db_credp
4437 	 * for this message, hence we ASSERT.
4438 	 * On production kernels we return an error to be robust against
4439 	 * random streams modules sitting on top of us.
4440 	 */
4441 	cred = msg_cred = msg_getcred(mp, NULL);
4442 	ASSERT(cred != NULL);
4443 	if (cred == NULL)
4444 		return (EINVAL);
4445 
4446 	/*
4447 	 * Verify the destination is allowed to receive packets at
4448 	 * the security label of the message data. check_dest()
4449 	 * may create a new effective cred for this message
4450 	 * with a modified label or label flags.
4451 	 */
4452 	if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION,
4453 	    connp->conn_mac_mode, &effective_cred)) != 0)
4454 		goto done;
4455 	if (effective_cred != NULL)
4456 		cred = effective_cred;
4457 
4458 	/*
4459 	 * Calculate the security label to be placed in the text
4460 	 * of the message (if any).
4461 	 */
4462 	if ((err = tsol_compute_label(cred, dst, opt_storage,
4463 	    is->is_netstack->netstack_ip)) != 0)
4464 		goto done;
4465 
4466 	/*
4467 	 * Insert the security label in the cached ip options,
4468 	 * removing any old label that may exist.
4469 	 */
4470 	if ((err = tsol_update_options(&icmp->icmp_ip_snd_options,
4471 	    &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
4472 	    opt_storage)) != 0)
4473 		goto done;
4474 
4475 	/*
4476 	 * Save the destination address and cred we used to generate
4477 	 * the security label text.
4478 	 */
4479 	IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
4480 	if (cred != icmp->icmp_effective_cred) {
4481 		if (icmp->icmp_effective_cred != NULL)
4482 			crfree(icmp->icmp_effective_cred);
4483 		crhold(cred);
4484 		icmp->icmp_effective_cred = cred;
4485 	}
4486 
4487 	if (msg_cred != icmp->icmp_last_cred) {
4488 		if (icmp->icmp_last_cred != NULL)
4489 			crfree(icmp->icmp_last_cred);
4490 		crhold(msg_cred);
4491 		icmp->icmp_last_cred = msg_cred;
4492 	}
4493 
4494 done:
4495 	if (effective_cred != NULL)
4496 		crfree(effective_cred);
4497 
4498 	if (err != 0) {
4499 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4500 		DTRACE_PROBE4(
4501 		    tx__ip__log__drop__updatelabel__icmp,
4502 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4503 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4504 		return (err);
4505 	}
4506 	return (0);
4507 }
4508 
4509 /*
4510  * This routine handles all messages passed downstream.  It either
4511  * consumes the message or passes it downstream; it never queues a
4512  * a message.
4513  */
4514 static void
4515 icmp_wput(queue_t *q, mblk_t *mp)
4516 {
4517 	uchar_t	*rptr = mp->b_rptr;
4518 	mblk_t	*mp1;
4519 #define	tudr ((struct T_unitdata_req *)rptr)
4520 	size_t	ip_len;
4521 	conn_t	*connp = Q_TO_CONN(q);
4522 	icmp_t	*icmp = connp->conn_icmp;
4523 	icmp_stack_t *is = icmp->icmp_is;
4524 	sin6_t	*sin6;
4525 	sin_t	*sin;
4526 	ipaddr_t	v4dst;
4527 	ip4_pkt_t	pktinfo;
4528 	ip4_pkt_t	*pktinfop = &pktinfo;
4529 	ip6_pkt_t	ipp_s;  /* For ancillary data options */
4530 	ip6_pkt_t	*ipp = &ipp_s;
4531 	int error;
4532 
4533 	ipp->ipp_fields = 0;
4534 	ipp->ipp_sticky_ignored = 0;
4535 
4536 	switch (mp->b_datap->db_type) {
4537 	case M_DATA:
4538 		if (icmp->icmp_hdrincl) {
4539 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
4540 			error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
4541 			if (error != 0)
4542 				icmp_ud_err(q, mp, error);
4543 			return;
4544 		}
4545 		freemsg(mp);
4546 		return;
4547 	case M_PROTO:
4548 	case M_PCPROTO:
4549 		ip_len = mp->b_wptr - rptr;
4550 		if (ip_len >= sizeof (struct T_unitdata_req)) {
4551 			/* Expedite valid T_UNITDATA_REQ to below the switch */
4552 			if (((union T_primitives *)rptr)->type
4553 			    == T_UNITDATA_REQ)
4554 				break;
4555 		}
4556 		/* FALLTHRU */
4557 	default:
4558 		icmp_wput_other(q, mp);
4559 		return;
4560 	}
4561 
4562 	/* Handle T_UNITDATA_REQ messages here. */
4563 
4564 	mp1 = mp->b_cont;
4565 	if (mp1 == NULL) {
4566 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4567 		icmp_ud_err(q, mp, EPROTO);
4568 		return;
4569 	}
4570 
4571 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
4572 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4573 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4574 		return;
4575 	}
4576 
4577 	switch (icmp->icmp_family) {
4578 	case AF_INET6:
4579 		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
4580 		if (!OK_32PTR((char *)sin6) ||
4581 		    tudr->DEST_length != sizeof (sin6_t) ||
4582 		    sin6->sin6_family != AF_INET6) {
4583 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4584 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4585 			return;
4586 		}
4587 
4588 		/* No support for mapped addresses on raw sockets */
4589 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4590 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4591 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4592 			return;
4593 		}
4594 
4595 		/*
4596 		 * Destination is a native IPv6 address.
4597 		 * Send out an IPv6 format packet.
4598 		 */
4599 		if (tudr->OPT_length != 0) {
4600 			int error;
4601 
4602 			error = 0;
4603 			if (icmp_unitdata_opt_process(q, mp, &error,
4604 			    (void *)ipp) < 0) {
4605 				/* failure */
4606 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4607 				icmp_ud_err(q, mp, error);
4608 				return;
4609 			}
4610 			ASSERT(error == 0);
4611 		}
4612 
4613 		error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
4614 		goto done;
4615 
4616 	case AF_INET:
4617 		sin = (sin_t *)&rptr[tudr->DEST_offset];
4618 		if (!OK_32PTR((char *)sin) ||
4619 		    tudr->DEST_length != sizeof (sin_t) ||
4620 		    sin->sin_family != AF_INET) {
4621 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4622 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4623 			return;
4624 		}
4625 		/* Extract and ipaddr */
4626 		v4dst = sin->sin_addr.s_addr;
4627 		break;
4628 
4629 	default:
4630 		ASSERT(0);
4631 	}
4632 
4633 	pktinfop->ip4_ill_index = 0;
4634 	pktinfop->ip4_addr = INADDR_ANY;
4635 
4636 	/*
4637 	 * If options passed in, feed it for verification and handling
4638 	 */
4639 	if (tudr->OPT_length != 0) {
4640 		int error;
4641 
4642 		error = 0;
4643 		if (icmp_unitdata_opt_process(q, mp, &error,
4644 		    (void *)pktinfop) < 0) {
4645 			/* failure */
4646 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4647 			icmp_ud_err(q, mp, error);
4648 			return;
4649 		}
4650 		ASSERT(error == 0);
4651 		/*
4652 		 * Note: Success in processing options.
4653 		 * mp option buffer represented by
4654 		 * OPT_length/offset now potentially modified
4655 		 * and contain option setting results
4656 		 */
4657 	}
4658 
4659 	error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
4660 done:
4661 	if (error != 0) {
4662 		icmp_ud_err(q, mp, error);
4663 		return;
4664 	} else {
4665 		mp->b_cont = NULL;
4666 		freeb(mp);
4667 	}
4668 }
4669 
4670 
4671 /* ARGSUSED */
4672 static void
4673 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4674 {
4675 #ifdef DEBUG
4676 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4677 #endif
4678 	freemsg(mp);
4679 }
4680 
4681 static int
4682 raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
4683     ip4_pkt_t *pktinfop)
4684 {
4685 	ipha_t	*ipha;
4686 	size_t	ip_len;
4687 	icmp_t	*icmp = connp->conn_icmp;
4688 	icmp_stack_t *is = icmp->icmp_is;
4689 	int	ip_hdr_length;
4690 	ip_opt_info_t	optinfo;
4691 	uchar_t	ip_snd_opt[IP_MAX_OPT_LENGTH];
4692 	uint32_t ip_snd_opt_len = 0;
4693 	pid_t	cpid;
4694 	cred_t	*cr;
4695 
4696 	optinfo.ip_opt_flags = 0;
4697 	optinfo.ip_opt_ill_index = 0;
4698 
4699 	if (icmp->icmp_state == TS_UNBND) {
4700 		/* If a port has not been bound to the stream, fail. */
4701 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4702 		return (EPROTO);
4703 	}
4704 
4705 	if (v4dst == INADDR_ANY)
4706 		v4dst = htonl(INADDR_LOOPBACK);
4707 
4708 	/* Protocol 255 contains full IP headers */
4709 	if (icmp->icmp_hdrincl)
4710 		return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
4711 
4712 	rw_enter(&icmp->icmp_rwlock, RW_READER);
4713 
4714 	/*
4715 	 * Check if our saved options are valid; update if not.
4716 	 * TSOL Note: Since we are not in WRITER mode, ICMP packets
4717 	 * to different destination may require different labels,
4718 	 * or worse, ICMP packets to same IP address may require
4719 	 * different labels due to use of shared all-zones address.
4720 	 * We use conn_lock to ensure that lastdst, ip_snd_options,
4721 	 * and ip_snd_options_len are consistent for the current
4722 	 * destination and are updated atomically.
4723 	 */
4724 	mutex_enter(&connp->conn_lock);
4725 	if (is_system_labeled()) {
4726 
4727 		/*
4728 		 * Recompute the Trusted Extensions security label if we're not
4729 		 * going to the same destination as last time or the cred
4730 		 * attached to the received mblk changed.
4731 		 */
4732 		cr = msg_getcred(mp, &cpid);
4733 		if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4734 		    V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst ||
4735 		    cr != icmp->icmp_last_cred) {
4736 			int error = icmp_update_label(icmp, mp, v4dst);
4737 			if (error != 0) {
4738 				mutex_exit(&connp->conn_lock);
4739 				rw_exit(&icmp->icmp_rwlock);
4740 				return (error);
4741 			}
4742 		}
4743 		/*
4744 		 * Apply credentials with modified security label if they
4745 		 * exist. icmp_update_label() may have generated these
4746 		 * credentials for packets to unlabeled remote nodes.
4747 		 */
4748 		if (icmp->icmp_effective_cred != NULL)
4749 			mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
4750 	}
4751 
4752 	if (icmp->icmp_ip_snd_options_len > 0) {
4753 		ip_snd_opt_len = icmp->icmp_ip_snd_options_len;
4754 		bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
4755 	}
4756 	mutex_exit(&connp->conn_lock);
4757 
4758 	/* Add an IP header */
4759 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + ip_snd_opt_len;
4760 	ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
4761 	if ((uchar_t *)ipha < mp->b_datap->db_base ||
4762 	    mp->b_datap->db_ref != 1 ||
4763 	    !OK_32PTR(ipha)) {
4764 		mblk_t	*mp1;
4765 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
4766 		    BPRI_LO))) {
4767 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4768 			rw_exit(&icmp->icmp_rwlock);
4769 			return (ENOMEM);
4770 		}
4771 		mp1->b_cont = mp;
4772 		ipha = (ipha_t *)mp1->b_datap->db_lim;
4773 		mp1->b_wptr = (uchar_t *)ipha;
4774 		ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
4775 		mp = mp1;
4776 	}
4777 #ifdef	_BIG_ENDIAN
4778 	/* Set version, header length, and tos */
4779 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4780 	    ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
4781 	    icmp->icmp_type_of_service);
4782 	/* Set ttl and protocol */
4783 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
4784 #else
4785 	/* Set version, header length, and tos */
4786 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4787 	    ((icmp->icmp_type_of_service << 8) |
4788 	    ((IP_VERSION << 4) | (ip_hdr_length>>2)));
4789 	/* Set ttl and protocol */
4790 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
4791 #endif
4792 	if (pktinfop->ip4_addr != INADDR_ANY) {
4793 		ipha->ipha_src = pktinfop->ip4_addr;
4794 		optinfo.ip_opt_flags = IP_VERIFY_SRC;
4795 	} else {
4796 
4797 		/*
4798 		 * Copy our address into the packet.  If this is zero,
4799 		 * ip will fill in the real source address.
4800 		 */
4801 		IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
4802 	}
4803 
4804 	ipha->ipha_fragment_offset_and_flags = 0;
4805 
4806 	if (pktinfop->ip4_ill_index != 0) {
4807 		optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4808 	}
4809 
4810 
4811 	/*
4812 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4813 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4814 	 * tell IP that the application has sent a complete IP header and not
4815 	 * to compute the transport checksum nor change the DF flag.
4816 	 */
4817 	ipha->ipha_ident = IP_HDR_INCLUDED;
4818 
4819 	/* Finish common formatting of the packet. */
4820 	mp->b_rptr = (uchar_t *)ipha;
4821 
4822 	ip_len = mp->b_wptr - (uchar_t *)ipha;
4823 	if (mp->b_cont != NULL)
4824 		ip_len += msgdsize(mp->b_cont);
4825 
4826 	/*
4827 	 * Set the length into the IP header.
4828 	 * If the length is greater than the maximum allowed by IP,
4829 	 * then free the message and return. Do not try and send it
4830 	 * as this can cause problems in layers below.
4831 	 */
4832 	if (ip_len > IP_MAXPACKET) {
4833 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4834 		rw_exit(&icmp->icmp_rwlock);
4835 		return (EMSGSIZE);
4836 	}
4837 	ipha->ipha_length = htons((uint16_t)ip_len);
4838 	/*
4839 	 * Copy in the destination address request
4840 	 */
4841 	ipha->ipha_dst = v4dst;
4842 
4843 	/*
4844 	 * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
4845 	 */
4846 	if (CLASSD(v4dst))
4847 		ipha->ipha_ttl = icmp->icmp_multicast_ttl;
4848 
4849 	/* Copy in options if any */
4850 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4851 		bcopy(ip_snd_opt,
4852 		    &ipha[1], ip_snd_opt_len);
4853 		/*
4854 		 * Massage source route putting first source route in ipha_dst.
4855 		 * Ignore the destination in the T_unitdata_req.
4856 		 */
4857 		(void) ip_massage_options(ipha, is->is_netstack);
4858 	}
4859 
4860 	rw_exit(&icmp->icmp_rwlock);
4861 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4862 
4863 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4864 	return (0);
4865 }
4866 
4867 static int
4868 icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
4869 {
4870 	int err;
4871 	uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
4872 	icmp_stack_t		*is = icmp->icmp_is;
4873 	conn_t			*connp = icmp->icmp_connp;
4874 	cred_t	*cred;
4875 	cred_t	*msg_cred;
4876 	cred_t	*effective_cred;
4877 
4878 	/*
4879 	 * All Solaris components should pass a db_credp
4880 	 * for this message, hence we ASSERT.
4881 	 * On production kernels we return an error to be robust against
4882 	 * random streams modules sitting on top of us.
4883 	 */
4884 	cred = msg_cred = msg_getcred(mp, NULL);
4885 	ASSERT(cred != NULL);
4886 	if (cred == NULL)
4887 		return (EINVAL);
4888 
4889 	/*
4890 	 * Verify the destination is allowed to receive packets at
4891 	 * the security label of the message data. check_dest()
4892 	 * may create a new effective cred for this message
4893 	 * with a modified label or label flags.
4894 	 */
4895 	if ((err = tsol_check_dest(cred, dst, IPV6_VERSION,
4896 	    connp->conn_mac_mode, &effective_cred)) != 0)
4897 		goto done;
4898 	if (effective_cred != NULL)
4899 		cred = effective_cred;
4900 
4901 	/*
4902 	 * Calculate the security label to be placed in the text
4903 	 * of the message (if any).
4904 	 */
4905 	if ((err = tsol_compute_label_v6(cred, dst, opt_storage,
4906 	    is->is_netstack->netstack_ip)) != 0)
4907 		goto done;
4908 
4909 	/*
4910 	 * Insert the security label in the cached ip options,
4911 	 * removing any old label that may exist.
4912 	 */
4913 	if ((err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
4914 	    &icmp->icmp_label_len_v6, opt_storage)) != 0)
4915 		goto done;
4916 
4917 	/*
4918 	 * Save the destination address and cred we used to generate
4919 	 * the security label text.
4920 	 */
4921 	icmp->icmp_v6lastdst = *dst;
4922 	if (cred != icmp->icmp_effective_cred) {
4923 		if (icmp->icmp_effective_cred != NULL)
4924 			crfree(icmp->icmp_effective_cred);
4925 		crhold(cred);
4926 		icmp->icmp_effective_cred = cred;
4927 	}
4928 
4929 	if (msg_cred != icmp->icmp_last_cred) {
4930 		if (icmp->icmp_last_cred != NULL)
4931 			crfree(icmp->icmp_last_cred);
4932 		crhold(msg_cred);
4933 		icmp->icmp_last_cred = msg_cred;
4934 	}
4935 
4936 done:
4937 	if (effective_cred != NULL)
4938 		crfree(effective_cred);
4939 
4940 	if (err != 0) {
4941 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4942 		DTRACE_PROBE4(
4943 		    tx__ip__log__drop__updatelabel__icmp6,
4944 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4945 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4946 		return (err);
4947 	}
4948 	return (0);
4949 }
4950 
4951 /*
4952  * raw_ip_send_data_v6():
4953  * Assumes that icmp_wput did some sanity checking on the destination
4954  * address, but that the label may not yet be correct.
4955  */
4956 static int
4957 raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
4958     ip6_pkt_t *ipp)
4959 {
4960 	ip6_t			*ip6h;
4961 	ip6i_t			*ip6i;	/* mp->b_rptr even if no ip6i_t */
4962 	int			ip_hdr_len = IPV6_HDR_LEN;
4963 	size_t			ip_len;
4964 	icmp_t			*icmp = connp->conn_icmp;
4965 	icmp_stack_t		*is = icmp->icmp_is;
4966 	ip6_pkt_t		*tipp;
4967 	ip6_hbh_t		*hopoptsptr = NULL;
4968 	uint_t			hopoptslen = 0;
4969 	uint32_t		csum = 0;
4970 	uint_t			ignore = 0;
4971 	uint_t			option_exists = 0, is_sticky = 0;
4972 	uint8_t			*cp;
4973 	uint8_t			*nxthdr_ptr;
4974 	in6_addr_t		ip6_dst;
4975 	pid_t			cpid;
4976 	cred_t			*cr;
4977 
4978 	rw_enter(&icmp->icmp_rwlock, RW_READER);
4979 
4980 	/*
4981 	 * If the local address is a mapped address return
4982 	 * an error.
4983 	 * It would be possible to send an IPv6 packet but the
4984 	 * response would never make it back to the application
4985 	 * since it is bound to a mapped address.
4986 	 */
4987 	if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
4988 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4989 		rw_exit(&icmp->icmp_rwlock);
4990 		return (EADDRNOTAVAIL);
4991 	}
4992 
4993 	ignore = ipp->ipp_sticky_ignored;
4994 	if (sin6->sin6_scope_id != 0 &&
4995 	    IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
4996 		/*
4997 		 * IPPF_SCOPE_ID is special.  It's neither a sticky
4998 		 * option nor ancillary data.  It needs to be
4999 		 * explicitly set in options_exists.
5000 		 */
5001 		option_exists |= IPPF_SCOPE_ID;
5002 	}
5003 
5004 	/*
5005 	 * Compute the destination address
5006 	 */
5007 	ip6_dst = sin6->sin6_addr;
5008 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5009 		ip6_dst = ipv6_loopback;
5010 
5011 	/*
5012 	 * Check if our saved options are valid; update if not.
5013 	 * TSOL Note: Since we are not in WRITER mode, ICMP packets
5014 	 * to different destination may require different labels,
5015 	 * or worse, ICMP packets to same IP address may require
5016 	 * different labels due to use of shared all-zones address.
5017 	 * We use conn_lock to ensure that lastdst, sticky ipp_hopopts,
5018 	 * and sticky ipp_hopoptslen are consistent for the current
5019 	 * destination and are updated atomically.
5020 	 */
5021 	mutex_enter(&connp->conn_lock);
5022 	if (is_system_labeled()) {
5023 		/*
5024 		 * Recompute the Trusted Extensions security label if we're
5025 		 * not going to the same destination as last time or the cred
5026 		 * attached to the received mblk changed. This is done in a
5027 		 * separate routine to avoid blowing up our stack here.
5028 		 */
5029 		cr = msg_getcred(mp, &cpid);
5030 		if (!IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) ||
5031 		    cr != icmp->icmp_last_cred) {
5032 			int error = 0;
5033 			error = icmp_update_label_v6(icmp, mp, &ip6_dst);
5034 			if (error != 0) {
5035 				mutex_exit(&connp->conn_lock);
5036 				rw_exit(&icmp->icmp_rwlock);
5037 				return (error);
5038 			}
5039 		}
5040 
5041 		/*
5042 		 * Apply credentials with modified security label if they exist.
5043 		 * icmp_update_label_v6() may have generated these credentials
5044 		 * for MAC-Exempt connections.
5045 		 */
5046 		if (icmp->icmp_effective_cred != NULL)
5047 			mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
5048 	}
5049 
5050 	/*
5051 	 * If there's a security label here, then we ignore any options the
5052 	 * user may try to set.  We keep the peer's label as a hidden sticky
5053 	 * option.
5054 	 */
5055 	if (icmp->icmp_label_len_v6 > 0) {
5056 		ignore &= ~IPPF_HOPOPTS;
5057 		ipp->ipp_fields &= ~IPPF_HOPOPTS;
5058 	}
5059 
5060 	if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
5061 	    (ipp->ipp_fields == 0)) {
5062 		/* No sticky options nor ancillary data. */
5063 		mutex_exit(&connp->conn_lock);
5064 		goto no_options;
5065 	}
5066 
5067 	/*
5068 	 * Go through the options figuring out where each is going to
5069 	 * come from and build two masks.  The first mask indicates if
5070 	 * the option exists at all.  The second mask indicates if the
5071 	 * option is sticky or ancillary.
5072 	 */
5073 	if (!(ignore & IPPF_HOPOPTS)) {
5074 		if (ipp->ipp_fields & IPPF_HOPOPTS) {
5075 			option_exists |= IPPF_HOPOPTS;
5076 			ip_hdr_len += ipp->ipp_hopoptslen;
5077 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
5078 			option_exists |= IPPF_HOPOPTS;
5079 			is_sticky |= IPPF_HOPOPTS;
5080 			ASSERT(icmp->icmp_sticky_ipp.ipp_hopoptslen != 0);
5081 			hopoptsptr = kmem_alloc(
5082 			    icmp->icmp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP);
5083 			if (hopoptsptr == NULL) {
5084 				mutex_exit(&connp->conn_lock);
5085 				rw_exit(&icmp->icmp_rwlock);
5086 				return (ENOMEM);
5087 			}
5088 			hopoptslen = icmp->icmp_sticky_ipp.ipp_hopoptslen;
5089 			bcopy(icmp->icmp_sticky_ipp.ipp_hopopts, hopoptsptr,
5090 			    hopoptslen);
5091 			ip_hdr_len += hopoptslen;
5092 		}
5093 	}
5094 	mutex_exit(&connp->conn_lock);
5095 
5096 	if (!(ignore & IPPF_RTHDR)) {
5097 		if (ipp->ipp_fields & IPPF_RTHDR) {
5098 			option_exists |= IPPF_RTHDR;
5099 			ip_hdr_len += ipp->ipp_rthdrlen;
5100 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
5101 			option_exists |= IPPF_RTHDR;
5102 			is_sticky |= IPPF_RTHDR;
5103 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
5104 		}
5105 	}
5106 
5107 	if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
5108 		/*
5109 		 * Need to have a router header to use these.
5110 		 */
5111 		if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
5112 			option_exists |= IPPF_RTDSTOPTS;
5113 			ip_hdr_len += ipp->ipp_rtdstoptslen;
5114 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
5115 			option_exists |= IPPF_RTDSTOPTS;
5116 			is_sticky |= IPPF_RTDSTOPTS;
5117 			ip_hdr_len +=
5118 			    icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
5119 		}
5120 	}
5121 
5122 	if (!(ignore & IPPF_DSTOPTS)) {
5123 		if (ipp->ipp_fields & IPPF_DSTOPTS) {
5124 			option_exists |= IPPF_DSTOPTS;
5125 			ip_hdr_len += ipp->ipp_dstoptslen;
5126 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
5127 			option_exists |= IPPF_DSTOPTS;
5128 			is_sticky |= IPPF_DSTOPTS;
5129 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
5130 		}
5131 	}
5132 
5133 	if (!(ignore & IPPF_IFINDEX)) {
5134 		if (ipp->ipp_fields & IPPF_IFINDEX) {
5135 			option_exists |= IPPF_IFINDEX;
5136 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
5137 			option_exists |= IPPF_IFINDEX;
5138 			is_sticky |= IPPF_IFINDEX;
5139 		}
5140 	}
5141 
5142 	if (!(ignore & IPPF_ADDR)) {
5143 		if (ipp->ipp_fields & IPPF_ADDR) {
5144 			option_exists |= IPPF_ADDR;
5145 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
5146 			option_exists |= IPPF_ADDR;
5147 			is_sticky |= IPPF_ADDR;
5148 		}
5149 	}
5150 
5151 	if (!(ignore & IPPF_DONTFRAG)) {
5152 		if (ipp->ipp_fields & IPPF_DONTFRAG) {
5153 			option_exists |= IPPF_DONTFRAG;
5154 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
5155 			option_exists |= IPPF_DONTFRAG;
5156 			is_sticky |= IPPF_DONTFRAG;
5157 		}
5158 	}
5159 
5160 	if (!(ignore & IPPF_USE_MIN_MTU)) {
5161 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
5162 			option_exists |= IPPF_USE_MIN_MTU;
5163 		} else if (icmp->icmp_sticky_ipp.ipp_fields &
5164 		    IPPF_USE_MIN_MTU) {
5165 			option_exists |= IPPF_USE_MIN_MTU;
5166 			is_sticky |= IPPF_USE_MIN_MTU;
5167 		}
5168 	}
5169 
5170 	if (!(ignore & IPPF_NEXTHOP)) {
5171 		if (ipp->ipp_fields & IPPF_NEXTHOP) {
5172 			option_exists |= IPPF_NEXTHOP;
5173 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
5174 			option_exists |= IPPF_NEXTHOP;
5175 			is_sticky |= IPPF_NEXTHOP;
5176 		}
5177 	}
5178 
5179 	if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
5180 		option_exists |= IPPF_HOPLIMIT;
5181 	/* IPV6_HOPLIMIT can never be sticky */
5182 	ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
5183 
5184 	if (!(ignore & IPPF_UNICAST_HOPS) &&
5185 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
5186 		option_exists |= IPPF_UNICAST_HOPS;
5187 		is_sticky |= IPPF_UNICAST_HOPS;
5188 	}
5189 
5190 	if (!(ignore & IPPF_MULTICAST_HOPS) &&
5191 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
5192 		option_exists |= IPPF_MULTICAST_HOPS;
5193 		is_sticky |= IPPF_MULTICAST_HOPS;
5194 	}
5195 
5196 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
5197 		/* This is a sticky socket option only */
5198 		option_exists |= IPPF_NO_CKSUM;
5199 		is_sticky |= IPPF_NO_CKSUM;
5200 	}
5201 
5202 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
5203 		/* This is a sticky socket option only */
5204 		option_exists |= IPPF_RAW_CKSUM;
5205 		is_sticky |= IPPF_RAW_CKSUM;
5206 	}
5207 
5208 	if (!(ignore & IPPF_TCLASS)) {
5209 		if (ipp->ipp_fields & IPPF_TCLASS) {
5210 			option_exists |= IPPF_TCLASS;
5211 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
5212 			option_exists |= IPPF_TCLASS;
5213 			is_sticky |= IPPF_TCLASS;
5214 		}
5215 	}
5216 
5217 no_options:
5218 
5219 	/*
5220 	 * If any options carried in the ip6i_t were specified, we
5221 	 * need to account for the ip6i_t in the data we'll be sending
5222 	 * down.
5223 	 */
5224 	if (option_exists & IPPF_HAS_IP6I)
5225 		ip_hdr_len += sizeof (ip6i_t);
5226 
5227 	/* check/fix buffer config, setup pointers into it */
5228 	ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
5229 	if ((mp->b_datap->db_ref != 1) ||
5230 	    ((unsigned char *)ip6h < mp->b_datap->db_base) ||
5231 	    !OK_32PTR(ip6h)) {
5232 		mblk_t	*mp1;
5233 
5234 		/* Try to get everything in a single mblk next time */
5235 		if (ip_hdr_len > icmp->icmp_max_hdr_len) {
5236 			icmp->icmp_max_hdr_len = ip_hdr_len;
5237 
5238 			(void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
5239 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
5240 		}
5241 		mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
5242 		if (!mp1) {
5243 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5244 			kmem_free(hopoptsptr, hopoptslen);
5245 			rw_exit(&icmp->icmp_rwlock);
5246 			return (ENOMEM);
5247 		}
5248 		mp1->b_cont = mp;
5249 		mp1->b_wptr = mp1->b_datap->db_lim;
5250 		ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
5251 		mp = mp1;
5252 	}
5253 	mp->b_rptr = (unsigned char *)ip6h;
5254 	ip6i = (ip6i_t *)ip6h;
5255 
5256 #define	ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
5257 	if (option_exists & IPPF_HAS_IP6I) {
5258 		ip6h = (ip6_t *)&ip6i[1];
5259 		ip6i->ip6i_flags = 0;
5260 		ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5261 
5262 		/* sin6_scope_id takes precendence over IPPF_IFINDEX */
5263 		if (option_exists & IPPF_SCOPE_ID) {
5264 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5265 			ip6i->ip6i_ifindex = sin6->sin6_scope_id;
5266 		} else if (option_exists & IPPF_IFINDEX) {
5267 			tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
5268 			ASSERT(tipp->ipp_ifindex != 0);
5269 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5270 			ip6i->ip6i_ifindex = tipp->ipp_ifindex;
5271 		}
5272 
5273 		if (option_exists & IPPF_RAW_CKSUM) {
5274 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
5275 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
5276 		}
5277 
5278 		if (option_exists & IPPF_NO_CKSUM) {
5279 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
5280 		}
5281 
5282 		if (option_exists & IPPF_ADDR) {
5283 			/*
5284 			 * Enable per-packet source address verification if
5285 			 * IPV6_PKTINFO specified the source address.
5286 			 * ip6_src is set in the transport's _wput function.
5287 			 */
5288 			ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
5289 		}
5290 
5291 		if (option_exists & IPPF_DONTFRAG) {
5292 			ip6i->ip6i_flags |= IP6I_DONTFRAG;
5293 		}
5294 
5295 		if (option_exists & IPPF_USE_MIN_MTU) {
5296 			ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
5297 			    ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
5298 		}
5299 
5300 		if (option_exists & IPPF_NEXTHOP) {
5301 			tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
5302 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
5303 			ip6i->ip6i_flags |= IP6I_NEXTHOP;
5304 			ip6i->ip6i_nexthop = tipp->ipp_nexthop;
5305 		}
5306 
5307 		/*
5308 		 * tell IP this is an ip6i_t private header
5309 		 */
5310 		ip6i->ip6i_nxt = IPPROTO_RAW;
5311 	}
5312 
5313 	/* Initialize IPv6 header */
5314 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5315 	bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
5316 
5317 	/* Set the hoplimit of the outgoing packet. */
5318 	if (option_exists & IPPF_HOPLIMIT) {
5319 		/* IPV6_HOPLIMIT ancillary data overrides all other settings. */
5320 		ip6h->ip6_hops = ipp->ipp_hoplimit;
5321 		ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5322 	} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
5323 		ip6h->ip6_hops = icmp->icmp_multicast_ttl;
5324 		if (option_exists & IPPF_MULTICAST_HOPS)
5325 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5326 	} else {
5327 		ip6h->ip6_hops = icmp->icmp_ttl;
5328 		if (option_exists & IPPF_UNICAST_HOPS)
5329 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5330 	}
5331 
5332 	if (option_exists & IPPF_ADDR) {
5333 		tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
5334 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
5335 		ip6h->ip6_src = tipp->ipp_addr;
5336 	} else {
5337 		/*
5338 		 * The source address was not set using IPV6_PKTINFO.
5339 		 * First look at the bound source.
5340 		 * If unspecified fallback to __sin6_src_id.
5341 		 */
5342 		ip6h->ip6_src = icmp->icmp_v6src;
5343 		if (sin6->__sin6_src_id != 0 &&
5344 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
5345 			ip_srcid_find_id(sin6->__sin6_src_id,
5346 			    &ip6h->ip6_src, icmp->icmp_zoneid,
5347 			    is->is_netstack);
5348 		}
5349 	}
5350 
5351 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
5352 	cp = (uint8_t *)&ip6h[1];
5353 
5354 	/*
5355 	 * Here's where we have to start stringing together
5356 	 * any extension headers in the right order:
5357 	 * Hop-by-hop, destination, routing, and final destination opts.
5358 	 */
5359 	if (option_exists & IPPF_HOPOPTS) {
5360 		/* Hop-by-hop options */
5361 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
5362 
5363 		*nxthdr_ptr = IPPROTO_HOPOPTS;
5364 		nxthdr_ptr = &hbh->ip6h_nxt;
5365 
5366 		if (hopoptslen == 0) {
5367 			tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
5368 			bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
5369 			cp += tipp->ipp_hopoptslen;
5370 		} else {
5371 			bcopy(hopoptsptr, cp, hopoptslen);
5372 			cp += hopoptslen;
5373 			kmem_free(hopoptsptr, hopoptslen);
5374 		}
5375 	}
5376 	/*
5377 	 * En-route destination options
5378 	 * Only do them if there's a routing header as well
5379 	 */
5380 	if (option_exists & IPPF_RTDSTOPTS) {
5381 		ip6_dest_t *dst = (ip6_dest_t *)cp;
5382 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
5383 
5384 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5385 		nxthdr_ptr = &dst->ip6d_nxt;
5386 
5387 		bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
5388 		cp += tipp->ipp_rtdstoptslen;
5389 	}
5390 	/*
5391 	 * Routing header next
5392 	 */
5393 	if (option_exists & IPPF_RTHDR) {
5394 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
5395 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
5396 
5397 		*nxthdr_ptr = IPPROTO_ROUTING;
5398 		nxthdr_ptr = &rt->ip6r_nxt;
5399 
5400 		bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
5401 		cp += tipp->ipp_rthdrlen;
5402 	}
5403 	/*
5404 	 * Do ultimate destination options
5405 	 */
5406 	if (option_exists & IPPF_DSTOPTS) {
5407 		ip6_dest_t *dest = (ip6_dest_t *)cp;
5408 		tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
5409 
5410 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5411 		nxthdr_ptr = &dest->ip6d_nxt;
5412 
5413 		bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
5414 		cp += tipp->ipp_dstoptslen;
5415 	}
5416 
5417 	/*
5418 	 * Now set the last header pointer to the proto passed in
5419 	 */
5420 	ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
5421 	*nxthdr_ptr = icmp->icmp_proto;
5422 
5423 	/*
5424 	 * Copy in the destination address
5425 	 */
5426 	ip6h->ip6_dst = ip6_dst;
5427 
5428 	ip6h->ip6_vcf =
5429 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
5430 	    (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
5431 
5432 	if (option_exists & IPPF_TCLASS) {
5433 		tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
5434 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
5435 		    tipp->ipp_tclass);
5436 	}
5437 	if (option_exists & IPPF_RTHDR) {
5438 		ip6_rthdr_t	*rth;
5439 
5440 		/*
5441 		 * Perform any processing needed for source routing.
5442 		 * We know that all extension headers will be in the same mblk
5443 		 * as the IPv6 header.
5444 		 */
5445 		rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
5446 		if (rth != NULL && rth->ip6r_segleft != 0) {
5447 			if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
5448 				/*
5449 				 * Drop packet - only support Type 0 routing.
5450 				 * Notify the application as well.
5451 				 */
5452 				BUMP_MIB(&is->is_rawip_mib,
5453 				    rawipOutErrors);
5454 				rw_exit(&icmp->icmp_rwlock);
5455 				return (EPROTO);
5456 			}
5457 			/*
5458 			 * rth->ip6r_len is twice the number of
5459 			 * addresses in the header
5460 			 */
5461 			if (rth->ip6r_len & 0x1) {
5462 				BUMP_MIB(&is->is_rawip_mib,
5463 				    rawipOutErrors);
5464 				rw_exit(&icmp->icmp_rwlock);
5465 				return (EPROTO);
5466 			}
5467 			/*
5468 			 * Shuffle the routing header and ip6_dst
5469 			 * addresses, and get the checksum difference
5470 			 * between the first hop (in ip6_dst) and
5471 			 * the destination (in the last routing hdr entry).
5472 			 */
5473 			csum = ip_massage_options_v6(ip6h, rth,
5474 			    is->is_netstack);
5475 			/*
5476 			 * Verify that the first hop isn't a mapped address.
5477 			 * Routers along the path need to do this verification
5478 			 * for subsequent hops.
5479 			 */
5480 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
5481 				BUMP_MIB(&is->is_rawip_mib,
5482 				    rawipOutErrors);
5483 				rw_exit(&icmp->icmp_rwlock);
5484 				return (EADDRNOTAVAIL);
5485 			}
5486 		}
5487 	}
5488 
5489 	ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
5490 	if (mp->b_cont != NULL)
5491 		ip_len += msgdsize(mp->b_cont);
5492 
5493 	/*
5494 	 * Set the length into the IP header.
5495 	 * If the length is greater than the maximum allowed by IP,
5496 	 * then free the message and return. Do not try and send it
5497 	 * as this can cause problems in layers below.
5498 	 */
5499 	if (ip_len > IP_MAXPACKET) {
5500 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5501 		rw_exit(&icmp->icmp_rwlock);
5502 		return (EMSGSIZE);
5503 	}
5504 	if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
5505 		uint_t	cksum_off;	/* From ip6i == mp->b_rptr */
5506 		uint16_t *cksum_ptr;
5507 		uint_t	ext_hdrs_len;
5508 
5509 		/* ICMPv6 must have an offset matching icmp6_cksum offset */
5510 		ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
5511 		    icmp->icmp_checksum_off == 2);
5512 
5513 		/*
5514 		 * We make it easy for IP to include our pseudo header
5515 		 * by putting our length in uh_checksum, modified (if
5516 		 * we have a routing header) by the checksum difference
5517 		 * between the ultimate destination and first hop addresses.
5518 		 * Note: ICMPv6 must always checksum the packet.
5519 		 */
5520 		cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
5521 		if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
5522 			if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
5523 				BUMP_MIB(&is->is_rawip_mib,
5524 				    rawipOutErrors);
5525 				freemsg(mp);
5526 				rw_exit(&icmp->icmp_rwlock);
5527 				return (0);
5528 			}
5529 			ip6i = (ip6i_t *)mp->b_rptr;
5530 			if (ip6i->ip6i_nxt == IPPROTO_RAW)
5531 				ip6h = (ip6_t *)&ip6i[1];
5532 			else
5533 				ip6h = (ip6_t *)ip6i;
5534 		}
5535 		/* Add payload length to checksum */
5536 		ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
5537 		    (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
5538 		csum += htons(ip_len - ext_hdrs_len);
5539 
5540 		cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
5541 		csum = (csum & 0xFFFF) + (csum >> 16);
5542 		*cksum_ptr = (uint16_t)csum;
5543 	}
5544 
5545 #ifdef _LITTLE_ENDIAN
5546 	ip_len = htons(ip_len);
5547 #endif
5548 	ip6h->ip6_plen = (uint16_t)ip_len;
5549 
5550 	/* We're done. Pass the packet to IP */
5551 	rw_exit(&icmp->icmp_rwlock);
5552 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
5553 	ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
5554 	return (0);
5555 }
5556 
5557 static void
5558 icmp_wput_other(queue_t *q, mblk_t *mp)
5559 {
5560 	uchar_t	*rptr = mp->b_rptr;
5561 	struct iocblk *iocp;
5562 #define	tudr ((struct T_unitdata_req *)rptr)
5563 	conn_t	*connp = Q_TO_CONN(q);
5564 	icmp_t	*icmp = connp->conn_icmp;
5565 	icmp_stack_t *is = icmp->icmp_is;
5566 	cred_t *cr;
5567 
5568 	switch (mp->b_datap->db_type) {
5569 	case M_PROTO:
5570 	case M_PCPROTO:
5571 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
5572 			/*
5573 			 * If the message does not contain a PRIM_type,
5574 			 * throw it away.
5575 			 */
5576 			freemsg(mp);
5577 			return;
5578 		}
5579 		switch (((union T_primitives *)rptr)->type) {
5580 		case T_ADDR_REQ:
5581 			icmp_addr_req(q, mp);
5582 			return;
5583 		case O_T_BIND_REQ:
5584 		case T_BIND_REQ:
5585 			icmp_tpi_bind(q, mp);
5586 			return;
5587 		case T_CONN_REQ:
5588 			icmp_tpi_connect(q, mp);
5589 			return;
5590 		case T_CAPABILITY_REQ:
5591 			icmp_capability_req(q, mp);
5592 			return;
5593 		case T_INFO_REQ:
5594 			icmp_info_req(q, mp);
5595 			return;
5596 		case T_UNITDATA_REQ:
5597 			/*
5598 			 * If a T_UNITDATA_REQ gets here, the address must
5599 			 * be bad.  Valid T_UNITDATA_REQs are found above
5600 			 * and break to below this switch.
5601 			 */
5602 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
5603 			return;
5604 		case T_UNBIND_REQ:
5605 			icmp_tpi_unbind(q, mp);
5606 			return;
5607 
5608 		case T_SVR4_OPTMGMT_REQ:
5609 			/*
5610 			 * All Solaris components should pass a db_credp
5611 			 * for this TPI message, hence we ASSERT.
5612 			 * But in case there is some other M_PROTO that looks
5613 			 * like a TPI message sent by some other kernel
5614 			 * component, we check and return an error.
5615 			 */
5616 			cr = msg_getcred(mp, NULL);
5617 			ASSERT(cr != NULL);
5618 			if (cr == NULL) {
5619 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
5620 				return;
5621 			}
5622 
5623 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
5624 			    cr)) {
5625 				/* Only IP can return anything meaningful */
5626 				(void) svr4_optcom_req(q, mp, cr,
5627 				    &icmp_opt_obj, B_TRUE);
5628 			}
5629 			return;
5630 
5631 		case T_OPTMGMT_REQ:
5632 			/*
5633 			 * All Solaris components should pass a db_credp
5634 			 * for this TPI message, hence we ASSERT.
5635 			 * But in case there is some other M_PROTO that looks
5636 			 * like a TPI message sent by some other kernel
5637 			 * component, we check and return an error.
5638 			 */
5639 			cr = msg_getcred(mp, NULL);
5640 			ASSERT(cr != NULL);
5641 			if (cr == NULL) {
5642 				icmp_err_ack(q, mp, TSYSERR, EINVAL);
5643 				return;
5644 			}
5645 			/* Only IP can return anything meaningful */
5646 			(void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
5647 			return;
5648 
5649 		case T_DISCON_REQ:
5650 			icmp_tpi_disconnect(q, mp);
5651 			return;
5652 
5653 		/* The following TPI message is not supported by icmp. */
5654 		case O_T_CONN_RES:
5655 		case T_CONN_RES:
5656 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
5657 			return;
5658 
5659 		/* The following 3 TPI requests are illegal for icmp. */
5660 		case T_DATA_REQ:
5661 		case T_EXDATA_REQ:
5662 		case T_ORDREL_REQ:
5663 			freemsg(mp);
5664 			(void) putctl1(RD(q), M_ERROR, EPROTO);
5665 			return;
5666 		default:
5667 			break;
5668 		}
5669 		break;
5670 	case M_IOCTL:
5671 		iocp = (struct iocblk *)mp->b_rptr;
5672 		switch (iocp->ioc_cmd) {
5673 		case TI_GETPEERNAME:
5674 			if (icmp->icmp_state != TS_DATA_XFER) {
5675 				/*
5676 				 * If a default destination address has not
5677 				 * been associated with the stream, then we
5678 				 * don't know the peer's name.
5679 				 */
5680 				iocp->ioc_error = ENOTCONN;
5681 		err_ret:;
5682 				iocp->ioc_count = 0;
5683 				mp->b_datap->db_type = M_IOCACK;
5684 				qreply(q, mp);
5685 				return;
5686 			}
5687 			/* FALLTHRU */
5688 		case TI_GETMYNAME:
5689 			/*
5690 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
5691 			 * need to copyin the user's strbuf structure.
5692 			 * Processing will continue in the M_IOCDATA case
5693 			 * below.
5694 			 */
5695 			mi_copyin(q, mp, NULL,
5696 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
5697 			return;
5698 		case ND_SET:
5699 			/* nd_getset performs the necessary error checking */
5700 		case ND_GET:
5701 			if (nd_getset(q, is->is_nd, mp)) {
5702 				qreply(q, mp);
5703 				return;
5704 			}
5705 			break;
5706 		case _SIOCSOCKFALLBACK:
5707 			/*
5708 			 * socket is falling back to be a
5709 			 * streams socket. Nothing  to do
5710 			 */
5711 			iocp->ioc_count = 0;
5712 			iocp->ioc_rval = 0;
5713 			qreply(q, mp);
5714 			return;
5715 		default:
5716 			break;
5717 		}
5718 		break;
5719 	case M_IOCDATA:
5720 		icmp_wput_iocdata(q, mp);
5721 		return;
5722 	default:
5723 		break;
5724 	}
5725 	ip_wput(q, mp);
5726 }
5727 
5728 /*
5729  * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
5730  * messages.
5731  */
5732 static void
5733 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
5734 {
5735 	mblk_t	*mp1;
5736 	STRUCT_HANDLE(strbuf, sb);
5737 	icmp_t	*icmp;
5738 	uint_t	addrlen;
5739 	uint_t	error;
5740 
5741 	/* Make sure it is one of ours. */
5742 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5743 	case TI_GETMYNAME:
5744 	case TI_GETPEERNAME:
5745 		break;
5746 	default:
5747 		icmp = Q_TO_ICMP(q);
5748 		ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
5749 		return;
5750 	}
5751 	switch (mi_copy_state(q, mp, &mp1)) {
5752 	case -1:
5753 		return;
5754 	case MI_COPY_CASE(MI_COPY_IN, 1):
5755 		break;
5756 	case MI_COPY_CASE(MI_COPY_OUT, 1):
5757 		/*
5758 		 * The address has been copied out, so now
5759 		 * copyout the strbuf.
5760 		 */
5761 		mi_copyout(q, mp);
5762 		return;
5763 	case MI_COPY_CASE(MI_COPY_OUT, 2):
5764 		/*
5765 		 * The address and strbuf have been copied out.
5766 		 * We're done, so just acknowledge the original
5767 		 * M_IOCTL.
5768 		 */
5769 		mi_copy_done(q, mp, 0);
5770 		return;
5771 	default:
5772 		/*
5773 		 * Something strange has happened, so acknowledge
5774 		 * the original M_IOCTL with an EPROTO error.
5775 		 */
5776 		mi_copy_done(q, mp, EPROTO);
5777 		return;
5778 	}
5779 	/*
5780 	 * Now we have the strbuf structure for TI_GETMYNAME
5781 	 * and TI_GETPEERNAME.  Next we copyout the requested
5782 	 * address and then we'll copyout the strbuf.
5783 	 */
5784 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5785 	    (void *)mp1->b_rptr);
5786 	icmp = Q_TO_ICMP(q);
5787 	if (icmp->icmp_family == AF_INET)
5788 		addrlen = sizeof (sin_t);
5789 	else
5790 		addrlen = sizeof (sin6_t);
5791 
5792 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
5793 		mi_copy_done(q, mp, EINVAL);
5794 		return;
5795 	}
5796 
5797 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5798 
5799 	if (mp1 == NULL)
5800 		return;
5801 
5802 	rw_enter(&icmp->icmp_rwlock, RW_READER);
5803 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5804 	case TI_GETMYNAME:
5805 		error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
5806 		    &addrlen);
5807 		break;
5808 	case TI_GETPEERNAME:
5809 		error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
5810 		    &addrlen);
5811 		break;
5812 	}
5813 	rw_exit(&icmp->icmp_rwlock);
5814 
5815 	if (error != 0) {
5816 		mi_copy_done(q, mp, error);
5817 	} else {
5818 		mp1->b_wptr += addrlen;
5819 		STRUCT_FSET(sb, len, addrlen);
5820 
5821 		/* Copy out the address */
5822 		mi_copyout(q, mp);
5823 	}
5824 }
5825 
5826 static int
5827 icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
5828     void *thisdg_attrs)
5829 {
5830 	struct T_unitdata_req *udreqp;
5831 	int is_absreq_failure;
5832 	cred_t *cr;
5833 
5834 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
5835 	*errorp = 0;
5836 
5837 	/*
5838 	 * All Solaris components should pass a db_credp
5839 	 * for this TPI message, hence we ASSERT.
5840 	 * But in case there is some other M_PROTO that looks
5841 	 * like a TPI message sent by some other kernel
5842 	 * component, we check and return an error.
5843 	 */
5844 	cr = msg_getcred(mp, NULL);
5845 	ASSERT(cr != NULL);
5846 	if (cr == NULL)
5847 		return (-1);
5848 
5849 	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
5850 	    udreqp->OPT_offset, cr, &icmp_opt_obj,
5851 	    thisdg_attrs, &is_absreq_failure);
5852 
5853 	if (*errorp != 0) {
5854 		/*
5855 		 * Note: No special action needed in this
5856 		 * module for "is_absreq_failure"
5857 		 */
5858 		return (-1);		/* failure */
5859 	}
5860 	ASSERT(is_absreq_failure == 0);
5861 	return (0);	/* success */
5862 }
5863 
5864 void
5865 icmp_ddi_g_init(void)
5866 {
5867 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5868 	    icmp_opt_obj.odb_opt_arr_cnt);
5869 
5870 	/*
5871 	 * We want to be informed each time a stack is created or
5872 	 * destroyed in the kernel, so we can maintain the
5873 	 * set of icmp_stack_t's.
5874 	 */
5875 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5876 }
5877 
5878 void
5879 icmp_ddi_g_destroy(void)
5880 {
5881 	netstack_unregister(NS_ICMP);
5882 }
5883 
5884 #define	INET_NAME	"ip"
5885 
5886 /*
5887  * Initialize the ICMP stack instance.
5888  */
5889 static void *
5890 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5891 {
5892 	icmp_stack_t	*is;
5893 	icmpparam_t	*pa;
5894 	int		error = 0;
5895 	major_t		major;
5896 
5897 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5898 	is->is_netstack = ns;
5899 
5900 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
5901 	is->is_param_arr = pa;
5902 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
5903 
5904 	(void) icmp_param_register(&is->is_nd,
5905 	    is->is_param_arr, A_CNT(icmp_param_arr));
5906 	is->is_ksp = rawip_kstat_init(stackid);
5907 
5908 	major = mod_name_to_major(INET_NAME);
5909 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
5910 	ASSERT(error == 0);
5911 	return (is);
5912 }
5913 
5914 /*
5915  * Free the ICMP stack instance.
5916  */
5917 static void
5918 rawip_stack_fini(netstackid_t stackid, void *arg)
5919 {
5920 	icmp_stack_t *is = (icmp_stack_t *)arg;
5921 
5922 	nd_free(&is->is_nd);
5923 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
5924 	is->is_param_arr = NULL;
5925 
5926 	rawip_kstat_fini(stackid, is->is_ksp);
5927 	is->is_ksp = NULL;
5928 	ldi_ident_release(is->is_ldi_ident);
5929 	kmem_free(is, sizeof (*is));
5930 }
5931 
5932 static void *
5933 rawip_kstat_init(netstackid_t stackid) {
5934 	kstat_t	*ksp;
5935 
5936 	rawip_named_kstat_t template = {
5937 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5938 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5939 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5940 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5941 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5942 	};
5943 
5944 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5945 					KSTAT_TYPE_NAMED,
5946 					NUM_OF_FIELDS(rawip_named_kstat_t),
5947 					0, stackid);
5948 	if (ksp == NULL || ksp->ks_data == NULL)
5949 		return (NULL);
5950 
5951 	bcopy(&template, ksp->ks_data, sizeof (template));
5952 	ksp->ks_update = rawip_kstat_update;
5953 	ksp->ks_private = (void *)(uintptr_t)stackid;
5954 
5955 	kstat_install(ksp);
5956 	return (ksp);
5957 }
5958 
5959 static void
5960 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5961 {
5962 	if (ksp != NULL) {
5963 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5964 		kstat_delete_netstack(ksp, stackid);
5965 	}
5966 }
5967 
5968 static int
5969 rawip_kstat_update(kstat_t *ksp, int rw)
5970 {
5971 	rawip_named_kstat_t *rawipkp;
5972 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5973 	netstack_t	*ns;
5974 	icmp_stack_t	*is;
5975 
5976 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5977 		return (EIO);
5978 
5979 	if (rw == KSTAT_WRITE)
5980 		return (EACCES);
5981 
5982 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5983 
5984 	ns = netstack_find_by_stackid(stackid);
5985 	if (ns == NULL)
5986 		return (-1);
5987 	is = ns->netstack_icmp;
5988 	if (is == NULL) {
5989 		netstack_rele(ns);
5990 		return (-1);
5991 	}
5992 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5993 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5994 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5995 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5996 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5997 	netstack_rele(ns);
5998 	return (0);
5999 }
6000 
6001 /* ARGSUSED */
6002 int
6003 rawip_accept(sock_lower_handle_t lproto_handle,
6004     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
6005     cred_t *cr)
6006 {
6007 	return (EOPNOTSUPP);
6008 }
6009 
6010 /* ARGSUSED */
6011 int
6012 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6013     socklen_t len, cred_t *cr)
6014 {
6015 	conn_t  *connp = (conn_t *)proto_handle;
6016 	int error;
6017 
6018 	/* All Solaris components should pass a cred for this operation. */
6019 	ASSERT(cr != NULL);
6020 
6021 	/* Binding to a NULL address really means unbind */
6022 	if (sa == NULL)
6023 		error = rawip_do_unbind(connp);
6024 	else
6025 		error = rawip_do_bind(connp, sa, len);
6026 
6027 	if (error < 0) {
6028 		if (error == -TOUTSTATE)
6029 			error = EINVAL;
6030 		else
6031 			error = proto_tlitosyserr(-error);
6032 	}
6033 	return (error);
6034 }
6035 
6036 static int
6037 rawip_implicit_bind(conn_t *connp)
6038 {
6039 	sin6_t sin6addr;
6040 	sin_t *sin;
6041 	sin6_t *sin6;
6042 	socklen_t len;
6043 	int error;
6044 
6045 	if (connp->conn_icmp->icmp_family == AF_INET) {
6046 		len = sizeof (struct sockaddr_in);
6047 		sin = (sin_t *)&sin6addr;
6048 		*sin = sin_null;
6049 		sin->sin_family = AF_INET;
6050 		sin->sin_addr.s_addr = INADDR_ANY;
6051 	} else {
6052 		ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
6053 		len = sizeof (sin6_t);
6054 		sin6 = (sin6_t *)&sin6addr;
6055 		*sin6 = sin6_null;
6056 		sin6->sin6_family = AF_INET6;
6057 		V6_SET_ZERO(sin6->sin6_addr);
6058 	}
6059 
6060 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
6061 
6062 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
6063 }
6064 
6065 static int
6066 rawip_unbind(conn_t *connp)
6067 {
6068 	int error;
6069 
6070 	error = rawip_do_unbind(connp);
6071 	if (error < 0) {
6072 		error = proto_tlitosyserr(-error);
6073 	}
6074 	return (error);
6075 }
6076 
6077 /* ARGSUSED */
6078 int
6079 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
6080 {
6081 	return (EOPNOTSUPP);
6082 }
6083 
6084 /* ARGSUSED */
6085 int
6086 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
6087     socklen_t len, sock_connid_t *id, cred_t *cr)
6088 {
6089 	conn_t	*connp = (conn_t *)proto_handle;
6090 	icmp_t *icmp = connp->conn_icmp;
6091 	int	error;
6092 	boolean_t did_bind = B_FALSE;
6093 
6094 	/* All Solaris components should pass a cred for this operation. */
6095 	ASSERT(cr != NULL);
6096 
6097 	if (sa == NULL) {
6098 		/*
6099 		 * Disconnect
6100 		 * Make sure we are connected
6101 		 */
6102 		if (icmp->icmp_state != TS_DATA_XFER)
6103 			return (EINVAL);
6104 
6105 		error = icmp_disconnect(connp);
6106 		return (error);
6107 	}
6108 
6109 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
6110 	if (error != 0)
6111 		return (error);
6112 
6113 	/* do an implicit bind if necessary */
6114 	if (icmp->icmp_state == TS_UNBND) {
6115 		error = rawip_implicit_bind(connp);
6116 		/*
6117 		 * We could be racing with an actual bind, in which case
6118 		 * we would see EPROTO. We cross our fingers and try
6119 		 * to connect.
6120 		 */
6121 		if (!(error == 0 || error == EPROTO))
6122 			return (error);
6123 		did_bind = B_TRUE;
6124 	}
6125 
6126 	/*
6127 	 * set SO_DGRAM_ERRIND
6128 	 */
6129 	icmp->icmp_dgram_errind = B_TRUE;
6130 
6131 	error = rawip_do_connect(connp, sa, len, cr);
6132 
6133 	if (error != 0 && did_bind) {
6134 		int unbind_err;
6135 
6136 		unbind_err = rawip_unbind(connp);
6137 		ASSERT(unbind_err == 0);
6138 	}
6139 
6140 	if (error == 0) {
6141 		*id = 0;
6142 		(*connp->conn_upcalls->su_connected)
6143 		    (connp->conn_upper_handle, 0, NULL, -1);
6144 	} else if (error < 0) {
6145 		error = proto_tlitosyserr(-error);
6146 	}
6147 	return (error);
6148 }
6149 
6150 /* ARGSUSED */
6151 int
6152 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
6153     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
6154 {
6155 	conn_t  *connp = (conn_t *)proto_handle;
6156 	icmp_t	*icmp;
6157 	struct T_capability_ack tca;
6158 	struct sockaddr_in6 laddr, faddr;
6159 	socklen_t laddrlen, faddrlen;
6160 	short opts;
6161 	struct stroptions *stropt;
6162 	mblk_t *stropt_mp;
6163 	int error;
6164 
6165 	icmp = connp->conn_icmp;
6166 
6167 	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
6168 
6169 	/*
6170 	 * setup the fallback stream that was allocated
6171 	 */
6172 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
6173 	connp->conn_minor_arena = WR(q)->q_ptr;
6174 
6175 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
6176 
6177 	WR(q)->q_qinfo = &icmpwinit;
6178 
6179 	connp->conn_rq = RD(q);
6180 	connp->conn_wq = WR(q);
6181 
6182 	/* Notify stream head about options before sending up data */
6183 	stropt_mp->b_datap->db_type = M_SETOPTS;
6184 	stropt_mp->b_wptr += sizeof (*stropt);
6185 	stropt = (struct stroptions *)stropt_mp->b_rptr;
6186 	stropt->so_flags = SO_WROFF | SO_HIWAT;
6187 	stropt->so_wroff =
6188 	    (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
6189 	stropt->so_hiwat = icmp->icmp_recv_hiwat;
6190 	putnext(RD(q), stropt_mp);
6191 
6192 	/*
6193 	 * free helper stream
6194 	 */
6195 	ip_free_helper_stream(connp);
6196 
6197 	/*
6198 	 * Collect the information needed to sync with the sonode
6199 	 */
6200 	icmp_do_capability_ack(icmp, &tca, TC1_INFO);
6201 
6202 	laddrlen = faddrlen = sizeof (sin6_t);
6203 	(void) rawip_getsockname((sock_lower_handle_t)connp,
6204 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
6205 	error = rawip_getpeername((sock_lower_handle_t)connp,
6206 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
6207 	if (error != 0)
6208 		faddrlen = 0;
6209 	opts = 0;
6210 	if (icmp->icmp_dgram_errind)
6211 		opts |= SO_DGRAM_ERRIND;
6212 	if (icmp->icmp_dontroute)
6213 		opts |= SO_DONTROUTE;
6214 
6215 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
6216 	    (struct sockaddr *)&laddr, laddrlen,
6217 	    (struct sockaddr *)&faddr, faddrlen, opts);
6218 
6219 	/*
6220 	 * Attempts to send data up during fallback will result in it being
6221 	 * queued in udp_t. Now we push up any queued packets.
6222 	 */
6223 	mutex_enter(&icmp->icmp_recv_lock);
6224 	while (icmp->icmp_fallback_queue_head != NULL) {
6225 		mblk_t	*mp;
6226 
6227 		mp = icmp->icmp_fallback_queue_head;
6228 		icmp->icmp_fallback_queue_head = mp->b_next;
6229 		mp->b_next = NULL;
6230 		mutex_exit(&icmp->icmp_recv_lock);
6231 		putnext(RD(q), mp);
6232 		mutex_enter(&icmp->icmp_recv_lock);
6233 	}
6234 	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
6235 
6236 	/*
6237 	 * No longer a streams less socket
6238 	 */
6239 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6240 	connp->conn_flags &= ~IPCL_NONSTR;
6241 	rw_exit(&icmp->icmp_rwlock);
6242 
6243 	mutex_exit(&icmp->icmp_recv_lock);
6244 
6245 	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
6246 	    icmp->icmp_fallback_queue_tail == NULL);
6247 
6248 	ASSERT(connp->conn_ref >= 1);
6249 
6250 	return (0);
6251 }
6252 
6253 /* ARGSUSED */
6254 sock_lower_handle_t
6255 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
6256     uint_t *smodep, int *errorp, int flags, cred_t *credp)
6257 {
6258 	conn_t *connp;
6259 
6260 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
6261 		*errorp = EPROTONOSUPPORT;
6262 		return (NULL);
6263 	}
6264 
6265 	connp = icmp_open(family, credp, errorp, flags);
6266 	if (connp != NULL) {
6267 		icmp_stack_t *is;
6268 
6269 		is = connp->conn_icmp->icmp_is;
6270 		connp->conn_flags |= IPCL_NONSTR;
6271 
6272 		if (connp->conn_icmp->icmp_family == AF_INET6) {
6273 			/* Build initial header template for transmit */
6274 			rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
6275 			if ((*errorp =
6276 			    icmp_build_hdrs(connp->conn_icmp)) != 0) {
6277 				rw_exit(&connp->conn_icmp->icmp_rwlock);
6278 				ipcl_conn_destroy(connp);
6279 				return (NULL);
6280 			}
6281 			rw_exit(&connp->conn_icmp->icmp_rwlock);
6282 		}
6283 
6284 		connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat;
6285 		connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
6286 
6287 		if ((*errorp = ip_create_helper_stream(connp,
6288 		    is->is_ldi_ident)) != 0) {
6289 			cmn_err(CE_CONT, "create of IP helper stream failed\n");
6290 			(void) rawip_do_close(connp);
6291 			return (NULL);
6292 		}
6293 
6294 		mutex_enter(&connp->conn_lock);
6295 		connp->conn_state_flags &= ~CONN_INCIPIENT;
6296 		mutex_exit(&connp->conn_lock);
6297 		*sock_downcalls = &sock_rawip_downcalls;
6298 		*smodep = SM_ATOMIC;
6299 	} else {
6300 		ASSERT(*errorp != 0);
6301 	}
6302 
6303 	return ((sock_lower_handle_t)connp);
6304 }
6305 
6306 /* ARGSUSED */
6307 void
6308 rawip_activate(sock_lower_handle_t proto_handle,
6309     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
6310     cred_t *cr)
6311 {
6312 	conn_t 			*connp = (conn_t *)proto_handle;
6313 	icmp_stack_t 		*is = connp->conn_icmp->icmp_is;
6314 	struct sock_proto_props sopp;
6315 
6316 	/* All Solaris components should pass a cred for this operation. */
6317 	ASSERT(cr != NULL);
6318 
6319 	connp->conn_upcalls = sock_upcalls;
6320 	connp->conn_upper_handle = sock_handle;
6321 
6322 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
6323 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
6324 	sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len +
6325 	    is->is_wroff_extra;
6326 	sopp.sopp_rxhiwat = is->is_recv_hiwat;
6327 	sopp.sopp_rxlowat = icmp_mod_info.mi_lowat;
6328 	sopp.sopp_maxblk = INFPSZ;
6329 	sopp.sopp_maxpsz = IP_MAXPACKET;
6330 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
6331 	    icmp_mod_info.mi_minpsz;
6332 
6333 	(*connp->conn_upcalls->su_set_proto_props)
6334 	    (connp->conn_upper_handle, &sopp);
6335 }
6336 
6337 static int
6338 rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6339 {
6340 	sin_t	*sin = (sin_t *)sa;
6341 	sin6_t	*sin6 = (sin6_t *)sa;
6342 
6343 	ASSERT(icmp != NULL);
6344 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6345 
6346 	switch (icmp->icmp_family) {
6347 	case AF_INET:
6348 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6349 		if (*salenp < sizeof (sin_t))
6350 			return (EINVAL);
6351 
6352 		*salenp = sizeof (sin_t);
6353 		*sin = sin_null;
6354 		sin->sin_family = AF_INET;
6355 		if (icmp->icmp_state == TS_UNBND) {
6356 			break;
6357 		}
6358 
6359 		if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
6360 		    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6361 			sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src);
6362 		} else {
6363 			/*
6364 			 * INADDR_ANY
6365 			 * icmp_v6src is not set, we might be bound to
6366 			 * broadcast/multicast. Use icmp_bound_v6src as
6367 			 * local address instead (that could
6368 			 * also still be INADDR_ANY)
6369 			 */
6370 			sin->sin_addr.s_addr =
6371 			    V4_PART_OF_V6(icmp->icmp_bound_v6src);
6372 		}
6373 		break;
6374 	case AF_INET6:
6375 
6376 		if (*salenp < sizeof (sin6_t))
6377 			return (EINVAL);
6378 
6379 		*salenp = sizeof (sin6_t);
6380 		*sin6 = sin6_null;
6381 		sin6->sin6_family = AF_INET6;
6382 		if (icmp->icmp_state == TS_UNBND) {
6383 			break;
6384 		}
6385 		if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6386 			sin6->sin6_addr = icmp->icmp_v6src;
6387 		} else {
6388 			/*
6389 			 * UNSPECIFIED
6390 			 * icmp_v6src is not set, we might be bound to
6391 			 * broadcast/multicast. Use icmp_bound_v6src as
6392 			 * local address instead (that could
6393 			 * also still be UNSPECIFIED)
6394 			 */
6395 
6396 			sin6->sin6_addr = icmp->icmp_bound_v6src;
6397 		}
6398 		break;
6399 	}
6400 	return (0);
6401 }
6402 
6403 static int
6404 rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6405 {
6406 	sin_t   *sin = (sin_t *)sa;
6407 	sin6_t  *sin6 = (sin6_t *)sa;
6408 
6409 	ASSERT(icmp != NULL);
6410 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6411 
6412 	if (icmp->icmp_state != TS_DATA_XFER)
6413 		return (ENOTCONN);
6414 
6415 	sa->sa_family = icmp->icmp_family;
6416 	switch (icmp->icmp_family) {
6417 	case AF_INET:
6418 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6419 
6420 		if (*salenp < sizeof (sin_t))
6421 			return (EINVAL);
6422 
6423 		*salenp = sizeof (sin_t);
6424 		*sin = sin_null;
6425 		sin->sin_family = AF_INET;
6426 		sin->sin_addr.s_addr =
6427 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6428 		break;
6429 	case AF_INET6:
6430 		if (*salenp < sizeof (sin6_t))
6431 			return (EINVAL);
6432 
6433 		*salenp = sizeof (sin6_t);
6434 		*sin6 = sin6_null;
6435 		*sin6 = icmp->icmp_v6dst;
6436 		break;
6437 	}
6438 	return (0);
6439 }
6440 
6441 /* ARGSUSED */
6442 int
6443 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6444     socklen_t *salenp, cred_t *cr)
6445 {
6446 	conn_t  *connp = (conn_t *)proto_handle;
6447 	icmp_t  *icmp = connp->conn_icmp;
6448 	int	error;
6449 
6450 	/* All Solaris components should pass a cred for this operation. */
6451 	ASSERT(cr != NULL);
6452 
6453 	ASSERT(icmp != NULL);
6454 
6455 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6456 
6457 	error = rawip_do_getpeername(icmp, sa, salenp);
6458 
6459 	rw_exit(&icmp->icmp_rwlock);
6460 
6461 	return (error);
6462 }
6463 
6464 /* ARGSUSED */
6465 int
6466 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6467     socklen_t *salenp, cred_t *cr)
6468 {
6469 	conn_t  *connp = (conn_t *)proto_handle;
6470 	icmp_t	*icmp = connp->conn_icmp;
6471 	int	error;
6472 
6473 	/* All Solaris components should pass a cred for this operation. */
6474 	ASSERT(cr != NULL);
6475 
6476 	ASSERT(icmp != NULL);
6477 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6478 
6479 	error = rawip_do_getsockname(icmp, sa, salenp);
6480 
6481 	rw_exit(&icmp->icmp_rwlock);
6482 
6483 	return (error);
6484 }
6485 
6486 int
6487 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6488     const void *optvalp, socklen_t optlen, cred_t *cr)
6489 {
6490 	conn_t	*connp = (conn_t *)proto_handle;
6491 	icmp_t *icmp = connp->conn_icmp;
6492 	int error;
6493 
6494 	/* All Solaris components should pass a cred for this operation. */
6495 	ASSERT(cr != NULL);
6496 
6497 	error = proto_opt_check(level, option_name, optlen, NULL,
6498 	    icmp_opt_obj.odb_opt_des_arr,
6499 	    icmp_opt_obj.odb_opt_arr_cnt,
6500 	    icmp_opt_obj.odb_topmost_tpiprovider,
6501 	    B_TRUE, B_FALSE, cr);
6502 
6503 	if (error != 0) {
6504 		/*
6505 		 * option not recognized
6506 		 */
6507 		if (error < 0) {
6508 			error = proto_tlitosyserr(-error);
6509 		}
6510 		return (error);
6511 	}
6512 
6513 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6514 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
6515 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
6516 	    (uchar_t *)optvalp, NULL, cr);
6517 	rw_exit(&icmp->icmp_rwlock);
6518 
6519 	if (error < 0) {
6520 		/*
6521 		 * Pass on to ip
6522 		 */
6523 		error = ip_set_options(connp, level, option_name, optvalp,
6524 		    optlen, cr);
6525 	}
6526 
6527 	ASSERT(error >= 0);
6528 
6529 	return (error);
6530 }
6531 
6532 int
6533 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6534     void *optvalp, socklen_t *optlen, cred_t *cr)
6535 {
6536 	int		error;
6537 	conn_t		*connp = (conn_t *)proto_handle;
6538 	icmp_t		*icmp = connp->conn_icmp;
6539 	t_uscalar_t	max_optbuf_len;
6540 	void		*optvalp_buf;
6541 	int		len;
6542 
6543 	/* All Solaris components should pass a cred for this operation. */
6544 	ASSERT(cr != NULL);
6545 
6546 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
6547 	    icmp_opt_obj.odb_opt_des_arr,
6548 	    icmp_opt_obj.odb_opt_arr_cnt,
6549 	    icmp_opt_obj.odb_topmost_tpiprovider,
6550 	    B_FALSE, B_TRUE, cr);
6551 
6552 	if (error != 0) {
6553 		if (error < 0) {
6554 			error = proto_tlitosyserr(-error);
6555 		}
6556 		return (error);
6557 	}
6558 
6559 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
6560 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6561 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
6562 	rw_exit(&icmp->icmp_rwlock);
6563 
6564 	if (len < 0) {
6565 		/*
6566 		 * Pass on to IP
6567 		 */
6568 		kmem_free(optvalp_buf, max_optbuf_len);
6569 		return (ip_get_options(connp, level, option_name, optvalp,
6570 		    optlen, cr));
6571 	} else {
6572 		/*
6573 		 * update optlen and copy option value
6574 		 */
6575 		t_uscalar_t size = MIN(len, *optlen);
6576 		bcopy(optvalp_buf, optvalp, size);
6577 		bcopy(&size, optlen, sizeof (size));
6578 
6579 		kmem_free(optvalp_buf, max_optbuf_len);
6580 		return (0);
6581 	}
6582 }
6583 
6584 /* ARGSUSED */
6585 int
6586 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
6587 {
6588 	conn_t	*connp = (conn_t *)proto_handle;
6589 
6590 	/* All Solaris components should pass a cred for this operation. */
6591 	ASSERT(cr != NULL);
6592 
6593 	(void) rawip_do_close(connp);
6594 	return (0);
6595 }
6596 
6597 /* ARGSUSED */
6598 int
6599 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
6600 {
6601 	conn_t  *connp = (conn_t *)proto_handle;
6602 
6603 	/* All Solaris components should pass a cred for this operation. */
6604 	ASSERT(cr != NULL);
6605 
6606 	/* shut down the send side */
6607 	if (how != SHUT_RD)
6608 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6609 		    SOCK_OPCTL_SHUT_SEND, 0);
6610 	/* shut down the recv side */
6611 	if (how != SHUT_WR)
6612 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6613 		    SOCK_OPCTL_SHUT_RECV, 0);
6614 	return (0);
6615 }
6616 
6617 void
6618 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
6619 {
6620 	conn_t  *connp = (conn_t *)proto_handle;
6621 	icmp_t	*icmp = connp->conn_icmp;
6622 
6623 	mutex_enter(&icmp->icmp_recv_lock);
6624 	connp->conn_flow_cntrld = B_FALSE;
6625 	mutex_exit(&icmp->icmp_recv_lock);
6626 }
6627 
6628 int
6629 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
6630     int mode, int32_t *rvalp, cred_t *cr)
6631 {
6632 	conn_t  	*connp = (conn_t *)proto_handle;
6633 	int		error;
6634 
6635 	/* All Solaris components should pass a cred for this operation. */
6636 	ASSERT(cr != NULL);
6637 
6638 	switch (cmd) {
6639 	case ND_SET:
6640 	case ND_GET:
6641 	case _SIOCSOCKFALLBACK:
6642 	case TI_GETPEERNAME:
6643 	case TI_GETMYNAME:
6644 #ifdef DEBUG
6645 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
6646 		    " socket", cmd);
6647 #endif
6648 		error = EINVAL;
6649 		break;
6650 	default:
6651 		/*
6652 		 * Pass on to IP using helper stream
6653 		 */
6654 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
6655 		    cmd, arg, mode, cr, rvalp);
6656 		break;
6657 	}
6658 	return (error);
6659 }
6660 
6661 /* ARGSUSED */
6662 int
6663 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6664     cred_t *cr)
6665 {
6666 	conn_t *connp = (conn_t *)proto_handle;
6667 	icmp_t	*icmp = connp->conn_icmp;
6668 	icmp_stack_t *is = icmp->icmp_is;
6669 	int error = 0;
6670 	boolean_t bypass_dgram_errind = B_FALSE;
6671 
6672 	ASSERT(DB_TYPE(mp) == M_DATA);
6673 
6674 	/* All Solaris components should pass a cred for this operation. */
6675 	ASSERT(cr != NULL);
6676 
6677 	/* If labeled then sockfs should have already set db_credp */
6678 	ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
6679 
6680 	/* do an implicit bind if necessary */
6681 	if (icmp->icmp_state == TS_UNBND) {
6682 		error = rawip_implicit_bind(connp);
6683 		/*
6684 		 * We could be racing with an actual bind, in which case
6685 		 * we would see EPROTO. We cross our fingers and try
6686 		 * to connect.
6687 		 */
6688 		if (!(error == 0 || error == EPROTO)) {
6689 			freemsg(mp);
6690 			return (error);
6691 		}
6692 	}
6693 
6694 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6695 
6696 	if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
6697 		error = EISCONN;
6698 		goto done_lock;
6699 	}
6700 
6701 	switch (icmp->icmp_family) {
6702 	case AF_INET6: {
6703 		sin6_t	*sin6;
6704 		ip6_pkt_t	ipp_s;	/* For ancillary data options */
6705 		ip6_pkt_t	*ipp = &ipp_s;
6706 
6707 		sin6 = (sin6_t *)msg->msg_name;
6708 		if (sin6 != NULL) {
6709 			error = proto_verify_ip_addr(icmp->icmp_family,
6710 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6711 			if (error != 0) {
6712 				bypass_dgram_errind = B_TRUE;
6713 				goto done_lock;
6714 			}
6715 			if (icmp->icmp_delayed_error != 0) {
6716 				sin6_t  *sin1 = (sin6_t *)msg->msg_name;
6717 				sin6_t  *sin2 = (sin6_t *)
6718 				    &icmp->icmp_delayed_addr;
6719 
6720 				error = icmp->icmp_delayed_error;
6721 				icmp->icmp_delayed_error = 0;
6722 
6723 				/* Compare IP address and port */
6724 
6725 				if (sin1->sin6_port == sin2->sin6_port &&
6726 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
6727 				    &sin2->sin6_addr)) {
6728 					goto done_lock;
6729 				}
6730 			}
6731 		} else {
6732 			/*
6733 			 * Use connected address
6734 			 */
6735 			if (icmp->icmp_state != TS_DATA_XFER) {
6736 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6737 				error = EDESTADDRREQ;
6738 				bypass_dgram_errind = B_TRUE;
6739 				goto done_lock;
6740 			}
6741 			sin6 = &icmp->icmp_v6dst;
6742 		}
6743 
6744 		/* No support for mapped addresses on raw sockets */
6745 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6746 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6747 			error = EADDRNOTAVAIL;
6748 			goto done_lock;
6749 		}
6750 
6751 		ipp->ipp_fields = 0;
6752 		ipp->ipp_sticky_ignored = 0;
6753 
6754 		/*
6755 		 * If options passed in, feed it for verification and handling
6756 		 */
6757 		if (msg->msg_controllen != 0) {
6758 			error = process_auxiliary_options(connp,
6759 			    msg->msg_control, msg->msg_controllen,
6760 			    ipp, &icmp_opt_obj, icmp_opt_set, cr);
6761 			if (error != 0) {
6762 				goto done_lock;
6763 			}
6764 		}
6765 
6766 		rw_exit(&icmp->icmp_rwlock);
6767 
6768 		/*
6769 		 * Destination is a native IPv6 address.
6770 		 * Send out an IPv6 format packet.
6771 		 */
6772 
6773 		error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
6774 		    ipp);
6775 	}
6776 		break;
6777 	case AF_INET: {
6778 		sin_t	*sin;
6779 		ip4_pkt_t pktinfo;
6780 		ip4_pkt_t *pktinfop = &pktinfo;
6781 		ipaddr_t	v4dst;
6782 
6783 		sin = (sin_t *)msg->msg_name;
6784 		if (sin != NULL) {
6785 			error = proto_verify_ip_addr(icmp->icmp_family,
6786 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6787 			if (error != 0) {
6788 				bypass_dgram_errind = B_TRUE;
6789 				goto done_lock;
6790 			}
6791 			v4dst = sin->sin_addr.s_addr;
6792 			if (icmp->icmp_delayed_error != 0) {
6793 				sin_t *sin1 = (sin_t *)msg->msg_name;
6794 				sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
6795 
6796 				error = icmp->icmp_delayed_error;
6797 				icmp->icmp_delayed_error = 0;
6798 
6799 				/* Compare IP address and port */
6800 				if (sin1->sin_port == sin2->sin_port &&
6801 				    sin1->sin_addr.s_addr ==
6802 				    sin2->sin_addr.s_addr) {
6803 					goto done_lock;
6804 				}
6805 
6806 			}
6807 		} else {
6808 			/*
6809 			 * Use connected address
6810 			 */
6811 			if (icmp->icmp_state != TS_DATA_XFER) {
6812 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6813 				error = EDESTADDRREQ;
6814 				bypass_dgram_errind = B_TRUE;
6815 				goto done_lock;
6816 			}
6817 			v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6818 		}
6819 
6820 
6821 		pktinfop->ip4_ill_index = 0;
6822 		pktinfop->ip4_addr = INADDR_ANY;
6823 
6824 		/*
6825 		 * If options passed in, feed it for verification and handling
6826 		 */
6827 		if (msg->msg_controllen != 0) {
6828 			error = process_auxiliary_options(connp,
6829 			    msg->msg_control, msg->msg_controllen,
6830 			    pktinfop, &icmp_opt_obj, icmp_opt_set, cr);
6831 			if (error != 0) {
6832 				goto done_lock;
6833 			}
6834 		}
6835 		rw_exit(&icmp->icmp_rwlock);
6836 
6837 		error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
6838 		    v4dst, pktinfop);
6839 		break;
6840 	}
6841 
6842 	default:
6843 		ASSERT(0);
6844 	}
6845 
6846 	goto done;
6847 
6848 done_lock:
6849 	rw_exit(&icmp->icmp_rwlock);
6850 	if (error != 0) {
6851 		ASSERT(mp != NULL);
6852 		freemsg(mp);
6853 	}
6854 done:
6855 	if (bypass_dgram_errind)
6856 		return (error);
6857 	return (icmp->icmp_dgram_errind ? error : 0);
6858 }
6859 
6860 sock_downcalls_t sock_rawip_downcalls = {
6861 	rawip_activate,
6862 	rawip_accept,
6863 	rawip_bind,
6864 	rawip_listen,
6865 	rawip_connect,
6866 	rawip_getpeername,
6867 	rawip_getsockname,
6868 	rawip_getsockopt,
6869 	rawip_setsockopt,
6870 	rawip_send,
6871 	NULL,
6872 	NULL,
6873 	NULL,
6874 	rawip_shutdown,
6875 	rawip_clr_flowctrl,
6876 	rawip_ioctl,
6877 	rawip_close
6878 };
6879