xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_rts.c (revision a9c12afd)
1 /*
2  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1988, 1991, 1993
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
39  */
40 
41 /*
 * This file contains routines that process routing socket requests.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/stream.h>
47 #include <sys/stropts.h>
48 #include <sys/ddi.h>
49 #include <sys/strsubr.h>
50 #include <sys/cmn_err.h>
51 #include <sys/debug.h>
52 #include <sys/policy.h>
53 #include <sys/zone.h>
54 
55 #include <sys/systm.h>
56 #include <sys/param.h>
57 #include <sys/socket.h>
58 #include <sys/strsun.h>
59 #include <net/if.h>
60 #include <net/route.h>
61 #include <netinet/in.h>
62 #include <net/if_dl.h>
63 #include <netinet/ip6.h>
64 
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/ip_if.h>
69 #include <inet/ip_ire.h>
70 #include <inet/ip_ftable.h>
71 #include <inet/ip_rts.h>
72 
73 #include <inet/ipclassifier.h>
74 
75 #include <sys/tsol/tndb.h>
76 #include <sys/tsol/tnet.h>
77 
/*
 * Size needed for a routing socket message of the given `type': the sum of
 * what rts_data_msg_size() reports for the address payload selected by
 * `rtm_addrs', `af' and `sacnt', and what rts_header_msg_size() reports for
 * the header of that message type.
 */
#define	RTS_MSG_SIZE(type, rtm_addrs, af, sacnt) \
	(rts_data_msg_size(rtm_addrs, af, sacnt) + rts_header_msg_size(type))

/* Prototypes for helpers private to this file. */
static size_t	rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp);
static void	rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst,
    ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr,
    ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
    const tsol_gc_t *);
static int	rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp,
    in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp,
    in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp,
    sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error);
static void	rts_getifdata(if_data_t *if_data, const ipif_t *ipif);
static int	rts_getmetrics(ire_t *ire, ill_t *ill, rt_metrics_t *metrics);
static mblk_t	*rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire,
    const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af);
static void	rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
/* Route lookup helpers used by ip_rts_request_common() for RTM_GET/CHANGE. */
static ire_t	*ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask,
    ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid,
    const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire,
    ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp);
static ire_t	*ire_lookup_v6(const in6_addr_t *dst_addr_v6,
    const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
    const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
    ip_stack_t *ipst, ire_t **pifire,
    in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp);
104 
105 /*
106  * Send `mp' to all eligible routing queues.  A queue is ineligible if:
107  *
108  *  1. SO_USELOOPBACK is off and it is not the originating queue.
109  *  2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'.
110  *  3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'.
111  *  4. It is not the same address family as `af', and `af' isn't AF_UNSPEC.
112  */
113 void
114 rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
115     ip_stack_t *ipst)
116 {
117 	mblk_t	*mp1;
118 	conn_t 	*connp, *next_connp;
119 
120 	/*
121 	 * Since we don't have an ill_t here, RTSQ_DEFAULT must already be
122 	 * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point.
123 	 */
124 	ASSERT(!(flags & RTSQ_DEFAULT));
125 
126 	mutex_enter(&ipst->ips_rts_clients->connf_lock);
127 	connp = ipst->ips_rts_clients->connf_head;
128 
129 	for (; connp != NULL; connp = next_connp) {
130 		next_connp = connp->conn_next;
131 		/*
132 		 * If there was a family specified when this routing socket was
133 		 * created and it doesn't match the family of the message to
134 		 * copy, then continue.
135 		 */
136 		if ((connp->conn_proto != AF_UNSPEC) &&
137 		    (connp->conn_proto != af))
138 			continue;
139 
140 		/*
141 		 * Queue the message only if the conn_t and flags match.
142 		 */
143 		if (connp->conn_rtaware & RTAW_UNDER_IPMP) {
144 			if (!(flags & RTSQ_UNDER_IPMP))
145 				continue;
146 		} else {
147 			if (!(flags & RTSQ_NORMAL))
148 				continue;
149 		}
150 		/*
151 		 * For the originating queue, we only copy the message upstream
152 		 * if loopback is set.  For others reading on the routing
153 		 * socket, we check if there is room upstream for a copy of the
154 		 * message.
155 		 */
156 		if ((o_connp == connp) && connp->conn_useloopback == 0) {
157 			connp = connp->conn_next;
158 			continue;
159 		}
160 		CONN_INC_REF(connp);
161 		mutex_exit(&ipst->ips_rts_clients->connf_lock);
162 		/* Pass to rts_input */
163 		if (IPCL_IS_NONSTR(connp) ? !connp->conn_flow_cntrld :
164 		    canputnext(connp->conn_rq)) {
165 			mp1 = dupmsg(mp);
166 			if (mp1 == NULL)
167 				mp1 = copymsg(mp);
168 			/* Note that we pass a NULL ira to rts_input */
169 			if (mp1 != NULL)
170 				(connp->conn_recv)(connp, mp1, NULL, NULL);
171 		}
172 
173 		mutex_enter(&ipst->ips_rts_clients->connf_lock);
174 		/* reload next_connp since conn_next may have changed */
175 		next_connp = connp->conn_next;
176 		CONN_DEC_REF(connp);
177 	}
178 	mutex_exit(&ipst->ips_rts_clients->connf_lock);
179 	freemsg(mp);
180 }
181 
182 /*
183  * Takes an ire and sends an ack to all the routing sockets. This
184  * routine is used
185  * - when a route is created/deleted through the ioctl interface.
186  * - when a stale redirect is deleted
187  */
188 void
189 ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
190 {
191 	mblk_t		*mp;
192 	rt_msghdr_t	*rtm;
193 	int		rtm_addrs = (RTA_DST | RTA_NETMASK | RTA_GATEWAY);
194 	sa_family_t	af;
195 	in6_addr_t	gw_addr_v6;
196 
197 	if (ire == NULL)
198 		return;
199 	ASSERT(ire->ire_ipversion == IPV4_VERSION ||
200 	    ire->ire_ipversion == IPV6_VERSION);
201 
202 	ASSERT(!(ire->ire_type & IRE_IF_CLONE));
203 
204 	if (ire->ire_flags & RTF_SETSRC)
205 		rtm_addrs |= RTA_SRC;
206 
207 	switch (ire->ire_ipversion) {
208 	case IPV4_VERSION:
209 		af = AF_INET;
210 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
211 		if (mp == NULL)
212 			return;
213 		rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask,
214 		    ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL,
215 		    mp, NULL);
216 		break;
217 	case IPV6_VERSION:
218 		af = AF_INET6;
219 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
220 		if (mp == NULL)
221 			return;
222 		mutex_enter(&ire->ire_lock);
223 		gw_addr_v6 = ire->ire_gateway_addr_v6;
224 		mutex_exit(&ire->ire_lock);
225 		rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6,
226 		    &ire->ire_mask_v6, &gw_addr_v6,
227 		    &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
228 		    &ipv6_all_zeros, NULL, mp, NULL);
229 		break;
230 	}
231 	rtm = (rt_msghdr_t *)mp->b_rptr;
232 	mp->b_wptr = (uchar_t *)&mp->b_rptr[rtm->rtm_msglen];
233 	rtm->rtm_addrs = rtm_addrs;
234 	rtm->rtm_flags = ire->ire_flags;
235 	if (error != 0)
236 		rtm->rtm_errno = error;
237 	else
238 		rtm->rtm_flags |= RTF_DONE;
239 	rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst);
240 }
241 
242 /*
243  * This is a call from the RTS module
244  * indicating that this is a Routing Socket
245  * Stream. Insert this conn_t in routing
246  * socket client list.
247  */
248 void
249 ip_rts_register(conn_t *connp)
250 {
251 	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
252 
253 	connp->conn_useloopback = 1;
254 	ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
255 }
256 
257 /*
258  * This is a call from the RTS module indicating that it is closing.
259  */
260 void
261 ip_rts_unregister(conn_t *connp)
262 {
263 	ipcl_hash_remove(connp);
264 }
265 
266 /*
267  * Processes requests received on a routing socket. It extracts all the
268  * arguments and calls the appropriate function to process the request.
269  *
270  * RTA_SRC bit flag requests are sent by 'route -setsrc'.
271  *
272  * In general, this function does not consume the message supplied but rather
273  * sends the message upstream with an appropriate UNIX errno.
274  */
275 int
276 ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
277 {
278 	rt_msghdr_t	*rtm = NULL;
279 	in6_addr_t	dst_addr_v6;
280 	in6_addr_t	src_addr_v6;
281 	in6_addr_t	gw_addr_v6;
282 	in6_addr_t	net_mask_v6;
283 	in6_addr_t	author_v6;
284 	in6_addr_t	if_addr_v6;
285 	mblk_t		*mp1;
286 	ire_t		*ire = NULL;
287 	ire_t		*ifire = NULL;
288 	ipaddr_t	v4setsrc;
289 	in6_addr_t	v6setsrc = ipv6_all_zeros;
290 	tsol_ire_gw_secattr_t *gwattr = NULL;
291 	int		error = 0;
292 	int		match_flags = MATCH_IRE_DSTONLY;
293 	int		match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW;
294 	int		found_addrs;
295 	sa_family_t	af;
296 	ipaddr_t	dst_addr;
297 	ipaddr_t	gw_addr;
298 	ipaddr_t	src_addr;
299 	ipaddr_t	net_mask;
300 	ushort_t	index;
301 	boolean_t	gcgrp_xtraref = B_FALSE;
302 	tsol_gcgrp_addr_t ga;
303 	tsol_rtsecattr_t rtsecattr;
304 	struct rtsa_s	*rtsap = NULL;
305 	tsol_gcgrp_t	*gcgrp = NULL;
306 	tsol_gc_t	*gc = NULL;
307 	ts_label_t	*tsl = NULL;
308 	zoneid_t	zoneid;
309 	ip_stack_t	*ipst;
310 	ill_t   	*ill = NULL;
311 
312 	zoneid = connp->conn_zoneid;
313 	ipst = connp->conn_netstack->netstack_ip;
314 
315 	if (mp->b_cont != NULL && !pullupmsg(mp, -1)) {
316 		freemsg(mp);
317 		error =  EINVAL;
318 		goto done;
319 	}
320 	if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
321 		freemsg(mp);
322 		error = EINVAL;
323 		goto done;
324 	}
325 
326 	/*
327 	 * Check the routing message for basic consistency including the
328 	 * version number and that the number of octets written is the same
329 	 * as specified by the rtm_msglen field.
330 	 *
331 	 * At this point, an error can be delivered back via rtm_errno.
332 	 */
333 	rtm = (rt_msghdr_t *)mp->b_rptr;
334 	if ((mp->b_wptr - mp->b_rptr) != rtm->rtm_msglen) {
335 		error = EINVAL;
336 		goto done;
337 	}
338 	if (rtm->rtm_version != RTM_VERSION) {
339 		error = EPROTONOSUPPORT;
340 		goto done;
341 	}
342 
343 	/* Only allow RTM_GET or RTM_RESOLVE for unprivileged process */
344 	if (rtm->rtm_type != RTM_GET &&
345 	    rtm->rtm_type != RTM_RESOLVE &&
346 	    (ioc_cr == NULL ||
347 	    secpolicy_ip_config(ioc_cr, B_FALSE) != 0)) {
348 		error = EPERM;
349 		goto done;
350 	}
351 
352 	found_addrs = rts_getaddrs(rtm, &dst_addr_v6, &gw_addr_v6, &net_mask_v6,
353 	    &author_v6, &if_addr_v6, &src_addr_v6, &index, &af, &rtsecattr,
354 	    &error);
355 
356 	if (error != 0)
357 		goto done;
358 
359 	if ((found_addrs & RTA_DST) == 0) {
360 		error = EINVAL;
361 		goto done;
362 	}
363 
364 	/*
365 	 * Based on the address family of the destination address, determine
366 	 * the destination, gateway and netmask and return the appropriate error
367 	 * if an unknown address family was specified (following the errno
368 	 * values that 4.4BSD-Lite2 returns.)
369 	 */
370 	switch (af) {
371 	case AF_INET:
372 		IN6_V4MAPPED_TO_IPADDR(&dst_addr_v6, dst_addr);
373 		IN6_V4MAPPED_TO_IPADDR(&src_addr_v6, src_addr);
374 		IN6_V4MAPPED_TO_IPADDR(&gw_addr_v6, gw_addr);
375 		if (((found_addrs & RTA_NETMASK) == 0) ||
376 		    (rtm->rtm_flags & RTF_HOST))
377 			net_mask = IP_HOST_MASK;
378 		else
379 			IN6_V4MAPPED_TO_IPADDR(&net_mask_v6, net_mask);
380 		break;
381 	case AF_INET6:
382 		if (((found_addrs & RTA_NETMASK) == 0) ||
383 		    (rtm->rtm_flags & RTF_HOST))
384 			net_mask_v6 = ipv6_all_ones;
385 		break;
386 	default:
387 		/*
388 		 * These errno values are meant to be compatible with
389 		 * 4.4BSD-Lite2 for the given message types.
390 		 */
391 		switch (rtm->rtm_type) {
392 		case RTM_ADD:
393 		case RTM_DELETE:
394 			error = ESRCH;
395 			goto done;
396 		case RTM_GET:
397 		case RTM_CHANGE:
398 			error = EAFNOSUPPORT;
399 			goto done;
400 		default:
401 			error = EOPNOTSUPP;
402 			goto done;
403 		}
404 	}
405 
406 	/*
407 	 * At this point, the address family must be something known.
408 	 */
409 	ASSERT(af == AF_INET || af == AF_INET6);
410 
411 	/* Handle RTA_IFP */
412 	if (index != 0) {
413 		ipif_t		*ipif;
414 lookup:
415 		ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst);
416 		if (ill == NULL) {
417 			error = EINVAL;
418 			goto done;
419 		}
420 
421 		/*
422 		 * Since all interfaces in an IPMP group must be equivalent,
423 		 * we prevent changes to a specific underlying interface's
424 		 * routing configuration.  However, for backward compatibility,
425 		 * we intepret a request to add a route on an underlying
426 		 * interface as a request to add a route on its IPMP interface.
427 		 */
428 		if (IS_UNDER_IPMP(ill)) {
429 			switch (rtm->rtm_type) {
430 			case RTM_CHANGE:
431 			case RTM_DELETE:
432 				error = EINVAL;
433 				goto done;
434 			case RTM_ADD:
435 				index = ipmp_ill_get_ipmp_ifindex(ill);
436 				ill_refrele(ill);
437 				if (index == 0) {
438 					ill = NULL; /* already refrele'd */
439 					error = EINVAL;
440 					goto done;
441 				}
442 				goto lookup;
443 			}
444 		}
445 
446 		match_flags |= MATCH_IRE_ILL;
447 		/*
448 		 * This provides the same zoneid as in Solaris 10
449 		 * that -ifp picks the zoneid from the first ipif on the ill.
450 		 * But it might not be useful since the first ipif will always
451 		 * have the same zoneid as the ill.
452 		 */
453 		ipif = ipif_get_next_ipif(NULL, ill);
454 		if (ipif != NULL) {
455 			zoneid = ipif->ipif_zoneid;
456 			ipif_refrele(ipif);
457 		}
458 	}
459 
460 	/*
461 	 * If a netmask was supplied in the message, then subsequent route
462 	 * lookups will attempt to match on the netmask as well.
463 	 */
464 	if ((found_addrs & RTA_NETMASK) != 0)
465 		match_flags |= MATCH_IRE_MASK;
466 
467 	/*
468 	 * We only process any passed-in route security attributes for
469 	 * either RTM_ADD or RTM_CHANGE message; We overload them
470 	 * to do an RTM_GET as a different label; ignore otherwise.
471 	 */
472 	if (rtm->rtm_type == RTM_ADD || rtm->rtm_type == RTM_CHANGE ||
473 	    rtm->rtm_type == RTM_GET) {
474 		ASSERT(rtsecattr.rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
475 		if (rtsecattr.rtsa_cnt > 0)
476 			rtsap = &rtsecattr.rtsa_attr[0];
477 	}
478 
479 	switch (rtm->rtm_type) {
480 	case RTM_ADD:
481 		/* if we are adding a route, gateway is a must */
482 		if ((found_addrs & RTA_GATEWAY) == 0) {
483 			error = EINVAL;
484 			goto done;
485 		}
486 
487 		/* Multirouting does not support net routes. */
488 		if ((rtm->rtm_flags & (RTF_MULTIRT | RTF_HOST)) ==
489 		    RTF_MULTIRT) {
490 			error = EADDRNOTAVAIL;
491 			goto done;
492 		}
493 
494 		/*
495 		 * Multirouting and user-specified source addresses
496 		 * do not support interface based routing.
497 		 * Assigning a source address to an interface based
498 		 * route is achievable by plumbing a new ipif and
499 		 * setting up the interface route via this ipif,
500 		 * though.
501 		 */
502 		if (rtm->rtm_flags & (RTF_MULTIRT | RTF_SETSRC)) {
503 			if ((rtm->rtm_flags & RTF_GATEWAY) == 0) {
504 				error = EADDRNOTAVAIL;
505 				goto done;
506 			}
507 		}
508 
509 		switch (af) {
510 		case AF_INET:
511 			if (src_addr != INADDR_ANY) {
512 				uint_t type;
513 
514 				/*
515 				 * The RTF_SETSRC flag is present, check that
516 				 * the supplied src address is not the loopback
517 				 * address. This would produce martian packets.
518 				 */
519 				if (src_addr == htonl(INADDR_LOOPBACK)) {
520 					error = EINVAL;
521 					goto done;
522 				}
523 				/*
524 				 * Also check that the supplied address is a
525 				 * valid, local one. Only allow IFF_UP ones
526 				 */
527 				type = ip_type_v4(src_addr, ipst);
528 				if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
529 					error = EADDRNOTAVAIL;
530 					goto done;
531 				}
532 			} else {
533 				/*
534 				 * The RTF_SETSRC modifier must be associated
535 				 * to a non-null source address.
536 				 */
537 				if (rtm->rtm_flags & RTF_SETSRC) {
538 					error = EINVAL;
539 					goto done;
540 				}
541 			}
542 
543 			error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr,
544 			    rtm->rtm_flags, ill, &ire, B_FALSE,
545 			    rtsap, ipst, zoneid);
546 			if (ill != NULL)
547 				ASSERT(!MUTEX_HELD(&ill->ill_lock));
548 			break;
549 		case AF_INET6:
550 			if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) {
551 				uint_t type;
552 
553 				/*
554 				 * The RTF_SETSRC flag is present, check that
555 				 * the supplied src address is not the loopback
556 				 * address. This would produce martian packets.
557 				 */
558 				if (IN6_IS_ADDR_LOOPBACK(&src_addr_v6)) {
559 					error = EINVAL;
560 					goto done;
561 				}
562 				/*
563 				 * Also check that the supplied address is a
564 				 * valid, local one. Only allow UP ones.
565 				 */
566 				type = ip_type_v6(&src_addr_v6, ipst);
567 				if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
568 					error = EADDRNOTAVAIL;
569 					goto done;
570 				}
571 
572 				error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
573 				    &gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
574 				    ill, &ire, rtsap, ipst, zoneid);
575 				break;
576 			}
577 			/*
578 			 * The RTF_SETSRC modifier must be associated
579 			 * to a non-null source address.
580 			 */
581 			if (rtm->rtm_flags & RTF_SETSRC) {
582 				error = EINVAL;
583 				goto done;
584 			}
585 			error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
586 			    &gw_addr_v6, NULL, rtm->rtm_flags,
587 			    ill, &ire, rtsap, ipst, zoneid);
588 			if (ill != NULL)
589 				ASSERT(!MUTEX_HELD(&ill->ill_lock));
590 			break;
591 		}
592 		if (error != 0)
593 			goto done;
594 		ASSERT(ire != NULL);
595 		rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
596 		break;
597 	case RTM_DELETE:
598 		/* if we are deleting a route, gateway is a must */
599 		if ((found_addrs & RTA_GATEWAY) == 0) {
600 			error = EINVAL;
601 			goto done;
602 		}
603 		/*
604 		 * The RTF_SETSRC modifier does not make sense
605 		 * when deleting a route.
606 		 */
607 		if (rtm->rtm_flags & RTF_SETSRC) {
608 			error = EINVAL;
609 			goto done;
610 		}
611 
612 		switch (af) {
613 		case AF_INET:
614 			error = ip_rt_delete(dst_addr, net_mask, gw_addr,
615 			    found_addrs, rtm->rtm_flags, ill, B_FALSE,
616 			    ipst, zoneid);
617 			break;
618 		case AF_INET6:
619 			error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
620 			    &gw_addr_v6, found_addrs, rtm->rtm_flags, ill,
621 			    ipst, zoneid);
622 			break;
623 		}
624 		break;
625 	case RTM_GET:
626 	case RTM_CHANGE:
627 		/*
628 		 * In the case of RTM_GET, the forwarding table should be
629 		 * searched recursively.  Also, if a gateway was
630 		 * specified then the gateway address must also be matched.
631 		 *
632 		 * In the case of RTM_CHANGE, the gateway address (if supplied)
633 		 * is the new gateway address so matching on the gateway address
634 		 * is not done.  This can lead to ambiguity when looking up the
635 		 * route to change as usually only the destination (and netmask,
636 		 * if supplied) is used for the lookup.  However if a RTA_IFP
637 		 * sockaddr is also supplied, it can disambiguate which route to
638 		 * change provided the ambigous routes are tied to distinct
639 		 * ill's (or interface indices).  If the routes are not tied to
640 		 * any particular interfaces (for example, with traditional
641 		 * gateway routes), then a RTA_IFP sockaddr will be of no use as
642 		 * it won't match any such routes.
643 		 * RTA_SRC is not supported for RTM_GET and RTM_CHANGE,
644 		 * except when RTM_CHANGE is combined to RTF_SETSRC.
645 		 */
646 		if (((found_addrs & RTA_SRC) != 0) &&
647 		    ((rtm->rtm_type == RTM_GET) ||
648 		    !(rtm->rtm_flags & RTF_SETSRC))) {
649 			error = EOPNOTSUPP;
650 			goto done;
651 		}
652 
653 		if (rtm->rtm_type == RTM_GET) {
654 			match_flags |= MATCH_IRE_SECATTR;
655 			match_flags_local |= MATCH_IRE_SECATTR;
656 			if ((found_addrs & RTA_GATEWAY) != 0)
657 				match_flags |= MATCH_IRE_GW;
658 			if (ioc_cr)
659 				tsl = crgetlabel(ioc_cr);
660 			if (rtsap != NULL) {
661 				if (rtsa_validate(rtsap) != 0) {
662 					error = EINVAL;
663 					goto done;
664 				}
665 				if (tsl != NULL &&
666 				    crgetzoneid(ioc_cr) != GLOBAL_ZONEID &&
667 				    (tsl->tsl_doi != rtsap->rtsa_doi ||
668 				    !bldominates(&tsl->tsl_label,
669 				    &rtsap->rtsa_slrange.lower_bound))) {
670 					error = EPERM;
671 					goto done;
672 				}
673 				tsl = labelalloc(
674 				    &rtsap->rtsa_slrange.lower_bound,
675 				    rtsap->rtsa_doi, KM_NOSLEEP);
676 			}
677 		}
678 		if (rtm->rtm_type == RTM_CHANGE) {
679 			if ((found_addrs & RTA_GATEWAY) &&
680 			    (rtm->rtm_flags & RTF_SETSRC)) {
681 				/*
682 				 * Do not want to change the gateway,
683 				 * but rather the source address.
684 				 */
685 				match_flags |= MATCH_IRE_GW;
686 			}
687 		}
688 
689 		/*
690 		 * If the netmask is all ones (either as supplied or as derived
691 		 * above), then first check for an IRE_LOOPBACK or
692 		 * IRE_LOCAL entry.
693 		 *
694 		 * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL
695 		 * entry, then look for any other type of IRE.
696 		 */
697 		switch (af) {
698 		case AF_INET:
699 			if (net_mask == IP_HOST_MASK) {
700 				ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr,
701 				    IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid,
702 				    tsl, match_flags_local, 0, ipst, NULL);
703 			}
704 			if (ire == NULL) {
705 				ire = ire_lookup_v4(dst_addr, net_mask,
706 				    gw_addr, ill, zoneid, tsl, match_flags,
707 				    ipst, &ifire, &v4setsrc, &gwattr);
708 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc);
709 			}
710 			break;
711 		case AF_INET6:
712 			if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) {
713 				ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL,
714 				    &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL,
715 				    zoneid, tsl, match_flags_local, 0, ipst,
716 				    NULL);
717 			}
718 			if (ire == NULL) {
719 				ire = ire_lookup_v6(&dst_addr_v6,
720 				    &net_mask_v6, &gw_addr_v6, ill, zoneid,
721 				    tsl, match_flags, ipst, &ifire, &v6setsrc,
722 				    &gwattr);
723 			}
724 			break;
725 		}
726 		if (tsl != NULL && tsl != crgetlabel(ioc_cr))
727 			label_rele(tsl);
728 
729 		if (ire == NULL) {
730 			error = ESRCH;
731 			goto done;
732 		}
733 		/*
734 		 * Want to return failure if we get an IRE_NOROUTE from
735 		 * ire_route_recursive
736 		 */
737 		if (ire->ire_type & IRE_NOROUTE) {
738 			ire_refrele(ire);
739 			ire = NULL;
740 			error = ESRCH;
741 			goto done;
742 		}
743 
744 		/* we know the IRE before we come here */
745 		switch (rtm->rtm_type) {
746 		case RTM_GET:
747 			mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af);
748 			if (mp1 == NULL) {
749 				error = ENOBUFS;
750 				goto done;
751 			}
752 			freemsg(mp);
753 			mp = mp1;
754 			rtm = (rt_msghdr_t *)mp->b_rptr;
755 			break;
756 		case RTM_CHANGE:
757 			/*
758 			 * Do not allow to the multirouting state of a route
759 			 * to be changed. This aims to prevent undesirable
760 			 * stages where both multirt and non-multirt routes
761 			 * for the same destination are declared.
762 			 */
763 			if ((ire->ire_flags & RTF_MULTIRT) !=
764 			    (rtm->rtm_flags & RTF_MULTIRT)) {
765 				error = EINVAL;
766 				goto done;
767 			}
768 			/*
769 			 * Note that we do not need to do
770 			 * ire_flush_cache_*(IRE_FLUSH_ADD) as a change
771 			 * in metrics or gateway will not affect existing
772 			 * routes since it does not create a more specific
773 			 * route.
774 			 */
775 			switch (af) {
776 			case AF_INET:
777 				if ((found_addrs & RTA_GATEWAY) != 0 &&
778 				    (ire->ire_gateway_addr != gw_addr)) {
779 					ire->ire_gateway_addr = gw_addr;
780 				}
781 
782 				if (rtsap != NULL) {
783 					ga.ga_af = AF_INET;
784 					IN6_IPADDR_TO_V4MAPPED(
785 					    ire->ire_gateway_addr, &ga.ga_addr);
786 
787 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
788 					if (gcgrp == NULL) {
789 						error = ENOMEM;
790 						goto done;
791 					}
792 				}
793 
794 				if ((found_addrs & RTA_SRC) != 0 &&
795 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
796 				    (ire->ire_setsrc_addr != src_addr)) {
797 					if (src_addr != INADDR_ANY) {
798 						uint_t type;
799 
800 						/*
801 						 * The RTF_SETSRC flag is
802 						 * present, check that the
803 						 * supplied src address is not
804 						 * the loopback address. This
805 						 * would produce martian
806 						 * packets.
807 						 */
808 						if (src_addr ==
809 						    htonl(INADDR_LOOPBACK)) {
810 							error = EINVAL;
811 							goto done;
812 						}
813 						/*
814 						 * Also check that the
815 						 * supplied addr is a valid
816 						 * local address.
817 						 */
818 						type = ip_type_v4(src_addr,
819 						    ipst);
820 						if (!(type &
821 						    (IRE_LOCAL|IRE_LOOPBACK))) {
822 							error = EADDRNOTAVAIL;
823 							goto done;
824 						}
825 						ire->ire_flags |= RTF_SETSRC;
826 						ire->ire_setsrc_addr =
827 						    src_addr;
828 					} else {
829 						ire->ire_flags &= ~RTF_SETSRC;
830 						ire->ire_setsrc_addr =
831 						    INADDR_ANY;
832 					}
833 					/*
834 					 * Let conn_ixa caching know that
835 					 * source address selection changed
836 					 */
837 					ip_update_source_selection(ipst);
838 				}
839 				ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE);
840 				break;
841 			case AF_INET6:
842 				mutex_enter(&ire->ire_lock);
843 				if ((found_addrs & RTA_GATEWAY) != 0 &&
844 				    !IN6_ARE_ADDR_EQUAL(
845 				    &ire->ire_gateway_addr_v6, &gw_addr_v6)) {
846 					ire->ire_gateway_addr_v6 = gw_addr_v6;
847 				}
848 				mutex_exit(&ire->ire_lock);
849 
850 				if (rtsap != NULL) {
851 					ga.ga_af = AF_INET6;
852 					mutex_enter(&ire->ire_lock);
853 					ga.ga_addr = ire->ire_gateway_addr_v6;
854 					mutex_exit(&ire->ire_lock);
855 
856 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
857 					if (gcgrp == NULL) {
858 						error = ENOMEM;
859 						goto done;
860 					}
861 				}
862 
863 				if ((found_addrs & RTA_SRC) != 0 &&
864 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
865 				    !IN6_ARE_ADDR_EQUAL(
866 				    &ire->ire_setsrc_addr_v6, &src_addr_v6)) {
867 					if (!IN6_IS_ADDR_UNSPECIFIED(
868 					    &src_addr_v6)) {
869 						uint_t type;
870 
871 						/*
872 						 * The RTF_SETSRC flag is
873 						 * present, check that the
874 						 * supplied src address is not
875 						 * the loopback address. This
876 						 * would produce martian
877 						 * packets.
878 						 */
879 						if (IN6_IS_ADDR_LOOPBACK(
880 						    &src_addr_v6)) {
881 							error = EINVAL;
882 							goto done;
883 						}
884 						/*
885 						 * Also check that the
886 						 * supplied addr is a valid
887 						 * local address.
888 						 */
889 						type = ip_type_v6(&src_addr_v6,
890 						    ipst);
891 						if (!(type &
892 						    (IRE_LOCAL|IRE_LOOPBACK))) {
893 							error = EADDRNOTAVAIL;
894 							goto done;
895 						}
896 						mutex_enter(&ire->ire_lock);
897 						ire->ire_flags |= RTF_SETSRC;
898 						ire->ire_setsrc_addr_v6 =
899 						    src_addr_v6;
900 						mutex_exit(&ire->ire_lock);
901 					} else {
902 						mutex_enter(&ire->ire_lock);
903 						ire->ire_flags &= ~RTF_SETSRC;
904 						ire->ire_setsrc_addr_v6 =
905 						    ipv6_all_zeros;
906 						mutex_exit(&ire->ire_lock);
907 					}
908 					/*
909 					 * Let conn_ixa caching know that
910 					 * source address selection changed
911 					 */
912 					ip_update_source_selection(ipst);
913 				}
914 				ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE);
915 				break;
916 			}
917 
918 			if (rtsap != NULL) {
919 				ASSERT(gcgrp != NULL);
920 
921 				/*
922 				 * Create and add the security attribute to
923 				 * prefix IRE; it will add a reference to the
924 				 * group upon allocating a new entry.  If it
925 				 * finds an already-existing entry for the
926 				 * security attribute, it simply returns it
927 				 * and no new group reference is made.
928 				 */
929 				gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref);
930 				if (gc == NULL ||
931 				    (error = tsol_ire_init_gwattr(ire,
932 				    ire->ire_ipversion, gc)) != 0) {
933 					if (gc != NULL) {
934 						GC_REFRELE(gc);
935 					} else {
936 						/* gc_create failed */
937 						error = ENOMEM;
938 					}
939 					goto done;
940 				}
941 			}
942 			rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
943 			break;
944 		}
945 		break;
946 	default:
947 		error = EOPNOTSUPP;
948 		break;
949 	}
950 done:
951 	if (ire != NULL)
952 		ire_refrele(ire);
953 	if (ifire != NULL)
954 		ire_refrele(ifire);
955 	if (ill != NULL)
956 		ill_refrele(ill);
957 
958 	if (gcgrp_xtraref)
959 		GCGRP_REFRELE(gcgrp);
960 
961 	if (rtm != NULL) {
962 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
963 		if (error != 0) {
964 			rtm->rtm_errno = error;
965 			/* Send error ACK */
966 			ip1dbg(("ip_rts_request: error %d\n", error));
967 		} else {
968 			rtm->rtm_flags |= RTF_DONE;
969 			/* OK ACK already set up by caller except this */
970 			ip2dbg(("ip_rts_request: OK ACK\n"));
971 		}
972 		rts_queue_input(mp, connp, af, RTSQ_ALL, ipst);
973 	}
974 	return (error);
975 }
976 
977 /*
978  * Helper function that can do recursive lookups including when
979  * MATCH_IRE_GW and/or MATCH_IRE_MASK is set.
980  */
981 static ire_t *
982 ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr,
983     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
984     int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp,
985     tsol_ire_gw_secattr_t **gwattrp)
986 {
987 	ire_t		*ire;
988 	ire_t		*ifire = NULL;
989 	uint_t		ire_type;
990 
991 	*pifire = NULL;
992 	*v4setsrcp = INADDR_ANY;
993 	*gwattrp = NULL;
994 
995 	/* Skip IRE_IF_CLONE */
996 	match_flags |= MATCH_IRE_TYPE;
997 	ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
998 
999 	/*
1000 	 * ire_route_recursive can't match gateway or mask thus if they are
1001 	 * set we have to do two steps of lookups
1002 	 */
1003 	if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
1004 		ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr,
1005 		    ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL);
1006 
1007 		if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
1008 			return (ire);
1009 
1010 		if (ire->ire_type & IRE_ONLINK)
1011 			return (ire);
1012 
1013 		if (ire->ire_flags & RTF_SETSRC) {
1014 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1015 			*v4setsrcp = ire->ire_setsrc_addr;
1016 			v4setsrcp = NULL;
1017 		}
1018 
1019 		/* The first ire_gw_secattr is passed back */
1020 		if (ire->ire_gw_secattr != NULL) {
1021 			*gwattrp = ire->ire_gw_secattr;
1022 			gwattrp = NULL;
1023 		}
1024 
1025 		/* Look for an interface ire recursively based on the gateway */
1026 		dst_addr = ire->ire_gateway_addr;
1027 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
1028 		ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
1029 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v4setsrcp,
1030 		    gwattrp, NULL);
1031 		/*
1032 		 * Don't allow anything unusual past the first
1033 		 * iteration. Clearing ifire means caller will not see a
1034 		 * complete response - there will be no RTA_IFP returned.
1035 		 */
1036 		if ((ifire->ire_type &
1037 		    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
1038 		    ire_pref(ifire) <= ire_pref(ire)) {
1039 			ire_refrele(ifire);
1040 			ifire = NULL;
1041 		}
1042 	} else {
1043 		ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
1044 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v4setsrcp,
1045 		    gwattrp, NULL);
1046 	}
1047 	*pifire = ifire;
1048 	return (ire);
1049 }
1050 
1051 static ire_t *
1052 ire_lookup_v6(const in6_addr_t *dst_addr_v6,
1053     const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
1054     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
1055     ip_stack_t *ipst, ire_t **pifire,
1056     in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp)
1057 {
1058 	ire_t		*ire;
1059 	ire_t		*ifire = NULL;
1060 	uint_t		ire_type;
1061 
1062 	*pifire = NULL;
1063 	*v6setsrcp = ipv6_all_zeros;
1064 	*gwattrp = NULL;
1065 
1066 	/* Skip IRE_IF_CLONE */
1067 	match_flags |= MATCH_IRE_TYPE;
1068 	ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
1069 
1070 	/*
1071 	 * ire_route_recursive can't match gateway or mask thus if they are
1072 	 * set we have to do two steps of lookups
1073 	 */
1074 	if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
1075 		in6_addr_t dst;
1076 
1077 		ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6,
1078 		    gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0,
1079 		    ipst, NULL);
1080 
1081 		if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
1082 			return (ire);
1083 
1084 		if (ire->ire_type & IRE_ONLINK)
1085 			return (ire);
1086 
1087 		if (ire->ire_flags & RTF_SETSRC) {
1088 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1089 			    &ire->ire_setsrc_addr_v6));
1090 			*v6setsrcp = ire->ire_setsrc_addr_v6;
1091 			v6setsrcp = NULL;
1092 		}
1093 
1094 		/* The first ire_gw_secattr is passed back */
1095 		if (ire->ire_gw_secattr != NULL) {
1096 			*gwattrp = ire->ire_gw_secattr;
1097 			gwattrp = NULL;
1098 		}
1099 
1100 		mutex_enter(&ire->ire_lock);
1101 		dst = ire->ire_gateway_addr_v6;
1102 		mutex_exit(&ire->ire_lock);
1103 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
1104 		ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl,
1105 		    match_flags, IRR_INCOMPLETE, 0, ipst, v6setsrcp, gwattrp,
1106 		    NULL);
1107 		/*
1108 		 * Don't allow anything unusual past the first
1109 		 * iteration. Clearing ifire means caller will not see a
1110 		 * complete response - there will be no RTA_IFP returned.
1111 		 */
1112 		if ((ifire->ire_type &
1113 		    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
1114 		    ire_pref(ifire) <= ire_pref(ire)) {
1115 			ire_refrele(ifire);
1116 			ifire = NULL;
1117 		}
1118 	} else {
1119 		ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid,
1120 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v6setsrcp,
1121 		    gwattrp, NULL);
1122 	}
1123 	*pifire = ifire;
1124 	return (ire);
1125 }
1126 
1127 
1128 /*
1129  * Handle IP_IOC_RTS_REQUEST ioctls
1130  */
1131 int
1132 ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
1133 {
1134 	conn_t	*connp = Q_TO_CONN(q);
1135 	IOCP	iocp = (IOCP)mp->b_rptr;
1136 	mblk_t	*mp1, *ioc_mp = mp;
1137 	int	error = 0;
1138 	ip_stack_t	*ipst;
1139 
1140 	ipst = connp->conn_netstack->netstack_ip;
1141 
1142 	ASSERT(mp->b_cont != NULL);
1143 	/* ioc_mp holds mp */
1144 	mp = mp->b_cont;
1145 
1146 	/*
1147 	 * The Routing Socket data starts on
1148 	 * next block. If there is no next block
1149 	 * this is an indication from routing module
1150 	 * that it is a routing socket stream queue.
1151 	 * We need to support that for compatibility with SDP since
1152 	 * it has a contract private interface to use IP_IOC_RTS_REQUEST.
1153 	 * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this.
1154 	 */
1155 	if (mp->b_cont == NULL) {
1156 		/*
1157 		 * This is a message from SDP
1158 		 * indicating that this is a Routing Socket
1159 		 * Stream. Insert this conn_t in routing
1160 		 * socket client list.
1161 		 */
1162 		connp->conn_useloopback = 1;
1163 		ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
1164 		goto done;
1165 	}
1166 	mp1 = dupmsg(mp->b_cont);
1167 	if (mp1 == NULL) {
1168 		error  = ENOBUFS;
1169 		goto done;
1170 	}
1171 	mp = mp1;
1172 
1173 	error = ip_rts_request_common(mp, connp, ioc_cr);
1174 done:
1175 	iocp->ioc_error = error;
1176 	ioc_mp->b_datap->db_type = M_IOCACK;
1177 	if (iocp->ioc_error != 0)
1178 		iocp->ioc_count = 0;
1179 	/* Note that we pass a NULL ira to rts_input */
1180 	(connp->conn_recv)(connp, ioc_mp, NULL, NULL);
1181 
1182 	/* conn was refheld in ip_wput_ioctl. */
1183 	CONN_OPER_PENDING_DONE(connp);
1184 
1185 	return (error);
1186 }
1187 
1188 /*
1189  * Build a reply to the RTM_GET request contained in the given message block
1190  * using the retrieved IRE of the destination address, the parent IRE (if it
1191  * exists) and the address family.
1192  *
1193  * Returns a pointer to a message block containing the reply if successful,
1194  * otherwise NULL is returned.
1195  */
1196 static mblk_t *
1197 rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc,
1198     tsol_ire_gw_secattr_t *attrp, sa_family_t af)
1199 {
1200 	rt_msghdr_t	*rtm;
1201 	rt_msghdr_t	*new_rtm;
1202 	mblk_t		*new_mp;
1203 	int		rtm_addrs;
1204 	int		rtm_flags;
1205 	tsol_gc_t	*gc = NULL;
1206 	tsol_gcgrp_t	*gcgrp = NULL;
1207 	ill_t		*ill;
1208 	ipif_t		*ipif = NULL;
1209 	ipaddr_t	brdaddr;	/* IFF_POINTOPOINT destination */
1210 	ipaddr_t	ifaddr;
1211 	in6_addr_t	brdaddr6;	/* IFF_POINTOPOINT destination */
1212 	in6_addr_t	ifaddr6;
1213 	ipaddr_t	v4setsrc;
1214 
1215 	rtm = (rt_msghdr_t *)mp->b_rptr;
1216 
1217 	/*
1218 	 * Find the ill used to send packets. This will be NULL in case
1219 	 * of a reject or blackhole.
1220 	 */
1221 	if (ifire != NULL)
1222 		ill = ire_nexthop_ill(ifire);
1223 	else
1224 		ill = ire_nexthop_ill(ire);
1225 
1226 	if (attrp != NULL) {
1227 		mutex_enter(&attrp->igsa_lock);
1228 		if ((gc = attrp->igsa_gc) != NULL) {
1229 			gcgrp = gc->gc_grp;
1230 			ASSERT(gcgrp != NULL);
1231 			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1232 		}
1233 		mutex_exit(&attrp->igsa_lock);
1234 	}
1235 
1236 	/*
1237 	 * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK.
1238 	 *
1239 	 * The 4.4BSD-Lite2 code (net/rtsock.c) returns both
1240 	 * RTA_IFP and RTA_IFA if either is defined, and also
1241 	 * returns RTA_BRD if the appropriate interface is
1242 	 * point-to-point.
1243 	 */
1244 	rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK);
1245 	if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) {
1246 		rtm_addrs |= (RTA_IFP | RTA_IFA);
1247 		/*
1248 		 * We associate an IRE with an ILL, hence we don't exactly
1249 		 * know what might make sense for RTA_IFA and RTA_BRD. We
1250 		 * pick the first ipif on the ill.
1251 		 */
1252 		ipif = ipif_get_next_ipif(NULL, ill);
1253 		if (ipif != NULL) {
1254 			if (ipif->ipif_isv6)
1255 				ifaddr6 = ipif->ipif_v6lcl_addr;
1256 			else
1257 				ifaddr = ipif->ipif_lcl_addr;
1258 			if (ipif->ipif_flags & IPIF_POINTOPOINT) {
1259 				rtm_addrs |= RTA_BRD;
1260 				if (ipif->ipif_isv6)
1261 					brdaddr6 = ipif->ipif_v6pp_dst_addr;
1262 				else
1263 					brdaddr = ipif->ipif_pp_dst_addr;
1264 			}
1265 			ipif_refrele(ipif);
1266 		}
1267 	}
1268 
1269 	new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 1 : 0);
1270 	if (new_mp == NULL) {
1271 		if (gcgrp != NULL)
1272 			rw_exit(&gcgrp->gcgrp_rwlock);
1273 		if (ill != NULL)
1274 			ill_refrele(ill);
1275 		return (NULL);
1276 	}
1277 
1278 	/*
1279 	 * We set the destination address, gateway address,
1280 	 * netmask and flags in the RTM_GET response depending
1281 	 * on whether we found a parent IRE or not.
1282 	 * In particular, if we did find a parent IRE during the
1283 	 * recursive search, use that IRE's gateway address.
1284 	 * Otherwise, we use the IRE's source address for the
1285 	 * gateway address.
1286 	 */
1287 	ASSERT(af == AF_INET || af == AF_INET6);
1288 	switch (af) {
1289 	case AF_INET:
1290 		IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc);
1291 		if (v4setsrc != INADDR_ANY)
1292 			rtm_addrs |= RTA_SRC;
1293 
1294 		rtm_flags = ire->ire_flags;
1295 		rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr,
1296 		    ire->ire_mask, ire->ire_gateway_addr, v4setsrc,
1297 		    brdaddr, 0, ifaddr, ill, new_mp, gc);
1298 		break;
1299 	case AF_INET6:
1300 		if (!IN6_IS_ADDR_UNSPECIFIED(setsrc))
1301 			rtm_addrs |= RTA_SRC;
1302 
1303 		rtm_flags = ire->ire_flags;
1304 		rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6,
1305 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
1306 		    setsrc, &brdaddr6, &ipv6_all_zeros,
1307 		    &ifaddr6, ill, new_mp, gc);
1308 		break;
1309 	}
1310 
1311 	if (gcgrp != NULL)
1312 		rw_exit(&gcgrp->gcgrp_rwlock);
1313 
1314 	new_rtm = (rt_msghdr_t *)new_mp->b_rptr;
1315 
1316 	/*
1317 	 * The rtm_msglen, rtm_version and rtm_type fields in
1318 	 * RTM_GET response are filled in by rts_fill_msg.
1319 	 *
1320 	 * rtm_addrs and rtm_flags are filled in based on what
1321 	 * was requested and the state of the IREs looked up
1322 	 * above.
1323 	 *
1324 	 * rtm_inits and rtm_rmx are filled in with metrics
1325 	 * based on whether a parent IRE was found or not.
1326 	 *
1327 	 * TODO: rtm_index and rtm_use should probably be
1328 	 * filled in with something resonable here and not just
1329 	 * copied from the request.
1330 	 */
1331 	new_rtm->rtm_index = rtm->rtm_index;
1332 	new_rtm->rtm_pid = rtm->rtm_pid;
1333 	new_rtm->rtm_seq = rtm->rtm_seq;
1334 	new_rtm->rtm_use = rtm->rtm_use;
1335 	new_rtm->rtm_addrs = rtm_addrs;
1336 	new_rtm->rtm_flags = rtm_flags;
1337 	new_rtm->rtm_inits = rts_getmetrics(ire, ill, &new_rtm->rtm_rmx);
1338 	if (ill != NULL)
1339 		ill_refrele(ill);
1340 	return (new_mp);
1341 }
1342 
1343 /*
1344  * Fill the given if_data_t with interface statistics.
1345  */
1346 static void
1347 rts_getifdata(if_data_t *if_data, const ipif_t *ipif)
1348 {
1349 	if_data->ifi_type = ipif->ipif_ill->ill_type;
1350 						/* ethernet, tokenring, etc */
1351 	if_data->ifi_addrlen = 0;		/* media address length */
1352 	if_data->ifi_hdrlen = 0;		/* media header length */
1353 	if_data->ifi_mtu = ipif->ipif_ill->ill_mtu;	/* mtu */
1354 	if_data->ifi_metric = ipif->ipif_metric; /* metric (external only) */
1355 	if_data->ifi_baudrate = 0;		/* linespeed */
1356 
1357 	if_data->ifi_ipackets = 0;		/* packets received on if */
1358 	if_data->ifi_ierrors = 0;		/* input errors on interface */
1359 	if_data->ifi_opackets = 0;		/* packets sent on interface */
1360 	if_data->ifi_oerrors = 0;		/* output errors on if */
1361 	if_data->ifi_collisions = 0;		/* collisions on csma if */
1362 	if_data->ifi_ibytes = 0;		/* total number received */
1363 	if_data->ifi_obytes = 0;		/* total number sent */
1364 	if_data->ifi_imcasts = 0;		/* multicast packets received */
1365 	if_data->ifi_omcasts = 0;		/* multicast packets sent */
1366 	if_data->ifi_iqdrops = 0;		/* dropped on input */
1367 	if_data->ifi_noproto = 0;		/* destined for unsupported */
1368 						/* protocol. */
1369 }
1370 
1371 /*
1372  * Set the metrics on a forwarding table route.
1373  */
1374 static void
1375 rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
1376 {
1377 	clock_t		rtt;
1378 	clock_t		rtt_sd;
1379 	ill_t		*ill;
1380 	ifrt_t		*ifrt;
1381 	mblk_t		*mp;
1382 	in6_addr_t	gw_addr_v6;
1383 
1384 	/* Need to add back some metrics to the IRE? */
1385 	/*
1386 	 * Bypass obtaining the lock and searching ill_saved_ire_mp in the
1387 	 * common case of no metrics.
1388 	 */
1389 	if (which == 0)
1390 		return;
1391 	ire->ire_metrics.iulp_set = B_TRUE;
1392 
1393 	/*
1394 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1395 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1396 	 * microseconds.
1397 	 */
1398 	if (which & RTV_RTT)
1399 		rtt = metrics->rmx_rtt / 1000;
1400 	if (which & RTV_RTTVAR)
1401 		rtt_sd = metrics->rmx_rttvar / 1000;
1402 
1403 	/*
1404 	 * Update the metrics in the IRE itself.
1405 	 */
1406 	mutex_enter(&ire->ire_lock);
1407 	if (which & RTV_MTU)
1408 		ire->ire_metrics.iulp_mtu = metrics->rmx_mtu;
1409 	if (which & RTV_RTT)
1410 		ire->ire_metrics.iulp_rtt = rtt;
1411 	if (which & RTV_SSTHRESH)
1412 		ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh;
1413 	if (which & RTV_RTTVAR)
1414 		ire->ire_metrics.iulp_rtt_sd = rtt_sd;
1415 	if (which & RTV_SPIPE)
1416 		ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe;
1417 	if (which & RTV_RPIPE)
1418 		ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe;
1419 	mutex_exit(&ire->ire_lock);
1420 
1421 	/*
1422 	 * Search through the ifrt_t chain hanging off the ILL in order to
1423 	 * reflect the metric change there.
1424 	 */
1425 	ill = ire->ire_ill;
1426 	if (ill == NULL)
1427 		return;
1428 	ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) ||
1429 	    ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION)));
1430 	if (ill->ill_isv6) {
1431 		mutex_enter(&ire->ire_lock);
1432 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1433 		mutex_exit(&ire->ire_lock);
1434 	}
1435 	mutex_enter(&ill->ill_saved_ire_lock);
1436 	for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
1437 		/*
1438 		 * On a given ill, the tuple of address, gateway, mask,
1439 		 * ire_type and zoneid unique for each saved IRE.
1440 		 */
1441 		ifrt = (ifrt_t *)mp->b_rptr;
1442 		if (ill->ill_isv6) {
1443 			if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
1444 			    &ire->ire_addr_v6) ||
1445 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
1446 			    &gw_addr_v6) ||
1447 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
1448 			    &ire->ire_mask_v6))
1449 				continue;
1450 		} else {
1451 			if (ifrt->ifrt_addr != ire->ire_addr ||
1452 			    ifrt->ifrt_gateway_addr != ire->ire_gateway_addr ||
1453 			    ifrt->ifrt_mask != ire->ire_mask)
1454 				continue;
1455 		}
1456 		if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
1457 		    ifrt->ifrt_type != ire->ire_type)
1458 			continue;
1459 
1460 		if (which & RTV_MTU)
1461 			ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu;
1462 		if (which & RTV_RTT)
1463 			ifrt->ifrt_metrics.iulp_rtt = rtt;
1464 		if (which & RTV_SSTHRESH) {
1465 			ifrt->ifrt_metrics.iulp_ssthresh =
1466 			    metrics->rmx_ssthresh;
1467 		}
1468 		if (which & RTV_RTTVAR)
1469 			ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar;
1470 		if (which & RTV_SPIPE)
1471 			ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe;
1472 		if (which & RTV_RPIPE)
1473 			ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe;
1474 		break;
1475 	}
1476 	mutex_exit(&ill->ill_saved_ire_lock);
1477 
1478 	/*
1479 	 * Update any IRE_IF_CLONE hanging created from this IRE_IF so they
1480 	 * get any new iulp_mtu.
1481 	 * We do that by deleting them; ire_create_if_clone will pick
1482 	 * up the new metrics.
1483 	 */
1484 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
1485 		ire_dep_delete_if_clone(ire);
1486 }
1487 
1488 /*
1489  * Get the metrics from a forwarding table route.
1490  */
1491 static int
1492 rts_getmetrics(ire_t *ire, ill_t *ill, rt_metrics_t *metrics)
1493 {
1494 	int	metrics_set = 0;
1495 
1496 	bzero(metrics, sizeof (rt_metrics_t));
1497 
1498 	/*
1499 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1500 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1501 	 * microseconds.
1502 	 */
1503 	metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000;
1504 	metrics_set |= RTV_RTT;
1505 	if (ire->ire_metrics.iulp_mtu != 0) {
1506 		metrics->rmx_mtu = ire->ire_metrics.iulp_mtu;
1507 		metrics_set |= RTV_MTU;
1508 	} else if (ill != NULL) {
1509 		metrics->rmx_mtu = ill->ill_mtu;
1510 		metrics_set |= RTV_MTU;
1511 	}
1512 	metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh;
1513 	metrics_set |= RTV_SSTHRESH;
1514 	metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000;
1515 	metrics_set |= RTV_RTTVAR;
1516 	metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe;
1517 	metrics_set |= RTV_SPIPE;
1518 	metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe;
1519 	metrics_set |= RTV_RPIPE;
1520 	return (metrics_set);
1521 }
1522 
1523 /*
1524  * Given two sets of metrics (src and dst), use the dst values if they are
1525  * set. If a dst value is not set but the src value is set, then we use
1526  * the src value.
1527  * dst is updated with the new values.
1528  * This is used to merge information from a dce_t and ire_metrics, where the
1529  * dce values takes precedence.
1530  */
1531 void
1532 rts_merge_metrics(iulp_t *dst, const iulp_t *src)
1533 {
1534 	if (!src->iulp_set)
1535 		return;
1536 
1537 	if (dst->iulp_ssthresh == 0)
1538 		dst->iulp_ssthresh = src->iulp_ssthresh;
1539 	if (dst->iulp_rtt == 0)
1540 		dst->iulp_rtt = src->iulp_rtt;
1541 	if (dst->iulp_rtt_sd == 0)
1542 		dst->iulp_rtt_sd = src->iulp_rtt_sd;
1543 	if (dst->iulp_spipe == 0)
1544 		dst->iulp_spipe = src->iulp_spipe;
1545 	if (dst->iulp_rpipe == 0)
1546 		dst->iulp_rpipe = src->iulp_rpipe;
1547 	if (dst->iulp_rtomax == 0)
1548 		dst->iulp_rtomax = src->iulp_rtomax;
1549 	if (dst->iulp_sack == 0)
1550 		dst->iulp_sack = src->iulp_sack;
1551 	if (dst->iulp_tstamp_ok == 0)
1552 		dst->iulp_tstamp_ok = src->iulp_tstamp_ok;
1553 	if (dst->iulp_wscale_ok == 0)
1554 		dst->iulp_wscale_ok = src->iulp_wscale_ok;
1555 	if (dst->iulp_ecn_ok == 0)
1556 		dst->iulp_ecn_ok = src->iulp_ecn_ok;
1557 	if (dst->iulp_pmtud_ok == 0)
1558 		dst->iulp_pmtud_ok = src->iulp_pmtud_ok;
1559 	if (dst->iulp_mtu == 0)
1560 		dst->iulp_mtu = src->iulp_mtu;
1561 }
1562 
1563 
1564 /*
1565  * Takes a pointer to a routing message and extracts necessary info by looking
1566  * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers
1567  * passed (all of which must be valid).
1568  *
1569  * The bitmask of sockaddrs actually found in the message is returned, or zero
1570  * is returned in the case of an error.
1571  */
1572 static int
1573 rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp,
1574     in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp,
1575     in6_addr_t *in_src_addrp, ushort_t *indexp, sa_family_t *afp,
1576     tsol_rtsecattr_t *rtsecattr, int *error)
1577 {
1578 	struct sockaddr *sa;
1579 	int	i;
1580 	int	addr_bits;
1581 	int	length;
1582 	int	found_addrs = 0;
1583 	caddr_t	cp;
1584 	size_t	size;
1585 	struct sockaddr_dl *sdl;
1586 
1587 	*dst_addrp = ipv6_all_zeros;
1588 	*gw_addrp = ipv6_all_zeros;
1589 	*net_maskp = ipv6_all_zeros;
1590 	*authorp = ipv6_all_zeros;
1591 	*if_addrp = ipv6_all_zeros;
1592 	*in_src_addrp = ipv6_all_zeros;
1593 	*indexp = 0;
1594 	*afp = AF_UNSPEC;
1595 	rtsecattr->rtsa_cnt = 0;
1596 	*error = 0;
1597 
1598 	/*
1599 	 * At present we handle only RTA_DST, RTA_GATEWAY, RTA_NETMASK, RTA_IFP,
1600 	 * RTA_IFA and RTA_AUTHOR.  The rest will be added as we need them.
1601 	 */
1602 	cp = (caddr_t)&rtm[1];
1603 	length = rtm->rtm_msglen;
1604 	for (i = 0; (i < RTA_NUMBITS) && ((cp - (caddr_t)rtm) < length); i++) {
1605 		/*
1606 		 * The address family we are working with starts out as
1607 		 * AF_UNSPEC, but is set to the one specified with the
1608 		 * destination address.
1609 		 *
1610 		 * If the "working" address family that has been set to
1611 		 * something other than AF_UNSPEC, then the address family of
1612 		 * subsequent sockaddrs must either be AF_UNSPEC (for
1613 		 * compatibility with older programs) or must be the same as our
1614 		 * "working" one.
1615 		 *
1616 		 * This code assumes that RTA_DST (1) comes first in the loop.
1617 		 */
1618 		sa = (struct sockaddr *)cp;
1619 		addr_bits = (rtm->rtm_addrs & (1 << i));
1620 		if (addr_bits == 0)
1621 			continue;
1622 		switch (addr_bits) {
1623 		case RTA_DST:
1624 			size = rts_copyfromsockaddr(sa, dst_addrp);
1625 			*afp = sa->sa_family;
1626 			break;
1627 		case RTA_GATEWAY:
1628 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1629 				return (0);
1630 			size = rts_copyfromsockaddr(sa, gw_addrp);
1631 			break;
1632 		case RTA_NETMASK:
1633 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1634 				return (0);
1635 			size = rts_copyfromsockaddr(sa, net_maskp);
1636 			break;
1637 		case RTA_IFP:
1638 			if (sa->sa_family != AF_LINK &&
1639 			    sa->sa_family != AF_UNSPEC)
1640 				return (0);
1641 			sdl = (struct sockaddr_dl *)cp;
1642 			*indexp = sdl->sdl_index;
1643 			size = sizeof (struct sockaddr_dl);
1644 			break;
1645 		case RTA_SRC:
1646 			/* Source address of the incoming packet */
1647 			size = rts_copyfromsockaddr(sa, in_src_addrp);
1648 			*afp = sa->sa_family;
1649 			break;
1650 		case RTA_IFA:
1651 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1652 				return (0);
1653 			size = rts_copyfromsockaddr(sa, if_addrp);
1654 			break;
1655 		case RTA_AUTHOR:
1656 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1657 				return (0);
1658 			size = rts_copyfromsockaddr(sa, authorp);
1659 			break;
1660 		default:
1661 			return (0);
1662 		}
1663 		if (size == 0)
1664 			return (0);
1665 		cp += size;
1666 		found_addrs |= addr_bits;
1667 	}
1668 
1669 	/*
1670 	 * Parse the routing message and look for any security-
1671 	 * related attributes for the route.  For each valid
1672 	 * attribute, allocate/obtain the corresponding kernel
1673 	 * route security attributes.
1674 	 */
1675 	if (((cp - (caddr_t)rtm) < length) && is_system_labeled()) {
1676 		*error = tsol_rtsa_init(rtm, rtsecattr, cp);
1677 		ASSERT(rtsecattr->rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
1678 	}
1679 
1680 	return (found_addrs);
1681 }
1682 
1683 /*
1684  * Fills the message with the given info.
1685  */
1686 static void
1687 rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
1688     ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author,
1689     ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
1690     const tsol_gc_t *gc)
1691 {
1692 	rt_msghdr_t	*rtm;
1693 	sin_t		*sin;
1694 	size_t		data_size, header_size;
1695 	uchar_t		*cp;
1696 	int		i;
1697 
1698 	ASSERT(mp != NULL);
1699 	/*
1700 	 * First find the type of the message
1701 	 * and its length.
1702 	 */
1703 	header_size = rts_header_msg_size(type);
1704 	/*
1705 	 * Now find the size of the data
1706 	 * that follows the message header.
1707 	 */
1708 	data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0);
1709 
1710 	rtm = (rt_msghdr_t *)mp->b_rptr;
1711 	mp->b_wptr = &mp->b_rptr[header_size];
1712 	cp = mp->b_wptr;
1713 	bzero(cp, data_size);
1714 	for (i = 0; i < RTA_NUMBITS; i++) {
1715 		sin = (sin_t *)cp;
1716 		switch (rtm_addrs & (1 << i)) {
1717 		case RTA_DST:
1718 			sin->sin_addr.s_addr = dst;
1719 			sin->sin_family = AF_INET;
1720 			cp += sizeof (sin_t);
1721 			break;
1722 		case RTA_GATEWAY:
1723 			sin->sin_addr.s_addr = gateway;
1724 			sin->sin_family = AF_INET;
1725 			cp += sizeof (sin_t);
1726 			break;
1727 		case RTA_NETMASK:
1728 			sin->sin_addr.s_addr = mask;
1729 			sin->sin_family = AF_INET;
1730 			cp += sizeof (sin_t);
1731 			break;
1732 		case RTA_IFP:
1733 			cp += ill_dls_info((struct sockaddr_dl *)cp, ill);
1734 			break;
1735 		case RTA_IFA:
1736 			sin->sin_addr.s_addr = ifaddr;
1737 			sin->sin_family = AF_INET;
1738 			cp += sizeof (sin_t);
1739 			break;
1740 		case RTA_SRC:
1741 			sin->sin_addr.s_addr = src_addr;
1742 			sin->sin_family = AF_INET;
1743 			cp += sizeof (sin_t);
1744 			break;
1745 		case RTA_AUTHOR:
1746 			sin->sin_addr.s_addr = author;
1747 			sin->sin_family = AF_INET;
1748 			cp += sizeof (sin_t);
1749 			break;
1750 		case RTA_BRD:
1751 			/*
1752 			 * RTA_BRD is used typically to specify a point-to-point
1753 			 * destination address.
1754 			 */
1755 			sin->sin_addr.s_addr = brd_addr;
1756 			sin->sin_family = AF_INET;
1757 			cp += sizeof (sin_t);
1758 			break;
1759 		}
1760 	}
1761 
1762 	if (gc != NULL) {
1763 		rtm_ext_t *rtm_ext;
1764 		struct rtsa_s *rp_dst;
1765 		tsol_rtsecattr_t *rsap;
1766 
1767 		ASSERT(gc->gc_grp != NULL);
1768 		ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock));
1769 
1770 		rtm_ext = (rtm_ext_t *)cp;
1771 		rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR;
1772 		rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1);
1773 
1774 		rsap = (tsol_rtsecattr_t *)(rtm_ext + 1);
1775 		rsap->rtsa_cnt = 1;
1776 		rp_dst = rsap->rtsa_attr;
1777 
1778 		ASSERT(gc->gc_db != NULL);
1779 		bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
1780 		cp = (uchar_t *)rp_dst;
1781 	}
1782 
1783 	mp->b_wptr = cp;
1784 	mp->b_cont = NULL;
1785 	/*
1786 	 * set the fields that are common to
1787 	 * to different messages.
1788 	 */
1789 	rtm->rtm_msglen = (short)(header_size + data_size);
1790 	rtm->rtm_version = RTM_VERSION;
1791 	rtm->rtm_type = (uchar_t)type;
1792 }
1793 
1794 /*
1795  * Allocates and initializes a routing socket message.
1796  * Note that sacnt is either zero or one.
1797  */
1798 mblk_t *
1799 rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt)
1800 {
1801 	size_t	length;
1802 	mblk_t	*mp;
1803 
1804 	length = RTS_MSG_SIZE(type, rtm_addrs, af, sacnt);
1805 	mp = allocb(length, BPRI_MED);
1806 	if (mp == NULL)
1807 		return (mp);
1808 	bzero(mp->b_rptr, length);
1809 	return (mp);
1810 }
1811 
1812 /*
1813  * Returns the size of the routing
1814  * socket message header size.
1815  */
1816 size_t
1817 rts_header_msg_size(int type)
1818 {
1819 	switch (type) {
1820 	case RTM_DELADDR:
1821 	case RTM_NEWADDR:
1822 	case RTM_CHGADDR:
1823 	case RTM_FREEADDR:
1824 		return (sizeof (ifa_msghdr_t));
1825 	case RTM_IFINFO:
1826 		return (sizeof (if_msghdr_t));
1827 	default:
1828 		return (sizeof (rt_msghdr_t));
1829 	}
1830 }
1831 
1832 /*
1833  * Returns the size of the message needed with the given rtm_addrs and family.
1834  *
1835  * It is assumed that all of the sockaddrs (with the exception of RTA_IFP) are
1836  * of the same family (currently either AF_INET or AF_INET6).
1837  */
1838 size_t
1839 rts_data_msg_size(int rtm_addrs, sa_family_t af, uint_t sacnt)
1840 {
1841 	int	i;
1842 	size_t	length = 0;
1843 
1844 	for (i = 0; i < RTA_NUMBITS; i++) {
1845 		switch (rtm_addrs & (1 << i)) {
1846 		case RTA_IFP:
1847 			length += sizeof (struct sockaddr_dl);
1848 			break;
1849 		case RTA_DST:
1850 		case RTA_GATEWAY:
1851 		case RTA_NETMASK:
1852 		case RTA_SRC:
1853 		case RTA_IFA:
1854 		case RTA_AUTHOR:
1855 		case RTA_BRD:
1856 			ASSERT(af == AF_INET || af == AF_INET6);
1857 			switch (af) {
1858 			case AF_INET:
1859 				length += sizeof (sin_t);
1860 				break;
1861 			case AF_INET6:
1862 				length += sizeof (sin6_t);
1863 				break;
1864 			}
1865 			break;
1866 		}
1867 	}
1868 	if (sacnt > 0)
1869 		length += sizeof (rtm_ext_t) + TSOL_RTSECATTR_SIZE(sacnt);
1870 
1871 	return (length);
1872 }
1873 
1874 /*
1875  * This routine is called to generate a message to the routing
1876  * socket indicating that a redirect has occured, a routing lookup
1877  * has failed, or that a protocol has detected timeouts to a particular
1878  * destination. This routine is called for message types RTM_LOSING,
1879  * RTM_REDIRECT, and RTM_MISS.
1880  */
1881 void
1882 ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
1883     ipaddr_t source, ipaddr_t author, int flags, int error, int rtm_addrs,
1884     ip_stack_t *ipst)
1885 {
1886 	rt_msghdr_t	*rtm;
1887 	mblk_t		*mp;
1888 
1889 	if (rtm_addrs == 0)
1890 		return;
1891 	mp = rts_alloc_msg(type, rtm_addrs, AF_INET, 0);
1892 	if (mp == NULL)
1893 		return;
1894 	rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0,
1895 	    author, 0, NULL, mp, NULL);
1896 	rtm = (rt_msghdr_t *)mp->b_rptr;
1897 	rtm->rtm_flags = flags;
1898 	rtm->rtm_errno = error;
1899 	rtm->rtm_flags |= RTF_DONE;
1900 	rtm->rtm_addrs = rtm_addrs;
1901 	rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst);
1902 }
1903 
1904 /*
1905  * This routine is called to generate a message to the routing
1906  * socket indicating that the status of a network interface has changed.
1907  * Message type generated RTM_IFINFO.
1908  */
1909 void
1910 ip_rts_ifmsg(const ipif_t *ipif, uint_t flags)
1911 {
1912 	ip_rts_xifmsg(ipif, 0, 0, flags);
1913 }
1914 
1915 void
1916 ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
1917 {
1918 	if_msghdr_t	*ifm;
1919 	mblk_t		*mp;
1920 	sa_family_t	af;
1921 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1922 
1923 	/*
1924 	 * This message should be generated only
1925 	 * when the physical device is changing
1926 	 * state.
1927 	 */
1928 	if (ipif->ipif_id != 0)
1929 		return;
1930 	if (ipif->ipif_isv6) {
1931 		af = AF_INET6;
1932 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
1933 		if (mp == NULL)
1934 			return;
1935 		rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros,
1936 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1937 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1938 		    ipif->ipif_ill, mp, NULL);
1939 	} else {
1940 		af = AF_INET;
1941 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
1942 		if (mp == NULL)
1943 			return;
1944 		rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0,
1945 		    ipif->ipif_ill, mp, NULL);
1946 	}
1947 	ifm = (if_msghdr_t *)mp->b_rptr;
1948 	ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
1949 	ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags |
1950 	    ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear;
1951 	rts_getifdata(&ifm->ifm_data, ipif);
1952 	ifm->ifm_addrs = RTA_IFP;
1953 
1954 	if (flags & RTSQ_DEFAULT) {
1955 		flags = RTSQ_ALL;
1956 		/*
1957 		 * If this message is for an underlying interface, prevent
1958 		 * "normal" (IPMP-unaware) routing sockets from seeing it.
1959 		 */
1960 		if (IS_UNDER_IPMP(ipif->ipif_ill))
1961 			flags &= ~RTSQ_NORMAL;
1962 	}
1963 
1964 	rts_queue_input(mp, NULL, af, flags, ipst);
1965 }
1966 
1967 /*
1968  * If cmd is RTM_ADD or RTM_DELETE, generate the rt_msghdr_t message;
1969  * otherwise (RTM_NEWADDR, RTM_DELADDR, RTM_CHGADDR and RTM_FREEADDR)
1970  * generate the ifa_msghdr_t message.
1971  */
1972 static void
1973 rts_new_rtsmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
1974 {
1975 	int		rtm_addrs;
1976 	mblk_t		*mp;
1977 	ifa_msghdr_t	*ifam;
1978 	rt_msghdr_t	*rtm;
1979 	sa_family_t	af;
1980 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1981 
1982 	/*
1983 	 * Do not report unspecified address if this is the RTM_CHGADDR or
1984 	 * RTM_FREEADDR message.
1985 	 */
1986 	if (cmd == RTM_CHGADDR || cmd == RTM_FREEADDR) {
1987 		if (!ipif->ipif_isv6) {
1988 			if (ipif->ipif_lcl_addr == INADDR_ANY)
1989 				return;
1990 		} else if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
1991 			return;
1992 		}
1993 	}
1994 
1995 	if (ipif->ipif_isv6)
1996 		af = AF_INET6;
1997 	else
1998 		af = AF_INET;
1999 
2000 	if (cmd == RTM_ADD || cmd == RTM_DELETE)
2001 		rtm_addrs = (RTA_DST | RTA_NETMASK);
2002 	else
2003 		rtm_addrs = (RTA_IFA | RTA_NETMASK | RTA_BRD | RTA_IFP);
2004 
2005 	mp = rts_alloc_msg(cmd, rtm_addrs, af, 0);
2006 	if (mp == NULL)
2007 		return;
2008 
2009 	if (cmd != RTM_ADD && cmd != RTM_DELETE) {
2010 		switch (af) {
2011 		case AF_INET:
2012 			rts_fill_msg(cmd, rtm_addrs, 0,
2013 			    ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr,
2014 			    ipif->ipif_pp_dst_addr, 0,
2015 			    ipif->ipif_lcl_addr, ipif->ipif_ill,
2016 			    mp, NULL);
2017 			break;
2018 		case AF_INET6:
2019 			rts_fill_msg_v6(cmd, rtm_addrs,
2020 			    &ipv6_all_zeros, &ipif->ipif_v6net_mask,
2021 			    &ipv6_all_zeros, &ipif->ipif_v6lcl_addr,
2022 			    &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
2023 			    &ipif->ipif_v6lcl_addr, ipif->ipif_ill,
2024 			    mp, NULL);
2025 			break;
2026 		}
2027 		ifam = (ifa_msghdr_t *)mp->b_rptr;
2028 		ifam->ifam_index =
2029 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
2030 		ifam->ifam_metric = ipif->ipif_metric;
2031 		ifam->ifam_flags = ((cmd == RTM_NEWADDR) ? RTF_UP : 0);
2032 		ifam->ifam_addrs = rtm_addrs;
2033 	} else {
2034 		switch (af) {
2035 		case AF_INET:
2036 			rts_fill_msg(cmd, rtm_addrs,
2037 			    ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0,
2038 			    0, 0, 0, 0, NULL, mp, NULL);
2039 			break;
2040 		case AF_INET6:
2041 			rts_fill_msg_v6(cmd, rtm_addrs,
2042 			    &ipif->ipif_v6lcl_addr,
2043 			    &ipif->ipif_v6net_mask, &ipv6_all_zeros,
2044 			    &ipv6_all_zeros, &ipv6_all_zeros,
2045 			    &ipv6_all_zeros, &ipv6_all_zeros,
2046 			    NULL, mp, NULL);
2047 			break;
2048 		}
2049 		rtm = (rt_msghdr_t *)mp->b_rptr;
2050 		rtm->rtm_index =
2051 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
2052 		rtm->rtm_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
2053 		rtm->rtm_errno = error;
2054 		if (error == 0)
2055 			rtm->rtm_flags |= RTF_DONE;
2056 		rtm->rtm_addrs = rtm_addrs;
2057 	}
2058 	rts_queue_input(mp, NULL, af, flags, ipst);
2059 }
2060 
2061 /*
2062  * This is called to generate messages to the routing socket
2063  * indicating a network interface has had addresses associated with it.
2064  * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>.
2065  */
2066 void
2067 ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
2068 {
2069 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2070 
2071 	if (flags & RTSQ_DEFAULT) {
2072 		flags = RTSQ_ALL;
2073 		/*
2074 		 * If this message is for an underlying interface, prevent
2075 		 * "normal" (IPMP-unaware) routing sockets from seeing it.
2076 		 */
2077 		if (IS_UNDER_IPMP(ipif->ipif_ill))
2078 			flags &= ~RTSQ_NORMAL;
2079 	}
2080 
2081 	/*
2082 	 * Let conn_ixa caching know that source address selection
2083 	 * changed
2084 	 */
2085 	if (cmd == RTM_ADD || cmd == RTM_DELETE)
2086 		ip_update_source_selection(ipst);
2087 
2088 	/*
2089 	 * If the request is DELETE, send RTM_DELETE and RTM_DELADDR.
2090 	 * if the request is ADD, send RTM_NEWADDR and RTM_ADD.
2091 	 * otherwise simply send the request.
2092 	 */
2093 	switch (cmd) {
2094 	case RTM_ADD:
2095 		rts_new_rtsmsg(RTM_NEWADDR, error, ipif, flags);
2096 		rts_new_rtsmsg(RTM_ADD, error, ipif, flags);
2097 		break;
2098 	case RTM_DELETE:
2099 		rts_new_rtsmsg(RTM_DELETE, error, ipif, flags);
2100 		rts_new_rtsmsg(RTM_DELADDR, error, ipif, flags);
2101 		break;
2102 	default:
2103 		rts_new_rtsmsg(cmd, error, ipif, flags);
2104 		break;
2105 	}
2106 }
2107 
2108 /*
2109  * Based on the address family specified in a sockaddr, copy the address field
2110  * into an in6_addr_t.
2111  *
2112  * In the case of AF_UNSPEC, we assume the family is actually AF_INET for
2113  * compatibility with programs that leave the family cleared in the sockaddr.
2114  * Callers of rts_copyfromsockaddr should check the family themselves if they
2115  * wish to verify its value.
2116  *
2117  * In the case of AF_INET6, a check is made to ensure that address is not an
2118  * IPv4-mapped address.
2119  */
2120 size_t
2121 rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp)
2122 {
2123 	switch (sa->sa_family) {
2124 	case AF_INET:
2125 	case AF_UNSPEC:
2126 		IN6_IPADDR_TO_V4MAPPED(((sin_t *)sa)->sin_addr.s_addr, addrp);
2127 		return (sizeof (sin_t));
2128 	case AF_INET6:
2129 		*addrp = ((sin6_t *)sa)->sin6_addr;
2130 		if (IN6_IS_ADDR_V4MAPPED(addrp))
2131 			return (0);
2132 		return (sizeof (sin6_t));
2133 	default:
2134 		return (0);
2135 	}
2136 }
2137