xref: /freebsd/sys/net/rtsock.c (revision 9768746b)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1988, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
32  * $FreeBSD$
33  */
34 #include "opt_ddb.h"
35 #include "opt_route.h"
36 #include "opt_inet.h"
37 #include "opt_inet6.h"
38 
39 #include <sys/param.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/eventhandler.h>
43 #include <sys/domain.h>
44 #include <sys/lock.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/priv.h>
48 #include <sys/proc.h>
49 #include <sys/protosw.h>
50 #include <sys/rmlock.h>
51 #include <sys/rwlock.h>
52 #include <sys/signalvar.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sys/sysctl.h>
56 #include <sys/systm.h>
57 
58 #include <net/if.h>
59 #include <net/if_var.h>
60 #include <net/if_private.h>
61 #include <net/if_dl.h>
62 #include <net/if_llatbl.h>
63 #include <net/if_types.h>
64 #include <net/netisr.h>
65 #include <net/route.h>
66 #include <net/route/route_ctl.h>
67 #include <net/route/route_var.h>
68 #include <net/vnet.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/if_ether.h>
72 #include <netinet/ip_carp.h>
73 #ifdef INET6
74 #include <netinet6/in6_var.h>
75 #include <netinet6/ip6_var.h>
76 #include <netinet6/scope6_var.h>
77 #endif
78 #include <net/route/nhop.h>
79 
80 #define	DEBUG_MOD_NAME	rtsock
81 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
82 #include <net/route/route_debug.h>
83 _DECLARE_DEBUG(LOG_INFO);
84 
85 #ifdef COMPAT_FREEBSD32
86 #include <sys/mount.h>
87 #include <compat/freebsd32/freebsd32.h>
88 
/*
 * Interface message header with explicitly sized fields, compiled only
 * under COMPAT_FREEBSD32.
 * NOTE(review): presumably the layout handed to 32-bit consumers in place
 * of struct if_msghdr -- confirm against the users of this type.
 * The field order and widths define the layout; do not rearrange them.
 */
struct if_msghdr32 {
	uint16_t ifm_msglen;
	uint8_t	ifm_version;
	uint8_t	ifm_type;
	int32_t	ifm_addrs;
	int32_t	ifm_flags;
	uint16_t ifm_index;
	uint16_t _ifm_spare1;
	struct	if_data ifm_data;
};
99 
/*
 * Extended ("l") interface message header with explicitly sized fields,
 * compiled only under COMPAT_FREEBSD32.  Compared to if_msghdr32 it adds
 * ifm_len, ifm_data_off and a second spare field before ifm_data.
 * NOTE(review): presumably the 32-bit layout of struct if_msghdrl --
 * confirm against the users of this type.
 * The field order and widths define the layout; do not rearrange them.
 */
struct if_msghdrl32 {
	uint16_t ifm_msglen;
	uint8_t	ifm_version;
	uint8_t	ifm_type;
	int32_t	ifm_addrs;
	int32_t	ifm_flags;
	uint16_t ifm_index;
	uint16_t _ifm_spare1;
	uint16_t ifm_len;
	uint16_t ifm_data_off;
	uint32_t _ifm_spare2;
	struct	if_data ifm_data;
};
113 
/*
 * Extended ("l") interface address message header with explicitly sized
 * fields, compiled only under COMPAT_FREEBSD32.
 * NOTE(review): presumably the 32-bit layout of struct ifa_msghdrl --
 * confirm against the users of this type.
 * The field order and widths define the layout; do not rearrange them.
 */
struct ifa_msghdrl32 {
	uint16_t ifam_msglen;
	uint8_t	ifam_version;
	uint8_t	ifam_type;
	int32_t	ifam_addrs;
	int32_t	ifam_flags;
	uint16_t ifam_index;
	uint16_t _ifam_spare1;
	uint16_t ifam_len;
	uint16_t ifam_data_off;
	int32_t	ifam_metric;
	struct	if_data ifam_data;
};
127 
/*
 * Size of sockaddr @sa rounded up to a multiple of sizeof(int).
 * A sockaddr whose sa_len is 0 is counted as sizeof(int) bytes.
 * The argument is evaluated more than once.
 */
#define SA_SIZE32(sa)						\
    (  (((struct sockaddr *)(sa))->sa_len == 0) ?		\
	sizeof(int)		:				\
	1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) )
132 
133 #endif /* COMPAT_FREEBSD32 */
134 
/*
 * Bump allocator state over a caller-provided memory region:
 * alloc_sockaddr_aligned() hands out chunks starting at base + offset and
 * advances offset; nothing is ever returned to the buffer.
 */
struct linear_buffer {
	char		*base;	/* Base allocated memory pointer */
	uint32_t	offset;	/* Currently used offset */
	uint32_t	size;	/* Total buffer size */
};
/* Bytes of scratch space rts_send() reserves behind the message copy. */
#define	SCRATCH_BUFFER_SIZE	1024
141 
/*
 * Log through RT_LOG_<_l>, prefixing the message with "PID <n>: " where
 * <n> is the pid of curproc, or 0 when curproc is NULL.
 * @_fmt must be a string literal (it is concatenated with the prefix).
 */
#define	RTS_PID_LOG(_l, _fmt, ...)	RT_LOG_##_l(_l, "PID %d: " _fmt, curproc ? curproc->p_pid : 0, ## __VA_ARGS__)
143 
/* Malloc type "routetbl", exported as M_RTABLE. */
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");

/* NB: these are not modified */
/* Source address attached to every message queued by rts_append_data(). */
static struct	sockaddr route_src = { 2, PF_ROUTE, };
/* AF_INET sockaddr whose remaining bytes are zero. */
static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };

/* These are external hooks for CARP. */
/* Not assigned in this part of the file; NULL until something sets it. */
int	(*carp_get_vhid_p)(struct ifaddr *);
152 
/*
 * Used by rtsock callback code to decide whether to filter the update
 * notification to a socket bound to a particular FIB.
 * When this mbuf flag is set, rts_input() delivers the message only to
 * sockets whose so_fibnum equals M_GETFIB() of the mbuf.
 */
#define	RTS_FILTER_FIB	M_PROTO8
/*
 * Used to store address family of the notification.
 * rts_input() compares it with each listener's rcb_family.
 */
#define	m_rtsock_family	m_pkthdr.PH_loc.eight[0]
162 
/*
 * Per-socket routing control block.  Allocated in rts_attach(), linked on
 * V_route_cb.cblist, and released in rts_detach().
 */
struct rcb {
	LIST_ENTRY(rcb) list;		/* linkage on V_route_cb.cblist */
	struct socket	*rcb_socket;	/* socket this block belongs to */
	sa_family_t	rcb_family;	/* proto given to rts_attach();
					   AF_UNSPEC matches any family
					   in rts_input() */
};
168 
/*
 * Per-vnet registry of attached routing sockets.  The list and the
 * counters are updated by rts_attach()/rts_detach() under RTSOCK_LOCK().
 */
typedef struct {
	LIST_HEAD(, rcb)	cblist;
	int	ip_count;	/* attached w/ AF_INET */
	int	ip6_count;	/* attached w/ AF_INET6 */
	int	any_count;	/* total attached */
} route_cb_t;
VNET_DEFINE_STATIC(route_cb_t, route_cb);
#define	V_route_cb VNET(route_cb)
177 
/*
 * Mutex guarding V_route_cb (listener list and counters); it is taken by
 * rts_input(), rts_attach() and rts_detach().  It is a single global
 * mutex, not a per-vnet one.
 */
struct mtx rtsock_mtx;
MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);

#define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
#define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
#define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)

/* Root of the net.route sysctl subtree (netisr_maxqlen hangs off it). */
SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
186 
/*
 * Argument bundle passed to rtsock_msg_buffer() and to the sysctl dump
 * helpers (sysctl_dumpentry(), sysctl_iflist(), ...).
 * NOTE(review): only w_tmem/w_tmemsize are exercised in the part of the
 * file reviewed here; the meaning of the other fields is inferred from
 * their names and should be confirmed against the sysctl code.
 */
struct walkarg {
	int	family;
	int	w_tmemsize;	/* size of the buffer at w_tmem */
	int	w_op, w_arg;
	caddr_t	w_tmem;		/* buffer the message is built in */
	struct sysctl_req *w_req;
	struct sockaddr *dst;
	struct sockaddr *mask;
};
196 
/*
 * Forward declarations of file-local helpers, so that they can be
 * referenced before their definitions.
 */
static void	rts_input(struct mbuf *m);
static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
			struct walkarg *w, int *plen);
static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
			struct rt_addrinfo *rtinfo);
static int	cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb);
static int	sysctl_dumpentry(struct rtentry *rt, void *vw);
static int	sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh,
			uint32_t weight, struct walkarg *w);
static int	sysctl_iflist(int af, struct walkarg *w);
static int	sysctl_ifmalist(int af, struct walkarg *w);
static void	rt_getmetrics(const struct rtentry *rt,
			const struct nhop_object *nh, struct rt_metrics *out);
static void	rt_dispatch(struct mbuf *, sa_family_t);
static void	rt_ifannouncemsg(struct ifnet *ifp, int what);
static int	handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
			struct rt_msghdr *rtm, struct rib_cmd_info *rc);
static int	update_rtm_from_rc(struct rt_addrinfo *info,
			struct rt_msghdr **prtm, int alloc_len,
			struct rib_cmd_info *rc, struct nhop_object *nh);
static void	send_rtm_reply(struct socket *so, struct rt_msghdr *rtm,
			struct mbuf *m, sa_family_t saf, u_int fibnum,
			int rtm_errno);
static bool	can_export_rte(struct ucred *td_ucred, bool rt_is_host,
			const struct sockaddr *rt_dst);
static void	rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc);
static void	rtsock_ifmsg(struct ifnet *ifp, int if_flags_mask);
225 
/*
 * netisr registration record for routing socket messages: mbufs queued
 * for NETISR_ROUTE are handed to rts_input().  Registered from
 * vnet_rts_init(); nh_qlimit may be overridden there by a tunable.
 */
static struct netisr_handler rtsock_nh = {
	.nh_name = "rtsock",
	.nh_handler = rts_input,
	.nh_proto = NETISR_ROUTE,
	.nh_policy = NETISR_POLICY_SOURCE,
};
232 
233 static int
234 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
235 {
236 	int error, qlimit;
237 
238 	netisr_getqlimit(&rtsock_nh, &qlimit);
239 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
240         if (error || !req->newptr)
241                 return (error);
242 	if (qlimit < 1)
243 		return (EINVAL);
244 	return (netisr_setqlimit(&rtsock_nh, qlimit));
245 }
246 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen,
247     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
248     0, 0, sysctl_route_netisr_maxqlen, "I",
249     "maximum routing socket dispatch queue length");
250 
251 static void
252 vnet_rts_init(void)
253 {
254 	int tmp;
255 
256 	if (IS_DEFAULT_VNET(curvnet)) {
257 		if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
258 			rtsock_nh.nh_qlimit = tmp;
259 		netisr_register(&rtsock_nh);
260 	}
261 #ifdef VIMAGE
262 	 else
263 		netisr_register_vnet(&rtsock_nh);
264 #endif
265 }
266 VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
267     vnet_rts_init, 0);
268 
269 #ifdef VIMAGE
/*
 * Per-vnet teardown (VIMAGE only): drops this vnet's registration of the
 * rtsock netisr handler.
 */
static void
vnet_rts_uninit(void)
{

	netisr_unregister_vnet(&rtsock_nh);
}
VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
    vnet_rts_uninit, 0);
278 #endif
279 
280 static void
281 report_route_event(const struct rib_cmd_info *rc, void *_cbdata)
282 {
283 	uint32_t fibnum = (uint32_t)(uintptr_t)_cbdata;
284 	struct nhop_object *nh;
285 
286 	nh = rc->rc_cmd == RTM_DELETE ? rc->rc_nh_old : rc->rc_nh_new;
287 	rt_routemsg(rc->rc_cmd, rc->rc_rt, nh, fibnum);
288 }
289 
/*
 * Route-change callback installed through rtsbridge.
 *
 * With ROUTE_MPATH, a change that involves a nexthop group on either side
 * is first decomposed into per-nexthop notifications, each of which is
 * passed to report_route_event().  Everything else is reported directly.
 */
static void
rts_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc)
{
	void *cbdata = (void *)(uintptr_t)fibnum;

#ifdef ROUTE_MPATH
	if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) ||
	    (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) {
		rib_decompose_notification(rc, report_route_event, cbdata);
		return;
	}
#endif
	report_route_event(rc, cbdata);
}
/*
 * Callback table that rtsock_init() publishes through rtsock_callback_p.
 */
static struct rtbridge rtsbridge = {
	.route_f = rts_handle_route_event,
	.ifmsg_f = rtsock_ifmsg,
};
/* Value rtsock_callback_p held before rtsock_init() replaced it. */
static struct rtbridge *rtsbridge_orig_p;
307 
/*
 * Forwards a route change to the route_f callback of netlink_callback_p.
 * The pointer is dereferenced unconditionally, so it must be non-NULL
 * whenever this is called.
 */
static void
rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc)
{
	netlink_callback_p->route_f(fibnum, rc);
}
313 
/*
 * Boot-time hook: remembers the current rtsock_callback_p in
 * rtsbridge_orig_p and then points rtsock_callback_p at rtsbridge.
 */
static void
rtsock_init(void)
{
	rtsbridge_orig_p = rtsock_callback_p;
	rtsock_callback_p = &rtsbridge;
}
SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL);
321 
/*
 * ifnet_arrival_event handler: emits an IFAN_ARRIVAL announcement for
 * @ifp via rt_ifannouncemsg().
 */
static void
rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp)
{
	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
}
EVENTHANDLER_DEFINE(ifnet_arrival_event, rts_handle_ifnet_arrival, NULL, 0);
328 
/*
 * ifnet_departure_event handler: emits an IFAN_DEPARTURE announcement for
 * @ifp via rt_ifannouncemsg().
 */
static void
rts_handle_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
}
EVENTHANDLER_DEFINE(ifnet_departure_event, rts_handle_ifnet_departure, NULL, 0);
335 
336 static void
337 rts_append_data(struct socket *so, struct mbuf *m)
338 {
339 
340 	if (sbappendaddr(&so->so_rcv, &route_src, m, NULL) == 0) {
341 		soroverflow(so);
342 		m_freem(m);
343 	} else
344 		sorwakeup(so);
345 }
346 
347 static void
348 rts_input(struct mbuf *m)
349 {
350 	struct rcb *rcb;
351 	struct socket *last;
352 
353 	last = NULL;
354 	RTSOCK_LOCK();
355 	LIST_FOREACH(rcb, &V_route_cb.cblist, list) {
356 		if (rcb->rcb_family != AF_UNSPEC &&
357 		    rcb->rcb_family != m->m_rtsock_family)
358 			continue;
359 		if ((m->m_flags & RTS_FILTER_FIB) &&
360 		    M_GETFIB(m) != rcb->rcb_socket->so_fibnum)
361 			continue;
362 		if (last != NULL) {
363 			struct mbuf *n;
364 
365 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
366 			if (n != NULL)
367 				rts_append_data(last, n);
368 		}
369 		last = rcb->rcb_socket;
370 	}
371 	if (last != NULL)
372 		rts_append_data(last, m);
373 	else
374 		m_freem(m);
375 	RTSOCK_UNLOCK();
376 }
377 
/*
 * Close handler for routing sockets: marks @so as disconnected.
 */
static void
rts_close(struct socket *so)
{

	soisdisconnected(so);
}
384 
/* net.rtsock sysctl subtree. */
static SYSCTL_NODE(_net, OID_AUTO, rtsock, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Routing socket infrastructure");
/*
 * Buffer sizes that rts_attach() reserves via soreserve() for every new
 * routing socket; both are writable at run time.
 */
static u_long rts_sendspace = 8192;
SYSCTL_ULONG(_net_rtsock, OID_AUTO, sendspace, CTLFLAG_RW, &rts_sendspace, 0,
    "Default routing socket send space");
static u_long rts_recvspace = 8192;
SYSCTL_ULONG(_net_rtsock, OID_AUTO, recvspace, CTLFLAG_RW, &rts_recvspace, 0,
    "Default routing socket receive space");
393 
394 static int
395 rts_attach(struct socket *so, int proto, struct thread *td)
396 {
397 	struct rcb *rcb;
398 	int error;
399 
400 	error = soreserve(so, rts_sendspace, rts_recvspace);
401 	if (error)
402 		return (error);
403 
404 	rcb = malloc(sizeof(*rcb), M_PCB, M_WAITOK);
405 	rcb->rcb_socket = so;
406 	rcb->rcb_family = proto;
407 
408 	so->so_pcb = rcb;
409 	so->so_fibnum = td->td_proc->p_fibnum;
410 	so->so_options |= SO_USELOOPBACK;
411 
412 	RTSOCK_LOCK();
413 	LIST_INSERT_HEAD(&V_route_cb.cblist, rcb, list);
414 	switch (proto) {
415 	case AF_INET:
416 		V_route_cb.ip_count++;
417 		break;
418 	case AF_INET6:
419 		V_route_cb.ip6_count++;
420 		break;
421 	}
422 	V_route_cb.any_count++;
423 	RTSOCK_UNLOCK();
424 	soisconnected(so);
425 
426 	return (0);
427 }
428 
429 static void
430 rts_detach(struct socket *so)
431 {
432 	struct rcb *rcb = so->so_pcb;
433 
434 	RTSOCK_LOCK();
435 	LIST_REMOVE(rcb, list);
436 	switch(rcb->rcb_family) {
437 	case AF_INET:
438 		V_route_cb.ip_count--;
439 		break;
440 	case AF_INET6:
441 		V_route_cb.ip6_count--;
442 		break;
443 	}
444 	V_route_cb.any_count--;
445 	RTSOCK_UNLOCK();
446 	free(rcb, M_PCB);
447 	so->so_pcb = NULL;
448 }
449 
/*
 * Disconnect handler: always fails with ENOTCONN and leaves @so untouched.
 */
static int
rts_disconnect(struct socket *so)
{

	return (ENOTCONN);
}
456 
/*
 * Shutdown handler: marks @so as unable to send any more data.
 * Always returns 0.
 */
static int
rts_shutdown(struct socket *so)
{

	socantsendmore(so);
	return (0);
}
464 
/* Guarded so that an earlier definition of the union is reused. */
#ifndef _SOCKADDR_UNION_DEFINED
#define	_SOCKADDR_UNION_DEFINED
/*
 * The union of all possible address formats we handle.
 * Large enough to hold either an IPv4 or an IPv6 socket address; the
 * generic "sa" view gives access to the common length/family header.
 */
union sockaddr_union {
	struct sockaddr		sa;
	struct sockaddr_in	sin;
	struct sockaddr_in6	sin6;
};
#endif /* _SOCKADDR_UNION_DEFINED */
476 
477 static int
478 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
479     struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred)
480 {
481 #if defined(INET) || defined(INET6)
482 	struct epoch_tracker et;
483 #endif
484 
485 	/* First, see if the returned address is part of the jail. */
486 	if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) {
487 		info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
488 		return (0);
489 	}
490 
491 	switch (info->rti_info[RTAX_DST]->sa_family) {
492 #ifdef INET
493 	case AF_INET:
494 	{
495 		struct in_addr ia;
496 		struct ifaddr *ifa;
497 		int found;
498 
499 		found = 0;
500 		/*
501 		 * Try to find an address on the given outgoing interface
502 		 * that belongs to the jail.
503 		 */
504 		NET_EPOCH_ENTER(et);
505 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
506 			struct sockaddr *sa;
507 			sa = ifa->ifa_addr;
508 			if (sa->sa_family != AF_INET)
509 				continue;
510 			ia = ((struct sockaddr_in *)sa)->sin_addr;
511 			if (prison_check_ip4(cred, &ia) == 0) {
512 				found = 1;
513 				break;
514 			}
515 		}
516 		NET_EPOCH_EXIT(et);
517 		if (!found) {
518 			/*
519 			 * As a last resort return the 'default' jail address.
520 			 */
521 			ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)->
522 			    sin_addr;
523 			if (prison_get_ip4(cred, &ia) != 0)
524 				return (ESRCH);
525 		}
526 		bzero(&saun->sin, sizeof(struct sockaddr_in));
527 		saun->sin.sin_len = sizeof(struct sockaddr_in);
528 		saun->sin.sin_family = AF_INET;
529 		saun->sin.sin_addr.s_addr = ia.s_addr;
530 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
531 		break;
532 	}
533 #endif
534 #ifdef INET6
535 	case AF_INET6:
536 	{
537 		struct in6_addr ia6;
538 		struct ifaddr *ifa;
539 		int found;
540 
541 		found = 0;
542 		/*
543 		 * Try to find an address on the given outgoing interface
544 		 * that belongs to the jail.
545 		 */
546 		NET_EPOCH_ENTER(et);
547 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
548 			struct sockaddr *sa;
549 			sa = ifa->ifa_addr;
550 			if (sa->sa_family != AF_INET6)
551 				continue;
552 			bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
553 			    &ia6, sizeof(struct in6_addr));
554 			if (prison_check_ip6(cred, &ia6) == 0) {
555 				found = 1;
556 				break;
557 			}
558 		}
559 		NET_EPOCH_EXIT(et);
560 		if (!found) {
561 			/*
562 			 * As a last resort return the 'default' jail address.
563 			 */
564 			ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)->
565 			    sin6_addr;
566 			if (prison_get_ip6(cred, &ia6) != 0)
567 				return (ESRCH);
568 		}
569 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
570 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
571 		saun->sin6.sin6_family = AF_INET6;
572 		bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
573 		if (sa6_recoverscope(&saun->sin6) != 0)
574 			return (ESRCH);
575 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
576 		break;
577 	}
578 #endif
579 	default:
580 		return (ESRCH);
581 	}
582 	return (0);
583 }
584 
585 static int
586 fill_blackholeinfo(struct rt_addrinfo *info, union sockaddr_union *saun)
587 {
588 	struct ifaddr *ifa;
589 	sa_family_t saf;
590 
591 	if (V_loif == NULL) {
592 		RTS_PID_LOG(LOG_INFO, "Unable to add blackhole/reject nhop without loopback");
593 		return (ENOTSUP);
594 	}
595 	info->rti_ifp = V_loif;
596 
597 	saf = info->rti_info[RTAX_DST]->sa_family;
598 
599 	CK_STAILQ_FOREACH(ifa, &info->rti_ifp->if_addrhead, ifa_link) {
600 		if (ifa->ifa_addr->sa_family == saf) {
601 			info->rti_ifa = ifa;
602 			break;
603 		}
604 	}
605 	if (info->rti_ifa == NULL) {
606 		RTS_PID_LOG(LOG_INFO, "Unable to find ifa for blackhole/reject nhop");
607 		return (ENOTSUP);
608 	}
609 
610 	bzero(saun, sizeof(union sockaddr_union));
611 	switch (saf) {
612 #ifdef INET
613 	case AF_INET:
614 		saun->sin.sin_family = AF_INET;
615 		saun->sin.sin_len = sizeof(struct sockaddr_in);
616 		saun->sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
617 		break;
618 #endif
619 #ifdef INET6
620 	case AF_INET6:
621 		saun->sin6.sin6_family = AF_INET6;
622 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
623 		saun->sin6.sin6_addr = in6addr_loopback;
624 		break;
625 #endif
626 	default:
627 		RTS_PID_LOG(LOG_INFO, "unsupported family: %d", saf);
628 		return (ENOTSUP);
629 	}
630 	info->rti_info[RTAX_GATEWAY] = &saun->sa;
631 	info->rti_flags |= RTF_GATEWAY;
632 
633 	return (0);
634 }
635 
636 /*
637  * Fills in @info based on userland-provided @rtm message.
638  *
639  * Returns 0 on success.
640  */
641 static int
642 fill_addrinfo(struct rt_msghdr *rtm, int len, struct linear_buffer *lb, u_int fibnum,
643     struct rt_addrinfo *info)
644 {
645 	int error;
646 
647 	rtm->rtm_pid = curproc->p_pid;
648 	info->rti_addrs = rtm->rtm_addrs;
649 
650 	info->rti_mflags = rtm->rtm_inits;
651 	info->rti_rmx = &rtm->rtm_rmx;
652 
653 	/*
654 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
655 	 * link-local address because rtrequest requires addresses with
656 	 * embedded scope id.
657 	 */
658 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, info))
659 		return (EINVAL);
660 
661 	info->rti_flags = rtm->rtm_flags;
662 	error = cleanup_xaddrs(info, lb);
663 	if (error != 0)
664 		return (error);
665 	/*
666 	 * Verify that the caller has the appropriate privilege; RTM_GET
667 	 * is the only operation the non-superuser is allowed.
668 	 */
669 	if (rtm->rtm_type != RTM_GET) {
670 		error = priv_check(curthread, PRIV_NET_ROUTE);
671 		if (error != 0)
672 			return (error);
673 	}
674 
675 	/*
676 	 * The given gateway address may be an interface address.
677 	 * For example, issuing a "route change" command on a route
678 	 * entry that was created from a tunnel, and the gateway
679 	 * address given is the local end point. In this case the
680 	 * RTF_GATEWAY flag must be cleared or the destination will
681 	 * not be reachable even though there is no error message.
682 	 */
683 	if (info->rti_info[RTAX_GATEWAY] != NULL &&
684 	    info->rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
685 		struct nhop_object *nh;
686 
687 		/*
688 		 * A host route through the loopback interface is
689 		 * installed for each interface adddress. In pre 8.0
690 		 * releases the interface address of a PPP link type
691 		 * is not reachable locally. This behavior is fixed as
692 		 * part of the new L2/L3 redesign and rewrite work. The
693 		 * signature of this interface address route is the
694 		 * AF_LINK sa_family type of the gateway, and the
695 		 * rt_ifp has the IFF_LOOPBACK flag set.
696 		 */
697 		nh = rib_lookup(fibnum, info->rti_info[RTAX_GATEWAY], NHR_NONE, 0);
698 		if (nh != NULL && nh->gw_sa.sa_family == AF_LINK &&
699 		    nh->nh_ifp->if_flags & IFF_LOOPBACK) {
700 				info->rti_flags &= ~RTF_GATEWAY;
701 				info->rti_flags |= RTF_GWFLAG_COMPAT;
702 		}
703 	}
704 
705 	return (0);
706 }
707 
708 static struct nhop_object *
709 select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
710 {
711 	if (!NH_IS_NHGRP(nh))
712 		return (nh);
713 #ifdef ROUTE_MPATH
714 	const struct weightened_nhop *wn;
715 	uint32_t num_nhops;
716 	wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
717 	if (gw == NULL)
718 		return (wn[0].nh);
719 	for (int i = 0; i < num_nhops; i++) {
720 		if (match_nhop_gw(wn[i].nh, gw))
721 			return (wn[i].nh);
722 	}
723 #endif
724 	return (NULL);
725 }
726 
727 /*
728  * Handles RTM_GET message from routing socket, returning matching rt.
729  *
730  * Returns:
731  * 0 on success, with locked and referenced matching rt in @rt_nrt
732  * errno of failure
733  */
734 static int
735 handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
736     struct rt_msghdr *rtm, struct rib_cmd_info *rc)
737 {
738 	RIB_RLOCK_TRACKER;
739 	struct rib_head *rnh;
740 	struct nhop_object *nh;
741 	sa_family_t saf;
742 
743 	saf = info->rti_info[RTAX_DST]->sa_family;
744 
745 	rnh = rt_tables_get_rnh(fibnum, saf);
746 	if (rnh == NULL)
747 		return (EAFNOSUPPORT);
748 
749 	RIB_RLOCK(rnh);
750 
751 	/*
752 	 * By (implicit) convention host route (one without netmask)
753 	 * means longest-prefix-match request and the route with netmask
754 	 * means exact-match lookup.
755 	 * As cleanup_xaddrs() cleans up info flags&addrs for the /32,/128
756 	 * prefixes, use original data to check for the netmask presence.
757 	 */
758 	if ((rtm->rtm_addrs & RTA_NETMASK) == 0) {
759 		/*
760 		 * Provide longest prefix match for
761 		 * address lookup (no mask).
762 		 * 'route -n get addr'
763 		 */
764 		rc->rc_rt = (struct rtentry *) rnh->rnh_matchaddr(
765 		    info->rti_info[RTAX_DST], &rnh->head);
766 	} else
767 		rc->rc_rt = (struct rtentry *) rnh->rnh_lookup(
768 		    info->rti_info[RTAX_DST],
769 		    info->rti_info[RTAX_NETMASK], &rnh->head);
770 
771 	if (rc->rc_rt == NULL) {
772 		RIB_RUNLOCK(rnh);
773 		return (ESRCH);
774 	}
775 
776 	nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]);
777 	if (nh == NULL) {
778 		RIB_RUNLOCK(rnh);
779 		return (ESRCH);
780 	}
781 	/*
782 	 * If performing proxied L2 entry insertion, and
783 	 * the actual PPP host entry is found, perform
784 	 * another search to retrieve the prefix route of
785 	 * the local end point of the PPP link.
786 	 * TODO: move this logic to userland.
787 	 */
788 	if (rtm->rtm_flags & RTF_ANNOUNCE) {
789 		struct sockaddr_storage laddr;
790 
791 		if (nh->nh_ifp != NULL &&
792 		    nh->nh_ifp->if_type == IFT_PROPVIRTUAL) {
793 			struct ifaddr *ifa;
794 
795 			ifa = ifa_ifwithnet(info->rti_info[RTAX_DST], 1,
796 					RT_ALL_FIBS);
797 			if (ifa != NULL)
798 				rt_maskedcopy(ifa->ifa_addr,
799 					      (struct sockaddr *)&laddr,
800 					      ifa->ifa_netmask);
801 		} else
802 			rt_maskedcopy(nh->nh_ifa->ifa_addr,
803 				      (struct sockaddr *)&laddr,
804 				      nh->nh_ifa->ifa_netmask);
805 		/*
806 		 * refactor rt and no lock operation necessary
807 		 */
808 		rc->rc_rt = (struct rtentry *)rnh->rnh_matchaddr(
809 		    (struct sockaddr *)&laddr, &rnh->head);
810 		if (rc->rc_rt == NULL) {
811 			RIB_RUNLOCK(rnh);
812 			return (ESRCH);
813 		}
814 		nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]);
815 		if (nh == NULL) {
816 			RIB_RUNLOCK(rnh);
817 			return (ESRCH);
818 		}
819 	}
820 	rc->rc_nh_new = nh;
821 	rc->rc_nh_weight = rc->rc_rt->rt_weight;
822 	RIB_RUNLOCK(rnh);
823 
824 	return (0);
825 }
826 
/*
 * Initialises @dst and @mask as empty socket addresses of @family:
 * both are zeroed and then given the family and length of a sockaddr_in
 * (AF_INET) or sockaddr_in6 (AF_INET6).  Other families leave both
 * buffers untouched.
 *
 * The caller must provide buffers large enough for the chosen family.
 */
static void
init_sockaddrs_family(int family, struct sockaddr *dst, struct sockaddr *mask)
{
	switch (family) {
#ifdef INET
	case AF_INET:
	{
		struct sockaddr_in *d4 = (struct sockaddr_in *)dst;
		struct sockaddr_in *m4 = (struct sockaddr_in *)mask;

		bzero(d4, sizeof(*d4));
		d4->sin_len = sizeof(*d4);
		d4->sin_family = AF_INET;

		bzero(m4, sizeof(*m4));
		m4->sin_len = sizeof(*m4);
		m4->sin_family = AF_INET;
		break;
	}
#endif
#ifdef INET6
	case AF_INET6:
	{
		struct sockaddr_in6 *d6 = (struct sockaddr_in6 *)dst;
		struct sockaddr_in6 *m6 = (struct sockaddr_in6 *)mask;

		bzero(d6, sizeof(*d6));
		d6->sin6_len = sizeof(*d6);
		d6->sin6_family = AF_INET6;

		bzero(m6, sizeof(*m6));
		m6->sin6_len = sizeof(*m6);
		m6->sin6_family = AF_INET6;
		break;
	}
#endif
	default:
		break;
	}
}
859 
860 static void
861 export_rtaddrs(const struct rtentry *rt, struct sockaddr *dst,
862     struct sockaddr *mask)
863 {
864 #ifdef INET
865 	if (dst->sa_family == AF_INET) {
866 		struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
867 		struct sockaddr_in *mask4 = (struct sockaddr_in *)mask;
868 		uint32_t scopeid = 0;
869 		rt_get_inet_prefix_pmask(rt, &dst4->sin_addr, &mask4->sin_addr,
870 		    &scopeid);
871 		return;
872 	}
873 #endif
874 #ifdef INET6
875 	if (dst->sa_family == AF_INET6) {
876 		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
877 		struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask;
878 		uint32_t scopeid = 0;
879 		rt_get_inet6_prefix_pmask(rt, &dst6->sin6_addr,
880 		    &mask6->sin6_addr, &scopeid);
881 		dst6->sin6_scope_id = scopeid;
882 		return;
883 	}
884 #endif
885 }
886 
887 static int
888 update_rtm_from_info(struct rt_addrinfo *info, struct rt_msghdr **prtm,
889     int alloc_len)
890 {
891 	struct rt_msghdr *rtm, *orig_rtm = NULL;
892 	struct walkarg w;
893 	int len;
894 
895 	rtm = *prtm;
896 	/* Check if we need to realloc storage */
897 	rtsock_msg_buffer(rtm->rtm_type, info, NULL, &len);
898 	if (len > alloc_len) {
899 		struct rt_msghdr *tmp_rtm;
900 
901 		tmp_rtm = malloc(len, M_TEMP, M_NOWAIT);
902 		if (tmp_rtm == NULL)
903 			return (ENOBUFS);
904 		bcopy(rtm, tmp_rtm, rtm->rtm_msglen);
905 		orig_rtm = rtm;
906 		rtm = tmp_rtm;
907 		alloc_len = len;
908 
909 		/*
910 		 * Delay freeing original rtm as info contains
911 		 * data referencing it.
912 		 */
913 	}
914 
915 	w.w_tmem = (caddr_t)rtm;
916 	w.w_tmemsize = alloc_len;
917 	rtsock_msg_buffer(rtm->rtm_type, info, &w, &len);
918 	rtm->rtm_addrs = info->rti_addrs;
919 
920 	if (orig_rtm != NULL)
921 		free(orig_rtm, M_TEMP);
922 	*prtm = rtm;
923 	return (0);
924 }
925 
926 
927 /*
928  * Update sockaddrs, flags, etc in @prtm based on @rc data.
929  * rtm can be reallocated.
930  *
931  * Returns 0 on success, along with pointer to (potentially reallocated)
932  *  rtm.
933  *
934  */
935 static int
936 update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm,
937     int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh)
938 {
939 	union sockaddr_union saun;
940 	struct rt_msghdr *rtm;
941 	struct ifnet *ifp;
942 	int error;
943 
944 	rtm = *prtm;
945 	union sockaddr_union sa_dst, sa_mask;
946 	int family = info->rti_info[RTAX_DST]->sa_family;
947 	init_sockaddrs_family(family, &sa_dst.sa, &sa_mask.sa);
948 	export_rtaddrs(rc->rc_rt, &sa_dst.sa, &sa_mask.sa);
949 
950 	info->rti_info[RTAX_DST] = &sa_dst.sa;
951 	info->rti_info[RTAX_NETMASK] = rt_is_host(rc->rc_rt) ? NULL : &sa_mask.sa;
952 	info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
953 	info->rti_info[RTAX_GENMASK] = 0;
954 	ifp = nh->nh_ifp;
955 	if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
956 		if (ifp) {
957 			info->rti_info[RTAX_IFP] =
958 			    ifp->if_addr->ifa_addr;
959 			error = rtm_get_jailed(info, ifp, nh,
960 			    &saun, curthread->td_ucred);
961 			if (error != 0)
962 				return (error);
963 			if (ifp->if_flags & IFF_POINTOPOINT)
964 				info->rti_info[RTAX_BRD] =
965 				    nh->nh_ifa->ifa_dstaddr;
966 			rtm->rtm_index = ifp->if_index;
967 		} else {
968 			info->rti_info[RTAX_IFP] = NULL;
969 			info->rti_info[RTAX_IFA] = NULL;
970 		}
971 	} else if (ifp != NULL)
972 		rtm->rtm_index = ifp->if_index;
973 
974 	if ((error = update_rtm_from_info(info, prtm, alloc_len)) != 0)
975 		return (error);
976 
977 	rtm = *prtm;
978 	rtm->rtm_flags = rc->rc_rt->rte_flags | nhop_get_rtflags(nh);
979 	if (rtm->rtm_flags & RTF_GWFLAG_COMPAT)
980 		rtm->rtm_flags = RTF_GATEWAY |
981 			(rtm->rtm_flags & ~RTF_GWFLAG_COMPAT);
982 	rt_getmetrics(rc->rc_rt, nh, &rtm->rtm_rmx);
983 	rtm->rtm_rmx.rmx_weight = rc->rc_nh_weight;
984 
985 	return (0);
986 }
987 
988 #ifdef ROUTE_MPATH
989 static void
990 save_del_notification(const struct rib_cmd_info *rc, void *_cbdata)
991 {
992 	struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata;
993 
994 	if (rc->rc_cmd == RTM_DELETE)
995 		*rc_new = *rc;
996 }
997 
998 static void
999 save_add_notification(const struct rib_cmd_info *rc, void *_cbdata)
1000 {
1001 	struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata;
1002 
1003 	if (rc->rc_cmd == RTM_ADD)
1004 		*rc_new = *rc;
1005 }
1006 #endif
1007 
1008 #if defined(INET6) || defined(INET)
1009 static struct sockaddr *
1010 alloc_sockaddr_aligned(struct linear_buffer *lb, int len)
1011 {
1012 	len = roundup2(len, sizeof(uint64_t));
1013 	if (lb->offset + len > lb->size)
1014 		return (NULL);
1015 	struct sockaddr *sa = (struct sockaddr *)(lb->base + lb->offset);
1016 	lb->offset += len;
1017 	return (sa);
1018 }
1019 #endif
1020 
1021 static int
1022 rts_send(struct socket *so, int flags, struct mbuf *m,
1023     struct sockaddr *nam, struct mbuf *control, struct thread *td)
1024 {
1025 	struct rt_msghdr *rtm = NULL;
1026 	struct rt_addrinfo info;
1027 	struct epoch_tracker et;
1028 #ifdef INET6
1029 	struct sockaddr_storage ss;
1030 	struct sockaddr_in6 *sin6;
1031 	int i, rti_need_deembed = 0;
1032 #endif
1033 	int alloc_len = 0, len, error = 0, fibnum;
1034 	sa_family_t saf = AF_UNSPEC;
1035 	struct rib_cmd_info rc;
1036 	struct nhop_object *nh;
1037 
1038 	if ((flags & PRUS_OOB) || control != NULL) {
1039 		m_freem(m);
1040 		if (control != NULL)
1041 			m_freem(control);
1042 		return (EOPNOTSUPP);
1043 	}
1044 
1045 	fibnum = so->so_fibnum;
1046 #define senderr(e) { error = e; goto flush;}
1047 	if (m == NULL || ((m->m_len < sizeof(long)) &&
1048 		       (m = m_pullup(m, sizeof(long))) == NULL))
1049 		return (ENOBUFS);
1050 	if ((m->m_flags & M_PKTHDR) == 0)
1051 		panic("route_output");
1052 	NET_EPOCH_ENTER(et);
1053 	len = m->m_pkthdr.len;
1054 	if (len < sizeof(*rtm) ||
1055 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
1056 		senderr(EINVAL);
1057 
	/*
	 * Most current messages are in the 200-240 byte range;
	 * minimize possible re-allocation on reply by using a larger
	 * buffer aligned on a 1k boundary.
	 */
1063 	alloc_len = roundup2(len, 1024);
1064 	int total_len = alloc_len + SCRATCH_BUFFER_SIZE;
1065 	if ((rtm = malloc(total_len, M_TEMP, M_NOWAIT)) == NULL)
1066 		senderr(ENOBUFS);
1067 
1068 	m_copydata(m, 0, len, (caddr_t)rtm);
1069 	bzero(&info, sizeof(info));
1070 	nh = NULL;
1071 	struct linear_buffer lb = {
1072 		.base = (char *)rtm + alloc_len,
1073 		.size = SCRATCH_BUFFER_SIZE,
1074 	};
1075 
1076 	if (rtm->rtm_version != RTM_VERSION) {
1077 		/* Do not touch message since format is unknown */
1078 		free(rtm, M_TEMP);
1079 		rtm = NULL;
1080 		senderr(EPROTONOSUPPORT);
1081 	}
1082 
1083 	/*
1084 	 * Starting from here, it is possible
1085 	 * to alter original message and insert
1086 	 * caller PID and error value.
1087 	 */
1088 
1089 	if ((error = fill_addrinfo(rtm, len, &lb, fibnum, &info)) != 0) {
1090 		senderr(error);
1091 	}
1092 	/* fill_addringo() embeds scope into IPv6 addresses */
1093 #ifdef INET6
1094 	rti_need_deembed = 1;
1095 #endif
1096 
1097 	saf = info.rti_info[RTAX_DST]->sa_family;
1098 
1099 	/* support for new ARP code */
1100 	if (rtm->rtm_flags & RTF_LLDATA) {
1101 		error = lla_rt_output(rtm, &info);
1102 		goto flush;
1103 	}
1104 
1105 	union sockaddr_union gw_saun;
1106 	int blackhole_flags = rtm->rtm_flags & (RTF_BLACKHOLE|RTF_REJECT);
1107 	if (blackhole_flags != 0) {
1108 		if (blackhole_flags != (RTF_BLACKHOLE | RTF_REJECT))
1109 			error = fill_blackholeinfo(&info, &gw_saun);
1110 		else {
1111 			RTS_PID_LOG(LOG_DEBUG, "both BLACKHOLE and REJECT flags specifiied");
1112 			error = EINVAL;
1113 		}
1114 		if (error != 0)
1115 			senderr(error);
1116 	}
1117 
1118 	switch (rtm->rtm_type) {
1119 	case RTM_ADD:
1120 	case RTM_CHANGE:
1121 		if (rtm->rtm_type == RTM_ADD) {
1122 			if (info.rti_info[RTAX_GATEWAY] == NULL) {
1123 				RTS_PID_LOG(LOG_DEBUG, "RTM_ADD w/o gateway");
1124 				senderr(EINVAL);
1125 			}
1126 		}
1127 		error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
1128 		if (error == 0) {
1129 			rtsock_notify_event(fibnum, &rc);
1130 #ifdef ROUTE_MPATH
1131 			if (NH_IS_NHGRP(rc.rc_nh_new) ||
1132 			    (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
1133 				struct rib_cmd_info rc_simple = {};
1134 				rib_decompose_notification(&rc,
1135 				    save_add_notification, (void *)&rc_simple);
1136 				rc = rc_simple;
1137 			}
1138 #endif
1139 			/* nh MAY be empty if RTM_CHANGE request is no-op */
1140 			nh = rc.rc_nh_new;
1141 			if (nh != NULL) {
1142 				rtm->rtm_index = nh->nh_ifp->if_index;
1143 				rtm->rtm_flags = rc.rc_rt->rte_flags | nhop_get_rtflags(nh);
1144 			}
1145 		}
1146 		break;
1147 
1148 	case RTM_DELETE:
1149 		error = rib_action(fibnum, RTM_DELETE, &info, &rc);
1150 		if (error == 0) {
1151 			rtsock_notify_event(fibnum, &rc);
1152 #ifdef ROUTE_MPATH
1153 			if (NH_IS_NHGRP(rc.rc_nh_old) ||
1154 			    (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
1155 				struct rib_cmd_info rc_simple = {};
1156 				rib_decompose_notification(&rc,
1157 				    save_del_notification, (void *)&rc_simple);
1158 				rc = rc_simple;
1159 			}
1160 #endif
1161 			nh = rc.rc_nh_old;
1162 		}
1163 		break;
1164 
1165 	case RTM_GET:
1166 		error = handle_rtm_get(&info, fibnum, rtm, &rc);
1167 		if (error != 0)
1168 			senderr(error);
1169 		nh = rc.rc_nh_new;
1170 
1171 		if (!can_export_rte(curthread->td_ucred,
1172 		    info.rti_info[RTAX_NETMASK] == NULL,
1173 		    info.rti_info[RTAX_DST])) {
1174 			senderr(ESRCH);
1175 		}
1176 		break;
1177 
1178 	default:
1179 		senderr(EOPNOTSUPP);
1180 	}
1181 
1182 	if (error == 0 && nh != NULL) {
1183 		error = update_rtm_from_rc(&info, &rtm, alloc_len, &rc, nh);
1184 		/*
1185 		 * Note that some sockaddr pointers may have changed to
1186 		 * point to memory outsize @rtm. Some may be pointing
1187 		 * to the on-stack variables.
1188 		 * Given that, any pointer in @info CANNOT BE USED.
1189 		 */
1190 
1191 		/*
1192 		 * scopeid deembedding has been performed while
1193 		 * writing updated rtm in rtsock_msg_buffer().
1194 		 * With that in mind, skip deembedding procedure below.
1195 		 */
1196 #ifdef INET6
1197 		rti_need_deembed = 0;
1198 #endif
1199 	}
1200 
1201 flush:
1202 	NET_EPOCH_EXIT(et);
1203 
1204 #ifdef INET6
1205 	if (rtm != NULL) {
1206 		if (rti_need_deembed) {
1207 			/* sin6_scope_id is recovered before sending rtm. */
1208 			sin6 = (struct sockaddr_in6 *)&ss;
1209 			for (i = 0; i < RTAX_MAX; i++) {
1210 				if (info.rti_info[i] == NULL)
1211 					continue;
1212 				if (info.rti_info[i]->sa_family != AF_INET6)
1213 					continue;
1214 				bcopy(info.rti_info[i], sin6, sizeof(*sin6));
1215 				if (sa6_recoverscope(sin6) == 0)
1216 					bcopy(sin6, info.rti_info[i],
1217 						    sizeof(*sin6));
1218 			}
1219 			if (update_rtm_from_info(&info, &rtm, alloc_len) != 0) {
1220 				if (error != 0)
1221 					error = ENOBUFS;
1222 			}
1223 		}
1224 	}
1225 #endif
1226 	send_rtm_reply(so, rtm, m, saf, fibnum, error);
1227 
1228 	return (error);
1229 }
1230 
1231 /*
1232  * Sends the prepared reply message in @rtm to all rtsock clients.
1233  * Frees @m and @rtm.
1234  *
1235  */
1236 static void
1237 send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m,
1238     sa_family_t saf, u_int fibnum, int rtm_errno)
1239 {
1240 	struct rcb *rcb = NULL;
1241 
1242 	/*
1243 	 * Check to see if we don't want our own messages.
1244 	 */
1245 	if ((so->so_options & SO_USELOOPBACK) == 0) {
1246 		if (V_route_cb.any_count <= 1) {
1247 			if (rtm != NULL)
1248 				free(rtm, M_TEMP);
1249 			m_freem(m);
1250 			return;
1251 		}
1252 		/* There is another listener, so construct message */
1253 		rcb = so->so_pcb;
1254 	}
1255 
1256 	if (rtm != NULL) {
1257 		if (rtm_errno!= 0)
1258 			rtm->rtm_errno = rtm_errno;
1259 		else
1260 			rtm->rtm_flags |= RTF_DONE;
1261 
1262 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
1263 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
1264 			m_freem(m);
1265 			m = NULL;
1266 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
1267 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
1268 
1269 		free(rtm, M_TEMP);
1270 	}
1271 	if (m != NULL) {
1272 		M_SETFIB(m, fibnum);
1273 		m->m_flags |= RTS_FILTER_FIB;
1274 		if (rcb) {
1275 			/*
1276 			 * XXX insure we don't get a copy by
1277 			 * invalidating our protocol
1278 			 */
1279 			sa_family_t family = rcb->rcb_family;
1280 			rcb->rcb_family = AF_UNSPEC;
1281 			rt_dispatch(m, saf);
1282 			rcb->rcb_family = family;
1283 		} else
1284 			rt_dispatch(m, saf);
1285 	}
1286 }
1287 
1288 static void
1289 rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh,
1290     struct rt_metrics *out)
1291 {
1292 
1293 	bzero(out, sizeof(*out));
1294 	out->rmx_mtu = nh->nh_mtu;
1295 	out->rmx_weight = rt->rt_weight;
1296 	out->rmx_nhidx = nhop_get_idx(nh);
1297 	/* Kernel -> userland timebase conversion. */
1298 	out->rmx_expire = nhop_get_expire(nh) ?
1299 	    nhop_get_expire(nh) - time_uptime + time_second : 0;
1300 }
1301 
1302 /*
1303  * Extract the addresses of the passed sockaddrs.
1304  * Do a little sanity checking so as to avoid bad memory references.
1305  * This data is derived straight from userland.
1306  */
1307 static int
1308 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
1309 {
1310 	struct sockaddr *sa;
1311 	int i;
1312 
1313 	for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
1314 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
1315 			continue;
1316 		sa = (struct sockaddr *)cp;
1317 		/*
1318 		 * It won't fit.
1319 		 */
1320 		if (cp + sa->sa_len > cplim) {
1321 			RTS_PID_LOG(LOG_DEBUG, "sa_len too big for sa type %d", i);
1322 			return (EINVAL);
1323 		}
1324 		/*
1325 		 * there are no more.. quit now
1326 		 * If there are more bits, they are in error.
1327 		 * I've seen this. route(1) can evidently generate these.
1328 		 * This causes kernel to core dump.
1329 		 * for compatibility, If we see this, point to a safe address.
1330 		 */
1331 		if (sa->sa_len == 0) {
1332 			rtinfo->rti_info[i] = &sa_zero;
1333 			return (0); /* should be EINVAL but for compat */
1334 		}
1335 		/* accept it */
1336 #ifdef INET6
1337 		if (sa->sa_family == AF_INET6)
1338 			sa6_embedscope((struct sockaddr_in6 *)sa,
1339 			    V_ip6_use_defzone);
1340 #endif
1341 		rtinfo->rti_info[i] = sa;
1342 		cp += SA_SIZE(sa);
1343 	}
1344 	return (0);
1345 }
1346 
1347 #ifdef INET
1348 static inline void
1349 fill_sockaddr_inet(struct sockaddr_in *sin, struct in_addr addr)
1350 {
1351 
1352 	const struct sockaddr_in nsin = {
1353 		.sin_family = AF_INET,
1354 		.sin_len = sizeof(struct sockaddr_in),
1355 		.sin_addr = addr,
1356 	};
1357 	*sin = nsin;
1358 }
1359 #endif
1360 
1361 #ifdef INET6
1362 static inline void
1363 fill_sockaddr_inet6(struct sockaddr_in6 *sin6, const struct in6_addr *addr6,
1364     uint32_t scopeid)
1365 {
1366 
1367 	const struct sockaddr_in6 nsin6 = {
1368 		.sin6_family = AF_INET6,
1369 		.sin6_len = sizeof(struct sockaddr_in6),
1370 		.sin6_addr = *addr6,
1371 		.sin6_scope_id = scopeid,
1372 	};
1373 	*sin6 = nsin6;
1374 }
1375 #endif
1376 
1377 #if defined(INET6) || defined(INET)
1378 /*
1379  * Checks if gateway is suitable for lltable operations.
1380  * Lltable code requires AF_LINK gateway with ifindex
1381  *  and mac address specified.
1382  * Returns 0 on success.
1383  */
1384 static int
1385 cleanup_xaddrs_lladdr(struct rt_addrinfo *info)
1386 {
1387 	struct sockaddr_dl *sdl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY];
1388 
1389 	if (sdl->sdl_family != AF_LINK)
1390 		return (EINVAL);
1391 
1392 	if (sdl->sdl_index == 0) {
1393 		RTS_PID_LOG(LOG_DEBUG, "AF_LINK gateway w/o ifindex");
1394 		return (EINVAL);
1395 	}
1396 
1397 	if (offsetof(struct sockaddr_dl, sdl_data) + sdl->sdl_nlen + sdl->sdl_alen > sdl->sdl_len) {
1398 		RTS_PID_LOG(LOG_DEBUG, "AF_LINK gw: sdl_nlen/sdl_alen too large");
1399 		return (EINVAL);
1400 	}
1401 
1402 	return (0);
1403 }
1404 
1405 static int
1406 cleanup_xaddrs_gateway(struct rt_addrinfo *info, struct linear_buffer *lb)
1407 {
1408 	struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
1409 	struct sockaddr *sa;
1410 
1411 	if (info->rti_flags & RTF_LLDATA)
1412 		return (cleanup_xaddrs_lladdr(info));
1413 
1414 	switch (gw->sa_family) {
1415 #ifdef INET
1416 	case AF_INET:
1417 		{
1418 			struct sockaddr_in *gw_sin = (struct sockaddr_in *)gw;
1419 
1420 			/* Ensure reads do not go beyoud SA boundary */
1421 			if (SA_SIZE(gw) < offsetof(struct sockaddr_in, sin_zero)) {
1422 				RTS_PID_LOG(LOG_DEBUG, "gateway sin_len too small: %d",
1423 				    gw->sa_len);
1424 				return (EINVAL);
1425 			}
1426 			sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_in));
1427 			if (sa == NULL)
1428 				return (ENOBUFS);
1429 			fill_sockaddr_inet((struct sockaddr_in *)sa, gw_sin->sin_addr);
1430 			info->rti_info[RTAX_GATEWAY] = sa;
1431 		}
1432 		break;
1433 #endif
1434 #ifdef INET6
1435 	case AF_INET6:
1436 		{
1437 			struct sockaddr_in6 *gw_sin6 = (struct sockaddr_in6 *)gw;
1438 			if (gw_sin6->sin6_len < sizeof(struct sockaddr_in6)) {
1439 				RTS_PID_LOG(LOG_DEBUG, "gateway sin6_len too small: %d",
1440 				    gw->sa_len);
1441 				return (EINVAL);
1442 			}
1443 			fill_sockaddr_inet6(gw_sin6, &gw_sin6->sin6_addr, 0);
1444 			break;
1445 		}
1446 #endif
1447 	case AF_LINK:
1448 		{
1449 			struct sockaddr_dl *gw_sdl;
1450 
1451 			size_t sdl_min_len = offsetof(struct sockaddr_dl, sdl_data);
1452 			gw_sdl = (struct sockaddr_dl *)gw;
1453 			if (gw_sdl->sdl_len < sdl_min_len) {
1454 				RTS_PID_LOG(LOG_DEBUG, "gateway sdl_len too small: %d",
1455 				    gw_sdl->sdl_len);
1456 				return (EINVAL);
1457 			}
1458 			sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_dl_short));
1459 			if (sa == NULL)
1460 				return (ENOBUFS);
1461 
1462 			const struct sockaddr_dl_short sdl = {
1463 				.sdl_family = AF_LINK,
1464 				.sdl_len = sizeof(struct sockaddr_dl_short),
1465 				.sdl_index = gw_sdl->sdl_index,
1466 			};
1467 			*((struct sockaddr_dl_short *)sa) = sdl;
1468 			info->rti_info[RTAX_GATEWAY] = sa;
1469 			break;
1470 		}
1471 	}
1472 
1473 	return (0);
1474 }
1475 #endif
1476 
1477 static void
1478 remove_netmask(struct rt_addrinfo *info)
1479 {
1480 	info->rti_info[RTAX_NETMASK] = NULL;
1481 	info->rti_flags |= RTF_HOST;
1482 	info->rti_addrs &= ~RTA_NETMASK;
1483 }
1484 
1485 #ifdef INET
1486 static int
1487 cleanup_xaddrs_inet(struct rt_addrinfo *info, struct linear_buffer *lb)
1488 {
1489 	struct sockaddr_in *dst_sa, *mask_sa;
1490 	const int sa_len = sizeof(struct sockaddr_in);
1491 	struct in_addr dst, mask;
1492 
1493 	/* Check & fixup dst/netmask combination first */
1494 	dst_sa = (struct sockaddr_in *)info->rti_info[RTAX_DST];
1495 	mask_sa = (struct sockaddr_in *)info->rti_info[RTAX_NETMASK];
1496 
1497 	/* Ensure reads do not go beyound the buffer size */
1498 	if (SA_SIZE(dst_sa) < offsetof(struct sockaddr_in, sin_zero)) {
1499 		RTS_PID_LOG(LOG_DEBUG, "prefix dst sin_len too small: %d",
1500 		    dst_sa->sin_len);
1501 		return (EINVAL);
1502 	}
1503 
1504 	if ((mask_sa != NULL) && mask_sa->sin_len < sizeof(struct sockaddr_in)) {
1505 		/*
1506 		 * Some older routing software encode mask length into the
1507 		 * sin_len, thus resulting in "truncated" sockaddr.
1508 		 */
1509 		int len = mask_sa->sin_len - offsetof(struct sockaddr_in, sin_addr);
1510 		if (len >= 0) {
1511 			mask.s_addr = 0;
1512 			if (len > sizeof(struct in_addr))
1513 				len = sizeof(struct in_addr);
1514 			memcpy(&mask, &mask_sa->sin_addr, len);
1515 		} else {
1516 			RTS_PID_LOG(LOG_DEBUG, "prefix mask sin_len too small: %d",
1517 			    mask_sa->sin_len);
1518 			return (EINVAL);
1519 		}
1520 	} else
1521 		mask.s_addr = mask_sa ? mask_sa->sin_addr.s_addr : INADDR_BROADCAST;
1522 
1523 	dst.s_addr = htonl(ntohl(dst_sa->sin_addr.s_addr) & ntohl(mask.s_addr));
1524 
1525 	/* Construct new "clean" dst/mask sockaddresses */
1526 	if ((dst_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL)
1527 		return (ENOBUFS);
1528 	fill_sockaddr_inet(dst_sa, dst);
1529 	info->rti_info[RTAX_DST] = (struct sockaddr *)dst_sa;
1530 
1531 	if (mask.s_addr != INADDR_BROADCAST) {
1532 		if ((mask_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL)
1533 			return (ENOBUFS);
1534 		fill_sockaddr_inet(mask_sa, mask);
1535 		info->rti_info[RTAX_NETMASK] = (struct sockaddr *)mask_sa;
1536 		info->rti_flags &= ~RTF_HOST;
1537 	} else
1538 		remove_netmask(info);
1539 
1540 	/* Check gateway */
1541 	if (info->rti_info[RTAX_GATEWAY] != NULL)
1542 		return (cleanup_xaddrs_gateway(info, lb));
1543 
1544 	return (0);
1545 }
1546 #endif
1547 
1548 #ifdef INET6
1549 static int
1550 cleanup_xaddrs_inet6(struct rt_addrinfo *info, struct linear_buffer *lb)
1551 {
1552 	struct sockaddr *sa;
1553 	struct sockaddr_in6 *dst_sa, *mask_sa;
1554 	struct in6_addr mask, *dst;
1555 	const int sa_len = sizeof(struct sockaddr_in6);
1556 
1557 	/* Check & fixup dst/netmask combination first */
1558 	dst_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_DST];
1559 	mask_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_NETMASK];
1560 
1561 	if (dst_sa->sin6_len < sizeof(struct sockaddr_in6)) {
1562 		RTS_PID_LOG(LOG_DEBUG, "prefix dst sin6_len too small: %d",
1563 		    dst_sa->sin6_len);
1564 		return (EINVAL);
1565 	}
1566 
1567 	if (mask_sa && mask_sa->sin6_len < sizeof(struct sockaddr_in6)) {
1568 		/*
1569 		 * Some older routing software encode mask length into the
1570 		 * sin6_len, thus resulting in "truncated" sockaddr.
1571 		 */
1572 		int len = mask_sa->sin6_len - offsetof(struct sockaddr_in6, sin6_addr);
1573 		if (len >= 0) {
1574 			bzero(&mask, sizeof(mask));
1575 			if (len > sizeof(struct in6_addr))
1576 				len = sizeof(struct in6_addr);
1577 			memcpy(&mask, &mask_sa->sin6_addr, len);
1578 		} else {
1579 			RTS_PID_LOG(LOG_DEBUG, "rtsock: prefix mask sin6_len too small: %d",
1580 			    mask_sa->sin6_len);
1581 			return (EINVAL);
1582 		}
1583 	} else
1584 		mask = mask_sa ? mask_sa->sin6_addr : in6mask128;
1585 
1586 	dst = &dst_sa->sin6_addr;
1587 	IN6_MASK_ADDR(dst, &mask);
1588 
1589 	if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL)
1590 		return (ENOBUFS);
1591 	fill_sockaddr_inet6((struct sockaddr_in6 *)sa, dst, 0);
1592 	info->rti_info[RTAX_DST] = sa;
1593 
1594 	if (!IN6_ARE_ADDR_EQUAL(&mask, &in6mask128)) {
1595 		if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL)
1596 			return (ENOBUFS);
1597 		fill_sockaddr_inet6((struct sockaddr_in6 *)sa, &mask, 0);
1598 		info->rti_info[RTAX_NETMASK] = sa;
1599 		info->rti_flags &= ~RTF_HOST;
1600 	} else
1601 		remove_netmask(info);
1602 
1603 	/* Check gateway */
1604 	if (info->rti_info[RTAX_GATEWAY] != NULL)
1605 		return (cleanup_xaddrs_gateway(info, lb));
1606 
1607 	return (0);
1608 }
1609 #endif
1610 
1611 static int
1612 cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb)
1613 {
1614 	int error = EAFNOSUPPORT;
1615 
1616 	if (info->rti_info[RTAX_DST] == NULL) {
1617 		RTS_PID_LOG(LOG_DEBUG, "prefix dst is not set");
1618 		return (EINVAL);
1619 	}
1620 
1621 	if (info->rti_flags & RTF_LLDATA) {
1622 		/*
1623 		 * arp(8)/ndp(8) sends RTA_NETMASK for the associated
1624 		 * prefix along with the actual address in RTA_DST.
1625 		 * Remove netmask to avoid unnecessary address masking.
1626 		 */
1627 		remove_netmask(info);
1628 	}
1629 
1630 	switch (info->rti_info[RTAX_DST]->sa_family) {
1631 #ifdef INET
1632 	case AF_INET:
1633 		error = cleanup_xaddrs_inet(info, lb);
1634 		break;
1635 #endif
1636 #ifdef INET6
1637 	case AF_INET6:
1638 		error = cleanup_xaddrs_inet6(info, lb);
1639 		break;
1640 #endif
1641 	}
1642 
1643 	return (error);
1644 }
1645 
1646 /*
1647  * Fill in @dmask with valid netmask leaving original @smask
1648  * intact. Mostly used with radix netmasks.
1649  */
1650 struct sockaddr *
1651 rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask,
1652     struct sockaddr_storage *dmask)
1653 {
1654 	if (dst == NULL || smask == NULL)
1655 		return (NULL);
1656 
1657 	memset(dmask, 0, dst->sa_len);
1658 	memcpy(dmask, smask, smask->sa_len);
1659 	dmask->ss_len = dst->sa_len;
1660 	dmask->ss_family = dst->sa_family;
1661 
1662 	return ((struct sockaddr *)dmask);
1663 }
1664 
1665 /*
1666  * Writes information related to @rtinfo object to newly-allocated mbuf.
1667  * Assumes MCLBYTES is enough to construct any message.
1668  * Used for OS notifications of vaious events (if/ifa announces,etc)
1669  *
1670  * Returns allocated mbuf or NULL on failure.
1671  */
1672 static struct mbuf *
1673 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
1674 {
1675 	struct sockaddr_storage ss;
1676 	struct rt_msghdr *rtm;
1677 	struct mbuf *m;
1678 	int i;
1679 	struct sockaddr *sa;
1680 #ifdef INET6
1681 	struct sockaddr_in6 *sin6;
1682 #endif
1683 	int len, dlen;
1684 
1685 	switch (type) {
1686 	case RTM_DELADDR:
1687 	case RTM_NEWADDR:
1688 		len = sizeof(struct ifa_msghdr);
1689 		break;
1690 
1691 	case RTM_DELMADDR:
1692 	case RTM_NEWMADDR:
1693 		len = sizeof(struct ifma_msghdr);
1694 		break;
1695 
1696 	case RTM_IFINFO:
1697 		len = sizeof(struct if_msghdr);
1698 		break;
1699 
1700 	case RTM_IFANNOUNCE:
1701 	case RTM_IEEE80211:
1702 		len = sizeof(struct if_announcemsghdr);
1703 		break;
1704 
1705 	default:
1706 		len = sizeof(struct rt_msghdr);
1707 	}
1708 
1709 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
1710 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
1711 	if (len > MHLEN)
1712 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1713 	else
1714 		m = m_gethdr(M_NOWAIT, MT_DATA);
1715 	if (m == NULL)
1716 		return (m);
1717 
1718 	m->m_pkthdr.len = m->m_len = len;
1719 	rtm = mtod(m, struct rt_msghdr *);
1720 	bzero((caddr_t)rtm, len);
1721 	for (i = 0; i < RTAX_MAX; i++) {
1722 		if ((sa = rtinfo->rti_info[i]) == NULL)
1723 			continue;
1724 		rtinfo->rti_addrs |= (1 << i);
1725 
1726 		dlen = SA_SIZE(sa);
1727 		KASSERT(dlen <= sizeof(ss),
1728 		    ("%s: sockaddr size overflow", __func__));
1729 		bzero(&ss, sizeof(ss));
1730 		bcopy(sa, &ss, sa->sa_len);
1731 		sa = (struct sockaddr *)&ss;
1732 #ifdef INET6
1733 		if (sa->sa_family == AF_INET6) {
1734 			sin6 = (struct sockaddr_in6 *)sa;
1735 			(void)sa6_recoverscope(sin6);
1736 		}
1737 #endif
1738 		m_copyback(m, len, dlen, (caddr_t)sa);
1739 		len += dlen;
1740 	}
1741 	if (m->m_pkthdr.len != len) {
1742 		m_freem(m);
1743 		return (NULL);
1744 	}
1745 	rtm->rtm_msglen = len;
1746 	rtm->rtm_version = RTM_VERSION;
1747 	rtm->rtm_type = type;
1748 	return (m);
1749 }
1750 
/*
 * Writes information related to @rtinfo object to preallocated buffer.
 * Stores needed size in @plen. If @w is NULL, calculates size without
 * writing.
 * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
 *
 * The buffer is taken from @w->w_tmem / @w->w_tmemsize. The message
 * consists of a type-specific header followed by every sockaddr present in
 * @rtinfo; @rtinfo->rti_addrs is reset and rebuilt to reflect the
 * sockaddrs found. The total length is rounded up with ALIGN().
 *
 * Returns 0 on success.
 * Returns ENOBUFS if @w is given but its buffer is too small; @plen still
 * holds the size that would have been needed.
 */
static int
rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
{
	struct sockaddr_storage ss;
	int len, buflen = 0, dlen, i;
	caddr_t cp = NULL;
	struct rt_msghdr *rtm = NULL;
#ifdef INET6
	struct sockaddr_in6 *sin6;
#endif
#ifdef COMPAT_FREEBSD32
	bool compat32 = false;
#endif

	/* Pick the header size for the message type (and request flavour). */
	switch (type) {
	case RTM_DELADDR:
	case RTM_NEWADDR:
		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
#ifdef COMPAT_FREEBSD32
			if (w->w_req->flags & SCTL_MASK32) {
				len = sizeof(struct ifa_msghdrl32);
				compat32 = true;
			} else
#endif
				len = sizeof(struct ifa_msghdrl);
		} else
			len = sizeof(struct ifa_msghdr);
		break;

	case RTM_IFINFO:
#ifdef COMPAT_FREEBSD32
		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
			if (w->w_op == NET_RT_IFLISTL)
				len = sizeof(struct if_msghdrl32);
			else
				len = sizeof(struct if_msghdr32);
			compat32 = true;
			break;
		}
#endif
		if (w != NULL && w->w_op == NET_RT_IFLISTL)
			len = sizeof(struct if_msghdrl);
		else
			len = sizeof(struct if_msghdr);
		break;

	case RTM_NEWMADDR:
		len = sizeof(struct ifma_msghdr);
		break;

	default:
		len = sizeof(struct rt_msghdr);
	}

	/*
	 * With a buffer supplied, sockaddrs are written right after the
	 * header. Without one, cp stays NULL and only sizes are counted.
	 */
	if (w != NULL) {
		rtm = (struct rt_msghdr *)w->w_tmem;
		buflen = w->w_tmemsize - len;
		cp = (caddr_t)w->w_tmem + len;
	}

	rtinfo->rti_addrs = 0;
	for (i = 0; i < RTAX_MAX; i++) {
		struct sockaddr *sa;

		if ((sa = rtinfo->rti_info[i]) == NULL)
			continue;
		rtinfo->rti_addrs |= (1 << i);
#ifdef COMPAT_FREEBSD32
		if (compat32)
			dlen = SA_SIZE32(sa);
		else
#endif
			dlen = SA_SIZE(sa);
		if (cp != NULL && buflen >= dlen) {
			KASSERT(dlen <= sizeof(ss),
			    ("%s: sockaddr size overflow", __func__));
			/* Work on a zero-padded copy so the source stays intact. */
			bzero(&ss, sizeof(ss));
			bcopy(sa, &ss, sa->sa_len);
			sa = (struct sockaddr *)&ss;
#ifdef INET6
			if (sa->sa_family == AF_INET6) {
				sin6 = (struct sockaddr_in6 *)sa;
				(void)sa6_recoverscope(sin6);
			}
#endif
			bcopy((caddr_t)sa, cp, (unsigned)dlen);
			cp += dlen;
			buflen -= dlen;
		} else if (cp != NULL) {
			/*
			 * Buffer too small. Count needed size
			 * and return with error.
			 */
			cp = NULL;
		}

		/* The needed size is accumulated whether or not we wrote. */
		len += dlen;
	}

	/* Zero the padding that brings the message up to ALIGN(len). */
	if (cp != NULL) {
		dlen = ALIGN(len) - len;
		if (buflen < dlen)
			cp = NULL;
		else {
			bzero(cp, dlen);
			cp += dlen;
			buflen -= dlen;
		}
	}
	len = ALIGN(len);

	if (cp != NULL) {
		/* fill header iff buffer is large enough */
		rtm->rtm_version = RTM_VERSION;
		rtm->rtm_type = type;
		rtm->rtm_msglen = len;
	}

	*plen = len;

	/* A buffer was supplied but the message did not fit into it. */
	if (w != NULL && cp == NULL)
		return (ENOBUFS);

	return (0);
}
1885 
1886 /*
1887  * This routine is called to generate a message from the routing
1888  * socket indicating that a redirect has occurred, a routing lookup
1889  * has failed, or that a protocol has detected timeouts to a particular
1890  * destination.
1891  */
1892 void
1893 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
1894     int fibnum)
1895 {
1896 	struct rt_msghdr *rtm;
1897 	struct mbuf *m;
1898 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
1899 
1900 	if (V_route_cb.any_count == 0)
1901 		return;
1902 	m = rtsock_msg_mbuf(type, rtinfo);
1903 	if (m == NULL)
1904 		return;
1905 
1906 	if (fibnum != RT_ALL_FIBS) {
1907 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
1908 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
1909 		M_SETFIB(m, fibnum);
1910 		m->m_flags |= RTS_FILTER_FIB;
1911 	}
1912 
1913 	rtm = mtod(m, struct rt_msghdr *);
1914 	rtm->rtm_flags = RTF_DONE | flags;
1915 	rtm->rtm_errno = error;
1916 	rtm->rtm_addrs = rtinfo->rti_addrs;
1917 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
1918 }
1919 
/*
 * Same as rt_missmsg_fib(), with the fib set to RT_ALL_FIBS.
 */
void
rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
{

	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
}
1926 
1927 /*
1928  * This routine is called to generate a message from the routing
1929  * socket indicating that the status of a network interface has changed.
1930  */
1931 static void
1932 rtsock_ifmsg(struct ifnet *ifp, int if_flags_mask __unused)
1933 {
1934 	struct if_msghdr *ifm;
1935 	struct mbuf *m;
1936 	struct rt_addrinfo info;
1937 
1938 	if (V_route_cb.any_count == 0)
1939 		return;
1940 	bzero((caddr_t)&info, sizeof(info));
1941 	m = rtsock_msg_mbuf(RTM_IFINFO, &info);
1942 	if (m == NULL)
1943 		return;
1944 	ifm = mtod(m, struct if_msghdr *);
1945 	ifm->ifm_index = ifp->if_index;
1946 	ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
1947 	if_data_copy(ifp, &ifm->ifm_data);
1948 	ifm->ifm_addrs = 0;
1949 	rt_dispatch(m, AF_UNSPEC);
1950 }
1951 
1952 /*
1953  * Announce interface address arrival/withdraw.
1954  * Please do not call directly, use rt_addrmsg().
1955  * Assume input data to be valid.
1956  * Returns 0 on success.
1957  */
1958 int
1959 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
1960 {
1961 	struct rt_addrinfo info;
1962 	struct sockaddr *sa;
1963 	int ncmd;
1964 	struct mbuf *m;
1965 	struct ifa_msghdr *ifam;
1966 	struct ifnet *ifp = ifa->ifa_ifp;
1967 	struct sockaddr_storage ss;
1968 
1969 	if (V_route_cb.any_count == 0)
1970 		return (0);
1971 
1972 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
1973 
1974 	bzero((caddr_t)&info, sizeof(info));
1975 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
1976 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
1977 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
1978 	    info.rti_info[RTAX_IFA], ifa->ifa_netmask, &ss);
1979 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1980 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
1981 		return (ENOBUFS);
1982 	ifam = mtod(m, struct ifa_msghdr *);
1983 	ifam->ifam_index = ifp->if_index;
1984 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
1985 	ifam->ifam_flags = ifa->ifa_flags;
1986 	ifam->ifam_addrs = info.rti_addrs;
1987 
1988 	if (fibnum != RT_ALL_FIBS) {
1989 		M_SETFIB(m, fibnum);
1990 		m->m_flags |= RTS_FILTER_FIB;
1991 	}
1992 
1993 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
1994 
1995 	return (0);
1996 }
1997 
1998 /*
1999  * Announce route addition/removal to rtsock based on @rt data.
2000  * Callers are advives to use rt_routemsg() instead of using this
2001  *  function directly.
2002  * Assume @rt data is consistent.
2003  *
2004  * Returns 0 on success.
2005  */
2006 int
2007 rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
2008     int fibnum)
2009 {
2010 	union sockaddr_union dst, mask;
2011 	struct rt_addrinfo info;
2012 
2013 	if (V_route_cb.any_count == 0)
2014 		return (0);
2015 
2016 	int family = rt_get_family(rt);
2017 	init_sockaddrs_family(family, &dst.sa, &mask.sa);
2018 	export_rtaddrs(rt, &dst.sa, &mask.sa);
2019 
2020 	bzero((caddr_t)&info, sizeof(info));
2021 	info.rti_info[RTAX_DST] = &dst.sa;
2022 	info.rti_info[RTAX_NETMASK] = &mask.sa;
2023 	info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
2024 	info.rti_flags = rt->rte_flags | nhop_get_rtflags(nh);
2025 	info.rti_ifp = nh->nh_ifp;
2026 
2027 	return (rtsock_routemsg_info(cmd, &info, fibnum));
2028 }
2029 
2030 int
2031 rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
2032 {
2033 	struct rt_msghdr *rtm;
2034 	struct sockaddr *sa;
2035 	struct mbuf *m;
2036 
2037 	if (V_route_cb.any_count == 0)
2038 		return (0);
2039 
2040 	if (info->rti_flags & RTF_HOST)
2041 		info->rti_info[RTAX_NETMASK] = NULL;
2042 
2043 	m = rtsock_msg_mbuf(cmd, info);
2044 	if (m == NULL)
2045 		return (ENOBUFS);
2046 
2047 	if (fibnum != RT_ALL_FIBS) {
2048 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
2049 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
2050 		M_SETFIB(m, fibnum);
2051 		m->m_flags |= RTS_FILTER_FIB;
2052 	}
2053 
2054 	rtm = mtod(m, struct rt_msghdr *);
2055 	rtm->rtm_addrs = info->rti_addrs;
2056 	if (info->rti_ifp != NULL)
2057 		rtm->rtm_index = info->rti_ifp->if_index;
2058 	/* Add RTF_DONE to indicate command 'completion' required by API */
2059 	info->rti_flags |= RTF_DONE;
2060 	/* Reported routes has to be up */
2061 	if (cmd == RTM_ADD || cmd == RTM_CHANGE)
2062 		info->rti_flags |= RTF_UP;
2063 	rtm->rtm_flags = info->rti_flags;
2064 
2065 	sa = info->rti_info[RTAX_DST];
2066 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
2067 
2068 	return (0);
2069 }
2070 
2071 /*
2072  * This is the analogue to the rt_newaddrmsg which performs the same
2073  * function but for multicast group memberhips.  This is easier since
2074  * there is no route state to worry about.
2075  */
2076 void
2077 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
2078 {
2079 	struct rt_addrinfo info;
2080 	struct mbuf *m = NULL;
2081 	struct ifnet *ifp = ifma->ifma_ifp;
2082 	struct ifma_msghdr *ifmam;
2083 
2084 	if (V_route_cb.any_count == 0)
2085 		return;
2086 
2087 	bzero((caddr_t)&info, sizeof(info));
2088 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
2089 	if (ifp && ifp->if_addr)
2090 		info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
2091 	else
2092 		info.rti_info[RTAX_IFP] = NULL;
2093 	/*
2094 	 * If a link-layer address is present, present it as a ``gateway''
2095 	 * (similarly to how ARP entries, e.g., are presented).
2096 	 */
2097 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
2098 	m = rtsock_msg_mbuf(cmd, &info);
2099 	if (m == NULL)
2100 		return;
2101 	ifmam = mtod(m, struct ifma_msghdr *);
2102 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
2103 	    __func__));
2104 	ifmam->ifmam_index = ifp->if_index;
2105 	ifmam->ifmam_addrs = info.rti_addrs;
2106 	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
2107 }
2108 
2109 static struct mbuf *
2110 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
2111 	struct rt_addrinfo *info)
2112 {
2113 	struct if_announcemsghdr *ifan;
2114 	struct mbuf *m;
2115 
2116 	if (V_route_cb.any_count == 0)
2117 		return NULL;
2118 	bzero((caddr_t)info, sizeof(*info));
2119 	m = rtsock_msg_mbuf(type, info);
2120 	if (m != NULL) {
2121 		ifan = mtod(m, struct if_announcemsghdr *);
2122 		ifan->ifan_index = ifp->if_index;
2123 		strlcpy(ifan->ifan_name, ifp->if_xname,
2124 			sizeof(ifan->ifan_name));
2125 		ifan->ifan_what = what;
2126 	}
2127 	return m;
2128 }
2129 
2130 /*
2131  * This is called to generate routing socket messages indicating
2132  * IEEE80211 wireless events.
2133  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
2134  */
2135 void
2136 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
2137 {
2138 	struct mbuf *m;
2139 	struct rt_addrinfo info;
2140 
2141 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
2142 	if (m != NULL) {
2143 		/*
2144 		 * Append the ieee80211 data.  Try to stick it in the
2145 		 * mbuf containing the ifannounce msg; otherwise allocate
2146 		 * a new mbuf and append.
2147 		 *
2148 		 * NB: we assume m is a single mbuf.
2149 		 */
2150 		if (data_len > M_TRAILINGSPACE(m)) {
2151 			struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
2152 			if (n == NULL) {
2153 				m_freem(m);
2154 				return;
2155 			}
2156 			bcopy(data, mtod(n, void *), data_len);
2157 			n->m_len = data_len;
2158 			m->m_next = n;
2159 		} else if (data_len > 0) {
2160 			bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
2161 			m->m_len += data_len;
2162 		}
2163 		if (m->m_flags & M_PKTHDR)
2164 			m->m_pkthdr.len += data_len;
2165 		mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
2166 		rt_dispatch(m, AF_UNSPEC);
2167 	}
2168 }
2169 
2170 /*
2171  * This is called to generate routing socket messages indicating
2172  * network interface arrival and departure.
2173  */
2174 static void
2175 rt_ifannouncemsg(struct ifnet *ifp, int what)
2176 {
2177 	struct mbuf *m;
2178 	struct rt_addrinfo info;
2179 
2180 	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
2181 	if (m != NULL)
2182 		rt_dispatch(m, AF_UNSPEC);
2183 }
2184 
2185 static void
2186 rt_dispatch(struct mbuf *m, sa_family_t saf)
2187 {
2188 
2189 	M_ASSERTPKTHDR(m);
2190 
2191 	m->m_rtsock_family = saf;
2192 	if (V_loif)
2193 		m->m_pkthdr.rcvif = V_loif;
2194 	else {
2195 		m_freem(m);
2196 		return;
2197 	}
2198 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
2199 }
2200 
2201 /*
2202  * Checks if rte can be exported w.r.t jails/vnets.
2203  *
2204  * Returns true if it can, false otherwise.
2205  */
2206 static bool
2207 can_export_rte(struct ucred *td_ucred, bool rt_is_host,
2208     const struct sockaddr *rt_dst)
2209 {
2210 
2211 	if ((!rt_is_host) ? jailed_without_vnet(td_ucred)
2212 	    : prison_if(td_ucred, rt_dst) != 0)
2213 		return (false);
2214 	return (true);
2215 }
2216 
2217 
2218 /*
2219  * This is used in dumping the kernel table via sysctl().
2220  */
2221 static int
2222 sysctl_dumpentry(struct rtentry *rt, void *vw)
2223 {
2224 	struct walkarg *w = vw;
2225 	struct nhop_object *nh;
2226 
2227 	NET_EPOCH_ASSERT();
2228 
2229 	export_rtaddrs(rt, w->dst, w->mask);
2230 	if (!can_export_rte(w->w_req->td->td_ucred, rt_is_host(rt), w->dst))
2231 		return (0);
2232 	nh = rt_get_raw_nhop(rt);
2233 #ifdef ROUTE_MPATH
2234 	if (NH_IS_NHGRP(nh)) {
2235 		const struct weightened_nhop *wn;
2236 		uint32_t num_nhops;
2237 		int error;
2238 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
2239 		for (int i = 0; i < num_nhops; i++) {
2240 			error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
2241 			if (error != 0)
2242 				return (error);
2243 		}
2244 	} else
2245 #endif
2246 		sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
2247 
2248 	return (0);
2249 }
2250 
2251 
2252 static int
2253 sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight,
2254     struct walkarg *w)
2255 {
2256 	struct rt_addrinfo info;
2257 	int error = 0, size;
2258 	uint32_t rtflags;
2259 
2260 	rtflags = nhop_get_rtflags(nh);
2261 
2262 	if (w->w_op == NET_RT_FLAGS && !(rtflags & w->w_arg))
2263 		return (0);
2264 
2265 	bzero((caddr_t)&info, sizeof(info));
2266 	info.rti_info[RTAX_DST] = w->dst;
2267 	info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
2268 	info.rti_info[RTAX_NETMASK] = (rtflags & RTF_HOST) ? NULL : w->mask;
2269 	info.rti_info[RTAX_GENMASK] = 0;
2270 	if (nh->nh_ifp && !(nh->nh_ifp->if_flags & IFF_DYING)) {
2271 		info.rti_info[RTAX_IFP] = nh->nh_ifp->if_addr->ifa_addr;
2272 		info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
2273 		if (nh->nh_ifp->if_flags & IFF_POINTOPOINT)
2274 			info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr;
2275 	}
2276 	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
2277 		return (error);
2278 	if (w->w_req && w->w_tmem) {
2279 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
2280 
2281 		bzero(&rtm->rtm_index,
2282 		    sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index));
2283 
2284 		/*
2285 		 * rte flags may consist of RTF_HOST (duplicated in nhop rtflags)
2286 		 * and RTF_UP (if entry is linked, which is always true here).
2287 		 * Given that, use nhop rtflags & add RTF_UP.
2288 		 */
2289 		rtm->rtm_flags = rtflags | RTF_UP;
2290 		if (rtm->rtm_flags & RTF_GWFLAG_COMPAT)
2291 			rtm->rtm_flags = RTF_GATEWAY |
2292 				(rtm->rtm_flags & ~RTF_GWFLAG_COMPAT);
2293 		rt_getmetrics(rt, nh, &rtm->rtm_rmx);
2294 		rtm->rtm_rmx.rmx_weight = weight;
2295 		rtm->rtm_index = nh->nh_ifp->if_index;
2296 		rtm->rtm_addrs = info.rti_addrs;
2297 		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
2298 		return (error);
2299 	}
2300 	return (error);
2301 }
2302 
2303 static int
2304 sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd,
2305     struct rt_addrinfo *info, struct walkarg *w, int len)
2306 {
2307 	struct if_msghdrl *ifm;
2308 	struct if_data *ifd;
2309 
2310 	ifm = (struct if_msghdrl *)w->w_tmem;
2311 
2312 #ifdef COMPAT_FREEBSD32
2313 	if (w->w_req->flags & SCTL_MASK32) {
2314 		struct if_msghdrl32 *ifm32;
2315 
2316 		ifm32 = (struct if_msghdrl32 *)ifm;
2317 		ifm32->ifm_addrs = info->rti_addrs;
2318 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
2319 		ifm32->ifm_index = ifp->if_index;
2320 		ifm32->_ifm_spare1 = 0;
2321 		ifm32->ifm_len = sizeof(*ifm32);
2322 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
2323 		ifm32->_ifm_spare2 = 0;
2324 		ifd = &ifm32->ifm_data;
2325 	} else
2326 #endif
2327 	{
2328 		ifm->ifm_addrs = info->rti_addrs;
2329 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
2330 		ifm->ifm_index = ifp->if_index;
2331 		ifm->_ifm_spare1 = 0;
2332 		ifm->ifm_len = sizeof(*ifm);
2333 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
2334 		ifm->_ifm_spare2 = 0;
2335 		ifd = &ifm->ifm_data;
2336 	}
2337 
2338 	memcpy(ifd, src_ifd, sizeof(*ifd));
2339 
2340 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
2341 }
2342 
2343 static int
2344 sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd,
2345     struct rt_addrinfo *info, struct walkarg *w, int len)
2346 {
2347 	struct if_msghdr *ifm;
2348 	struct if_data *ifd;
2349 
2350 	ifm = (struct if_msghdr *)w->w_tmem;
2351 
2352 #ifdef COMPAT_FREEBSD32
2353 	if (w->w_req->flags & SCTL_MASK32) {
2354 		struct if_msghdr32 *ifm32;
2355 
2356 		ifm32 = (struct if_msghdr32 *)ifm;
2357 		ifm32->ifm_addrs = info->rti_addrs;
2358 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
2359 		ifm32->ifm_index = ifp->if_index;
2360 		ifm32->_ifm_spare1 = 0;
2361 		ifd = &ifm32->ifm_data;
2362 	} else
2363 #endif
2364 	{
2365 		ifm->ifm_addrs = info->rti_addrs;
2366 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
2367 		ifm->ifm_index = ifp->if_index;
2368 		ifm->_ifm_spare1 = 0;
2369 		ifd = &ifm->ifm_data;
2370 	}
2371 
2372 	memcpy(ifd, src_ifd, sizeof(*ifd));
2373 
2374 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
2375 }
2376 
2377 static int
2378 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
2379     struct walkarg *w, int len)
2380 {
2381 	struct ifa_msghdrl *ifam;
2382 	struct if_data *ifd;
2383 
2384 	ifam = (struct ifa_msghdrl *)w->w_tmem;
2385 
2386 #ifdef COMPAT_FREEBSD32
2387 	if (w->w_req->flags & SCTL_MASK32) {
2388 		struct ifa_msghdrl32 *ifam32;
2389 
2390 		ifam32 = (struct ifa_msghdrl32 *)ifam;
2391 		ifam32->ifam_addrs = info->rti_addrs;
2392 		ifam32->ifam_flags = ifa->ifa_flags;
2393 		ifam32->ifam_index = ifa->ifa_ifp->if_index;
2394 		ifam32->_ifam_spare1 = 0;
2395 		ifam32->ifam_len = sizeof(*ifam32);
2396 		ifam32->ifam_data_off =
2397 		    offsetof(struct ifa_msghdrl32, ifam_data);
2398 		ifam32->ifam_metric = ifa->ifa_ifp->if_metric;
2399 		ifd = &ifam32->ifam_data;
2400 	} else
2401 #endif
2402 	{
2403 		ifam->ifam_addrs = info->rti_addrs;
2404 		ifam->ifam_flags = ifa->ifa_flags;
2405 		ifam->ifam_index = ifa->ifa_ifp->if_index;
2406 		ifam->_ifam_spare1 = 0;
2407 		ifam->ifam_len = sizeof(*ifam);
2408 		ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
2409 		ifam->ifam_metric = ifa->ifa_ifp->if_metric;
2410 		ifd = &ifam->ifam_data;
2411 	}
2412 
2413 	bzero(ifd, sizeof(*ifd));
2414 	ifd->ifi_datalen = sizeof(struct if_data);
2415 	ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
2416 	ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
2417 	ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
2418 	ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
2419 
2420 	/* Fixup if_data carp(4) vhid. */
2421 	if (carp_get_vhid_p != NULL)
2422 		ifd->ifi_vhid = (*carp_get_vhid_p)(ifa);
2423 
2424 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
2425 }
2426 
2427 static int
2428 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
2429     struct walkarg *w, int len)
2430 {
2431 	struct ifa_msghdr *ifam;
2432 
2433 	ifam = (struct ifa_msghdr *)w->w_tmem;
2434 	ifam->ifam_addrs = info->rti_addrs;
2435 	ifam->ifam_flags = ifa->ifa_flags;
2436 	ifam->ifam_index = ifa->ifa_ifp->if_index;
2437 	ifam->_ifam_spare1 = 0;
2438 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
2439 
2440 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
2441 }
2442 
2443 static int
2444 sysctl_iflist(int af, struct walkarg *w)
2445 {
2446 	struct ifnet *ifp;
2447 	struct ifaddr *ifa;
2448 	struct if_data ifd;
2449 	struct rt_addrinfo info;
2450 	int len, error = 0;
2451 	struct sockaddr_storage ss;
2452 
2453 	bzero((caddr_t)&info, sizeof(info));
2454 	bzero(&ifd, sizeof(ifd));
2455 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
2456 		if (w->w_arg && w->w_arg != ifp->if_index)
2457 			continue;
2458 		if_data_copy(ifp, &ifd);
2459 		ifa = ifp->if_addr;
2460 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
2461 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
2462 		if (error != 0)
2463 			goto done;
2464 		info.rti_info[RTAX_IFP] = NULL;
2465 		if (w->w_req && w->w_tmem) {
2466 			if (w->w_op == NET_RT_IFLISTL)
2467 				error = sysctl_iflist_ifml(ifp, &ifd, &info, w,
2468 				    len);
2469 			else
2470 				error = sysctl_iflist_ifm(ifp, &ifd, &info, w,
2471 				    len);
2472 			if (error)
2473 				goto done;
2474 		}
2475 		while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) {
2476 			if (af && af != ifa->ifa_addr->sa_family)
2477 				continue;
2478 			if (prison_if(w->w_req->td->td_ucred,
2479 			    ifa->ifa_addr) != 0)
2480 				continue;
2481 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
2482 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
2483 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
2484 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
2485 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
2486 			if (error != 0)
2487 				goto done;
2488 			if (w->w_req && w->w_tmem) {
2489 				if (w->w_op == NET_RT_IFLISTL)
2490 					error = sysctl_iflist_ifaml(ifa, &info,
2491 					    w, len);
2492 				else
2493 					error = sysctl_iflist_ifam(ifa, &info,
2494 					    w, len);
2495 				if (error)
2496 					goto done;
2497 			}
2498 		}
2499 		info.rti_info[RTAX_IFA] = NULL;
2500 		info.rti_info[RTAX_NETMASK] = NULL;
2501 		info.rti_info[RTAX_BRD] = NULL;
2502 	}
2503 done:
2504 	return (error);
2505 }
2506 
2507 static int
2508 sysctl_ifmalist(int af, struct walkarg *w)
2509 {
2510 	struct rt_addrinfo info;
2511 	struct ifaddr *ifa;
2512 	struct ifmultiaddr *ifma;
2513 	struct ifnet *ifp;
2514 	int error, len;
2515 
2516 	NET_EPOCH_ASSERT();
2517 
2518 	error = 0;
2519 	bzero((caddr_t)&info, sizeof(info));
2520 
2521 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
2522 		if (w->w_arg && w->w_arg != ifp->if_index)
2523 			continue;
2524 		ifa = ifp->if_addr;
2525 		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
2526 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2527 			if (af && af != ifma->ifma_addr->sa_family)
2528 				continue;
2529 			if (prison_if(w->w_req->td->td_ucred,
2530 			    ifma->ifma_addr) != 0)
2531 				continue;
2532 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
2533 			info.rti_info[RTAX_GATEWAY] =
2534 			    (ifma->ifma_addr->sa_family != AF_LINK) ?
2535 			    ifma->ifma_lladdr : NULL;
2536 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
2537 			if (error != 0)
2538 				break;
2539 			if (w->w_req && w->w_tmem) {
2540 				struct ifma_msghdr *ifmam;
2541 
2542 				ifmam = (struct ifma_msghdr *)w->w_tmem;
2543 				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
2544 				ifmam->ifmam_flags = 0;
2545 				ifmam->ifmam_addrs = info.rti_addrs;
2546 				ifmam->_ifmam_spare1 = 0;
2547 				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
2548 				if (error != 0)
2549 					break;
2550 			}
2551 		}
2552 		if (error != 0)
2553 			break;
2554 	}
2555 	return (error);
2556 }
2557 
2558 static void
2559 rtable_sysctl_dump(uint32_t fibnum, int family, struct walkarg *w)
2560 {
2561 	union sockaddr_union sa_dst, sa_mask;
2562 
2563 	w->family = family;
2564 	w->dst = (struct sockaddr *)&sa_dst;
2565 	w->mask = (struct sockaddr *)&sa_mask;
2566 
2567 	init_sockaddrs_family(family, w->dst, w->mask);
2568 
2569 	rib_walk(fibnum, family, false, sysctl_dumpentry, w);
2570 }
2571 
2572 static int
2573 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
2574 {
2575 	struct epoch_tracker et;
2576 	int	*name = (int *)arg1;
2577 	u_int	namelen = arg2;
2578 	struct rib_head *rnh = NULL; /* silence compiler. */
2579 	int	i, lim, error = EINVAL;
2580 	int	fib = 0;
2581 	u_char	af;
2582 	struct	walkarg w;
2583 
2584 	if (namelen < 3)
2585 		return (EINVAL);
2586 
2587 	name++;
2588 	namelen--;
2589 	if (req->newptr)
2590 		return (EPERM);
2591 	if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
2592 		if (namelen == 3)
2593 			fib = req->td->td_proc->p_fibnum;
2594 		else if (namelen == 4)
2595 			fib = (name[3] == RT_ALL_FIBS) ?
2596 			    req->td->td_proc->p_fibnum : name[3];
2597 		else
2598 			return ((namelen < 3) ? EISDIR : ENOTDIR);
2599 		if (fib < 0 || fib >= rt_numfibs)
2600 			return (EINVAL);
2601 	} else if (namelen != 3)
2602 		return ((namelen < 3) ? EISDIR : ENOTDIR);
2603 	af = name[0];
2604 	if (af > AF_MAX)
2605 		return (EINVAL);
2606 	bzero(&w, sizeof(w));
2607 	w.w_op = name[1];
2608 	w.w_arg = name[2];
2609 	w.w_req = req;
2610 
2611 	error = sysctl_wire_old_buffer(req, 0);
2612 	if (error)
2613 		return (error);
2614 
2615 	/*
2616 	 * Allocate reply buffer in advance.
2617 	 * All rtsock messages has maximum length of u_short.
2618 	 */
2619 	w.w_tmemsize = 65536;
2620 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
2621 
2622 	NET_EPOCH_ENTER(et);
2623 	switch (w.w_op) {
2624 	case NET_RT_DUMP:
2625 	case NET_RT_FLAGS:
2626 		if (af == 0) {			/* dump all tables */
2627 			i = 1;
2628 			lim = AF_MAX;
2629 		} else				/* dump only one table */
2630 			i = lim = af;
2631 
2632 		/*
2633 		 * take care of llinfo entries, the caller must
2634 		 * specify an AF
2635 		 */
2636 		if (w.w_op == NET_RT_FLAGS &&
2637 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
2638 			if (af != 0)
2639 				error = lltable_sysctl_dumparp(af, w.w_req);
2640 			else
2641 				error = EINVAL;
2642 			break;
2643 		}
2644 		/*
2645 		 * take care of routing entries
2646 		 */
2647 		for (error = 0; error == 0 && i <= lim; i++) {
2648 			rnh = rt_tables_get_rnh(fib, i);
2649 			if (rnh != NULL) {
2650 				rtable_sysctl_dump(fib, i, &w);
2651 			} else if (af != 0)
2652 				error = EAFNOSUPPORT;
2653 		}
2654 		break;
2655 	case NET_RT_NHOP:
2656 	case NET_RT_NHGRP:
2657 		/* Allow dumping one specific af/fib at a time */
2658 		if (namelen < 4) {
2659 			error = EINVAL;
2660 			break;
2661 		}
2662 		fib = name[3];
2663 		if (fib < 0 || fib > rt_numfibs) {
2664 			error = EINVAL;
2665 			break;
2666 		}
2667 		rnh = rt_tables_get_rnh(fib, af);
2668 		if (rnh == NULL) {
2669 			error = EAFNOSUPPORT;
2670 			break;
2671 		}
2672 		if (w.w_op == NET_RT_NHOP)
2673 			error = nhops_dump_sysctl(rnh, w.w_req);
2674 		else
2675 #ifdef ROUTE_MPATH
2676 			error = nhgrp_dump_sysctl(rnh, w.w_req);
2677 #else
2678 			error = ENOTSUP;
2679 #endif
2680 		break;
2681 	case NET_RT_IFLIST:
2682 	case NET_RT_IFLISTL:
2683 		error = sysctl_iflist(af, &w);
2684 		break;
2685 
2686 	case NET_RT_IFMALIST:
2687 		error = sysctl_ifmalist(af, &w);
2688 		break;
2689 	}
2690 	NET_EPOCH_EXIT(et);
2691 
2692 	free(w.w_tmem, M_TEMP);
2693 	return (error);
2694 }
2695 
/*
 * Register sysctl_rtsock() as the handler of the read-only, MP-safe
 * PF_ROUTE node under _net.
 */
static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_MPSAFE,
    sysctl_rtsock, "Return route tables and interface/address lists");
2698 
2699 /*
2700  * Definitions of protocols supported in the ROUTE domain.
2701  */
2702 
2703 static struct domain routedomain;		/* or at least forward */
2704 
2705 static struct protosw routesw = {
2706 	.pr_type =		SOCK_RAW,
2707 	.pr_flags =		PR_ATOMIC|PR_ADDR,
2708 	.pr_abort =		rts_close,
2709 	.pr_attach =		rts_attach,
2710 	.pr_detach =		rts_detach,
2711 	.pr_send =		rts_send,
2712 	.pr_shutdown =		rts_shutdown,
2713 	.pr_disconnect =	rts_disconnect,
2714 	.pr_close =		rts_close,
2715 };
2716 
2717 static struct domain routedomain = {
2718 	.dom_family =		PF_ROUTE,
2719 	.dom_name =		"route",
2720 	.dom_nprotosw =		1,
2721 	.dom_protosw =		{ &routesw },
2722 };
2723 
2724 DOMAIN_SET(route);
2725