xref: /freebsd/sys/net/route/route_ctl.c (revision 9768746b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_private.h>
48 #include <net/if_dl.h>
49 #include <net/vnet.h>
50 #include <net/route.h>
51 #include <net/route/route_ctl.h>
52 #include <net/route/route_var.h>
53 #include <net/route/nhop_utils.h>
54 #include <net/route/nhop.h>
55 #include <net/route/nhop_var.h>
56 #include <netinet/in.h>
57 #include <netinet6/scope6_var.h>
58 #include <netinet6/in6_var.h>
59 
60 #define	DEBUG_MOD_NAME	route_ctl
61 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
62 #include <net/route/route_debug.h>
63 _DECLARE_DEBUG(LOG_INFO);
64 
65 /*
66  * This file contains control plane routing tables functions.
67  *
 * All functions assume they are called within the net epoch.
69  */
70 
/*
 * Stack storage large enough to hold a generic, IPv4 or IPv6 sockaddr.
 * Used in this file as scratch space for netmasks built from a prefix length.
 */
union sockaddr_union {
	struct sockaddr		sa;	/* generic view */
	struct sockaddr_in	sin;	/* IPv4 view */
	struct sockaddr_in6	sin6;	/* IPv6 view */
	char			_buf[32];	/* forces a minimum size of 32 bytes */
};
77 
/* Forward declarations of file-local helpers operating on rt_addrinfo. */
static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc);
static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
    struct rib_cmd_info *rc);

/* Flag-driven (RTM_F_*) insertion helpers. */
static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
#ifdef ROUTE_MPATH
static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
    int op_flags, struct rib_cmd_info *rc);
#endif

/* Low-level add/delete primitives. */
static int add_route(struct rib_head *rnh, struct rtentry *rt,
    struct route_nhop_data *rnd, struct rib_cmd_info *rc);
static int delete_route(struct rib_head *rnh, struct rtentry *rt,
    struct rib_cmd_info *rc);
static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);

/* Route priority helpers (see NH_PRIORITY_* below). */
static int get_prio_from_info(const struct rt_addrinfo *info);
static int nhop_get_prio(const struct nhop_object *nh);

#ifdef ROUTE_MPATH
static bool rib_can_multipath(struct rib_head *rh);
#endif
105 
/*
 * Per-vnet multipath routing configuration.
 *
 * net.route.multipath defaults to 1. The sysctl is writable only when the
 * kernel is built with ROUTE_MPATH; otherwise it is exported read-only.
 */
SYSCTL_DECL(_net_route);
#define	V_rib_route_multipath	VNET(rib_route_multipath)
#ifdef ROUTE_MPATH
#define _MP_FLAGS	CTLFLAG_RW
#else
#define _MP_FLAGS	CTLFLAG_RD
#endif
VNET_DEFINE(u_int, rib_route_multipath) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
#undef _MP_FLAGS
118 
#ifdef ROUTE_MPATH
/*
 * Per-vnet flag exported read-only as net.route.hash_outbound.
 * Starts at 0; add_route_flags_mpath() sets it to 1 once a nexthop group
 * gets installed.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/* Default entropy to add to the hash calculation for the outbound connections */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif
134 
#if defined(INET) && defined(INET6)
/*
 * Dual-stack kernels only: advertise the ipv4_rfc5549_support feature and
 * expose the per-vnet, writable net.route.ipv6_nexthop knob (default 1)
 * that rib_can_4o6_nhop() reports.
 */
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
#endif
142 
/* Debug bits: declares the (read-only, MP-safe) net.route.debug sysctl node. */
SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
145 
146 static struct rib_head *
147 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
148 {
149 	struct rib_head *rnh;
150 	struct sockaddr *dst;
151 
152 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
153 
154 	dst = info->rti_info[RTAX_DST];
155 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
156 
157 	return (rnh);
158 }
159 
#if defined(INET) && defined(INET6)
/*
 * Returns true when the net.route.ipv6_nexthop knob of the current vnet
 * is non-zero.
 */
bool
rib_can_4o6_nhop(void)
{
	return (V_rib_route_ipv6_nexthop != 0);
}
#endif
167 
168 #ifdef ROUTE_MPATH
169 static bool
170 rib_can_multipath(struct rib_head *rh)
171 {
172 	int result;
173 
174 	CURVNET_SET(rh->rib_vnet);
175 	result = !!V_rib_route_multipath;
176 	CURVNET_RESTORE();
177 
178 	return (result);
179 }
180 
181 /*
182  * Check is nhop is multipath-eligible.
183  * Avoid nhops without gateways and redirects.
184  *
185  * Returns 1 for multipath-eligible nexthop,
186  * 0 otherwise.
187  */
188 bool
189 nhop_can_multipath(const struct nhop_object *nh)
190 {
191 
192 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
193 		return (1);
194 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
195 		return (0);
196 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
197 		return (0);
198 
199 	return (1);
200 }
201 #endif
202 
203 static int
204 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
205 {
206 	uint32_t weight;
207 
208 	if (info->rti_mflags & RTV_WEIGHT)
209 		weight = info->rti_rmx->rmx_weight;
210 	else
211 		weight = default_weight;
212 	/* Keep upper 1 byte for adm distance purposes */
213 	if (weight > RT_MAX_WEIGHT)
214 		weight = RT_MAX_WEIGHT;
215 	else if (weight == 0)
216 		weight = default_weight;
217 
218 	return (weight);
219 }
220 
221 /*
222  * File-local concept for distingushing between the normal and
223  * RTF_PINNED routes tha can override the "normal" one.
224  */
225 #define	NH_PRIORITY_HIGH	2
226 #define	NH_PRIORITY_NORMAL	1
227 static int
228 get_prio_from_info(const struct rt_addrinfo *info)
229 {
230 	if (info->rti_flags & RTF_PINNED)
231 		return (NH_PRIORITY_HIGH);
232 	return (NH_PRIORITY_NORMAL);
233 }
234 
235 static int
236 nhop_get_prio(const struct nhop_object *nh)
237 {
238 	if (NH_IS_PINNED(nh))
239 		return (NH_PRIORITY_HIGH);
240 	return (NH_PRIORITY_NORMAL);
241 }
242 
243 /*
244  * Check if specified @gw matches gw data in the nexthop @nh.
245  *
246  * Returns true if matches, false otherwise.
247  */
248 bool
249 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
250 {
251 
252 	if (nh->gw_sa.sa_family != gw->sa_family)
253 		return (false);
254 
255 	switch (gw->sa_family) {
256 	case AF_INET:
257 		return (nh->gw4_sa.sin_addr.s_addr ==
258 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
259 	case AF_INET6:
260 		{
261 			const struct sockaddr_in6 *gw6;
262 			gw6 = (const struct sockaddr_in6 *)gw;
263 
264 			/*
265 			 * Currently (2020-09) IPv6 gws in kernel have their
266 			 * scope embedded. Once this becomes false, this code
267 			 * has to be revisited.
268 			 */
269 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
270 			    &gw6->sin6_addr))
271 				return (true);
272 			return (false);
273 		}
274 	case AF_LINK:
275 		{
276 			const struct sockaddr_dl *sdl;
277 			sdl = (const struct sockaddr_dl *)gw;
278 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
279 		}
280 	default:
281 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
282 	}
283 
284 	/* NOTREACHED */
285 	return (false);
286 }
287 
/*
 * rib_filter_f-compatible callback matching every nexthop whose gateway
 * equals the sockaddr passed in @gw_sa. @rt is not examined.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{

	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
299 
/* Callback state for match_gw_one(). */
struct gw_filter_data {
	const struct sockaddr *gw;	/* gateway to match nexthops against */
	int count;			/* number of gateway matches seen so far */
};
304 
305 /*
306  * Matches first occurence of the gateway provided in @gwd
307  */
308 static int
309 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
310 {
311 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
312 
313 	/* Return only first match to make rtsock happy */
314 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
315 		return (1);
316 	return (0);
317 }
318 
319 /*
320  * Checks if data in @info matches nexhop @nh.
321  *
322  * Returns 0 on success,
323  * ESRCH if not matched,
324  * ENOENT if filter function returned false
325  */
326 int
327 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
328     const struct nhop_object *nh)
329 {
330 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
331 
332 	if (info->rti_filter != NULL) {
333 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
334 		    return (ENOENT);
335 	    else
336 		    return (0);
337 	}
338 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
339 		return (ESRCH);
340 
341 	return (0);
342 }
343 
344 /*
345  * Runs exact prefix match based on @dst and @netmask.
346  * Returns matched @rtentry if found or NULL.
347  * If rtentry was found, saves nexthop / weight value into @rnd.
348  */
349 static struct rtentry *
350 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
351     const struct sockaddr *netmask, struct route_nhop_data *rnd)
352 {
353 	struct rtentry *rt;
354 
355 	RIB_LOCK_ASSERT(rnh);
356 
357 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
358 	if (rt != NULL) {
359 		rnd->rnd_nhop = rt->rt_nhop;
360 		rnd->rnd_weight = rt->rt_weight;
361 	} else {
362 		rnd->rnd_nhop = NULL;
363 		rnd->rnd_weight = 0;
364 	}
365 
366 	return (rt);
367 }
368 
/*
 * Runs an exact prefix lookup in @rnh using the key and mask of @rt.
 * See lookup_prefix_bysa() for the meaning of the result and @rnd.
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
375 
376 /*
377  * Runs exact prefix match based on dst/netmask from @info.
378  * Assumes RIB lock is held.
379  * Returns matched @rtentry if found or NULL.
380  * If rtentry was found, saves nexthop / weight value into @rnd.
381  */
382 struct rtentry *
383 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
384     struct route_nhop_data *rnd)
385 {
386 	struct rtentry *rt;
387 
388 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
389 	    info->rti_info[RTAX_NETMASK], rnd);
390 
391 	return (rt);
392 }
393 
/*
 * Builds the netmask for a @family prefix of length @plen and masks
 * the address in @_dst accordingly.
 *
 * @family: address family of @_dst (AF_INET / AF_INET6 when compiled in)
 * @plen: prefix length, or -1 when no mask applies
 * @_dst: destination sockaddr; host bits are cleared in place
 * @pmask: in: pointer to caller-provided mask storage;
 *         out: the filled mask, or NULL for host routes (@plen of -1,
 *         32 for IPv4, 128 for IPv6)
 *
 * The mask storage is only written once @plen has been validated.
 *
 * Returns true on success, false if @plen is out of range for @family
 * or @family is not supported.
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;

			if (plen > 32 || plen < 0)
				return (false);

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32)
				*pmask = NULL;
			else {
				uint32_t maddr;

				/*
				 * Shift an unsigned constant: for plen == 1
				 * the shift count is 31, which is undefined
				 * for a signed int.
				 */
				maddr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
				mask->sin_addr.s_addr = maddr;
				/* Both values are in network byte order */
				dst->sin_addr.s_addr &= maddr;
			}
			return (true);
		}
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			if (plen > 128 || plen < 0)
				return (false);

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128)
				*pmask = NULL;
			else {
				ip6_writemask(&mask->sin6_addr, plen);
				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			}
			return (true);
		}
#endif
	default:
		break;
	}
	return (false);
}
453 
454 /*
455  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
456  * to the routing table.
457  *
458  * @fibnum: rtable id to insert route to
459  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
460  * @plen: prefix length (or -1 if host route or not applicable for AF)
461  * @op_flags: combination of RTM_F_ flags
462  * @rc: storage to report operation result
463  *
464  * Returns 0 on success.
465  */
466 int
467 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
468     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
469 {
470 	union sockaddr_union mask_storage;
471 	struct sockaddr *netmask = &mask_storage.sa;
472 	struct rtentry *rt = NULL;
473 
474 	NET_EPOCH_ASSERT();
475 
476 	bzero(rc, sizeof(struct rib_cmd_info));
477 	rc->rc_cmd = RTM_ADD;
478 
479 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
480 	if (rnh == NULL)
481 		return (EAFNOSUPPORT);
482 
483 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
484 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
485 		return (EINVAL);
486 	}
487 
488 	if (op_flags & RTM_F_CREATE) {
489 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
490 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
491 			return (ENOMEM);
492 		}
493 	}
494 
495 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
496 }
497 
498 /*
499  * Attempts to delete @dst/plen prefix matching gateway @gw from the
500  *  routing rable.
501  *
502  * @fibnum: rtable id to remove route from
503  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
504  * @plen: prefix length (or -1 if host route or not applicable for AF)
505  * @gw: gateway to match
506  * @op_flags: combination of RTM_F_ flags
507  * @rc: storage to report operation result
508  *
509  * Returns 0 on success.
510  */
511 int
512 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
513     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
514 {
515 	struct gw_filter_data gwd = { .gw = gw };
516 
517 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
518 }
519 
/*
 * Attempts to delete @dst/plen prefix matching @filter_func from the
 * routing table.
 *
 * @fibnum: rtable id to remove route from
 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
 * @plen: prefix length (or -1 if host route or not applicable for AF)
 * @filter_func: func to be called for each nexthop of the prefix for matching
 * @filter_arg: argument to pass to @filter_func
 * @op_flags: combination of RTM_F_ flags
 * @rc: storage to report operation result
 *
 * Returns 0 on success, EAFNOSUPPORT if there is no table for the family,
 * EINVAL on oversized @dst or bad @plen, ESRCH if the prefix is not found,
 * or the error reported by rt_delete_conditional().
 */
int
rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
    rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
    struct rib_cmd_info *rc)
{
	union sockaddr_union mask_storage;
	struct sockaddr *netmask = &mask_storage.sa;
	int error;

	NET_EPOCH_ASSERT();

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_DELETE;

	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	/* The mask built below lives in mask_storage; reject larger sockaddrs */
	if (dst->sa_len > sizeof(mask_storage)) {
		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
		return (EINVAL);
	}

	/* Builds the netmask (NULL for host routes) and masks @dst in place */
	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
		return (EINVAL);
	}

	/* RTM_F_FORCE allows removing high-priority (pinned) routes */
	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;

	/* Lookup and unlink happen under a single write lock */
	RIB_WLOCK(rnh);
	struct route_nhop_data rnd;
	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
	if (rt != NULL) {
		error = rt_delete_conditional(rnh, rt, prio, filter_func,
		    filter_arg, rc);
	} else
		error = ESRCH;
	RIB_WUNLOCK(rnh);

	if (error != 0)
		return (error);

	/* Notify subscribers after the lock has been dropped */
	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

	/* The whole prefix got unlinked: release the rtentry */
	if (rc->rc_cmd == RTM_DELETE)
		rt_free(rc->rc_rt);
#ifdef ROUTE_MPATH
	else {
		/*
		 * Deleting 1 path may result in RTM_CHANGE to
		 * a different mpath group/nhop.
		 * Free old mpath group.
		 */
		nhop_free_any(rc->rc_nh_old);
	}
#endif

	return (0);
}
594 
595 /*
596  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
597  * @rt: route to copy.
598  * @rnd_src: nhop and weight. Multipath routes are not supported
599  * @rh_dst: target rtable.
600  * @rc: operation result storage
601  *
602  * Return 0 on success.
603  */
604 int
605 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
606     struct rib_head *rh_dst, struct rib_cmd_info *rc)
607 {
608 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
609 	int error;
610 
611 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
612 
613 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
614 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
615 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
616 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
617 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
618 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
619 	}
620 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
621 	if (nh == NULL) {
622 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
623 		return (ENOMEM);
624 	}
625 	nhop_copy(nh, rnd_src->rnd_nhop);
626 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
627 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
628 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
629 	if (error != 0) {
630 		FIB_RH_LOG(LOG_INFO, rh_dst,
631 		    "unable to finalize new nexthop: error %d", error);
632 		return (ENOMEM);
633 	}
634 
635 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
636 	if (rt_new == NULL) {
637 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
638 		nhop_free(nh);
639 		return (ENOMEM);
640 	}
641 
642 	struct route_nhop_data rnd = {
643 		.rnd_nhop = nh,
644 		.rnd_weight = rnd_src->rnd_weight
645 	};
646 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
647 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
648 
649 	if (error != 0) {
650 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
651 			char buf[NHOP_PRINT_BUFSIZE];
652 			rt_print_buf(rt_new, buf, sizeof(buf));
653 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
654 			    "Unable to add route %s: error %d", buf, error);
655 		}
656 		nhop_free(nh);
657 		rt_free_immediate(rt_new);
658 	}
659 	return (error);
660 }
661 
662 /*
663  * Adds route defined by @info into the kernel table specified by @fibnum and
664  * sa_family in @info->rti_info[RTAX_DST].
665  *
666  * Returns 0 on success and fills in operation metadata into @rc.
667  */
668 int
669 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
670     struct rib_cmd_info *rc)
671 {
672 	struct rib_head *rnh;
673 	int error;
674 
675 	NET_EPOCH_ASSERT();
676 
677 	rnh = get_rnh(fibnum, info);
678 	if (rnh == NULL)
679 		return (EAFNOSUPPORT);
680 
681 	/*
682 	 * Check consistency between RTF_HOST flag and netmask
683 	 * existence.
684 	 */
685 	if (info->rti_flags & RTF_HOST)
686 		info->rti_info[RTAX_NETMASK] = NULL;
687 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
688 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
689 		return (EINVAL);
690 	}
691 
692 	bzero(rc, sizeof(struct rib_cmd_info));
693 	rc->rc_cmd = RTM_ADD;
694 
695 	error = add_route_byinfo(rnh, info, rc);
696 	if (error == 0)
697 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
698 
699 	return (error);
700 }
701 
/*
 * Validates @info, builds an rtentry and a nexthop from it and hands both
 * to add_route_flags().
 *
 * Pinned routes (RTF_PINNED) are inserted with RTM_F_FORCE, all others
 * with RTM_F_APPEND; RTM_F_CREATE is always set.
 *
 * Returns 0 on success, EINVAL on inconsistent @info, ENOBUFS if the
 * rtentry cannot be allocated, or the error from rt_getifa_fib(),
 * nhop_create_from_info() or add_route_flags().
 */
static int
add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	struct route_nhop_data rnd_add;
	struct nhop_object *nh;
	struct rtentry *rt;
	struct sockaddr *dst, *gateway, *netmask;
	int error;

	dst = info->rti_info[RTAX_DST];
	gateway = info->rti_info[RTAX_GATEWAY];
	netmask = info->rti_info[RTAX_NETMASK];

	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
		return (EINVAL);
	}
	/*
	 * NOTE(review): @dst is NULL-checked here but dereferenced
	 * unconditionally right below; presumably callers guarantee a
	 * non-NULL destination - confirm.
	 */
	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
		FIB_RH_LOG(LOG_DEBUG, rnh,
		    "error: invalid dst/gateway family combination (%d, %d)",
		    dst->sa_family, gateway->sa_family);
		return (EINVAL);
	}

	/* The destination must fit into the rtentry's rt_dstb storage */
	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
		    dst->sa_len);
		return (EINVAL);
	}

	/* Resolve the interface address unless the caller supplied one */
	if (info->rti_ifa == NULL) {
		error = rt_getifa_fib(info, rnh->rib_fibnum);
		if (error)
			return (error);
	}

	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
		return (ENOBUFS);

	error = nhop_create_from_info(rnh, info, &nh);
	if (error != 0) {
		/* rt was never linked, so it is released directly */
		rt_free_immediate(rt);
		return (error);
	}

	rnd_add.rnd_nhop = nh;
	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);

	int op_flags = RTM_F_CREATE;
	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
		op_flags |= RTM_F_FORCE;
	else
		op_flags |= RTM_F_APPEND;
	/* add_route_flags() releases rt and nh itself on failure */
	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));

}
759 
/*
 * Inserts, replaces or appends the path @rnd_add for the prefix described
 * by @rt, as directed by the RTM_F_* bits in @op_flags.
 *
 * - No existing prefix: added if RTM_F_CREATE is set, ESRCH otherwise.
 * - Existing prefix and RTM_F_EXCL: EEXIST.
 * - RTM_F_REPLACE: the existing path is swapped for @rnd_add unless the
 *   existing nexthop has a higher priority (EEXIST).
 * - RTM_F_APPEND (ROUTE_MPATH kernels, both nexthops multipath-eligible):
 *   @rnd_add is merged into a nexthop group, retrying up to
 *   RIB_MAX_RETRIES times on EAGAIN.
 * - Anything else: EEXIST.
 *
 * Ownership: on every error return the nexthop in @rnd_add is released,
 * and so is @rt when RTM_F_CREATE is set. Callers must not free them
 * again. On a successful replace the old nexthop (rc->rc_nh_old) and the
 * unused @rt (if RTM_F_CREATE) are released here as well.
 *
 * Returns 0 on success or an errno value.
 */
static int
add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
    int op_flags, struct rib_cmd_info *rc)
{
	struct route_nhop_data rnd_orig;
	struct nhop_object *nh;
	struct rtentry *rt_orig;
	int error = 0;

	/* nh is what the out: label releases; defaults to the new nexthop */
	nh = rnd_add->rnd_nhop;

	RIB_WLOCK(rnh);

	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);

	if (rt_orig == NULL) {
		if (op_flags & RTM_F_CREATE)
			error = add_route(rnh, rt, rnd_add, rc);
		else
			error = ESRCH; /* no entry but creation was not required */
		RIB_WUNLOCK(rnh);
		if (error != 0)
			goto out;
		return (0);
	}

	if (op_flags & RTM_F_EXCL) {
		/* We have existing route in the RIB but not allowed to replace. */
		RIB_WUNLOCK(rnh);
		error = EEXIST;
		goto out;
	}

	/* Now either append or replace */
	if (op_flags & RTM_F_REPLACE) {
		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
			/* Old path is "better" (e.g. has PINNED flag set) */
			RIB_WUNLOCK(rnh);
			error = EEXIST;
			goto out;
		}
		/*
		 * NOTE(review): the return value of change_route() is not
		 * checked; error stays 0 and rc->rc_nh_old is freed below -
		 * confirm change_route() cannot fail here.
		 */
		change_route(rnh, rt_orig, rnd_add, rc);
		RIB_WUNLOCK(rnh);
		/* The replaced nexthop is the one to release now */
		nh = rc->rc_nh_old;
		goto out;
	}

	RIB_WUNLOCK(rnh);

#ifdef ROUTE_MPATH
	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
	    nhop_can_multipath(rnd_add->rnd_nhop) &&
	    nhop_can_multipath(rnd_orig.rnd_nhop)) {

		/* The lock was dropped above, hence the bounded retry on EAGAIN */
		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
			    op_flags, rc);
			if (error != EAGAIN)
				break;
			RTSTAT_INC(rts_add_retry);
		}

		/*
		 *  Original nhop reference is unused in any case.
		 */
		nhop_free_any(rnd_add->rnd_nhop);
		if (op_flags & RTM_F_CREATE) {
			if (error != 0 || rc->rc_cmd != RTM_ADD)
				rt_free_immediate(rt);
		}
		return (error);
	}
#endif
	/* Out of options - free state and return error */
	error = EEXIST;
out:
	if (op_flags & RTM_F_CREATE)
		rt_free_immediate(rt);
	nhop_free_any(nh);

	return (error);
}
842 
843 #ifdef ROUTE_MPATH
844 static int
845 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
846     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
847     int op_flags, struct rib_cmd_info *rc)
848 {
849 	RIB_RLOCK_TRACKER;
850 	struct route_nhop_data rnd_new;
851 	int error = 0;
852 
853 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
854 	if (error != 0) {
855 		if (error == EAGAIN) {
856 			/*
857 			 * Group creation failed, most probably because
858 			 * @rnd_orig data got scheduled for deletion.
859 			 * Refresh @rnd_orig data and retry.
860 			 */
861 			RIB_RLOCK(rnh);
862 			lookup_prefix_rt(rnh, rt, rnd_orig);
863 			RIB_RUNLOCK(rnh);
864 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
865 				/* In this iteration route doesn't exist */
866 				error = ENOENT;
867 			}
868 		}
869 		return (error);
870 	}
871 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
872 	if (error != 0)
873 		return (error);
874 
875 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
876 		/*
877 		 * First multipath route got installed. Enable local
878 		 * outbound connections hashing.
879 		 */
880 		if (bootverbose)
881 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
882 		V_fib_hash_outbound = 1;
883 	}
884 
885 	return (0);
886 }
887 #endif
888 
/*
 * Removes route defined by @info from the kernel table specified by @fibnum and
 * sa_family in @info->rti_info[RTAX_DST].
 *
 * Path selection: the filter in @info is used when set; otherwise, if a
 * gateway is given, only the first nexthop matching it is selected.
 * RTF_PINNED in @info is required to delete pinned routes.
 *
 * Returns 0 on success and fills in operation metadata into @rc.
 * Returns EAFNOSUPPORT if there is no table for the family, EINVAL on an
 * oversized destination, ESRCH if the prefix is not found, or the error
 * from rt_delete_conditional().
 */
int
rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
{
	struct rib_head *rnh;
	struct sockaddr *dst, *netmask;
	struct sockaddr_storage mdst;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_DELETE;

	dst = info->rti_info[RTAX_DST];
	netmask = info->rti_info[RTAX_NETMASK];

	if (netmask != NULL) {
		/* Ensure @dst is always properly masked */
		if (dst->sa_len > sizeof(mdst)) {
			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
			return (EINVAL);
		}
		/* Work on a masked copy so the caller's sockaddr stays intact */
		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
		dst = (struct sockaddr *)&mdst;
	}

	rib_filter_f_t *filter_func = NULL;
	void *filter_arg = NULL;
	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };

	/* An explicit filter takes precedence over gateway matching */
	if (info->rti_filter != NULL) {
		filter_func = info->rti_filter;
		filter_arg = info->rti_filterdata;
	} else if (gwd.gw != NULL) {
		filter_func = match_gw_one;
		filter_arg = &gwd;
	}

	int prio = get_prio_from_info(info);

	/* Lookup and unlink happen under a single write lock */
	RIB_WLOCK(rnh);
	struct route_nhop_data rnd;
	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
	if (rt != NULL) {
		error = rt_delete_conditional(rnh, rt, prio, filter_func,
		    filter_arg, rc);
	} else
		error = ESRCH;
	RIB_WUNLOCK(rnh);

	if (error != 0)
		return (error);

	/* Notify subscribers after the lock has been dropped */
	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);

	/* The whole prefix got unlinked: release the rtentry */
	if (rc->rc_cmd == RTM_DELETE)
		rt_free(rc->rc_rt);
#ifdef ROUTE_MPATH
	else {
		/*
		 * Deleting 1 path may result in RTM_CHANGE to
		 * a different mpath group/nhop.
		 * Free old mpath group.
		 */
		nhop_free_any(rc->rc_nh_old);
	}
#endif

	return (0);
}
969 
/*
 * Conditionally unlinks rtentry paths from @rnh matching @cb.
 *
 * @rnh: rtable the route belongs to
 * @rt: rtentry to operate on
 * @prio: NH_PRIORITY_* level of the request
 * @cb: optional filter callback selecting the paths to remove
 * @cbdata: opaque argument passed to @cb
 * @rc: storage to report operation result
 *
 * For multipath routes (ROUTE_MPATH) a filter is mandatory: matching paths
 * are removed by switching the route to a filtered nexthop group. @prio is
 * not consulted in that branch.
 * For single-path routes the filter (if any) must accept the nexthop and
 * @prio must not be lower than the nexthop's priority.
 *
 * Returns 0 on success with operation result stored in @rc.
 * On error, returns:
 * ESRCH - if prefix was not found or filter function failed to match
 * EADDRINUSE - if trying to delete higher priority route.
 */
static int
rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
{
	struct nhop_object *nh = rt->rt_nhop;

#ifdef ROUTE_MPATH
	if (NH_IS_NHGRP(nh)) {
		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
		struct route_nhop_data rnd;
		int error;

		/* Without a filter there is no way to pick a path */
		if (cb == NULL)
			return (ESRCH);
		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
		if (error == 0) {
			/* Getting the same group back means nothing was filtered out */
			if (rnd.rnd_nhgrp == nhg) {
				/* No match, unreference new group and return. */
				nhop_free_any(rnd.rnd_nhop);
				return (ESRCH);
			}
			error = change_route(rnh, rt, &rnd, rc);
		}
		return (error);
	}
#endif
	if (cb != NULL && !cb(rt, nh, cbdata))
		return (ESRCH);

	/* Refuse to remove a route with higher priority than requested */
	if (prio < nhop_get_prio(nh))
		return (EADDRINUSE);

	return (delete_route(rnh, rt, rc));
}
1011 
/*
 * Changes the route defined by @info in the kernel table specified by
 * @fibnum and sa_family in @info->rti_info[RTAX_DST].
 *
 * The prefix is looked up under the read lock, the lock is dropped, and
 * the change is then attempted up to RIB_MAX_RETRIES times while
 * change_route_byinfo() reports EAGAIN.
 *
 * Returns 0 on success, EAFNOSUPPORT if there is no table for the family,
 * ESRCH if the prefix is not found, or the error from
 * change_route_byinfo().
 */
int
rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
    struct rib_cmd_info *rc)
{
	RIB_RLOCK_TRACKER;
	struct route_nhop_data rnd_orig;
	struct rib_head *rnh;
	struct rtentry *rt;
	int error;

	NET_EPOCH_ASSERT();

	rnh = get_rnh(fibnum, info);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	bzero(rc, sizeof(struct rib_cmd_info));
	rc->rc_cmd = RTM_CHANGE;

	/* Check if updated gateway exists */
	if ((info->rti_flags & RTF_GATEWAY) &&
	    (info->rti_info[RTAX_GATEWAY] == NULL)) {

		/*
		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
		 * Remove RTF_GATEWAY to enforce consistency and maintain
		 * compatibility.
		 */
		info->rti_flags &= ~RTF_GATEWAY;
	}

	/*
	 * Route change is done in multiple steps, with dropping and
	 * reacquiring the lock. When multiple processes change the same
	 * route, it can be modified between the steps. Address it by
	 * retrying the operation multiple times before failing.
	 */

	RIB_RLOCK(rnh);
	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
	    info->rti_info[RTAX_NETMASK], &rnh->head);

	if (rt == NULL) {
		RIB_RUNLOCK(rnh);
		return (ESRCH);
	}

	/* Snapshot the current path data while still holding the lock */
	rnd_orig.rnd_nhop = rt->rt_nhop;
	rnd_orig.rnd_weight = rt->rt_weight;

	RIB_RUNLOCK(rnh);

	/*
	 * NOTE(review): rt is used after the read lock is dropped;
	 * presumably the net epoch asserted above keeps it valid - confirm.
	 */
	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
		if (error != EAGAIN)
			break;
	}

	return (error);
}
1073 
1074 static int
1075 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1076     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1077 {
1078 	int error;
1079 
1080 	/*
1081 	 * New gateway could require new ifaddr, ifp;
1082 	 * flags may also be different; ifp may be specified
1083 	 * by ll sockaddr when protocol address is ambiguous
1084 	 */
1085 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1086 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1087 	    info->rti_info[RTAX_IFP] != NULL ||
1088 	    (info->rti_info[RTAX_IFA] != NULL &&
1089 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1090 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1091 
1092 		if (error != 0) {
1093 			info->rti_ifa = NULL;
1094 			return (error);
1095 		}
1096 	}
1097 
1098 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1099 	info->rti_ifa = NULL;
1100 
1101 	return (error);
1102 }
1103 
1104 #ifdef ROUTE_MPATH
1105 static int
1106 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1107     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1108     struct rib_cmd_info *rc)
1109 {
1110 	int error = 0, found_idx = 0;
1111 	struct nhop_object *nh_orig = NULL, *nh_new;
1112 	struct route_nhop_data rnd_new = {};
1113 	const struct weightened_nhop *wn = NULL;
1114 	struct weightened_nhop *wn_new;
1115 	uint32_t num_nhops;
1116 
1117 	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1118 	for (int i = 0; i < num_nhops; i++) {
1119 		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1120 			nh_orig = wn[i].nh;
1121 			found_idx = i;
1122 			break;
1123 		}
1124 	}
1125 
1126 	if (nh_orig == NULL)
1127 		return (ESRCH);
1128 
1129 	error = change_nhop(rnh, info, nh_orig, &nh_new);
1130 	if (error != 0)
1131 		return (error);
1132 
1133 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1134 	    M_TEMP, M_NOWAIT | M_ZERO);
1135 	if (wn_new == NULL) {
1136 		nhop_free(nh_new);
1137 		return (EAGAIN);
1138 	}
1139 
1140 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1141 	wn_new[found_idx].nh = nh_new;
1142 	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1143 
1144 	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
1145 	nhop_free(nh_new);
1146 	free(wn_new, M_TEMP);
1147 
1148 	if (error != 0)
1149 		return (error);
1150 
1151 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1152 
1153 	return (error);
1154 }
1155 #endif
1156 
1157 static int
1158 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1159     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1160     struct rib_cmd_info *rc)
1161 {
1162 	int error = 0;
1163 	struct nhop_object *nh_orig;
1164 	struct route_nhop_data rnd_new;
1165 
1166 	nh_orig = rnd_orig->rnd_nhop;
1167 	if (nh_orig == NULL)
1168 		return (ESRCH);
1169 
1170 #ifdef ROUTE_MPATH
1171 	if (NH_IS_NHGRP(nh_orig))
1172 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1173 #endif
1174 
1175 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1176 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1177 	if (error != 0)
1178 		return (error);
1179 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1180 
1181 	return (error);
1182 }
1183 
1184 /*
1185  * Insert @rt with nhop data from @rnd_new to @rnh.
1186  * Returns 0 on success and stores operation results in @rc.
1187  */
1188 static int
1189 add_route(struct rib_head *rnh, struct rtentry *rt,
1190     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1191 {
1192 	struct radix_node *rn;
1193 
1194 	RIB_WLOCK_ASSERT(rnh);
1195 
1196 	rt->rt_nhop = rnd->rnd_nhop;
1197 	rt->rt_weight = rnd->rnd_weight;
1198 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1199 
1200 	if (rn != NULL) {
1201 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1202 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1203 
1204 		/* Finalize notification */
1205 		rib_bump_gen(rnh);
1206 		rnh->rnh_prefixes++;
1207 
1208 		rc->rc_cmd = RTM_ADD;
1209 		rc->rc_rt = rt;
1210 		rc->rc_nh_old = NULL;
1211 		rc->rc_nh_new = rnd->rnd_nhop;
1212 		rc->rc_nh_weight = rnd->rnd_weight;
1213 
1214 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1215 		return (0);
1216 	}
1217 
1218 	/* Existing route or memory allocation failure. */
1219 	return (EEXIST);
1220 }
1221 
1222 /*
1223  * Unconditionally deletes @rt from @rnh.
1224  */
1225 static int
1226 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1227 {
1228 	RIB_WLOCK_ASSERT(rnh);
1229 
1230 	/* Route deletion requested. */
1231 	struct radix_node *rn;
1232 
1233 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1234 	if (rn == NULL)
1235 		return (ESRCH);
1236 	rt = RNTORT(rn);
1237 	rt->rte_flags &= ~RTF_UP;
1238 
1239 	rib_bump_gen(rnh);
1240 	rnh->rnh_prefixes--;
1241 
1242 	rc->rc_cmd = RTM_DELETE;
1243 	rc->rc_rt = rt;
1244 	rc->rc_nh_old = rt->rt_nhop;
1245 	rc->rc_nh_new = NULL;
1246 	rc->rc_nh_weight = rt->rt_weight;
1247 
1248 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1249 
1250 	return (0);
1251 }
1252 
1253 /*
1254  * Switch @rt nhop/weigh to the ones specified in @rnd.
1255  * Returns 0 on success.
1256  */
1257 int
1258 change_route(struct rib_head *rnh, struct rtentry *rt,
1259     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1260 {
1261 	struct nhop_object *nh_orig;
1262 
1263 	RIB_WLOCK_ASSERT(rnh);
1264 
1265 	nh_orig = rt->rt_nhop;
1266 
1267 	if (rnd->rnd_nhop == NULL)
1268 		return (delete_route(rnh, rt, rc));
1269 
1270 	/* Changing nexthop & weight to a new one */
1271 	rt->rt_nhop = rnd->rnd_nhop;
1272 	rt->rt_weight = rnd->rnd_weight;
1273 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1274 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1275 
1276 	/* Finalize notification */
1277 	rib_bump_gen(rnh);
1278 	rc->rc_cmd = RTM_CHANGE;
1279 	rc->rc_rt = rt;
1280 	rc->rc_nh_old = nh_orig;
1281 	rc->rc_nh_new = rnd->rnd_nhop;
1282 	rc->rc_nh_weight = rnd->rnd_weight;
1283 
1284 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1285 
1286 	return (0);
1287 }
1288 
1289 /*
1290  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1291  *  consistent with the current route data.
1292  * Nexthop in @nhd_new is consumed.
1293  */
1294 int
1295 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1296     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1297     struct rib_cmd_info *rc)
1298 {
1299 	struct rtentry *rt_new;
1300 	int error = 0;
1301 
1302 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1303 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1304 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1305 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1306 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1307 		    "trying change %s -> %s", buf_old, buf_new);
1308 	}
1309 	RIB_WLOCK(rnh);
1310 
1311 	struct route_nhop_data rnd;
1312 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1313 
1314 	if (rt_new == NULL) {
1315 		if (rnd_orig->rnd_nhop == NULL)
1316 			error = add_route(rnh, rt, rnd_new, rc);
1317 		else {
1318 			/*
1319 			 * Prefix does not exist, which was not our assumption.
1320 			 * Update @rnd_orig with the new data and return
1321 			 */
1322 			rnd_orig->rnd_nhop = NULL;
1323 			rnd_orig->rnd_weight = 0;
1324 			error = EAGAIN;
1325 		}
1326 	} else {
1327 		/* Prefix exists, try to update */
1328 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1329 			/*
1330 			 * Nhop/mpath group hasn't changed. Flip
1331 			 * to the new precalculated one and return
1332 			 */
1333 			error = change_route(rnh, rt_new, rnd_new, rc);
1334 		} else {
1335 			/* Update and retry */
1336 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1337 			rnd_orig->rnd_weight = rt_new->rt_weight;
1338 			error = EAGAIN;
1339 		}
1340 	}
1341 
1342 	RIB_WUNLOCK(rnh);
1343 
1344 	if (error == 0) {
1345 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1346 
1347 		if (rnd_orig->rnd_nhop != NULL)
1348 			nhop_free_any(rnd_orig->rnd_nhop);
1349 
1350 	} else {
1351 		if (rnd_new->rnd_nhop != NULL)
1352 			nhop_free_any(rnd_new->rnd_nhop);
1353 	}
1354 
1355 	return (error);
1356 }
1357 
1358 /*
1359  * Performs modification of routing table specificed by @action.
1360  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1361  * Needs to be run in network epoch.
1362  *
1363  * Returns 0 on success and fills in @rc with action result.
1364  */
1365 int
1366 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1367     struct rib_cmd_info *rc)
1368 {
1369 	int error;
1370 
1371 	switch (action) {
1372 	case RTM_ADD:
1373 		error = rib_add_route(fibnum, info, rc);
1374 		break;
1375 	case RTM_DELETE:
1376 		error = rib_del_route(fibnum, info, rc);
1377 		break;
1378 	case RTM_CHANGE:
1379 		error = rib_change_route(fibnum, info, rc);
1380 		break;
1381 	default:
1382 		error = ENOTSUP;
1383 	}
1384 
1385 	return (error);
1386 }
1387 
1388 struct rt_delinfo
1389 {
1390 	struct rib_head *rnh;
1391 	struct rtentry *head;
1392 	rib_filter_f_t *filter_f;
1393 	void *filter_arg;
1394 	int prio;
1395 	struct rib_cmd_info rc;
1396 };
1397 
1398 /*
1399  * Conditionally unlinks rtenties or paths from radix tree based
1400  * on the callback data passed in @arg.
1401  */
1402 static int
1403 rt_checkdelroute(struct radix_node *rn, void *arg)
1404 {
1405 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1406 	struct rtentry *rt = (struct rtentry *)rn;
1407 
1408 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1409 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1410 		return (0);
1411 
1412 	/*
1413 	 * Add deleted rtentries to the list to GC them
1414 	 *  after dropping the lock.
1415 	 *
1416 	 * XXX: Delayed notifications not implemented
1417 	 *  for nexthop updates.
1418 	 */
1419 	if (di->rc.rc_cmd == RTM_DELETE) {
1420 		/* Add to the list and return */
1421 		rt->rt_chain = di->head;
1422 		di->head = rt;
1423 #ifdef ROUTE_MPATH
1424 	} else {
1425 		/*
1426 		 * RTM_CHANGE to a different nexthop or nexthop group.
1427 		 * Free old multipath group.
1428 		 */
1429 		nhop_free_any(di->rc.rc_nh_old);
1430 #endif
1431 	}
1432 
1433 	return (0);
1434 }
1435 
1436 /*
1437  * Iterates over a routing table specified by @fibnum and @family and
1438  *  deletes elements marked by @filter_f.
1439  * @fibnum: rtable id
1440  * @family: AF_ address family
1441  * @filter_f: function returning non-zero value for items to delete
1442  * @arg: data to pass to the @filter_f function
1443  * @report: true if rtsock notification is needed.
1444  */
1445 void
1446 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1447     bool report)
1448 {
1449 	struct rib_head *rnh;
1450 	struct rtentry *rt;
1451 	struct nhop_object *nh;
1452 	struct epoch_tracker et;
1453 
1454 	rnh = rt_tables_get_rnh(fibnum, family);
1455 	if (rnh == NULL)
1456 		return;
1457 
1458 	struct rt_delinfo di = {
1459 		.rnh = rnh,
1460 		.filter_f = filter_f,
1461 		.filter_arg = filter_arg,
1462 		.prio = NH_PRIORITY_NORMAL,
1463 	};
1464 
1465 	NET_EPOCH_ENTER(et);
1466 
1467 	RIB_WLOCK(rnh);
1468 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1469 	RIB_WUNLOCK(rnh);
1470 
1471 	/* We might have something to reclaim. */
1472 	bzero(&di.rc, sizeof(di.rc));
1473 	di.rc.rc_cmd = RTM_DELETE;
1474 	while (di.head != NULL) {
1475 		rt = di.head;
1476 		di.head = rt->rt_chain;
1477 		rt->rt_chain = NULL;
1478 		nh = rt->rt_nhop;
1479 
1480 		di.rc.rc_rt = rt;
1481 		di.rc.rc_nh_old = nh;
1482 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1483 
1484 		if (report) {
1485 #ifdef ROUTE_MPATH
1486 			struct nhgrp_object *nhg;
1487 			const struct weightened_nhop *wn;
1488 			uint32_t num_nhops;
1489 			if (NH_IS_NHGRP(nh)) {
1490 				nhg = (struct nhgrp_object *)nh;
1491 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1492 				for (int i = 0; i < num_nhops; i++)
1493 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1494 			} else
1495 #endif
1496 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1497 		}
1498 		rt_free(rt);
1499 	}
1500 
1501 	NET_EPOCH_EXIT(et);
1502 }
1503 
1504 static int
1505 rt_delete_unconditional(struct radix_node *rn, void *arg)
1506 {
1507 	struct rtentry *rt = RNTORT(rn);
1508 	struct rib_head *rnh = (struct rib_head *)arg;
1509 
1510 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1511 	if (RNTORT(rn) == rt)
1512 		rt_free(rt);
1513 
1514 	return (0);
1515 }
1516 
1517 /*
1518  * Removes all routes from the routing table without executing notifications.
1519  * rtentres will be removed after the end of a current epoch.
1520  */
1521 static void
1522 rib_flush_routes(struct rib_head *rnh)
1523 {
1524 	RIB_WLOCK(rnh);
1525 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1526 	RIB_WUNLOCK(rnh);
1527 }
1528 
1529 void
1530 rib_flush_routes_family(int family)
1531 {
1532 	struct rib_head *rnh;
1533 
1534 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1535 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1536 			rib_flush_routes(rnh);
1537 	}
1538 }
1539 
/*
 * Returns a constant, printable name for the address family @family:
 * "inet", "inet6" or "link" for AF_INET, AF_INET6 and AF_LINK,
 * "unknown" for anything else.
 */
const char *
rib_print_family(int family)
{

	if (family == AF_INET)
		return ("inet");
	if (family == AF_INET6)
		return ("inet6");
	if (family == AF_LINK)
		return ("link");

	return ("unknown");
}
1553 
1554