xref: /freebsd/sys/net/route/route_ctl.c (revision 2b833162)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_private.h>
48 #include <net/if_dl.h>
49 #include <net/vnet.h>
50 #include <net/route.h>
51 #include <net/route/route_ctl.h>
52 #include <net/route/route_var.h>
53 #include <net/route/nhop_utils.h>
54 #include <net/route/nhop.h>
55 #include <net/route/nhop_var.h>
56 #include <netinet/in.h>
57 #include <netinet6/scope6_var.h>
58 #include <netinet6/in6_var.h>
59 
60 #define	DEBUG_MOD_NAME	route_ctl
61 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
62 #include <net/route/route_debug.h>
63 _DECLARE_DEBUG(LOG_INFO);
64 
/*
 * This file contains control plane routing table functions.
 *
 * All functions assume they are called in net epoch.
 */
70 
/*
 * Scratch sockaddr storage used by this file for temporary netmasks
 * (see the mask_storage locals).  The padding member keeps the union at
 * least 32 bytes long.
 */
union sockaddr_union {
	struct sockaddr		sa;		/* family-agnostic view */
	struct sockaddr_in	sin;		/* IPv4 view */
	struct sockaddr_in6	sin6;		/* IPv6 view */
	char			_buf[32];	/* minimum-size padding */
};
77 
78 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
79     struct rib_cmd_info *rc);
80 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
81     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
82     struct rib_cmd_info *rc);
83 
84 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
85     struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
86 #ifdef ROUTE_MPATH
87 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
88     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
89     int op_flags, struct rib_cmd_info *rc);
90 #endif
91 
92 static int add_route(struct rib_head *rnh, struct rtentry *rt,
93     struct route_nhop_data *rnd, struct rib_cmd_info *rc);
94 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
95     struct rib_cmd_info *rc);
96 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
97     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
98 
99 static bool fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
100     struct sockaddr **pmask);
101 static int get_prio_from_info(const struct rt_addrinfo *info);
102 static int nhop_get_prio(const struct nhop_object *nh);
103 
104 #ifdef ROUTE_MPATH
105 static bool rib_can_multipath(struct rib_head *rh);
106 #endif
107 
108 /* Per-vnet multipath routing configuration */
109 SYSCTL_DECL(_net_route);
110 #define	V_rib_route_multipath	VNET(rib_route_multipath)
111 #ifdef ROUTE_MPATH
112 #define _MP_FLAGS	CTLFLAG_RW
113 #else
114 #define _MP_FLAGS	CTLFLAG_RD
115 #endif
116 VNET_DEFINE(u_int, rib_route_multipath) = 1;
117 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
118     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
119 #undef _MP_FLAGS
120 
#ifdef ROUTE_MPATH
/*
 * Read-only, per-vnet indicator exported as "hash_outbound".  It starts
 * at 0 and is switched to 1 by add_route_flags_mpath() once a nexthop
 * group gets installed.
 */
VNET_DEFINE(u_int, fib_hash_outbound) = 0;
SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
    &VNET_NAME(fib_hash_outbound), 0,
    "Compute flowid for locally-originated packets");

/*
 * Default entropy to add to the hash calculation for the outbound
 * connections.
 */
uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
#endif
136 
#if defined(INET) && defined(INET6)
/*
 * Dual-stack kernels only: per-vnet, writable "ipv6_nexthop" knob
 * (default 1) queried through rib_can_4o6_nhop().
 */
FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
#define	V_rib_route_ipv6_nexthop	VNET(rib_route_ipv6_nexthop)
VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(rib_route_ipv6_nexthop), 0,
    "Enable IPv4 route via IPv6 Next Hop address");
#endif
144 
145 /* Debug bits */
146 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
147 
148 static struct rib_head *
149 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
150 {
151 	struct rib_head *rnh;
152 	struct sockaddr *dst;
153 
154 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
155 
156 	dst = info->rti_info[RTAX_DST];
157 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
158 
159 	return (rnh);
160 }
161 
162 #if defined(INET) && defined(INET6)
163 bool
164 rib_can_4o6_nhop(void)
165 {
166 	return (!!V_rib_route_ipv6_nexthop);
167 }
168 #endif
169 
170 #ifdef ROUTE_MPATH
171 static bool
172 rib_can_multipath(struct rib_head *rh)
173 {
174 	int result;
175 
176 	CURVNET_SET(rh->rib_vnet);
177 	result = !!V_rib_route_multipath;
178 	CURVNET_RESTORE();
179 
180 	return (result);
181 }
182 
183 /*
184  * Check is nhop is multipath-eligible.
185  * Avoid nhops without gateways and redirects.
186  *
187  * Returns 1 for multipath-eligible nexthop,
188  * 0 otherwise.
189  */
190 bool
191 nhop_can_multipath(const struct nhop_object *nh)
192 {
193 
194 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
195 		return (1);
196 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
197 		return (0);
198 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
199 		return (0);
200 
201 	return (1);
202 }
203 #endif
204 
205 static int
206 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
207 {
208 	uint32_t weight;
209 
210 	if (info->rti_mflags & RTV_WEIGHT)
211 		weight = info->rti_rmx->rmx_weight;
212 	else
213 		weight = default_weight;
214 	/* Keep upper 1 byte for adm distance purposes */
215 	if (weight > RT_MAX_WEIGHT)
216 		weight = RT_MAX_WEIGHT;
217 	else if (weight == 0)
218 		weight = default_weight;
219 
220 	return (weight);
221 }
222 
223 /*
224  * File-local concept for distingushing between the normal and
225  * RTF_PINNED routes tha can override the "normal" one.
226  */
227 #define	NH_PRIORITY_HIGH	2
228 #define	NH_PRIORITY_NORMAL	1
229 static int
230 get_prio_from_info(const struct rt_addrinfo *info)
231 {
232 	if (info->rti_flags & RTF_PINNED)
233 		return (NH_PRIORITY_HIGH);
234 	return (NH_PRIORITY_NORMAL);
235 }
236 
237 static int
238 nhop_get_prio(const struct nhop_object *nh)
239 {
240 	if (NH_IS_PINNED(nh))
241 		return (NH_PRIORITY_HIGH);
242 	return (NH_PRIORITY_NORMAL);
243 }
244 
245 /*
246  * Check if specified @gw matches gw data in the nexthop @nh.
247  *
248  * Returns true if matches, false otherwise.
249  */
250 bool
251 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
252 {
253 
254 	if (nh->gw_sa.sa_family != gw->sa_family)
255 		return (false);
256 
257 	switch (gw->sa_family) {
258 	case AF_INET:
259 		return (nh->gw4_sa.sin_addr.s_addr ==
260 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
261 	case AF_INET6:
262 		{
263 			const struct sockaddr_in6 *gw6;
264 			gw6 = (const struct sockaddr_in6 *)gw;
265 
266 			/*
267 			 * Currently (2020-09) IPv6 gws in kernel have their
268 			 * scope embedded. Once this becomes false, this code
269 			 * has to be revisited.
270 			 */
271 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
272 			    &gw6->sin6_addr))
273 				return (true);
274 			return (false);
275 		}
276 	case AF_LINK:
277 		{
278 			const struct sockaddr_dl *sdl;
279 			sdl = (const struct sockaddr_dl *)gw;
280 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
281 		}
282 	default:
283 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
284 	}
285 
286 	/* NOTREACHED */
287 	return (false);
288 }
289 
/*
 * rib_filter_f_t-compatible wrapper around match_nhop_gw().
 * @gw_sa is interpreted as a struct sockaddr holding the gateway to look
 * for; @rt is not consulted.
 *
 * Returns non-zero for every nexthop whose gateway matches.
 */
int
rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
{
	return (match_nhop_gw(nh, (const struct sockaddr *)gw_sa));
}
301 
/* State passed to match_gw_one() through its opaque data argument. */
struct gw_filter_data {
	const struct sockaddr *gw;	/* gateway to look for */
	int count;			/* matching nexthops seen so far */
};
306 
307 /*
308  * Matches first occurence of the gateway provided in @gwd
309  */
310 static int
311 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
312 {
313 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
314 
315 	/* Return only first match to make rtsock happy */
316 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
317 		return (1);
318 	return (0);
319 }
320 
321 /*
322  * Checks if data in @info matches nexhop @nh.
323  *
324  * Returns 0 on success,
325  * ESRCH if not matched,
326  * ENOENT if filter function returned false
327  */
328 int
329 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
330     const struct nhop_object *nh)
331 {
332 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
333 
334 	if (info->rti_filter != NULL) {
335 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
336 		    return (ENOENT);
337 	    else
338 		    return (0);
339 	}
340 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
341 		return (ESRCH);
342 
343 	return (0);
344 }
345 
346 /*
347  * Runs exact prefix match based on @dst and @netmask.
348  * Returns matched @rtentry if found or NULL.
349  * If rtentry was found, saves nexthop / weight value into @rnd.
350  */
351 static struct rtentry *
352 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
353     const struct sockaddr *netmask, struct route_nhop_data *rnd)
354 {
355 	struct rtentry *rt;
356 
357 	RIB_LOCK_ASSERT(rnh);
358 
359 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
360 	if (rt != NULL) {
361 		rnd->rnd_nhop = rt->rt_nhop;
362 		rnd->rnd_weight = rt->rt_weight;
363 	} else {
364 		rnd->rnd_nhop = NULL;
365 		rnd->rnd_weight = 0;
366 	}
367 
368 	return (rt);
369 }
370 
/*
 * Exact prefix lookup in @rnh using the key and mask of an existing
 * rtentry @rt.  See lookup_prefix_bysa() for locking and @rnd semantics.
 */
struct rtentry *
lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
    struct route_nhop_data *rnd)
{
	const struct sockaddr *key = rt_key_const(rt);
	const struct sockaddr *mask = rt_mask_const(rt);

	return (lookup_prefix_bysa(rnh, key, mask, rnd));
}
377 
378 /*
379  * Runs exact prefix match based on dst/netmask from @info.
380  * Assumes RIB lock is held.
381  * Returns matched @rtentry if found or NULL.
382  * If rtentry was found, saves nexthop / weight value into @rnd.
383  */
384 struct rtentry *
385 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
386     struct route_nhop_data *rnd)
387 {
388 	struct rtentry *rt;
389 
390 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
391 	    info->rti_info[RTAX_NETMASK], rnd);
392 
393 	return (rt);
394 }
395 
396 const struct rtentry *
397 rib_lookup_prefix_plen(struct rib_head *rnh, struct sockaddr *dst, int plen,
398     struct route_nhop_data *rnd)
399 {
400 	union sockaddr_union mask_storage;
401 	struct sockaddr *netmask = &mask_storage.sa;
402 
403 	if (fill_pxmask_family(dst->sa_family, plen, dst, &netmask))
404 		return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
405 	return (NULL);
406 }
407 
/*
 * Builds the netmask for a @family prefix of length @plen and applies it
 * to the destination address.
 *
 * @family: address family of @_dst
 * @plen: prefix length, or -1 when no mask is applicable
 * @_dst: destination sockaddr; its host bits are cleared in place
 * @pmask: on entry points to caller-provided mask storage; on return it
 *  either still points to that (now filled) storage or is set to NULL
 *  when no mask applies (@plen is -1 or covers the whole address)
 *
 * Returns true on success, false if @plen is out of range for @family or
 * @family is not handled (including families compiled out of the kernel).
 */
static bool
fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
    struct sockaddr **pmask)
{
	if (plen == -1) {
		*pmask = NULL;
		return (true);
	}

	switch (family) {
#ifdef INET
	case AF_INET:
		{
			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin_family = family;
			mask->sin_len = sizeof(*mask);
			if (plen == 32)
				*pmask = NULL;
			else if (plen > 32 || plen < 0)
				return (false);
			else {
				/*
				 * Use unsigned arithmetic: with plen == 1 the
				 * shift count is 31, and shifting a signed 1
				 * that far (then subtracting 1) overflows int,
				 * which is undefined behavior.
				 */
				uint32_t hmask = (plen != 0) ?
				    ~(((uint32_t)1 << (32 - plen)) - 1) : 0;
				uint32_t maddr = htonl(hmask);

				mask->sin_addr.s_addr = maddr;
				/*
				 * Bitwise AND is byte-order independent, so
				 * both operands can stay in network order.
				 */
				dst->sin_addr.s_addr &= maddr;
			}
			return (true);
		}
#endif
#ifdef INET6
	case AF_INET6:
		{
			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;

			memset(mask, 0, sizeof(*mask));
			mask->sin6_family = family;
			mask->sin6_len = sizeof(*mask);
			if (plen == 128)
				*pmask = NULL;
			else if (plen > 128 || plen < 0)
				return (false);
			else {
				ip6_writemask(&mask->sin6_addr, plen);
				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
			}
			return (true);
		}
#endif
	default:
		break;
	}

	/* Unsupported address family. */
	return (false);
}
467 
468 /*
469  * Attempts to add @dst/plen prefix with nexthop/nexhopgroup data @rnd
470  * to the routing table.
471  *
472  * @fibnum: verified kernel rtable id to insert route to
473  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
474  * @plen: prefix length (or -1 if host route or not applicable for AF)
475  * @op_flags: combination of RTM_F_ flags
476  * @rc: storage to report operation result
477  *
478  * Returns 0 on success.
479  */
480 int
481 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
482     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
483 {
484 	union sockaddr_union mask_storage;
485 	struct sockaddr *netmask = &mask_storage.sa;
486 	struct rtentry *rt = NULL;
487 
488 	NET_EPOCH_ASSERT();
489 
490 	bzero(rc, sizeof(struct rib_cmd_info));
491 	rc->rc_cmd = RTM_ADD;
492 
493 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
494 	if (rnh == NULL)
495 		return (EAFNOSUPPORT);
496 
497 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
498 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
499 		return (EINVAL);
500 	}
501 
502 	if (op_flags & RTM_F_CREATE) {
503 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
504 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
505 			return (ENOMEM);
506 		}
507 	} else {
508 		struct route_nhop_data rnd_tmp;
509 		RIB_RLOCK_TRACKER;
510 
511 		RIB_RLOCK(rnh);
512 		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
513 		RIB_RUNLOCK(rnh);
514 
515 		if (rt == NULL)
516 			return (ESRCH);
517 	}
518 
519 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
520 }
521 
522 /*
523  * Attempts to delete @dst/plen prefix matching gateway @gw from the
524  *  routing rable.
525  *
526  * @fibnum: rtable id to remove route from
527  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
528  * @plen: prefix length (or -1 if host route or not applicable for AF)
529  * @gw: gateway to match
530  * @op_flags: combination of RTM_F_ flags
531  * @rc: storage to report operation result
532  *
533  * Returns 0 on success.
534  */
535 int
536 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
537     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
538 {
539 	struct gw_filter_data gwd = { .gw = gw };
540 
541 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
542 }
543 
544 /*
545  * Attempts to delete @dst/plen prefix matching @filter_func from the
546  *  routing rable.
547  *
548  * @fibnum: rtable id to remove route from
549  * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
550  * @plen: prefix length (or -1 if host route or not applicable for AF)
551  * @filter_func: func to be called for each nexthop of the prefix for matching
552  * @filter_arg: argument to pass to @filter_func
553  * @op_flags: combination of RTM_F_ flags
554  * @rc: storage to report operation result
555  *
556  * Returns 0 on success.
557  */
558 int
559 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
560     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
561     struct rib_cmd_info *rc)
562 {
563 	union sockaddr_union mask_storage;
564 	struct sockaddr *netmask = &mask_storage.sa;
565 	int error;
566 
567 	NET_EPOCH_ASSERT();
568 
569 	bzero(rc, sizeof(struct rib_cmd_info));
570 	rc->rc_cmd = RTM_DELETE;
571 
572 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
573 	if (rnh == NULL)
574 		return (EAFNOSUPPORT);
575 
576 	if (dst->sa_len > sizeof(mask_storage)) {
577 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
578 		return (EINVAL);
579 	}
580 
581 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
582 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
583 		return (EINVAL);
584 	}
585 
586 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
587 
588 	RIB_WLOCK(rnh);
589 	struct route_nhop_data rnd;
590 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
591 	if (rt != NULL) {
592 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
593 		    filter_arg, rc);
594 	} else
595 		error = ESRCH;
596 	RIB_WUNLOCK(rnh);
597 
598 	if (error != 0)
599 		return (error);
600 
601 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
602 
603 	if (rc->rc_cmd == RTM_DELETE)
604 		rt_free(rc->rc_rt);
605 #ifdef ROUTE_MPATH
606 	else {
607 		/*
608 		 * Deleting 1 path may result in RTM_CHANGE to
609 		 * a different mpath group/nhop.
610 		 * Free old mpath group.
611 		 */
612 		nhop_free_any(rc->rc_nh_old);
613 	}
614 #endif
615 
616 	return (0);
617 }
618 
619 /*
620  * Tries to copy route @rt from one rtable to the rtable specified by @dst_rh.
621  * @rt: route to copy.
622  * @rnd_src: nhop and weight. Multipath routes are not supported
623  * @rh_dst: target rtable.
624  * @rc: operation result storage
625  *
626  * Return 0 on success.
627  */
628 int
629 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
630     struct rib_head *rh_dst, struct rib_cmd_info *rc)
631 {
632 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
633 	int error;
634 
635 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
636 
637 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
638 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
639 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
640 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
641 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
642 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
643 	}
644 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
645 	if (nh == NULL) {
646 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
647 		return (ENOMEM);
648 	}
649 	nhop_copy(nh, rnd_src->rnd_nhop);
650 	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
651 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
652 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
653 	if (error != 0) {
654 		FIB_RH_LOG(LOG_INFO, rh_dst,
655 		    "unable to finalize new nexthop: error %d", error);
656 		return (ENOMEM);
657 	}
658 
659 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
660 	if (rt_new == NULL) {
661 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
662 		nhop_free(nh);
663 		return (ENOMEM);
664 	}
665 
666 	struct route_nhop_data rnd = {
667 		.rnd_nhop = nh,
668 		.rnd_weight = rnd_src->rnd_weight
669 	};
670 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
671 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
672 
673 	if (error != 0) {
674 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
675 			char buf[NHOP_PRINT_BUFSIZE];
676 			rt_print_buf(rt_new, buf, sizeof(buf));
677 			FIB_RH_LOG(LOG_DEBUG, rh_dst,
678 			    "Unable to add route %s: error %d", buf, error);
679 		}
680 		nhop_free(nh);
681 		rt_free_immediate(rt_new);
682 	}
683 	return (error);
684 }
685 
686 /*
687  * Adds route defined by @info into the kernel table specified by @fibnum and
688  * sa_family in @info->rti_info[RTAX_DST].
689  *
690  * Returns 0 on success and fills in operation metadata into @rc.
691  */
692 int
693 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
694     struct rib_cmd_info *rc)
695 {
696 	struct rib_head *rnh;
697 	int error;
698 
699 	NET_EPOCH_ASSERT();
700 
701 	rnh = get_rnh(fibnum, info);
702 	if (rnh == NULL)
703 		return (EAFNOSUPPORT);
704 
705 	/*
706 	 * Check consistency between RTF_HOST flag and netmask
707 	 * existence.
708 	 */
709 	if (info->rti_flags & RTF_HOST)
710 		info->rti_info[RTAX_NETMASK] = NULL;
711 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
712 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
713 		return (EINVAL);
714 	}
715 
716 	bzero(rc, sizeof(struct rib_cmd_info));
717 	rc->rc_cmd = RTM_ADD;
718 
719 	error = add_route_byinfo(rnh, info, rc);
720 	if (error == 0)
721 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
722 
723 	return (error);
724 }
725 
726 static int
727 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
728     struct rib_cmd_info *rc)
729 {
730 	struct route_nhop_data rnd_add;
731 	struct nhop_object *nh;
732 	struct rtentry *rt;
733 	struct sockaddr *dst, *gateway, *netmask;
734 	int error;
735 
736 	dst = info->rti_info[RTAX_DST];
737 	gateway = info->rti_info[RTAX_GATEWAY];
738 	netmask = info->rti_info[RTAX_NETMASK];
739 
740 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
741 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
742 		return (EINVAL);
743 	}
744 	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
745 		FIB_RH_LOG(LOG_DEBUG, rnh,
746 		    "error: invalid dst/gateway family combination (%d, %d)",
747 		    dst->sa_family, gateway->sa_family);
748 		return (EINVAL);
749 	}
750 
751 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
752 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
753 		    dst->sa_len);
754 		return (EINVAL);
755 	}
756 
757 	if (info->rti_ifa == NULL) {
758 		error = rt_getifa_fib(info, rnh->rib_fibnum);
759 		if (error)
760 			return (error);
761 	}
762 
763 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
764 		return (ENOBUFS);
765 
766 	error = nhop_create_from_info(rnh, info, &nh);
767 	if (error != 0) {
768 		rt_free_immediate(rt);
769 		return (error);
770 	}
771 
772 	rnd_add.rnd_nhop = nh;
773 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
774 
775 	int op_flags = RTM_F_CREATE;
776 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
777 		op_flags |= RTM_F_FORCE;
778 	else
779 		op_flags |= RTM_F_APPEND;
780 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
781 
782 }
783 
784 static int
785 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
786     int op_flags, struct rib_cmd_info *rc)
787 {
788 	struct route_nhop_data rnd_orig;
789 	struct nhop_object *nh;
790 	struct rtentry *rt_orig;
791 	int error = 0;
792 
793 	MPASS(rt != NULL);
794 
795 	nh = rnd_add->rnd_nhop;
796 
797 	RIB_WLOCK(rnh);
798 
799 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
800 
801 	if (rt_orig == NULL) {
802 		if (op_flags & RTM_F_CREATE)
803 			error = add_route(rnh, rt, rnd_add, rc);
804 		else
805 			error = ESRCH; /* no entry but creation was not required */
806 		RIB_WUNLOCK(rnh);
807 		if (error != 0)
808 			goto out;
809 		return (0);
810 	}
811 
812 	if (op_flags & RTM_F_EXCL) {
813 		/* We have existing route in the RIB but not allowed to replace. */
814 		RIB_WUNLOCK(rnh);
815 		error = EEXIST;
816 		goto out;
817 	}
818 
819 	/* Now either append or replace */
820 	if (op_flags & RTM_F_REPLACE) {
821 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
822 			/* Old path is "better" (e.g. has PINNED flag set) */
823 			RIB_WUNLOCK(rnh);
824 			error = EEXIST;
825 			goto out;
826 		}
827 		change_route(rnh, rt_orig, rnd_add, rc);
828 		RIB_WUNLOCK(rnh);
829 		nh = rc->rc_nh_old;
830 		goto out;
831 	}
832 
833 	RIB_WUNLOCK(rnh);
834 
835 #ifdef ROUTE_MPATH
836 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
837 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
838 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
839 
840 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
841 			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
842 			    op_flags, rc);
843 			if (error != EAGAIN)
844 				break;
845 			RTSTAT_INC(rts_add_retry);
846 		}
847 
848 		/*
849 		 *  Original nhop reference is unused in any case.
850 		 */
851 		nhop_free_any(rnd_add->rnd_nhop);
852 		if (op_flags & RTM_F_CREATE) {
853 			if (error != 0 || rc->rc_cmd != RTM_ADD)
854 				rt_free_immediate(rt);
855 		}
856 		return (error);
857 	}
858 #endif
859 	/* Out of options - free state and return error */
860 	error = EEXIST;
861 out:
862 	if (op_flags & RTM_F_CREATE)
863 		rt_free_immediate(rt);
864 	nhop_free_any(nh);
865 
866 	return (error);
867 }
868 
869 #ifdef ROUTE_MPATH
870 static int
871 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
872     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
873     int op_flags, struct rib_cmd_info *rc)
874 {
875 	RIB_RLOCK_TRACKER;
876 	struct route_nhop_data rnd_new;
877 	int error = 0;
878 
879 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
880 	if (error != 0) {
881 		if (error == EAGAIN) {
882 			/*
883 			 * Group creation failed, most probably because
884 			 * @rnd_orig data got scheduled for deletion.
885 			 * Refresh @rnd_orig data and retry.
886 			 */
887 			RIB_RLOCK(rnh);
888 			lookup_prefix_rt(rnh, rt, rnd_orig);
889 			RIB_RUNLOCK(rnh);
890 			if (rnd_orig == NULL && !(op_flags & RTM_F_CREATE)) {
891 				/* In this iteration route doesn't exist */
892 				error = ENOENT;
893 			}
894 		}
895 		return (error);
896 	}
897 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
898 	if (error != 0)
899 		return (error);
900 
901 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
902 		/*
903 		 * First multipath route got installed. Enable local
904 		 * outbound connections hashing.
905 		 */
906 		if (bootverbose)
907 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
908 		V_fib_hash_outbound = 1;
909 	}
910 
911 	return (0);
912 }
913 #endif
914 
915 /*
916  * Removes route defined by @info from the kernel table specified by @fibnum and
917  * sa_family in @info->rti_info[RTAX_DST].
918  *
919  * Returns 0 on success and fills in operation metadata into @rc.
920  */
921 int
922 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
923 {
924 	struct rib_head *rnh;
925 	struct sockaddr *dst, *netmask;
926 	struct sockaddr_storage mdst;
927 	int error;
928 
929 	NET_EPOCH_ASSERT();
930 
931 	rnh = get_rnh(fibnum, info);
932 	if (rnh == NULL)
933 		return (EAFNOSUPPORT);
934 
935 	bzero(rc, sizeof(struct rib_cmd_info));
936 	rc->rc_cmd = RTM_DELETE;
937 
938 	dst = info->rti_info[RTAX_DST];
939 	netmask = info->rti_info[RTAX_NETMASK];
940 
941 	if (netmask != NULL) {
942 		/* Ensure @dst is always properly masked */
943 		if (dst->sa_len > sizeof(mdst)) {
944 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
945 			return (EINVAL);
946 		}
947 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
948 		dst = (struct sockaddr *)&mdst;
949 	}
950 
951 	rib_filter_f_t *filter_func = NULL;
952 	void *filter_arg = NULL;
953 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
954 
955 	if (info->rti_filter != NULL) {
956 		filter_func = info->rti_filter;
957 		filter_arg = info->rti_filterdata;
958 	} else if (gwd.gw != NULL) {
959 		filter_func = match_gw_one;
960 		filter_arg = &gwd;
961 	}
962 
963 	int prio = get_prio_from_info(info);
964 
965 	RIB_WLOCK(rnh);
966 	struct route_nhop_data rnd;
967 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
968 	if (rt != NULL) {
969 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
970 		    filter_arg, rc);
971 	} else
972 		error = ESRCH;
973 	RIB_WUNLOCK(rnh);
974 
975 	if (error != 0)
976 		return (error);
977 
978 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
979 
980 	if (rc->rc_cmd == RTM_DELETE)
981 		rt_free(rc->rc_rt);
982 #ifdef ROUTE_MPATH
983 	else {
984 		/*
985 		 * Deleting 1 path may result in RTM_CHANGE to
986 		 * a different mpath group/nhop.
987 		 * Free old mpath group.
988 		 */
989 		nhop_free_any(rc->rc_nh_old);
990 	}
991 #endif
992 
993 	return (0);
994 }
995 
996 /*
997  * Conditionally unlinks rtentry paths from @rnh matching @cb.
998  * Returns 0 on success with operation result stored in @rc.
999  * On error, returns:
1000  * ESRCH - if prefix was not found or filter function failed to match
1001  * EADDRINUSE - if trying to delete higher priority route.
1002  */
1003 static int
1004 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1005     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1006 {
1007 	struct nhop_object *nh = rt->rt_nhop;
1008 
1009 #ifdef ROUTE_MPATH
1010 	if (NH_IS_NHGRP(nh)) {
1011 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1012 		struct route_nhop_data rnd;
1013 		int error;
1014 
1015 		if (cb == NULL)
1016 			return (ESRCH);
1017 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1018 		if (error == 0) {
1019 			if (rnd.rnd_nhgrp == nhg) {
1020 				/* No match, unreference new group and return. */
1021 				nhop_free_any(rnd.rnd_nhop);
1022 				return (ESRCH);
1023 			}
1024 			error = change_route(rnh, rt, &rnd, rc);
1025 		}
1026 		return (error);
1027 	}
1028 #endif
1029 	if (cb != NULL && !cb(rt, nh, cbdata))
1030 		return (ESRCH);
1031 
1032 	if (prio < nhop_get_prio(nh))
1033 		return (EADDRINUSE);
1034 
1035 	return (delete_route(rnh, rt, rc));
1036 }
1037 
1038 int
1039 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1040     struct rib_cmd_info *rc)
1041 {
1042 	RIB_RLOCK_TRACKER;
1043 	struct route_nhop_data rnd_orig;
1044 	struct rib_head *rnh;
1045 	struct rtentry *rt;
1046 	int error;
1047 
1048 	NET_EPOCH_ASSERT();
1049 
1050 	rnh = get_rnh(fibnum, info);
1051 	if (rnh == NULL)
1052 		return (EAFNOSUPPORT);
1053 
1054 	bzero(rc, sizeof(struct rib_cmd_info));
1055 	rc->rc_cmd = RTM_CHANGE;
1056 
1057 	/* Check if updated gateway exists */
1058 	if ((info->rti_flags & RTF_GATEWAY) &&
1059 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1060 
1061 		/*
1062 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1063 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1064 		 * compatibility..
1065 		 */
1066 		info->rti_flags &= ~RTF_GATEWAY;
1067 	}
1068 
1069 	/*
1070 	 * route change is done in multiple steps, with dropping and
1071 	 * reacquiring lock. In the situations with multiple processes
1072 	 * changes the same route in can lead to the case when route
1073 	 * is changed between the steps. Address it by retrying the operation
1074 	 * multiple times before failing.
1075 	 */
1076 
1077 	RIB_RLOCK(rnh);
1078 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1079 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1080 
1081 	if (rt == NULL) {
1082 		RIB_RUNLOCK(rnh);
1083 		return (ESRCH);
1084 	}
1085 
1086 	rnd_orig.rnd_nhop = rt->rt_nhop;
1087 	rnd_orig.rnd_weight = rt->rt_weight;
1088 
1089 	RIB_RUNLOCK(rnh);
1090 
1091 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1092 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1093 		if (error != EAGAIN)
1094 			break;
1095 	}
1096 
1097 	return (error);
1098 }
1099 
1100 static int
1101 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1102     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1103 {
1104 	int error;
1105 
1106 	/*
1107 	 * New gateway could require new ifaddr, ifp;
1108 	 * flags may also be different; ifp may be specified
1109 	 * by ll sockaddr when protocol address is ambiguous
1110 	 */
1111 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1112 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1113 	    info->rti_info[RTAX_IFP] != NULL ||
1114 	    (info->rti_info[RTAX_IFA] != NULL &&
1115 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1116 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1117 
1118 		if (error != 0) {
1119 			info->rti_ifa = NULL;
1120 			return (error);
1121 		}
1122 	}
1123 
1124 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1125 	info->rti_ifa = NULL;
1126 
1127 	return (error);
1128 }
1129 
1130 #ifdef ROUTE_MPATH
1131 static int
1132 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1133     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1134     struct rib_cmd_info *rc)
1135 {
1136 	int error = 0, found_idx = 0;
1137 	struct nhop_object *nh_orig = NULL, *nh_new;
1138 	struct route_nhop_data rnd_new = {};
1139 	const struct weightened_nhop *wn = NULL;
1140 	struct weightened_nhop *wn_new;
1141 	uint32_t num_nhops;
1142 
1143 	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1144 	for (int i = 0; i < num_nhops; i++) {
1145 		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1146 			nh_orig = wn[i].nh;
1147 			found_idx = i;
1148 			break;
1149 		}
1150 	}
1151 
1152 	if (nh_orig == NULL)
1153 		return (ESRCH);
1154 
1155 	error = change_nhop(rnh, info, nh_orig, &nh_new);
1156 	if (error != 0)
1157 		return (error);
1158 
1159 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1160 	    M_TEMP, M_NOWAIT | M_ZERO);
1161 	if (wn_new == NULL) {
1162 		nhop_free(nh_new);
1163 		return (EAGAIN);
1164 	}
1165 
1166 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1167 	wn_new[found_idx].nh = nh_new;
1168 	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1169 
1170 	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
1171 	nhop_free(nh_new);
1172 	free(wn_new, M_TEMP);
1173 
1174 	if (error != 0)
1175 		return (error);
1176 
1177 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1178 
1179 	return (error);
1180 }
1181 #endif
1182 
1183 static int
1184 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1185     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1186     struct rib_cmd_info *rc)
1187 {
1188 	int error = 0;
1189 	struct nhop_object *nh_orig;
1190 	struct route_nhop_data rnd_new;
1191 
1192 	nh_orig = rnd_orig->rnd_nhop;
1193 	if (nh_orig == NULL)
1194 		return (ESRCH);
1195 
1196 #ifdef ROUTE_MPATH
1197 	if (NH_IS_NHGRP(nh_orig))
1198 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1199 #endif
1200 
1201 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1202 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1203 	if (error != 0)
1204 		return (error);
1205 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1206 
1207 	return (error);
1208 }
1209 
1210 /*
1211  * Insert @rt with nhop data from @rnd_new to @rnh.
1212  * Returns 0 on success and stores operation results in @rc.
1213  */
1214 static int
1215 add_route(struct rib_head *rnh, struct rtentry *rt,
1216     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1217 {
1218 	struct radix_node *rn;
1219 
1220 	RIB_WLOCK_ASSERT(rnh);
1221 
1222 	rt->rt_nhop = rnd->rnd_nhop;
1223 	rt->rt_weight = rnd->rnd_weight;
1224 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1225 
1226 	if (rn != NULL) {
1227 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1228 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1229 
1230 		/* Finalize notification */
1231 		rib_bump_gen(rnh);
1232 		rnh->rnh_prefixes++;
1233 
1234 		rc->rc_cmd = RTM_ADD;
1235 		rc->rc_rt = rt;
1236 		rc->rc_nh_old = NULL;
1237 		rc->rc_nh_new = rnd->rnd_nhop;
1238 		rc->rc_nh_weight = rnd->rnd_weight;
1239 
1240 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1241 		return (0);
1242 	}
1243 
1244 	/* Existing route or memory allocation failure. */
1245 	return (EEXIST);
1246 }
1247 
1248 /*
1249  * Unconditionally deletes @rt from @rnh.
1250  */
1251 static int
1252 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1253 {
1254 	RIB_WLOCK_ASSERT(rnh);
1255 
1256 	/* Route deletion requested. */
1257 	struct radix_node *rn;
1258 
1259 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1260 	if (rn == NULL)
1261 		return (ESRCH);
1262 	rt = RNTORT(rn);
1263 	rt->rte_flags &= ~RTF_UP;
1264 
1265 	rib_bump_gen(rnh);
1266 	rnh->rnh_prefixes--;
1267 
1268 	rc->rc_cmd = RTM_DELETE;
1269 	rc->rc_rt = rt;
1270 	rc->rc_nh_old = rt->rt_nhop;
1271 	rc->rc_nh_new = NULL;
1272 	rc->rc_nh_weight = rt->rt_weight;
1273 
1274 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1275 
1276 	return (0);
1277 }
1278 
1279 /*
1280  * Switch @rt nhop/weigh to the ones specified in @rnd.
1281  * Returns 0 on success.
1282  */
1283 int
1284 change_route(struct rib_head *rnh, struct rtentry *rt,
1285     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1286 {
1287 	struct nhop_object *nh_orig;
1288 
1289 	RIB_WLOCK_ASSERT(rnh);
1290 
1291 	nh_orig = rt->rt_nhop;
1292 
1293 	if (rnd->rnd_nhop == NULL)
1294 		return (delete_route(rnh, rt, rc));
1295 
1296 	/* Changing nexthop & weight to a new one */
1297 	rt->rt_nhop = rnd->rnd_nhop;
1298 	rt->rt_weight = rnd->rnd_weight;
1299 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1300 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1301 
1302 	/* Finalize notification */
1303 	rib_bump_gen(rnh);
1304 	rc->rc_cmd = RTM_CHANGE;
1305 	rc->rc_rt = rt;
1306 	rc->rc_nh_old = nh_orig;
1307 	rc->rc_nh_new = rnd->rnd_nhop;
1308 	rc->rc_nh_weight = rnd->rnd_weight;
1309 
1310 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1311 
1312 	return (0);
1313 }
1314 
1315 /*
1316  * Conditionally update route nhop/weight IFF data in @nhd_orig is
1317  *  consistent with the current route data.
1318  * Nexthop in @nhd_new is consumed.
1319  */
1320 int
1321 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1322     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1323     struct rib_cmd_info *rc)
1324 {
1325 	struct rtentry *rt_new;
1326 	int error = 0;
1327 
1328 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1329 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1330 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1331 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1332 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1333 		    "trying change %s -> %s", buf_old, buf_new);
1334 	}
1335 	RIB_WLOCK(rnh);
1336 
1337 	struct route_nhop_data rnd;
1338 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1339 
1340 	if (rt_new == NULL) {
1341 		if (rnd_orig->rnd_nhop == NULL)
1342 			error = add_route(rnh, rt, rnd_new, rc);
1343 		else {
1344 			/*
1345 			 * Prefix does not exist, which was not our assumption.
1346 			 * Update @rnd_orig with the new data and return
1347 			 */
1348 			rnd_orig->rnd_nhop = NULL;
1349 			rnd_orig->rnd_weight = 0;
1350 			error = EAGAIN;
1351 		}
1352 	} else {
1353 		/* Prefix exists, try to update */
1354 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1355 			/*
1356 			 * Nhop/mpath group hasn't changed. Flip
1357 			 * to the new precalculated one and return
1358 			 */
1359 			error = change_route(rnh, rt_new, rnd_new, rc);
1360 		} else {
1361 			/* Update and retry */
1362 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1363 			rnd_orig->rnd_weight = rt_new->rt_weight;
1364 			error = EAGAIN;
1365 		}
1366 	}
1367 
1368 	RIB_WUNLOCK(rnh);
1369 
1370 	if (error == 0) {
1371 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1372 
1373 		if (rnd_orig->rnd_nhop != NULL)
1374 			nhop_free_any(rnd_orig->rnd_nhop);
1375 
1376 	} else {
1377 		if (rnd_new->rnd_nhop != NULL)
1378 			nhop_free_any(rnd_new->rnd_nhop);
1379 	}
1380 
1381 	return (error);
1382 }
1383 
1384 /*
1385  * Performs modification of routing table specificed by @action.
1386  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1387  * Needs to be run in network epoch.
1388  *
1389  * Returns 0 on success and fills in @rc with action result.
1390  */
1391 int
1392 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1393     struct rib_cmd_info *rc)
1394 {
1395 	int error;
1396 
1397 	switch (action) {
1398 	case RTM_ADD:
1399 		error = rib_add_route(fibnum, info, rc);
1400 		break;
1401 	case RTM_DELETE:
1402 		error = rib_del_route(fibnum, info, rc);
1403 		break;
1404 	case RTM_CHANGE:
1405 		error = rib_change_route(fibnum, info, rc);
1406 		break;
1407 	default:
1408 		error = ENOTSUP;
1409 	}
1410 
1411 	return (error);
1412 }
1413 
1414 struct rt_delinfo
1415 {
1416 	struct rib_head *rnh;
1417 	struct rtentry *head;
1418 	rib_filter_f_t *filter_f;
1419 	void *filter_arg;
1420 	int prio;
1421 	struct rib_cmd_info rc;
1422 };
1423 
1424 /*
1425  * Conditionally unlinks rtenties or paths from radix tree based
1426  * on the callback data passed in @arg.
1427  */
1428 static int
1429 rt_checkdelroute(struct radix_node *rn, void *arg)
1430 {
1431 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1432 	struct rtentry *rt = (struct rtentry *)rn;
1433 
1434 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1435 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1436 		return (0);
1437 
1438 	/*
1439 	 * Add deleted rtentries to the list to GC them
1440 	 *  after dropping the lock.
1441 	 *
1442 	 * XXX: Delayed notifications not implemented
1443 	 *  for nexthop updates.
1444 	 */
1445 	if (di->rc.rc_cmd == RTM_DELETE) {
1446 		/* Add to the list and return */
1447 		rt->rt_chain = di->head;
1448 		di->head = rt;
1449 #ifdef ROUTE_MPATH
1450 	} else {
1451 		/*
1452 		 * RTM_CHANGE to a different nexthop or nexthop group.
1453 		 * Free old multipath group.
1454 		 */
1455 		nhop_free_any(di->rc.rc_nh_old);
1456 #endif
1457 	}
1458 
1459 	return (0);
1460 }
1461 
1462 /*
1463  * Iterates over a routing table specified by @fibnum and @family and
1464  *  deletes elements marked by @filter_f.
1465  * @fibnum: rtable id
1466  * @family: AF_ address family
1467  * @filter_f: function returning non-zero value for items to delete
1468  * @arg: data to pass to the @filter_f function
1469  * @report: true if rtsock notification is needed.
1470  */
1471 void
1472 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1473     bool report)
1474 {
1475 	struct rib_head *rnh;
1476 	struct rtentry *rt;
1477 	struct nhop_object *nh;
1478 	struct epoch_tracker et;
1479 
1480 	rnh = rt_tables_get_rnh(fibnum, family);
1481 	if (rnh == NULL)
1482 		return;
1483 
1484 	struct rt_delinfo di = {
1485 		.rnh = rnh,
1486 		.filter_f = filter_f,
1487 		.filter_arg = filter_arg,
1488 		.prio = NH_PRIORITY_NORMAL,
1489 	};
1490 
1491 	NET_EPOCH_ENTER(et);
1492 
1493 	RIB_WLOCK(rnh);
1494 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1495 	RIB_WUNLOCK(rnh);
1496 
1497 	/* We might have something to reclaim. */
1498 	bzero(&di.rc, sizeof(di.rc));
1499 	di.rc.rc_cmd = RTM_DELETE;
1500 	while (di.head != NULL) {
1501 		rt = di.head;
1502 		di.head = rt->rt_chain;
1503 		rt->rt_chain = NULL;
1504 		nh = rt->rt_nhop;
1505 
1506 		di.rc.rc_rt = rt;
1507 		di.rc.rc_nh_old = nh;
1508 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1509 
1510 		if (report) {
1511 #ifdef ROUTE_MPATH
1512 			struct nhgrp_object *nhg;
1513 			const struct weightened_nhop *wn;
1514 			uint32_t num_nhops;
1515 			if (NH_IS_NHGRP(nh)) {
1516 				nhg = (struct nhgrp_object *)nh;
1517 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1518 				for (int i = 0; i < num_nhops; i++)
1519 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1520 			} else
1521 #endif
1522 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1523 		}
1524 		rt_free(rt);
1525 	}
1526 
1527 	NET_EPOCH_EXIT(et);
1528 }
1529 
1530 static int
1531 rt_delete_unconditional(struct radix_node *rn, void *arg)
1532 {
1533 	struct rtentry *rt = RNTORT(rn);
1534 	struct rib_head *rnh = (struct rib_head *)arg;
1535 
1536 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1537 	if (RNTORT(rn) == rt)
1538 		rt_free(rt);
1539 
1540 	return (0);
1541 }
1542 
1543 /*
1544  * Removes all routes from the routing table without executing notifications.
1545  * rtentres will be removed after the end of a current epoch.
1546  */
1547 static void
1548 rib_flush_routes(struct rib_head *rnh)
1549 {
1550 	RIB_WLOCK(rnh);
1551 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1552 	RIB_WUNLOCK(rnh);
1553 }
1554 
1555 void
1556 rib_flush_routes_family(int family)
1557 {
1558 	struct rib_head *rnh;
1559 
1560 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1561 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1562 			rib_flush_routes(rnh);
1563 	}
1564 }
1565 
/*
 * Returns a printable name for address family @family:
 * "inet", "inet6" or "link", and "unknown" for anything else.
 */
const char *
rib_print_family(int family)
{
	if (family == AF_INET)
		return ("inet");
	if (family == AF_INET6)
		return ("inet6");
	if (family == AF_LINK)
		return ("link");

	return ("unknown");
}
1579 
1580