xref: /freebsd/sys/net/route/route_ctl.c (revision 271171e0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/sysctl.h>
40 #include <sys/syslog.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/rmlock.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_dl.h>
48 #include <net/vnet.h>
49 #include <net/route.h>
50 #include <net/route/route_ctl.h>
51 #include <net/route/route_var.h>
52 #include <net/route/nhop_utils.h>
53 #include <net/route/nhop.h>
54 #include <net/route/nhop_var.h>
55 #include <netinet/in.h>
56 #include <netinet6/scope6_var.h>
57 #include <netinet6/in6_var.h>
58 
59 #define	DEBUG_MOD_NAME	route_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
64 /*
65  * This file contains control plane routing tables functions.
66  *
67  * All functions assume they are called within the net epoch.
68  */
69 
70 union sockaddr_union {
71 	struct sockaddr		sa;
72 	struct sockaddr_in	sin;
73 	struct sockaddr_in6	sin6;
74 	char			_buf[32];
75 };
76 
77 static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78     struct rib_cmd_info *rc);
79 static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80     struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81     struct rib_cmd_info *rc);
82 
83 static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84     struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
85 #ifdef ROUTE_MPATH
86 static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88     int op_flags, struct rib_cmd_info *rc);
89 #endif
90 
91 static int add_route(struct rib_head *rnh, struct rtentry *rt,
92     struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93 static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94     struct rib_cmd_info *rc);
95 static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
97 
98 static int get_prio_from_info(const struct rt_addrinfo *info);
99 static int nhop_get_prio(const struct nhop_object *nh);
100 
101 #ifdef ROUTE_MPATH
102 static bool rib_can_multipath(struct rib_head *rh);
103 #endif
104 
105 /* Per-vnet multipath routing configuration */
106 SYSCTL_DECL(_net_route);
107 #define	V_rib_route_multipath	VNET(rib_route_multipath)
108 #ifdef ROUTE_MPATH
109 #define _MP_FLAGS	CTLFLAG_RW
110 #else
111 #define _MP_FLAGS	CTLFLAG_RD
112 #endif
113 VNET_DEFINE(u_int, rib_route_multipath) = 1;
114 SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
115     &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
116 #undef _MP_FLAGS
117 
118 #ifdef ROUTE_MPATH
119 VNET_DEFINE(u_int, fib_hash_outbound) = 0;
120 SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
121     &VNET_NAME(fib_hash_outbound), 0,
122     "Compute flowid for locally-originated packets");
123 
124 /* Default entropy to add to the hash calculation for outbound connections. */
125 uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
126 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
127 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
128 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
129 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
130 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
131 };
132 #endif
133 
134 #if defined(INET) && defined(INET6)
135 FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
136 #define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
137 VNET_DEFINE(u_int, rib_route_ipv6_nexthop) = 1;
138 SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
139     &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
140 #endif
141 
142 /* Debug bits */
143 SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
144 
145 static struct rib_head *
146 get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
147 {
148 	struct rib_head *rnh;
149 	struct sockaddr *dst;
150 
151 	KASSERT((fibnum < rt_numfibs), ("rib_add_route: bad fibnum"));
152 
153 	dst = info->rti_info[RTAX_DST];
154 	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
155 
156 	return (rnh);
157 }
158 
159 #if defined(INET) && defined(INET6)
160 static bool
161 rib_can_ipv6_nexthop_address(struct rib_head *rh)
162 {
163 	int result;
164 
165 	CURVNET_SET(rh->rib_vnet);
166 	result = !!V_rib_route_ipv6_nexthop;
167 	CURVNET_RESTORE();
168 
169 	return (result);
170 }
171 #endif
172 
173 #ifdef ROUTE_MPATH
174 static bool
175 rib_can_multipath(struct rib_head *rh)
176 {
177 	int result;
178 
179 	CURVNET_SET(rh->rib_vnet);
180 	result = !!V_rib_route_multipath;
181 	CURVNET_RESTORE();
182 
183 	return (result);
184 }
185 
186 /*
187  * Check if nhop is multipath-eligible.
188  * Nhops without gateways and redirect nhops are not eligible.
189  *
190  * Returns 1 for multipath-eligible nexthop,
191  * 0 otherwise.
192  */
193 bool
194 nhop_can_multipath(const struct nhop_object *nh)
195 {
196 
197 	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
198 		return (1);
199 	if ((nh->nh_flags & NHF_GATEWAY) == 0)
200 		return (0);
201 	if ((nh->nh_flags & NHF_REDIRECT) != 0)
202 		return (0);
203 
204 	return (1);
205 }
206 #endif
207 
208 static int
209 get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
210 {
211 	uint32_t weight;
212 
213 	if (info->rti_mflags & RTV_WEIGHT)
214 		weight = info->rti_rmx->rmx_weight;
215 	else
216 		weight = default_weight;
217 	/* Keep the upper byte for administrative distance purposes */
218 	if (weight > RT_MAX_WEIGHT)
219 		weight = RT_MAX_WEIGHT;
220 	else if (weight == 0)
221 		weight = default_weight;
222 
223 	return (weight);
224 }
225 
226 /*
227  * File-local concept for distinguishing between the normal and
228  * RTF_PINNED routes that can override the "normal" one.
229  */
230 #define	NH_PRIORITY_HIGH	2
231 #define	NH_PRIORITY_NORMAL	1
232 static int
233 get_prio_from_info(const struct rt_addrinfo *info)
234 {
235 	if (info->rti_flags & RTF_PINNED)
236 		return (NH_PRIORITY_HIGH);
237 	return (NH_PRIORITY_NORMAL);
238 }
239 
240 static int
241 nhop_get_prio(const struct nhop_object *nh)
242 {
243 	if (NH_IS_PINNED(nh))
244 		return (NH_PRIORITY_HIGH);
245 	return (NH_PRIORITY_NORMAL);
246 }
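/*
 * Illustrative note: a route installed with RTF_PINNED (e.g. an interface
 * route) maps to NH_PRIORITY_HIGH, so a later request at NH_PRIORITY_NORMAL
 * (no RTF_PINNED / no RTM_F_FORCE) can neither replace it in
 * add_route_flags() (EEXIST) nor remove it in rt_delete_conditional()
 * (EADDRINUSE).
 */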
247 
248 /*
249  * Check if specified @gw matches gw data in the nexthop @nh.
250  *
251  * Returns true if matches, false otherwise.
252  */
253 bool
254 match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
255 {
256 
257 	if (nh->gw_sa.sa_family != gw->sa_family)
258 		return (false);
259 
260 	switch (gw->sa_family) {
261 	case AF_INET:
262 		return (nh->gw4_sa.sin_addr.s_addr ==
263 		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
264 	case AF_INET6:
265 		{
266 			const struct sockaddr_in6 *gw6;
267 			gw6 = (const struct sockaddr_in6 *)gw;
268 
269 			/*
270 			 * Currently (2020-09) IPv6 gws in kernel have their
271 			 * scope embedded. Once this becomes false, this code
272 			 * has to be revisited.
273 			 */
274 			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
275 			    &gw6->sin6_addr))
276 				return (true);
277 			return (false);
278 		}
279 	case AF_LINK:
280 		{
281 			const struct sockaddr_dl *sdl;
282 			sdl = (const struct sockaddr_dl *)gw;
283 			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
284 		}
285 	default:
286 		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
287 	}
288 
289 	/* NOTREACHED */
290 	return (false);
291 }
292 
293 /*
294  * Matches all nexthops with the given @gw.
295  * Can be used as rib_filter_f callback.
296  */
297 int
298 rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
299 {
300 	const struct sockaddr *gw = (const struct sockaddr *)gw_sa;
301 
302 	return (match_nhop_gw(nh, gw));
303 }
304 
305 struct gw_filter_data {
306 	const struct sockaddr *gw;
307 	int count;
308 };
309 
310 /*
311  * Matches the first occurrence of the gateway provided in @gwd.
312  */
313 static int
314 match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
315 {
316 	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
317 
318 	/* Return only first match to make rtsock happy */
319 	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
320 		return (1);
321 	return (0);
322 }
323 
324 /*
325  * Checks if data in @info matches nexthop @nh.
326  *
327  * Returns 0 on success,
328  * ESRCH if not matched,
329  * ENOENT if filter function returned false
330  */
331 int
332 check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
333     const struct nhop_object *nh)
334 {
335 	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
336 
337 	if (info->rti_filter != NULL) {
338 	    if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
339 		    return (ENOENT);
340 	    else
341 		    return (0);
342 	}
343 	if ((gw != NULL) && !match_nhop_gw(nh, gw))
344 		return (ESRCH);
345 
346 	return (0);
347 }
348 
349 /*
350  * Runs exact prefix match based on @dst and @netmask.
351  * Returns matched @rtentry if found or NULL.
352  * If rtentry was found, saves nexthop / weight value into @rnd.
353  */
354 static struct rtentry *
355 lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
356     const struct sockaddr *netmask, struct route_nhop_data *rnd)
357 {
358 	struct rtentry *rt;
359 
360 	RIB_LOCK_ASSERT(rnh);
361 
362 	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
363 	if (rt != NULL) {
364 		rnd->rnd_nhop = rt->rt_nhop;
365 		rnd->rnd_weight = rt->rt_weight;
366 	} else {
367 		rnd->rnd_nhop = NULL;
368 		rnd->rnd_weight = 0;
369 	}
370 
371 	return (rt);
372 }
373 
374 struct rtentry *
375 lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
376     struct route_nhop_data *rnd)
377 {
378 	return (lookup_prefix_bysa(rnh, rt_key_const(rt), rt_mask_const(rt), rnd));
379 }
380 
381 /*
382  * Runs exact prefix match based on dst/netmask from @info.
383  * Assumes RIB lock is held.
384  * Returns matched @rtentry if found or NULL.
385  * If rtentry was found, saves nexthop / weight value into @rnd.
386  */
387 struct rtentry *
388 lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
389     struct route_nhop_data *rnd)
390 {
391 	struct rtentry *rt;
392 
393 	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
394 	    info->rti_info[RTAX_NETMASK], rnd);
395 
396 	return (rt);
397 }
398 
399 static bool
400 fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
401     struct sockaddr **pmask)
402 {
403 	if (plen == -1) {
404 		*pmask = NULL;
405 		return (true);
406 	}
407 
408 	switch (family) {
409 #ifdef INET
410 	case AF_INET:
411 		{
412 			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
413 			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;
414 
415 			memset(mask, 0, sizeof(*mask));
416 			mask->sin_family = family;
417 			mask->sin_len = sizeof(*mask);
418 			if (plen == 32)
419 				*pmask = NULL;
420 			else if (plen > 32 || plen < 0)
421 				return (false);
422 			else {
423 				uint32_t daddr, maddr;
424 				maddr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
425 				mask->sin_addr.s_addr = maddr;
426 				daddr = dst->sin_addr.s_addr;
427 				daddr = htonl(ntohl(daddr) & ntohl(maddr));
428 				dst->sin_addr.s_addr = daddr;
429 			}
430 			return (true);
431 		}
432 		break;
433 #endif
434 #ifdef INET6
435 	case AF_INET6:
436 		{
437 			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
438 			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
439 
440 			memset(mask, 0, sizeof(*mask));
441 			mask->sin6_family = family;
442 			mask->sin6_len = sizeof(*mask);
443 			if (plen == 128)
444 				*pmask = NULL;
445 			else if (plen > 128 || plen < 0)
446 				return (false);
447 			else {
448 				ip6_writemask(&mask->sin6_addr, plen);
449 				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
450 			}
451 			return (true);
452 		}
453 		break;
454 #endif
455 	}
456 	return (false);
457 }
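/*
 * Worked example (illustrative, for the AF_INET branch above): for a
 * destination of 192.0.2.77 with plen = 24, maddr becomes
 * htonl(~((1 << (32 - 24)) - 1)) = htonl(0xffffff00), i.e. 255.255.255.0,
 * and the destination is rewritten to 192.0.2.77 & 255.255.255.0 = 192.0.2.0,
 * so callers always receive a properly masked prefix.  plen == 32 (or 128
 * for AF_INET6) yields a host route with *pmask set to NULL.
 */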
458 
459 /*
460  * Attempts to add @dst/plen prefix with nexthop/nexthop group data @rnd
461  * to the routing table.
462  *
463  * @fibnum: rtable id to insert route to
464  * @dst: verified kernel-originated sockaddr; may be masked in place according to @plen
465  * @plen: prefix length (or -1 if host route or not applicable for AF)
466  * @op_flags: combination of RTM_F_ flags
467  * @rc: storage to report operation result
468  *
469  * Returns 0 on success.
470  */
471 int
472 rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
473     struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
474 {
475 	union sockaddr_union mask_storage;
476 	struct sockaddr *netmask = &mask_storage.sa;
477 	struct rtentry *rt;
478 
479 	NET_EPOCH_ASSERT();
480 
481 	bzero(rc, sizeof(struct rib_cmd_info));
482 	rc->rc_cmd = RTM_ADD;
483 
484 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
485 	if (rnh == NULL)
486 		return (EAFNOSUPPORT);
487 
488 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
489 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
490 		return (EINVAL);
491 	}
492 
493 	if (op_flags & RTM_F_CREATE) {
494 		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
495 			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
496 			return (ENOMEM);
497 		}
498 	} else {
499 		struct route_nhop_data rnd_tmp;
500 
501 		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
502 		if (rt == NULL)
503 			return (ESRCH);
504 	}
505 
506 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
507 	{
508 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
509 		nhop_print_buf_any(rnd->rnd_nhop, nhbuf, sizeof(nhbuf));
510 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
511 		FIB_RH_LOG(LOG_DEBUG2, rnh, "request %s -> %s", rtbuf, nhbuf);
512 	}
513 #endif
514 	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
515 }
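/*
 * Illustrative usage sketch (hypothetical caller; assumes @fibnum and a
 * referenced single nexthop @nh for the same fib were obtained elsewhere
 * via the nhop(9) KPI).  The example prefix is 192.0.2.0/24 (0xc0000200):
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_len = sizeof(struct sockaddr_in),
 *		.sin_addr.s_addr = htonl(0xc0000200),
 *	};
 *	struct route_nhop_data rnd = {
 *		.rnd_nhop = nh,
 *		.rnd_weight = RT_DEFAULT_WEIGHT,
 *	};
 *	struct rib_cmd_info rc;
 *	struct epoch_tracker et;
 *	int error;
 *
 *	NET_EPOCH_ENTER(et);
 *	error = rib_add_route_px(fibnum, (struct sockaddr *)&dst, 24, &rnd,
 *	    RTM_F_CREATE, &rc);
 *	NET_EPOCH_EXIT(et);
 *
 * On success rc.rc_cmd is RTM_ADD and rc.rc_rt/rc.rc_nh_new describe the
 * inserted route; the nexthop reference passed in @rnd is consumed by the
 * call on both the success and the error paths.
 */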
516 
517 /*
518  * Attempts to delete @dst/plen prefix matching gateway @gw from the
519  *  routing table.
520  *
521  * @fibnum: rtable id to remove route from
522  * @dst: verified kernel-originated sockaddr; may be masked in place according to @plen
523  * @plen: prefix length (or -1 if host route or not applicable for AF)
524  * @gw: gateway to match
525  * @op_flags: combination of RTM_F_ flags
526  * @rc: storage to report operation result
527  *
528  * Returns 0 on success.
529  */
530 int
531 rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
532     const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
533 {
534 	struct gw_filter_data gwd = { .gw = gw };
535 
536 	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
537 }
538 
539 /*
540  * Attempts to delete @dst/plen prefix matching @filter_func from the
541  *  routing table.
542  *
543  * @fibnum: rtable id to remove route from
544  * @dst: verified kernel-originated sockaddr; may be masked in place according to @plen
545  * @plen: prefix length (or -1 if host route or not applicable for AF)
546  * @filter_func: func to be called for each nexthop of the prefix for matching
547  * @filter_arg: argument to pass to @filter_func
548  * @op_flags: combination of RTM_F_ flags
549  * @rc: storage to report operation result
550  *
551  * Returns 0 on success.
552  */
553 int
554 rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
555     rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
556     struct rib_cmd_info *rc)
557 {
558 	union sockaddr_union mask_storage;
559 	struct sockaddr *netmask = &mask_storage.sa;
560 	int error;
561 
562 	NET_EPOCH_ASSERT();
563 
564 	bzero(rc, sizeof(struct rib_cmd_info));
565 	rc->rc_cmd = RTM_DELETE;
566 
567 	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
568 	if (rnh == NULL)
569 		return (EAFNOSUPPORT);
570 
571 	if (dst->sa_len > sizeof(mask_storage)) {
572 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
573 		return (EINVAL);
574 	}
575 
576 	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
577 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
578 		return (EINVAL);
579 	}
580 
581 	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
582 
583 	RIB_WLOCK(rnh);
584 	struct route_nhop_data rnd;
585 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
586 	if (rt != NULL) {
587 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
588 		    filter_arg, rc);
589 	} else
590 		error = ESRCH;
591 	RIB_WUNLOCK(rnh);
592 
593 	if (error != 0)
594 		return (error);
595 
596 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
597 
598 	if (rc->rc_cmd == RTM_DELETE)
599 		rt_free(rc->rc_rt);
600 #ifdef ROUTE_MPATH
601 	else {
602 		/*
603 		 * Deleting 1 path may result in RTM_CHANGE to
604 		 * a different mpath group/nhop.
605 		 * Free old mpath group.
606 		 */
607 		nhop_free_any(rc->rc_nh_old);
608 	}
609 #endif
610 
611 	return (0);
612 }
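/*
 * Illustrative usage sketch (hypothetical caller running under the net
 * epoch): remove every path of 198.51.100.0/24 (0xc6336400) from @fibnum,
 * including RTF_PINNED ones, by combining RTM_F_FORCE with a hypothetical
 * match-all filter.  A NULL filter only works for single-nexthop prefixes,
 * since multipath deletion requires a filter:
 *
 *	static int
 *	match_any(const struct rtentry *rt, const struct nhop_object *nh,
 *	    void *arg)
 *	{
 *		return (1);
 *	}
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_len = sizeof(struct sockaddr_in),
 *		.sin_addr.s_addr = htonl(0xc6336400),
 *	};
 *	struct rib_cmd_info rc;
 *	int error;
 *
 *	error = rib_del_route_px(fibnum, (struct sockaddr *)&dst, 24,
 *	    match_any, NULL, RTM_F_FORCE, &rc);
 */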
613 
614 /*
615  * Tries to copy route @rt from one rtable to the rtable specified by @rh_dst.
616  * @rt: route to copy.
617  * @rnd_src: nhop and weight. Multipath routes are not supported
618  * @rh_dst: target rtable.
619  * @rc: operation result storage
620  *
621  * Return 0 on success.
622  */
623 int
624 rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
625     struct rib_head *rh_dst, struct rib_cmd_info *rc)
626 {
627 	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
628 	int error;
629 
630 	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
631 
632 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
633 		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
634 		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
635 		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
636 		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
637 		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
638 #endif
639 	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
640 	if (nh == NULL) {
641 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
642 		return (ENOMEM);
643 	}
644 	nhop_copy(nh, rnd_src->rnd_nhop);
645 	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
646 	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
647 	if (error != 0) {
648 		FIB_RH_LOG(LOG_INFO, rh_dst,
649 		    "unable to finalize new nexthop: error %d", error);
650 		return (ENOMEM);
651 	}
652 
653 	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
654 	if (rt_new == NULL) {
655 		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
656 		nhop_free(nh);
657 		return (ENOMEM);
658 	}
659 
660 	struct route_nhop_data rnd = {
661 		.rnd_nhop = nh,
662 		.rnd_weight = rnd_src->rnd_weight
663 	};
664 	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
665 	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
666 
667 	if (error != 0) {
668 #if DEBUG_MAX_LEVEL >= LOG_DEBUG
669 		char buf[NHOP_PRINT_BUFSIZE];
670 		rt_print_buf(rt_new, buf, sizeof(buf));
671 		FIB_RH_LOG(LOG_DEBUG, rh_dst, "Unable to add route %s: error %d", buf, error);
672 #endif
673 		nhop_free(nh);
674 		rt_free_immediate(rt_new);
675 	}
676 	return (error);
677 }
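/*
 * Illustrative usage sketch (hypothetical caller, running under the net
 * epoch): replicate a single-nexthop route @rt into fib @dst_fibnum of the
 * same family:
 *
 *	struct rib_head *rh_dst = rt_tables_get_rnh(dst_fibnum, family);
 *	struct route_nhop_data rnd = {
 *		.rnd_nhop = rt->rt_nhop,
 *		.rnd_weight = rt->rt_weight,
 *	};
 *	struct rib_cmd_info rc;
 *	int error;
 *
 *	if (rh_dst != NULL && !NH_IS_NHGRP(rt->rt_nhop))
 *		error = rib_copy_route(rt, &rnd, rh_dst, &rc);
 */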
678 
679 /*
680  * Adds route defined by @info into the kernel table specified by @fibnum and
681  * sa_family in @info->rti_info[RTAX_DST].
682  *
683  * Returns 0 on success and fills in operation metadata into @rc.
684  */
685 int
686 rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
687     struct rib_cmd_info *rc)
688 {
689 	struct rib_head *rnh;
690 	int error;
691 
692 	NET_EPOCH_ASSERT();
693 
694 	rnh = get_rnh(fibnum, info);
695 	if (rnh == NULL)
696 		return (EAFNOSUPPORT);
697 
698 	/*
699 	 * Check consistency between RTF_HOST flag and netmask
700 	 * existence.
701 	 */
702 	if (info->rti_flags & RTF_HOST)
703 		info->rti_info[RTAX_NETMASK] = NULL;
704 	else if (info->rti_info[RTAX_NETMASK] == NULL) {
705 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
706 		return (EINVAL);
707 	}
708 
709 	bzero(rc, sizeof(struct rib_cmd_info));
710 	rc->rc_cmd = RTM_ADD;
711 
712 	error = add_route_byinfo(rnh, info, rc);
713 	if (error == 0)
714 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
715 
716 	return (error);
717 }
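/*
 * Illustrative usage sketch (hypothetical caller under the net epoch;
 * @gw points at a filled-in gateway sockaddr supplied by the caller):
 * add a static gateway route for 203.0.113.0/24 (0xcb007100):
 *
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *	    .sin_len = sizeof(struct sockaddr_in),
 *	    .sin_addr.s_addr = htonl(0xcb007100) };
 *	struct sockaddr_in mask = { .sin_family = AF_INET,
 *	    .sin_len = sizeof(struct sockaddr_in),
 *	    .sin_addr.s_addr = htonl(0xffffff00) };
 *	struct rt_addrinfo info = {
 *		.rti_flags = RTF_GATEWAY | RTF_STATIC,
 *		.rti_info = {
 *			[RTAX_DST] = (struct sockaddr *)&dst,
 *			[RTAX_NETMASK] = (struct sockaddr *)&mask,
 *			[RTAX_GATEWAY] = (struct sockaddr *)gw,
 *		},
 *	};
 *	struct rib_cmd_info rc;
 *	int error = rib_add_route(fibnum, &info, &rc);
 */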
718 
719 /*
720  * Checks if @dst and @gateway are a valid combination.
721  *
722  * Returns true if valid, false otherwise.
723  */
724 static bool
725 check_gateway(struct rib_head *rnh, struct sockaddr *dst,
726     struct sockaddr *gateway)
727 {
728 	if (dst->sa_family == gateway->sa_family)
729 		return (true);
730 	else if (gateway->sa_family == AF_UNSPEC)
731 		return (true);
732 	else if (gateway->sa_family == AF_LINK)
733 		return (true);
734 #if defined(INET) && defined(INET6)
735 	else if (dst->sa_family == AF_INET && gateway->sa_family == AF_INET6 &&
736 		rib_can_ipv6_nexthop_address(rnh))
737 		return (true);
738 #endif
739 	else
740 		return (false);
741 }
742 
743 static int
744 add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
745     struct rib_cmd_info *rc)
746 {
747 	struct route_nhop_data rnd_add;
748 	struct nhop_object *nh;
749 	struct rtentry *rt;
750 	struct sockaddr *dst, *gateway, *netmask;
751 	int error;
752 
753 	dst = info->rti_info[RTAX_DST];
754 	gateway = info->rti_info[RTAX_GATEWAY];
755 	netmask = info->rti_info[RTAX_NETMASK];
756 
757 	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
758 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
759 		return (EINVAL);
760 	}
761 	if (dst && gateway && !check_gateway(rnh, dst, gateway)) {
762 		FIB_RH_LOG(LOG_DEBUG, rnh,
763 		    "error: invalid dst/gateway family combination (%d, %d)",
764 		    dst->sa_family, gateway->sa_family);
765 		return (EINVAL);
766 	}
767 
768 	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
769 		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
770 		    dst->sa_len);
771 		return (EINVAL);
772 	}
773 
774 	if (info->rti_ifa == NULL) {
775 		error = rt_getifa_fib(info, rnh->rib_fibnum);
776 		if (error)
777 			return (error);
778 	}
779 
780 	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
781 		return (ENOBUFS);
782 
783 	error = nhop_create_from_info(rnh, info, &nh);
784 	if (error != 0) {
785 		rt_free_immediate(rt);
786 		return (error);
787 	}
788 
789 	rnd_add.rnd_nhop = nh;
790 	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
791 
792 	int op_flags = RTM_F_CREATE;
793 	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
794 		op_flags |= RTM_F_FORCE;
795 	else
796 		op_flags |= RTM_F_APPEND;
797 	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
798 
799 }
800 
801 static int
802 add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
803     int op_flags, struct rib_cmd_info *rc)
804 {
805 	struct route_nhop_data rnd_orig;
806 	struct nhop_object *nh;
807 	struct rtentry *rt_orig;
808 	int error = 0;
809 
810 	nh = rnd_add->rnd_nhop;
811 
812 	RIB_WLOCK(rnh);
813 
814 	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
815 
816 	if (rt_orig == NULL) {
817 		if (op_flags & RTM_F_CREATE)
818 			error = add_route(rnh, rt, rnd_add, rc);
819 		else
820 			error = ENOENT; // no entry but creation was not required
821 		RIB_WUNLOCK(rnh);
822 		if (error != 0)
823 			goto out;
824 		return (0);
825 	}
826 
827 	if (op_flags & RTM_F_EXCL) {
828 		/* We have existing route in the RIB but not allowed to replace. */
829 		RIB_WUNLOCK(rnh);
830 		error = EEXIST;
831 		goto out;
832 	}
833 
834 	/* Now either append or replace */
835 	if (op_flags & RTM_F_REPLACE) {
836 		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
837 			/* Old path is "better" (e.g. has PINNED flag set) */
838 			error = EEXIST;
839 			goto out;
840 		}
841 		change_route(rnh, rt_orig, rnd_add, rc);
842 		RIB_WUNLOCK(rnh);
843 		nh = rc->rc_nh_old;
844 		goto out;
845 	}
846 
847 	RIB_WUNLOCK(rnh);
848 
849 #ifdef ROUTE_MPATH
850 	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
851 	    nhop_can_multipath(rnd_add->rnd_nhop) &&
852 	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
853 
854 		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
855 			error = add_route_flags_mpath(rnh, rt, rnd_add, &rnd_orig,
856 			    op_flags, rc);
857 			if (error != EAGAIN)
858 				break;
859 			RTSTAT_INC(rts_add_retry);
860 		}
861 
862 		/*
863 		 *  Original nhop reference is unused in any case.
864 		 */
865 		nhop_free_any(rnd_add->rnd_nhop);
866 		if (op_flags & RTM_F_CREATE) {
867 			if (error != 0 || rc->rc_cmd != RTM_ADD)
868 				rt_free_immediate(rt);
869 		}
870 		return (error);
871 	}
872 #endif
873 	/* Out of options - free state and return error */
874 	error = EEXIST;
875 out:
876 	if (op_flags & RTM_F_CREATE)
877 		rt_free_immediate(rt);
878 	nhop_free_any(nh);
879 
880 	return (error);
881 }
882 
883 #ifdef ROUTE_MPATH
884 static int
885 add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
886     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
887     int op_flags, struct rib_cmd_info *rc)
888 {
889 	RIB_RLOCK_TRACKER;
890 	struct route_nhop_data rnd_new;
891 	int error = 0;
892 
893 	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
894 	if (error != 0) {
895 		if (error == EAGAIN) {
896 			/*
897 			 * Group creation failed, most probably because
898 			 * @rnd_orig data got scheduled for deletion.
899 			 * Refresh @rnd_orig data and retry.
900 			 */
901 			RIB_RLOCK(rnh);
902 			lookup_prefix_rt(rnh, rt, rnd_orig);
903 			RIB_RUNLOCK(rnh);
904 			if (rnd_orig->rnd_nhop == NULL && !(op_flags & RTM_F_CREATE)) {
905 				/* In this iteration route doesn't exist */
906 				error = ENOENT;
907 			}
908 		}
909 		return (error);
910 	}
911 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
912 	if (error != 0)
913 		return (error);
914 
915 	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
916 		/*
917 		 * First multipath route got installed. Enable local
918 		 * outbound connections hashing.
919 		 */
920 		if (bootverbose)
921 			printf("FIB: enabled flowid calculation for locally-originated packets\n");
922 		V_fib_hash_outbound = 1;
923 	}
924 
925 	return (0);
926 }
927 #endif
928 
929 /*
930  * Removes route defined by @info from the kernel table specified by @fibnum and
931  * sa_family in @info->rti_info[RTAX_DST].
932  *
933  * Returns 0 on success and fills in operation metadata into @rc.
934  */
935 int
936 rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
937 {
938 	struct rib_head *rnh;
939 	struct sockaddr *dst, *netmask;
940 	struct sockaddr_storage mdst;
941 	int error;
942 
943 	NET_EPOCH_ASSERT();
944 
945 	rnh = get_rnh(fibnum, info);
946 	if (rnh == NULL)
947 		return (EAFNOSUPPORT);
948 
949 	bzero(rc, sizeof(struct rib_cmd_info));
950 	rc->rc_cmd = RTM_DELETE;
951 
952 	dst = info->rti_info[RTAX_DST];
953 	netmask = info->rti_info[RTAX_NETMASK];
954 
955 	if (netmask != NULL) {
956 		/* Ensure @dst is always properly masked */
957 		if (dst->sa_len > sizeof(mdst)) {
958 			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
959 			return (EINVAL);
960 		}
961 		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
962 		dst = (struct sockaddr *)&mdst;
963 	}
964 
965 	rib_filter_f_t *filter_func = NULL;
966 	void *filter_arg = NULL;
967 	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
968 
969 	if (info->rti_filter != NULL) {
970 		filter_func = info->rti_filter;
971 		filter_arg = info->rti_filterdata;
972 	} else if (gwd.gw != NULL) {
973 		filter_func = match_gw_one;
974 		filter_arg = &gwd;
975 	}
976 
977 	int prio = get_prio_from_info(info);
978 
979 	RIB_WLOCK(rnh);
980 	struct route_nhop_data rnd;
981 	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
982 	if (rt != NULL) {
983 		error = rt_delete_conditional(rnh, rt, prio, filter_func,
984 		    filter_arg, rc);
985 	} else
986 		error = ESRCH;
987 	RIB_WUNLOCK(rnh);
988 
989 	if (error != 0)
990 		return (error);
991 
992 	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
993 
994 	if (rc->rc_cmd == RTM_DELETE)
995 		rt_free(rc->rc_rt);
996 #ifdef ROUTE_MPATH
997 	else {
998 		/*
999 		 * Deleting 1 path may result in RTM_CHANGE to
1000 		 * a different mpath group/nhop.
1001 		 * Free old mpath group.
1002 		 */
1003 		nhop_free_any(rc->rc_nh_old);
1004 	}
1005 #endif
1006 
1007 	return (0);
1008 }
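/*
 * Illustrative usage sketch (hypothetical caller): the same rt_addrinfo
 * layout drives deletion.  Supplying RTAX_GATEWAY restricts the delete to
 * the path using that gateway, so for a multipath prefix only the matching
 * path is removed and rc.rc_cmd comes back as RTM_CHANGE rather than
 * RTM_DELETE:
 *
 *	struct rt_addrinfo info = {
 *		.rti_info = {
 *			[RTAX_DST] = (struct sockaddr *)&dst,
 *			[RTAX_NETMASK] = (struct sockaddr *)&mask,
 *			[RTAX_GATEWAY] = (struct sockaddr *)gw,
 *		},
 *	};
 *	int error = rib_del_route(fibnum, &info, &rc);
 */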
1009 
1010 /*
1011  * Conditionally unlinks rtentry paths from @rnh matching @cb.
1012  * Returns 0 on success with operation result stored in @rc.
1013  * On error, returns:
1014  * ESRCH - if prefix was not found or filter function failed to match
1015  * EADDRINUSE - if trying to delete a higher-priority route.
1016  */
1017 static int
1018 rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1019     int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1020 {
1021 	struct nhop_object *nh = rt->rt_nhop;
1022 
1023 #ifdef ROUTE_MPATH
1024 	if (NH_IS_NHGRP(nh)) {
1025 		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1026 		struct route_nhop_data rnd;
1027 		int error;
1028 
1029 		if (cb == NULL)
1030 			return (ESRCH);
1031 		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1032 		if (error == 0) {
1033 			if (rnd.rnd_nhgrp == nhg) {
1034 				/* No match, unreference new group and return. */
1035 				nhop_free_any(rnd.rnd_nhop);
1036 				return (ESRCH);
1037 			}
1038 			error = change_route(rnh, rt, &rnd, rc);
1039 		}
1040 		return (error);
1041 	}
1042 #endif
1043 	if (cb != NULL && !cb(rt, nh, cbdata))
1044 		return (ESRCH);
1045 
1046 	if (prio < nhop_get_prio(nh))
1047 		return (EADDRINUSE);
1048 
1049 	return (delete_route(rnh, rt, rc));
1050 }
1051 
1052 int
1053 rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1054     struct rib_cmd_info *rc)
1055 {
1056 	RIB_RLOCK_TRACKER;
1057 	struct route_nhop_data rnd_orig;
1058 	struct rib_head *rnh;
1059 	struct rtentry *rt;
1060 	int error;
1061 
1062 	NET_EPOCH_ASSERT();
1063 
1064 	rnh = get_rnh(fibnum, info);
1065 	if (rnh == NULL)
1066 		return (EAFNOSUPPORT);
1067 
1068 	bzero(rc, sizeof(struct rib_cmd_info));
1069 	rc->rc_cmd = RTM_CHANGE;
1070 
1071 	/* Check if updated gateway exists */
1072 	if ((info->rti_flags & RTF_GATEWAY) &&
1073 	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1074 
1075 		/*
1076 		 * route(8) adds RTF_GATEWAY flag if -interface is not set.
1077 		 * Remove RTF_GATEWAY to enforce consistency and maintain
1078 		 * compatibility.
1079 		 */
1080 		info->rti_flags &= ~RTF_GATEWAY;
1081 	}
1082 
1083 	/*
1084 	 * The route change is done in multiple steps, with the lock being
1085 	 * dropped and reacquired in between. When multiple processes change
1086 	 * the same route concurrently, the route may be modified between the
1087 	 * steps. Address this by retrying the operation multiple times
1088 	 * before failing.
1089 	 */
1090 
1091 	RIB_RLOCK(rnh);
1092 	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1093 	    info->rti_info[RTAX_NETMASK], &rnh->head);
1094 
1095 	if (rt == NULL) {
1096 		RIB_RUNLOCK(rnh);
1097 		return (ESRCH);
1098 	}
1099 
1100 	rnd_orig.rnd_nhop = rt->rt_nhop;
1101 	rnd_orig.rnd_weight = rt->rt_weight;
1102 
1103 	RIB_RUNLOCK(rnh);
1104 
1105 	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1106 		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1107 		if (error != EAGAIN)
1108 			break;
1109 	}
1110 
1111 	return (error);
1112 }
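/*
 * Illustrative usage sketch (hypothetical caller): point an existing
 * prefix at a new gateway @new_gw; on success rc.rc_nh_old and
 * rc.rc_nh_new describe the nexthop transition:
 *
 *	struct rt_addrinfo info = {
 *		.rti_flags = RTF_GATEWAY,
 *		.rti_info = {
 *			[RTAX_DST] = (struct sockaddr *)&dst,
 *			[RTAX_NETMASK] = (struct sockaddr *)&mask,
 *			[RTAX_GATEWAY] = (struct sockaddr *)new_gw,
 *		},
 *	};
 *	int error = rib_change_route(fibnum, &info, &rc);
 */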
1113 
1114 static int
1115 change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1116     struct nhop_object *nh_orig, struct nhop_object **nh_new)
1117 {
1118 	int error;
1119 
1120 	/*
1121 	 * New gateway could require new ifaddr, ifp;
1122 	 * flags may also be different; ifp may be specified
1123 	 * by ll sockaddr when protocol address is ambiguous
1124 	 */
1125 	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1126 	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1127 	    info->rti_info[RTAX_IFP] != NULL ||
1128 	    (info->rti_info[RTAX_IFA] != NULL &&
1129 	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1130 		error = rt_getifa_fib(info, rnh->rib_fibnum);
1131 
1132 		if (error != 0) {
1133 			info->rti_ifa = NULL;
1134 			return (error);
1135 		}
1136 	}
1137 
1138 	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1139 	info->rti_ifa = NULL;
1140 
1141 	return (error);
1142 }
1143 
1144 #ifdef ROUTE_MPATH
1145 static int
1146 change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1147     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1148     struct rib_cmd_info *rc)
1149 {
1150 	int error = 0, found_idx = 0;
1151 	struct nhop_object *nh_orig = NULL, *nh_new;
1152 	struct route_nhop_data rnd_new = {};
1153 	const struct weightened_nhop *wn = NULL;
1154 	struct weightened_nhop *wn_new;
1155 	uint32_t num_nhops;
1156 
1157 	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1158 	for (int i = 0; i < num_nhops; i++) {
1159 		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1160 			nh_orig = wn[i].nh;
1161 			found_idx = i;
1162 			break;
1163 		}
1164 	}
1165 
1166 	if (nh_orig == NULL)
1167 		return (ESRCH);
1168 
1169 	error = change_nhop(rnh, info, nh_orig, &nh_new);
1170 	if (error != 0)
1171 		return (error);
1172 
1173 	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1174 	    M_TEMP, M_NOWAIT | M_ZERO);
1175 	if (wn_new == NULL) {
1176 		nhop_free(nh_new);
1177 		return (EAGAIN);
1178 	}
1179 
1180 	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1181 	wn_new[found_idx].nh = nh_new;
1182 	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1183 
1184 	error = nhgrp_get_group(rnh, wn_new, num_nhops, &rnd_new.rnd_nhgrp);
1185 	nhop_free(nh_new);
1186 	free(wn_new, M_TEMP);
1187 
1188 	if (error != 0)
1189 		return (error);
1190 
1191 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1192 
1193 	return (error);
1194 }
1195 #endif
1196 
1197 static int
1198 change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1199     struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1200     struct rib_cmd_info *rc)
1201 {
1202 	int error = 0;
1203 	struct nhop_object *nh_orig;
1204 	struct route_nhop_data rnd_new;
1205 
1206 	nh_orig = rnd_orig->rnd_nhop;
1207 	if (nh_orig == NULL)
1208 		return (ESRCH);
1209 
1210 #ifdef ROUTE_MPATH
1211 	if (NH_IS_NHGRP(nh_orig))
1212 		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1213 #endif
1214 
1215 	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1216 	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1217 	if (error != 0)
1218 		return (error);
1219 	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1220 
1221 	return (error);
1222 }
1223 
1224 /*
1225  * Insert @rt with nhop data from @rnd into @rnh.
1226  * Returns 0 on success and stores operation results in @rc.
1227  */
1228 static int
1229 add_route(struct rib_head *rnh, struct rtentry *rt,
1230     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1231 {
1232 	struct radix_node *rn;
1233 
1234 	RIB_WLOCK_ASSERT(rnh);
1235 
1236 	rt->rt_nhop = rnd->rnd_nhop;
1237 	rt->rt_weight = rnd->rnd_weight;
1238 	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1239 
1240 	if (rn != NULL) {
1241 		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1242 			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1243 
1244 		/* Finalize notification */
1245 		rib_bump_gen(rnh);
1246 		rnh->rnh_prefixes++;
1247 
1248 		rc->rc_cmd = RTM_ADD;
1249 		rc->rc_rt = rt;
1250 		rc->rc_nh_old = NULL;
1251 		rc->rc_nh_new = rnd->rnd_nhop;
1252 		rc->rc_nh_weight = rnd->rnd_weight;
1253 
1254 		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1255 		return (0);
1256 	}
1257 
1258 	/* Existing route or memory allocation failure. */
1259 	return (EEXIST);
1260 }
1261 
1262 /*
1263  * Unconditionally deletes @rt from @rnh.
1264  */
1265 static int
1266 delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1267 {
1268 	RIB_WLOCK_ASSERT(rnh);
1269 
1270 	/* Route deletion requested. */
1271 	struct radix_node *rn;
1272 
1273 	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1274 	if (rn == NULL)
1275 		return (ESRCH);
1276 	rt = RNTORT(rn);
1277 	rt->rte_flags &= ~RTF_UP;
1278 
1279 	rib_bump_gen(rnh);
1280 	rnh->rnh_prefixes--;
1281 
1282 	rc->rc_cmd = RTM_DELETE;
1283 	rc->rc_rt = rt;
1284 	rc->rc_nh_old = rt->rt_nhop;
1285 	rc->rc_nh_new = NULL;
1286 	rc->rc_nh_weight = rt->rt_weight;
1287 
1288 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1289 
1290 	return (0);
1291 }
1292 
1293 /*
1294  * Switch @rt nhop/weight to the ones specified in @rnd.
1295  * Returns 0 on success.
1296  */
1297 int
1298 change_route(struct rib_head *rnh, struct rtentry *rt,
1299     struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1300 {
1301 	struct nhop_object *nh_orig;
1302 
1303 	RIB_WLOCK_ASSERT(rnh);
1304 
1305 	nh_orig = rt->rt_nhop;
1306 
1307 	if (rnd->rnd_nhop == NULL)
1308 		return (delete_route(rnh, rt, rc));
1309 
1310 	/* Changing nexthop & weight to a new one */
1311 	rt->rt_nhop = rnd->rnd_nhop;
1312 	rt->rt_weight = rnd->rnd_weight;
1313 	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1314 		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1315 
1316 	/* Finalize notification */
1317 	rib_bump_gen(rnh);
1318 	rc->rc_cmd = RTM_CHANGE;
1319 	rc->rc_rt = rt;
1320 	rc->rc_nh_old = nh_orig;
1321 	rc->rc_nh_new = rnd->rnd_nhop;
1322 	rc->rc_nh_weight = rnd->rnd_weight;
1323 
1324 	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1325 
1326 	return (0);
1327 }
1328 
1329 /*
1330  * Conditionally update route nhop/weight IFF data in @rnd_orig is
1331  *  consistent with the current route data.
1332  * Nexthop in @rnd_new is consumed.
1333  */
1334 int
1335 change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1336     struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1337     struct rib_cmd_info *rc)
1338 {
1339 	struct rtentry *rt_new;
1340 	int error = 0;
1341 
1342 #if DEBUG_MAX_LEVEL >= LOG_DEBUG2
1343 	{
1344 		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1345 		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1346 		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1347 		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1348 		    "trying change %s -> %s", buf_old, buf_new);
1349 	}
1350 #endif
1351 	RIB_WLOCK(rnh);
1352 
1353 	struct route_nhop_data rnd;
1354 	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1355 
1356 	if (rt_new == NULL) {
1357 		if (rnd_orig->rnd_nhop == NULL)
1358 			error = add_route(rnh, rt, rnd_new, rc);
1359 		else {
1360 			/*
1361 			 * Prefix does not exist, which was not our assumption.
1362 			 * Update @rnd_orig with the new data and return
1363 			 */
1364 			rnd_orig->rnd_nhop = NULL;
1365 			rnd_orig->rnd_weight = 0;
1366 			error = EAGAIN;
1367 		}
1368 	} else {
1369 		/* Prefix exists, try to update */
1370 		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1371 			/*
1372 			 * Nhop/mpath group hasn't changed. Flip
1373 			 * to the new precalculated one and return
1374 			 */
1375 			error = change_route(rnh, rt_new, rnd_new, rc);
1376 		} else {
1377 			/* Update and retry */
1378 			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1379 			rnd_orig->rnd_weight = rt_new->rt_weight;
1380 			error = EAGAIN;
1381 		}
1382 	}
1383 
1384 	RIB_WUNLOCK(rnh);
1385 
1386 	if (error == 0) {
1387 		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1388 
1389 		if (rnd_orig->rnd_nhop != NULL)
1390 			nhop_free_any(rnd_orig->rnd_nhop);
1391 
1392 	} else {
1393 		if (rnd_new->rnd_nhop != NULL)
1394 			nhop_free_any(rnd_new->rnd_nhop);
1395 	}
1396 
1397 	return (error);
1398 }
1399 
1400 /*
1401  * Performs modification of the routing table specified by @action.
1402  * Table is specified by @fibnum and sa_family in @info->rti_info[RTAX_DST].
1403  * Needs to be run in network epoch.
1404  *
1405  * Returns 0 on success and fills in @rc with action result.
1406  */
1407 int
1408 rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1409     struct rib_cmd_info *rc)
1410 {
1411 	int error;
1412 
1413 	switch (action) {
1414 	case RTM_ADD:
1415 		error = rib_add_route(fibnum, info, rc);
1416 		break;
1417 	case RTM_DELETE:
1418 		error = rib_del_route(fibnum, info, rc);
1419 		break;
1420 	case RTM_CHANGE:
1421 		error = rib_change_route(fibnum, info, rc);
1422 		break;
1423 	default:
1424 		error = ENOTSUP;
1425 	}
1426 
1427 	return (error);
1428 }
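/*
 * Illustrative usage sketch (hypothetical rtsock-style caller): a single
 * dispatch point for the three supported verbs:
 *
 *	error = rib_action(fibnum, RTM_ADD, &info, &rc);
 *
 * Any other action value (e.g. RTM_GET) returns ENOTSUP, so callers that
 * support more verbs must handle them before calling rib_action().
 */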
1429 
1430 struct rt_delinfo
1431 {
1432 	struct rib_head *rnh;
1433 	struct rtentry *head;
1434 	rib_filter_f_t *filter_f;
1435 	void *filter_arg;
1436 	int prio;
1437 	struct rib_cmd_info rc;
1438 };
1439 
1440 /*
1441  * Conditionally unlinks rtentries or paths from the radix tree based
1442  * on the callback data passed in @arg.
1443  */
1444 static int
1445 rt_checkdelroute(struct radix_node *rn, void *arg)
1446 {
1447 	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1448 	struct rtentry *rt = (struct rtentry *)rn;
1449 
1450 	if (rt_delete_conditional(di->rnh, rt, di->prio,
1451 	    di->filter_f, di->filter_arg, &di->rc) != 0)
1452 		return (0);
1453 
1454 	/*
1455 	 * Add deleted rtentries to the list to GC them
1456 	 *  after dropping the lock.
1457 	 *
1458 	 * XXX: Delayed notifications not implemented
1459 	 *  for nexthop updates.
1460 	 */
1461 	if (di->rc.rc_cmd == RTM_DELETE) {
1462 		/* Add to the list and return */
1463 		rt->rt_chain = di->head;
1464 		di->head = rt;
1465 #ifdef ROUTE_MPATH
1466 	} else {
1467 		/*
1468 		 * RTM_CHANGE to a different nexthop or nexthop group.
1469 		 * Free old multipath group.
1470 		 */
1471 		nhop_free_any(di->rc.rc_nh_old);
1472 #endif
1473 	}
1474 
1475 	return (0);
1476 }
1477 
1478 /*
1479  * Iterates over a routing table specified by @fibnum and @family and
1480  *  deletes elements marked by @filter_f.
1481  * @fibnum: rtable id
1482  * @family: AF_ address family
1483  * @filter_f: function returning non-zero value for items to delete
1484  * @arg: data to pass to the @filter_f function
1485  * @report: true if rtsock notification is needed.
1486  */
1487 void
1488 rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1489     bool report)
1490 {
1491 	struct rib_head *rnh;
1492 	struct rtentry *rt;
1493 	struct nhop_object *nh;
1494 	struct epoch_tracker et;
1495 
1496 	rnh = rt_tables_get_rnh(fibnum, family);
1497 	if (rnh == NULL)
1498 		return;
1499 
1500 	struct rt_delinfo di = {
1501 		.rnh = rnh,
1502 		.filter_f = filter_f,
1503 		.filter_arg = filter_arg,
1504 		.prio = NH_PRIORITY_NORMAL,
1505 	};
1506 
1507 	NET_EPOCH_ENTER(et);
1508 
1509 	RIB_WLOCK(rnh);
1510 	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1511 	RIB_WUNLOCK(rnh);
1512 
1513 	/* We might have something to reclaim. */
1514 	bzero(&di.rc, sizeof(di.rc));
1515 	di.rc.rc_cmd = RTM_DELETE;
1516 	while (di.head != NULL) {
1517 		rt = di.head;
1518 		di.head = rt->rt_chain;
1519 		rt->rt_chain = NULL;
1520 		nh = rt->rt_nhop;
1521 
1522 		di.rc.rc_rt = rt;
1523 		di.rc.rc_nh_old = nh;
1524 		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1525 
1526 		if (report) {
1527 #ifdef ROUTE_MPATH
1528 			struct nhgrp_object *nhg;
1529 			const struct weightened_nhop *wn;
1530 			uint32_t num_nhops;
1531 			if (NH_IS_NHGRP(nh)) {
1532 				nhg = (struct nhgrp_object *)nh;
1533 				wn = nhgrp_get_nhops(nhg, &num_nhops);
1534 				for (int i = 0; i < num_nhops; i++)
1535 					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1536 			} else
1537 #endif
1538 			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1539 		}
1540 		rt_free(rt);
1541 	}
1542 
1543 	NET_EPOCH_EXIT(et);
1544 }
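/*
 * Illustrative usage sketch (hypothetical caller; @gw is a non-const
 * struct sockaddr * supplied by the caller): drop every IPv4 route in
 * @fibnum whose nexthop uses gateway @gw, reusing rib_match_gw() from
 * above as the filter and reporting the deletions via rtsock:
 *
 *	rib_walk_del(fibnum, AF_INET, rib_match_gw, gw, true);
 */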
1545 
1546 static int
1547 rt_delete_unconditional(struct radix_node *rn, void *arg)
1548 {
1549 	struct rtentry *rt = RNTORT(rn);
1550 	struct rib_head *rnh = (struct rib_head *)arg;
1551 
1552 	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1553 	if (RNTORT(rn) == rt)
1554 		rt_free(rt);
1555 
1556 	return (0);
1557 }
1558 
1559 /*
1560  * Removes all routes from the routing table without executing notifications.
1561  * rtentries will be removed after the end of the current epoch.
1562  */
1563 static void
1564 rib_flush_routes(struct rib_head *rnh)
1565 {
1566 	RIB_WLOCK(rnh);
1567 	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1568 	RIB_WUNLOCK(rnh);
1569 }
1570 
1571 void
1572 rib_flush_routes_family(int family)
1573 {
1574 	struct rib_head *rnh;
1575 
1576 	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1577 		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1578 			rib_flush_routes(rnh);
1579 	}
1580 }
1581 
1582 const char *
1583 rib_print_family(int family)
1584 {
1585 	switch (family) {
1586 	case AF_INET:
1587 		return ("inet");
1588 	case AF_INET6:
1589 		return ("inet6");
1590 	case AF_LINK:
1591 		return ("link");
1592 	}
1593 	return ("unknown");
1594 }
1595