xref: /openbsd/sys/net/rtable.c (revision 4bdff4be)
1 /*	$OpenBSD: rtable.c,v 1.85 2023/11/12 17:51:40 bluhm Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that once allocated routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
52 struct srp	  *afmap;
53 uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
54 uint8_t		   af2idx_max;
55 
56 /* Array of routing table pointers. */
57 struct rtmap {
58 	unsigned int	   limit;
59 	void		 **tbl;
60 };
61 
62 /*
63  * Array of rtableid -> rdomain mapping.
64  *
65  * Only used for the first index as described above.
66  */
67 struct dommp {
68 	unsigned int	   limit;
69 	/*
70 	 * Array to get the routing domain and loopback interface related to
71 	 * a routing table. Format:
72 	 *
73 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
74 	 */
75 	unsigned int	  *value;
76 };
77 
78 unsigned int	   rtmap_limit = 0;
79 
80 void		   rtmap_init(void);
81 void		   rtmap_grow(unsigned int, sa_family_t);
82 void		   rtmap_dtor(void *, void *);
83 
84 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
85 
86 void		   rtable_init_backend(void);
87 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
88 void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	const struct domain	*dp;
94 	int			 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
110 /*
111  * Grow the size of the array of routing table for AF ``af'' to ``nlimit''.
112  */
113 void
114 rtmap_grow(unsigned int nlimit, sa_family_t af)
115 {
116 	struct rtmap	*map, *nmap;
117 	int		 i;
118 
119 	KERNEL_ASSERT_LOCKED();
120 
121 	KASSERT(nlimit > rtmap_limit);
122 
123 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
124 	nmap->limit = nlimit;
125 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
126 	    M_WAITOK|M_ZERO);
127 
128 	map = srp_get_locked(&afmap[af2idx[af]]);
129 	if (map != NULL) {
130 		KASSERT(map->limit == rtmap_limit);
131 
132 		for (i = 0; i < map->limit; i++)
133 			nmap->tbl[i] = map->tbl[i];
134 	}
135 
136 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
137 }
138 
139 void
140 rtmap_dtor(void *null, void *xmap)
141 {
142 	struct rtmap	*map = xmap;
143 
144 	/*
145 	 * doesn't need to be serialized since this is the last reference
146 	 * to this map. there's nothing to race against.
147 	 */
148 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
149 	free(map, M_RTABLE, sizeof(*map));
150 }
151 
152 void
153 rtable_init(void)
154 {
155 	const struct domain	*dp;
156 	int			 i;
157 
158 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
159 
160 	/* We use index 0 for the rtable/rdomain map. */
161 	af2idx_max = 1;
162 	memset(af2idx, 0, sizeof(af2idx));
163 
164 	/*
165 	 * Compute the maximum supported key length in case the routing
166 	 * table backend needs it.
167 	 */
168 	for (i = 0; (dp = domains[i]) != NULL; i++) {
169 		if (dp->dom_rtoffset == 0)
170 			continue;
171 
172 		af2idx[dp->dom_family] = af2idx_max++;
173 	}
174 	rtable_init_backend();
175 
176 	/*
177 	 * Allocate AF-to-id table now that we now how many AFs this
178 	 * kernel supports.
179 	 */
180 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
181 	    M_WAITOK|M_ZERO);
182 
183 	rtmap_init();
184 
185 	if (rtable_add(0) != 0)
186 		panic("unable to create default routing table");
187 
188 	rt_timer_init();
189 }
190 
191 int
192 rtable_add(unsigned int id)
193 {
194 	const struct domain	*dp;
195 	void			*tbl;
196 	struct rtmap		*map;
197 	struct dommp		*dmm;
198 	sa_family_t		 af;
199 	unsigned int		 off, alen;
200 	int			 i, error = 0;
201 
202 	if (id > RT_TABLEID_MAX)
203 		return (EINVAL);
204 
205 	KERNEL_LOCK();
206 
207 	if (rtable_exists(id))
208 		goto out;
209 
210 	for (i = 0; (dp = domains[i]) != NULL; i++) {
211 		if (dp->dom_rtoffset == 0)
212 			continue;
213 
214 		af = dp->dom_family;
215 		off = dp->dom_rtoffset;
216 		alen = dp->dom_maxplen;
217 
218 		if (id >= rtmap_limit)
219 			rtmap_grow(id + 1, af);
220 
221 		tbl = rtable_alloc(id, alen, off);
222 		if (tbl == NULL) {
223 			error = ENOMEM;
224 			goto out;
225 		}
226 
227 		map = srp_get_locked(&afmap[af2idx[af]]);
228 		map->tbl[id] = tbl;
229 	}
230 
231 	/* Reflect possible growth. */
232 	if (id >= rtmap_limit) {
233 		rtmap_grow(id + 1, 0);
234 		rtmap_limit = id + 1;
235 	}
236 
237 	/* Use main rtable/rdomain by default. */
238 	dmm = srp_get_locked(&afmap[0]);
239 	dmm->value[id] = 0;
240 out:
241 	KERNEL_UNLOCK();
242 
243 	return (error);
244 }
245 
246 void *
247 rtable_get(unsigned int rtableid, sa_family_t af)
248 {
249 	struct rtmap	*map;
250 	void		*tbl = NULL;
251 	struct srp_ref	 sr;
252 
253 	if (af >= nitems(af2idx) || af2idx[af] == 0)
254 		return (NULL);
255 
256 	map = srp_enter(&sr, &afmap[af2idx[af]]);
257 	if (rtableid < map->limit)
258 		tbl = map->tbl[rtableid];
259 	srp_leave(&sr);
260 
261 	return (tbl);
262 }
263 
264 int
265 rtable_exists(unsigned int rtableid)
266 {
267 	const struct domain	*dp;
268 	void			*tbl;
269 	int			 i;
270 
271 	for (i = 0; (dp = domains[i]) != NULL; i++) {
272 		if (dp->dom_rtoffset == 0)
273 			continue;
274 
275 		tbl = rtable_get(rtableid, dp->dom_family);
276 		if (tbl != NULL)
277 			return (1);
278 	}
279 
280 	return (0);
281 }
282 
283 int
284 rtable_empty(unsigned int rtableid)
285 {
286 	const struct domain	*dp;
287 	int			 i;
288 	struct art_root		*tbl;
289 
290 	for (i = 0; (dp = domains[i]) != NULL; i++) {
291 		if (dp->dom_rtoffset == 0)
292 			continue;
293 
294 		tbl = rtable_get(rtableid, dp->dom_family);
295 		if (tbl == NULL)
296 			continue;
297 		if (tbl->ar_root.ref != NULL)
298 			return (0);
299 	}
300 
301 	return (1);
302 }
303 
304 unsigned int
305 rtable_l2(unsigned int rtableid)
306 {
307 	struct dommp	*dmm;
308 	unsigned int	 rdomain = 0;
309 	struct srp_ref	 sr;
310 
311 	dmm = srp_enter(&sr, &afmap[0]);
312 	if (rtableid < dmm->limit)
313 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
314 	srp_leave(&sr);
315 
316 	return (rdomain);
317 }
318 
319 unsigned int
320 rtable_loindex(unsigned int rtableid)
321 {
322 	struct dommp	*dmm;
323 	unsigned int	 loifidx = 0;
324 	struct srp_ref	 sr;
325 
326 	dmm = srp_enter(&sr, &afmap[0]);
327 	if (rtableid < dmm->limit)
328 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
329 	srp_leave(&sr);
330 
331 	return (loifidx);
332 }
333 
334 void
335 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
336 {
337 	struct dommp	*dmm;
338 	unsigned int	 value;
339 
340 	KERNEL_ASSERT_LOCKED();
341 
342 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
343 		return;
344 
345 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
346 
347 	dmm = srp_get_locked(&afmap[0]);
348 	dmm->value[rtableid] = value;
349 }
350 
351 
352 static inline const uint8_t *satoaddr(struct art_root *,
353     const struct sockaddr *);
354 
355 int	an_match(struct art_node *, const struct sockaddr *, int);
356 void	rtentry_ref(void *, void *);
357 void	rtentry_unref(void *, void *);
358 
359 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
360 
361 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
362 
363 void
364 rtable_init_backend(void)
365 {
366 	art_init();
367 }
368 
369 void *
370 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
371 {
372 	return (art_alloc(rtableid, alen, off));
373 }
374 
375 int
376 rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
377 {
378 	struct art_root		*ar;
379 
380 	NET_ASSERT_LOCKED_EXCLUSIVE();
381 
382 	if ((ar = rtable_get(rtableid, af)) == NULL)
383 		return (EAFNOSUPPORT);
384 
385 	ar->ar_source = src;
386 
387 	return (0);
388 }
389 
390 struct sockaddr *
391 rtable_getsource(unsigned int rtableid, int af)
392 {
393 	struct art_root		*ar;
394 
395 	NET_ASSERT_LOCKED();
396 
397 	ar = rtable_get(rtableid, af);
398 	if (ar == NULL)
399 		return (NULL);
400 
401 	return (ar->ar_source);
402 }
403 
404 void
405 rtable_clearsource(unsigned int rtableid, struct sockaddr *src)
406 {
407 	struct sockaddr	*addr;
408 
409 	addr = rtable_getsource(rtableid, src->sa_family);
410 	if (addr && (addr->sa_len == src->sa_len)) {
411 		if (memcmp(src, addr, addr->sa_len) == 0) {
412 			rtable_setsource(rtableid, src->sa_family, NULL);
413 		}
414 	}
415 }
416 
417 struct rtentry *
418 rtable_lookup(unsigned int rtableid, const struct sockaddr *dst,
419     const struct sockaddr *mask, const struct sockaddr *gateway, uint8_t prio)
420 {
421 	struct art_root			*ar;
422 	struct art_node			*an;
423 	struct rtentry			*rt = NULL;
424 	struct srp_ref			 sr, nsr;
425 	const uint8_t			*addr;
426 	int				 plen;
427 
428 	ar = rtable_get(rtableid, dst->sa_family);
429 	if (ar == NULL)
430 		return (NULL);
431 
432 	addr = satoaddr(ar, dst);
433 
434 	/* No need for a perfect match. */
435 	if (mask == NULL) {
436 		an = art_match(ar, addr, &nsr);
437 		if (an == NULL)
438 			goto out;
439 	} else {
440 		plen = rtable_satoplen(dst->sa_family, mask);
441 		if (plen == -1)
442 			return (NULL);
443 
444 		an = art_lookup(ar, addr, plen, &nsr);
445 
446 		/* Make sure we've got a perfect match. */
447 		if (!an_match(an, dst, plen))
448 			goto out;
449 	}
450 
451 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
452 		if (prio != RTP_ANY &&
453 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
454 			continue;
455 
456 		if (gateway == NULL)
457 			break;
458 
459 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
460 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
461 			break;
462 	}
463 	if (rt != NULL)
464 		rtref(rt);
465 
466 	SRPL_LEAVE(&sr);
467 out:
468 	srp_leave(&nsr);
469 
470 	return (rt);
471 }
472 
473 struct rtentry *
474 rtable_match(unsigned int rtableid, const struct sockaddr *dst, uint32_t *src)
475 {
476 	struct art_root			*ar;
477 	struct art_node			*an;
478 	struct rtentry			*rt = NULL;
479 	struct srp_ref			 sr, nsr;
480 	const uint8_t			*addr;
481 	int				 hash;
482 
483 	ar = rtable_get(rtableid, dst->sa_family);
484 	if (ar == NULL)
485 		return (NULL);
486 
487 	addr = satoaddr(ar, dst);
488 
489 	an = art_match(ar, addr, &nsr);
490 	if (an == NULL)
491 		goto out;
492 
493 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
494 	if (rt == NULL) {
495 		SRPL_LEAVE(&sr);
496 		goto out;
497 	}
498 	rtref(rt);
499 	SRPL_LEAVE(&sr);
500 
501 	/* Gateway selection by Hash-Threshold (RFC 2992) */
502 	if ((hash = rt_hash(rt, dst, src)) != -1) {
503 		struct rtentry		*mrt;
504 		int			 threshold, npaths = 0;
505 
506 		KASSERT(hash <= 0xffff);
507 
508 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
509 			/* Only count nexthops with the same priority. */
510 			if (mrt->rt_priority == rt->rt_priority)
511 				npaths++;
512 		}
513 		SRPL_LEAVE(&sr);
514 
515 		threshold = (0xffff / npaths) + 1;
516 
517 		/*
518 		 * we have no protection against concurrent modification of the
519 		 * route list attached to the node, so we won't necessarily
520 		 * have the same number of routes.  for most modifications,
521 		 * we'll pick a route that we wouldn't have if we only saw the
522 		 * list before or after the change.  if we were going to use
523 		 * the last available route, but it got removed, we'll hit
524 		 * the end of the list and then pick the first route.
525 		 */
526 
527 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
528 		while (hash > threshold && mrt != NULL) {
529 			if (mrt->rt_priority == rt->rt_priority)
530 				hash -= threshold;
531 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
532 		}
533 
534 		if (mrt != NULL) {
535 			rtref(mrt);
536 			rtfree(rt);
537 			rt = mrt;
538 		}
539 		SRPL_LEAVE(&sr);
540 	}
541 out:
542 	srp_leave(&nsr);
543 	return (rt);
544 }
545 
546 int
547 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
548     const struct sockaddr *mask, const struct sockaddr *gateway, uint8_t prio,
549     struct rtentry *rt)
550 {
551 	struct rtentry			*mrt;
552 	struct srp_ref			 sr;
553 	struct art_root			*ar;
554 	struct art_node			*an, *prev;
555 	const uint8_t			*addr;
556 	int				 plen;
557 	unsigned int			 rt_flags;
558 	int				 error = 0;
559 
560 	ar = rtable_get(rtableid, dst->sa_family);
561 	if (ar == NULL)
562 		return (EAFNOSUPPORT);
563 
564 	addr = satoaddr(ar, dst);
565 	plen = rtable_satoplen(dst->sa_family, mask);
566 	if (plen == -1)
567 		return (EINVAL);
568 
569 	rtref(rt); /* guarantee rtfree won't do anything during insert */
570 	rw_enter_write(&ar->ar_lock);
571 
572 	/* Do not permit exactly the same dst/mask/gw pair. */
573 	an = art_lookup(ar, addr, plen, &sr);
574 	srp_leave(&sr); /* an can't go away while we have the lock */
575 	if (an_match(an, dst, plen)) {
576 		struct rtentry  *mrt;
577 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
578 
579 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
580 			if (prio != RTP_ANY &&
581 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
582 				continue;
583 
584 			if (!mpathok ||
585 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
586 			    memcmp(mrt->rt_gateway, gateway,
587 			    gateway->sa_len) == 0)) {
588 				error = EEXIST;
589 				goto leave;
590 			}
591 		}
592 	}
593 
594 	an = art_get(plen);
595 	if (an == NULL) {
596 		error = ENOBUFS;
597 		goto leave;
598 	}
599 
600 	/* prepare for immediate operation if insert succeeds */
601 	rt_flags = rt->rt_flags;
602 	rt->rt_flags &= ~RTF_MPATH;
603 	rt->rt_dest = dst;
604 	rt->rt_plen = plen;
605 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
606 
607 	prev = art_insert(ar, an, addr, plen);
608 	if (prev != an) {
609 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
610 		    rt_next);
611 		rt->rt_flags = rt_flags;
612 		art_put(an);
613 
614 		if (prev == NULL) {
615 			error = ESRCH;
616 			goto leave;
617 		}
618 
619 		an = prev;
620 
621 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
622 		KASSERT(mrt != NULL);
623 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
624 
625 		/*
626 		 * An ART node with the same destination/netmask already
627 		 * exists, MPATH conflict must have been already checked.
628 		 */
629 		if (rt->rt_flags & RTF_MPATH) {
630 			/*
631 			 * Only keep the RTF_MPATH flag if two routes have
632 			 * the same gateway.
633 			 */
634 			rt->rt_flags &= ~RTF_MPATH;
635 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
636 				if (mrt->rt_priority == prio) {
637 					mrt->rt_flags |= RTF_MPATH;
638 					rt->rt_flags |= RTF_MPATH;
639 				}
640 			}
641 		}
642 
643 		/* Put newly inserted entry at the right place. */
644 		rtable_mpath_insert(an, rt);
645 	}
646 leave:
647 	rw_exit_write(&ar->ar_lock);
648 	rtfree(rt);
649 	return (error);
650 }
651 
652 int
653 rtable_delete(unsigned int rtableid, const struct sockaddr *dst,
654     const struct sockaddr *mask, struct rtentry *rt)
655 {
656 	struct art_root			*ar;
657 	struct art_node			*an;
658 	struct srp_ref			 sr;
659 	const uint8_t			*addr;
660 	int				 plen;
661 	struct rtentry			*mrt;
662 	int				 npaths = 0;
663 	int				 error = 0;
664 
665 	ar = rtable_get(rtableid, dst->sa_family);
666 	if (ar == NULL)
667 		return (EAFNOSUPPORT);
668 
669 	addr = satoaddr(ar, dst);
670 	plen = rtable_satoplen(dst->sa_family, mask);
671 	if (plen == -1)
672 		return (EINVAL);
673 
674 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
675 	rw_enter_write(&ar->ar_lock);
676 	an = art_lookup(ar, addr, plen, &sr);
677 	srp_leave(&sr); /* an can't go away while we have the lock */
678 
679 	/* Make sure we've got a perfect match. */
680 	if (!an_match(an, dst, plen)) {
681 		error = ESRCH;
682 		goto leave;
683 	}
684 
685 	/*
686 	 * If other multipath route entries are still attached to
687 	 * this ART node we only have to unlink it.
688 	 */
689 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
690 		npaths++;
691 
692 	if (npaths > 1) {
693 		KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
694 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
695 		    rt_next);
696 
697 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
698 		if (npaths == 2)
699 			mrt->rt_flags &= ~RTF_MPATH;
700 
701 		goto leave;
702 	}
703 
704 	if (art_delete(ar, an, addr, plen) == NULL)
705 		panic("art_delete failed to find node %p", an);
706 
707 	KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
708 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
709 	art_put(an);
710 
711 leave:
712 	rw_exit_write(&ar->ar_lock);
713 	rtfree(rt);
714 
715 	return (error);
716 }
717 
718 struct rtable_walk_cookie {
719 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
720 	void		 *rwc_arg;
721 	struct rtentry	**rwc_prt;
722 	unsigned int	  rwc_rid;
723 };
724 
725 /*
726  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
727  */
728 int
729 rtable_walk_helper(struct art_node *an, void *xrwc)
730 {
731 	struct srp_ref			 sr;
732 	struct rtable_walk_cookie	*rwc = xrwc;
733 	struct rtentry			*rt;
734 	int				 error = 0;
735 
736 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
737 		error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
738 		if (error != 0)
739 			break;
740 	}
741 	if (rwc->rwc_prt != NULL && rt != NULL) {
742 		rtref(rt);
743 		*rwc->rwc_prt = rt;
744 	}
745 	SRPL_LEAVE(&sr);
746 
747 	return (error);
748 }
749 
750 int
751 rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
752     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
753 {
754 	struct art_root			*ar;
755 	struct rtable_walk_cookie	 rwc;
756 	int				 error;
757 
758 	ar = rtable_get(rtableid, af);
759 	if (ar == NULL)
760 		return (EAFNOSUPPORT);
761 
762 	rwc.rwc_func = func;
763 	rwc.rwc_arg = arg;
764 	rwc.rwc_prt = prt;
765 	rwc.rwc_rid = rtableid;
766 
767 	error = art_walk(ar, rtable_walk_helper, &rwc);
768 
769 	return (error);
770 }
771 
772 struct rtentry *
773 rtable_iterate(struct rtentry *rt0)
774 {
775 	struct rtentry *rt = NULL;
776 	struct srp_ref sr;
777 
778 	rt = SRPL_NEXT(&sr, rt0, rt_next);
779 	if (rt != NULL)
780 		rtref(rt);
781 	SRPL_LEAVE(&sr);
782 	rtfree(rt0);
783 	return (rt);
784 }
785 
786 int
787 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
788 {
789 	return (1);
790 }
791 
792 int
793 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
794     int plen, uint8_t prio, struct rtentry *rt)
795 {
796 	struct art_root			*ar;
797 	struct art_node			*an;
798 	struct srp_ref			 sr;
799 	const uint8_t			*addr;
800 	int				 error = 0;
801 
802 	ar = rtable_get(rtableid, dst->sa_family);
803 	if (ar == NULL)
804 		return (EAFNOSUPPORT);
805 
806 	addr = satoaddr(ar, dst);
807 
808 	rw_enter_write(&ar->ar_lock);
809 	an = art_lookup(ar, addr, plen, &sr);
810 	srp_leave(&sr); /* an can't go away while we have the lock */
811 
812 	/* Make sure we've got a perfect match. */
813 	if (!an_match(an, dst, plen)) {
814 		error = ESRCH;
815 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
816 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
817 		/*
818 		 * If there's only one entry on the list do not go
819 		 * through an insert/remove cycle.  This is done to
820 		 * guarantee that ``an->an_rtlist''  is never empty
821 		 * when a node is in the tree.
822 		 */
823 		rt->rt_priority = prio;
824 	} else {
825 		rtref(rt); /* keep rt alive in between remove and insert */
826 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
827 		    rt, rtentry, rt_next);
828 		rt->rt_priority = prio;
829 		rtable_mpath_insert(an, rt);
830 		rtfree(rt);
831 		error = EAGAIN;
832 	}
833 	rw_exit_write(&ar->ar_lock);
834 
835 	return (error);
836 }
837 
838 void
839 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
840 {
841 	struct rtentry			*mrt, *prt = NULL;
842 	uint8_t				 prio = rt->rt_priority;
843 
844 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
845 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
846 		return;
847 	}
848 
849 	/* Iterate until we find the route to be placed after ``rt''. */
850 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
851 		prt = mrt;
852 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
853 	}
854 
855 	if (mrt->rt_priority <= prio) {
856 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
857 	} else if (prt != NULL) {
858 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
859 	} else {
860 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
861 	}
862 }
863 
864 /*
865  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
866  */
867 int
868 an_match(struct art_node *an, const struct sockaddr *dst, int plen)
869 {
870 	struct rtentry			*rt;
871 	struct srp_ref			 sr;
872 	int				 match;
873 
874 	if (an == NULL || an->an_plen != plen)
875 		return (0);
876 
877 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
878 	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
879 	SRPL_LEAVE(&sr);
880 
881 	return (match);
882 }
883 
884 void
885 rtentry_ref(void *null, void *xrt)
886 {
887 	struct rtentry *rt = xrt;
888 
889 	rtref(rt);
890 }
891 
892 void
893 rtentry_unref(void *null, void *xrt)
894 {
895 	struct rtentry *rt = xrt;
896 
897 	rtfree(rt);
898 }
899 
900 /*
901  * Return a pointer to the address (key).  This is an heritage from the
902  * BSD radix tree needed to skip the non-address fields from the flavor
903  * of "struct sockaddr" used by this routing table.
904  */
905 static inline const uint8_t *
906 satoaddr(struct art_root *at, const struct sockaddr *sa)
907 {
908 	return (((const uint8_t *)sa) + at->ar_off);
909 }
910 
911 /*
912  * Return the prefix length of a mask.
913  */
914 int
915 rtable_satoplen(sa_family_t af, const struct sockaddr *mask)
916 {
917 	const struct domain	*dp;
918 	uint8_t			*ap, *ep;
919 	int			 mlen, plen = 0;
920 	int			 i;
921 
922 	for (i = 0; (dp = domains[i]) != NULL; i++) {
923 		if (dp->dom_rtoffset == 0)
924 			continue;
925 
926 		if (af == dp->dom_family)
927 			break;
928 	}
929 	if (dp == NULL)
930 		return (-1);
931 
932 	/* Host route */
933 	if (mask == NULL)
934 		return (dp->dom_maxplen);
935 
936 	mlen = mask->sa_len;
937 
938 	/* Default route */
939 	if (mlen == 0)
940 		return (0);
941 
942 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
943 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
944 	if (ap > ep)
945 		return (-1);
946 
947 	/* Trim trailing zeroes. */
948 	while (ap < ep && ep[-1] == 0)
949 		ep--;
950 
951 	if (ap == ep)
952 		return (0);
953 
954 	/* "Beauty" adapted from sbin/route/show.c ... */
955 	while (ap < ep) {
956 		switch (*ap++) {
957 		case 0xff:
958 			plen += 8;
959 			break;
960 		case 0xfe:
961 			plen += 7;
962 			goto out;
963 		case 0xfc:
964 			plen += 6;
965 			goto out;
966 		case 0xf8:
967 			plen += 5;
968 			goto out;
969 		case 0xf0:
970 			plen += 4;
971 			goto out;
972 		case 0xe0:
973 			plen += 3;
974 			goto out;
975 		case 0xc0:
976 			plen += 2;
977 			goto out;
978 		case 0x80:
979 			plen += 1;
980 			goto out;
981 		default:
982 			/* Non contiguous mask. */
983 			return (-1);
984 		}
985 	}
986 
987 out:
988 	if (plen > dp->dom_maxplen || ap != ep)
989 		return -1;
990 
991 	return (plen);
992 }
993