xref: /openbsd/sys/net/rtable.c (revision 91defc8b)
1 /*	$OpenBSD: rtable.c,v 1.87 2024/04/09 12:53:08 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 #include <net/art.h>
34 
35 /*
36  * Structures used by rtable_get() to retrieve the corresponding
37  * routing table for a given pair of ``af'' and ``rtableid''.
38  *
39  * Note that once allocated, routing table heads are never freed.
40  * This way we do not need to reference count them.
41  *
42  *	afmap		    rtmap/dommp
43  *   -----------          ---------     -----
44  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
45  *   -----------          ---------     -----   to rdomain/loopback (=value).
46  *   | AF_INET |.
47  *   ----------- `.       .---------.     .---------.
48  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
49  *   -----------          '---------'     '---------'	IPv4 routing tables
50  *   | AF_MPLS |					indexed by ``rtableid''.
51  *   -----------
52  */
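/*
 * Illustrative sketch of the lookup path implemented by rtable_get()
 * below: ``af'' picks a slot in ``afmap'' via af2idx[], the SRP entered
 * there yields a struct rtmap, and ``rtableid'' indexes its array of
 * table pointers:
 *
 *	map = srp_enter(&sr, &afmap[af2idx[af]]);
 *	tbl = (rtableid < map->limit) ? map->tbl[rtableid] : NULL;
 *	srp_leave(&sr);
 */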
53 struct srp	  *afmap;
54 uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
55 uint8_t		   af2idx_max;
56 
57 /* Array of routing table pointers. */
58 struct rtmap {
59 	unsigned int	   limit;
60 	void		 **tbl;
61 };
62 
63 /*
64  * Array of rtableid -> rdomain mapping.
65  *
66  * Only used for the first index as described above.
67  */
68 struct dommp {
69 	unsigned int	   limit;
70 	/*
71 	 * Array to get the routing domain and loopback interface related to
72 	 * a routing table. Format:
73 	 *
74 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
75 	 */
76 	unsigned int	  *value;
77 };
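
/*
 * Packing and unpacking of ``value'' as done by rtable_l2set(),
 * rtable_l2() and rtable_loindex() below:
 *
 *	value   = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
 *	rdomain = value & RT_TABLEID_MASK;
 *	loifidx = value >> RT_TABLEID_BITS;
 */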
78 
79 unsigned int	   rtmap_limit = 0;
80 
81 void		   rtmap_init(void);
82 void		   rtmap_grow(unsigned int, sa_family_t);
83 void		   rtmap_dtor(void *, void *);
84 
85 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
86 
87 void		   rtable_init_backend(void);
88 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
89 void		  *rtable_get(unsigned int, sa_family_t);
90 
91 void
92 rtmap_init(void)
93 {
94 	const struct domain	*dp;
95 	int			 i;
96 
97 	/* Start with a single table for every domain that requires it. */
98 	for (i = 0; (dp = domains[i]) != NULL; i++) {
99 		if (dp->dom_rtoffset == 0)
100 			continue;
101 
102 		rtmap_grow(1, dp->dom_family);
103 	}
104 
105 	/* Initialize the rtableid->rdomain mapping table. */
106 	rtmap_grow(1, 0);
107 
108 	rtmap_limit = 1;
109 }
110 
111 /*
112  * Grow the size of the array of routing tables for AF ``af'' to ``nlimit''.
113  */
114 void
115 rtmap_grow(unsigned int nlimit, sa_family_t af)
116 {
117 	struct rtmap	*map, *nmap;
118 	int		 i;
119 
120 	KERNEL_ASSERT_LOCKED();
121 
122 	KASSERT(nlimit > rtmap_limit);
123 
124 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
125 	nmap->limit = nlimit;
126 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
127 	    M_WAITOK|M_ZERO);
128 
129 	map = srp_get_locked(&afmap[af2idx[af]]);
130 	if (map != NULL) {
131 		KASSERT(map->limit == rtmap_limit);
132 
133 		for (i = 0; i < map->limit; i++)
134 			nmap->tbl[i] = map->tbl[i];
135 	}
136 
137 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
138 }
139 
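/*
 * A minimal sketch of the update scheme, assuming the kernel lock
 * serializes writers: readers dereference a slot with srp_enter() and
 * never free what they see, rtmap_grow() publishes the larger copy with
 * srp_update_locked(), and the old struct rtmap is reclaimed by
 * rtmap_dtor() below once its last SRP reference is gone.
 */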
140 void
141 rtmap_dtor(void *null, void *xmap)
142 {
143 	struct rtmap	*map = xmap;
144 
145 	/*
146 	 * doesn't need to be serialized since this is the last reference
147 	 * to this map. there's nothing to race against.
148 	 */
149 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
150 	free(map, M_RTABLE, sizeof(*map));
151 }
152 
153 void
154 rtable_init(void)
155 {
156 	const struct domain	*dp;
157 	int			 i;
158 
159 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
160 
161 	/* We use index 0 for the rtable/rdomain map. */
162 	af2idx_max = 1;
163 	memset(af2idx, 0, sizeof(af2idx));
164 
165 	/*
166 	 * Assign a compact index to each address family with a routing
167 	 * table so that only the supported AFs get a slot in ``afmap''.
168 	 */
169 	for (i = 0; (dp = domains[i]) != NULL; i++) {
170 		if (dp->dom_rtoffset == 0)
171 			continue;
172 
173 		af2idx[dp->dom_family] = af2idx_max++;
174 	}
175 	rtable_init_backend();
176 
177 	/*
178 	 * Allocate the per-AF array of table maps now that we know how
179 	 * many AFs this kernel supports.
180 	 */
181 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
182 	    M_WAITOK|M_ZERO);
183 
184 	rtmap_init();
185 
186 	if (rtable_add(0) != 0)
187 		panic("unable to create default routing table");
188 
189 	rt_timer_init();
190 }
191 
192 int
193 rtable_add(unsigned int id)
194 {
195 	const struct domain	*dp;
196 	void			*tbl;
197 	struct rtmap		*map;
198 	struct dommp		*dmm;
199 	sa_family_t		 af;
200 	unsigned int		 off, alen;
201 	int			 i, error = 0;
202 
203 	if (id > RT_TABLEID_MAX)
204 		return (EINVAL);
205 
206 	KERNEL_LOCK();
207 
208 	if (rtable_exists(id))
209 		goto out;
210 
211 	for (i = 0; (dp = domains[i]) != NULL; i++) {
212 		if (dp->dom_rtoffset == 0)
213 			continue;
214 
215 		af = dp->dom_family;
216 		off = dp->dom_rtoffset;
217 		alen = dp->dom_maxplen;
218 
219 		if (id >= rtmap_limit)
220 			rtmap_grow(id + 1, af);
221 
222 		tbl = rtable_alloc(id, alen, off);
223 		if (tbl == NULL) {
224 			error = ENOMEM;
225 			goto out;
226 		}
227 
228 		map = srp_get_locked(&afmap[af2idx[af]]);
229 		map->tbl[id] = tbl;
230 	}
231 
232 	/* Reflect possible growth. */
233 	if (id >= rtmap_limit) {
234 		rtmap_grow(id + 1, 0);
235 		rtmap_limit = id + 1;
236 	}
237 
238 	/* Use main rtable/rdomain by default. */
239 	dmm = srp_get_locked(&afmap[0]);
240 	dmm->value[id] = 0;
241 out:
242 	KERNEL_UNLOCK();
243 
244 	return (error);
245 }
246 
247 void *
248 rtable_get(unsigned int rtableid, sa_family_t af)
249 {
250 	struct rtmap	*map;
251 	void		*tbl = NULL;
252 	struct srp_ref	 sr;
253 
254 	if (af >= nitems(af2idx) || af2idx[af] == 0)
255 		return (NULL);
256 
257 	map = srp_enter(&sr, &afmap[af2idx[af]]);
258 	if (rtableid < map->limit)
259 		tbl = map->tbl[rtableid];
260 	srp_leave(&sr);
261 
262 	return (tbl);
263 }
264 
265 int
266 rtable_exists(unsigned int rtableid)
267 {
268 	const struct domain	*dp;
269 	void			*tbl;
270 	int			 i;
271 
272 	for (i = 0; (dp = domains[i]) != NULL; i++) {
273 		if (dp->dom_rtoffset == 0)
274 			continue;
275 
276 		tbl = rtable_get(rtableid, dp->dom_family);
277 		if (tbl != NULL)
278 			return (1);
279 	}
280 
281 	return (0);
282 }
283 
284 int
285 rtable_empty(unsigned int rtableid)
286 {
287 	const struct domain	*dp;
288 	int			 i;
289 	struct art_root		*tbl;
290 
291 	for (i = 0; (dp = domains[i]) != NULL; i++) {
292 		if (dp->dom_rtoffset == 0)
293 			continue;
294 
295 		tbl = rtable_get(rtableid, dp->dom_family);
296 		if (tbl == NULL)
297 			continue;
298 		if (tbl->ar_root.ref != NULL)
299 			return (0);
300 	}
301 
302 	return (1);
303 }
304 
305 unsigned int
306 rtable_l2(unsigned int rtableid)
307 {
308 	struct dommp	*dmm;
309 	unsigned int	 rdomain = 0;
310 	struct srp_ref	 sr;
311 
312 	dmm = srp_enter(&sr, &afmap[0]);
313 	if (rtableid < dmm->limit)
314 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
315 	srp_leave(&sr);
316 
317 	return (rdomain);
318 }
319 
320 unsigned int
321 rtable_loindex(unsigned int rtableid)
322 {
323 	struct dommp	*dmm;
324 	unsigned int	 loifidx = 0;
325 	struct srp_ref	 sr;
326 
327 	dmm = srp_enter(&sr, &afmap[0]);
328 	if (rtableid < dmm->limit)
329 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
330 	srp_leave(&sr);
331 
332 	return (loifidx);
333 }
334 
335 void
336 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
337 {
338 	struct dommp	*dmm;
339 	unsigned int	 value;
340 
341 	KERNEL_ASSERT_LOCKED();
342 
343 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
344 		return;
345 
346 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
347 
348 	dmm = srp_get_locked(&afmap[0]);
349 	dmm->value[rtableid] = value;
350 }
351 
352 
353 static inline const uint8_t *satoaddr(struct art_root *,
354     const struct sockaddr *);
355 
356 int	an_match(struct art_node *, const struct sockaddr *, int);
357 void	rtentry_ref(void *, void *);
358 void	rtentry_unref(void *, void *);
359 
360 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
361 
362 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
363 
364 void
365 rtable_init_backend(void)
366 {
367 	art_init();
368 }
369 
370 void *
371 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
372 {
373 	return (art_alloc(rtableid, alen, off));
374 }
375 
376 int
377 rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
378 {
379 	struct art_root		*ar;
380 
381 	NET_ASSERT_LOCKED_EXCLUSIVE();
382 
383 	if ((ar = rtable_get(rtableid, af)) == NULL)
384 		return (EAFNOSUPPORT);
385 
386 	ar->ar_source = src;
387 
388 	return (0);
389 }
390 
391 struct sockaddr *
392 rtable_getsource(unsigned int rtableid, int af)
393 {
394 	struct art_root		*ar;
395 
396 	NET_ASSERT_LOCKED();
397 
398 	ar = rtable_get(rtableid, af);
399 	if (ar == NULL)
400 		return (NULL);
401 
402 	return (ar->ar_source);
403 }
404 
405 void
406 rtable_clearsource(unsigned int rtableid, struct sockaddr *src)
407 {
408 	struct sockaddr	*addr;
409 
410 	addr = rtable_getsource(rtableid, src->sa_family);
411 	if (addr && (addr->sa_len == src->sa_len)) {
412 		if (memcmp(src, addr, addr->sa_len) == 0) {
413 			rtable_setsource(rtableid, src->sa_family, NULL);
414 		}
415 	}
416 }
417 
418 struct rtentry *
419 rtable_lookup(unsigned int rtableid, const struct sockaddr *dst,
420     const struct sockaddr *mask, const struct sockaddr *gateway, uint8_t prio)
421 {
422 	struct art_root			*ar;
423 	struct art_node			*an;
424 	struct rtentry			*rt = NULL;
425 	struct srp_ref			 sr, nsr;
426 	const uint8_t			*addr;
427 	int				 plen;
428 
429 	ar = rtable_get(rtableid, dst->sa_family);
430 	if (ar == NULL)
431 		return (NULL);
432 
433 	addr = satoaddr(ar, dst);
434 
435 	/* No need for a perfect match. */
436 	if (mask == NULL) {
437 		an = art_match(ar, addr, &nsr);
438 		if (an == NULL)
439 			goto out;
440 	} else {
441 		plen = rtable_satoplen(dst->sa_family, mask);
442 		if (plen == -1)
443 			return (NULL);
444 
445 		an = art_lookup(ar, addr, plen, &nsr);
446 
447 		/* Make sure we've got a perfect match. */
448 		if (!an_match(an, dst, plen))
449 			goto out;
450 	}
451 
452 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
453 		if (prio != RTP_ANY &&
454 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
455 			continue;
456 
457 		if (gateway == NULL)
458 			break;
459 
460 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
461 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
462 			break;
463 	}
464 	if (rt != NULL)
465 		rtref(rt);
466 
467 	SRPL_LEAVE(&sr);
468 out:
469 	srp_leave(&nsr);
470 
471 	return (rt);
472 }
473 
474 struct rtentry *
475 rtable_match(unsigned int rtableid, const struct sockaddr *dst, uint32_t *src)
476 {
477 	struct art_root			*ar;
478 	struct art_node			*an;
479 	struct rtentry			*rt = NULL;
480 	struct srp_ref			 sr, nsr;
481 	const uint8_t			*addr;
482 	int				 hash;
483 
484 	ar = rtable_get(rtableid, dst->sa_family);
485 	if (ar == NULL)
486 		return (NULL);
487 
488 	addr = satoaddr(ar, dst);
489 
490 	an = art_match(ar, addr, &nsr);
491 	if (an == NULL)
492 		goto out;
493 
494 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
495 	if (rt == NULL) {
496 		SRPL_LEAVE(&sr);
497 		goto out;
498 	}
499 	rtref(rt);
500 	SRPL_LEAVE(&sr);
501 
502 	/* Gateway selection by Hash-Threshold (RFC 2992) */
503 	if ((hash = rt_hash(rt, dst, src)) != -1) {
504 		struct rtentry		*mrt;
505 		int			 threshold, npaths = 0;
506 
507 		KASSERT(hash <= 0xffff);
508 
509 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
510 			/* Only count nexthops with the same priority. */
511 			if (mrt->rt_priority == rt->rt_priority)
512 				npaths++;
513 		}
514 		SRPL_LEAVE(&sr);
515 
516 		threshold = (0xffff / npaths) + 1;
517 
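		/*
		 * Worked example (illustrative): with four routes of equal
		 * priority the threshold is 0xffff / 4 + 1 == 0x4000.  A
		 * hash of 0x9000 is reduced to 0x5000 and then 0x1000 by
		 * the loop below, so the third route is selected.
		 */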
518 		/*
519 		 * we have no protection against concurrent modification of the
520 		 * route list attached to the node, so we won't necessarily
521 		 * have the same number of routes.  for most modifications,
522 		 * we'll pick a route that we wouldn't have if we only saw the
523 		 * list before or after the change.  if we were going to use
524 		 * the last available route, but it got removed, we'll hit
525 		 * the end of the list and then pick the first route.
526 		 */
527 
528 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
529 		while (hash > threshold && mrt != NULL) {
530 			if (mrt->rt_priority == rt->rt_priority)
531 				hash -= threshold;
532 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
533 		}
534 
535 		if (mrt != NULL) {
536 			rtref(mrt);
537 			rtfree(rt);
538 			rt = mrt;
539 		}
540 		SRPL_LEAVE(&sr);
541 	}
542 out:
543 	srp_leave(&nsr);
544 	return (rt);
545 }
546 
547 int
548 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
549     const struct sockaddr *mask, const struct sockaddr *gateway, uint8_t prio,
550     struct rtentry *rt)
551 {
552 	struct rtentry			*mrt;
553 	struct srp_ref			 sr;
554 	struct art_root			*ar;
555 	struct art_node			*an, *prev;
556 	const uint8_t			*addr;
557 	int				 plen;
558 	unsigned int			 rt_flags;
559 	int				 error = 0;
560 
561 	ar = rtable_get(rtableid, dst->sa_family);
562 	if (ar == NULL)
563 		return (EAFNOSUPPORT);
564 
565 	addr = satoaddr(ar, dst);
566 	plen = rtable_satoplen(dst->sa_family, mask);
567 	if (plen == -1)
568 		return (EINVAL);
569 
570 	rtref(rt); /* guarantee rtfree won't do anything during insert */
571 	rw_enter_write(&ar->ar_lock);
572 
573 	/* Do not permit exactly the same dst/mask/gw pair. */
574 	an = art_lookup(ar, addr, plen, &sr);
575 	srp_leave(&sr); /* an can't go away while we have the lock */
576 	if (an_match(an, dst, plen)) {
577 		struct rtentry  *mrt;
578 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
579 
580 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
581 			if (prio != RTP_ANY &&
582 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
583 				continue;
584 
585 			if (!mpathok ||
586 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
587 			    memcmp(mrt->rt_gateway, gateway,
588 			    gateway->sa_len) == 0)) {
589 				error = EEXIST;
590 				goto leave;
591 			}
592 		}
593 	}
594 
595 	an = art_get(plen);
596 	if (an == NULL) {
597 		error = ENOBUFS;
598 		goto leave;
599 	}
600 
601 	/* prepare for immediate operation if insert succeeds */
602 	rt_flags = rt->rt_flags;
603 	rt->rt_flags &= ~RTF_MPATH;
604 	rt->rt_dest = dst;
605 	rt->rt_plen = plen;
606 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
607 
608 	prev = art_insert(ar, an, addr, plen);
609 	if (prev != an) {
610 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
611 		    rt_next);
612 		rt->rt_flags = rt_flags;
613 		art_put(an);
614 
615 		if (prev == NULL) {
616 			error = ESRCH;
617 			goto leave;
618 		}
619 
620 		an = prev;
621 
622 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
623 		KASSERT(mrt != NULL);
624 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
625 
626 		/*
627 		 * An ART node with the same destination/netmask already
628 		 * exists; the MPATH conflict must already have been checked.
629 		 */
630 		if (rt->rt_flags & RTF_MPATH) {
631 			/*
632 			 * Only keep the RTF_MPATH flag if two routes have
633 			 * the same gateway.
634 			 */
635 			rt->rt_flags &= ~RTF_MPATH;
636 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
637 				if (mrt->rt_priority == prio) {
638 					mrt->rt_flags |= RTF_MPATH;
639 					rt->rt_flags |= RTF_MPATH;
640 				}
641 			}
642 		}
643 
644 		/* Put newly inserted entry at the right place. */
645 		rtable_mpath_insert(an, rt);
646 	}
647 leave:
648 	rw_exit_write(&ar->ar_lock);
649 	rtfree(rt);
650 	return (error);
651 }
652 
653 int
654 rtable_delete(unsigned int rtableid, const struct sockaddr *dst,
655     const struct sockaddr *mask, struct rtentry *rt)
656 {
657 	struct art_root			*ar;
658 	struct art_node			*an;
659 	struct srp_ref			 sr;
660 	const uint8_t			*addr;
661 	int				 plen;
662 	struct rtentry			*mrt;
663 	int				 npaths = 0;
664 	int				 error = 0;
665 
666 	ar = rtable_get(rtableid, dst->sa_family);
667 	if (ar == NULL)
668 		return (EAFNOSUPPORT);
669 
670 	addr = satoaddr(ar, dst);
671 	plen = rtable_satoplen(dst->sa_family, mask);
672 	if (plen == -1)
673 		return (EINVAL);
674 
675 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
676 	rw_enter_write(&ar->ar_lock);
677 	an = art_lookup(ar, addr, plen, &sr);
678 	srp_leave(&sr); /* an can't go away while we have the lock */
679 
680 	/* Make sure we've got a perfect match. */
681 	if (!an_match(an, dst, plen)) {
682 		error = ESRCH;
683 		goto leave;
684 	}
685 
686 	/*
687 	 * If other multipath route entries are still attached to
688 	 * this ART node we only have to unlink it.
689 	 */
690 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
691 		npaths++;
692 
693 	if (npaths > 1) {
694 		KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
695 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
696 		    rt_next);
697 
698 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
699 		if (npaths == 2)
700 			mrt->rt_flags &= ~RTF_MPATH;
701 
702 		goto leave;
703 	}
704 
705 	if (art_delete(ar, an, addr, plen) == NULL)
706 		panic("art_delete failed to find node %p", an);
707 
708 	KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
709 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
710 	art_put(an);
711 
712 leave:
713 	rw_exit_write(&ar->ar_lock);
714 	rtfree(rt);
715 
716 	return (error);
717 }
718 
719 struct rtable_walk_cookie {
720 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
721 	void		 *rwc_arg;
722 	struct rtentry	**rwc_prt;
723 	unsigned int	  rwc_rid;
724 };
725 
726 /*
727  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
728  */
729 int
730 rtable_walk_helper(struct art_node *an, void *xrwc)
731 {
732 	struct srp_ref			 sr;
733 	struct rtable_walk_cookie	*rwc = xrwc;
734 	struct rtentry			*rt;
735 	int				 error = 0;
736 
737 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
738 		error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
739 		if (error != 0)
740 			break;
741 	}
742 	if (rwc->rwc_prt != NULL && rt != NULL) {
743 		rtref(rt);
744 		*rwc->rwc_prt = rt;
745 	}
746 	SRPL_LEAVE(&sr);
747 
748 	return (error);
749 }
750 
751 int
752 rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
753     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
754 {
755 	struct art_root			*ar;
756 	struct rtable_walk_cookie	 rwc;
757 	int				 error;
758 
759 	ar = rtable_get(rtableid, af);
760 	if (ar == NULL)
761 		return (EAFNOSUPPORT);
762 
763 	rwc.rwc_func = func;
764 	rwc.rwc_arg = arg;
765 	rwc.rwc_prt = prt;
766 	rwc.rwc_rid = rtableid;
767 
768 	error = art_walk(ar, rtable_walk_helper, &rwc);
769 
770 	return (error);
771 }
772 
773 struct rtentry *
774 rtable_iterate(struct rtentry *rt0)
775 {
776 	struct rtentry *rt = NULL;
777 	struct srp_ref sr;
778 
779 	rt = SRPL_NEXT(&sr, rt0, rt_next);
780 	if (rt != NULL)
781 		rtref(rt);
782 	SRPL_LEAVE(&sr);
783 	rtfree(rt0);
784 	return (rt);
785 }
786 
787 int
788 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
789 {
790 	return (1);
791 }
792 
793 int
794 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
795     int plen, uint8_t prio, struct rtentry *rt)
796 {
797 	struct art_root			*ar;
798 	struct art_node			*an;
799 	struct srp_ref			 sr;
800 	const uint8_t			*addr;
801 	int				 error = 0;
802 
803 	ar = rtable_get(rtableid, dst->sa_family);
804 	if (ar == NULL)
805 		return (EAFNOSUPPORT);
806 
807 	addr = satoaddr(ar, dst);
808 
809 	rw_enter_write(&ar->ar_lock);
810 	an = art_lookup(ar, addr, plen, &sr);
811 	srp_leave(&sr); /* an can't go away while we have the lock */
812 
813 	/* Make sure we've got a perfect match. */
814 	if (!an_match(an, dst, plen)) {
815 		error = ESRCH;
816 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
817 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
818 		/*
819 		 * If there's only one entry on the list do not go
820 		 * through an insert/remove cycle.  This is done to
821 		 * guarantee that ``an->an_rtlist''  is never empty
822 		 * when a node is in the tree.
823 		 */
824 		rt->rt_priority = prio;
825 	} else {
826 		rtref(rt); /* keep rt alive in between remove and insert */
827 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
828 		    rt, rtentry, rt_next);
829 		rt->rt_priority = prio;
830 		rtable_mpath_insert(an, rt);
831 		rtfree(rt);
832 		error = EAGAIN;
833 	}
834 	rw_exit_write(&ar->ar_lock);
835 
836 	return (error);
837 }
838 
839 void
840 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
841 {
842 	struct rtentry			*mrt, *prt = NULL;
843 	uint8_t				 prio = rt->rt_priority;
844 
845 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
846 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
847 		return;
848 	}
849 
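	/*
	 * Example (illustrative): with priorities 8 -> 8 -> 32 already on
	 * the list, a new route of priority 16 ends up after the second
	 * entry, keeping the list sorted by ascending priority.
	 */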
850 	/* Iterate until we find the route after which ``rt'' will be placed. */
851 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
852 		prt = mrt;
853 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
854 	}
855 
856 	if (mrt->rt_priority <= prio) {
857 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
858 	} else if (prt != NULL) {
859 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
860 	} else {
861 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
862 	}
863 }
864 
865 /*
866  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
867  */
868 int
869 an_match(struct art_node *an, const struct sockaddr *dst, int plen)
870 {
871 	struct rtentry			*rt;
872 	struct srp_ref			 sr;
873 	int				 match;
874 
875 	if (an == NULL || an->an_plen != plen)
876 		return (0);
877 
878 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
879 	match = (rt != NULL && memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
880 	SRPL_LEAVE(&sr);
881 
882 	return (match);
883 }
884 
885 void
886 rtentry_ref(void *null, void *xrt)
887 {
888 	struct rtentry *rt = xrt;
889 
890 	rtref(rt);
891 }
892 
893 void
894 rtentry_unref(void *null, void *xrt)
895 {
896 	struct rtentry *rt = xrt;
897 
898 	rtfree(rt);
899 }
900 
901 /*
902  * Return a pointer to the address (key).  This is a heritage from the
903  * BSD radix tree: it is needed to skip the non-address fields of the
904  * flavor of "struct sockaddr" used by this routing table.
905  */
906 static inline const uint8_t *
907 satoaddr(struct art_root *at, const struct sockaddr *sa)
908 {
909 	return (((const uint8_t *)sa) + at->ar_off);
910 }
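
/*
 * For instance, with AF_INET ``ar_off'' is the domain's dom_rtoffset,
 * which skips the length, family and port fields of a "struct sockaddr_in"
 * so that the returned pointer addresses sin_addr directly (an
 * illustrative reading; the exact offset comes from the per-domain setup).
 */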
911 
912 /*
913  * Return the prefix length of a mask.
914  */
915 int
916 rtable_satoplen(sa_family_t af, const struct sockaddr *mask)
917 {
918 	const struct domain	*dp;
919 	uint8_t			*ap, *ep;
920 	int			 mlen, plen = 0;
921 	int			 i;
922 
923 	for (i = 0; (dp = domains[i]) != NULL; i++) {
924 		if (dp->dom_rtoffset == 0)
925 			continue;
926 
927 		if (af == dp->dom_family)
928 			break;
929 	}
930 	if (dp == NULL)
931 		return (-1);
932 
933 	/* Host route */
934 	if (mask == NULL)
935 		return (dp->dom_maxplen);
936 
937 	mlen = mask->sa_len;
938 
939 	/* Default route */
940 	if (mlen == 0)
941 		return (0);
942 
943 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
944 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
945 	if (ap > ep)
946 		return (-1);
947 
948 	/* Trim trailing zeroes. */
949 	while (ap < ep && ep[-1] == 0)
950 		ep--;
951 
952 	if (ap == ep)
953 		return (0);
954 
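	/*
	 * Worked example (illustrative): an IPv4 netmask of 255.255.252.0
	 * has key bytes ff ff fc 00.  The trailing zero byte was trimmed
	 * above; the loop below adds 8 + 8 for the two 0xff bytes and 6
	 * for 0xfc, yielding a prefix length of 22.
	 */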
955 	/* "Beauty" adapted from sbin/route/show.c ... */
956 	while (ap < ep) {
957 		switch (*ap++) {
958 		case 0xff:
959 			plen += 8;
960 			break;
961 		case 0xfe:
962 			plen += 7;
963 			goto out;
964 		case 0xfc:
965 			plen += 6;
966 			goto out;
967 		case 0xf8:
968 			plen += 5;
969 			goto out;
970 		case 0xf0:
971 			plen += 4;
972 			goto out;
973 		case 0xe0:
974 			plen += 3;
975 			goto out;
976 		case 0xc0:
977 			plen += 2;
978 			goto out;
979 		case 0x80:
980 			plen += 1;
981 			goto out;
982 		default:
983 			/* Non-contiguous mask. */
984 			return (-1);
985 		}
986 	}
987 
988 out:
989 	if (plen > dp->dom_maxplen || ap != ep)
990 		return -1;
991 
992 	return (plen);
993 }
994