/*	$OpenBSD: rde_rib.c,v 1.143 2016/08/27 01:26:22 guenther Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <stdlib.h>
#include <string.h>
#include <siphash.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- update speed, to be
 * precise. Therefore one thing needs to be absolutely avoided: long table
 * walks. This is achieved by heavily linking the different parts together.
 */
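/*
 * A rough sketch of that linking (illustration added by the editor, not
 * part of the original source):
 *
 *	rib_entry (per prefix, in an RB tree keyed by its pt_entry)
 *	  `- prefix_h: list of struct prefix, one per announcing peer
 *	       `- aspath: shared struct rde_aspath (hashed via PATH_HASH)
 *	            `- nexthop: shared struct nexthop (hashed by address)
 */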
u_int16_t rib_size;
struct rib *ribs;

LIST_HEAD(, rib_context) rib_dump_h = LIST_HEAD_INITIALIZER(rib_dump_h);

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
int rib_compare(const struct rib_entry *, const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
struct rib_entry *rib_restart(struct rib_context *);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);


/* RIB specific functions */
u_int16_t
rib_new(char *name, u_int rtableid, u_int16_t flags)
{
	struct rib	*xribs;
	u_int16_t	id;

	for (id = 0; id < rib_size; id++) {
		if (*ribs[id].name == '\0')
			break;
	}

	if (id == RIB_FAILED)
		fatalx("rib_new: trying to use reserved id");

	if (id >= rib_size) {
		if ((xribs = reallocarray(ribs, id + 1,
		    sizeof(struct rib))) == NULL) {
			/* XXX this is not clever */
			fatal("rib_new");
		}
		ribs = xribs;
		rib_size = id + 1;
	}

	bzero(&ribs[id], sizeof(struct rib));
	strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
	RB_INIT(&ribs[id].rib);
	ribs[id].state = RECONF_REINIT;
	ribs[id].id = id;
	ribs[id].flags = flags;
	ribs[id].rtableid = rtableid;

	ribs[id].in_rules = calloc(1, sizeof(struct filter_head));
	if (ribs[id].in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(ribs[id].in_rules);

	return (id);
}

u_int16_t
rib_find(char *name)
{
	u_int16_t id;

	if (name == NULL || *name == '\0')
		return (1);	/* no name returns the Loc-RIB */

	for (id = 0; id < rib_size; id++) {
		if (!strcmp(ribs[id].name, name))
			return (id);
	}

	return (RIB_FAILED);
}

void
rib_free(struct rib *rib)
{
	struct rib_context *ctx, *next;
	struct rib_entry *re, *xre;
	struct prefix *p, *np;

	/* abort pending rib_dumps */
	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		if (ctx->ctx_rib == rib) {
			re = ctx->ctx_re;
			re->flags &= ~F_RIB_ENTRYLOCK;
			LIST_REMOVE(ctx, entry);
			if (ctx->ctx_done)
				ctx->ctx_done(ctx->ctx_arg);
			else
				free(ctx);
		}
	}

	for (re = RB_MIN(rib_tree, &rib->rib); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, &rib->rib, re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will remove the rib_entry as well and because we do
		 * an empty check in prefix_destroy() it is not possible to
		 * use the default for loop.
		 */
		while ((p = LIST_FIRST(&re->prefix_h))) {
			np = LIST_NEXT(p, rib_l);
			if (p->aspath->pftableid) {
				struct bgpd_addr addr;

				pt_getaddr(p->prefix, &addr);
				/* Commit is done in peer_down() */
				rde_send_pftable(p->aspath->pftableid, &addr,
				    p->prefix->prefixlen, 1);
			}
			prefix_destroy(p);
			if (np == NULL)
				break;
		}
	}
	filterlist_free(rib->in_rules_tmp);
	filterlist_free(rib->in_rules);
	bzero(rib, sizeof(struct rib));
}

int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry xre;
	struct pt_entry	*pte;

	pte = pt_fill(prefix, prefixlen);
	bzero(&xre, sizeof(xre));
	xre.prefix = pte;

	return (RB_FIND(rib_tree, &rib->rib, &xre));
}

struct rib_entry *
rib_lookup(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int		 i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
		for (i = 128; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("rib_lookup: unknown af");
	}
	return (NULL);
}
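
/*
 * Illustration (editor's note): rib_lookup() is a longest-prefix match
 * built from exact-match probes, e.g. for 192.0.2.1 it tries
 * 192.0.2.1/32, then /31, and so on down to /0, returning the first
 * rib_entry that rib_get() finds.
 */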


struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct pt_entry	*pte;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	LIST_INIT(&re->prefix_h);
	re->prefix = pte;
	re->flags = rib->flags;
	re->ribid = rib->id;

	if (RB_INSERT(rib_tree, &rib->rib, re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	pt_ref(pte);

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re->flags & F_RIB_ENTRYLOCK)
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);
	if (pt_empty(re->prefix))
		pt_remove(re->prefix);

	if (RB_REMOVE(rib_tree, &ribs[re->ribid].rib, re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return LIST_EMPTY(&re->prefix_h);
}

void
rib_dump(struct rib *rib, void (*upcall)(struct rib_entry *, void *),
    void *arg, u_int8_t aid)
{
	struct rib_context	*ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		fatal("rib_dump");
	ctx->ctx_rib = rib;
	ctx->ctx_upcall = upcall;
	ctx->ctx_arg = arg;
	ctx->ctx_aid = aid;
	rib_dump_r(ctx);
}

void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry	*re;
	unsigned int		 i;

	if (ctx->ctx_re == NULL) {
		re = RB_MIN(rib_tree, &ctx->ctx_rib->rib);
		LIST_INSERT_HEAD(&rib_dump_h, ctx, entry);
	} else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = RB_NEXT(rib_tree, unused, re)) {
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    (re->flags & F_RIB_ENTRYLOCK) == 0) {
			/* store and lock last element */
			ctx->ctx_re = re;
			re->flags |= F_RIB_ENTRYLOCK;
			return;
		}
		ctx->ctx_upcall(re, ctx->ctx_arg);
	}

	LIST_REMOVE(ctx, entry);
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg);
	else
		free(ctx);
}

struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re;

	re = ctx->ctx_re;
	re->flags &= ~F_RIB_ENTRYLOCK;

	/* find the first non-empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

void
rib_dump_runner(void)
{
	struct rib_context	*ctx, *next;

	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		rib_dump_r(ctx);
	}
}

int
rib_dump_pending(void)
{
	return (!LIST_EMPTY(&rib_dump_h));
}
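
/*
 * Usage sketch (hypothetical caller, added for illustration): a walk over
 * the whole RIB is started with rib_dump(); if ctx_count is set, the walk
 * parks itself after that many entries by locking the current rib_entry
 * with F_RIB_ENTRYLOCK. The event loop then resumes all parked walks:
 *
 *	if (rib_dump_pending())
 *		rib_dump_runner();
 */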

/* used to bump correct prefix counters */
#define PREFIX_COUNT(x, op)			\
	do {					\
		(x)->prefix_cnt += (op);	\
	} while (0)

/* path specific functions */

static void	path_link(struct rde_aspath *, struct rde_peer *);

struct path_table pathtable;

SIPHASH_KEY pathtablekey;

/* XXX the hash should also include communities and the other attrs */
#define PATH_HASH(x)				\
	&pathtable.path_hashtbl[SipHash24(&pathtablekey, (x)->data, (x)->len) & \
	    pathtable.path_hashmask]

void
path_init(u_int32_t hashsize)
{
	u_int32_t	hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(struct aspath_head));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
	arc4random_buf(&pathtablekey, sizeof(pathtablekey));
}
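
/*
 * The table size is rounded up to a power of two, e.g. path_init(1000)
 * yields hs = 1024 and path_hashmask = 0x3ff, so PATH_HASH() can select
 * a bucket with a cheap mask instead of a modulus.
 */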

void
path_shutdown(void)
{
	u_int32_t	i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("path_shutdown: free non-free table");

	free(pathtable.path_hashtbl);
}

int
path_update(struct rib *rib, struct rde_peer *peer, struct rde_aspath *nasp,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rde_aspath	*asp;
	struct prefix		*p;

	if (nasp->pftableid) {
		rde_send_pftable(nasp->pftableid, prefix, prefixlen, 0);
		rde_send_pftable_commit();
	}

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, prefix, prefixlen, 0)) != NULL) {
		if (path_compare(nasp, p->aspath) == 0) {
			/* no change, update last change */
			p->lastchange = time(NULL);
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp, peer)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(nasp);
		path_link(asp, peer);
	}

	/* If the prefix was found move it else add it to the aspath. */
	if (p != NULL)
		prefix_move(asp, p);
	else
		return (prefix_add(rib, asp, prefix, prefixlen));
	return (0);
}

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r == 0)
		r = nexthop_compare(a->nexthop, b->nexthop);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

struct rde_aspath *
path_lookup(struct rde_aspath *aspath, struct rde_peer *peer)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;

	head = PATH_HASH(aspath->aspath);

	LIST_FOREACH(asp, head, path_l) {
		if (peer == asp->peer && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

void
path_remove(struct rde_aspath *asp)
{
	struct prefix	*p, *np;

	for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
		np = LIST_NEXT(p, path_l);
		if (asp->pftableid) {
			struct bgpd_addr addr;

			pt_getaddr(p->prefix, &addr);
			/* Commit is done in peer_down() */
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);
		}
		prefix_destroy(p);
	}
}
/*
 * Remove all stale routes, or if staletime is 0 remove all routes for
 * the specified AID.
 */
u_int32_t
path_remove_stale(struct rde_aspath *asp, u_int8_t aid)
{
	struct prefix	*p, *np;
	time_t		 staletime;
	u_int32_t	 rprefixes;

	rprefixes = 0;
	staletime = asp->peer->staletime[aid];
	for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
		np = LIST_NEXT(p, path_l);
		if (p->prefix->aid != aid)
			continue;

		if (staletime && p->lastchange > staletime)
			continue;

		if (asp->pftableid) {
			struct bgpd_addr addr;

			pt_getaddr(p->prefix, &addr);
			/* Commit is done in peer_flush() */
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);
		}

		/* only count Adj-RIB-In */
		if (p->rib->ribid == 0)
			rprefixes++;

		prefix_destroy(p);
	}
	return (rprefixes);
}


/*
 * This function is only called from prefix_destroy(), prefix_move() and
 * prefix_network_clean(), once the aspath holds no more prefixes.
 */
void
path_destroy(struct rde_aspath *asp)
{
	/* path_destroy can only unlink and free empty rde_aspath */
	if (asp->prefix_cnt != 0 || asp->active_cnt != 0)
		log_warnx("path_destroy: prefix count out of sync");

	nexthop_unlink(asp);
	LIST_REMOVE(asp, path_l);
	LIST_REMOVE(asp, peer_l);
	asp->peer = NULL;
	asp->nexthop = NULL;
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

int
path_empty(struct rde_aspath *asp)
{
	return LIST_EMPTY(&asp->prefix_h);
}

/*
 * the path object is linked into multiple lists for fast access.
 * These are peer_l, path_l and nexthop_l.
 * peer_l: list of all aspaths that belong to that peer
 * path_l: hash list to find paths quickly
 * nexthop_l: list of all aspaths with an equal exit nexthop
 */
static void
path_link(struct rde_aspath *asp, struct rde_peer *peer)
{
	struct aspath_head	*head;

	head = PATH_HASH(asp->aspath);

	LIST_INSERT_HEAD(head, asp, path_l);
	LIST_INSERT_HEAD(&peer->path_h, asp, peer_l);
	asp->peer = peer;
	nexthop_link(asp);
	asp->flags |= F_ATTR_LINKED;
}
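
/*
 * For example (hypothetical sketch, added for illustration), the peer_l
 * links let a caller visit every aspath a peer has announced without any
 * RIB walk:
 *
 *	struct rde_aspath *asp;
 *
 *	LIST_FOREACH(asp, &peer->path_h, peer_l)
 *		... inspect asp ...
 */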

/*
 * copy asp to a new UNLINKED one mainly for filtering
 */
struct rde_aspath *
path_copy(struct rde_aspath *asp)
{
	struct rde_aspath *nasp;

	nasp = path_get();
	nasp->aspath = asp->aspath;
	if (nasp->aspath != NULL) {
		nasp->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	nasp->nexthop = asp->nexthop;
	nasp->med = asp->med;
	nasp->lpref = asp->lpref;
	nasp->weight = asp->weight;
	nasp->origin = asp->origin;
	nasp->rtlabelid = asp->rtlabelid;
	rtlabel_ref(nasp->rtlabelid);
	nasp->pftableid = asp->pftableid;
	pftable_ref(nasp->pftableid);

	nasp->flags = asp->flags & ~F_ATTR_LINKED;
	attr_copy(nasp, asp);

	return (nasp);
}

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = calloc(1, sizeof(*asp));
	if (asp == NULL)
		fatal("path_alloc");
	rdemem.path_cnt++;

	LIST_INIT(&asp->prefix_h);
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;
	/* med = 0 */
	/* weight = 0 */
	/* rtlabel = 0 */

	return (asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	if (asp->flags & F_ATTR_LINKED)
		fatalx("path_put: linked object");

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);
static void		 prefix_link(struct prefix *, struct rib_entry *,
			     struct rde_aspath *);
static void		 prefix_unlink(struct prefix *);

/*
 * Search for the specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, flags));
}

/*
 * Adds or updates a prefix.
 */
int
prefix_add(struct rib *rib, struct rde_aspath *asp, struct bgpd_addr *prefix,
    int prefixlen)
{
	struct prefix		*p;
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		re = rib_add(rib, prefix, prefixlen);

	p = prefix_bypeer(re, asp->peer, asp->flags);
	if (p == NULL) {
		p = prefix_alloc();
		prefix_link(p, re, asp);
		return (1);
	} else {
		if (p->aspath != asp) {
			/* prefix belongs to a different aspath so move */
			prefix_move(asp, p);
		} else
			p->lastchange = time(NULL);
		return (0);
	}
}

/*
 * Move the prefix to the specified aspath, removing the old aspath if
 * it becomes empty.
 */
void
prefix_move(struct rde_aspath *asp, struct prefix *p)
{
	struct prefix		*np;
	struct rde_aspath	*oasp;

	if (asp->peer != p->aspath->peer)
		fatalx("prefix_move: cross peer move");

	/* create new prefix node */
	np = prefix_alloc();
	np->aspath = asp;
	/* peer and prefix pointers are still equal */
	np->prefix = p->prefix;
	np->rib = p->rib;
	np->lastchange = time(NULL);

	/* add to new as path */
	LIST_INSERT_HEAD(&asp->prefix_h, np, path_l);
	PREFIX_COUNT(asp, 1);
	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */

	/*
	 * First kick the old prefix node out of the prefix list,
	 * afterwards run the route decision for the new prefix node.
	 * Because of this only one update is generated if the prefix
	 * was active.
	 * This is safe because we create a new prefix and so the change
	 * is noticed by prefix_evaluate().
	 */
	LIST_REMOVE(p, rib_l);
	prefix_evaluate(np, np->rib);

	/* remove old prefix node */
	oasp = p->aspath;
	LIST_REMOVE(p, path_l);
	PREFIX_COUNT(oasp, -1);
	/* as before peer count needs no update because of move */

	/* destroy all references to other objects and free the old prefix */
	p->aspath = NULL;
	p->prefix = NULL;
	p->rib = NULL;
	prefix_free(p);

	/* destroy old path if empty */
	if (path_empty(oasp))
		path_destroy(oasp);
}

/*
 * Removes a prefix from all lists. If the parent objects -- path or
 * pt_entry -- become empty remove them too.
 */
int
prefix_remove(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct prefix		*p;
	struct rib_entry	*re;
	struct rde_aspath	*asp;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)	/* Got a dummy withdrawn request */
		return (0);

	p = prefix_bypeer(re, peer, flags);
	if (p == NULL)		/* Got a dummy withdrawn request. */
		return (0);

	asp = p->aspath;

	if (asp->pftableid) {
		/* only prefixes in the local RIB were pushed into pf */
		rde_send_pftable(asp->pftableid, prefix, prefixlen, 1);
		rde_send_pftable_commit();
	}

	prefix_destroy(p);

	return (1);
}

/* dump a prefix into the specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	totlen;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->ba, totlen - 1);
		return (totlen);
	case AID_VPN_IPv4:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		plen += (sizeof(prefix->vpn4.rd) + prefix->vpn4.labellen) * 8;

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->vpn4.labelstack, prefix->vpn4.labellen);
		buf += prefix->vpn4.labellen;
		memcpy(buf, &prefix->vpn4.rd, sizeof(prefix->vpn4.rd));
		buf += sizeof(prefix->vpn4.rd);
		memcpy(buf, &prefix->vpn4.addr, PREFIX_SIZE(plen) - 1);
		return (totlen);
	default:
		return (-1);
	}
}
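
/*
 * Worked example (editor's illustration): for 192.0.2.0/24 the function
 * emits the standard BGP NLRI encoding, i.e. the length octet 0x18
 * followed by the three significant address bytes c0 00 02, four octets
 * in total (PREFIX_SIZE() covers the length octet plus the minimal
 * number of address bytes).
 */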

int
prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	 totlen;
	void	*bptr;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);
		break;
	case AID_VPN_IPv4:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		break;
	default:
		return (-1);
	}

	if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
		return (-1);
	if (prefix_write(bptr, totlen, prefix, plen) == -1)
		return (-1);
	return (0);
}

/*
 * Searches in the prefix list of the specified rib_entry for a prefix
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, u_int32_t flags)
{
	struct prefix	*p;

	LIST_FOREACH(p, &re->prefix_h, rib_l) {
		if (p->aspath->peer != peer)
			continue;
		if (p->aspath->flags & flags &&
		    (flags & F_ANN_DYNAMIC) !=
		    (p->aspath->flags & F_ANN_DYNAMIC))
			continue;
		return (p);
	}
	return (NULL);
}

void
prefix_updateall(struct rde_aspath *asp, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct prefix	*p;

	LIST_FOREACH(p, &asp->prefix_h, path_l) {
		/*
		 * Skip non-local RIBs and RIBs that are flagged as
		 * noevaluate.
		 */
		if (p->rib->flags & F_RIB_NOEVALUATE)
			continue;

		if (oldstate == state && state == NEXTHOP_REACH) {
			/*
			 * The state of the nexthop did not change. The only
			 * thing that may have changed is the true_nexthop
			 * or other internal infos. This will not change
			 * the routing decision so shortcut here.
			 */
			if ((p->rib->flags & F_RIB_NOFIB) == 0 &&
			    p == p->rib->active)
				rde_send_kroute(p, NULL, p->rib->ribid);
			continue;
		}

		/* redo the route decision */
		LIST_REMOVE(p, rib_l);
		/*
		 * If the prefix is the active one remove it first,
		 * this has to be done because we can not detect when
		 * the active prefix changes its state. In this case
		 * we know that this is a withdrawal and so the second
		 * prefix_evaluate() will generate no update because
		 * the nexthop is unreachable or ineligible.
		 */
		if (p == p->rib->active)
			prefix_evaluate(NULL, p->rib);
		prefix_evaluate(p, p->rib);
	}
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	struct rde_aspath	*asp;

	asp = p->aspath;
	prefix_unlink(p);
	prefix_free(p);

	if (path_empty(asp))
		path_destroy(asp);
}

/*
 * helper function to clean up the connected networks after a reload
 */
void
prefix_network_clean(struct rde_peer *peer, time_t reloadtime, u_int32_t flags)
{
	struct rde_aspath	*asp, *xasp;
	struct prefix		*p, *xp;

	for (asp = LIST_FIRST(&peer->path_h); asp != NULL; asp = xasp) {
		xasp = LIST_NEXT(asp, peer_l);
		if ((asp->flags & F_ANN_DYNAMIC) != flags)
			continue;
		for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = xp) {
			xp = LIST_NEXT(p, path_l);
			if (reloadtime > p->lastchange) {
				prefix_unlink(p);
				prefix_free(p);
			}
		}
		if (path_empty(asp))
			path_destroy(asp);
	}
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *pref, struct rib_entry *re, struct rde_aspath *asp)
{
	LIST_INSERT_HEAD(&asp->prefix_h, pref, path_l);
	PREFIX_COUNT(asp, 1);

	pref->aspath = asp;
	pref->rib = re;
	pref->prefix = re->prefix;
	pt_ref(pref->prefix);
	pref->lastchange = time(NULL);

	/* make route decision */
	prefix_evaluate(pref, re);
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *pref)
{
	struct rib_entry	*re = pref->rib;

	/* make route decision */
	LIST_REMOVE(pref, rib_l);
	prefix_evaluate(NULL, re);

	LIST_REMOVE(pref, path_l);
	PREFIX_COUNT(pref->aspath, -1);

	pt_unref(pref->prefix);
	if (pt_empty(pref->prefix))
		pt_remove(pref->prefix);
	if (rib_empty(re))
		rib_remove(re);

	/* destroy all references to other objects */
	pref->aspath = NULL;
	pref->prefix = NULL;
	pref->rib = NULL;

	/*
	 * It's the caller's duty to remove empty aspath structures.
	 * Also freeing the unlinked prefix is the caller's duty.
	 */
}

/* alloc and bzero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return p;
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *pref)
{
	rdemem.prefix_cnt--;
	free(pref);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop, which was announced
 * via BGP, and the true nexthop, which is used in the FIB -- the
 * forwarding information base, a.k.a. the kernel routing table. When
 * sending updates it is even more confusing. In IBGP we pass the
 * unmodified exit nexthop to the neighbors while in EBGP normally the
 * address of the router is sent. The exit nexthop may be passed to the
 * external neighbor if the neighbor and the exit nexthop reside in the
 * same subnet -- directly connected.
 */
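/*
 * For instance (editor's illustration): if a route is learned with exit
 * nexthop 192.0.2.1 and the kernel reaches that address via the gateway
 * 10.0.0.1, then 10.0.0.1 becomes the true nexthop (see nexthop_update());
 * if 192.0.2.1 is directly connected, it is its own true nexthop.
 */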
struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	u_int32_t				 nexthop_hashmask;
} nexthoptable;

SIPHASH_KEY nexthoptablekey;

void
nexthop_init(u_int32_t hashsize)
{
	u_int32_t	 hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);
	arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey));

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	u_int32_t		 i;
	struct nexthop		*nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			(void)nexthop_delete(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i]))
			log_warnx("nexthop_shutdown: non-free table");
	}

	free(nexthoptable.nexthop_hashtbl);
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;
	struct rde_aspath	*asp;
	enum nexthop_state	 oldstate;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	memcpy(&nh->nexthop_net, &msg->net,
	    sizeof(nh->nexthop_net));
	nh->nexthop_netlen = msg->netlen;

	if (nexthop_delete(nh))
		/* nexthop no longer used */
		return;

	if (rde_noevaluate())
		/*
		 * if the decision process is turned off there is no need
		 * for the aspath list walk.
		 */
		return;

	LIST_FOREACH(asp, &nh->path_h, nexthop_l) {
		prefix_updateall(asp, nh->state, oldstate);
	}
}

void
nexthop_modify(struct rde_aspath *asp, struct bgpd_addr *nexthop,
    enum action_types type, u_int8_t aid)
{
	struct nexthop	*nh;

	if (type == ACTION_SET_NEXTHOP && aid != nexthop->aid)
		return;

	asp->flags &= ~F_NEXTHOP_MASK;
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		asp->flags |= F_NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		asp->flags |= F_NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		asp->flags |= F_NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		asp->flags |= F_NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP:
		nh = nexthop_get(nexthop);
		if (asp->flags & F_ATTR_LINKED)
			nexthop_unlink(asp);
		asp->nexthop = nh;
		if (asp->flags & F_ATTR_LINKED)
			nexthop_link(asp);
		break;
	default:
		break;
	}
}

void
nexthop_link(struct rde_aspath *asp)
{
	if (asp->nexthop == NULL)
		return;

	LIST_INSERT_HEAD(&asp->nexthop->path_h, asp, nexthop_l);
}

void
nexthop_unlink(struct rde_aspath *asp)
{
	struct nexthop	*nh;

	if (asp->nexthop == NULL)
		return;

	LIST_REMOVE(asp, nexthop_l);

	/* see if list is empty */
	nh = asp->nexthop;
	asp->nexthop = NULL;

	(void)nexthop_delete(nh);
}

int
nexthop_delete(struct nexthop *nh)
{
	/* nexthop still used by some other aspath */
	if (!LIST_EMPTY(&nh->path_h))
		return (0);

	/* either pinned or in a state where it may not be deleted */
	if (nh->refcnt > 0 || nh->state == NEXTHOP_LOOKUP)
		return (0);

	LIST_REMOVE(nh, nexthop_l);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_alloc");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->path_h);
		nh->state = NEXTHOP_LOOKUP;
		nh->exit_nexthop = *nexthop;
		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
		    nexthop_l);

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return (nh);
}
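
/*
 * Lifecycle sketch (editor's note): nexthop_get() registers interest with
 * the parent process via rde_send_nexthop(..., 1) and parks the entry in
 * NEXTHOP_LOOKUP; the verdict arrives later through nexthop_update().
 * Once the last aspath drops its reference, nexthop_delete() deregisters
 * with rde_send_nexthop(..., 0) and frees the entry.
 */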

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr	*a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: unknown af");
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
		if (memcmp(&nh->exit_nexthop, nexthop,
		    sizeof(struct bgpd_addr)) == 0)
			return (nh);
	}
	return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
	u_int32_t	 h = 0;

	switch (nexthop->aid) {
	case AID_INET:
		h = SipHash24(&nexthoptablekey, &nexthop->v4.s_addr,
		    sizeof(nexthop->v4.s_addr));
		break;
	case AID_INET6:
		h = SipHash24(&nexthoptablekey, &nexthop->v6,
		    sizeof(struct in6_addr));
		break;
	default:
		fatalx("nexthop_hash: unsupported AF");
	}
	return (&nexthoptable.nexthop_hashtbl[h & nexthoptable.nexthop_hashmask]);
}