1 /*	$OpenBSD: rde_rib.c,v 1.262 2024/05/29 10:34:56 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/queue.h>
21 
22 #include <limits.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 
27 #include "bgpd.h"
28 #include "rde.h"
29 #include "log.h"
30 
31 /*
32  * BGP RIB -- Routing Information Base
33  *
34  * The RIB is built with one aspect in mind. Speed -- actually update speed.
35  * Therefore one thing needs to be absolutely avoided: long table walks.
36  * This is achieved by heavily linking the different parts together.
37  */
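
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): because the objects are heavily linked, finding every path
 * for one destination never requires a table walk.  Assuming the
 * structures from rde.h, a lookup roughly looks like:
 *
 *	struct rib_entry *re = rib_get_addr(rib, &addr, plen);
 *	struct prefix *p;
 *
 *	if (re != NULL)
 *		TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
 *			... prefix_peer(p), prefix_aspath(p), ...
 */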
38 uint16_t rib_size;
39 struct rib **ribs;
40 struct rib flowrib = { .id = 1, .tree = RB_INITIALIZER(&flowrib.tree) };
41 
42 struct rib_entry *rib_add(struct rib *, struct pt_entry *);
43 static inline int rib_compare(const struct rib_entry *,
44 			const struct rib_entry *);
45 void rib_remove(struct rib_entry *);
46 int rib_empty(struct rib_entry *);
47 static void rib_dump_abort(uint16_t);
48 
49 RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
50 RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);
51 
52 struct rib_context {
53 	LIST_ENTRY(rib_context)		 entry;
54 	struct rib_entry		*ctx_re;
55 	struct prefix			*ctx_p;
56 	uint32_t			 ctx_id;
57 	void		(*ctx_rib_call)(struct rib_entry *, void *);
58 	void		(*ctx_prefix_call)(struct prefix *, void *);
59 	void		(*ctx_done)(void *, uint8_t);
60 	int		(*ctx_throttle)(void *);
61 	void				*ctx_arg;
62 	struct bgpd_addr		 ctx_subtree;
63 	unsigned int			 ctx_count;
64 	uint8_t				 ctx_aid;
65 	uint8_t				 ctx_subtreelen;
66 };
67 LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);
68 
69 static void	prefix_dump_r(struct rib_context *);
70 
71 static inline struct rib_entry *
72 re_lock(struct rib_entry *re)
73 {
74 	if (re->lock != 0)
75 		log_warnx("%s: entry already locked", __func__);
76 	re->lock = 1;
77 	return re;
78 }
79 
80 static inline struct rib_entry *
81 re_unlock(struct rib_entry *re)
82 {
83 	if (re->lock == 0)
84 		log_warnx("%s: entry already unlocked", __func__);
85 	re->lock = 0;
86 	return re;
87 }
88 
89 static inline int
90 re_is_locked(struct rib_entry *re)
91 {
92 	return (re->lock != 0);
93 }
94 
95 static inline struct prefix *
96 prefix_lock(struct prefix *p)
97 {
98 	if (p->flags & PREFIX_FLAG_LOCKED)
99 		fatalx("%s: locking locked prefix", __func__);
100 	p->flags |= PREFIX_FLAG_LOCKED;
101 	return p;
102 }
103 
104 static inline struct prefix *
105 prefix_unlock(struct prefix *p)
106 {
107 	if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
108 		fatalx("%s: unlocking unlocked prefix", __func__);
109 	p->flags &= ~PREFIX_FLAG_LOCKED;
110 	return p;
111 }
112 
113 static inline int
114 prefix_is_locked(struct prefix *p)
115 {
116 	return (p->flags & PREFIX_FLAG_LOCKED) != 0;
117 }
118 
119 static inline int
120 prefix_is_dead(struct prefix *p)
121 {
122 	return (p->flags & PREFIX_FLAG_DEAD) != 0;
123 }
124 
125 static inline struct rib_tree *
126 rib_tree(struct rib *rib)
127 {
128 	return (&rib->tree);
129 }
130 
131 static inline int
132 rib_compare(const struct rib_entry *a, const struct rib_entry *b)
133 {
134 	return (pt_prefix_cmp(a->prefix, b->prefix));
135 }
136 
137 /* RIB specific functions */
138 struct rib *
139 rib_new(char *name, u_int rtableid, uint16_t flags)
140 {
141 	struct rib *new;
142 	uint16_t id;
143 
144 	for (id = 0; id < rib_size; id++) {
145 		if (ribs[id] == NULL)
146 			break;
147 	}
148 
149 	if (id >= rib_size) {
150 		if ((ribs = recallocarray(ribs, id, id + 8,
151 		    sizeof(struct rib *))) == NULL)
152 			fatal(NULL);
153 		rib_size = id + 8;
154 	}
155 
156 	if ((new = calloc(1, sizeof(*new))) == NULL)
157 		fatal(NULL);
158 
159 	strlcpy(new->name, name, sizeof(new->name));
160 	RB_INIT(rib_tree(new));
161 	new->state = RECONF_REINIT;
162 	new->id = id;
163 	new->flags = flags;
164 	new->rtableid = rtableid;
165 
166 	new->in_rules = calloc(1, sizeof(struct filter_head));
167 	if (new->in_rules == NULL)
168 		fatal(NULL);
169 	TAILQ_INIT(new->in_rules);
170 
171 	ribs[id] = new;
172 
173 	log_debug("%s: %s -> %u", __func__, name, id);
174 	return (new);
175 }
176 
177 /*
178  * This function is only called when the FIB information of a RIB changed.
179  * It will flush the FIB if there was one previously and change the fibstate
180  * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
181  * or RECONF_REINIT (rerun the route decision process for every element)
182  * depending on the new flags.
183  */
184 int
185 rib_update(struct rib *rib)
186 {
187 	/* flush fib first if there was one */
188 	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
189 		rde_send_kroute_flush(rib);
190 
191 	/* if the noevaluate flag changed then a full reinit is needed */
192 	if ((rib->flags & F_RIB_NOEVALUATE) !=
193 	    (rib->flags_tmp & F_RIB_NOEVALUATE))
194 		rib->fibstate = RECONF_REINIT;
195 
196 	rib->flags = rib->flags_tmp;
197 	rib->rtableid = rib->rtableid_tmp;
198 
199 	/* reload fib if there is no reinit pending and there will be a fib */
200 	if (rib->fibstate != RECONF_REINIT &&
201 	    (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
202 		rib->fibstate = RECONF_RELOAD;
203 
204 	return (rib->fibstate == RECONF_REINIT);
205 }
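
/*
 * Caller-side sketch (illustrative only; new_flags and new_rtableid are
 * hypothetical): the new configuration is staged in the _tmp fields and
 * the return value tells the reconfigure code whether the decision
 * process has to be rerun.
 *
 *	rib->flags_tmp = new_flags;
 *	rib->rtableid_tmp = new_rtableid;
 *	if (rib_update(rib))
 *		... RECONF_REINIT: rerun the decision process ...
 *	else if (rib->fibstate == RECONF_RELOAD)
 *		... only the FIB needs to be reloaded ...
 */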
206 
207 struct rib *
208 rib_byid(uint16_t id)
209 {
210 	if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
211 		return NULL;
212 	return ribs[id];
213 }
214 
215 uint16_t
216 rib_find(char *name)
217 {
218 	uint16_t id;
219 
220 	/* no name returns the first Loc-RIB */
221 	if (name == NULL || *name == '\0')
222 		return RIB_LOC_START;
223 
224 	for (id = 0; id < rib_size; id++) {
225 		if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
226 			return id;
227 	}
228 
229 	return RIB_NOTFOUND;
230 }
231 
232 void
233 rib_free(struct rib *rib)
234 {
235 	struct rib_entry *re, *xre;
236 	struct prefix *p;
237 
238 	rib_dump_abort(rib->id);
239 
240 	/*
241 	 * flush the rib, disable route evaluation and fib sync to speed up
242 	 * the prefix removal. Nothing depends on this data anymore.
243 	 */
244 	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
245 		rde_send_kroute_flush(rib);
246 	rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;
247 
248 	for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
249 		xre = RB_NEXT(rib_tree, rib_tree(rib), re);
250 
251 		/*
252 		 * Removing the prefixes is tricky because the last one
253 		 * will remove the rib_entry as well, and because we do
254 		 * an empty check in prefix_destroy() it is not possible to
255 		 * use the default for loop.
256 		 */
257 		while ((p = TAILQ_FIRST(&re->prefix_h))) {
258 			struct rde_aspath *asp = prefix_aspath(p);
259 			if (asp && asp->pftableid)
260 				rde_pftable_del(asp->pftableid, p);
261 			prefix_destroy(p);
262 		}
263 	}
264 	if (rib->id <= RIB_LOC_START)
265 		return; /* never remove the default ribs */
266 	filterlist_free(rib->in_rules_tmp);
267 	filterlist_free(rib->in_rules);
268 	ribs[rib->id] = NULL;
269 	free(rib);
270 }
271 
272 void
273 rib_shutdown(void)
274 {
275 	struct rib *rib;
276 	uint16_t id;
277 
278 	for (id = 0; id < rib_size; id++) {
279 		rib = rib_byid(id);
280 		if (rib == NULL)
281 			continue;
282 		if (!RB_EMPTY(rib_tree(ribs[id]))) {
283 			log_warnx("%s: rib %s is not empty", __func__,
284 			    ribs[id]->name);
285 		}
286 		rib_free(ribs[id]);
287 	}
288 	for (id = 0; id <= RIB_LOC_START; id++) {
289 		rib = rib_byid(id);
290 		if (rib == NULL)
291 			continue;
292 		filterlist_free(rib->in_rules_tmp);
293 		filterlist_free(rib->in_rules);
294 		ribs[id] = NULL;
295 		free(rib);
296 	}
297 	free(ribs);
298 }
299 
300 struct rib_entry *
301 rib_get(struct rib *rib, struct pt_entry *pte)
302 {
303 	struct rib_entry xre, *re;
304 
305 	memset(&xre, 0, sizeof(xre));
306 	xre.prefix = pte;
307 
308 	re = RB_FIND(rib_tree, rib_tree(rib), &xre);
309 	if (re && re->rib_id != rib->id)
310 		fatalx("%s: Unexpected RIB %u != %u.", __func__,
311 		    re->rib_id, rib->id);
312 	return re;
313 }
314 
315 struct rib_entry *
316 rib_get_addr(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
317 {
318 	return rib_get(rib, pt_fill(prefix, prefixlen));
319 }
320 
321 struct rib_entry *
322 rib_match(struct rib *rib, struct bgpd_addr *addr)
323 {
324 	struct rib_entry *re;
325 	int		 i;
326 
327 	switch (addr->aid) {
328 	case AID_INET:
329 	case AID_VPN_IPv4:
330 		for (i = 32; i >= 0; i--) {
331 			re = rib_get_addr(rib, addr, i);
332 			if (re != NULL)
333 				return (re);
334 		}
335 		break;
336 	case AID_INET6:
337 	case AID_VPN_IPv6:
338 		for (i = 128; i >= 0; i--) {
339 			re = rib_get_addr(rib, addr, i);
340 			if (re != NULL)
341 				return (re);
342 		}
343 		break;
344 	default:
345 		fatalx("%s: unknown af", __func__);
346 	}
347 	return (NULL);
348 }
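
/*
 * Worked example (editorial note): rib_match() is a longest-prefix
 * match done by probing decreasing prefix lengths.  For the address
 * 192.0.2.55 it tries /32, /31, ... down to /0 and returns the first
 * rib_entry found, e.g. one covering 192.0.2.0/24, or NULL if not even
 * a default route exists.
 */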
349 
350 
351 struct rib_entry *
352 rib_add(struct rib *rib, struct pt_entry *pte)
353 {
354 	struct rib_entry *re;
355 
356 	if ((re = calloc(1, sizeof(*re))) == NULL)
357 		fatal("rib_add");
358 
359 	TAILQ_INIT(&re->prefix_h);
360 	re->prefix = pt_ref(pte);
361 	re->rib_id = rib->id;
362 
363 	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
364 		log_warnx("rib_add: insert failed");
365 		free(re);
366 		return (NULL);
367 	}
368 
369 
370 	rdemem.rib_cnt++;
371 
372 	return (re);
373 }
374 
375 void
376 rib_remove(struct rib_entry *re)
377 {
378 	if (!rib_empty(re))
379 		fatalx("rib_remove: entry not empty");
380 
381 	if (re_is_locked(re))
382 		/* entry is locked, don't free it. */
383 		return;
384 
385 	pt_unref(re->prefix);
386 
387 	if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
388 		log_warnx("rib_remove: remove failed.");
389 
390 	free(re);
391 	rdemem.rib_cnt--;
392 }
393 
394 int
395 rib_empty(struct rib_entry *re)
396 {
397 	return TAILQ_EMPTY(&re->prefix_h);
398 }
399 
400 static struct rib_entry *
401 rib_restart(struct rib_context *ctx)
402 {
403 	struct rib_entry *re = NULL;
404 
405 	if (ctx->ctx_re)
406 		re = re_unlock(ctx->ctx_re);
407 
408 	/* find first non-empty element */
409 	while (re && rib_empty(re))
410 		re = RB_NEXT(rib_tree, unused, re);
411 
412 	/* free the previously locked rib element if empty */
413 	if (ctx->ctx_re && rib_empty(ctx->ctx_re))
414 		rib_remove(ctx->ctx_re);
415 	ctx->ctx_re = NULL;
416 	return (re);
417 }
418 
419 static void
420 rib_dump_r(struct rib_context *ctx)
421 {
422 	struct rib_entry	*re, *next;
423 	struct rib		*rib;
424 	unsigned int		 i;
425 
426 	rib = rib_byid(ctx->ctx_id);
427 	if (rib == NULL)
428 		fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);
429 
430 	if (ctx->ctx_re == NULL && ctx->ctx_subtree.aid == AID_UNSPEC)
431 		re = RB_MIN(rib_tree, rib_tree(rib));
432 	else
433 		re = rib_restart(ctx);
434 
435 	for (i = 0; re != NULL; re = next) {
436 		next = RB_NEXT(rib_tree, unused, re);
437 		if (re->rib_id != ctx->ctx_id)
438 			fatalx("%s: Unexpected RIB %u != %u.", __func__,
439 			    re->rib_id, ctx->ctx_id);
440 		if (ctx->ctx_aid != AID_UNSPEC &&
441 		    ctx->ctx_aid != re->prefix->aid)
442 			continue;
443 		if (ctx->ctx_subtree.aid != AID_UNSPEC) {
444 			struct bgpd_addr addr;
445 			pt_getaddr(re->prefix, &addr);
446 			if (prefix_compare(&ctx->ctx_subtree, &addr,
447 			    ctx->ctx_subtreelen) != 0)
448 				/* left subtree, walk is done */
449 				break;
450 		}
451 		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
452 		    !re_is_locked(re)) {
453 			/* store and lock last element */
454 			ctx->ctx_re = re_lock(re);
455 			return;
456 		}
457 		ctx->ctx_rib_call(re, ctx->ctx_arg);
458 	}
459 
460 	if (ctx->ctx_done)
461 		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
462 	LIST_REMOVE(ctx, entry);
463 	free(ctx);
464 }
465 
466 int
467 rib_dump_pending(void)
468 {
469 	struct rib_context *ctx;
470 
471 	/* return true if at least one context is not throttled */
472 	LIST_FOREACH(ctx, &rib_dumps, entry) {
473 		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
474 			continue;
475 		return 1;
476 	}
477 	return 0;
478 }
479 
480 void
481 rib_dump_runner(void)
482 {
483 	struct rib_context *ctx, *next;
484 
485 	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
486 		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
487 			continue;
488 		if (ctx->ctx_rib_call != NULL)
489 			rib_dump_r(ctx);
490 		else
491 			prefix_dump_r(ctx);
492 	}
493 }
494 
495 static void
496 rib_dump_abort(uint16_t id)
497 {
498 	struct rib_context *ctx, *next;
499 
500 	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
501 		if (id != ctx->ctx_id)
502 			continue;
503 		if (ctx->ctx_done)
504 			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
505 		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
506 			rib_remove(ctx->ctx_re);
507 		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
508 			prefix_adjout_destroy(ctx->ctx_p);
509 		LIST_REMOVE(ctx, entry);
510 		free(ctx);
511 	}
512 }
513 
514 void
515 rib_dump_terminate(void *arg)
516 {
517 	struct rib_context *ctx, *next;
518 
519 	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
520 		if (ctx->ctx_arg != arg)
521 			continue;
522 		if (ctx->ctx_done)
523 			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
524 		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
525 			rib_remove(ctx->ctx_re);
526 		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
527 			prefix_adjout_destroy(ctx->ctx_p);
528 		LIST_REMOVE(ctx, entry);
529 		free(ctx);
530 	}
531 }
532 
533 int
534 rib_dump_new(uint16_t id, uint8_t aid, unsigned int count, void *arg,
535     void (*upcall)(struct rib_entry *, void *), void (*done)(void *, uint8_t),
536     int (*throttle)(void *))
537 {
538 	struct rib_context *ctx;
539 
540 	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
541 		return -1;
542 	ctx->ctx_id = id;
543 	ctx->ctx_aid = aid;
544 	ctx->ctx_count = count;
545 	ctx->ctx_arg = arg;
546 	ctx->ctx_rib_call = upcall;
547 	ctx->ctx_done = done;
548 	ctx->ctx_throttle = throttle;
549 
550 	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
551 
552 	/* requested a sync traversal */
553 	if (count == 0)
554 		rib_dump_r(ctx);
555 
556 	return 0;
557 }
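
/*
 * Usage sketch (hedged; show_upcall and show_done are hypothetical
 * callbacks, not part of this file): start an asynchronous walk over
 * the Loc-RIB that handles at most 64 entries per rib_dump_runner()
 * invocation.
 *
 *	void show_upcall(struct rib_entry *re, void *arg);
 *	void show_done(void *arg, uint8_t aid);
 *
 *	if (rib_dump_new(RIB_LOC_START, AID_UNSPEC, 64, arg,
 *	    show_upcall, show_done, NULL) == -1)
 *		... handle allocation failure ...
 *
 * Passing count == 0 instead performs the whole dump synchronously.
 */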
558 
559 int
560 rib_dump_subtree(uint16_t id, struct bgpd_addr *subtree, uint8_t subtreelen,
561     unsigned int count, void *arg, void (*upcall)(struct rib_entry *, void *),
562     void (*done)(void *, uint8_t), int (*throttle)(void *))
563 {
564 	struct rib_context *ctx;
565 	struct rib_entry xre;
566 
567 	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
568 		return -1;
569 	ctx->ctx_id = id;
570 	ctx->ctx_aid = subtree->aid;
571 	ctx->ctx_count = count;
572 	ctx->ctx_arg = arg;
573 	ctx->ctx_rib_call = upcall;
574 	ctx->ctx_done = done;
575 	ctx->ctx_throttle = throttle;
576 	ctx->ctx_subtree = *subtree;
577 	ctx->ctx_subtreelen = subtreelen;
578 
579 	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
580 
581 	/* lookup start of subtree */
582 	memset(&xre, 0, sizeof(xre));
583 	xre.prefix = pt_fill(subtree, subtreelen);
584 	ctx->ctx_re = RB_NFIND(rib_tree, rib_tree(rib_byid(id)), &xre);
585 	if (ctx->ctx_re)
586 		re_lock(ctx->ctx_re);
587 
588 	/* requested a sync traversal */
589 	if (count == 0)
590 		rib_dump_r(ctx);
591 
592 	return 0;
593 }
594 
595 /* path specific functions */
596 
597 static struct rde_aspath *path_lookup(struct rde_aspath *);
598 static void path_link(struct rde_aspath *);
599 static void path_unlink(struct rde_aspath *);
600 
601 static inline int
602 path_compare(struct rde_aspath *a, struct rde_aspath *b)
603 {
604 	int		 r;
605 
606 	if (a == NULL && b == NULL)
607 		return (0);
608 	else if (b == NULL)
609 		return (1);
610 	else if (a == NULL)
611 		return (-1);
612 	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
613 		return (1);
614 	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
615 		return (-1);
616 	if (a->origin > b->origin)
617 		return (1);
618 	if (a->origin < b->origin)
619 		return (-1);
620 	if (a->med > b->med)
621 		return (1);
622 	if (a->med < b->med)
623 		return (-1);
624 	if (a->lpref > b->lpref)
625 		return (1);
626 	if (a->lpref < b->lpref)
627 		return (-1);
628 	if (a->weight > b->weight)
629 		return (1);
630 	if (a->weight < b->weight)
631 		return (-1);
632 	if (a->rtlabelid > b->rtlabelid)
633 		return (1);
634 	if (a->rtlabelid < b->rtlabelid)
635 		return (-1);
636 	if (a->pftableid > b->pftableid)
637 		return (1);
638 	if (a->pftableid < b->pftableid)
639 		return (-1);
640 
641 	/* no need to check aspa_state or aspa_generation */
642 
643 	r = aspath_compare(a->aspath, b->aspath);
644 	if (r > 0)
645 		return (1);
646 	if (r < 0)
647 		return (-1);
648 
649 	return (attr_compare(a, b));
650 }
651 
652 RB_HEAD(path_tree, rde_aspath)	pathtable = RB_INITIALIZER(&pathtable);
653 RB_GENERATE_STATIC(path_tree, rde_aspath, entry, path_compare);
654 
655 static inline struct rde_aspath *
656 path_ref(struct rde_aspath *asp)
657 {
658 	if ((asp->flags & F_ATTR_LINKED) == 0)
659 		fatalx("%s: unlinked object", __func__);
660 	asp->refcnt++;
661 	rdemem.path_refs++;
662 
663 	return asp;
664 }
665 
666 static inline void
667 path_unref(struct rde_aspath *asp)
668 {
669 	if (asp == NULL)
670 		return;
671 	if ((asp->flags & F_ATTR_LINKED) == 0)
672 		fatalx("%s: unlinked object", __func__);
673 	asp->refcnt--;
674 	rdemem.path_refs--;
675 	if (asp->refcnt <= 0)
676 		path_unlink(asp);
677 }
678 
679 void
680 path_shutdown(void)
681 {
682 	if (!RB_EMPTY(&pathtable))
683 		log_warnx("path_free: free non-free table");
684 }
685 
686 static struct rde_aspath *
687 path_lookup(struct rde_aspath *aspath)
688 {
689 	return (RB_FIND(path_tree, &pathtable, aspath));
690 }
691 
692 /*
693  * Link this aspath into the global table.
694  * The asp must have been allocated with path_get().
695  */
696 static void
697 path_link(struct rde_aspath *asp)
698 {
699 	if (RB_INSERT(path_tree, &pathtable, asp) != NULL)
700 		fatalx("%s: already linked object", __func__);
701 	asp->flags |= F_ATTR_LINKED;
702 }
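
/*
 * Illustrative note: aspaths are interned in the global path_tree, so
 * callers (see prefix_update() below) probe the table first and only
 * create and link a new object on a miss:
 *
 *	if ((asp = path_lookup(nasp)) == NULL) {
 *		asp = path_copy(path_get(), nasp);
 *		path_link(asp);
 *	}
 */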
703 
704 /*
705  * This function can only be called when all prefixes have been removed first.
706  * Normally this happens directly out of the prefix removal functions.
707  */
708 static void
709 path_unlink(struct rde_aspath *asp)
710 {
711 	if (asp == NULL)
712 		return;
713 
714 	/* make sure no reference is held for this rde_aspath */
715 	if (asp->refcnt != 0)
716 		fatalx("%s: still holds references", __func__);
717 
718 	RB_REMOVE(path_tree, &pathtable, asp);
719 	asp->flags &= ~F_ATTR_LINKED;
720 
721 	path_put(asp);
722 }
723 
724 /*
725  * Copy asp to a new UNLINKED aspath.
726  * On dst either path_get() or path_prep() must have been called beforehand.
727  */
728 struct rde_aspath *
729 path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
730 {
731 	dst->aspath = aspath_copy(src->aspath);
732 	dst->refcnt = 0;
733 	dst->flags = src->flags & ~F_ATTR_LINKED;
734 
735 	dst->med = src->med;
736 	dst->lpref = src->lpref;
737 	dst->weight = src->weight;
738 	dst->rtlabelid = rtlabel_ref(src->rtlabelid);
739 	dst->pftableid = pftable_ref(src->pftableid);
740 	dst->origin = src->origin;
741 
742 	attr_copy(dst, src);
743 
744 	return (dst);
745 }
746 
747 /* initialize or prepare an aspath for use */
748 struct rde_aspath *
749 path_prep(struct rde_aspath *asp)
750 {
751 	memset(asp, 0, sizeof(*asp));
752 	asp->origin = ORIGIN_INCOMPLETE;
753 	asp->lpref = DEFAULT_LPREF;
754 
755 	return (asp);
756 }
757 
758 /* alloc and initialize new entry. May not fail. */
759 struct rde_aspath *
760 path_get(void)
761 {
762 	struct rde_aspath *asp;
763 
764 	asp = malloc(sizeof(*asp));
765 	if (asp == NULL)
766 		fatal("path_get");
767 	rdemem.path_cnt++;
768 
769 	return (path_prep(asp));
770 }
771 
772 /* clean up an asp after use (frees all references to sub-objects) */
773 void
774 path_clean(struct rde_aspath *asp)
775 {
776 	if (asp->flags & F_ATTR_LINKED)
777 		fatalx("%s: linked object", __func__);
778 
779 	rtlabel_unref(asp->rtlabelid);
780 	pftable_unref(asp->pftableid);
781 	aspath_put(asp->aspath);
782 	asp->aspath = NULL;
783 	attr_freeall(asp);
784 }
785 
786 /* free an unlinked element */
787 void
788 path_put(struct rde_aspath *asp)
789 {
790 	if (asp == NULL)
791 		return;
792 
793 	path_clean(asp);
794 
795 	rdemem.path_cnt--;
796 	free(asp);
797 }
798 
799 /* prefix specific functions */
800 
801 static int	prefix_add(struct bgpd_addr *, int, struct rib *,
802 		    struct rde_peer *, uint32_t, uint32_t, struct rde_aspath *,
803 		    struct rde_community *, struct nexthop *,
804 		    uint8_t, uint8_t);
805 static int	prefix_move(struct prefix *, struct rde_peer *,
806 		    struct rde_aspath *, struct rde_community *,
807 		    struct nexthop *, uint8_t, uint8_t);
808 
809 static void	prefix_link(struct prefix *, struct rib_entry *,
810 		    struct pt_entry *, struct rde_peer *, uint32_t, uint32_t,
811 		    struct rde_aspath *, struct rde_community *,
812 		    struct nexthop *, uint8_t, uint8_t);
813 static void	prefix_unlink(struct prefix *);
814 
815 static struct prefix	*prefix_alloc(void);
816 static void		 prefix_free(struct prefix *);
817 
818 /* RB tree comparison function */
819 static inline int
820 prefix_index_cmp(struct prefix *a, struct prefix *b)
821 {
822 	int r;
823 	r = pt_prefix_cmp(a->pt, b->pt);
824 	if (r != 0)
825 		return r;
826 
827 	if (a->path_id_tx > b->path_id_tx)
828 		return 1;
829 	if (a->path_id_tx < b->path_id_tx)
830 		return -1;
831 	return 0;
832 }
833 
834 static inline int
835 prefix_cmp(struct prefix *a, struct prefix *b)
836 {
837 	if ((a->flags & PREFIX_FLAG_EOR) != (b->flags & PREFIX_FLAG_EOR))
838 		return (a->flags & PREFIX_FLAG_EOR) ? 1 : -1;
839 	/* if EOR marker no need to check the rest */
840 	if (a->flags & PREFIX_FLAG_EOR)
841 		return 0;
842 
843 	if (a->aspath != b->aspath)
844 		return (a->aspath > b->aspath ? 1 : -1);
845 	if (a->communities != b->communities)
846 		return (a->communities > b->communities ? 1 : -1);
847 	if (a->nexthop != b->nexthop)
848 		return (a->nexthop > b->nexthop ? 1 : -1);
849 	if (a->nhflags != b->nhflags)
850 		return (a->nhflags > b->nhflags ? 1 : -1);
851 	return prefix_index_cmp(a, b);
852 }
853 
854 RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
855 RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)
856 
857 /*
858  * Search for specified prefix of a peer. Returns NULL if not found.
859  */
860 struct prefix *
861 prefix_get(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
862     struct bgpd_addr *prefix, int prefixlen)
863 {
864 	struct rib_entry	*re;
865 
866 	re = rib_get_addr(rib, prefix, prefixlen);
867 	if (re == NULL)
868 		return (NULL);
869 	return (prefix_bypeer(re, peer, path_id));
870 }
871 
872 /*
873  * Search for specified prefix in the peer prefix_index.
874  * Returns NULL if not found.
875  */
876 struct prefix *
877 prefix_adjout_get(struct rde_peer *peer, uint32_t path_id_tx,
878     struct pt_entry *pte)
879 {
880 	struct prefix xp;
881 
882 	memset(&xp, 0, sizeof(xp));
883 	xp.pt = pte;
884 	xp.path_id_tx = path_id_tx;
885 
886 	return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
887 }
888 
889 /*
890  * Lookup a prefix without considering path_id in the peer prefix_index.
891  * Returns NULL if not found.
892  */
893 struct prefix *
894 prefix_adjout_first(struct rde_peer *peer, struct pt_entry *pte)
895 {
896 	struct prefix xp, *np;
897 
898 	memset(&xp, 0, sizeof(xp));
899 	xp.pt = pte;
900 
901 	np = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
902 	if (np == NULL || pt_prefix_cmp(np->pt, xp.pt) != 0)
903 		return NULL;
904 	return np;
905 }
906 
907 /*
908  * Return the next prefix for the same pt_entry as p, or NULL if there is none.
909  */
910 struct prefix *
911 prefix_adjout_next(struct rde_peer *peer, struct prefix *p)
912 {
913 	struct prefix *np;
914 
915 	np = RB_NEXT(prefix_index, &peer->adj_rib_out, p);
916 	if (np == NULL || np->pt != p->pt)
917 		return NULL;
918 	return np;
919 }
920 
921 /*
922  * Lookup addr/prefixlen in the peer prefix_index. Returns first match.
923  * Returns NULL if not found.
924  */
925 struct prefix *
926 prefix_adjout_lookup(struct rde_peer *peer, struct bgpd_addr *addr, int plen)
927 {
928 	return prefix_adjout_first(peer, pt_fill(addr, plen));
929 }
930 
931 /*
932  * Lookup addr in the peer prefix_index. Returns first match.
933  * Returns NULL if not found.
934  */
935 struct prefix *
936 prefix_adjout_match(struct rde_peer *peer, struct bgpd_addr *addr)
937 {
938 	struct prefix *p;
939 	int i;
940 
941 	switch (addr->aid) {
942 	case AID_INET:
943 	case AID_VPN_IPv4:
944 		for (i = 32; i >= 0; i--) {
945 			p = prefix_adjout_lookup(peer, addr, i);
946 			if (p != NULL)
947 				return p;
948 		}
949 		break;
950 	case AID_INET6:
951 	case AID_VPN_IPv6:
952 		for (i = 128; i >= 0; i--) {
953 			p = prefix_adjout_lookup(peer, addr, i);
954 			if (p != NULL)
955 				return p;
956 		}
957 		break;
958 	default:
959 		fatalx("%s: unknown af", __func__);
960 	}
961 	return NULL;
962 }
963 
964 /*
965  * Update a prefix.
966  * Return 1 if prefix was newly added, 0 if it was just changed.
967  */
968 int
969 prefix_update(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
970     uint32_t path_id_tx, struct filterstate *state, struct bgpd_addr *prefix,
971     int prefixlen)
972 {
973 	struct rde_aspath	*asp, *nasp = &state->aspath;
974 	struct rde_community	*comm, *ncomm = &state->communities;
975 	struct prefix		*p;
976 
977 	/*
978 	 * First try to find a prefix in the specified RIB.
979 	 */
980 	if ((p = prefix_get(rib, peer, path_id, prefix, prefixlen)) != NULL) {
981 		if (path_id_tx != p->path_id_tx)
982 			fatalx("path_id mismatch");
983 		if (prefix_nexthop(p) == state->nexthop &&
984 		    prefix_nhflags(p) == state->nhflags &&
985 		    communities_equal(ncomm, prefix_communities(p)) &&
986 		    path_compare(nasp, prefix_aspath(p)) == 0) {
987 			/* no change, update last change */
988 			p->lastchange = getmonotime();
989 			p->validation_state = state->vstate;
990 			return (0);
991 		}
992 	}
993 
994 	/*
995 	 * Either the prefix does not exist or the path changed.
996 	 * In both cases lookup the new aspath to make sure it is not
997 	 * already in the RIB.
998 	 */
999 	if ((asp = path_lookup(nasp)) == NULL) {
1000 		/* Path not available, create and link a new one. */
1001 		asp = path_copy(path_get(), nasp);
1002 		path_link(asp);
1003 	}
1004 
1005 	if ((comm = communities_lookup(ncomm)) == NULL) {
1006 		/* Communities not available, create and link a new one. */
1007 		comm = communities_link(ncomm);
1008 	}
1009 
1010 	/* If the prefix was found move it else add it to the RIB. */
1011 	if (p != NULL)
1012 		return (prefix_move(p, peer, asp, comm, state->nexthop,
1013 		    state->nhflags, state->vstate));
1014 	else
1015 		return (prefix_add(prefix, prefixlen, rib, peer, path_id,
1016 		    path_id_tx, asp, comm, state->nexthop, state->nhflags,
1017 		    state->vstate));
1018 }
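
/*
 * Simplified caller sketch (illustrative; the actual handling lives in
 * rde.c): for each received NLRI the RDE either updates or withdraws
 * the prefix, with state being a struct filterstate that already passed
 * the input filters.
 *
 *	if (withdraw)
 *		prefix_withdraw(rib, peer, path_id, &addr, plen);
 *	else
 *		prefix_update(rib, peer, path_id, path_id_tx, &state,
 *		    &addr, plen);
 */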
1019 
1020 /*
1021  * Adds or updates a prefix.
1022  */
1023 static int
1024 prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
1025     struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
1026     struct rde_aspath *asp, struct rde_community *comm,
1027     struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
1028 {
1029 	struct pt_entry	*pte;
1030 	struct prefix		*p;
1031 	struct rib_entry	*re;
1032 
1033 	pte = pt_get(prefix, prefixlen);
1034 	if (pte == NULL)
1035 		pte = pt_add(prefix, prefixlen);
1036 	re = rib_get(rib, pte);
1037 	if (re == NULL)
1038 		re = rib_add(rib, pte);
1039 
1040 	p = prefix_alloc();
1041 	prefix_link(p, re, re->prefix, peer, path_id, path_id_tx, asp, comm,
1042 	    nexthop, nhflags, vstate);
1043 
1044 	/* add possible pftable reference from aspath */
1045 	if (asp && asp->pftableid)
1046 		rde_pftable_add(asp->pftableid, p);
1047 	/* make route decision */
1048 	prefix_evaluate(re, p, NULL);
1049 	return (1);
1050 }
1051 
1052 /*
1053  * Move the prefix to the specified aspath, removing the old asp if needed.
1054  */
1055 static int
1056 prefix_move(struct prefix *p, struct rde_peer *peer,
1057     struct rde_aspath *asp, struct rde_community *comm,
1058     struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
1059 {
1060 	struct prefix		*np;
1061 
1062 	if (p->flags & PREFIX_FLAG_ADJOUT)
1063 		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);
1064 
1065 	if (peer != prefix_peer(p))
1066 		fatalx("prefix_move: cross peer move");
1067 
1068 	/* add new prefix node */
1069 	np = prefix_alloc();
1070 	prefix_link(np, prefix_re(p), p->pt, peer, p->path_id, p->path_id_tx,
1071 	    asp, comm, nexthop, nhflags, vstate);
1072 
1073 	/* add possible pftable reference from new aspath */
1074 	if (asp && asp->pftableid)
1075 		rde_pftable_add(asp->pftableid, np);
1076 
1077 	/*
1078 	 * no need to update the peer prefix count because we are only moving
1079 	 * the prefix without changing the peer.
1080 	 */
1081 	prefix_evaluate(prefix_re(np), np, p);
1082 
1083 	/* remove possible pftable reference from old path */
1084 	if (p->aspath && p->aspath->pftableid)
1085 		rde_pftable_del(p->aspath->pftableid, p);
1086 
1087 	/* remove old prefix node */
1088 	prefix_unlink(p);
1089 	prefix_free(p);
1090 
1091 	return (0);
1092 }
1093 
1094 /*
1095  * Removes a prefix from the specified RIB. If the parent objects -- rib_entry
1096  * or pt_entry -- become empty remove them too.
1097  */
1098 int
1099 prefix_withdraw(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
1100     struct bgpd_addr *prefix, int prefixlen)
1101 {
1102 	struct prefix		*p;
1103 	struct rde_aspath	*asp;
1104 
1105 	p = prefix_get(rib, peer, path_id, prefix, prefixlen);
1106 	if (p == NULL)		/* Got a dummy withdraw request. */
1107 		return (0);
1108 
1109 	if (p->flags & PREFIX_FLAG_ADJOUT)
1110 		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);
1111 	asp = prefix_aspath(p);
1112 	if (asp && asp->pftableid)
1113 		/* only prefixes in the local RIB were pushed into pf */
1114 		rde_pftable_del(asp->pftableid, p);
1115 
1116 	prefix_destroy(p);
1117 
1118 	return (1);
1119 }
1120 
1121 /*
1122  * Special functions for flowspec until full integration is available.
1123  * This just directly feeds the prefixes into the Adj-RIB-Out bypassing
1124  * Adj-RIB-In and Loc-RIB for now.
1125  */
1126 int
1127 prefix_flowspec_update(struct rde_peer *peer, struct filterstate *state,
1128     struct pt_entry *pte, uint32_t path_id_tx)
1129 {
1130 	struct rde_aspath *asp, *nasp;
1131 	struct rde_community *comm, *ncomm;
1132 	struct rib_entry *re;
1133 	struct prefix *new, *old;
1134 
1135 	re = rib_get(&flowrib, pte);
1136 	if (re == NULL)
1137 		re = rib_add(&flowrib, pte);
1138 
1139 	old = prefix_bypeer(re, peer, 0);
1140 	new = prefix_alloc();
1141 
1142 	nasp = &state->aspath;
1143 	ncomm = &state->communities;
1144 	if ((asp = path_lookup(nasp)) == NULL) {
1145 		/* Path not available, create and link a new one. */
1146 		asp = path_copy(path_get(), nasp);
1147 		path_link(asp);
1148 	}
1149 	if ((comm = communities_lookup(ncomm)) == NULL) {
1150 		/* Communities not available, create and link a new one. */
1151 		comm = communities_link(ncomm);
1152 	}
1153 
1154 	prefix_link(new, re, re->prefix, peer, 0, path_id_tx, asp, comm,
1155 	    NULL, 0, 0);
1156 	TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
1157 
1158 	rde_generate_updates(re, new, old, EVAL_DEFAULT);
1159 
1160 	if (old != NULL) {
1161 		TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);
1162 		prefix_unlink(old);
1163 		prefix_free(old);
1164 		return 0;
1165 	}
1166 	return 1;
1167 }
1168 
1169 /*
1170  * Remove a possible flowspec prefix from all Adj-RIB-Outs.
1171  */
1172 int
1173 prefix_flowspec_withdraw(struct rde_peer *peer, struct pt_entry *pte)
1174 {
1175 	struct rib_entry *re;
1176 	struct prefix *p;
1177 
1178 	re = rib_get(&flowrib, pte);
1179 	if (re == NULL)
1180 		return 0;
1181 	p = prefix_bypeer(re, peer, 0);
1182 	if (p == NULL)
1183 		return 0;
1184 	rde_generate_updates(re, NULL, p, EVAL_DEFAULT);
1185 	TAILQ_REMOVE(&re->prefix_h, p, entry.list.rib);
1186 	prefix_unlink(p);
1187 	prefix_free(p);
1188 	return 1;
1189 }
1190 
1191 /*
1192  * Push all flowspec rules into a newly available Adj-RIB-Out.
1193  */
1194 void
1195 prefix_flowspec_dump(uint8_t aid, void *arg,
1196     void (*call)(struct rib_entry *, void *), void (*done)(void *, uint8_t))
1197 {
1198 	struct rib_entry *re, *next;
1199 
1200 	RB_FOREACH_SAFE(re, rib_tree, rib_tree(&flowrib), next) {
1201 		if (aid != AID_UNSPEC && aid != re->prefix->aid)
1202 			continue;
1203 		call(re, arg);
1204 	}
1205 	if (done != NULL)
1206 		done(arg, aid);
1207 }
1208 
1209 /*
1210  * Insert an End-of-RIB marker into the update queue.
1211  */
1212 void
1213 prefix_add_eor(struct rde_peer *peer, uint8_t aid)
1214 {
1215 	struct prefix *p;
1216 
1217 	p = prefix_alloc();
1218 	p->flags = PREFIX_FLAG_ADJOUT | PREFIX_FLAG_UPDATE | PREFIX_FLAG_EOR;
1219 	if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
1220 		/* no need to add if EoR marker already present */
1221 		prefix_free(p);
1222 	/* EOR marker is not inserted into the adj_rib_out index */
1223 }
1224 
1225 /*
1226  * Put a prefix from the Adj-RIB-Out onto the update queue.
1227  */
1228 void
1229 prefix_adjout_update(struct prefix *p, struct rde_peer *peer,
1230     struct filterstate *state, struct pt_entry *pte, uint32_t path_id_tx)
1231 {
1232 	struct rde_aspath *asp;
1233 	struct rde_community *comm;
1234 
1235 	if (p == NULL) {
1236 		p = prefix_alloc();
1237 		/* initially mark DEAD so code below is skipped */
1238 		p->flags |= PREFIX_FLAG_ADJOUT | PREFIX_FLAG_DEAD;
1239 
1240 		p->pt = pt_ref(pte);
1241 		p->peer = peer;
1242 		p->path_id_tx = path_id_tx;
1243 
1244 		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
1245 			fatalx("%s: RB index invariant violated", __func__);
1246 	}
1247 
1248 	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
1249 		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
1250 	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
1251 		/*
1252 		 * XXX for now treat a different path_id_tx like different
1253 		 * attributes and force out an update. It is unclear how
1254 		 * common it is to have equivalent updates from alternative
1255 		 * paths.
1256 		 */
1257 		if (p->path_id_tx == path_id_tx &&
1258 		    prefix_nhflags(p) == state->nhflags &&
1259 		    prefix_nexthop(p) == state->nexthop &&
1260 		    communities_equal(&state->communities,
1261 		    prefix_communities(p)) &&
1262 		    path_compare(&state->aspath, prefix_aspath(p)) == 0) {
1263 			/* nothing changed */
1264 			p->validation_state = state->vstate;
1265 			p->lastchange = getmonotime();
1266 			p->flags &= ~PREFIX_FLAG_STALE;
1267 			return;
1268 		}
1269 
1270 		/* if pending update unhook it before it is unlinked */
1271 		if (p->flags & PREFIX_FLAG_UPDATE) {
1272 			RB_REMOVE(prefix_tree, &peer->updates[pte->aid], p);
1273 			peer->stats.pending_update--;
1274 		}
1275 
1276 		/* unlink prefix so it can be relinked below */
1277 		prefix_unlink(p);
1278 		peer->stats.prefix_out_cnt--;
1279 	}
1280 	if (p->flags & PREFIX_FLAG_WITHDRAW) {
1281 		RB_REMOVE(prefix_tree, &peer->withdraws[pte->aid], p);
1282 		peer->stats.pending_withdraw--;
1283 	}
1284 
1285 	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
1286 	p->flags &= ~PREFIX_FLAG_MASK;
1287 
1288 	/* update path_id_tx now that the prefix is unlinked */
1289 	if (p->path_id_tx != path_id_tx) {
1290 		/* path_id_tx is part of the index so remove and re-insert p */
1291 		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
1292 		p->path_id_tx = path_id_tx;
1293 		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
1294 			fatalx("%s: RB index invariant violated", __func__);
1295 	}
1296 
1297 	if ((asp = path_lookup(&state->aspath)) == NULL) {
1298 		/* Path not available, create and link a new one. */
1299 		asp = path_copy(path_get(), &state->aspath);
1300 		path_link(asp);
1301 	}
1302 
1303 	if ((comm = communities_lookup(&state->communities)) == NULL) {
1304 		/* Communities not available, create and link a new one. */
1305 		comm = communities_link(&state->communities);
1306 	}
1307 
1308 	prefix_link(p, NULL, p->pt, peer, 0, p->path_id_tx, asp, comm,
1309 	    state->nexthop, state->nhflags, state->vstate);
1310 	peer->stats.prefix_out_cnt++;
1311 
1312 	if (p->flags & PREFIX_FLAG_MASK)
1313 		fatalx("%s: bad flags %x", __func__, p->flags);
1314 	p->flags |= PREFIX_FLAG_UPDATE;
1315 	if (RB_INSERT(prefix_tree, &peer->updates[pte->aid], p) != NULL)
1316 		fatalx("%s: RB tree invariant violated", __func__);
1317 	peer->stats.pending_update++;
1318 }
1319 
1320 /*
1321  * Withdraw a prefix from the Adj-RIB-Out, this unlinks the aspath but leaves
1322  * the prefix in the RIB linked to the peer withdraw list.
1323  */
1324 void
1325 prefix_adjout_withdraw(struct prefix *p)
1326 {
1327 	struct rde_peer *peer = prefix_peer(p);
1328 
1329 	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
1330 		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
1331 
1332 	/* already a withdraw, shortcut */
1333 	if (p->flags & PREFIX_FLAG_WITHDRAW) {
1334 		p->lastchange = getmonotime();
1335 		p->flags &= ~PREFIX_FLAG_STALE;
1336 		return;
1337 	}
1338 	/* pending update just got withdrawn */
1339 	if (p->flags & PREFIX_FLAG_UPDATE) {
1340 		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
1341 		peer->stats.pending_update--;
1342 	}
1343 	/* unlink prefix if it was linked (not a withdraw or dead) */
1344 	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
1345 		prefix_unlink(p);
1346 		peer->stats.prefix_out_cnt--;
1347 	}
1348 
1349 	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
1350 	p->flags &= ~PREFIX_FLAG_MASK;
1351 	p->lastchange = getmonotime();
1352 
1353 	p->flags |= PREFIX_FLAG_WITHDRAW;
1354 	if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid], p) != NULL)
1355 		fatalx("%s: RB tree invariant violated", __func__);
1356 	peer->stats.pending_withdraw++;
1357 }
1358 
1359 void
1360 prefix_adjout_destroy(struct prefix *p)
1361 {
1362 	struct rde_peer *peer = prefix_peer(p);
1363 
1364 	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
1365 		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
1366 
1367 	if (p->flags & PREFIX_FLAG_EOR) {
1368 		/* EOR marker is not linked in the index */
1369 		prefix_free(p);
1370 		return;
1371 	}
1372 
1373 	if (p->flags & PREFIX_FLAG_WITHDRAW) {
1374 		RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
1375 		peer->stats.pending_withdraw--;
1376 	}
1377 	if (p->flags & PREFIX_FLAG_UPDATE) {
1378 		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
1379 		peer->stats.pending_update--;
1380 	}
1381 	/* unlink prefix if it was linked (not a withdraw or dead) */
1382 	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
1383 		prefix_unlink(p);
1384 		peer->stats.prefix_out_cnt--;
1385 	}
1386 
1387 	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
1388 	p->flags &= ~PREFIX_FLAG_MASK;
1389 
1390 	if (prefix_is_locked(p)) {
1391 		/* mark prefix dead but leave it for prefix_restart */
1392 		p->flags |= PREFIX_FLAG_DEAD;
1393 	} else {
1394 		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
1395 		/* remove the last prefix reference before free */
1396 		pt_unref(p->pt);
1397 		prefix_free(p);
1398 	}
1399 }
1400 
1401 static struct prefix *
1402 prefix_restart(struct rib_context *ctx)
1403 {
1404 	struct prefix *p = NULL;
1405 
1406 	if (ctx->ctx_p)
1407 		p = prefix_unlock(ctx->ctx_p);
1408 
1409 	if (p && prefix_is_dead(p)) {
1410 		struct prefix *next;
1411 
1412 		next = RB_NEXT(prefix_index, unused, p);
1413 		prefix_adjout_destroy(p);
1414 		p = next;
1415 	}
1416 	ctx->ctx_p = NULL;
1417 	return p;
1418 }
1419 
1420 static void
1421 prefix_dump_r(struct rib_context *ctx)
1422 {
1423 	struct prefix *p, *next;
1424 	struct rde_peer *peer;
1425 	unsigned int i;
1426 
1427 	if ((peer = peer_get(ctx->ctx_id)) == NULL)
1428 		goto done;
1429 
1430 	if (ctx->ctx_p == NULL && ctx->ctx_subtree.aid == AID_UNSPEC)
1431 		p = RB_MIN(prefix_index, &peer->adj_rib_out);
1432 	else
1433 		p = prefix_restart(ctx);
1434 
1435 	for (i = 0; p != NULL; p = next) {
1436 		next = RB_NEXT(prefix_index, unused, p);
1437 		if (prefix_is_dead(p))
1438 			continue;
1439 		if (ctx->ctx_aid != AID_UNSPEC &&
1440 		    ctx->ctx_aid != p->pt->aid)
1441 			continue;
1442 		if (ctx->ctx_subtree.aid != AID_UNSPEC) {
1443 			struct bgpd_addr addr;
1444 			pt_getaddr(p->pt, &addr);
1445 			if (prefix_compare(&ctx->ctx_subtree, &addr,
1446 			    ctx->ctx_subtreelen) != 0)
1447 				/* left subtree, walk is done */
1448 				break;
1449 		}
1450 		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
1451 		    !prefix_is_locked(p)) {
1452 			/* store and lock last element */
1453 			ctx->ctx_p = prefix_lock(p);
1454 			return;
1455 		}
1456 		ctx->ctx_prefix_call(p, ctx->ctx_arg);
1457 	}
1458 
1459 done:
1460 	if (ctx->ctx_done)
1461 		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
1462 	LIST_REMOVE(ctx, entry);
1463 	free(ctx);
1464 }
1465 
1466 int
1467 prefix_dump_new(struct rde_peer *peer, uint8_t aid, unsigned int count,
1468     void *arg, void (*upcall)(struct prefix *, void *),
1469     void (*done)(void *, uint8_t), int (*throttle)(void *))
1470 {
1471 	struct rib_context *ctx;
1472 
1473 	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
1474 		return -1;
1475 	ctx->ctx_id = peer->conf.id;
1476 	ctx->ctx_aid = aid;
1477 	ctx->ctx_count = count;
1478 	ctx->ctx_arg = arg;
1479 	ctx->ctx_prefix_call = upcall;
1480 	ctx->ctx_done = done;
1481 	ctx->ctx_throttle = throttle;
1482 
1483 	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
1484 
1485 	/* requested a sync traversal */
1486 	if (count == 0)
1487 		prefix_dump_r(ctx);
1488 
1489 	return 0;
1490 }
1491 
1492 int
1493 prefix_dump_subtree(struct rde_peer *peer, struct bgpd_addr *subtree,
1494     uint8_t subtreelen, unsigned int count, void *arg,
1495     void (*upcall)(struct prefix *, void *), void (*done)(void *, uint8_t),
1496     int (*throttle)(void *))
1497 {
1498 	struct rib_context *ctx;
1499 	struct prefix xp;
1500 
1501 	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
1502 		return -1;
1503 	ctx->ctx_id = peer->conf.id;
1504 	ctx->ctx_aid = subtree->aid;
1505 	ctx->ctx_count = count;
1506 	ctx->ctx_arg = arg;
1507 	ctx->ctx_prefix_call = upcall;
1508 	ctx->ctx_done = done;
1509 	ctx->ctx_throttle = throttle;
1510 	ctx->ctx_subtree = *subtree;
1511 	ctx->ctx_subtreelen = subtreelen;
1512 
1513 	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
1514 
1515 	/* lookup start of subtree */
1516 	memset(&xp, 0, sizeof(xp));
1517 	xp.pt = pt_fill(subtree, subtreelen);
1518 	ctx->ctx_p = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
1519 	if (ctx->ctx_p)
1520 		prefix_lock(ctx->ctx_p);
1521 
1522 	/* requested a sync traversal */
1523 	if (count == 0)
1524 		prefix_dump_r(ctx);
1525 
1526 	return 0;
1527 }
1528 
1529 /*
1530  * Search the prefix list of the specified rib_entry for a prefix entry
1531  * belonging to the given peer. Returns NULL if no match is found.
1532  */
1533 struct prefix *
1534 prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, uint32_t path_id)
1535 {
1536 	struct prefix	*p;
1537 
1538 	TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
1539 		if (prefix_peer(p) == peer && p->path_id == path_id)
1540 			return (p);
1541 	return (NULL);
1542 }
1543 
1544 /* kill a prefix. */
1545 void
1546 prefix_destroy(struct prefix *p)
1547 {
1548 	/* make route decision */
1549 	prefix_evaluate(prefix_re(p), NULL, p);
1550 
1551 	prefix_unlink(p);
1552 	prefix_free(p);
1553 }
1554 
1555 /*
1556  * Link a prefix into the different parent objects.
1557  */
1558 static void
1559 prefix_link(struct prefix *p, struct rib_entry *re, struct pt_entry *pt,
1560     struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
1561     struct rde_aspath *asp, struct rde_community *comm,
1562     struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
1563 {
1564 	if (re)
1565 		p->entry.list.re = re;
1566 	p->aspath = path_ref(asp);
1567 	p->communities = communities_ref(comm);
1568 	p->peer = peer;
1569 	p->pt = pt_ref(pt);
1570 	p->path_id = path_id;
1571 	p->path_id_tx = path_id_tx;
1572 	p->validation_state = vstate;
1573 	p->nhflags = nhflags;
1574 	p->nexthop = nexthop_ref(nexthop);
1575 	nexthop_link(p);
1576 	p->lastchange = getmonotime();
1577 }
1578 
1579 /*
1580  * Unlink a prefix from the different parent objects.
1581  */
1582 static void
1583 prefix_unlink(struct prefix *p)
1584 {
1585 	struct rib_entry	*re = prefix_re(p);
1586 
1587 	/* destroy all references to other objects */
1588 	/* remove nexthop ref ... */
1589 	nexthop_unlink(p);
1590 	nexthop_unref(p->nexthop);
1591 	p->nexthop = NULL;
1592 	p->nhflags = 0;
1593 	/* ... communities ... */
1594 	communities_unref(p->communities);
1595 	p->communities = NULL;
1596 	/* and unlink from aspath */
1597 	path_unref(p->aspath);
1598 	p->aspath = NULL;
1599 
1600 	if (re && rib_empty(re))
1601 		rib_remove(re);
1602 
1603 	pt_unref(p->pt);
1604 }
1605 
1606 /* alloc and zero new entry. May not fail. */
1607 static struct prefix *
1608 prefix_alloc(void)
1609 {
1610 	struct prefix *p;
1611 
1612 	p = calloc(1, sizeof(*p));
1613 	if (p == NULL)
1614 		fatal("prefix_alloc");
1615 	rdemem.prefix_cnt++;
1616 	return p;
1617 }
1618 
1619 /* free an unlinked entry */
1620 static void
1621 prefix_free(struct prefix *p)
1622 {
1623 	rdemem.prefix_cnt--;
1624 	free(p);
1625 }
1626 
1627 /*
1628  * nexthop functions
1629  */
1630 struct nexthop		*nexthop_lookup(struct bgpd_addr *);
1631 
1632 /*
1633  * In BGP there exist two nexthops: the exit nexthop which was announced via
1634  * BGP and the true nexthop which is used in the FIB -- forwarding information
1635  * base, a.k.a. the kernel routing table. When sending updates it is even more
1636  * confusing. In IBGP we pass the unmodified exit nexthop to the neighbors
1637  * while in EBGP normally the address of the router is sent. The exit nexthop
1638  * may be passed to the external neighbor if the neighbor and the exit nexthop
1639  * reside in the same subnet -- directly connected.
1640  */
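
/*
 * Worked example (editorial note): if a peer announces a prefix with
 * exit nexthop 192.0.2.254, an iBGP neighbor receives 192.0.2.254
 * unchanged, while an eBGP neighbor normally receives our own address
 * instead -- unless that neighbor also sits on 192.0.2.0/24, in which
 * case the exit nexthop may be passed along (shared subnet, directly
 * connected).
 */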
1641 
1642 TAILQ_HEAD(nexthop_queue, nexthop)	nexthop_runners =
1643 				    TAILQ_HEAD_INITIALIZER(nexthop_runners);
1644 
1645 RB_HEAD(nexthop_tree, nexthop)		nexthoptable =
1646 					    RB_INITIALIZER(&nexthoptree);
1647 
1648 static inline int nexthop_cmp(struct nexthop *, struct nexthop *);
1649 
1650 RB_GENERATE_STATIC(nexthop_tree, nexthop, entry, nexthop_cmp);
1651 
1652 void
1653 nexthop_shutdown(void)
1654 {
1655 	struct nexthop		*nh, *nnh;
1656 
1657 	RB_FOREACH_SAFE(nh, nexthop_tree, &nexthoptable, nnh) {
1658 		nh->state = NEXTHOP_UNREACH;
1659 		nexthop_unref(nh);
1660 	}
1661 
1662 	RB_FOREACH(nh, nexthop_tree, &nexthoptable) {
1663 		log_warnx("%s: nexthop %s refcnt %d", __func__,
1664 		    log_addr(&nh->exit_nexthop), nh->refcnt);
1665 	}
1666 }
1667 
1668 int
1669 nexthop_pending(void)
1670 {
1671 	return !TAILQ_EMPTY(&nexthop_runners);
1672 }
1673 
1674 void
1675 nexthop_runner(void)
1676 {
1677 	struct nexthop *nh;
1678 	struct prefix *p;
1679 	uint32_t j;
1680 
1681 	nh = TAILQ_FIRST(&nexthop_runners);
1682 	if (nh == NULL)
1683 		return;
1684 
1685 	/* remove from runner queue */
1686 	TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
1687 
1688 	p = nh->next_prefix;
1689 	for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
1690 		prefix_evaluate_nexthop(p, nh->state, nh->oldstate);
1691 		p = LIST_NEXT(p, entry.list.nexthop);
1692 	}
1693 
1694 	/* prep for next run; if not finished, re-add to tail of queue */
1695 	nh->next_prefix = p;
1696 	if (p != NULL)
1697 		TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
1698 	else
1699 		log_debug("nexthop %s update finished",
1700 		    log_addr(&nh->exit_nexthop));
1701 }
1702 
1703 void
1704 nexthop_update(struct kroute_nexthop *msg)
1705 {
1706 	struct nexthop		*nh;
1707 
1708 	nh = nexthop_lookup(&msg->nexthop);
1709 	if (nh == NULL) {
1710 		log_warnx("nexthop_update: non-existent nexthop %s",
1711 		    log_addr(&msg->nexthop));
1712 		return;
1713 	}
1714 
1715 	nh->oldstate = nh->state;
1716 	if (msg->valid)
1717 		nh->state = NEXTHOP_REACH;
1718 	else
1719 		nh->state = NEXTHOP_UNREACH;
1720 
1721 	if (nh->oldstate == NEXTHOP_LOOKUP)
1722 		/* drop reference which was held during the lookup */
1723 		if (nexthop_unref(nh))
1724 			return;		/* nh lost last ref, no work left */
1725 
1726 	if (nh->next_prefix) {
1727 		/*
1728 		 * If nexthop_runner() is not finished with this nexthop
1729 		 * then ensure that all prefixes are updated by setting
1730 		 * the oldstate to NEXTHOP_FLAPPED.
1731 		 */
1732 		nh->oldstate = NEXTHOP_FLAPPED;
1733 		TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
1734 	}
1735 
1736 	if (msg->connected)
1737 		nh->flags |= NEXTHOP_CONNECTED;
1738 
1739 	nh->true_nexthop = msg->gateway;
1740 	nh->nexthop_net = msg->net;
1741 	nh->nexthop_netlen = msg->netlen;
1742 
1743 	nh->next_prefix = LIST_FIRST(&nh->prefix_h);
1744 	if (nh->next_prefix != NULL) {
1745 		TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
1746 		log_debug("nexthop %s update starting",
1747 		    log_addr(&nh->exit_nexthop));
1748 	}
1749 }
1750 
1751 void
1752 nexthop_modify(struct nexthop *setnh, enum action_types type, uint8_t aid,
1753     struct nexthop **nexthop, uint8_t *flags)
1754 {
1755 	switch (type) {
1756 	case ACTION_SET_NEXTHOP_REJECT:
1757 		*flags = NEXTHOP_REJECT;
1758 		break;
1759 	case ACTION_SET_NEXTHOP_BLACKHOLE:
1760 		*flags = NEXTHOP_BLACKHOLE;
1761 		break;
1762 	case ACTION_SET_NEXTHOP_NOMODIFY:
1763 		*flags = NEXTHOP_NOMODIFY;
1764 		break;
1765 	case ACTION_SET_NEXTHOP_SELF:
1766 		*flags = NEXTHOP_SELF;
1767 		break;
1768 	case ACTION_SET_NEXTHOP_REF:
1769 		/*
1770 		 * it is possible that a prefix matches but has the wrong
1771 		 * address family for the set nexthop. In this case ignore it.
1772 		 */
1773 		if (aid != setnh->exit_nexthop.aid)
1774 			break;
1775 		nexthop_unref(*nexthop);
1776 		*nexthop = nexthop_ref(setnh);
1777 		*flags = 0;
1778 		break;
1779 	default:
1780 		break;
1781 	}
1782 }
1783 
1784 void
1785 nexthop_link(struct prefix *p)
1786 {
1787 	p->nhflags &= ~NEXTHOP_VALID;
1788 
1789 	if (p->flags & PREFIX_FLAG_ADJOUT) {
1790 		/* All nexthops are valid in Adj-RIB-Out */
1791 		p->nhflags |= NEXTHOP_VALID;
1792 		return;
1793 	}
1794 
1795 	/* no need to link prefixes in RIBs that have no decision process */
1796 	if (re_rib(prefix_re(p))->flags & F_RIB_NOEVALUATE)
1797 		return;
1798 
1799 	/* self-announce networks use nexthop NULL and are valid as well */
1800 	if (p->nexthop == NULL || p->nexthop->state == NEXTHOP_REACH)
1801 		p->nhflags |= NEXTHOP_VALID;
1802 
1803 	if (p->nexthop == NULL)
1804 		return;
1805 	p->flags |= PREFIX_NEXTHOP_LINKED;
1806 	LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
1807 }
1808 
1809 void
1810 nexthop_unlink(struct prefix *p)
1811 {
1812 	p->nhflags &= ~NEXTHOP_VALID;
1813 
1814 	if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
1815 		return;
1816 
1817 	if (p == p->nexthop->next_prefix) {
1818 		p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
1819 		/* remove nexthop from list if no prefixes left to update */
1820 		if (p->nexthop->next_prefix == NULL) {
1821 			TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l);
1822 			log_debug("nexthop %s update finished",
1823 			    log_addr(&p->nexthop->exit_nexthop));
1824 		}
1825 	}
1826 
1827 	p->flags &= ~PREFIX_NEXTHOP_LINKED;
1828 	LIST_REMOVE(p, entry.list.nexthop);
1829 }
1830 
1831 struct nexthop *
1832 nexthop_get(struct bgpd_addr *nexthop)
1833 {
1834 	struct nexthop	*nh;
1835 
1836 	nh = nexthop_lookup(nexthop);
1837 	if (nh == NULL) {
1838 		nh = calloc(1, sizeof(*nh));
1839 		if (nh == NULL)
1840 			fatal("nexthop_get");
1841 		rdemem.nexthop_cnt++;
1842 
1843 		LIST_INIT(&nh->prefix_h);
1844 		nh->state = NEXTHOP_LOOKUP;
1845 		nexthop_ref(nh);	/* take reference for lookup */
1846 		nh->exit_nexthop = *nexthop;
1847 		if (RB_INSERT(nexthop_tree, &nexthoptable, nh) != NULL)
1848 			fatalx("nexthop tree inconsistency");
1849 
1850 		rde_send_nexthop(&nh->exit_nexthop, 1);
1851 	}
1852 
1853 	return nexthop_ref(nh);
1854 }
1855 
1856 struct nexthop *
1857 nexthop_ref(struct nexthop *nexthop)
1858 {
1859 	if (nexthop)
1860 		nexthop->refcnt++;
1861 	return (nexthop);
1862 }
1863 
1864 int
1865 nexthop_unref(struct nexthop *nh)
1866 {
1867 	if (nh == NULL)
1868 		return (0);
1869 	if (--nh->refcnt > 0)
1870 		return (0);
1871 
1872 	/* sanity check */
1873 	if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
1874 		fatalx("%s: refcnt error", __func__);
1875 
1876 	/* is nexthop update running? impossible, that is a refcnt error */
1877 	if (nh->next_prefix)
1878 		fatalx("%s: next_prefix not NULL", __func__);
1879 
1880 	RB_REMOVE(nexthop_tree, &nexthoptable, nh);
1881 	rde_send_nexthop(&nh->exit_nexthop, 0);
1882 
1883 	rdemem.nexthop_cnt--;
1884 	free(nh);
1885 	return (1);
1886 }
1887 
1888 static inline int
1889 nexthop_cmp(struct nexthop *na, struct nexthop *nb)
1890 {
1891 	struct bgpd_addr	*a, *b;
1892 
1893 	if (na == nb)
1894 		return (0);
1895 	if (na == NULL)
1896 		return (-1);
1897 	if (nb == NULL)
1898 		return (1);
1899 
1900 	a = &na->exit_nexthop;
1901 	b = &nb->exit_nexthop;
1902 
1903 	if (a->aid != b->aid)
1904 		return (a->aid - b->aid);
1905 
1906 	switch (a->aid) {
1907 	case AID_INET:
1908 		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
1909 			return (1);
1910 		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
1911 			return (-1);
1912 		return (0);
1913 	case AID_INET6:
1914 		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
1915 	default:
1916 		fatalx("nexthop_cmp: %s is unsupported", aid2str(a->aid));
1917 	}
1918 	return (-1);
1919 }
1920 
1921 struct nexthop *
1922 nexthop_lookup(struct bgpd_addr *nexthop)
1923 {
1924 	struct nexthop	needle = { 0 };
1925 
1926 	needle.exit_nexthop = *nexthop;
1927 	return RB_FIND(nexthop_tree, &nexthoptable, &needle);
1928 }
1929