/*	$OpenBSD: rde_rib.c,v 1.263 2024/08/14 19:09:51 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- actually, update speed.
 * Therefore one thing needs to be absolutely avoided: long table walks.
 * This is achieved by heavily linking the different parts together.
 */
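
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * because a rib_entry reached through the RB tree carries its prefix
 * list directly, a consumer never needs a secondary table walk.  The
 * callback below is hypothetical.
 */
#if 0
static void
example_show_entry(struct rib_entry *re, void *arg)
{
	struct prefix *p;

	/* every path for this destination hangs directly off the entry */
	TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
		log_debug("path_id %u from peer %u", p->path_id,
		    prefix_peer(p)->conf.id);
}
#endif
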
uint16_t rib_size;
struct rib **ribs;
struct rib flowrib = { .id = 1, .tree = RB_INITIALIZER(&flowrib.tree) };

struct rib_entry *rib_add(struct rib *, struct pt_entry *);
static inline int rib_compare(const struct rib_entry *,
			const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
static void rib_dump_abort(uint16_t);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

struct rib_context {
	LIST_ENTRY(rib_context)		 entry;
	struct rib_entry		*ctx_re;
	struct prefix			*ctx_p;
	uint32_t			 ctx_id;
	void		(*ctx_rib_call)(struct rib_entry *, void *);
	void		(*ctx_prefix_call)(struct prefix *, void *);
	void		(*ctx_done)(void *, uint8_t);
	int		(*ctx_throttle)(void *);
	void				*ctx_arg;
	struct bgpd_addr		 ctx_subtree;
	unsigned int			 ctx_count;
	uint8_t				 ctx_aid;
	uint8_t				 ctx_subtreelen;
};
LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);

static void	prefix_dump_r(struct rib_context *);

static inline struct rib_entry *
re_lock(struct rib_entry *re)
{
	if (re->lock != 0)
		log_warnx("%s: entry already locked", __func__);
	re->lock = 1;
	return re;
}

static inline struct rib_entry *
re_unlock(struct rib_entry *re)
{
	if (re->lock == 0)
		log_warnx("%s: entry already unlocked", __func__);
	re->lock = 0;
	return re;
}

static inline int
re_is_locked(struct rib_entry *re)
{
	return (re->lock != 0);
}

static inline struct prefix *
prefix_lock(struct prefix *p)
{
	if (p->flags & PREFIX_FLAG_LOCKED)
		fatalx("%s: locking locked prefix", __func__);
	p->flags |= PREFIX_FLAG_LOCKED;
	return p;
}

static inline struct prefix *
prefix_unlock(struct prefix *p)
{
	if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
		fatalx("%s: unlocking unlocked prefix", __func__);
	p->flags &= ~PREFIX_FLAG_LOCKED;
	return p;
}

static inline int
prefix_is_locked(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_LOCKED) != 0;
}

static inline int
prefix_is_dead(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_DEAD) != 0;
}

static inline struct rib_tree *
rib_tree(struct rib *rib)
{
	return (&rib->tree);
}

static inline int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

/* RIB specific functions */
struct rib *
rib_new(char *name, u_int rtableid, uint16_t flags)
{
	struct rib *new;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] == NULL)
			break;
	}

	if (id >= rib_size) {
		if ((ribs = recallocarray(ribs, id, id + 8,
		    sizeof(struct rib *))) == NULL)
			fatal(NULL);
		rib_size = id + 8;
	}

	if ((new = calloc(1, sizeof(*new))) == NULL)
		fatal(NULL);

	strlcpy(new->name, name, sizeof(new->name));
	RB_INIT(rib_tree(new));
	new->state = RECONF_REINIT;
	new->id = id;
	new->flags = flags;
	new->rtableid = rtableid;

	new->in_rules = calloc(1, sizeof(struct filter_head));
	if (new->in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(new->in_rules);

	ribs[id] = new;

	log_debug("%s: %s -> %u", __func__, name, id);
	return (new);
}

176 
177 /*
178  * This function is only called when the FIB information of a RIB changed.
179  * It will flush the FIB if there was one previously and change the fibstate
180  * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
181  * or RECONF_REINIT (rerun the route decision process for every element)
182  * depending on the new flags.
183  */
184 int
rib_update(struct rib * rib)185 rib_update(struct rib *rib)
186 {
187 	/* flush fib first if there was one */
188 	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
189 		rde_send_kroute_flush(rib);
190 
191 	/* if no evaluate changes then a full reinit is needed */
192 	if ((rib->flags & F_RIB_NOEVALUATE) !=
193 	    (rib->flags_tmp & F_RIB_NOEVALUATE))
194 		rib->fibstate = RECONF_REINIT;
195 
196 	rib->flags = rib->flags_tmp;
197 	rib->rtableid = rib->rtableid_tmp;
198 
199 	/* reload fib if there is no reinit pending and there will be a fib */
200 	if (rib->fibstate != RECONF_REINIT &&
201 	    (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
202 		rib->fibstate = RECONF_RELOAD;
203 
204 	return (rib->fibstate == RECONF_REINIT);
205 }
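
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * a hypothetical reconfigure step stages the new settings in the _tmp
 * fields and lets rib_update() pick between a full re-evaluation and a
 * plain FIB reload.
 */
#if 0
static void
example_reconf_rib(struct rib *rib, uint16_t newflags, u_int newrtableid)
{
	rib->flags_tmp = newflags;
	rib->rtableid_tmp = newrtableid;

	if (rib_update(rib))
		log_debug("%s: full reinit needed", rib->name);
	else if (rib->fibstate == RECONF_RELOAD)
		log_debug("%s: fib reload scheduled", rib->name);
}
#endif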

struct rib *
rib_byid(uint16_t id)
{
	if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
		return NULL;
	return ribs[id];
}

uint16_t
rib_find(char *name)
{
	uint16_t id;

	/* no name returns the first Loc-RIB */
	if (name == NULL || *name == '\0')
		return RIB_LOC_START;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
			return id;
	}

	return RIB_NOTFOUND;
}

void
rib_free(struct rib *rib)
{
	struct rib_entry *re, *xre;
	struct prefix *p;

	rib_dump_abort(rib->id);

	/*
	 * flush the rib, disable route evaluation and fib sync to speed up
	 * the prefix removal. Nothing depends on this data anymore.
	 */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);
	rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;

	for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, rib_tree(rib), re);

		/*
		 * Removing the prefixes is tricky: the last one removes
		 * the rib_entry as well, and since prefix_destroy() does
		 * an empty check, the usual for loop cannot be used.
		 */
		while ((p = TAILQ_FIRST(&re->prefix_h))) {
			struct rde_aspath *asp = prefix_aspath(p);
			if (asp && asp->pftableid)
				rde_pftable_del(asp->pftableid, p);
			prefix_destroy(p);
		}
	}
	if (rib->id <= RIB_LOC_START)
		return; /* never remove the default ribs */
	filterlist_free(rib->in_rules_tmp);
	filterlist_free(rib->in_rules);
	ribs[rib->id] = NULL;
	free(rib);
}

void
rib_shutdown(void)
{
	struct rib *rib;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		if (!RB_EMPTY(rib_tree(ribs[id]))) {
			log_warnx("%s: rib %s is not empty", __func__,
			    ribs[id]->name);
		}
		rib_free(ribs[id]);
	}
	for (id = 0; id <= RIB_LOC_START; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		filterlist_free(rib->in_rules_tmp);
		filterlist_free(rib->in_rules);
		ribs[id] = NULL;
		free(rib);
	}
	free(ribs);
}

struct rib_entry *
rib_get(struct rib *rib, struct pt_entry *pte)
{
	struct rib_entry xre, *re;

	memset(&xre, 0, sizeof(xre));
	xre.prefix = pte;

	re = RB_FIND(rib_tree, rib_tree(rib), &xre);
	if (re && re->rib_id != rib->id)
		fatalx("%s: Unexpected RIB %u != %u.", __func__,
		    re->rib_id, rib->id);
	return re;
}

struct rib_entry *
rib_get_addr(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	return rib_get(rib, pt_fill(prefix, prefixlen));
}

struct rib_entry *
rib_match(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int		 i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get_addr(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			re = rib_get_addr(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return (NULL);
}
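
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * rib_match() above performs a longest-prefix match by probing ever
 * shorter prefix lengths, down to a default route if one exists.
 */
#if 0
static void
example_lookup(struct rib *rib, struct bgpd_addr *dest)
{
	struct rib_entry *re;
	struct bgpd_addr addr;

	if ((re = rib_match(rib, dest)) == NULL) {
		log_debug("no covering prefix for %s", log_addr(dest));
		return;
	}
	pt_getaddr(re->prefix, &addr);
	log_debug("%s matched by %s", log_addr(dest), log_addr(&addr));
}
#endif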

struct rib_entry *
rib_add(struct rib *rib, struct pt_entry *pte)
{
	struct rib_entry *re;

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	TAILQ_INIT(&re->prefix_h);
	re->prefix = pt_ref(pte);
	re->rib_id = rib->id;

	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re_is_locked(re))
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);

	if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return TAILQ_EMPTY(&re->prefix_h);
}

static struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re = NULL;

	if (ctx->ctx_re)
		re = re_unlock(ctx->ctx_re);

	/* find the first non-empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (ctx->ctx_re && rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

static void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry	*re, *next;
	struct rib		*rib;
	unsigned int		 i;

	rib = rib_byid(ctx->ctx_id);
	if (rib == NULL)
		fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);

	if (ctx->ctx_re == NULL && ctx->ctx_subtree.aid == AID_UNSPEC)
		re = RB_MIN(rib_tree, rib_tree(rib));
	else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = next) {
		next = RB_NEXT(rib_tree, unused, re);
		if (re->rib_id != ctx->ctx_id)
			fatalx("%s: Unexpected RIB %u != %u.", __func__,
			    re->rib_id, ctx->ctx_id);
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_subtree.aid != AID_UNSPEC) {
			struct bgpd_addr addr;
			pt_getaddr(re->prefix, &addr);
			if (prefix_compare(&ctx->ctx_subtree, &addr,
			    ctx->ctx_subtreelen) != 0)
				/* left subtree, walk is done */
				break;
		}
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !re_is_locked(re)) {
			/* store and lock last element */
			ctx->ctx_re = re_lock(re);
			return;
		}
		ctx->ctx_rib_call(re, ctx->ctx_arg);
	}

	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
rib_dump_pending(void)
{
	struct rib_context *ctx;

	/* return true if at least one context is not throttled */
	LIST_FOREACH(ctx, &rib_dumps, entry) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		return 1;
	}
	return 0;
}

void
rib_dump_runner(void)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		if (ctx->ctx_rib_call != NULL)
			rib_dump_r(ctx);
		else
			prefix_dump_r(ctx);
	}
}

static void
rib_dump_abort(uint16_t id)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (id != ctx->ctx_id)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

void
rib_dump_terminate(void *arg)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_arg != arg)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

int
rib_dump_new(uint16_t id, uint8_t aid, unsigned int count, void *arg,
    void (*upcall)(struct rib_entry *, void *), void (*done)(void *, uint8_t),
    int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_rib_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		rib_dump_r(ctx);

	return 0;
}
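
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * registering an asynchronous dump over the first Loc-RIB.  With a
 * non-zero count the walk is chunked; rib_dump_runner() resumes it from
 * the locked entry until every entry has been visited.  The callbacks
 * are hypothetical.
 */
#if 0
static void
example_upcall(struct rib_entry *re, void *arg)
{
	/* inspect one entry per invocation */
}

static void
example_done(void *arg, uint8_t aid)
{
	log_debug("dump finished, aid %u", aid);
}

static void
example_start_dump(void)
{
	if (rib_dump_new(RIB_LOC_START, AID_UNSPEC, 100, NULL,
	    example_upcall, example_done, NULL) == -1)
		log_warnx("out of memory for rib dump");
}
#endif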

int
rib_dump_subtree(uint16_t id, struct bgpd_addr *subtree, uint8_t subtreelen,
    unsigned int count, void *arg, void (*upcall)(struct rib_entry *, void *),
    void (*done)(void *, uint8_t), int (*throttle)(void *))
{
	struct rib_context *ctx;
	struct rib_entry xre;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = id;
	ctx->ctx_aid = subtree->aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_rib_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;
	ctx->ctx_subtree = *subtree;
	ctx->ctx_subtreelen = subtreelen;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* lookup start of subtree */
	memset(&xre, 0, sizeof(xre));
	xre.prefix = pt_fill(subtree, subtreelen);
	ctx->ctx_re = RB_NFIND(rib_tree, rib_tree(rib_byid(id)), &xre);
	if (ctx->ctx_re)
		re_lock(ctx->ctx_re);

	/* requested a sync traversal */
	if (count == 0)
		rib_dump_r(ctx);

	return 0;
}
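
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * dumping only the entries below a covering prefix, reusing the
 * hypothetical callbacks from the sketch above.  The walk starts at the
 * first covered entry and ends as soon as it leaves the subtree.
 */
#if 0
static void
example_dump_more_specifics(struct bgpd_addr *covering, uint8_t plen)
{
	if (rib_dump_subtree(RIB_LOC_START, covering, plen, 100, NULL,
	    example_upcall, example_done, NULL) == -1)
		log_warnx("out of memory for subtree dump");
}
#endif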

/* path specific functions */

static struct rde_aspath *path_lookup(struct rde_aspath *);
static void path_link(struct rde_aspath *);
static void path_unlink(struct rde_aspath *);

static inline int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a == NULL && b == NULL)
		return (0);
	else if (b == NULL)
		return (1);
	else if (a == NULL)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	/* no need to check aspa_state or aspa_generation */

	r = aspath_compare(a->aspath, b->aspath);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

RB_HEAD(path_tree, rde_aspath)	pathtable = RB_INITIALIZER(&pathtable);
RB_GENERATE_STATIC(path_tree, rde_aspath, entry, path_compare);

static inline struct rde_aspath *
path_ref(struct rde_aspath *asp)
{
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt++;
	rdemem.path_refs++;

	return asp;
}

static inline void
path_unref(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt--;
	rdemem.path_refs--;
	if (asp->refcnt <= 0)
		path_unlink(asp);
}

void
path_shutdown(void)
{
	if (!RB_EMPTY(&pathtable))
		log_warnx("%s: path table not empty", __func__);
}

static struct rde_aspath *
path_lookup(struct rde_aspath *aspath)
{
	return (RB_FIND(path_tree, &pathtable, aspath));
}

/*
 * Link this aspath into the global table.
 * The asp must have been allocated with path_get.
 */
static void
path_link(struct rde_aspath *asp)
{
	if (RB_INSERT(path_tree, &pathtable, asp) != NULL)
		fatalx("%s: already linked object", __func__);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * This function can only be called after all prefixes have been removed.
 * Normally this happens directly out of the prefix removal functions.
 */
static void
path_unlink(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	/* make sure no reference is held for this rde_aspath */
	if (asp->refcnt != 0)
		fatalx("%s: still holds references", __func__);

	RB_REMOVE(path_tree, &pathtable, asp);
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}
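
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * the interning pattern used by prefix_update() and the flowspec code.
 * Attribute sets are shared: each prefix takes a reference through
 * prefix_link() and the last path_unref() releases the object via
 * path_unlink().
 */
#if 0
static struct rde_aspath *
example_intern(struct rde_aspath *nasp)
{
	struct rde_aspath *asp;

	if ((asp = path_lookup(nasp)) == NULL) {
		/* not in the table yet, copy and link a new one */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}
	return asp;
}
#endif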

/*
 * Copy asp to a new UNLINKED aspath.
 * dst must have been initialized with path_get() or path_prep() beforehand.
 */
struct rde_aspath *
path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
{
	dst->aspath = aspath_copy(src->aspath);
	dst->refcnt = 0;
	dst->flags = src->flags & ~F_ATTR_LINKED;

	dst->med = src->med;
	dst->lpref = src->lpref;
	dst->weight = src->weight;
	dst->rtlabelid = rtlabel_ref(src->rtlabelid);
	dst->pftableid = pftable_ref(src->pftableid);
	dst->origin = src->origin;

	attr_copy(dst, src);

	return (dst);
}

/* initialize or prepare an aspath for use */
struct rde_aspath *
path_prep(struct rde_aspath *asp)
{
	memset(asp, 0, sizeof(*asp));
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;

	return (asp);
}

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = malloc(sizeof(*asp));
	if (asp == NULL)
		fatal("path_get");
	rdemem.path_cnt++;

	return (path_prep(asp));
}

/* clean up an asp after use (frees all references to sub-objects) */
void
path_clean(struct rde_aspath *asp)
{
	if (asp->flags & F_ATTR_LINKED)
		fatalx("%s: linked object", __func__);

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	asp->aspath = NULL;
	attr_freeall(asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	path_clean(asp);

	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static int	prefix_add(struct bgpd_addr *, int, struct rib *,
		    struct rde_peer *, uint32_t, uint32_t, struct rde_aspath *,
		    struct rde_community *, struct nexthop *,
		    uint8_t, uint8_t, int);
static int	prefix_move(struct prefix *, struct rde_peer *,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t, int);

static void	prefix_link(struct prefix *, struct rib_entry *,
		    struct pt_entry *, struct rde_peer *, uint32_t, uint32_t,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t);
static void	prefix_unlink(struct prefix *);

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);

/* RB tree comparison function */
static inline int
prefix_index_cmp(struct prefix *a, struct prefix *b)
{
	int r;
	r = pt_prefix_cmp(a->pt, b->pt);
	if (r != 0)
		return r;

	if (a->path_id_tx > b->path_id_tx)
		return 1;
	if (a->path_id_tx < b->path_id_tx)
		return -1;
	return 0;
}

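/*
 * Comparison function for the update queue.  aspath, communities and
 * nexthop are interned objects, so comparing the pointers is enough to
 * group prefixes that share the exact same attribute set next to each
 * other in the tree.
 */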
static inline int
prefix_cmp(struct prefix *a, struct prefix *b)
{
	if ((a->flags & PREFIX_FLAG_EOR) != (b->flags & PREFIX_FLAG_EOR))
		return (a->flags & PREFIX_FLAG_EOR) ? 1 : -1;
	/* if EOR marker no need to check the rest */
	if (a->flags & PREFIX_FLAG_EOR)
		return 0;

	if (a->aspath != b->aspath)
		return (a->aspath > b->aspath ? 1 : -1);
	if (a->communities != b->communities)
		return (a->communities > b->communities ? 1 : -1);
	if (a->nexthop != b->nexthop)
		return (a->nexthop > b->nexthop ? 1 : -1);
	if (a->nhflags != b->nhflags)
		return (a->nhflags > b->nhflags ? 1 : -1);
	return prefix_index_cmp(a, b);
}

RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)

/*
 * Search for the specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry	*re;

	re = rib_get_addr(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, path_id));
}

/*
 * Search for the specified prefix in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_get(struct rde_peer *peer, uint32_t path_id_tx,
    struct pt_entry *pte)
{
	struct prefix xp;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pte;
	xp.path_id_tx = path_id_tx;

	return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
}

/*
 * Lookup a prefix without considering the path_id in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_first(struct rde_peer *peer, struct pt_entry *pte)
{
	struct prefix xp, *np;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pte;

	np = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
	if (np == NULL || pt_prefix_cmp(np->pt, xp.pt) != 0)
		return NULL;
	return np;
}

/*
 * Return the next prefix for the same pt_entry after a previous lookup.
 */
struct prefix *
prefix_adjout_next(struct rde_peer *peer, struct prefix *p)
{
	struct prefix *np;

	np = RB_NEXT(prefix_index, &peer->adj_rib_out, p);
	if (np == NULL || np->pt != p->pt)
		return NULL;
	return np;
}
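
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * walking every path_id_tx variant of one destination in the
 * Adj-RIB-Out using the first/next pair above.
 */
#if 0
static void
example_walk_adjout(struct rde_peer *peer, struct pt_entry *pte)
{
	struct prefix *p;

	for (p = prefix_adjout_first(peer, pte); p != NULL;
	    p = prefix_adjout_next(peer, p))
		log_debug("path_id_tx %u", p->path_id_tx);
}
#endif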

/*
 * Lookup addr/prefixlen in the peer prefix_index. Returns first match.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_lookup(struct rde_peer *peer, struct bgpd_addr *addr, int plen)
{
	return prefix_adjout_first(peer, pt_fill(addr, plen));
}

/*
 * Lookup addr in the peer prefix_index. Returns first match.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_match(struct rde_peer *peer, struct bgpd_addr *addr)
{
	struct prefix *p;
	int i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return NULL;
}

/*
 * Update a prefix.
 * Return 1 if the prefix was newly added, 0 if it was just changed.
 */
int
prefix_update(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    uint32_t path_id_tx, struct filterstate *state, int filtered,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rde_aspath	*asp, *nasp = &state->aspath;
	struct rde_community	*comm, *ncomm = &state->communities;
	struct prefix		*p;

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, path_id, prefix, prefixlen)) != NULL) {
		if (path_id_tx != p->path_id_tx)
			fatalx("path_id mismatch");
		if (prefix_nexthop(p) == state->nexthop &&
		    prefix_nhflags(p) == state->nhflags &&
		    communities_equal(ncomm, prefix_communities(p)) &&
		    path_compare(nasp, prefix_aspath(p)) == 0) {
			/* no change, update last change */
			p->lastchange = getmonotime();
			p->validation_state = state->vstate;
			if (filtered)
				p->flags |= PREFIX_FLAG_FILTERED;
			else
				p->flags &= ~PREFIX_FLAG_FILTERED;
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}

	if ((comm = communities_lookup(ncomm)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(ncomm);
	}

	/* If the prefix was found, move it; otherwise add it to the RIB. */
	if (p != NULL)
		return (prefix_move(p, peer, asp, comm, state->nexthop,
		    state->nhflags, state->vstate, filtered));
	else
		return (prefix_add(prefix, prefixlen, rib, peer, path_id,
		    path_id_tx, asp, comm, state->nexthop, state->nhflags,
		    state->vstate, filtered));
}

/*
 * Adds or updates a prefix.
 */
static int
prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate, int filtered)
{
	struct pt_entry	*pte;
	struct prefix		*p;
	struct rib_entry	*re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);
	re = rib_get(rib, pte);
	if (re == NULL)
		re = rib_add(rib, pte);

	p = prefix_alloc();
	prefix_link(p, re, re->prefix, peer, path_id, path_id_tx, asp, comm,
	    nexthop, nhflags, vstate);

	if (filtered)
		p->flags |= PREFIX_FLAG_FILTERED;

	/* add possible pftable reference from aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, p);
	/* make route decision */
	prefix_evaluate(re, p, NULL);
	return (1);
}

/*
 * Move the prefix to the specified aspath, removing the old one if needed.
 */
static int
prefix_move(struct prefix *p, struct rde_peer *peer,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate, int filtered)
{
	struct prefix		*np;

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);

	if (peer != prefix_peer(p))
		fatalx("prefix_move: cross peer move");

	/* add new prefix node */
	np = prefix_alloc();
	prefix_link(np, prefix_re(p), p->pt, peer, p->path_id, p->path_id_tx,
	    asp, comm, nexthop, nhflags, vstate);

	if (filtered)
		np->flags |= PREFIX_FLAG_FILTERED;

	/* add possible pftable reference from new aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, np);

	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */
	prefix_evaluate(prefix_re(np), np, p);

	/* remove possible pftable reference from old path */
	if (p->aspath && p->aspath->pftableid)
		rde_pftable_del(p->aspath->pftableid, p);

	/* remove old prefix node */
	prefix_unlink(p);
	prefix_free(p);

	return (0);
}

/*
 * Removes a prefix from the specified RIB. If the parent objects -- rib_entry
 * or pt_entry -- become empty remove them too.
 */
int
prefix_withdraw(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct prefix		*p;
	struct rde_aspath	*asp;

	p = prefix_get(rib, peer, path_id, prefix, prefixlen);
	if (p == NULL)		/* got a dummy withdraw request */
		return (0);

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);
	asp = prefix_aspath(p);
	if (asp && asp->pftableid)
		/* only prefixes in the local RIB were pushed into pf */
		rde_pftable_del(asp->pftableid, p);

	prefix_destroy(p);

	return (1);
}

/*
 * Special functions for flowspec until full integration is available.
 * This just directly feeds the prefixes into the Adj-RIB-Out bypassing
 * Adj-RIB-In and Loc-RIB for now.
 */
int
prefix_flowspec_update(struct rde_peer *peer, struct filterstate *state,
    struct pt_entry *pte, uint32_t path_id_tx)
{
	struct rde_aspath *asp, *nasp;
	struct rde_community *comm, *ncomm;
	struct rib_entry *re;
	struct prefix *new, *old;

	re = rib_get(&flowrib, pte);
	if (re == NULL)
		re = rib_add(&flowrib, pte);

	old = prefix_bypeer(re, peer, 0);
	new = prefix_alloc();

	nasp = &state->aspath;
	ncomm = &state->communities;
	if ((asp = path_lookup(nasp)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}
	if ((comm = communities_lookup(ncomm)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(ncomm);
	}

	prefix_link(new, re, re->prefix, peer, 0, path_id_tx, asp, comm,
	    NULL, 0, 0);
	TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);

	rde_generate_updates(re, new, old, EVAL_DEFAULT);

	if (old != NULL) {
		TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);
		prefix_unlink(old);
		prefix_free(old);
		return 0;
	}
	return 1;
}

/*
 * Remove a possible flowspec prefix from all Adj-RIB-Outs.
 */
int
prefix_flowspec_withdraw(struct rde_peer *peer, struct pt_entry *pte)
{
	struct rib_entry *re;
	struct prefix *p;

	re = rib_get(&flowrib, pte);
	if (re == NULL)
		return 0;
	p = prefix_bypeer(re, peer, 0);
	if (p == NULL)
		return 0;
	rde_generate_updates(re, NULL, p, EVAL_DEFAULT);
	TAILQ_REMOVE(&re->prefix_h, p, entry.list.rib);
	prefix_unlink(p);
	prefix_free(p);
	return 1;
}

/*
 * Push all flowspec rules into a newly available Adj-RIB-Out.
 */
void
prefix_flowspec_dump(uint8_t aid, void *arg,
    void (*call)(struct rib_entry *, void *), void (*done)(void *, uint8_t))
{
	struct rib_entry *re, *next;

	RB_FOREACH_SAFE(re, rib_tree, rib_tree(&flowrib), next) {
		if (aid != AID_UNSPEC && aid != re->prefix->aid)
			continue;
		call(re, arg);
	}
	if (done != NULL)
		done(arg, aid);
}

/*
 * Insert an End-of-RIB marker into the update queue.
 */
void
prefix_add_eor(struct rde_peer *peer, uint8_t aid)
{
	struct prefix *p;

	p = prefix_alloc();
	p->flags = PREFIX_FLAG_ADJOUT | PREFIX_FLAG_UPDATE | PREFIX_FLAG_EOR;
	if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
		/* no need to add if EoR marker already present */
		prefix_free(p);
	/* EOR marker is not inserted into the adj_rib_out index */
}

/*
 * Put a prefix from the Adj-RIB-Out onto the update queue.
 */
void
prefix_adjout_update(struct prefix *p, struct rde_peer *peer,
    struct filterstate *state, struct pt_entry *pte, uint32_t path_id_tx)
{
	struct rde_aspath *asp;
	struct rde_community *comm;

	if (p == NULL) {
		p = prefix_alloc();
		/* initially mark DEAD so code below is skipped */
		p->flags |= PREFIX_FLAG_ADJOUT | PREFIX_FLAG_DEAD;

		p->pt = pt_ref(pte);
		p->peer = peer;
		p->path_id_tx = path_id_tx;

		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		/*
		 * XXX for now treat a different path_id_tx like different
		 * attributes and force out an update. It is unclear how
		 * common it is to have equivalent updates from alternative
		 * paths.
		 */
		if (p->path_id_tx == path_id_tx &&
		    prefix_nhflags(p) == state->nhflags &&
		    prefix_nexthop(p) == state->nexthop &&
		    communities_equal(&state->communities,
		    prefix_communities(p)) &&
		    path_compare(&state->aspath, prefix_aspath(p)) == 0) {
			/* nothing changed */
			p->validation_state = state->vstate;
			p->lastchange = getmonotime();
			p->flags &= ~PREFIX_FLAG_STALE;
			return;
		}

		/* if pending update unhook it before it is unlinked */
		if (p->flags & PREFIX_FLAG_UPDATE) {
			RB_REMOVE(prefix_tree, &peer->updates[pte->aid], p);
			peer->stats.pending_update--;
		}

		/* unlink prefix so it can be relinked below */
		prefix_unlink(p);
		peer->stats.prefix_out_cnt--;
	}
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[pte->aid], p);
		peer->stats.pending_withdraw--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	/* update path_id_tx now that the prefix is unlinked */
	if (p->path_id_tx != path_id_tx) {
		/* path_id_tx is part of the index so remove and re-insert p */
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		p->path_id_tx = path_id_tx;
		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((asp = path_lookup(&state->aspath)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), &state->aspath);
		path_link(asp);
	}

	if ((comm = communities_lookup(&state->communities)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(&state->communities);
	}

	prefix_link(p, NULL, p->pt, peer, 0, p->path_id_tx, asp, comm,
	    state->nexthop, state->nhflags, state->vstate);
	peer->stats.prefix_out_cnt++;

	if (p->flags & PREFIX_FLAG_MASK)
		fatalx("%s: bad flags %x", __func__, p->flags);
	p->flags |= PREFIX_FLAG_UPDATE;
	if (RB_INSERT(prefix_tree, &peer->updates[pte->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	peer->stats.pending_update++;
}

/*
 * Withdraw a prefix from the Adj-RIB-Out; this unlinks the aspath but leaves
 * the prefix in the RIB linked to the peer withdraw list.
 */
void
prefix_adjout_withdraw(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	/* already a withdraw, shortcut */
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		p->lastchange = getmonotime();
		p->flags &= ~PREFIX_FLAG_STALE;
		return;
	}
	/* pending update just got withdrawn */
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->stats.pending_update--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->stats.prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;
	p->lastchange = getmonotime();

	p->flags |= PREFIX_FLAG_WITHDRAW;
	if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	peer->stats.pending_withdraw++;
}

void
prefix_adjout_destroy(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	if (p->flags & PREFIX_FLAG_EOR) {
		/* EOR marker is not linked in the index */
		prefix_free(p);
		return;
	}

	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
		peer->stats.pending_withdraw--;
	}
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->stats.pending_update--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->stats.prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	if (prefix_is_locked(p)) {
		/* mark prefix dead but leave it for prefix_restart */
		p->flags |= PREFIX_FLAG_DEAD;
	} else {
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		/* remove the last prefix reference before free */
		pt_unref(p->pt);
		prefix_free(p);
	}
}

static struct prefix *
prefix_restart(struct rib_context *ctx)
{
	struct prefix *p = NULL;

	if (ctx->ctx_p)
		p = prefix_unlock(ctx->ctx_p);

	if (p && prefix_is_dead(p)) {
		struct prefix *next;

		next = RB_NEXT(prefix_index, unused, p);
		prefix_adjout_destroy(p);
		p = next;
	}
	ctx->ctx_p = NULL;
	return p;
}

static void
prefix_dump_r(struct rib_context *ctx)
{
	struct prefix *p, *next;
	struct rde_peer *peer;
	unsigned int i;

	if ((peer = peer_get(ctx->ctx_id)) == NULL)
		goto done;

	if (ctx->ctx_p == NULL && ctx->ctx_subtree.aid == AID_UNSPEC)
		p = RB_MIN(prefix_index, &peer->adj_rib_out);
	else
		p = prefix_restart(ctx);

	for (i = 0; p != NULL; p = next) {
		next = RB_NEXT(prefix_index, unused, p);
		if (prefix_is_dead(p))
			continue;
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != p->pt->aid)
			continue;
		if (ctx->ctx_subtree.aid != AID_UNSPEC) {
			struct bgpd_addr addr;
			pt_getaddr(p->pt, &addr);
			if (prefix_compare(&ctx->ctx_subtree, &addr,
			    ctx->ctx_subtreelen) != 0)
				/* left subtree, walk is done */
				break;
		}
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !prefix_is_locked(p)) {
			/* store and lock last element */
			ctx->ctx_p = prefix_lock(p);
			return;
		}
		ctx->ctx_prefix_call(p, ctx->ctx_arg);
	}

done:
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
prefix_dump_new(struct rde_peer *peer, uint8_t aid, unsigned int count,
    void *arg, void (*upcall)(struct prefix *, void *),
    void (*done)(void *, uint8_t), int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = peer->conf.id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_prefix_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		prefix_dump_r(ctx);

	return 0;
}

int
prefix_dump_subtree(struct rde_peer *peer, struct bgpd_addr *subtree,
    uint8_t subtreelen, unsigned int count, void *arg,
    void (*upcall)(struct prefix *, void *), void (*done)(void *, uint8_t),
    int (*throttle)(void *))
{
	struct rib_context *ctx;
	struct prefix xp;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = peer->conf.id;
	ctx->ctx_aid = subtree->aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_prefix_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;
	ctx->ctx_subtree = *subtree;
	ctx->ctx_subtreelen = subtreelen;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* lookup start of subtree */
	memset(&xp, 0, sizeof(xp));
	xp.pt = pt_fill(subtree, subtreelen);
	ctx->ctx_p = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
	if (ctx->ctx_p)
		prefix_lock(ctx->ctx_p);

	/* requested a sync traversal */
	if (count == 0)
		prefix_dump_r(ctx);

	return 0;
}

/*
 * Searches the prefix list of the specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, uint32_t path_id)
{
	struct prefix	*p;

	TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
		if (prefix_peer(p) == peer && p->path_id == path_id)
			return (p);
	return (NULL);
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	/* make route decision */
	prefix_evaluate(prefix_re(p), NULL, p);

	prefix_unlink(p);
	prefix_free(p);
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *p, struct rib_entry *re, struct pt_entry *pt,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	if (re)
		p->entry.list.re = re;
	p->aspath = path_ref(asp);
	p->communities = communities_ref(comm);
	p->peer = peer;
	p->pt = pt_ref(pt);
	p->path_id = path_id;
	p->path_id_tx = path_id_tx;
	p->validation_state = vstate;
	p->nhflags = nhflags;
	p->nexthop = nexthop_ref(nexthop);
	nexthop_link(p);
	p->lastchange = getmonotime();
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *p)
{
	struct rib_entry	*re = prefix_re(p);

	/* destroy all references to other objects */
	/* remove nexthop ref ... */
	nexthop_unlink(p);
	nexthop_unref(p->nexthop);
	p->nexthop = NULL;
	p->nhflags = 0;
	/* ... communities ... */
	communities_unref(p->communities);
	p->communities = NULL;
	/* and unlink from aspath */
	path_unref(p->aspath);
	p->aspath = NULL;

	if (re && rib_empty(re))
		rib_remove(re);

	pt_unref(p->pt);
}

/* alloc and zero a new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return p;
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *p)
{
	rdemem.prefix_cnt--;
	free(p);
}

/*
 * nexthop functions
 */
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop which was announced via
 * BGP and the true nexthop which is used in the FIB -- forwarding information
 * base, a.k.a. the kernel routing table. When sending updates it is even more
 * confusing. In IBGP we pass the unmodified exit nexthop to the neighbors
 * while in EBGP normally the address of the router is sent. The exit nexthop
 * may be passed to the external neighbor if the neighbor and the exit nexthop
 * reside in the same subnet -- directly connected.
 */
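
/*
 * Illustrative sketch (not part of the original file, not compiled):
 * a consumer resolves the announced exit nexthop once via nexthop_get(),
 * which kicks off a kernel lookup on first use, and drops its reference
 * when done.  nexthop_update() later moves the state from NEXTHOP_LOOKUP
 * to NEXTHOP_REACH or NEXTHOP_UNREACH.
 */
#if 0
static void
example_track_nexthop(struct bgpd_addr *addr)
{
	struct nexthop *nh;

	nh = nexthop_get(addr);		/* takes a reference */
	if (nh->state == NEXTHOP_LOOKUP)
		log_debug("kernel lookup for %s pending",
		    log_addr(&nh->exit_nexthop));
	nexthop_unref(nh);		/* drop the reference again */
}
#endif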

TAILQ_HEAD(nexthop_queue, nexthop)	nexthop_runners =
				    TAILQ_HEAD_INITIALIZER(nexthop_runners);

RB_HEAD(nexthop_tree, nexthop)		nexthoptable =
					    RB_INITIALIZER(&nexthoptable);

static inline int nexthop_cmp(struct nexthop *, struct nexthop *);

RB_GENERATE_STATIC(nexthop_tree, nexthop, entry, nexthop_cmp);

void
nexthop_shutdown(void)
{
	struct nexthop		*nh, *nnh;

	RB_FOREACH_SAFE(nh, nexthop_tree, &nexthoptable, nnh) {
		nh->state = NEXTHOP_UNREACH;
		nexthop_unref(nh);
	}

	RB_FOREACH(nh, nexthop_tree, &nexthoptable) {
		log_warnx("%s: nexthop %s refcnt %d", __func__,
		    log_addr(&nh->exit_nexthop), nh->refcnt);
	}
}

int
nexthop_pending(void)
{
	return !TAILQ_EMPTY(&nexthop_runners);
}

void
nexthop_runner(void)
{
	struct nexthop *nh;
	struct prefix *p;
	uint32_t j;

	nh = TAILQ_FIRST(&nexthop_runners);
	if (nh == NULL)
		return;

	/* remove from runner queue */
	TAILQ_REMOVE(&nexthop_runners, nh, runner_l);

	p = nh->next_prefix;
	for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
		prefix_evaluate_nexthop(p, nh->state, nh->oldstate);
		p = LIST_NEXT(p, entry.list.nexthop);
	}

	/* prep for the next run; if not finished re-add to tail of queue */
	nh->next_prefix = p;
	if (p != NULL)
		TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
	else
		log_debug("nexthop %s update finished",
		    log_addr(&nh->exit_nexthop));
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	nh->oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (nh->oldstate == NEXTHOP_LOOKUP)
		/* drop the reference which was held during the lookup */
		if (nexthop_unref(nh))
			return;		/* nh lost last ref, no work left */

	if (nh->next_prefix) {
		/*
		 * If nexthop_runner() is not finished with this nexthop
		 * then ensure that all prefixes are updated by setting
		 * the oldstate to NEXTHOP_FLAPPED.
		 */
		nh->oldstate = NEXTHOP_FLAPPED;
		TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
	}

	if (msg->connected)
		nh->flags |= NEXTHOP_CONNECTED;

	nh->true_nexthop = msg->gateway;
	nh->nexthop_net = msg->net;
	nh->nexthop_netlen = msg->netlen;

	nh->next_prefix = LIST_FIRST(&nh->prefix_h);
	if (nh->next_prefix != NULL) {
		TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
		log_debug("nexthop %s update starting",
		    log_addr(&nh->exit_nexthop));
	}
}

void
nexthop_modify(struct nexthop *setnh, enum action_types type, uint8_t aid,
    struct nexthop **nexthop, uint8_t *flags)
{
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		*flags = NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		*flags = NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		*flags = NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		*flags = NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP_REF:
		/*
		 * it is possible that a prefix matches but has the wrong
		 * address family for the set nexthop. In this case ignore it.
		 */
		if (aid != setnh->exit_nexthop.aid)
			break;
		nexthop_unref(*nexthop);
		*nexthop = nexthop_ref(setnh);
		*flags = 0;
		break;
	default:
		break;
	}
}

void
nexthop_link(struct prefix *p)
{
	p->nhflags &= ~NEXTHOP_VALID;

	if (p->flags & PREFIX_FLAG_ADJOUT) {
		/* All nexthops are valid in Adj-RIB-Out */
		p->nhflags |= NEXTHOP_VALID;
		return;
	}

	/* no need to link prefixes in RIBs that have no decision process */
	if (re_rib(prefix_re(p))->flags & F_RIB_NOEVALUATE)
		return;

	/* self-announce networks use nexthop NULL and are valid as well */
	if (p->nexthop == NULL || p->nexthop->state == NEXTHOP_REACH)
		p->nhflags |= NEXTHOP_VALID;

	if (p->nexthop == NULL)
		return;
	p->flags |= PREFIX_NEXTHOP_LINKED;
	LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
}

void
nexthop_unlink(struct prefix *p)
{
	p->nhflags &= ~NEXTHOP_VALID;

	if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
		return;

	if (p == p->nexthop->next_prefix) {
		p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
		/* remove nexthop from list if no prefixes left to update */
		if (p->nexthop->next_prefix == NULL) {
			TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l);
			log_debug("nexthop %s update finished",
			    log_addr(&p->nexthop->exit_nexthop));
		}
	}

	p->flags &= ~PREFIX_NEXTHOP_LINKED;
	LIST_REMOVE(p, entry.list.nexthop);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_get");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->prefix_h);
		nh->state = NEXTHOP_LOOKUP;
		nexthop_ref(nh);	/* take reference for lookup */
		nh->exit_nexthop = *nexthop;
		if (RB_INSERT(nexthop_tree, &nexthoptable, nh) != NULL)
			fatalx("nexthop tree inconsistency");

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return nexthop_ref(nh);
}

struct nexthop *
nexthop_ref(struct nexthop *nexthop)
{
	if (nexthop)
		nexthop->refcnt++;
	return (nexthop);
}

int
nexthop_unref(struct nexthop *nh)
{
	if (nh == NULL)
		return (0);
	if (--nh->refcnt > 0)
		return (0);

	/* sanity check */
	if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
		fatalx("%s: refcnt error", __func__);

	/* is nexthop update running? impossible, that is a refcnt error */
	if (nh->next_prefix)
		fatalx("%s: next_prefix not NULL", __func__);

	RB_REMOVE(nexthop_tree, &nexthoptable, nh);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

static inline int
nexthop_cmp(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr	*a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: %s is unsupported", aid2str(a->aid));
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop	needle = { 0 };

	needle.exit_nexthop = *nexthop;
	return RB_FIND(nexthop_tree, &nexthoptable, &needle);
}
1939