1 /* $OpenBSD: rde_rib.c,v 1.220 2021/01/18 12:15:36 claudio Exp $ */
2
3 /*
4 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 #include <sys/types.h>
20 #include <sys/queue.h>
21
22 #include <limits.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <siphash.h>
26 #include <time.h>
27
28 #include "bgpd.h"
29 #include "rde.h"
30 #include "log.h"
31
32 /*
33 * BGP RIB -- Routing Information Base
34 *
35 * The RIB is built with one aspect in mind: speed -- actually, update speed.
36 * Therefore one thing needs to be absolutely avoided: long table walks.
37 * This is achieved by heavily linking the different parts together.
38 */
39 u_int16_t rib_size;
40 struct rib **ribs;
41
42 struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
43 static inline int rib_compare(const struct rib_entry *,
44 const struct rib_entry *);
45 void rib_remove(struct rib_entry *);
46 int rib_empty(struct rib_entry *);
47 static void rib_dump_abort(u_int16_t);
48
49 RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
50 RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);
51
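/*
 * A rib_context keeps the state of an in-progress RIB or Adj-RIB-Out walk.
 * Dumps are chunked: after ctx_count entries the walk stops, the current
 * element is locked and stored in ctx_re/ctx_p, and the context stays on
 * the global rib_dumps list until rib_dump_runner() resumes it.
 */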
52 struct rib_context {
53 LIST_ENTRY(rib_context) entry;
54 struct rib_entry *ctx_re;
55 struct prefix *ctx_p;
56 u_int32_t ctx_id;
57 void (*ctx_rib_call)(struct rib_entry *, void *);
58 void (*ctx_prefix_call)(struct prefix *, void *);
59 void (*ctx_done)(void *, u_int8_t);
60 int (*ctx_throttle)(void *);
61 void *ctx_arg;
62 unsigned int ctx_count;
63 u_int8_t ctx_aid;
64 };
65 LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);
66
67 static void prefix_dump_r(struct rib_context *);
68
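/*
 * A locked rib_entry is the restart point of a suspended dump.  rib_remove()
 * refuses to free locked entries; when the dump resumes or is aborted the
 * entry is unlocked and, if it became empty in the meantime, removed then.
 */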
69 static inline struct rib_entry *
70 re_lock(struct rib_entry *re)
71 {
72 if (re->lock != 0)
73 log_warnx("%s: entry already locked", __func__);
74 re->lock = 1;
75 return re;
76 }
77
78 static inline struct rib_entry *
79 re_unlock(struct rib_entry *re)
80 {
81 if (re->lock == 0)
82 log_warnx("%s: entry already unlocked", __func__);
83 re->lock = 0;
84 return re;
85 }
86
87 static inline int
88 re_is_locked(struct rib_entry *re)
89 {
90 return (re->lock != 0);
91 }
92
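/*
 * The same scheme is used for Adj-RIB-Out walks: a locked prefix is a dump's
 * restart point.  prefix_adjout_destroy() only marks a locked prefix as dead;
 * it is freed once the dump unlocks it in prefix_restart() or on abort.
 */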
93 static inline struct prefix *
94 prefix_lock(struct prefix *p)
95 {
96 if (p->flags & PREFIX_FLAG_LOCKED)
97 fatalx("%s: locking locked prefix", __func__);
98 p->flags |= PREFIX_FLAG_LOCKED;
99 return p;
100 }
101
102 static inline struct prefix *
103 prefix_unlock(struct prefix *p)
104 {
105 if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
106 fatalx("%s: unlocking unlocked prefix", __func__);
107 p->flags &= ~PREFIX_FLAG_LOCKED;
108 return p;
109 }
110
111 static inline int
112 prefix_is_locked(struct prefix *p)
113 {
114 return (p->flags & PREFIX_FLAG_LOCKED) != 0;
115 }
116
117 static inline int
118 prefix_is_dead(struct prefix *p)
119 {
120 return (p->flags & PREFIX_FLAG_DEAD) != 0;
121 }
122
123 static inline struct rib_tree *
124 rib_tree(struct rib *rib)
125 {
126 return (&rib->tree);
127 }
128
129 static inline int
130 rib_compare(const struct rib_entry *a, const struct rib_entry *b)
131 {
132 return (pt_prefix_cmp(a->prefix, b->prefix));
133 }
134
135 /* RIB specific functions */
136 struct rib *
137 rib_new(char *name, u_int rtableid, u_int16_t flags)
138 {
139 struct rib *new;
140 u_int16_t id;
141
142 for (id = 0; id < rib_size; id++) {
143 if (ribs[id] == NULL)
144 break;
145 }
146
147 if (id >= rib_size) {
148 if ((ribs = recallocarray(ribs, id, id + 8,
149 sizeof(struct rib *))) == NULL)
150 fatal(NULL);
151 rib_size = id + 8;
152 }
153
154 if ((new = calloc(1, sizeof(*new))) == NULL)
155 fatal(NULL);
156
157 strlcpy(new->name, name, sizeof(new->name));
158 RB_INIT(rib_tree(new));
159 new->state = RECONF_REINIT;
160 new->id = id;
161 new->flags = flags;
162 new->rtableid = rtableid;
163
164 new->in_rules = calloc(1, sizeof(struct filter_head));
165 if (new->in_rules == NULL)
166 fatal(NULL);
167 TAILQ_INIT(new->in_rules);
168
169 ribs[id] = new;
170
171 log_debug("%s: %s -> %u", __func__, name, id);
172 return (new);
173 }
174
175 /*
176 * This function is only called when the FIB information of a RIB changed.
177 * It will flush the FIB if there was one previously and change the fibstate
178 * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
179 * or RECONF_REINIT (rerun the route decision process for every element)
180 * depending on the new flags.
181 */
182 void
183 rib_update(struct rib *rib)
184 {
185 /* flush fib first if there was one */
186 if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
187 rde_send_kroute_flush(rib);
188
189 /* if the noevaluate flag changes then a full reinit is needed */
190 if ((rib->flags & F_RIB_NOEVALUATE) !=
191 (rib->flags_tmp & F_RIB_NOEVALUATE))
192 rib->fibstate = RECONF_REINIT;
193
194 rib->flags = rib->flags_tmp;
195 rib->rtableid = rib->rtableid_tmp;
196
197 /* reload fib if there is no reinit pending and there will be a fib */
198 if (rib->fibstate != RECONF_REINIT &&
199 (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
200 rib->fibstate = RECONF_RELOAD;
201 }
202
203 struct rib *
204 rib_byid(u_int16_t id)
205 {
206 if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
207 return NULL;
208 return ribs[id];
209 }
210
211 u_int16_t
212 rib_find(char *name)
213 {
214 u_int16_t id;
215
216 /* no name returns the first Loc-RIB */
217 if (name == NULL || *name == '\0')
218 return RIB_LOC_START;
219
220 for (id = 0; id < rib_size; id++) {
221 if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
222 return id;
223 }
224
225 return RIB_NOTFOUND;
226 }
227
228 void
229 rib_free(struct rib *rib)
230 {
231 struct rib_entry *re, *xre;
232 struct prefix *p;
233
234 rib_dump_abort(rib->id);
235
236 /*
237 * flush the rib, disable route evaluation and fib sync to speed up
238 * the prefix removal. Nothing depends on this data anymore.
239 */
240 if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
241 rde_send_kroute_flush(rib);
242 rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;
243
244 for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
245 xre = RB_NEXT(rib_tree, rib_tree(rib), re);
246
247 /*
248 * Removing the prefixes is tricky because the last one
249 * removes the rib_entry as well, and because an empty
250 * check is done in prefix_destroy() it is not possible to
251 * use the usual for loop here.
252 */
253 while ((p = LIST_FIRST(&re->prefix_h))) {
254 struct rde_aspath *asp = prefix_aspath(p);
255 if (asp && asp->pftableid)
256 rde_pftable_del(asp->pftableid, p);
257 prefix_destroy(p);
258 }
259 }
260 if (rib->id <= RIB_LOC_START)
261 return; /* never remove the default ribs */
262 filterlist_free(rib->in_rules_tmp);
263 filterlist_free(rib->in_rules);
264 ribs[rib->id] = NULL;
265 free(rib);
266 }
267
268 void
269 rib_shutdown(void)
270 {
271 struct rib *rib;
272 u_int16_t id;
273
274 for (id = 0; id < rib_size; id++) {
275 rib = rib_byid(id);
276 if (rib == NULL)
277 continue;
278 if (!RB_EMPTY(rib_tree(ribs[id]))) {
279 log_warnx("%s: rib %s is not empty", __func__,
280 ribs[id]->name);
281 }
282 rib_free(ribs[id]);
283 }
284 for (id = 0; id <= RIB_LOC_START; id++) {
285 rib = rib_byid(id);
286 if (rib == NULL)
287 continue;
288 filterlist_free(rib->in_rules_tmp);
289 filterlist_free(rib->in_rules);
290 ribs[id] = NULL;
291 free(rib);
292 }
293 free(ribs);
294 }
295
296 struct rib_entry *
297 rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
298 {
299 struct rib_entry xre, *re;
300 struct pt_entry *pte;
301
302 pte = pt_fill(prefix, prefixlen);
303 memset(&xre, 0, sizeof(xre));
304 xre.prefix = pte;
305
306 re = RB_FIND(rib_tree, rib_tree(rib), &xre);
307 if (re && re->rib_id != rib->id)
308 fatalx("%s: Unexpected RIB %u != %u.", __func__,
309 re->rib_id, rib->id);
310 return re;
311 }
312
313 struct rib_entry *
314 rib_match(struct rib *rib, struct bgpd_addr *addr)
315 {
316 struct rib_entry *re;
317 int i;
318
319 switch (addr->aid) {
320 case AID_INET:
321 case AID_VPN_IPv4:
322 for (i = 32; i >= 0; i--) {
323 re = rib_get(rib, addr, i);
324 if (re != NULL)
325 return (re);
326 }
327 break;
328 case AID_INET6:
329 case AID_VPN_IPv6:
330 for (i = 128; i >= 0; i--) {
331 re = rib_get(rib, addr, i);
332 if (re != NULL)
333 return (re);
334 }
335 break;
336 default:
337 fatalx("%s: unknown af", __func__);
338 }
339 return (NULL);
340 }
341
342
343 struct rib_entry *
344 rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
345 {
346 struct pt_entry *pte;
347 struct rib_entry *re;
348
349 pte = pt_get(prefix, prefixlen);
350 if (pte == NULL)
351 pte = pt_add(prefix, prefixlen);
352
353 if ((re = calloc(1, sizeof(*re))) == NULL)
354 fatal("rib_add");
355
356 LIST_INIT(&re->prefix_h);
357 re->prefix = pt_ref(pte);
358 re->rib_id = rib->id;
359
360 if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
361 log_warnx("rib_add: insert failed");
362 free(re);
363 return (NULL);
364 }
365
366
367 rdemem.rib_cnt++;
368
369 return (re);
370 }
371
372 void
373 rib_remove(struct rib_entry *re)
374 {
375 if (!rib_empty(re))
376 fatalx("rib_remove: entry not empty");
377
378 if (re_is_locked(re))
379 /* entry is locked, don't free it. */
380 return;
381
382 pt_unref(re->prefix);
383
384 if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
385 log_warnx("rib_remove: remove failed.");
386
387 free(re);
388 rdemem.rib_cnt--;
389 }
390
391 int
392 rib_empty(struct rib_entry *re)
393 {
394 return LIST_EMPTY(&re->prefix_h);
395 }
396
397 static struct rib_entry *
398 rib_restart(struct rib_context *ctx)
399 {
400 struct rib_entry *re;
401
402 re = re_unlock(ctx->ctx_re);
403
404 /* find the first non-empty element */
405 while (re && rib_empty(re))
406 re = RB_NEXT(rib_tree, unused, re);
407
408 /* free the previously locked rib element if empty */
409 if (rib_empty(ctx->ctx_re))
410 rib_remove(ctx->ctx_re);
411 ctx->ctx_re = NULL;
412 return (re);
413 }
414
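/*
 * Process one chunk of a RIB dump: walk the tree, call ctx_rib_call() for
 * every matching entry and suspend after ctx_count entries by locking the
 * current one.  A ctx_count of 0 means the whole RIB is walked in one go.
 */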
415 static void
416 rib_dump_r(struct rib_context *ctx)
417 {
418 struct rib_entry *re, *next;
419 struct rib *rib;
420 unsigned int i;
421
422 rib = rib_byid(ctx->ctx_id);
423 if (rib == NULL)
424 fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);
425
426 if (ctx->ctx_re == NULL)
427 re = RB_MIN(rib_tree, rib_tree(rib));
428 else
429 re = rib_restart(ctx);
430
431 for (i = 0; re != NULL; re = next) {
432 next = RB_NEXT(rib_tree, unused, re);
433 if (re->rib_id != ctx->ctx_id)
434 fatalx("%s: Unexpected RIB %u != %u.", __func__,
435 re->rib_id, ctx->ctx_id);
436 if (ctx->ctx_aid != AID_UNSPEC &&
437 ctx->ctx_aid != re->prefix->aid)
438 continue;
439 if (ctx->ctx_count && i++ >= ctx->ctx_count &&
440 !re_is_locked(re)) {
441 /* store and lock last element */
442 ctx->ctx_re = re_lock(re);
443 return;
444 }
445 ctx->ctx_rib_call(re, ctx->ctx_arg);
446 }
447
448 if (ctx->ctx_done)
449 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
450 LIST_REMOVE(ctx, entry);
451 free(ctx);
452 }
453
454 int
455 rib_dump_pending(void)
456 {
457 struct rib_context *ctx;
458
459 /* return true if at least one context is not throttled */
460 LIST_FOREACH(ctx, &rib_dumps, entry) {
461 if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
462 continue;
463 return 1;
464 }
465 return 0;
466 }
467
468 void
469 rib_dump_runner(void)
470 {
471 struct rib_context *ctx, *next;
472
473 LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
474 if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
475 continue;
476 if (ctx->ctx_rib_call != NULL)
477 rib_dump_r(ctx);
478 else
479 prefix_dump_r(ctx);
480 }
481 }
482
483 static void
484 rib_dump_abort(u_int16_t id)
485 {
486 struct rib_context *ctx, *next;
487
488 LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
489 if (id != ctx->ctx_id)
490 continue;
491 if (ctx->ctx_done)
492 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
493 if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
494 rib_remove(ctx->ctx_re);
495 if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
496 prefix_adjout_destroy(ctx->ctx_p);
497 LIST_REMOVE(ctx, entry);
498 free(ctx);
499 }
500 }
501
502 void
503 rib_dump_terminate(void *arg)
504 {
505 struct rib_context *ctx, *next;
506
507 LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
508 if (ctx->ctx_arg != arg)
509 continue;
510 if (ctx->ctx_done)
511 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
512 if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
513 rib_remove(ctx->ctx_re);
514 if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
515 prefix_adjout_destroy(ctx->ctx_p);
516 LIST_REMOVE(ctx, entry);
517 free(ctx);
518 }
519 }
520
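/*
 * Start a new RIB dump.  With a non-zero count the walk is done in chunks
 * driven by rib_dump_runner() and may be paused via the throttle callback;
 * with count == 0 the whole dump runs synchronously before returning.
 */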
521 int
522 rib_dump_new(u_int16_t id, u_int8_t aid, unsigned int count, void *arg,
523 void (*upcall)(struct rib_entry *, void *), void (*done)(void *, u_int8_t),
524 int (*throttle)(void *))
525 {
526 struct rib_context *ctx;
527
528 if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
529 return -1;
530 ctx->ctx_id = id;
531 ctx->ctx_aid = aid;
532 ctx->ctx_count = count;
533 ctx->ctx_arg = arg;
534 ctx->ctx_rib_call = upcall;
535 ctx->ctx_done = done;
536 ctx->ctx_throttle = throttle;
537
538 LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
539
540 /* requested a sync traversal */
541 if (count == 0)
542 rib_dump_r(ctx);
543
544 return 0;
545 }
546
547 /* path specific functions */
548
549 static struct rde_aspath *path_lookup(struct rde_aspath *);
550 static u_int64_t path_hash(struct rde_aspath *);
551 static void path_link(struct rde_aspath *);
552 static void path_unlink(struct rde_aspath *);
553
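/*
 * All rde_aspath objects are kept in a global hash table so that identical
 * path attribute sets are stored only once and shared via reference counts.
 * The table is keyed with SipHash over the attributes, see path_hash().
 */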
554 struct path_table {
555 struct aspath_head *path_hashtbl;
556 u_int64_t path_hashmask;
557 } pathtable;
558
559 SIPHASH_KEY pathtablekey;
560
561 #define PATH_HASH(x) &pathtable.path_hashtbl[x & pathtable.path_hashmask]
562
563 static inline struct rde_aspath *
564 path_ref(struct rde_aspath *asp)
565 {
566 if ((asp->flags & F_ATTR_LINKED) == 0)
567 fatalx("%s: unlinked object", __func__);
568 asp->refcnt++;
569 rdemem.path_refs++;
570
571 return asp;
572 }
573
574 static inline void
575 path_unref(struct rde_aspath *asp)
576 {
577 if (asp == NULL)
578 return;
579 if ((asp->flags & F_ATTR_LINKED) == 0)
580 fatalx("%s: unlinked object", __func__);
581 asp->refcnt--;
582 rdemem.path_refs--;
583 if (asp->refcnt <= 0)
584 path_unlink(asp);
585 }
586
587 void
588 path_init(u_int32_t hashsize)
589 {
590 u_int32_t hs, i;
591
592 for (hs = 1; hs < hashsize; hs <<= 1)
593 ;
594 pathtable.path_hashtbl = calloc(hs, sizeof(*pathtable.path_hashtbl));
595 if (pathtable.path_hashtbl == NULL)
596 fatal("path_init");
597
598 for (i = 0; i < hs; i++)
599 LIST_INIT(&pathtable.path_hashtbl[i]);
600
601 pathtable.path_hashmask = hs - 1;
602 arc4random_buf(&pathtablekey, sizeof(pathtablekey));
603 }
604
605 void
606 path_shutdown(void)
607 {
608 u_int32_t i;
609
610 for (i = 0; i <= pathtable.path_hashmask; i++)
611 if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
612 log_warnx("path_free: free non-free table");
613
614 free(pathtable.path_hashtbl);
615 }
616
617 void
618 path_hash_stats(struct rde_hashstats *hs)
619 {
620 struct rde_aspath *a;
621 u_int32_t i;
622 int64_t n;
623
624 memset(hs, 0, sizeof(*hs));
625 strlcpy(hs->name, "path hash", sizeof(hs->name));
626 hs->min = LLONG_MAX;
627 hs->num = pathtable.path_hashmask + 1;
628
629 for (i = 0; i <= pathtable.path_hashmask; i++) {
630 n = 0;
631 LIST_FOREACH(a, &pathtable.path_hashtbl[i], path_l)
632 n++;
633 if (n < hs->min)
634 hs->min = n;
635 if (n > hs->max)
636 hs->max = n;
637 hs->sum += n;
638 hs->sumq += n * n;
639 }
640 }
641
642 int
643 path_compare(struct rde_aspath *a, struct rde_aspath *b)
644 {
645 int r;
646
647 if (a == NULL && b == NULL)
648 return (0);
649 else if (b == NULL)
650 return (1);
651 else if (a == NULL)
652 return (-1);
653 if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
654 return (1);
655 if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
656 return (-1);
657 if (a->origin > b->origin)
658 return (1);
659 if (a->origin < b->origin)
660 return (-1);
661 if (a->med > b->med)
662 return (1);
663 if (a->med < b->med)
664 return (-1);
665 if (a->lpref > b->lpref)
666 return (1);
667 if (a->lpref < b->lpref)
668 return (-1);
669 if (a->weight > b->weight)
670 return (1);
671 if (a->weight < b->weight)
672 return (-1);
673 if (a->rtlabelid > b->rtlabelid)
674 return (1);
675 if (a->rtlabelid < b->rtlabelid)
676 return (-1);
677 if (a->pftableid > b->pftableid)
678 return (1);
679 if (a->pftableid < b->pftableid)
680 return (-1);
681
682 r = aspath_compare(a->aspath, b->aspath);
683 if (r > 0)
684 return (1);
685 if (r < 0)
686 return (-1);
687
688 return (attr_compare(a, b));
689 }
690
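/*
 * Hash the fields between aspath_hashstart and aspath_hashend, the AS path
 * itself and the optional attributes into a single SipHash value that is
 * used as the path table key.
 */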
691 static u_int64_t
692 path_hash(struct rde_aspath *asp)
693 {
694 SIPHASH_CTX ctx;
695 u_int64_t hash;
696
697 SipHash24_Init(&ctx, &pathtablekey);
698 SipHash24_Update(&ctx, &asp->aspath_hashstart,
699 (char *)&asp->aspath_hashend - (char *)&asp->aspath_hashstart);
700
701 if (asp->aspath)
702 SipHash24_Update(&ctx, asp->aspath->data, asp->aspath->len);
703
704 hash = attr_hash(asp);
705 SipHash24_Update(&ctx, &hash, sizeof(hash));
706
707 return (SipHash24_End(&ctx));
708 }
709
710 static struct rde_aspath *
711 path_lookup(struct rde_aspath *aspath)
712 {
713 struct aspath_head *head;
714 struct rde_aspath *asp;
715 u_int64_t hash;
716
717 hash = path_hash(aspath);
718 head = PATH_HASH(hash);
719
720 LIST_FOREACH(asp, head, path_l) {
721 if (asp->hash == hash && path_compare(aspath, asp) == 0)
722 return (asp);
723 }
724 return (NULL);
725 }
726
727 /*
728 * Link this aspath into the global hash table.
729 * The asp must have been allocated with path_get().
730 */
731 static void
732 path_link(struct rde_aspath *asp)
733 {
734 struct aspath_head *head;
735
736 asp->hash = path_hash(asp);
737 head = PATH_HASH(asp->hash);
738
739 LIST_INSERT_HEAD(head, asp, path_l);
740 asp->flags |= F_ATTR_LINKED;
741 }
742
743 /*
744 * This function can only be called after all prefixes have been removed.
745 * Normally this happens directly out of the prefix removal functions.
746 */
747 static void
748 path_unlink(struct rde_aspath *asp)
749 {
750 if (asp == NULL)
751 return;
752
753 /* make sure no reference is held for this rde_aspath */
754 if (asp->refcnt != 0)
755 fatalx("%s: still holds references", __func__);
756
757 LIST_REMOVE(asp, path_l);
758 asp->flags &= ~F_ATTR_LINKED;
759
760 path_put(asp);
761 }
762
763 /*
764 * Copy asp to a new UNLINKED aspath.
765 * On dst either path_get() or path_prep() must have been called beforehand.
766 */
767 struct rde_aspath *
768 path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
769 {
770 dst->aspath = src->aspath;
771 if (dst->aspath != NULL) {
772 dst->aspath->refcnt++;
773 rdemem.aspath_refs++;
774 }
775 dst->hash = 0; /* not linked so no hash and no refcnt */
776 dst->refcnt = 0;
777 dst->flags = src->flags & ~F_ATTR_LINKED;
778
779 dst->med = src->med;
780 dst->lpref = src->lpref;
781 dst->weight = src->weight;
782 dst->rtlabelid = rtlabel_ref(src->rtlabelid);
783 dst->pftableid = pftable_ref(src->pftableid);
784 dst->origin = src->origin;
785
786 attr_copy(dst, src);
787
788 return (dst);
789 }
790
791 /* initialize or prepare an aspath for use */
792 struct rde_aspath *
793 path_prep(struct rde_aspath *asp)
794 {
795 memset(asp, 0, sizeof(*asp));
796 asp->origin = ORIGIN_INCOMPLETE;
797 asp->lpref = DEFAULT_LPREF;
798
799 return (asp);
800 }
801
802 /* alloc and initialize new entry. May not fail. */
803 struct rde_aspath *
804 path_get(void)
805 {
806 struct rde_aspath *asp;
807
808 asp = malloc(sizeof(*asp));
809 if (asp == NULL)
810 fatal("path_get");
811 rdemem.path_cnt++;
812
813 return (path_prep(asp));
814 }
815
816 /* clean up an asp after use (frees all references to sub-objects) */
817 void
818 path_clean(struct rde_aspath *asp)
819 {
820 if (asp->flags & F_ATTR_LINKED)
821 fatalx("%s: linked object", __func__);
822
823 rtlabel_unref(asp->rtlabelid);
824 pftable_unref(asp->pftableid);
825 aspath_put(asp->aspath);
826 attr_freeall(asp);
827 }
828
829 /* free an unlinked element */
830 void
831 path_put(struct rde_aspath *asp)
832 {
833 if (asp == NULL)
834 return;
835
836 path_clean(asp);
837
838 rdemem.path_cnt--;
839 free(asp);
840 }
841
842 /* prefix specific functions */
843
844 static int prefix_add(struct bgpd_addr *, int, struct rib *,
845 struct rde_peer *, struct rde_aspath *,
846 struct rde_community *, struct nexthop *,
847 u_int8_t, u_int8_t);
848 static int prefix_move(struct prefix *, struct rde_peer *,
849 struct rde_aspath *, struct rde_community *,
850 struct nexthop *, u_int8_t, u_int8_t);
851
852 static void prefix_link(struct prefix *, struct rib_entry *,
853 struct rde_peer *, struct rde_aspath *,
854 struct rde_community *, struct nexthop *,
855 u_int8_t, u_int8_t);
856 static void prefix_unlink(struct prefix *);
857
858 static struct prefix *prefix_alloc(void);
859 static void prefix_free(struct prefix *);
860
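/*
 * Two orderings are used for Adj-RIB-Out prefixes: prefix_cmp() sorts the
 * per-peer update and withdraw queues by the shared attribute pointers first
 * and by prefix last, so prefixes carrying identical attributes end up next
 * to each other; prefix_index_cmp() sorts the adj_rib_out index by prefix
 * only.
 */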
861 /* RB tree comparison function */
862 static inline int
863 prefix_cmp(struct prefix *a, struct prefix *b)
864 {
865 if (a->eor != b->eor)
866 return a->eor - b->eor;
867 /* for EOR markers there is no need to check the rest; here a->eor == b->eor */
868 if (a->eor)
869 return 0;
870
871 if (a->aspath != b->aspath)
872 return (a->aspath > b->aspath ? 1 : -1);
873 if (a->communities != b->communities)
874 return (a->communities > b->communities ? 1 : -1);
875 if (a->nexthop != b->nexthop)
876 return (a->nexthop > b->nexthop ? 1 : -1);
877 if (a->nhflags != b->nhflags)
878 return (a->nhflags > b->nhflags ? 1 : -1);
879 return pt_prefix_cmp(a->pt, b->pt);
880 }
881
882 static inline int
883 prefix_index_cmp(struct prefix *a, struct prefix *b)
884 {
885 return pt_prefix_cmp(a->pt, b->pt);
886 }
887
888 RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
889 RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)
890
891 /*
892 * Search for the specified prefix of a peer. Returns NULL if not found.
893 */
894 struct prefix *
895 prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
896 int prefixlen)
897 {
898 struct rib_entry *re;
899
900 re = rib_get(rib, prefix, prefixlen);
901 if (re == NULL)
902 return (NULL);
903 return (prefix_bypeer(re, peer));
904 }
905
906 /*
907 * lookup prefix in the peer prefix_index. Returns NULL if not found.
908 */
909 struct prefix *
910 prefix_lookup(struct rde_peer *peer, struct bgpd_addr *prefix,
911 int prefixlen)
912 {
913 struct prefix xp;
914 struct pt_entry *pte;
915
916 memset(&xp, 0, sizeof(xp));
917 pte = pt_fill(prefix, prefixlen);
918 xp.pt = pte;
919
920 return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
921 }
922
923 struct prefix *
924 prefix_match(struct rde_peer *peer, struct bgpd_addr *addr)
925 {
926 struct prefix *p;
927 int i;
928
929 switch (addr->aid) {
930 case AID_INET:
931 case AID_VPN_IPv4:
932 for (i = 32; i >= 0; i--) {
933 p = prefix_lookup(peer, addr, i);
934 if (p != NULL)
935 return p;
936 }
937 break;
938 case AID_INET6:
939 case AID_VPN_IPv6:
940 for (i = 128; i >= 0; i--) {
941 p = prefix_lookup(peer, addr, i);
942 if (p != NULL)
943 return p;
944 }
945 break;
946 default:
947 fatalx("%s: unknown af", __func__);
948 }
949 return NULL;
950 }
951
952 /*
953 * Update a prefix.
954 * Return 1 if prefix was newly added, 0 if it was just changed.
955 */
956 int
957 prefix_update(struct rib *rib, struct rde_peer *peer, struct filterstate *state,
958 struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate)
959 {
960 struct rde_aspath *asp, *nasp = &state->aspath;
961 struct rde_community *comm, *ncomm = &state->communities;
962 struct prefix *p;
963
964 /*
965 * First try to find a prefix in the specified RIB.
966 */
967 if ((p = prefix_get(rib, peer, prefix, prefixlen)) != NULL) {
968 if (prefix_nexthop(p) == state->nexthop &&
969 prefix_nhflags(p) == state->nhflags &&
970 communities_equal(ncomm, prefix_communities(p)) &&
971 path_compare(nasp, prefix_aspath(p)) == 0) {
972 /* no change, update last change */
973 p->lastchange = getmonotime();
974 p->validation_state = vstate;
975 return (0);
976 }
977 }
978
979 /*
980 * Either the prefix does not exist or the path changed.
981 * In both cases lookup the new aspath to make sure it is not
982 * already in the RIB.
983 */
984 if ((asp = path_lookup(nasp)) == NULL) {
985 /* Path not available, create and link a new one. */
986 asp = path_copy(path_get(), nasp);
987 path_link(asp);
988 }
989
990 if ((comm = communities_lookup(ncomm)) == NULL) {
991 /* Communities not available, create and link a new one. */
992 comm = communities_link(ncomm);
993 }
994
995 /* If the prefix was found move it else add it to the RIB. */
996 if (p != NULL)
997 return (prefix_move(p, peer, asp, comm, state->nexthop,
998 state->nhflags, vstate));
999 else
1000 return (prefix_add(prefix, prefixlen, rib, peer, asp, comm,
1001 state->nexthop, state->nhflags, vstate));
1002 }
1003
1004 /*
1005 * Adds or updates a prefix.
1006 */
1007 static int
1008 prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
1009 struct rde_peer *peer, struct rde_aspath *asp, struct rde_community *comm,
1010 struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate)
1011 {
1012 struct prefix *p;
1013 struct rib_entry *re;
1014
1015 re = rib_get(rib, prefix, prefixlen);
1016 if (re == NULL)
1017 re = rib_add(rib, prefix, prefixlen);
1018
1019 p = prefix_alloc();
1020 prefix_link(p, re, peer, asp, comm, nexthop, nhflags, vstate);
1021 return (1);
1022 }
1023
1024 /*
1025 * Move the prefix to the specified aspath; the old asp is removed if needed.
1026 */
1027 static int
1028 prefix_move(struct prefix *p, struct rde_peer *peer,
1029 struct rde_aspath *asp, struct rde_community *comm,
1030 struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate)
1031 {
1032 struct prefix *np;
1033
1034 if (peer != prefix_peer(p))
1035 fatalx("prefix_move: cross peer move");
1036
1037 np = prefix_alloc();
1038 /* add reference to new AS path and communities */
1039 np->aspath = path_ref(asp);
1040 np->communities = communities_ref(comm);
1041 np->peer = peer;
1042 np->re = p->re;
1043 np->pt = p->pt; /* skip refcnt update since ref is moved */
1044 np->validation_state = vstate;
1045 np->nhflags = nhflags;
1046 np->nexthop = nexthop_ref(nexthop);
1047 nexthop_link(np);
1048 np->lastchange = getmonotime();
1049
1050 /* add possible pftable reference from new aspath */
1051 if (asp && asp->pftableid)
1052 rde_pftable_add(asp->pftableid, np);
1053
1054 /*
1055 * no need to update the peer prefix count because we are only moving
1056 * the prefix without changing the peer.
1057 */
1058
1059 /*
1060 * First kick the old prefix node out of the prefix list,
1061 * afterwards run the route decision for new prefix node.
1062 * Because of this only one update is generated if the prefix
1063 * was active.
1064 * This is safe because we create a new prefix and so the change
1065 * is noticed by prefix_evaluate().
1066 */
1067 prefix_evaluate(np->re, np, p);
1068
1069 /* remove old prefix node */
1070 /* as before peer count needs no update because of move */
1071
1072 /* remove possible pftable reference first */
1073 if (p->aspath && p->aspath->pftableid)
1074 rde_pftable_del(p->aspath->pftableid, p);
1075
1076 /* destroy all references to other objects and free the old prefix */
1077 nexthop_unlink(p);
1078 nexthop_unref(p->nexthop);
1079 communities_unref(p->communities);
1080 path_unref(p->aspath);
1081 p->communities = NULL;
1082 p->nexthop = NULL;
1083 p->aspath = NULL;
1084 p->peer = NULL;
1085 p->re = NULL;
1086 p->pt = NULL;
1087 prefix_free(p);
1088
1089 return (0);
1090 }
1091
1092 /*
1093 * Removes a prefix from the specified RIB. If the parent objects -- rib_entry
1094 * or pt_entry -- become empty remove them too.
1095 */
1096 int
1097 prefix_withdraw(struct rib *rib, struct rde_peer *peer,
1098 struct bgpd_addr *prefix, int prefixlen)
1099 {
1100 struct prefix *p;
1101 struct rde_aspath *asp;
1102
1103 p = prefix_get(rib, peer, prefix, prefixlen);
1104 if (p == NULL) /* Got a dummy withdraw request. */
1105 return (0);
1106
1107 asp = prefix_aspath(p);
1108 if (asp && asp->pftableid)
1109 /* only prefixes in the local RIB were pushed into pf */
1110 rde_pftable_del(asp->pftableid, p);
1111
1112 prefix_destroy(p);
1113
1114 return (1);
1115 }
1116
1117 /*
1118 * Insert an End-of-RIB marker into the update queue.
1119 */
1120 void
1121 prefix_add_eor(struct rde_peer *peer, u_int8_t aid)
1122 {
1123 struct prefix *p;
1124
1125 p = prefix_alloc();
1126 p->eor = 1;
1127 p->flags = PREFIX_FLAG_UPDATE;
1128 if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
1129 /* no need to add if EoR marker already present */
1130 prefix_free(p);
1131 /* EOR marker is not inserted into the adj_rib_out index */
1132 }
1133
1134 /*
1135 * Put a prefix from the Adj-RIB-Out onto the update queue.
1136 */
1137 int
1138 prefix_adjout_update(struct rde_peer *peer, struct filterstate *state,
1139 struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate)
1140 {
1141 struct prefix_tree *prefix_head = NULL;
1142 struct rde_aspath *asp;
1143 struct rde_community *comm;
1144 struct prefix *p;
1145 int created = 0;
1146
1147 if ((p = prefix_lookup(peer, prefix, prefixlen)) != NULL) {
1148 /* prefix is already in the Adj-RIB-Out */
1149 if (p->flags & PREFIX_FLAG_WITHDRAW) {
1150 created = 1; /* consider this a new entry */
1151 peer->up_wcnt--;
1152 prefix_head = &peer->withdraws[prefix->aid];
1153 RB_REMOVE(prefix_tree, prefix_head, p);
1154 } else if (p->flags & PREFIX_FLAG_DEAD) {
1155 created = 1; /* consider this a new entry */
1156 } else {
1157 if (prefix_nhflags(p) == state->nhflags &&
1158 prefix_nexthop(p) == state->nexthop &&
1159 communities_equal(&state->communities,
1160 prefix_communities(p)) &&
1161 path_compare(&state->aspath, prefix_aspath(p)) ==
1162 0) {
1163 /* nothing changed */
1164 p->validation_state = vstate;
1165 p->lastchange = getmonotime();
1166 p->flags &= ~PREFIX_FLAG_STALE;
1167 return 0;
1168 }
1169
1170 if (p->flags & PREFIX_FLAG_UPDATE) {
1171 /* created = 0 so up_nlricnt is not increased */
1172 prefix_head = &peer->updates[prefix->aid];
1173 RB_REMOVE(prefix_tree, prefix_head, p);
1174 }
1175 }
1176 /* unlink from aspath and remove nexthop ref */
1177 nexthop_unref(p->nexthop);
1178 communities_unref(p->communities);
1179 path_unref(p->aspath);
1180 p->flags &= ~PREFIX_FLAG_MASK;
1181
1182 /* peer and pt remain */
1183 } else {
1184 p = prefix_alloc();
1185 created = 1;
1186
1187 p->pt = pt_get(prefix, prefixlen);
1188 if (p->pt == NULL)
1189 p->pt = pt_add(prefix, prefixlen);
1190 pt_ref(p->pt);
1191 p->peer = peer;
1192
1193 if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
1194 fatalx("%s: RB index invariant violated", __func__);
1195 }
1196
1197 if ((asp = path_lookup(&state->aspath)) == NULL) {
1198 /* Path not available, create and link a new one. */
1199 asp = path_copy(path_get(), &state->aspath);
1200 path_link(asp);
1201 }
1202
1203 if ((comm = communities_lookup(&state->communities)) == NULL) {
1204 /* Communities not available, create and link a new one. */
1205 comm = communities_link(&state->communities);
1206 }
1207
1208 p->aspath = path_ref(asp);
1209 p->communities = communities_ref(comm);
1210 p->nexthop = nexthop_ref(state->nexthop);
1211 p->nhflags = state->nhflags;
1212
1213 p->validation_state = vstate;
1214 p->lastchange = getmonotime();
1215
1216 if (p->flags & PREFIX_FLAG_MASK)
1217 fatalx("%s: bad flags %x", __func__, p->flags);
1218 p->flags |= PREFIX_FLAG_UPDATE;
1219 if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL)
1220 fatalx("%s: RB tree invariant violated", __func__);
1221
1222 return created;
1223 }
1224
1225 /*
1226 * Withdraw a prefix from the Adj-RIB-Out; this unlinks the aspath but leaves
1227 * the prefix in the RIB, linked to the peer's withdraw list.
1228 */
1229 int
1230 prefix_adjout_withdraw(struct rde_peer *peer, struct bgpd_addr *prefix,
1231 int prefixlen)
1232 {
1233 struct prefix *p;
1234
1235 p = prefix_lookup(peer, prefix, prefixlen);
1236 if (p == NULL) /* Got a dummy withdraw request. */
1237 return (0);
1238
1239 /* already a withdraw, shortcut */
1240 if (p->flags & PREFIX_FLAG_WITHDRAW) {
1241 p->lastchange = getmonotime();
1242 p->flags &= ~PREFIX_FLAG_STALE;
1243 return (0);
1244 }
1245 /* pending update just got withdrawn */
1246 if (p->flags & PREFIX_FLAG_UPDATE)
1247 RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
1248 /* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
1249 p->flags &= ~PREFIX_FLAG_MASK;
1250
1251 /* remove nexthop ref ... */
1252 nexthop_unref(p->nexthop);
1253 p->nexthop = NULL;
1254 p->nhflags = 0;
1255
1260 /* ... communities ... */
1261 communities_unref(p->communities);
1262 p->communities = NULL;
1263 /* and unlink from aspath */
1264 path_unref(p->aspath);
1265 p->aspath = NULL;
1266 /* re already NULL */
1267
1268 p->lastchange = getmonotime();
1269
1270 p->flags |= PREFIX_FLAG_WITHDRAW;
1271 if (RB_INSERT(prefix_tree, &peer->withdraws[prefix->aid], p) != NULL)
1272 fatalx("%s: RB tree invariant violated", __func__);
1273 return (1);
1274 }
1275
1276 static struct prefix *
1277 prefix_restart(struct rib_context *ctx)
1278 {
1279 struct prefix *p;
1280
1281 p = prefix_unlock(ctx->ctx_p);
1282
1283 if (prefix_is_dead(p)) {
1284 struct prefix *next;
1285
1286 next = RB_NEXT(prefix_index, unused, p);
1287 prefix_adjout_destroy(p);
1288 p = next;
1289 }
1290 ctx->ctx_p = NULL;
1291 return p;
1292 }
1293
1294 void
1295 prefix_adjout_destroy(struct prefix *p)
1296 {
1297 struct rde_peer *peer = prefix_peer(p);
1298
1299 if (p->eor) {
1300 /* EOR marker is not linked in the index */
1301 prefix_free(p);
1302 return;
1303 }
1304
1305 if (p->flags & PREFIX_FLAG_WITHDRAW)
1306 RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
1307 else if (p->flags & PREFIX_FLAG_UPDATE)
1308 RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
1309 /* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
1310 p->flags &= ~PREFIX_FLAG_MASK;
1311
1312
1313 if (prefix_is_locked(p)) {
1314 /* remove nexthop ref ... */
1315 nexthop_unref(p->nexthop);
1316 p->nexthop = NULL;
1317 /* ... communities ... */
1318 communities_unref(p->communities);
1319 p->communities = NULL;
1320 /* and unlink from aspath */
1321 path_unref(p->aspath);
1322 p->aspath = NULL;
1323 p->nhflags = 0;
1324 /* re already NULL */
1325
1326 /* finally mark prefix dead */
1327 p->flags |= PREFIX_FLAG_DEAD;
1328 return;
1329 }
1330
1331 RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
1332
1333 prefix_unlink(p);
1334 prefix_free(p);
1335 }
1336
1337 static void
1338 prefix_dump_r(struct rib_context *ctx)
1339 {
1340 struct prefix *p, *next;
1341 struct rde_peer *peer;
1342 unsigned int i;
1343
1344 if ((peer = peer_get(ctx->ctx_id)) == NULL)
1345 goto done;
1346
1347 if (ctx->ctx_p == NULL)
1348 p = RB_MIN(prefix_index, &peer->adj_rib_out);
1349 else
1350 p = prefix_restart(ctx);
1351
1352 for (i = 0; p != NULL; p = next) {
1353 next = RB_NEXT(prefix_index, unused, p);
1354 if (prefix_is_dead(p))
1355 continue;
1356 if (ctx->ctx_aid != AID_UNSPEC &&
1357 ctx->ctx_aid != p->pt->aid)
1358 continue;
1359 if (ctx->ctx_count && i++ >= ctx->ctx_count &&
1360 !prefix_is_locked(p)) {
1361 /* store and lock last element */
1362 ctx->ctx_p = prefix_lock(p);
1363 return;
1364 }
1365 ctx->ctx_prefix_call(p, ctx->ctx_arg);
1366 }
1367
1368 done:
1369 if (ctx->ctx_done)
1370 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
1371 LIST_REMOVE(ctx, entry);
1372 free(ctx);
1373 }
1374
1375 int
1376 prefix_dump_new(struct rde_peer *peer, u_int8_t aid, unsigned int count,
1377 void *arg, void (*upcall)(struct prefix *, void *),
1378 void (*done)(void *, u_int8_t), int (*throttle)(void *))
1379 {
1380 struct rib_context *ctx;
1381
1382 if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
1383 return -1;
1384 ctx->ctx_id = peer->conf.id;
1385 ctx->ctx_aid = aid;
1386 ctx->ctx_count = count;
1387 ctx->ctx_arg = arg;
1388 ctx->ctx_prefix_call = upcall;
1389 ctx->ctx_done = done;
1390 ctx->ctx_throttle = throttle;
1391
1392 LIST_INSERT_HEAD(&rib_dumps, ctx, entry);
1393
1394 /* requested a sync traversal */
1395 if (count == 0)
1396 prefix_dump_r(ctx);
1397
1398 return 0;
1399 }
1400
1401 /* dump a prefix into specified buffer */
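/*
 * For the VPN AIDs the NLRI is written as <prefixlen in bits><label stack>
 * <8-byte RD><prefix>, with the length covering labels, RD and prefix.  On
 * withdraw a single compatibility label (0x800000) is written instead of the
 * real label stack, as allowed by RFC 8277.
 */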
1402 int
1403 prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen,
1404 int withdraw)
1405 {
1406 int totlen, psize;
1407
1408 switch (prefix->aid) {
1409 case AID_INET:
1410 case AID_INET6:
1411 totlen = PREFIX_SIZE(plen);
1412
1413 if (totlen > len)
1414 return (-1);
1415 *buf++ = plen;
1416 memcpy(buf, &prefix->ba, totlen - 1);
1417 return (totlen);
1418 case AID_VPN_IPv4:
1419 case AID_VPN_IPv6:
1420 totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd);
1421 psize = PREFIX_SIZE(plen) - 1;
1422 plen += sizeof(prefix->rd) * 8;
1423 if (withdraw) {
1424 /* withdraws have one compat label as a placeholder */
1425 totlen += 3;
1426 plen += 3 * 8;
1427 } else {
1428 totlen += prefix->labellen;
1429 plen += prefix->labellen * 8;
1430 }
1431
1432 if (totlen > len)
1433 return (-1);
1434 *buf++ = plen;
1435 if (withdraw) {
1436 /* magic compatibility label as per rfc8277 */
1437 *buf++ = 0x80;
1438 *buf++ = 0x0;
1439 *buf++ = 0x0;
1440 } else {
1441 memcpy(buf, &prefix->labelstack,
1442 prefix->labellen);
1443 buf += prefix->labellen;
1444 }
1445 memcpy(buf, &prefix->rd, sizeof(prefix->rd));
1446 buf += sizeof(prefix->rd);
1447 memcpy(buf, &prefix->ba, psize);
1448 return (totlen);
1449 default:
1450 return (-1);
1451 }
1452 }
1453
1454 int
1455 prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, u_int8_t plen)
1456 {
1457 int totlen;
1458 void *bptr;
1459
1460 switch (prefix->aid) {
1461 case AID_INET:
1462 case AID_INET6:
1463 totlen = PREFIX_SIZE(plen);
1464 break;
1465 case AID_VPN_IPv4:
1466 case AID_VPN_IPv6:
1467 totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd) +
1468 prefix->labellen;
1469 break;
1470 default:
1471 return (-1);
1472 }
1473
1474 if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
1475 return (-1);
1476 if (prefix_write(bptr, totlen, prefix, plen, 0) == -1)
1477 return (-1);
1478 return (0);
1479 }
1480
1481 /*
1482 * Searches the prefix list of the specified rib_entry for a prefix entry
1483 * belonging to the given peer. Returns NULL if no match is found.
1484 */
1485 struct prefix *
1486 prefix_bypeer(struct rib_entry *re, struct rde_peer *peer)
1487 {
1488 struct prefix *p;
1489
1490 LIST_FOREACH(p, &re->prefix_h, entry.list.rib)
1491 if (prefix_peer(p) == peer)
1492 return (p);
1493 return (NULL);
1494 }
1495
1496 static void
1497 prefix_evaluate_all(struct prefix *p, enum nexthop_state state,
1498 enum nexthop_state oldstate)
1499 {
1500 /* RIBs flagged as noevaluate should never have nexthop-linked prefixes. */
1501 if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) {
1502 log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);
1503 return;
1504 }
1505
1506 if (oldstate == state) {
1507 /*
1508 * The state of the nexthop did not change. The only
1509 * thing that may have changed is the true_nexthop
1510 * or other internal infos. This will not change
1511 * the routing decision so shortcut here.
1512 */
1513 if (state == NEXTHOP_REACH) {
1514 if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 &&
1515 p == p->re->active)
1516 rde_send_kroute(re_rib(p->re), p, NULL);
1517 }
1518 return;
1519 }
1520
1521 /* redo the route decision */
1522 prefix_evaluate(p->re, p, p);
1523 }
1524
1525 /* kill a prefix. */
1526 void
1527 prefix_destroy(struct prefix *p)
1528 {
1529 /* make route decision */
1530 prefix_evaluate(p->re, NULL, p);
1531
1532 prefix_unlink(p);
1533 prefix_free(p);
1534 }
1535
1536 /*
1537 * Link a prefix into the different parent objects.
1538 */
1539 static void
1540 prefix_link(struct prefix *p, struct rib_entry *re, struct rde_peer *peer,
1541 struct rde_aspath *asp, struct rde_community *comm,
1542 struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate)
1543 {
1544 p->aspath = path_ref(asp);
1545 p->communities = communities_ref(comm);
1546 p->peer = peer;
1547 p->pt = pt_ref(re->prefix);
1548 p->re = re;
1549 p->validation_state = vstate;
1550 p->nhflags = nhflags;
1551 p->nexthop = nexthop_ref(nexthop);
1552 nexthop_link(p);
1553 p->lastchange = getmonotime();
1554
1555 /* add possible pftable reference from aspath */
1556 if (asp && asp->pftableid)
1557 rde_pftable_add(asp->pftableid, p);
1558
1559 /* make route decision */
1560 prefix_evaluate(re, p, NULL);
1561 }
1562
1563 /*
1564 * Unlink a prefix from the different parent objects.
1565 */
1566 static void
1567 prefix_unlink(struct prefix *p)
1568 {
1569 struct rib_entry *re = p->re;
1570
1571 /* destroy all references to other objects */
1572 nexthop_unlink(p);
1573 nexthop_unref(p->nexthop);
1574 communities_unref(p->communities);
1575 path_unref(p->aspath);
1576 pt_unref(p->pt);
1577 p->communities = NULL;
1578 p->nexthop = NULL;
1579 p->aspath = NULL;
1580 p->peer = NULL;
1581 p->re = NULL;
1582 p->pt = NULL;
1583
1584 if (re && rib_empty(re))
1585 rib_remove(re);
1586
1587 /*
1588 * It's the caller's duty to do accounting and remove empty aspath
1589 * structures. Also freeing the unlinked prefix is the caller's duty.
1590 */
1591 }
1592
1593 /* alloc and zero new entry. May not fail. */
1594 static struct prefix *
1595 prefix_alloc(void)
1596 {
1597 struct prefix *p;
1598
1599 p = calloc(1, sizeof(*p));
1600 if (p == NULL)
1601 fatal("prefix_alloc");
1602 rdemem.prefix_cnt++;
1603 return p;
1604 }
1605
1606 /* free an unlinked entry */
1607 static void
1608 prefix_free(struct prefix *p)
1609 {
1610 rdemem.prefix_cnt--;
1611 free(p);
1612 }
1613
1614 /*
1615 * nexthop functions
1616 */
1617 struct nexthop_head *nexthop_hash(struct bgpd_addr *);
1618 struct nexthop *nexthop_lookup(struct bgpd_addr *);
1619
1620 /*
1621 * In BGP there exist two nexthops: the exit nexthop which was announced via
1622 * BGP and the true nexthop which is used in the FIB -- forwarding information
1623 * base, a.k.a. the kernel routing table. When sending updates it is even more
1624 * confusing. In IBGP we pass the unmodified exit nexthop to the neighbors
1625 * while in EBGP normally the address of the router is sent. The exit nexthop
1626 * may be passed to the external neighbor if the neighbor and the exit nexthop
1627 * reside in the same subnet -- directly connected.
1628 */
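/*
 * Nexthops are shared, reference counted objects kept in a hash table.
 * nexthop_get() creates them in state NEXTHOP_LOOKUP and requests resolution
 * via rde_send_nexthop(); nexthop_update() records the answer and queues the
 * nexthop so that all prefixes using it are re-evaluated by nexthop_runner()
 * in batches of RDE_RUNNER_ROUNDS prefixes.
 */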
1629 struct nexthop_table {
1630 LIST_HEAD(nexthop_head, nexthop) *nexthop_hashtbl;
1631 u_int32_t nexthop_hashmask;
1632 } nexthoptable;
1633
1634 SIPHASH_KEY nexthoptablekey;
1635
1636 TAILQ_HEAD(nexthop_queue, nexthop) nexthop_runners;
1637
1638 void
1639 nexthop_init(u_int32_t hashsize)
1640 {
1641 u_int32_t hs, i;
1642
1643 for (hs = 1; hs < hashsize; hs <<= 1)
1644 ;
1645 nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
1646 if (nexthoptable.nexthop_hashtbl == NULL)
1647 fatal("nextop_init");
1648
1649 TAILQ_INIT(&nexthop_runners);
1650 for (i = 0; i < hs; i++)
1651 LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);
1652 arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey));
1653
1654 nexthoptable.nexthop_hashmask = hs - 1;
1655 }
1656
1657 void
1658 nexthop_shutdown(void)
1659 {
1660 u_int32_t i;
1661 struct nexthop *nh, *nnh;
1662
1663 for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
1664 for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
1665 nh != NULL; nh = nnh) {
1666 nnh = LIST_NEXT(nh, nexthop_l);
1667 nh->state = NEXTHOP_UNREACH;
1668 nexthop_unref(nh);
1669 }
1670 if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i])) {
1671 nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
1672 log_warnx("nexthop_shutdown: non-free table, "
1673 "nexthop %s refcnt %d",
1674 log_addr(&nh->exit_nexthop), nh->refcnt);
1675 }
1676 }
1677
1678 free(nexthoptable.nexthop_hashtbl);
1679 }
1680
1681 int
1682 nexthop_pending(void)
1683 {
1684 return !TAILQ_EMPTY(&nexthop_runners);
1685 }
1686
1687 void
1688 nexthop_runner(void)
1689 {
1690 struct nexthop *nh;
1691 struct prefix *p;
1692 u_int32_t j;
1693
1694 nh = TAILQ_FIRST(&nexthop_runners);
1695 if (nh == NULL)
1696 return;
1697
1698 /* remove from runner queue */
1699 TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
1700
1701 p = nh->next_prefix;
1702 for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
1703 prefix_evaluate_all(p, nh->state, nh->oldstate);
1704 p = LIST_NEXT(p, entry.list.nexthop);
1705 }
1706
1707 /* prep for next run; if not finished re-add to tail of queue */
1708 nh->next_prefix = p;
1709 if (p != NULL)
1710 TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
1711 else
1712 log_debug("nexthop %s update finished",
1713 log_addr(&nh->exit_nexthop));
1714 }
1715
1716 void
1717 nexthop_update(struct kroute_nexthop *msg)
1718 {
1719 struct nexthop *nh;
1720
1721 nh = nexthop_lookup(&msg->nexthop);
1722 if (nh == NULL) {
1723 log_warnx("nexthop_update: non-existent nexthop %s",
1724 log_addr(&msg->nexthop));
1725 return;
1726 }
1727
1728 nh->oldstate = nh->state;
1729 if (msg->valid)
1730 nh->state = NEXTHOP_REACH;
1731 else
1732 nh->state = NEXTHOP_UNREACH;
1733
1734 if (nh->oldstate == NEXTHOP_LOOKUP)
1735 /* drop the reference which was held during the lookup */
1736 if (nexthop_unref(nh))
1737 return; /* nh lost last ref, no work left */
1738
1739 if (nh->next_prefix) {
1740 /*
1741 * If nexthop_runner() is not finished with this nexthop
1742 * then ensure that all prefixes are updated by setting
1743 * the oldstate to NEXTHOP_FLAPPED.
1744 */
1745 nh->oldstate = NEXTHOP_FLAPPED;
1746 TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
1747 }
1748
1749 if (msg->connected) {
1750 nh->flags |= NEXTHOP_CONNECTED;
1751 memcpy(&nh->true_nexthop, &nh->exit_nexthop,
1752 sizeof(nh->true_nexthop));
1753 } else
1754 memcpy(&nh->true_nexthop, &msg->gateway,
1755 sizeof(nh->true_nexthop));
1756
1757 memcpy(&nh->nexthop_net, &msg->net,
1758 sizeof(nh->nexthop_net));
1759 nh->nexthop_netlen = msg->netlen;
1760
1761 nh->next_prefix = LIST_FIRST(&nh->prefix_h);
1762 if (nh->next_prefix != NULL) {
1763 TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
1764 log_debug("nexthop %s update starting",
1765 log_addr(&nh->exit_nexthop));
1766 }
1767 }
1768
1769 void
1770 nexthop_modify(struct nexthop *setnh, enum action_types type, u_int8_t aid,
1771 struct nexthop **nexthop, u_int8_t *flags)
1772 {
1773 switch (type) {
1774 case ACTION_SET_NEXTHOP_REJECT:
1775 *flags = NEXTHOP_REJECT;
1776 break;
1777 case ACTION_SET_NEXTHOP_BLACKHOLE:
1778 *flags = NEXTHOP_BLACKHOLE;
1779 break;
1780 case ACTION_SET_NEXTHOP_NOMODIFY:
1781 *flags = NEXTHOP_NOMODIFY;
1782 break;
1783 case ACTION_SET_NEXTHOP_SELF:
1784 *flags = NEXTHOP_SELF;
1785 break;
1786 case ACTION_SET_NEXTHOP_REF:
1787 /*
1788 * it is possible that a prefix matches but has the wrong
1789 * address family for the set nexthop. In this case ignore it.
1790 */
1791 if (aid != setnh->exit_nexthop.aid)
1792 break;
1793 nexthop_unref(*nexthop);
1794 *nexthop = nexthop_ref(setnh);
1795 *flags = 0;
1796 break;
1797 default:
1798 break;
1799 }
1800 }
1801
1802 void
1803 nexthop_link(struct prefix *p)
1804 {
1805 if (p->nexthop == NULL)
1806 return;
1807
1808 /* no need to link prefixes in RIBs that have no decision process */
1809 if (re_rib(p->re)->flags & F_RIB_NOEVALUATE)
1810 return;
1811
1812 p->flags |= PREFIX_NEXTHOP_LINKED;
1813 LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
1814 }
1815
1816 void
1817 nexthop_unlink(struct prefix *p)
1818 {
1819 if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
1820 return;
1821
1822 if (p == p->nexthop->next_prefix) {
1823 p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
1824 /* remove nexthop from list if no prefixes left to update */
1825 if (p->nexthop->next_prefix == NULL) {
1826 TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l);
1827 log_debug("nexthop %s update finished",
1828 log_addr(&p->nexthop->exit_nexthop));
1829 }
1830 }
1831
1832 p->flags &= ~PREFIX_NEXTHOP_LINKED;
1833 LIST_REMOVE(p, entry.list.nexthop);
1834 }
1835
1836 struct nexthop *
1837 nexthop_get(struct bgpd_addr *nexthop)
1838 {
1839 struct nexthop *nh;
1840
1841 nh = nexthop_lookup(nexthop);
1842 if (nh == NULL) {
1843 nh = calloc(1, sizeof(*nh));
1844 if (nh == NULL)
1845 fatal("nexthop_alloc");
1846 rdemem.nexthop_cnt++;
1847
1848 LIST_INIT(&nh->prefix_h);
1849 nh->state = NEXTHOP_LOOKUP;
1850 nexthop_ref(nh); /* take reference for lookup */
1851 nh->exit_nexthop = *nexthop;
1852 LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
1853 nexthop_l);
1854
1855 rde_send_nexthop(&nh->exit_nexthop, 1);
1856 }
1857
1858 return nexthop_ref(nh);
1859 }
1860
1861 struct nexthop *
1862 nexthop_ref(struct nexthop *nexthop)
1863 {
1864 if (nexthop)
1865 nexthop->refcnt++;
1866 return (nexthop);
1867 }
1868
1869 int
1870 nexthop_unref(struct nexthop *nh)
1871 {
1872 if (nh == NULL)
1873 return (0);
1874 if (--nh->refcnt > 0)
1875 return (0);
1876
1877 /* sanity check */
1878 if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
1879 fatalx("%s: refcnt error", __func__);
1880
1881 /* is nexthop update running? impossible, that is a refcnt error */
1882 if (nh->next_prefix)
1883 fatalx("%s: next_prefix not NULL", __func__);
1884
1885 LIST_REMOVE(nh, nexthop_l);
1886 rde_send_nexthop(&nh->exit_nexthop, 0);
1887
1888 rdemem.nexthop_cnt--;
1889 free(nh);
1890 return (1);
1891 }
1892
1893 int
1894 nexthop_compare(struct nexthop *na, struct nexthop *nb)
1895 {
1896 struct bgpd_addr *a, *b;
1897
1898 if (na == nb)
1899 return (0);
1900 if (na == NULL)
1901 return (-1);
1902 if (nb == NULL)
1903 return (1);
1904
1905 a = &na->exit_nexthop;
1906 b = &nb->exit_nexthop;
1907
1908 if (a->aid != b->aid)
1909 return (a->aid - b->aid);
1910
1911 switch (a->aid) {
1912 case AID_INET:
1913 if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
1914 return (1);
1915 if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
1916 return (-1);
1917 return (0);
1918 case AID_INET6:
1919 return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
1920 default:
1921 fatalx("nexthop_cmp: unknown af");
1922 }
1923 return (-1);
1924 }
1925
1926 struct nexthop *
1927 nexthop_lookup(struct bgpd_addr *nexthop)
1928 {
1929 struct nexthop *nh;
1930
1931 LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
1932 if (memcmp(&nh->exit_nexthop, nexthop,
1933 sizeof(struct bgpd_addr)) == 0)
1934 return (nh);
1935 }
1936 return (NULL);
1937 }
1938
1939 struct nexthop_head *
1940 nexthop_hash(struct bgpd_addr *nexthop)
1941 {
1942 u_int32_t h = 0;
1943
1944 switch (nexthop->aid) {
1945 case AID_INET:
1946 h = SipHash24(&nexthoptablekey, &nexthop->v4.s_addr,
1947 sizeof(nexthop->v4.s_addr));
1948 break;
1949 case AID_INET6:
1950 h = SipHash24(&nexthoptablekey, &nexthop->v6,
1951 sizeof(struct in6_addr));
1952 break;
1953 default:
1954 fatalx("nexthop_hash: unsupported AF");
1955 }
1956 return (&nexthoptable.nexthop_hashtbl[h & nexthoptable.nexthop_hashmask]);
1957 }
1958