/*	$OpenBSD: rde_rib.c,v 1.267 2025/01/14 12:24:23 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- actually update speed.
 * Therefore one thing needs to be absolutely avoided: long table walks.
 * This is achieved by heavily linking the different parts together.
 */
uint16_t rib_size;
struct rib **ribs;
struct rib flowrib = { .id = 1, .tree = RB_INITIALIZER(&flowrib.tree) };

struct rib_entry *rib_add(struct rib *, struct pt_entry *);
static inline int rib_compare(const struct rib_entry *,
		    const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
static void rib_dump_abort(uint16_t);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

struct rib_context {
	LIST_ENTRY(rib_context)	 entry;
	struct rib_entry	*ctx_re;
	struct prefix		*ctx_p;
	uint32_t		 ctx_id;
	void		(*ctx_rib_call)(struct rib_entry *, void *);
	void		(*ctx_prefix_call)(struct prefix *, void *);
	void		(*ctx_done)(void *, uint8_t);
	int		(*ctx_throttle)(void *);
	void			*ctx_arg;
	struct bgpd_addr	 ctx_subtree;
	unsigned int		 ctx_count;
	uint8_t			 ctx_aid;
	uint8_t			 ctx_subtreelen;
};
LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);

static void	prefix_dump_r(struct rib_context *);

static inline struct rib_entry *
re_lock(struct rib_entry *re)
{
	if (re->lock != 0)
		log_warnx("%s: entry already locked", __func__);
	re->lock = 1;
	return re;
}

static inline struct rib_entry *
re_unlock(struct rib_entry *re)
{
	if (re->lock == 0)
		log_warnx("%s: entry already unlocked", __func__);
	re->lock = 0;
	return re;
}

static inline int
re_is_locked(struct rib_entry *re)
{
	return (re->lock != 0);
}

static inline struct prefix *
prefix_lock(struct prefix *p)
{
	if (p->flags & PREFIX_FLAG_LOCKED)
		fatalx("%s: locking locked prefix", __func__);
	p->flags |= PREFIX_FLAG_LOCKED;
	return p;
}

static inline struct prefix *
prefix_unlock(struct prefix *p)
{
	if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
		fatalx("%s: unlocking unlocked prefix", __func__);
	p->flags &= ~PREFIX_FLAG_LOCKED;
	return p;
}

static inline int
prefix_is_locked(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_LOCKED) != 0;
}

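/*
 * The lock helpers above do not implement concurrency locking (the RDE is
 * single threaded).  They only pin an element while an incremental dump
 * (rib_dump_r()/prefix_dump_r() below) is suspended, so the walk can later
 * resume from that element even if it would otherwise have been removed.
 * A minimal sketch of the pattern used by the dump code:
 *
 *	ctx->ctx_re = re_lock(re);	-- remember and pin the resume point
 *	...				-- return to the event loop
 *	re = re_unlock(ctx->ctx_re);	-- continue the walk later
 */
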
static inline int
prefix_is_dead(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_DEAD) != 0;
}

static inline struct rib_tree *
rib_tree(struct rib *rib)
{
	return (&rib->tree);
}

static inline int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

/* RIB specific functions */
struct rib *
rib_new(char *name, u_int rtableid, uint16_t flags)
{
	struct rib *new;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] == NULL)
			break;
	}

	if (id >= rib_size) {
		if ((ribs = recallocarray(ribs, id, id + 8,
		    sizeof(struct rib *))) == NULL)
			fatal(NULL);
		rib_size = id + 8;
	}

	if ((new = calloc(1, sizeof(*new))) == NULL)
		fatal(NULL);

	strlcpy(new->name, name, sizeof(new->name));
	RB_INIT(rib_tree(new));
	new->state = RECONF_REINIT;
	new->id = id;
	new->flags = flags;
	new->rtableid = rtableid;

	new->in_rules = calloc(1, sizeof(struct filter_head));
	if (new->in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(new->in_rules);

	ribs[id] = new;

	log_debug("%s: %s -> %u", __func__, name, id);
	return (new);
}

/*
 * This function is only called when the FIB information of a RIB changed.
 * It will flush the FIB if there was one previously and change the fibstate
 * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
 * or RECONF_REINIT (rerun the route decision process for every element)
 * depending on the new flags.
 */
int
rib_update(struct rib *rib)
{
	/* flush fib first if there was one */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);

	/* if the no-evaluate flag changed then a full reinit is needed */
	if ((rib->flags & F_RIB_NOEVALUATE) !=
	    (rib->flags_tmp & F_RIB_NOEVALUATE))
		rib->fibstate = RECONF_REINIT;

	rib->flags = rib->flags_tmp;
	rib->rtableid = rib->rtableid_tmp;

	/* reload fib if there is no reinit pending and there will be a fib */
	if (rib->fibstate != RECONF_REINIT &&
	    (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rib->fibstate = RECONF_RELOAD;

	return (rib->fibstate == RECONF_REINIT);
}

struct rib *
rib_byid(uint16_t id)
{
	if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
		return NULL;
	return ribs[id];
}

uint16_t
rib_find(char *name)
{
	uint16_t id;

	/* no name returns the first Loc-RIB */
	if (name == NULL || *name == '\0')
		return RIB_LOC_START;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
			return id;
	}

	return RIB_NOTFOUND;
}

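/*
 * Taken together, the functions above give the usual lookup pattern used in
 * the rest of the RDE: a RIB is created once by name and later resolved via
 * its numeric id.  A minimal sketch (the name and flags are only
 * illustrative; rib_new() fatal()s on allocation failure, so no error check
 * is needed for it):
 *
 *	uint16_t id = rib_find("Loc-RIB");
 *	if (id == RIB_NOTFOUND)
 *		id = rib_new("Loc-RIB", 0, F_RIB_NOFIB)->id;
 *	struct rib *rib = rib_byid(id);
 */
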
void
rib_free(struct rib *rib)
{
	struct rib_entry *re, *xre;
	struct prefix *p;

	rib_dump_abort(rib->id);

	/*
	 * Flush the RIB, disable route evaluation and fib sync to speed up
	 * the prefix removal.  Nothing depends on this data anymore.
	 */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);
	rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;

	for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, rib_tree(rib), re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will also remove the rib_entry, and because prefix_destroy()
		 * does an empty check it is not possible to use the usual
		 * for loop.
		 */
		while ((p = TAILQ_FIRST(&re->prefix_h))) {
			struct rde_aspath *asp = prefix_aspath(p);
			if (asp && asp->pftableid)
				rde_pftable_del(asp->pftableid, p);
			prefix_destroy(p);
		}
	}
	if (rib->id <= RIB_LOC_START)
		return;		/* never remove the default ribs */
	filterlist_free(rib->in_rules_tmp);
	filterlist_free(rib->in_rules);
	ribs[rib->id] = NULL;
	free(rib);
}

void
rib_shutdown(void)
{
	struct rib *rib;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		if (!RB_EMPTY(rib_tree(ribs[id]))) {
			log_warnx("%s: rib %s is not empty", __func__,
			    ribs[id]->name);
		}
		rib_free(ribs[id]);
	}
	for (id = 0; id <= RIB_LOC_START; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		filterlist_free(rib->in_rules_tmp);
		filterlist_free(rib->in_rules);
		ribs[id] = NULL;
		free(rib);
	}
	free(ribs);
}

struct rib_entry *
rib_get(struct rib *rib, struct pt_entry *pte)
{
	struct rib_entry xre, *re;

	memset(&xre, 0, sizeof(xre));
	xre.prefix = pte;

	re = RB_FIND(rib_tree, rib_tree(rib), &xre);
	if (re && re->rib_id != rib->id)
		fatalx("%s: Unexpected RIB %u != %u.", __func__,
		    re->rib_id, rib->id);
	return re;
}

struct rib_entry *
rib_get_addr(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	return rib_get(rib, pt_fill(prefix, prefixlen));
}

struct rib_entry *
rib_match(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get_addr(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			re = rib_get_addr(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return (NULL);
}

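/*
 * rib_match() above implements longest-prefix matching by probing every
 * prefix length from most to least specific; with the prefix table keyed on
 * exact prefixes this costs at most 33 (IPv4) or 129 (IPv6) lookups per
 * call.  A minimal usage sketch, assuming addr already holds a parsed
 * destination address:
 *
 *	struct rib_entry *re = rib_match(rib, &addr);
 *	if (re != NULL)
 *		...inspect the prefixes on re->prefix_h...
 */
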
struct rib_entry *
rib_add(struct rib *rib, struct pt_entry *pte)
{
	struct rib_entry *re;

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	TAILQ_INIT(&re->prefix_h);
	re->prefix = pt_ref(pte);
	re->rib_id = rib->id;

	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re_is_locked(re))
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);

	if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return TAILQ_EMPTY(&re->prefix_h);
}

static struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re = NULL;

	if (ctx->ctx_re)
		re = re_unlock(ctx->ctx_re);

	/* find first non empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (ctx->ctx_re && rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

static void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry *re, *next;
	struct rib *rib;
	unsigned int i;

	rib = rib_byid(ctx->ctx_id);
	if (rib == NULL)
		fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);

	if (ctx->ctx_re == NULL && ctx->ctx_subtree.aid == AID_UNSPEC)
		re = RB_MIN(rib_tree, rib_tree(rib));
	else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = next) {
		next = RB_NEXT(rib_tree, unused, re);
		if (re->rib_id != ctx->ctx_id)
			fatalx("%s: Unexpected RIB %u != %u.", __func__,
			    re->rib_id, ctx->ctx_id);
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_subtree.aid != AID_UNSPEC) {
			struct bgpd_addr addr;
			pt_getaddr(re->prefix, &addr);
			if (prefix_compare(&ctx->ctx_subtree, &addr,
			    ctx->ctx_subtreelen) != 0)
				/* left subtree, walk is done */
				break;
		}
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !re_is_locked(re)) {
			/* store and lock last element */
			ctx->ctx_re = re_lock(re);
			return;
		}
		ctx->ctx_rib_call(re, ctx->ctx_arg);
	}

	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
rib_dump_pending(void)
{
	struct rib_context *ctx;

	/* return true if at least one context is not throttled */
	LIST_FOREACH(ctx, &rib_dumps, entry) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		return 1;
	}
	return 0;
}

void
rib_dump_runner(void)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		if (ctx->ctx_rib_call != NULL)
			rib_dump_r(ctx);
		else
			prefix_dump_r(ctx);
	}
}

static void
rib_dump_abort(uint16_t id)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (id != ctx->ctx_id)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

void
rib_dump_terminate(void *arg)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_arg != arg)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

int
rib_dump_new(uint16_t id, uint8_t aid, unsigned int count, void *arg,
    void (*upcall)(struct rib_entry *, void *), void (*done)(void *, uint8_t),
    int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_rib_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		rib_dump_r(ctx);

	return 0;
}

int
rib_dump_subtree(uint16_t id, struct bgpd_addr *subtree, uint8_t subtreelen,
    unsigned int count, void *arg, void (*upcall)(struct rib_entry *, void *),
    void (*done)(void *, uint8_t), int (*throttle)(void *))
{
	struct rib_context *ctx;
	struct rib_entry xre;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = id;
	ctx->ctx_aid = subtree->aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_rib_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;
	ctx->ctx_subtree = *subtree;
	ctx->ctx_subtreelen = subtreelen;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* lookup start of subtree */
	memset(&xre, 0, sizeof(xre));
	xre.prefix = pt_fill(subtree, subtreelen);
	ctx->ctx_re = RB_NFIND(rib_tree, rib_tree(rib_byid(id)), &xre);
	if (ctx->ctx_re)
		re_lock(ctx->ctx_re);

	/* requested a sync traversal */
	if (count == 0)
		rib_dump_r(ctx);

	return 0;
}

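/*
 * The dump API above is how the RDE walks a whole RIB without blocking:
 * the walk is chopped into chunks of "count" entries and resumed from
 * rib_dump_runner(), while a count of 0 runs the traversal synchronously.
 * A minimal sketch, assuming hypothetical show_entry()/show_done()/
 * show_throttled() callbacks and a caller-supplied ctx cookie:
 *
 *	if (rib_dump_new(RIB_LOC_START, AID_UNSPEC, 64, ctx,
 *	    show_entry, show_done, show_throttled) == -1)
 *		...handle the allocation failure...
 */
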
/* path specific functions */

static struct rde_aspath *path_lookup(struct rde_aspath *);
static void path_link(struct rde_aspath *);
static void path_unlink(struct rde_aspath *);

static inline int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int r;

	if (a == NULL && b == NULL)
		return (0);
	else if (b == NULL)
		return (1);
	else if (a == NULL)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	/* no need to check aspa_state or aspa_generation */

	r = aspath_compare(a->aspath, b->aspath);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

RB_HEAD(path_tree, rde_aspath) pathtable = RB_INITIALIZER(&pathtable);
RB_GENERATE_STATIC(path_tree, rde_aspath, entry, path_compare);

static inline struct rde_aspath *
path_ref(struct rde_aspath *asp)
{
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt++;
	rdemem.path_refs++;

	return asp;
}

static inline void
path_unref(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt--;
	rdemem.path_refs--;
	if (asp->refcnt <= 0)
		path_unlink(asp);
}

void
path_shutdown(void)
{
	if (!RB_EMPTY(&pathtable))
		log_warnx("path_free: free non-free table");
}

static struct rde_aspath *
path_lookup(struct rde_aspath *aspath)
{
	return (RB_FIND(path_tree, &pathtable, aspath));
}

/*
 * Link this aspath into the global table.
 * The asp must have been allocated with path_get().
 */
static void
path_link(struct rde_aspath *asp)
{
	if (RB_INSERT(path_tree, &pathtable, asp) != NULL)
		fatalx("%s: already linked object", __func__);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * This function can only be called when all prefixes have been removed first.
 * Normally this happens directly out of the prefix removal functions.
 */
static void
path_unlink(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	/* make sure no reference is held for this rde_aspath */
	if (asp->refcnt != 0)
		fatalx("%s: still holds references", __func__);

	RB_REMOVE(path_tree, &pathtable, asp);
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

/*
 * Copy asp to a new UNLINKED aspath.
 * On dst either path_get() or path_prep() must have been called beforehand.
 */
struct rde_aspath *
path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
{
	dst->aspath = aspath_copy(src->aspath);
	dst->refcnt = 0;
	dst->flags = src->flags & ~F_ATTR_LINKED;

	dst->med = src->med;
	dst->lpref = src->lpref;
	dst->weight = src->weight;
	dst->rtlabelid = rtlabel_ref(src->rtlabelid);
	dst->pftableid = pftable_ref(src->pftableid);
	dst->origin = src->origin;

	attr_copy(dst, src);

	return (dst);
}

/* initialize or prepare an aspath for use */
struct rde_aspath *
path_prep(struct rde_aspath *asp)
{
	memset(asp, 0, sizeof(*asp));
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;

	return (asp);
}

/* allocate and initialize a new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = malloc(sizeof(*asp));
	if (asp == NULL)
		fatal("path_get");
	rdemem.path_cnt++;

	return (path_prep(asp));
}

/* clean up an asp after use (frees all references to sub-objects) */
void
path_clean(struct rde_aspath *asp)
{
	if (asp->flags & F_ATTR_LINKED)
		fatalx("%s: linked object", __func__);

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	asp->aspath = NULL;
	attr_freeall(asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	path_clean(asp);

	rdemem.path_cnt--;
	free(asp);
}

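/*
 * The rde_aspath objects above are interned: identical path attribute sets
 * are stored once in the global pathtable and shared by all prefixes via
 * reference counting.  The typical life cycle, as used by prefix_update()
 * further down (sketch only):
 *
 *	if ((asp = path_lookup(nasp)) == NULL) {
 *		asp = path_copy(path_get(), nasp);	-- new attribute set
 *		path_link(asp);				-- intern it
 *	}
 *	...prefix_link() then takes its own reference via path_ref(),
 *	and the last path_unref() triggers path_unlink()/path_put()...
 */
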
/* prefix specific functions */

static int	prefix_add(struct bgpd_addr *, int, struct rib *,
		    struct rde_peer *, uint32_t, uint32_t, struct rde_aspath *,
		    struct rde_community *, struct nexthop *,
		    uint8_t, uint8_t, int);
static int	prefix_move(struct prefix *, struct rde_peer *,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t, int);

static void	prefix_link(struct prefix *, struct rib_entry *,
		    struct pt_entry *, struct rde_peer *, uint32_t, uint32_t,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t);
static void	prefix_unlink(struct prefix *);

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);

/* RB tree comparison function */
static inline int
prefix_index_cmp(struct prefix *a, struct prefix *b)
{
	int r;
	r = pt_prefix_cmp(a->pt, b->pt);
	if (r != 0)
		return r;

	if (a->path_id_tx > b->path_id_tx)
		return 1;
	if (a->path_id_tx < b->path_id_tx)
		return -1;
	return 0;
}

static inline int
prefix_cmp(struct prefix *a, struct prefix *b)
{
	if ((a->flags & PREFIX_FLAG_EOR) != (b->flags & PREFIX_FLAG_EOR))
		return (a->flags & PREFIX_FLAG_EOR) ? 1 : -1;
	/* if EOR marker no need to check the rest */
	if (a->flags & PREFIX_FLAG_EOR)
		return 0;

	if (a->aspath != b->aspath)
		return (a->aspath > b->aspath ? 1 : -1);
	if (a->communities != b->communities)
		return (a->communities > b->communities ? 1 : -1);
	if (a->nexthop != b->nexthop)
		return (a->nexthop > b->nexthop ? 1 : -1);
	if (a->nhflags != b->nhflags)
		return (a->nhflags > b->nhflags ? 1 : -1);
	return prefix_index_cmp(a, b);
}

RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)

/*
 * Search for specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry *re;

	re = rib_get_addr(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, path_id));
}

/*
 * Search for specified prefix in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_get(struct rde_peer *peer, uint32_t path_id_tx,
    struct pt_entry *pte)
{
	struct prefix xp;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pte;
	xp.path_id_tx = path_id_tx;

	return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
}

/*
 * Lookup a prefix without considering path_id in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_first(struct rde_peer *peer, struct pt_entry *pte)
{
	struct prefix xp, *np;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pte;

	np = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
	if (np == NULL || pt_prefix_cmp(np->pt, xp.pt) != 0)
		return NULL;
	return np;
}

/*
 * Return next prefix after a lookup that is actually an update.
 */
struct prefix *
prefix_adjout_next(struct rde_peer *peer, struct prefix *p)
{
	struct prefix *np;

	np = RB_NEXT(prefix_index, &peer->adj_rib_out, p);
	if (np == NULL || np->pt != p->pt)
		return NULL;
	return np;
}

/*
 * Lookup addr/prefixlen in the peer prefix_index. Returns first match.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_lookup(struct rde_peer *peer, struct bgpd_addr *addr, int plen)
{
	return prefix_adjout_first(peer, pt_fill(addr, plen));
}

/*
 * Lookup addr in the peer prefix_index. Returns first match.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_match(struct rde_peer *peer, struct bgpd_addr *addr)
{
	struct prefix *p;
	int i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return NULL;
}

/*
 * Update a prefix.
 * Return 1 if prefix was newly added, 0 if it was just changed.
 */
int
prefix_update(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    uint32_t path_id_tx, struct filterstate *state, int filtered,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rde_aspath *asp, *nasp = &state->aspath;
	struct rde_community *comm, *ncomm = &state->communities;
	struct prefix *p;

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, path_id, prefix, prefixlen)) != NULL) {
		if (path_id_tx != p->path_id_tx)
			fatalx("path_id mismatch");
		if (prefix_nexthop(p) == state->nexthop &&
		    prefix_nhflags(p) == state->nhflags &&
		    communities_equal(ncomm, prefix_communities(p)) &&
		    path_compare(nasp, prefix_aspath(p)) == 0) {
			/* no change, update last change */
			p->lastchange = getmonotime();
			p->validation_state = state->vstate;
			if (filtered)
				p->flags |= PREFIX_FLAG_FILTERED;
			else
				p->flags &= ~PREFIX_FLAG_FILTERED;
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}

	if ((comm = communities_lookup(ncomm)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(ncomm);
	}

	/* If the prefix was found move it else add it to the RIB. */
	if (p != NULL)
		return (prefix_move(p, peer, asp, comm, state->nexthop,
		    state->nhflags, state->vstate, filtered));
	else
		return (prefix_add(prefix, prefixlen, rib, peer, path_id,
		    path_id_tx, asp, comm, state->nexthop, state->nhflags,
		    state->vstate, filtered));
}

/*
 * Adds or updates a prefix.
 */
static int
prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate, int filtered)
{
	struct pt_entry	*pte;
	struct prefix	*p;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);
	re = rib_get(rib, pte);
	if (re == NULL)
		re = rib_add(rib, pte);

	p = prefix_alloc();
	prefix_link(p, re, re->prefix, peer, path_id, path_id_tx, asp, comm,
	    nexthop, nhflags, vstate);

	if (filtered)
		p->flags |= PREFIX_FLAG_FILTERED;

	/* add possible pftable reference from aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, p);
	/* make route decision */
	prefix_evaluate(re, p, NULL);
	return (1);
}

/*
 * Move the prefix to the specified aspath, removing the old asp if needed.
 */
static int
prefix_move(struct prefix *p, struct rde_peer *peer,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate, int filtered)
{
	struct prefix *np;

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);

	if (peer != prefix_peer(p))
		fatalx("prefix_move: cross peer move");

	/* add new prefix node */
	np = prefix_alloc();
	prefix_link(np, prefix_re(p), p->pt, peer, p->path_id, p->path_id_tx,
	    asp, comm, nexthop, nhflags, vstate);

	if (filtered)
		np->flags |= PREFIX_FLAG_FILTERED;

	/* add possible pftable reference from new aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, np);

	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */
	prefix_evaluate(prefix_re(np), np, p);

	/* remove possible pftable reference from old path */
	if (p->aspath && p->aspath->pftableid)
		rde_pftable_del(p->aspath->pftableid, p);

	/* remove old prefix node */
	prefix_unlink(p);
	prefix_free(p);

	return (0);
}

/*
 * Removes a prefix from the specified RIB. If the parent objects -- rib_entry
 * or pt_entry -- become empty remove them too.
 */
int
prefix_withdraw(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct prefix *p;
	struct rde_aspath *asp;

	p = prefix_get(rib, peer, path_id, prefix, prefixlen);
	if (p == NULL)		/* got a dummy withdraw request */
		return (0);

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);
	asp = prefix_aspath(p);
	if (asp && asp->pftableid)
		/* only prefixes in the local RIB were pushed into pf */
		rde_pftable_del(asp->pftableid, p);

	prefix_destroy(p);

	return (1);
}

/*
 * Special functions for flowspec until full integration is available.
 * This just directly feeds the prefixes into the Adj-RIB-Out bypassing
 * Adj-RIB-In and Loc-RIB for now.
 */
int
prefix_flowspec_update(struct rde_peer *peer, struct filterstate *state,
    struct pt_entry *pte, uint32_t path_id_tx)
{
	struct rde_aspath *asp, *nasp;
	struct rde_community *comm, *ncomm;
	struct rib_entry *re;
	struct prefix *new, *old;

	re = rib_get(&flowrib, pte);
	if (re == NULL)
		re = rib_add(&flowrib, pte);

	old = prefix_bypeer(re, peer, 0);
	new = prefix_alloc();

	nasp = &state->aspath;
	ncomm = &state->communities;
	if ((asp = path_lookup(nasp)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}
	if ((comm = communities_lookup(ncomm)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(ncomm);
	}

	prefix_link(new, re, re->prefix, peer, 0, path_id_tx, asp, comm,
	    NULL, 0, 0);
	TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);

	rde_generate_updates(re, new, old, EVAL_DEFAULT);

	if (old != NULL) {
		TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);
		prefix_unlink(old);
		prefix_free(old);
		return 0;
	}
	return 1;
}

/*
 * Remove a possible flowspec prefix from all Adj-RIB-Outs.
 */
int
prefix_flowspec_withdraw(struct rde_peer *peer, struct pt_entry *pte)
{
	struct rib_entry *re;
	struct prefix *p;

	re = rib_get(&flowrib, pte);
	if (re == NULL)
		return 0;
	p = prefix_bypeer(re, peer, 0);
	if (p == NULL)
		return 0;
	rde_generate_updates(re, NULL, p, EVAL_DEFAULT);
	TAILQ_REMOVE(&re->prefix_h, p, entry.list.rib);
	prefix_unlink(p);
	prefix_free(p);
	return 1;
}

/*
 * Push all flowspec rules into a newly available Adj-RIB-Out.
 */
void
prefix_flowspec_dump(uint8_t aid, void *arg,
    void (*call)(struct rib_entry *, void *), void (*done)(void *, uint8_t))
{
	struct rib_entry *re, *next;

	RB_FOREACH_SAFE(re, rib_tree, rib_tree(&flowrib), next) {
		if (aid != AID_UNSPEC && aid != re->prefix->aid)
			continue;
		call(re, arg);
	}
	if (done != NULL)
		done(arg, aid);
}

/*
 * Insert an End-of-RIB marker into the update queue.
 */
void
prefix_add_eor(struct rde_peer *peer, uint8_t aid)
{
	struct prefix *p;

	p = prefix_alloc();
	p->flags = PREFIX_FLAG_ADJOUT | PREFIX_FLAG_UPDATE | PREFIX_FLAG_EOR;
	if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
		/* no need to add if EoR marker already present */
		prefix_free(p);
	/* EOR marker is not inserted into the adj_rib_out index */
}

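/*
 * The functions below maintain the per-peer Adj-RIB-Out.  Pending changes
 * are queued in the peer->updates[aid] and peer->withdraws[aid] RB trees
 * until the update engine drains them; prefix_cmp() sorts End-of-RIB
 * markers after all real prefixes so they are sent last.  A rough sketch
 * of the flow for one announced prefix (names as used below):
 *
 *	prefix_adjout_update(p, peer, &state, pte, path_id_tx);
 *		-- p is (re)linked and queued on peer->updates[aid]
 *	prefix_adjout_withdraw(p);
 *		-- p is unlinked and moved to peer->withdraws[aid]
 */
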
/*
 * Put a prefix from the Adj-RIB-Out onto the update queue.
 */
void
prefix_adjout_update(struct prefix *p, struct rde_peer *peer,
    struct filterstate *state, struct pt_entry *pte, uint32_t path_id_tx)
{
	struct rde_aspath *asp;
	struct rde_community *comm;

	if (p == NULL) {
		p = prefix_alloc();
		/* initially mark DEAD so code below is skipped */
		p->flags |= PREFIX_FLAG_ADJOUT | PREFIX_FLAG_DEAD;

		p->pt = pt_ref(pte);
		p->peer = peer;
		p->path_id_tx = path_id_tx;

		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		/*
		 * XXX for now treat a different path_id_tx like different
		 * attributes and force out an update. It is unclear how
		 * common it is to have equivalent updates from alternative
		 * paths.
		 */
		if (p->path_id_tx == path_id_tx &&
		    prefix_nhflags(p) == state->nhflags &&
		    prefix_nexthop(p) == state->nexthop &&
		    communities_equal(&state->communities,
		    prefix_communities(p)) &&
		    path_compare(&state->aspath, prefix_aspath(p)) == 0) {
			/* nothing changed */
			p->validation_state = state->vstate;
			p->lastchange = getmonotime();
			p->flags &= ~PREFIX_FLAG_STALE;
			return;
		}

		/* if pending update unhook it before it is unlinked */
		if (p->flags & PREFIX_FLAG_UPDATE) {
			RB_REMOVE(prefix_tree, &peer->updates[pte->aid], p);
			peer->stats.pending_update--;
		}

		/* unlink prefix so it can be relinked below */
		prefix_unlink(p);
		peer->stats.prefix_out_cnt--;
	}
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[pte->aid], p);
		peer->stats.pending_withdraw--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	/* update path_id_tx now that the prefix is unlinked */
	if (p->path_id_tx != path_id_tx) {
		/* path_id_tx is part of the index so remove and re-insert p */
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		p->path_id_tx = path_id_tx;
		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((asp = path_lookup(&state->aspath)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), &state->aspath);
		path_link(asp);
	}

	if ((comm = communities_lookup(&state->communities)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(&state->communities);
	}

	prefix_link(p, NULL, p->pt, peer, 0, p->path_id_tx, asp, comm,
	    state->nexthop, state->nhflags, state->vstate);
	peer->stats.prefix_out_cnt++;

	if (p->flags & PREFIX_FLAG_MASK)
		fatalx("%s: bad flags %x", __func__, p->flags);
	if (peer_is_up(peer)) {
		p->flags |= PREFIX_FLAG_UPDATE;
		if (RB_INSERT(prefix_tree, &peer->updates[pte->aid], p) != NULL)
			fatalx("%s: RB tree invariant violated", __func__);
		peer->stats.pending_update++;
	}
}

/*
 * Withdraw a prefix from the Adj-RIB-Out, this unlinks the aspath but leaves
 * the prefix in the RIB linked to the peer withdraw list.
 */
void
prefix_adjout_withdraw(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	/* already a withdraw, shortcut */
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		p->lastchange = getmonotime();
		p->flags &= ~PREFIX_FLAG_STALE;
		return;
	}
	/* pending update just got withdrawn */
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->stats.pending_update--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->stats.prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;
	p->lastchange = getmonotime();

	if (peer_is_up(peer)) {
		p->flags |= PREFIX_FLAG_WITHDRAW;
		if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid],
		    p) != NULL)
			fatalx("%s: RB tree invariant violated", __func__);
		peer->stats.pending_withdraw++;
	} else {
		/* mark prefix dead to skip unlink on destroy */
		p->flags |= PREFIX_FLAG_DEAD;
		prefix_adjout_destroy(p);
	}
}

void
prefix_adjout_destroy(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	if (p->flags & PREFIX_FLAG_EOR) {
		/* EOR marker is not linked in the index */
		prefix_free(p);
		return;
	}

	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
		peer->stats.pending_withdraw--;
	}
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->stats.pending_update--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->stats.prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	if (prefix_is_locked(p)) {
		/* mark prefix dead but leave it for prefix_restart */
		p->flags |= PREFIX_FLAG_DEAD;
	} else {
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		/* remove the last prefix reference before free */
		pt_unref(p->pt);
		prefix_free(p);
	}
}

void
prefix_adjout_flush_pending(struct rde_peer *peer)
{
	struct prefix *p, *np;
	uint8_t aid;

	for (aid = AID_MIN; aid < AID_MAX; aid++) {
		RB_FOREACH_SAFE(p, prefix_tree, &peer->withdraws[aid], np) {
			prefix_adjout_destroy(p);
		}
		RB_FOREACH_SAFE(p, prefix_tree, &peer->updates[aid], np) {
			p->flags &= ~PREFIX_FLAG_UPDATE;
			RB_REMOVE(prefix_tree, &peer->updates[aid], p);
			peer->stats.pending_update--;
		}
	}
}

int
prefix_adjout_reaper(struct rde_peer *peer)
{
	struct prefix *p, *np;
	int count = RDE_REAPER_ROUNDS;

	RB_FOREACH_SAFE(p, prefix_index, &peer->adj_rib_out, np) {
		prefix_adjout_destroy(p);
		if (count-- <= 0)
			return 0;
	}
	return 1;
}

static struct prefix *
prefix_restart(struct rib_context *ctx)
{
	struct prefix *p = NULL;

	if (ctx->ctx_p)
		p = prefix_unlock(ctx->ctx_p);

	if (p && prefix_is_dead(p)) {
		struct prefix *next;

		next = RB_NEXT(prefix_index, unused, p);
		prefix_adjout_destroy(p);
		p = next;
	}
	ctx->ctx_p = NULL;
	return p;
}

static void
prefix_dump_r(struct rib_context *ctx)
{
	struct prefix *p, *next;
	struct rde_peer *peer;
	unsigned int i;

	if ((peer = peer_get(ctx->ctx_id)) == NULL)
		goto done;

	if (ctx->ctx_p == NULL && ctx->ctx_subtree.aid == AID_UNSPEC)
		p = RB_MIN(prefix_index, &peer->adj_rib_out);
	else
		p = prefix_restart(ctx);

	for (i = 0; p != NULL; p = next) {
		next = RB_NEXT(prefix_index, unused, p);
		if (prefix_is_dead(p))
			continue;
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != p->pt->aid)
			continue;
		if (ctx->ctx_subtree.aid != AID_UNSPEC) {
			struct bgpd_addr addr;
			pt_getaddr(p->pt, &addr);
			if (prefix_compare(&ctx->ctx_subtree, &addr,
			    ctx->ctx_subtreelen) != 0)
				/* left subtree, walk is done */
				break;
		}
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !prefix_is_locked(p)) {
			/* store and lock last element */
			ctx->ctx_p = prefix_lock(p);
			return;
		}
		ctx->ctx_prefix_call(p, ctx->ctx_arg);
	}

done:
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
prefix_dump_new(struct rde_peer *peer, uint8_t aid, unsigned int count,
    void *arg, void (*upcall)(struct prefix *, void *),
    void (*done)(void *, uint8_t), int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = peer->conf.id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_prefix_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		prefix_dump_r(ctx);

	return 0;
}

int
prefix_dump_subtree(struct rde_peer *peer, struct bgpd_addr *subtree,
    uint8_t subtreelen, unsigned int count, void *arg,
    void (*upcall)(struct prefix *, void *), void (*done)(void *, uint8_t),
    int (*throttle)(void *))
{
	struct rib_context *ctx;
	struct prefix xp;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = peer->conf.id;
	ctx->ctx_aid = subtree->aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_prefix_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;
	ctx->ctx_subtree = *subtree;
	ctx->ctx_subtreelen = subtreelen;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* lookup start of subtree */
	memset(&xp, 0, sizeof(xp));
	xp.pt = pt_fill(subtree, subtreelen);
	ctx->ctx_p = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
	if (ctx->ctx_p)
		prefix_lock(ctx->ctx_p);

	/* requested a sync traversal */
	if (count == 0)
		prefix_dump_r(ctx);

	return 0;
}

/*
 * Searches the prefix list of the specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, uint32_t path_id)
{
	struct prefix *p;

	TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
		if (prefix_peer(p) == peer && p->path_id == path_id)
			return (p);
	return (NULL);
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	/* make route decision */
	prefix_evaluate(prefix_re(p), NULL, p);

	prefix_unlink(p);
	prefix_free(p);
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *p, struct rib_entry *re, struct pt_entry *pt,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	if (re)
		p->entry.list.re = re;
	p->aspath = path_ref(asp);
	p->communities = communities_ref(comm);
	p->peer = peer;
	p->pt = pt_ref(pt);
	p->path_id = path_id;
	p->path_id_tx = path_id_tx;
	p->validation_state = vstate;
	p->nhflags = nhflags;
	p->nexthop = nexthop_ref(nexthop);
	nexthop_link(p);
	p->lastchange = getmonotime();
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *p)
{
	struct rib_entry *re = prefix_re(p);

	/* destroy all references to other objects */
	/* remove nexthop ref ... */
	nexthop_unlink(p);
	nexthop_unref(p->nexthop);
	p->nexthop = NULL;
	p->nhflags = 0;
	/* ... communities ... */
	communities_unref(p->communities);
	p->communities = NULL;
	/* and unlink from aspath */
	path_unref(p->aspath);
	p->aspath = NULL;

	if (re && rib_empty(re))
		rib_remove(re);

	pt_unref(p->pt);
}

/* alloc and zero a new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return p;
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *p)
{
	rdemem.prefix_cnt--;
	free(p);
}

/*
 * nexthop functions
 */
struct nexthop *nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop which was announced via
 * BGP and the true nexthop which is used in the FIB -- forwarding information
 * base, a.k.a. the kernel routing table. When sending updates it is even more
 * confusing. In IBGP we pass the unmodified exit nexthop to the neighbors
 * while in EBGP normally the address of the router is sent. The exit nexthop
 * may be passed to the external neighbor if the neighbor and the exit nexthop
 * reside in the same subnet -- directly connected.
 */

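/*
 * A concrete (purely hypothetical) example of the distinction described
 * above: an eBGP neighbor 192.0.2.1 announces 203.0.113.0/24 with exit
 * nexthop 192.0.2.1; if that address is only reachable via the IGP router
 * 10.0.0.1, then 10.0.0.1 is the true nexthop installed in the FIB, while
 * 192.0.2.1 remains the nexthop carried in BGP updates.
 */
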
TAILQ_HEAD(nexthop_queue, nexthop)	nexthop_runners =
    TAILQ_HEAD_INITIALIZER(nexthop_runners);

RB_HEAD(nexthop_tree, nexthop)		nexthoptable =
    RB_INITIALIZER(&nexthoptree);

static inline int nexthop_cmp(struct nexthop *, struct nexthop *);

RB_GENERATE_STATIC(nexthop_tree, nexthop, entry, nexthop_cmp);

void
nexthop_shutdown(void)
{
	struct nexthop *nh, *nnh;

	RB_FOREACH_SAFE(nh, nexthop_tree, &nexthoptable, nnh) {
		nh->state = NEXTHOP_UNREACH;
		nexthop_unref(nh);
	}

	RB_FOREACH(nh, nexthop_tree, &nexthoptable) {
		log_warnx("%s: nexthop %s refcnt %d", __func__,
		    log_addr(&nh->exit_nexthop), nh->refcnt);
	}
}

int
nexthop_pending(void)
{
	return !TAILQ_EMPTY(&nexthop_runners);
}

void
nexthop_runner(void)
{
	struct nexthop *nh;
	struct prefix *p;
	uint32_t j;

	nh = TAILQ_FIRST(&nexthop_runners);
	if (nh == NULL)
		return;

	/* remove from runner queue */
	TAILQ_REMOVE(&nexthop_runners, nh, runner_l);

	p = nh->next_prefix;
	for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
		prefix_evaluate_nexthop(p, nh->state, nh->oldstate);
		p = LIST_NEXT(p, entry.list.nexthop);
	}

	/* prep for next run, if not finished re-add to tail of queue */
	nh->next_prefix = p;
	if (p != NULL)
		TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
	else
		log_debug("nexthop %s update finished",
		    log_addr(&nh->exit_nexthop));
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop *nh;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	nh->oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (nh->oldstate == NEXTHOP_LOOKUP)
		/* drop reference which was held during the lookup */
		if (nexthop_unref(nh))
			return;		/* nh lost last ref, no work left */

	if (nh->next_prefix) {
		/*
		 * If nexthop_runner() is not finished with this nexthop
		 * then ensure that all prefixes are updated by setting
		 * the oldstate to NEXTHOP_FLAPPED.
		 */
		nh->oldstate = NEXTHOP_FLAPPED;
		TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
	}

	if (msg->connected)
		nh->flags |= NEXTHOP_CONNECTED;

	nh->true_nexthop = msg->gateway;
	nh->nexthop_net = msg->net;
	nh->nexthop_netlen = msg->netlen;

	nh->next_prefix = LIST_FIRST(&nh->prefix_h);
	if (nh->next_prefix != NULL) {
		TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
		log_debug("nexthop %s update starting",
		    log_addr(&nh->exit_nexthop));
	}
}

void
nexthop_modify(struct nexthop *setnh, enum action_types type, uint8_t aid,
    struct nexthop **nexthop, uint8_t *flags)
{
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		*flags = NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		*flags = NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		*flags = NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		*flags = NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP_REF:
		/*
		 * it is possible that a prefix matches but has the wrong
		 * address family for the set nexthop. In this case ignore it.
		 */
		if (aid != setnh->exit_nexthop.aid)
			break;
		nexthop_unref(*nexthop);
		*nexthop = nexthop_ref(setnh);
		*flags = 0;
		break;
	default:
		break;
	}
}

void
nexthop_link(struct prefix *p)
{
	p->nhflags &= ~NEXTHOP_VALID;

	if (p->flags & PREFIX_FLAG_ADJOUT) {
		/* All nexthops are valid in Adj-RIB-Out */
		p->nhflags |= NEXTHOP_VALID;
		return;
	}

	/* no need to link prefixes in RIBs that have no decision process */
	if (re_rib(prefix_re(p))->flags & F_RIB_NOEVALUATE)
		return;

	/* self-announce networks use nexthop NULL and are valid as well */
	if (p->nexthop == NULL || p->nexthop->state == NEXTHOP_REACH)
		p->nhflags |= NEXTHOP_VALID;

	if (p->nexthop == NULL)
		return;
	p->flags |= PREFIX_NEXTHOP_LINKED;
	LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
}

void
nexthop_unlink(struct prefix *p)
{
	p->nhflags &= ~NEXTHOP_VALID;

	if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
		return;

	if (p == p->nexthop->next_prefix) {
		p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
		/* remove nexthop from list if no prefixes left to update */
		if (p->nexthop->next_prefix == NULL) {
			TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l);
			log_debug("nexthop %s update finished",
			    log_addr(&p->nexthop->exit_nexthop));
		}
	}

	p->flags &= ~PREFIX_NEXTHOP_LINKED;
	LIST_REMOVE(p, entry.list.nexthop);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop *nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_get");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->prefix_h);
		nh->state = NEXTHOP_LOOKUP;
		nexthop_ref(nh);	/* take reference for lookup */
		nh->exit_nexthop = *nexthop;
		if (RB_INSERT(nexthop_tree, &nexthoptable, nh) != NULL)
			fatalx("nexthop tree inconsistency");

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return nexthop_ref(nh);
}

struct nexthop *
nexthop_ref(struct nexthop *nexthop)
{
	if (nexthop)
		nexthop->refcnt++;
	return (nexthop);
}

int
nexthop_unref(struct nexthop *nh)
{
	if (nh == NULL)
		return (0);
	if (--nh->refcnt > 0)
		return (0);

	/* sanity check */
	if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
		fatalx("%s: refcnt error", __func__);

	/* is a nexthop update running? impossible, that is a refcnt error */
	if (nh->next_prefix)
		fatalx("%s: next_prefix not NULL", __func__);

	RB_REMOVE(nexthop_tree, &nexthoptable, nh);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

static inline int
nexthop_cmp(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr *a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: %s is unsupported", aid2str(a->aid));
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop needle = { 0 };

	needle.exit_nexthop = *nexthop;
	return RB_FIND(nexthop_tree, &nexthoptable, &needle);
}