1 /* $OpenBSD: rde_decide.c,v 1.85 2021/05/04 09:21:05 claudio Exp $ */ 2 3 /* 4 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org> 5 * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20 #include <sys/types.h> 21 #include <sys/queue.h> 22 23 #include <string.h> 24 25 #include "bgpd.h" 26 #include "rde.h" 27 #include "log.h" 28 29 int prefix_cmp(struct prefix *, struct prefix *, int *); 30 void prefix_insert(struct prefix *, struct prefix *, struct rib_entry *); 31 void prefix_remove(struct prefix *, struct rib_entry *); 32 /* 33 * Decision Engine RFC implementation: 34 * Phase 1: 35 * - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes 36 * - IBGP routes may either use LOCAL_PREF or the local system computes 37 * the degree of preference 38 * - If the route is ineligible, the route MAY NOT serve as an input to 39 * the next phase of route selection 40 * - if the route is eligible the computed value MUST be used as the 41 * LOCAL_PREF value in any IBGP readvertisement 42 * 43 * Phase 2: 44 * - If the NEXT_HOP attribute of a BGP route depicts an address that is 45 * not resolvable the BGP route MUST be excluded from the Phase 2 decision 46 * function. 47 * - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP 48 * route should be excluded from the Phase 2 decision function. 49 * - The local BGP speaker identifies the route that has: 50 * a) the highest degree of preference of any route to the same set 51 * of destinations 52 * b) is the only route to that destination 53 * c) is selected as a result of the Phase 2 tie breaking rules 54 * - The local speaker MUST determine the immediate next-hop address from 55 * the NEXT_HOP attribute of the selected route. 56 * - If either the immediate next hop or the IGP cost to the NEXT_HOP changes, 57 * Phase 2 Route Selection MUST be performed again. 58 * 59 * Route Resolvability Condition 60 * - A route Rte1, referencing only the intermediate network address, is 61 * considered resolvable if the Routing Table contains at least one 62 * resolvable route Rte2 that matches Rte1's intermediate network address 63 * and is not recursively resolved through Rte1. 64 * - Routes referencing interfaces are considered resolvable if the state of 65 * the referenced interface is up and IP processing is enabled. 66 * 67 * Breaking Ties (Phase 2) 68 * 1. Remove from consideration all routes which are not tied for having the 69 * smallest number of AS numbers present in their AS_PATH attributes. 70 * Note, that when counting this number, an AS_SET counts as 1 71 * 2. Remove from consideration all routes which are not tied for having the 72 * lowest Origin number in their Origin attribute. 73 * 3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC 74 * attributes. MULTI_EXIT_DISC is only comparable between routes learned 75 * from the same neighboring AS. 76 * 4. If at least one of the candidate routes was received via EBGP, 77 * remove from consideration all routes which were received via IBGP. 78 * 5. Remove from consideration any routes with less-preferred interior cost. 79 * If the NEXT_HOP hop for a route is reachable, but no cost can be 80 * determined, then this step should be skipped. 81 * 6. Remove from consideration all routes other than the route that was 82 * advertised by the BGP speaker whose BGP Identifier has the lowest value. 83 * 7. Prefer the route received from the lowest peer address. 84 * 85 * Phase 3: Route Dissemination 86 * - All routes in the Loc-RIB are processed into Adj-RIBs-Out according 87 * to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out 88 * unless the destination and NEXT_HOP described by this route may be 89 * forwarded appropriately by the Routing Table. 90 */ 91 92 /* 93 * Decision Engine OUR implementation: 94 * The filtering is done first. The filtering calculates the preference and 95 * stores it in LOCAL_PREF (Phase 1). 96 * Ineligible routes are flagged as ineligible via nexthop_add(). 97 * Phase 3 is done together with Phase 2. 98 * In following cases a prefix needs to be reevaluated: 99 * - update of a prefix (prefix_update) 100 * - withdraw of a prefix (prefix_withdraw) 101 * - state change of the nexthop (nexthop-{in}validate) 102 * - state change of session (session down) 103 */ 104 105 /* 106 * Compare two prefixes with equal pt_entry. Returns an integer greater than or 107 * less than 0, according to whether the prefix p1 is more or less preferred 108 * than the prefix p2. p1 should be used for the new prefix and p2 for a 109 * already added prefix. 110 */ 111 int 112 prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) 113 { 114 struct rde_aspath *asp1, *asp2; 115 struct rde_peer *peer1, *peer2; 116 struct attr *a; 117 u_int32_t p1id, p2id; 118 int p1cnt, p2cnt, i; 119 120 /* 121 * If a match happens before the MED check then the list is 122 * correctly sorted. If a match happens after MED then further 123 * elements may need to be checked to ensure that all paths 124 * which could affect this path were considered. This only 125 * matters for strict MED evaluation and in that case testall 126 * is set to 1. If the check happens to be on the MED check 127 * itself testall is set to 2. 128 */ 129 *testall = 0; 130 131 if (p1 == NULL) 132 return -1; 133 if (p2 == NULL) 134 return 1; 135 136 asp1 = prefix_aspath(p1); 137 asp2 = prefix_aspath(p2); 138 peer1 = prefix_peer(p1); 139 peer2 = prefix_peer(p2); 140 141 /* pathes with errors are not eligible */ 142 if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR) 143 return -1; 144 if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR) 145 return 1; 146 147 /* only loop free pathes are eligible */ 148 if (asp1->flags & F_ATTR_LOOP) 149 return -1; 150 if (asp2->flags & F_ATTR_LOOP) 151 return 1; 152 153 /* 154 * 1. check if prefix is eligible a.k.a reachable 155 * A NULL nexthop is eligible since it is used for locally 156 * announced networks. 157 */ 158 if (prefix_nexthop(p2) != NULL && 159 prefix_nexthop(p2)->state != NEXTHOP_REACH) 160 return 1; 161 if (prefix_nexthop(p1) != NULL && 162 prefix_nexthop(p1)->state != NEXTHOP_REACH) 163 return -1; 164 165 /* 2. local preference of prefix, bigger is better */ 166 if (asp1->lpref > asp2->lpref) 167 return 1; 168 if (asp1->lpref < asp2->lpref) 169 return -1; 170 171 /* 3. aspath count, the shorter the better */ 172 if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0) 173 return (asp2->aspath->ascnt - asp1->aspath->ascnt); 174 175 /* 4. origin, the lower the better */ 176 if ((asp2->origin - asp1->origin) != 0) 177 return (asp2->origin - asp1->origin); 178 179 /* 180 * 5. MED decision 181 * Only comparable between the same neighboring AS or if 182 * 'rde med compare always' is set. In the first case 183 * set the testall flag since further elements need to be 184 * evaluated as well. 185 */ 186 if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) || 187 aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) { 188 if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS)) 189 *testall = 2; 190 /* lowest value wins */ 191 if (asp1->med < asp2->med) 192 return 1; 193 if (asp1->med > asp2->med) 194 return -1; 195 } 196 197 if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS)) 198 *testall = 1; 199 200 /* 201 * 6. EBGP is cooler than IBGP 202 * It is absolutely important that the ebgp value in peer_config.ebgp 203 * is bigger than all other ones (IBGP, confederations) 204 */ 205 if (peer1->conf.ebgp != peer2->conf.ebgp) { 206 if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */ 207 return 1; 208 else if (peer2->conf.ebgp) /* peer2 is EBGP */ 209 return -1; 210 } 211 212 /* 213 * 7. local tie-breaker, this weight is here to tip equal long AS 214 * paths in one or the other direction. It happens more and more 215 * that AS paths are equally long and so traffic engineering needs 216 * a metric that weights a prefix at a very late stage in the 217 * decision process. 218 */ 219 if (asp1->weight > asp2->weight) 220 return 1; 221 if (asp1->weight < asp2->weight) 222 return -1; 223 224 /* 8. nexthop costs. NOT YET -> IGNORE */ 225 226 /* 227 * 9. older route (more stable) wins but only if route-age 228 * evaluation is enabled. 229 */ 230 if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) { 231 if (p1->lastchange < p2->lastchange) /* p1 is older */ 232 return 1; 233 if (p1->lastchange > p2->lastchange) 234 return -1; 235 } 236 237 /* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */ 238 if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) { 239 memcpy(&p1id, a->data, sizeof(p1id)); 240 p1id = ntohl(p1id); 241 } else 242 p1id = peer1->remote_bgpid; 243 if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) { 244 memcpy(&p2id, a->data, sizeof(p2id)); 245 p2id = ntohl(p2id); 246 } else 247 p2id = peer2->remote_bgpid; 248 if (p1id < p2id) 249 return 1; 250 if (p1id > p2id) 251 return -1; 252 253 /* 11. compare CLUSTER_LIST length, shorter is better */ 254 p1cnt = p2cnt = 0; 255 if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL) 256 p1cnt = a->len / sizeof(u_int32_t); 257 if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL) 258 p2cnt = a->len / sizeof(u_int32_t); 259 if ((p2cnt - p1cnt) != 0) 260 return (p2cnt - p1cnt); 261 262 /* 12. lowest peer address wins (IPv4 is better than IPv6) */ 263 if (peer1->remote_addr.aid < peer2->remote_addr.aid) 264 return 1; 265 if (peer1->remote_addr.aid > peer2->remote_addr.aid) 266 return -1; 267 switch (peer1->remote_addr.aid) { 268 case AID_INET: 269 i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4, 270 sizeof(struct in_addr)); 271 break; 272 case AID_INET6: 273 i = memcmp(&peer1->remote_addr.v6, &peer2->remote_addr.v6, 274 sizeof(struct in6_addr)); 275 break; 276 default: 277 fatalx("%s: unknown af", __func__); 278 } 279 if (i < 0) 280 return 1; 281 if (i > 0) 282 return -1; 283 284 fatalx("Uh, oh a politician in the decision process"); 285 } 286 287 /* 288 * Insert a prefix keeping the total order of the list. For routes 289 * that may depend on a MED selection the set is scanned until the 290 * condition is cleared. If a MED inversion is detected the respective 291 * prefix is taken of the rib list and put onto a redo queue. All 292 * prefixes on the redo queue are re-inserted at the end. 293 */ 294 void 295 prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re) 296 { 297 struct prefix_list redo = LIST_HEAD_INITIALIZER(redo); 298 struct prefix *xp, *np, *tailp = NULL, *insertp = ep; 299 int testall, selected = 0; 300 301 /* start scan at the entry point (ep) or if the head if ep == NULL */ 302 if (ep == NULL) 303 ep = LIST_FIRST(&re->prefix_h); 304 305 for (xp = ep; xp != NULL; xp = np) { 306 np = LIST_NEXT(xp, entry.list.rib); 307 308 if (prefix_cmp(new, xp, &testall) > 0) { 309 /* new is preferred over xp */ 310 if (testall == 0) 311 break; /* we're done */ 312 else if (testall == 2) { 313 /* 314 * MED inversion, take out prefix and 315 * put it onto redo queue. 316 */ 317 LIST_REMOVE(xp, entry.list.rib); 318 if (tailp == NULL) 319 LIST_INSERT_HEAD(&redo, xp, 320 entry.list.rib); 321 else 322 LIST_INSERT_AFTER(tailp, xp, 323 entry.list.rib); 324 tailp = xp; 325 } else { 326 /* 327 * lock insertion point and 328 * continue on with scan 329 */ 330 selected = 1; 331 continue; 332 } 333 } else { 334 /* 335 * p is less preferred, remember insertion point 336 * If p got selected during a testall traverse 337 * do not alter the insertion point unless this 338 * happened on an actual MED check. 339 */ 340 if (testall == 2) 341 selected = 0; 342 if (!selected) 343 insertp = xp; 344 } 345 } 346 347 if (insertp == NULL) 348 LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib); 349 else 350 LIST_INSERT_AFTER(insertp, new, entry.list.rib); 351 352 /* Fixup MED order again. All elements are < new */ 353 while (!LIST_EMPTY(&redo)) { 354 xp = LIST_FIRST(&redo); 355 LIST_REMOVE(xp, entry.list.rib); 356 357 prefix_insert(xp, new, re); 358 } 359 } 360 361 /* 362 * Remove a prefix from the RIB list ensuring that the total order of the 363 * list remains intact. All routes that differ in the MED are taken of the 364 * list and put on the redo list. To figure out if a route could cause a 365 * resort because of a MED check the next prefix of the to-remove prefix 366 * is compared with the old prefix. A full scan is only done if the next 367 * route differs because of the MED or later checks. 368 * Again at the end all routes on the redo queue are reinserted. 369 */ 370 void 371 prefix_remove(struct prefix *old, struct rib_entry *re) 372 { 373 struct prefix_list redo = LIST_HEAD_INITIALIZER(redo); 374 struct prefix *xp, *np, *tailp = NULL; 375 int testall; 376 377 xp = LIST_NEXT(old, entry.list.rib); 378 LIST_REMOVE(old, entry.list.rib); 379 /* check if a MED inversion could be possible */ 380 prefix_cmp(old, xp, &testall); 381 if (testall > 0) { 382 /* maybe MED route, scan tail for other possible routes */ 383 for (; xp != NULL; xp = np) { 384 np = LIST_NEXT(xp, entry.list.rib); 385 386 /* only interested in the testall result */ 387 prefix_cmp(old, xp, &testall); 388 if (testall == 0) 389 break; /* we're done */ 390 else if (testall == 2) { 391 /* 392 * possible MED inversion, take out prefix and 393 * put it onto redo queue. 394 */ 395 LIST_REMOVE(xp, entry.list.rib); 396 if (tailp == NULL) 397 LIST_INSERT_HEAD(&redo, xp, 398 entry.list.rib); 399 else 400 LIST_INSERT_AFTER(tailp, xp, 401 entry.list.rib); 402 tailp = xp; 403 } 404 } 405 } 406 407 /* Fixup MED order again, reinsert prefixes from the start */ 408 while (!LIST_EMPTY(&redo)) { 409 xp = LIST_FIRST(&redo); 410 LIST_REMOVE(xp, entry.list.rib); 411 412 prefix_insert(xp, NULL, re); 413 } 414 } 415 416 /* helper function to check if a prefix is valid to be selected */ 417 int 418 prefix_eligible(struct prefix *p) 419 { 420 struct rde_aspath *asp = prefix_aspath(p); 421 struct nexthop *nh = prefix_nexthop(p); 422 423 /* The aspath needs to be loop and error free */ 424 if (asp == NULL || asp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR)) 425 return 0; 426 /* 427 * If the nexthop exists it must be reachable. 428 * It is OK if the nexthop does not exist (local announcement). 429 */ 430 if (nh != NULL && nh->state != NEXTHOP_REACH) 431 return 0; 432 433 return 1; 434 } 435 436 /* 437 * Find the correct place to insert the prefix in the prefix list. 438 * If the active prefix has changed we need to send an update also special 439 * treatment is needed if 'rde evaluate all' is used on some peers. 440 * To re-evaluate a prefix just call prefix_evaluate with old and new pointing 441 * to the same prefix. 442 */ 443 void 444 prefix_evaluate(struct rib_entry *re, struct prefix *new, struct prefix *old) 445 { 446 struct prefix *xp; 447 448 if (re_rib(re)->flags & F_RIB_NOEVALUATE) { 449 /* decision process is turned off */ 450 if (old != NULL) 451 LIST_REMOVE(old, entry.list.rib); 452 if (new != NULL) 453 LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib); 454 if (re->active) { 455 /* 456 * During reloads it is possible that the decision 457 * process is turned off but prefixes are still 458 * active. Clean up now to ensure that the RIB 459 * is consistant. 460 */ 461 rde_generate_updates(re_rib(re), NULL, re->active, 0); 462 re->active = NULL; 463 } 464 return; 465 } 466 467 if (old != NULL) 468 prefix_remove(old, re); 469 470 if (new != NULL) 471 prefix_insert(new, NULL, re); 472 473 xp = LIST_FIRST(&re->prefix_h); 474 if (xp != NULL && !prefix_eligible(xp)) 475 xp = NULL; 476 477 /* 478 * If the active prefix changed or the active prefix was removed 479 * and added again then generate an update. 480 */ 481 if (re->active != xp || (old != NULL && xp == old)) { 482 /* 483 * Send update withdrawing re->active and adding xp 484 * but remember that xp may be NULL aka ineligible. 485 * Additional decision may be made by the called functions. 486 */ 487 rde_generate_updates(re_rib(re), xp, re->active, 0); 488 re->active = xp; 489 return; 490 } 491 492 /* 493 * If there are peers with 'rde evaluate all' every update needs 494 * to be passed on (not only a change of the best prefix). 495 * rde_generate_updates() will then take care of distribution. 496 */ 497 if (rde_evaluate_all()) 498 if ((new != NULL && prefix_eligible(new)) || old != NULL) 499 rde_generate_updates(re_rib(re), re->active, NULL, 1); 500 } 501