1 /* $OpenBSD: rde_decide.c,v 1.78 2019/08/09 13:44:27 claudio Exp $ */ 2 3 /* 4 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org> 5 * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20 #include <sys/types.h> 21 #include <sys/queue.h> 22 23 #include <string.h> 24 25 #include "bgpd.h" 26 #include "rde.h" 27 #include "log.h" 28 29 int prefix_cmp(struct prefix *, struct prefix *); 30 /* 31 * Decision Engine RFC implementation: 32 * Phase 1: 33 * - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes 34 * - IBGP routes may either use LOCAL_PREF or the local system computes 35 * the degree of preference 36 * - If the route is ineligible, the route MAY NOT serve as an input to 37 * the next phase of route selection 38 * - if the route is eligible the computed value MUST be used as the 39 * LOCAL_PREF value in any IBGP readvertisement 40 * 41 * Phase 2: 42 * - If the NEXT_HOP attribute of a BGP route depicts an address that is 43 * not resolvable the BGP route MUST be excluded from the Phase 2 decision 44 * function. 45 * - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP 46 * route should be excluded from the Phase 2 decision function. 47 * - The local BGP speaker identifies the route that has: 48 * a) the highest degree of preference of any route to the same set 49 * of destinations 50 * b) is the only route to that destination 51 * c) is selected as a result of the Phase 2 tie breaking rules 52 * - The local speaker MUST determine the immediate next-hop address from 53 * the NEXT_HOP attribute of the selected route. 54 * - If either the immediate next hop or the IGP cost to the NEXT_HOP changes, 55 * Phase 2 Route Selection MUST be performed again. 56 * 57 * Route Resolvability Condition 58 * - A route Rte1, referencing only the intermediate network address, is 59 * considered resolvable if the Routing Table contains at least one 60 * resolvable route Rte2 that matches Rte1's intermediate network address 61 * and is not recursively resolved through Rte1. 62 * - Routes referencing interfaces are considered resolvable if the state of 63 * the referenced interface is up and IP processing is enabled. 64 * 65 * Breaking Ties (Phase 2) 66 * 1. Remove from consideration all routes which are not tied for having the 67 * smallest number of AS numbers present in their AS_PATH attributes. 68 * Note, that when counting this number, an AS_SET counts as 1 69 * 2. Remove from consideration all routes which are not tied for having the 70 * lowest Origin number in their Origin attribute. 71 * 3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC 72 * attributes. MULTI_EXIT_DISC is only comparable between routes learned 73 * from the same neighboring AS. 74 * 4. If at least one of the candidate routes was received via EBGP, 75 * remove from consideration all routes which were received via IBGP. 76 * 5. Remove from consideration any routes with less-preferred interior cost. 77 * If the NEXT_HOP hop for a route is reachable, but no cost can be 78 * determined, then this step should be skipped. 79 * 6. Remove from consideration all routes other than the route that was 80 * advertised by the BGP speaker whose BGP Identifier has the lowest value. 81 * 7. Prefer the route received from the lowest peer address. 82 * 83 * Phase 3: Route Dissemination 84 * - All routes in the Loc-RIB are processed into Adj-RIBs-Out according 85 * to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out 86 * unless the destination and NEXT_HOP described by this route may be 87 * forwarded appropriately by the Routing Table. 88 */ 89 90 /* 91 * Decision Engine OUR implementation: 92 * Our implementation has only one RIB. The filtering is done first. The 93 * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1). 94 * Ineligible routes are flagged as ineligible via nexthop_add(). 95 * Phase 3 is done together with Phase 2. 96 * In following cases a prefix needs to be reevaluated: 97 * - update of a prefix (prefix_update) 98 * - withdraw of a prefix (prefix_withdraw) 99 * - state change of the nexthop (nexthop-{in}validate) 100 * - state change of session (session down) 101 */ 102 103 /* 104 * Compare two prefixes with equal pt_entry. Returns an integer greater than or 105 * less than 0, according to whether the prefix p1 is more or less preferred 106 * than the prefix p2. p1 should be used for the new prefix and p2 for a 107 * already added prefix. 108 */ 109 int 110 prefix_cmp(struct prefix *p1, struct prefix *p2) 111 { 112 struct rde_aspath *asp1, *asp2; 113 struct rde_peer *peer1, *peer2; 114 struct attr *a; 115 u_int32_t p1id, p2id; 116 int p1cnt, p2cnt; 117 118 if (p1 == NULL) 119 return (-1); 120 if (p2 == NULL) 121 return (1); 122 123 asp1 = prefix_aspath(p1); 124 asp2 = prefix_aspath(p2); 125 peer1 = prefix_peer(p1); 126 peer2 = prefix_peer(p2); 127 128 /* pathes with errors are not eligible */ 129 if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR) 130 return (-1); 131 if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR) 132 return (1); 133 134 /* only loop free pathes are eligible */ 135 if (asp1->flags & F_ATTR_LOOP) 136 return (-1); 137 if (asp2->flags & F_ATTR_LOOP) 138 return (1); 139 140 /* 141 * 1. check if prefix is eligible a.k.a reachable 142 * A NULL nexthop is eligible since it is used for locally 143 * announced networks. 144 */ 145 if (prefix_nexthop(p2) != NULL && 146 prefix_nexthop(p2)->state != NEXTHOP_REACH) 147 return (1); 148 if (prefix_nexthop(p1) != NULL && 149 prefix_nexthop(p1)->state != NEXTHOP_REACH) 150 return (-1); 151 152 /* 2. local preference of prefix, bigger is better */ 153 if ((asp1->lpref - asp2->lpref) != 0) 154 return (asp1->lpref - asp2->lpref); 155 156 /* 3. aspath count, the shorter the better */ 157 if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0) 158 return (asp2->aspath->ascnt - asp1->aspath->ascnt); 159 160 /* 4. origin, the lower the better */ 161 if ((asp2->origin - asp1->origin) != 0) 162 return (asp2->origin - asp1->origin); 163 164 /* 5. MED decision, only comparable between the same neighboring AS */ 165 if (rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS || 166 aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) 167 /* lowest value wins */ 168 if ((asp2->med - asp1->med) != 0) 169 return (asp2->med - asp1->med); 170 171 /* 172 * 6. EBGP is cooler than IBGP 173 * It is absolutely important that the ebgp value in peer_config.ebgp 174 * is bigger than all other ones (IBGP, confederations) 175 */ 176 if (peer1->conf.ebgp != peer2->conf.ebgp) { 177 if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */ 178 return 1; 179 else if (peer2->conf.ebgp) /* peer2 is EBGP */ 180 return -1; 181 } 182 183 /* 184 * 7. local tie-breaker, this weight is here to tip equal long AS 185 * paths in one or the other direction. It happens more and more 186 * that AS paths are equally long and so traffic engineering needs 187 * a metric that weights a prefix at a very late stage in the 188 * decision process. 189 */ 190 if ((asp1->weight - asp2->weight) != 0) 191 return (asp1->weight - asp2->weight); 192 193 /* 8. nexthop costs. NOT YET -> IGNORE */ 194 195 /* 196 * 9. older route (more stable) wins but only if route-age 197 * evaluation is enabled. 198 */ 199 if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) 200 if ((p2->lastchange - p1->lastchange) != 0) 201 return (p2->lastchange - p1->lastchange); 202 203 /* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */ 204 if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) { 205 memcpy(&p1id, a->data, sizeof(p1id)); 206 p1id = ntohl(p1id); 207 } else 208 p1id = peer1->remote_bgpid; 209 if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) { 210 memcpy(&p2id, a->data, sizeof(p2id)); 211 p2id = ntohl(p2id); 212 } else 213 p2id = peer2->remote_bgpid; 214 if ((p2id - p1id) != 0) 215 return (p2id - p1id); 216 217 /* 11. compare CLUSTER_LIST length, shorter is better */ 218 p1cnt = p2cnt = 0; 219 if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL) 220 p1cnt = a->len / sizeof(u_int32_t); 221 if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL) 222 p2cnt = a->len / sizeof(u_int32_t); 223 if ((p2cnt - p1cnt) != 0) 224 return (p2cnt - p1cnt); 225 226 /* 12. lowest peer address wins (IPv4 is better than IPv6) */ 227 if (memcmp(&peer1->remote_addr, &peer2->remote_addr, 228 sizeof(peer1->remote_addr)) != 0) 229 return (-memcmp(&peer1->remote_addr, &peer2->remote_addr, 230 sizeof(peer1->remote_addr))); 231 232 fatalx("Uh, oh a politician in the decision process"); 233 } 234 235 /* 236 * Find the correct place to insert the prefix in the prefix list. 237 * If the active prefix has changed we need to send an update. 238 * The to evaluate prefix must not be in the prefix list. 239 */ 240 void 241 prefix_evaluate(struct prefix *p, struct rib_entry *re) 242 { 243 struct prefix *xp; 244 245 if (re_rib(re)->flags & F_RIB_NOEVALUATE) { 246 /* decision process is turned off */ 247 if (p != NULL) 248 LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib); 249 if (re->active) { 250 /* 251 * During reloads it is possible that the decision 252 * process is turned off but prefixes are still 253 * active. Clean up now to ensure that the RIB 254 * is consistant. 255 */ 256 rde_generate_updates(re_rib(re), NULL, re->active); 257 re->active = NULL; 258 } 259 return; 260 } 261 262 if (p != NULL) { 263 if (LIST_EMPTY(&re->prefix_h)) 264 LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib); 265 else { 266 LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) { 267 if (prefix_cmp(p, xp) > 0) { 268 LIST_INSERT_BEFORE(xp, p, 269 entry.list.rib); 270 break; 271 } else if (LIST_NEXT(xp, entry.list.rib) == 272 NULL) { 273 /* if xp last element ... */ 274 LIST_INSERT_AFTER(xp, p, 275 entry.list.rib); 276 break; 277 } 278 } 279 } 280 } 281 282 xp = LIST_FIRST(&re->prefix_h); 283 if (xp != NULL) { 284 struct rde_aspath *xasp = prefix_aspath(xp); 285 if (xasp == NULL || 286 xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) || 287 (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state != 288 NEXTHOP_REACH)) 289 /* xp is ineligible */ 290 xp = NULL; 291 } 292 293 if (re->active != xp) { 294 /* need to generate an update */ 295 296 /* 297 * Send update with remove for re->active and add for xp 298 * but remember that xp may be NULL aka ineligible. 299 * Additional decision may be made by the called functions. 300 */ 301 rde_generate_updates(re_rib(re), xp, re->active); 302 if ((re_rib(re)->flags & F_RIB_NOFIB) == 0) 303 rde_send_kroute(re_rib(re), xp, re->active); 304 305 re->active = xp; 306 } 307 } 308