xref: /openbsd/usr.sbin/bgpd/rde_decide.c (revision 771fbea0)
1 /*	$OpenBSD: rde_decide.c,v 1.85 2021/05/04 09:21:05 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
5  * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/types.h>
21 #include <sys/queue.h>
22 
23 #include <string.h>
24 
25 #include "bgpd.h"
26 #include "rde.h"
27 #include "log.h"
28 
29 int	prefix_cmp(struct prefix *, struct prefix *, int *);
30 void	prefix_insert(struct prefix *, struct prefix *, struct rib_entry *);
31 void	prefix_remove(struct prefix *, struct rib_entry *);
32 /*
33  * Decision Engine RFC implementation:
34  *  Phase 1:
35  *   - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes
36  *   - IBGP routes may either use LOCAL_PREF or the local system computes
37  *     the degree of preference
38  *   - If the route is ineligible, the route MAY NOT serve as an input to
39  *     the next phase of route selection
40  *   - if the route is eligible the computed value MUST be used as the
41  *     LOCAL_PREF value in any IBGP readvertisement
42  *
43  *  Phase 2:
44  *   - If the NEXT_HOP attribute of a BGP route depicts an address that is
45  *     not resolvable the BGP route MUST be excluded from the Phase 2 decision
46  *     function.
47  *   - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP
48  *     route should be excluded from the Phase 2 decision function.
49  *   - The local BGP speaker identifies the route that has:
50  *     a) the highest degree of preference of any route to the same set
51  *        of destinations
52  *     b) is the only route to that destination
53  *     c) is selected as a result of the Phase 2 tie breaking rules
54  *   - The local speaker MUST determine the immediate next-hop address from
55  *     the NEXT_HOP attribute of the selected route.
56  *   - If either the immediate next hop or the IGP cost to the NEXT_HOP changes,
57  *     Phase 2 Route Selection MUST be performed again.
58  *
59  *  Route Resolvability Condition
60  *   - A route Rte1, referencing only the intermediate network address, is
61  *     considered resolvable if the Routing Table contains at least one
62  *     resolvable route Rte2 that matches Rte1's intermediate network address
63  *     and is not recursively resolved through Rte1.
64  *   - Routes referencing interfaces are considered resolvable if the state of
65  *     the referenced interface is up and IP processing is enabled.
66  *
67  *  Breaking Ties (Phase 2)
68  *   1. Remove from consideration all routes which are not tied for having the
69  *      smallest number of AS numbers present in their AS_PATH attributes.
70  *      Note, that when counting this number, an AS_SET counts as 1
71  *   2. Remove from consideration all routes which are not tied for having the
72  *      lowest Origin number in their Origin attribute.
73  *   3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC
74  *      attributes. MULTI_EXIT_DISC is only comparable between routes learned
75  *      from the same neighboring AS.
76  *   4. If at least one of the candidate routes was received via EBGP,
77  *      remove from consideration all routes which were received via IBGP.
78  *   5. Remove from consideration any routes with less-preferred interior cost.
79  *      If the NEXT_HOP hop for a route is reachable, but no cost can be
80  *      determined, then this step should be skipped.
81  *   6. Remove from consideration all routes other than the route that was
82  *      advertised by the BGP speaker whose BGP Identifier has the lowest value.
83  *   7. Prefer the route received from the lowest peer address.
84  *
85  * Phase 3: Route Dissemination
86  *   - All routes in the Loc-RIB are processed into Adj-RIBs-Out according
87  *     to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out
88  *     unless the destination and NEXT_HOP described by this route may be
89  *     forwarded appropriately by the Routing Table.
90  */
91 
92 /*
93  * Decision Engine OUR implementation:
94  * The filtering is done first. The filtering calculates the preference and
95  * stores it in LOCAL_PREF (Phase 1).
96  * Ineligible routes are flagged as ineligible via nexthop_add().
97  * Phase 3 is done together with Phase 2.
98  * In following cases a prefix needs to be reevaluated:
99  *  - update of a prefix (prefix_update)
100  *  - withdraw of a prefix (prefix_withdraw)
101  *  - state change of the nexthop (nexthop-{in}validate)
102  *  - state change of session (session down)
103  */
104 
105 /*
106  * Compare two prefixes with equal pt_entry. Returns an integer greater than or
107  * less than 0, according to whether the prefix p1 is more or less preferred
108  * than the prefix p2. p1 should be used for the new prefix and p2 for a
109  * already added prefix.
110  */
111 int
112 prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall)
113 {
114 	struct rde_aspath	*asp1, *asp2;
115 	struct rde_peer		*peer1, *peer2;
116 	struct attr		*a;
117 	u_int32_t		 p1id, p2id;
118 	int			 p1cnt, p2cnt, i;
119 
120 	/*
121 	 * If a match happens before the MED check then the list is
122 	 * correctly sorted. If a match happens after MED then further
123 	 * elements may need to be checked to ensure that all paths
124 	 * which could affect this path were considered. This only
125 	 * matters for strict MED evaluation and in that case testall
126 	 * is set to 1. If the check happens to be on the MED check
127 	 * itself testall is set to 2.
128 	 */
129 	*testall = 0;
130 
131 	if (p1 == NULL)
132 		return -1;
133 	if (p2 == NULL)
134 		return 1;
135 
136 	asp1 = prefix_aspath(p1);
137 	asp2 = prefix_aspath(p2);
138 	peer1 = prefix_peer(p1);
139 	peer2 = prefix_peer(p2);
140 
141 	/* pathes with errors are not eligible */
142 	if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR)
143 		return -1;
144 	if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR)
145 		return 1;
146 
147 	/* only loop free pathes are eligible */
148 	if (asp1->flags & F_ATTR_LOOP)
149 		return -1;
150 	if (asp2->flags & F_ATTR_LOOP)
151 		return 1;
152 
153 	/*
154 	 * 1. check if prefix is eligible a.k.a reachable
155 	 *    A NULL nexthop is eligible since it is used for locally
156 	 *    announced networks.
157 	 */
158 	if (prefix_nexthop(p2) != NULL &&
159 	    prefix_nexthop(p2)->state != NEXTHOP_REACH)
160 		return 1;
161 	if (prefix_nexthop(p1) != NULL &&
162 	    prefix_nexthop(p1)->state != NEXTHOP_REACH)
163 		return -1;
164 
165 	/* 2. local preference of prefix, bigger is better */
166 	if (asp1->lpref > asp2->lpref)
167 		return 1;
168 	if (asp1->lpref < asp2->lpref)
169 		return -1;
170 
171 	/* 3. aspath count, the shorter the better */
172 	if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0)
173 		return (asp2->aspath->ascnt - asp1->aspath->ascnt);
174 
175 	/* 4. origin, the lower the better */
176 	if ((asp2->origin - asp1->origin) != 0)
177 		return (asp2->origin - asp1->origin);
178 
179 	/*
180 	 * 5. MED decision
181 	 * Only comparable between the same neighboring AS or if
182 	 * 'rde med compare always' is set. In the first case
183 	 * set the testall flag since further elements need to be
184 	 * evaluated as well.
185 	 */
186 	if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) ||
187 	    aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {
188 		if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
189 			*testall = 2;
190 		/* lowest value wins */
191 		if (asp1->med < asp2->med)
192 			return 1;
193 		if (asp1->med > asp2->med)
194 			return -1;
195 	}
196 
197 	if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
198 		*testall = 1;
199 
200 	/*
201 	 * 6. EBGP is cooler than IBGP
202 	 * It is absolutely important that the ebgp value in peer_config.ebgp
203 	 * is bigger than all other ones (IBGP, confederations)
204 	 */
205 	if (peer1->conf.ebgp != peer2->conf.ebgp) {
206 		if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */
207 			return 1;
208 		else if (peer2->conf.ebgp) /* peer2 is EBGP */
209 			return -1;
210 	}
211 
212 	/*
213 	 * 7. local tie-breaker, this weight is here to tip equal long AS
214 	 * paths in one or the other direction. It happens more and more
215 	 * that AS paths are equally long and so traffic engineering needs
216 	 * a metric that weights a prefix at a very late stage in the
217 	 * decision process.
218 	 */
219 	if (asp1->weight > asp2->weight)
220 		return 1;
221 	if (asp1->weight < asp2->weight)
222 		return -1;
223 
224 	/* 8. nexthop costs. NOT YET -> IGNORE */
225 
226 	/*
227 	 * 9. older route (more stable) wins but only if route-age
228 	 * evaluation is enabled.
229 	 */
230 	if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) {
231 		if (p1->lastchange < p2->lastchange) /* p1 is older */
232 			return 1;
233 		if (p1->lastchange > p2->lastchange)
234 			return -1;
235 	}
236 
237 	/* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */
238 	if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) {
239 		memcpy(&p1id, a->data, sizeof(p1id));
240 		p1id = ntohl(p1id);
241 	} else
242 		p1id = peer1->remote_bgpid;
243 	if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) {
244 		memcpy(&p2id, a->data, sizeof(p2id));
245 		p2id = ntohl(p2id);
246 	} else
247 		p2id = peer2->remote_bgpid;
248 	if (p1id < p2id)
249 		return 1;
250 	if (p1id > p2id)
251 		return -1;
252 
253 	/* 11. compare CLUSTER_LIST length, shorter is better */
254 	p1cnt = p2cnt = 0;
255 	if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL)
256 		p1cnt = a->len / sizeof(u_int32_t);
257 	if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL)
258 		p2cnt = a->len / sizeof(u_int32_t);
259 	if ((p2cnt - p1cnt) != 0)
260 		return (p2cnt - p1cnt);
261 
262 	/* 12. lowest peer address wins (IPv4 is better than IPv6) */
263 	if (peer1->remote_addr.aid < peer2->remote_addr.aid)
264 		return 1;
265 	if (peer1->remote_addr.aid > peer2->remote_addr.aid)
266 		return -1;
267 	switch (peer1->remote_addr.aid) {
268 	case AID_INET:
269 		i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4,
270 		    sizeof(struct in_addr));
271 		break;
272 	case AID_INET6:
273 		i = memcmp(&peer1->remote_addr.v6, &peer2->remote_addr.v6,
274 		    sizeof(struct in6_addr));
275 		break;
276 	default:
277 		fatalx("%s: unknown af", __func__);
278 	}
279 	if (i < 0)
280 		return 1;
281 	if (i > 0)
282 		return -1;
283 
284 	fatalx("Uh, oh a politician in the decision process");
285 }
286 
287 /*
288  * Insert a prefix keeping the total order of the list. For routes
289  * that may depend on a MED selection the set is scanned until the
290  * condition is cleared. If a MED inversion is detected the respective
291  * prefix is taken of the rib list and put onto a redo queue. All
292  * prefixes on the redo queue are re-inserted at the end.
293  */
294 void
295 prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
296 {
297 	struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
298 	struct prefix *xp, *np, *tailp = NULL, *insertp = ep;
299 	int testall, selected = 0;
300 
301 	/* start scan at the entry point (ep) or if the head if ep == NULL */
302 	if (ep == NULL)
303 		ep = LIST_FIRST(&re->prefix_h);
304 
305 	for (xp = ep; xp != NULL; xp = np) {
306 		np = LIST_NEXT(xp, entry.list.rib);
307 
308 		if (prefix_cmp(new, xp, &testall) > 0) {
309 			/* new is preferred over xp */
310 			if (testall == 0)
311 				break;		/* we're done */
312 			else if (testall == 2) {
313 				/*
314 				 * MED inversion, take out prefix and
315 				 * put it onto redo queue.
316 				 */
317 				LIST_REMOVE(xp, entry.list.rib);
318 				if (tailp == NULL)
319 					LIST_INSERT_HEAD(&redo, xp,
320 					    entry.list.rib);
321 				else
322 					LIST_INSERT_AFTER(tailp, xp,
323 					    entry.list.rib);
324 				tailp = xp;
325 			} else {
326 				/*
327 				 * lock insertion point and
328 				 * continue on with scan
329 				 */
330 				selected = 1;
331 				continue;
332 			}
333 		} else {
334 			/*
335 			 * p is less preferred, remember insertion point
336 			 * If p got selected during a testall traverse
337 			 * do not alter the insertion point unless this
338 			 * happened on an actual MED check.
339 			 */
340 			if (testall == 2)
341 				selected = 0;
342 			if (!selected)
343 				insertp = xp;
344 		}
345 	}
346 
347 	if (insertp == NULL)
348 		LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
349 	else
350 		LIST_INSERT_AFTER(insertp, new, entry.list.rib);
351 
352 	/* Fixup MED order again. All elements are < new */
353 	while (!LIST_EMPTY(&redo)) {
354 		xp = LIST_FIRST(&redo);
355 		LIST_REMOVE(xp, entry.list.rib);
356 
357 		prefix_insert(xp, new, re);
358 	}
359 }
360 
361 /*
362  * Remove a prefix from the RIB list ensuring that the total order of the
363  * list remains intact. All routes that differ in the MED are taken of the
364  * list and put on the redo list. To figure out if a route could cause a
365  * resort because of a MED check the next prefix of the to-remove prefix
366  * is compared with the old prefix. A full scan is only done if the next
367  * route differs because of the MED or later checks.
368  * Again at the end all routes on the redo queue are reinserted.
369  */
370 void
371 prefix_remove(struct prefix *old, struct rib_entry *re)
372 {
373 	struct prefix_list redo = LIST_HEAD_INITIALIZER(redo);
374 	struct prefix *xp, *np, *tailp = NULL;
375 	int testall;
376 
377 	xp = LIST_NEXT(old, entry.list.rib);
378 	LIST_REMOVE(old, entry.list.rib);
379 	/* check if a MED inversion could be possible */
380 	prefix_cmp(old, xp, &testall);
381 	if (testall > 0) {
382 		/* maybe MED route, scan tail for other possible routes */
383 		for (; xp != NULL; xp = np) {
384 			np = LIST_NEXT(xp, entry.list.rib);
385 
386 			/* only interested in the testall result */
387 			prefix_cmp(old, xp, &testall);
388 			if (testall == 0)
389 				break;		/* we're done */
390 			else if (testall == 2) {
391 				/*
392 				 * possible MED inversion, take out prefix and
393 				 * put it onto redo queue.
394 				 */
395 				LIST_REMOVE(xp, entry.list.rib);
396 				if (tailp == NULL)
397 					LIST_INSERT_HEAD(&redo, xp,
398 					    entry.list.rib);
399 				else
400 					LIST_INSERT_AFTER(tailp, xp,
401 					    entry.list.rib);
402 				tailp = xp;
403 			}
404 		}
405 	}
406 
407 	/* Fixup MED order again, reinsert prefixes from the start */
408 	while (!LIST_EMPTY(&redo)) {
409 		xp = LIST_FIRST(&redo);
410 		LIST_REMOVE(xp, entry.list.rib);
411 
412 		prefix_insert(xp, NULL, re);
413 	}
414 }
415 
416 /* helper function to check if a prefix is valid to be selected */
417 int
418 prefix_eligible(struct prefix *p)
419 {
420 	struct rde_aspath *asp = prefix_aspath(p);
421 	struct nexthop *nh = prefix_nexthop(p);
422 
423 	/* The aspath needs to be loop and error free */
424 	if (asp == NULL || asp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR))
425 		return 0;
426 	/*
427 	 * If the nexthop exists it must be reachable.
428 	 * It is OK if the nexthop does not exist (local announcement).
429 	 */
430 	if (nh != NULL && nh->state != NEXTHOP_REACH)
431 		return 0;
432 
433 	return 1;
434 }
435 
436 /*
437  * Find the correct place to insert the prefix in the prefix list.
438  * If the active prefix has changed we need to send an update also special
439  * treatment is needed if 'rde evaluate all' is used on some peers.
440  * To re-evaluate a prefix just call prefix_evaluate with old and new pointing
441  * to the same prefix.
442  */
443 void
444 prefix_evaluate(struct rib_entry *re, struct prefix *new, struct prefix *old)
445 {
446 	struct prefix	*xp;
447 
448 	if (re_rib(re)->flags & F_RIB_NOEVALUATE) {
449 		/* decision process is turned off */
450 		if (old != NULL)
451 			LIST_REMOVE(old, entry.list.rib);
452 		if (new != NULL)
453 			LIST_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
454 		if (re->active) {
455 			/*
456 			 * During reloads it is possible that the decision
457 			 * process is turned off but prefixes are still
458 			 * active. Clean up now to ensure that the RIB
459 			 * is consistant.
460 			 */
461 			rde_generate_updates(re_rib(re), NULL, re->active, 0);
462 			re->active = NULL;
463 		}
464 		return;
465 	}
466 
467 	if (old != NULL)
468 		prefix_remove(old, re);
469 
470 	if (new != NULL)
471 		prefix_insert(new, NULL, re);
472 
473 	xp = LIST_FIRST(&re->prefix_h);
474 	if (xp != NULL && !prefix_eligible(xp))
475 		xp = NULL;
476 
477 	/*
478 	 * If the active prefix changed or the active prefix was removed
479 	 * and added again then generate an update.
480 	 */
481 	if (re->active != xp || (old != NULL && xp == old)) {
482 		/*
483 		 * Send update withdrawing re->active and adding xp
484 		 * but remember that xp may be NULL aka ineligible.
485 		 * Additional decision may be made by the called functions.
486 		 */
487 		rde_generate_updates(re_rib(re), xp, re->active, 0);
488 		re->active = xp;
489 		return;
490 	}
491 
492 	/*
493 	 * If there are peers with 'rde evaluate all' every update needs
494 	 * to be passed on (not only a change of the best prefix).
495 	 * rde_generate_updates() will then take care of distribution.
496 	 */
497 	if (rde_evaluate_all())
498 		if ((new != NULL && prefix_eligible(new)) || old != NULL)
499 			rde_generate_updates(re_rib(re), re->active, NULL, 1);
500 }
501