xref: /openbsd/usr.sbin/bgpd/rde_decide.c (revision 274d7c50)
1 /*	$OpenBSD: rde_decide.c,v 1.78 2019/08/09 13:44:27 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
5  * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/types.h>
21 #include <sys/queue.h>
22 
23 #include <string.h>
24 
25 #include "bgpd.h"
26 #include "rde.h"
27 #include "log.h"
28 
29 int	prefix_cmp(struct prefix *, struct prefix *);
30 /*
31  * Decision Engine RFC implementation:
32  *  Phase 1:
33  *   - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes
34  *   - IBGP routes may either use LOCAL_PREF or the local system computes
35  *     the degree of preference
36  *   - If the route is ineligible, the route MAY NOT serve as an input to
37  *     the next phase of route selection
38  *   - if the route is eligible the computed value MUST be used as the
39  *     LOCAL_PREF value in any IBGP readvertisement
40  *
41  *  Phase 2:
42  *   - If the NEXT_HOP attribute of a BGP route depicts an address that is
43  *     not resolvable the BGP route MUST be excluded from the Phase 2 decision
44  *     function.
45  *   - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP
46  *     route should be excluded from the Phase 2 decision function.
47  *   - The local BGP speaker identifies the route that has:
48  *     a) the highest degree of preference of any route to the same set
49  *        of destinations
50  *     b) is the only route to that destination
51  *     c) is selected as a result of the Phase 2 tie breaking rules
52  *   - The local speaker MUST determine the immediate next-hop address from
53  *     the NEXT_HOP attribute of the selected route.
54  *   - If either the immediate next hop or the IGP cost to the NEXT_HOP changes,
55  *     Phase 2 Route Selection MUST be performed again.
56  *
57  *  Route Resolvability Condition
58  *   - A route Rte1, referencing only the intermediate network address, is
59  *     considered resolvable if the Routing Table contains at least one
60  *     resolvable route Rte2 that matches Rte1's intermediate network address
61  *     and is not recursively resolved through Rte1.
62  *   - Routes referencing interfaces are considered resolvable if the state of
63  *     the referenced interface is up and IP processing is enabled.
64  *
65  *  Breaking Ties (Phase 2)
66  *   1. Remove from consideration all routes which are not tied for having the
67  *      smallest number of AS numbers present in their AS_PATH attributes.
68  *      Note, that when counting this number, an AS_SET counts as 1
69  *   2. Remove from consideration all routes which are not tied for having the
70  *      lowest Origin number in their Origin attribute.
71  *   3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC
72  *      attributes. MULTI_EXIT_DISC is only comparable between routes learned
73  *      from the same neighboring AS.
74  *   4. If at least one of the candidate routes was received via EBGP,
75  *      remove from consideration all routes which were received via IBGP.
76  *   5. Remove from consideration any routes with less-preferred interior cost.
77  *      If the NEXT_HOP hop for a route is reachable, but no cost can be
78  *      determined, then this step should be skipped.
79  *   6. Remove from consideration all routes other than the route that was
80  *      advertised by the BGP speaker whose BGP Identifier has the lowest value.
81  *   7. Prefer the route received from the lowest peer address.
82  *
83  * Phase 3: Route Dissemination
84  *   - All routes in the Loc-RIB are processed into Adj-RIBs-Out according
85  *     to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out
86  *     unless the destination and NEXT_HOP described by this route may be
87  *     forwarded appropriately by the Routing Table.
88  */
89 
90 /*
91  * Decision Engine OUR implementation:
92  * Our implementation has only one RIB. The filtering is done first. The
93  * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1).
94  * Ineligible routes are flagged as ineligible via nexthop_add().
95  * Phase 3 is done together with Phase 2.
96  * In following cases a prefix needs to be reevaluated:
97  *  - update of a prefix (prefix_update)
98  *  - withdraw of a prefix (prefix_withdraw)
99  *  - state change of the nexthop (nexthop-{in}validate)
100  *  - state change of session (session down)
101  */
102 
103 /*
104  * Compare two prefixes with equal pt_entry. Returns an integer greater than or
105  * less than 0, according to whether the prefix p1 is more or less preferred
106  * than the prefix p2. p1 should be used for the new prefix and p2 for a
107  * already added prefix.
108  */
109 int
110 prefix_cmp(struct prefix *p1, struct prefix *p2)
111 {
112 	struct rde_aspath	*asp1, *asp2;
113 	struct rde_peer		*peer1, *peer2;
114 	struct attr		*a;
115 	u_int32_t		 p1id, p2id;
116 	int			 p1cnt, p2cnt;
117 
118 	if (p1 == NULL)
119 		return (-1);
120 	if (p2 == NULL)
121 		return (1);
122 
123 	asp1 = prefix_aspath(p1);
124 	asp2 = prefix_aspath(p2);
125 	peer1 = prefix_peer(p1);
126 	peer2 = prefix_peer(p2);
127 
128 	/* pathes with errors are not eligible */
129 	if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR)
130 		return (-1);
131 	if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR)
132 		return (1);
133 
134 	/* only loop free pathes are eligible */
135 	if (asp1->flags & F_ATTR_LOOP)
136 		return (-1);
137 	if (asp2->flags & F_ATTR_LOOP)
138 		return (1);
139 
140 	/*
141 	 * 1. check if prefix is eligible a.k.a reachable
142 	 *    A NULL nexthop is eligible since it is used for locally
143 	 *    announced networks.
144 	 */
145 	if (prefix_nexthop(p2) != NULL &&
146 	    prefix_nexthop(p2)->state != NEXTHOP_REACH)
147 		return (1);
148 	if (prefix_nexthop(p1) != NULL &&
149 	    prefix_nexthop(p1)->state != NEXTHOP_REACH)
150 		return (-1);
151 
152 	/* 2. local preference of prefix, bigger is better */
153 	if ((asp1->lpref - asp2->lpref) != 0)
154 		return (asp1->lpref - asp2->lpref);
155 
156 	/* 3. aspath count, the shorter the better */
157 	if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0)
158 		return (asp2->aspath->ascnt - asp1->aspath->ascnt);
159 
160 	/* 4. origin, the lower the better */
161 	if ((asp2->origin - asp1->origin) != 0)
162 		return (asp2->origin - asp1->origin);
163 
164 	/* 5. MED decision, only comparable between the same neighboring AS */
165 	if (rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS ||
166 	    aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath))
167 		/* lowest value wins */
168 		if ((asp2->med - asp1->med) != 0)
169 			return (asp2->med - asp1->med);
170 
171 	/*
172 	 * 6. EBGP is cooler than IBGP
173 	 * It is absolutely important that the ebgp value in peer_config.ebgp
174 	 * is bigger than all other ones (IBGP, confederations)
175 	 */
176 	if (peer1->conf.ebgp != peer2->conf.ebgp) {
177 		if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */
178 			return 1;
179 		else if (peer2->conf.ebgp) /* peer2 is EBGP */
180 			return -1;
181 	}
182 
183 	/*
184 	 * 7. local tie-breaker, this weight is here to tip equal long AS
185 	 * paths in one or the other direction. It happens more and more
186 	 * that AS paths are equally long and so traffic engineering needs
187 	 * a metric that weights a prefix at a very late stage in the
188 	 * decision process.
189 	 */
190 	if ((asp1->weight - asp2->weight) != 0)
191 		return (asp1->weight - asp2->weight);
192 
193 	/* 8. nexthop costs. NOT YET -> IGNORE */
194 
195 	/*
196 	 * 9. older route (more stable) wins but only if route-age
197 	 * evaluation is enabled.
198 	 */
199 	if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE)
200 		if ((p2->lastchange - p1->lastchange) != 0)
201 			return (p2->lastchange - p1->lastchange);
202 
203 	/* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */
204 	if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) {
205 		memcpy(&p1id, a->data, sizeof(p1id));
206 		p1id = ntohl(p1id);
207 	} else
208 		p1id = peer1->remote_bgpid;
209 	if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) {
210 		memcpy(&p2id, a->data, sizeof(p2id));
211 		p2id = ntohl(p2id);
212 	} else
213 		p2id = peer2->remote_bgpid;
214 	if ((p2id - p1id) != 0)
215 		return (p2id - p1id);
216 
217 	/* 11. compare CLUSTER_LIST length, shorter is better */
218 	p1cnt = p2cnt = 0;
219 	if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL)
220 		p1cnt = a->len / sizeof(u_int32_t);
221 	if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL)
222 		p2cnt = a->len / sizeof(u_int32_t);
223 	if ((p2cnt - p1cnt) != 0)
224 		return (p2cnt - p1cnt);
225 
226 	/* 12. lowest peer address wins (IPv4 is better than IPv6) */
227 	if (memcmp(&peer1->remote_addr, &peer2->remote_addr,
228 	    sizeof(peer1->remote_addr)) != 0)
229 		return (-memcmp(&peer1->remote_addr, &peer2->remote_addr,
230 		    sizeof(peer1->remote_addr)));
231 
232 	fatalx("Uh, oh a politician in the decision process");
233 }
234 
235 /*
236  * Find the correct place to insert the prefix in the prefix list.
237  * If the active prefix has changed we need to send an update.
238  * The to evaluate prefix must not be in the prefix list.
239  */
240 void
241 prefix_evaluate(struct prefix *p, struct rib_entry *re)
242 {
243 	struct prefix	*xp;
244 
245 	if (re_rib(re)->flags & F_RIB_NOEVALUATE) {
246 		/* decision process is turned off */
247 		if (p != NULL)
248 			LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
249 		if (re->active) {
250 			/*
251 			 * During reloads it is possible that the decision
252 			 * process is turned off but prefixes are still
253 			 * active. Clean up now to ensure that the RIB
254 			 * is consistant.
255 			 */
256 			rde_generate_updates(re_rib(re), NULL, re->active);
257 			re->active = NULL;
258 		}
259 		return;
260 	}
261 
262 	if (p != NULL) {
263 		if (LIST_EMPTY(&re->prefix_h))
264 			LIST_INSERT_HEAD(&re->prefix_h, p, entry.list.rib);
265 		else {
266 			LIST_FOREACH(xp, &re->prefix_h, entry.list.rib) {
267 				if (prefix_cmp(p, xp) > 0) {
268 					LIST_INSERT_BEFORE(xp, p,
269 					    entry.list.rib);
270 					break;
271 				} else if (LIST_NEXT(xp, entry.list.rib) ==
272 				    NULL) {
273 					/* if xp last element ... */
274 					LIST_INSERT_AFTER(xp, p,
275 					    entry.list.rib);
276 					break;
277 				}
278 			}
279 		}
280 	}
281 
282 	xp = LIST_FIRST(&re->prefix_h);
283 	if (xp != NULL) {
284 		struct rde_aspath *xasp = prefix_aspath(xp);
285 		if (xasp == NULL ||
286 		    xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) ||
287 		    (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state !=
288 		    NEXTHOP_REACH))
289 			/* xp is ineligible */
290 			xp = NULL;
291 	}
292 
293 	if (re->active != xp) {
294 		/* need to generate an update */
295 
296 		/*
297 		 * Send update with remove for re->active and add for xp
298 		 * but remember that xp may be NULL aka ineligible.
299 		 * Additional decision may be made by the called functions.
300 		 */
301 		rde_generate_updates(re_rib(re), xp, re->active);
302 		if ((re_rib(re)->flags & F_RIB_NOFIB) == 0)
303 			rde_send_kroute(re_rib(re), xp, re->active);
304 
305 		re->active = xp;
306 	}
307 }
308