1 /*
2  * BGP Multipath
3  * Copyright (C) 2010 Google Inc.
4  *
5  * This file is part of Quagga
6  *
7  * Quagga is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU General Public License as published by the
9  * Free Software Foundation; either version 2, or (at your option) any
10  * later version.
11  *
12  * Quagga is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with this program; see the file COPYING; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include <zebra.h>
23 
24 #include "command.h"
25 #include "prefix.h"
26 #include "linklist.h"
27 #include "sockunion.h"
28 #include "memory.h"
29 #include "queue.h"
30 #include "filter.h"
31 
32 #include "bgpd/bgpd.h"
33 #include "bgpd/bgp_table.h"
34 #include "bgpd/bgp_route.h"
35 #include "bgpd/bgp_attr.h"
36 #include "bgpd/bgp_debug.h"
37 #include "bgpd/bgp_aspath.h"
38 #include "bgpd/bgp_community.h"
39 #include "bgpd/bgp_ecommunity.h"
40 #include "bgpd/bgp_lcommunity.h"
41 #include "bgpd/bgp_mpath.h"
42 
43 /*
44  * bgp_maximum_paths_set
45  *
46  * Record maximum-paths configuration for BGP instance
47  */
bgp_maximum_paths_set(struct bgp * bgp,afi_t afi,safi_t safi,int peertype,uint16_t maxpaths,uint16_t options)48 int bgp_maximum_paths_set(struct bgp *bgp, afi_t afi, safi_t safi, int peertype,
49 			  uint16_t maxpaths, uint16_t options)
50 {
51 	if (!bgp || (afi >= AFI_MAX) || (safi >= SAFI_MAX))
52 		return -1;
53 
54 	switch (peertype) {
55 	case BGP_PEER_IBGP:
56 		bgp->maxpaths[afi][safi].maxpaths_ibgp = maxpaths;
57 		bgp->maxpaths[afi][safi].ibgp_flags |= options;
58 		break;
59 	case BGP_PEER_EBGP:
60 		bgp->maxpaths[afi][safi].maxpaths_ebgp = maxpaths;
61 		break;
62 	default:
63 		return -1;
64 	}
65 
66 	return 0;
67 }
68 
69 /*
70  * bgp_maximum_paths_unset
71  *
72  * Remove maximum-paths configuration from BGP instance
73  */
bgp_maximum_paths_unset(struct bgp * bgp,afi_t afi,safi_t safi,int peertype)74 int bgp_maximum_paths_unset(struct bgp *bgp, afi_t afi, safi_t safi,
75 			    int peertype)
76 {
77 	if (!bgp || (afi >= AFI_MAX) || (safi >= SAFI_MAX))
78 		return -1;
79 
80 	switch (peertype) {
81 	case BGP_PEER_IBGP:
82 		bgp->maxpaths[afi][safi].maxpaths_ibgp = multipath_num;
83 		bgp->maxpaths[afi][safi].ibgp_flags = 0;
84 		break;
85 	case BGP_PEER_EBGP:
86 		bgp->maxpaths[afi][safi].maxpaths_ebgp = multipath_num;
87 		break;
88 	default:
89 		return -1;
90 	}
91 
92 	return 0;
93 }
94 
95 /*
96  * bgp_interface_same
97  *
98  * Return true if ifindex for ifp1 and ifp2 are the same, else return false.
99  */
bgp_interface_same(struct interface * ifp1,struct interface * ifp2)100 static int bgp_interface_same(struct interface *ifp1, struct interface *ifp2)
101 {
102 	if (!ifp1 && !ifp2)
103 		return 1;
104 
105 	if (!ifp1 && ifp2)
106 		return 0;
107 
108 	if (ifp1 && !ifp2)
109 		return 0;
110 
111 	return (ifp1->ifindex == ifp2->ifindex);
112 }
113 
114 
115 /*
116  * bgp_path_info_nexthop_cmp
117  *
118  * Compare the nexthops of two paths. Return value is less than, equal to,
119  * or greater than zero if bpi1 is respectively less than, equal to,
120  * or greater than bpi2.
121  */
bgp_path_info_nexthop_cmp(struct bgp_path_info * bpi1,struct bgp_path_info * bpi2)122 int bgp_path_info_nexthop_cmp(struct bgp_path_info *bpi1,
123 			      struct bgp_path_info *bpi2)
124 {
125 	int compare;
126 	struct in6_addr addr1, addr2;
127 
128 	compare = IPV4_ADDR_CMP(&bpi1->attr->nexthop, &bpi2->attr->nexthop);
129 	if (!compare) {
130 		if (bpi1->attr->mp_nexthop_len == bpi2->attr->mp_nexthop_len) {
131 			switch (bpi1->attr->mp_nexthop_len) {
132 			case BGP_ATTR_NHLEN_IPV4:
133 			case BGP_ATTR_NHLEN_VPNV4:
134 				compare = IPV4_ADDR_CMP(
135 					&bpi1->attr->mp_nexthop_global_in,
136 					&bpi2->attr->mp_nexthop_global_in);
137 				break;
138 			case BGP_ATTR_NHLEN_IPV6_GLOBAL:
139 			case BGP_ATTR_NHLEN_VPNV6_GLOBAL:
140 				compare = IPV6_ADDR_CMP(
141 					&bpi1->attr->mp_nexthop_global,
142 					&bpi2->attr->mp_nexthop_global);
143 				break;
144 			case BGP_ATTR_NHLEN_IPV6_GLOBAL_AND_LL:
145 				addr1 = (bpi1->attr->mp_nexthop_prefer_global)
146 						? bpi1->attr->mp_nexthop_global
147 						: bpi1->attr->mp_nexthop_local;
148 				addr2 = (bpi2->attr->mp_nexthop_prefer_global)
149 						? bpi2->attr->mp_nexthop_global
150 						: bpi2->attr->mp_nexthop_local;
151 
152 				if (!bpi1->attr->mp_nexthop_prefer_global
153 				    && !bpi2->attr->mp_nexthop_prefer_global)
154 					compare = !bgp_interface_same(
155 						bpi1->peer->ifp,
156 						bpi2->peer->ifp);
157 
158 				if (!compare)
159 					compare = IPV6_ADDR_CMP(&addr1, &addr2);
160 				break;
161 			}
162 		}
163 
164 		/* This can happen if one IPv6 peer sends you global and
165 		 * link-local
166 		 * nexthops but another IPv6 peer only sends you global
167 		 */
168 		else if (bpi1->attr->mp_nexthop_len
169 				 == BGP_ATTR_NHLEN_IPV6_GLOBAL
170 			 || bpi1->attr->mp_nexthop_len
171 				    == BGP_ATTR_NHLEN_IPV6_GLOBAL_AND_LL) {
172 			compare = IPV6_ADDR_CMP(&bpi1->attr->mp_nexthop_global,
173 						&bpi2->attr->mp_nexthop_global);
174 			if (!compare) {
175 				if (bpi1->attr->mp_nexthop_len
176 				    < bpi2->attr->mp_nexthop_len)
177 					compare = -1;
178 				else
179 					compare = 1;
180 			}
181 		}
182 	}
183 
184 	return compare;
185 }
186 
187 /*
188  * bgp_path_info_mpath_cmp
189  *
190  * This function determines our multipath list ordering. By ordering
191  * the list we can deterministically select which paths are included
192  * in the multipath set. The ordering also helps in detecting changes
193  * in the multipath selection so we can detect whether to send an
194  * update to zebra.
195  *
196  * The order of paths is determined first by received nexthop, and then
197  * by peer address if the nexthops are the same.
198  */
bgp_path_info_mpath_cmp(void * val1,void * val2)199 static int bgp_path_info_mpath_cmp(void *val1, void *val2)
200 {
201 	struct bgp_path_info *bpi1, *bpi2;
202 	int compare;
203 
204 	bpi1 = val1;
205 	bpi2 = val2;
206 
207 	compare = bgp_path_info_nexthop_cmp(bpi1, bpi2);
208 
209 	if (!compare) {
210 		if (!bpi1->peer->su_remote && !bpi2->peer->su_remote)
211 			compare = 0;
212 		else if (!bpi1->peer->su_remote)
213 			compare = 1;
214 		else if (!bpi2->peer->su_remote)
215 			compare = -1;
216 		else
217 			compare = sockunion_cmp(bpi1->peer->su_remote,
218 						bpi2->peer->su_remote);
219 	}
220 
221 	return compare;
222 }
223 
224 /*
225  * bgp_mp_list_init
226  *
227  * Initialize the mp_list, which holds the list of multipaths
228  * selected by bgp_best_selection
229  */
bgp_mp_list_init(struct list * mp_list)230 void bgp_mp_list_init(struct list *mp_list)
231 {
232 	assert(mp_list);
233 	memset(mp_list, 0, sizeof(struct list));
234 	mp_list->cmp = bgp_path_info_mpath_cmp;
235 }
236 
237 /*
238  * bgp_mp_list_clear
239  *
240  * Clears all entries out of the mp_list
241  */
bgp_mp_list_clear(struct list * mp_list)242 void bgp_mp_list_clear(struct list *mp_list)
243 {
244 	assert(mp_list);
245 	list_delete_all_node(mp_list);
246 }
247 
248 /*
249  * bgp_mp_list_add
250  *
251  * Adds a multipath entry to the mp_list
252  */
bgp_mp_list_add(struct list * mp_list,struct bgp_path_info * mpinfo)253 void bgp_mp_list_add(struct list *mp_list, struct bgp_path_info *mpinfo)
254 {
255 	assert(mp_list && mpinfo);
256 	listnode_add_sort(mp_list, mpinfo);
257 }
258 
259 /*
260  * bgp_path_info_mpath_new
261  *
262  * Allocate and zero memory for a new bgp_path_info_mpath element
263  */
bgp_path_info_mpath_new(void)264 static struct bgp_path_info_mpath *bgp_path_info_mpath_new(void)
265 {
266 	struct bgp_path_info_mpath *new_mpath;
267 	new_mpath = XCALLOC(MTYPE_BGP_MPATH_INFO,
268 			    sizeof(struct bgp_path_info_mpath));
269 	return new_mpath;
270 }
271 
272 /*
273  * bgp_path_info_mpath_free
274  *
275  * Release resources for a bgp_path_info_mpath element and zero out pointer
276  */
bgp_path_info_mpath_free(struct bgp_path_info_mpath ** mpath)277 void bgp_path_info_mpath_free(struct bgp_path_info_mpath **mpath)
278 {
279 	if (mpath && *mpath) {
280 		if ((*mpath)->mp_attr)
281 			bgp_attr_unintern(&(*mpath)->mp_attr);
282 		XFREE(MTYPE_BGP_MPATH_INFO, *mpath);
283 	}
284 }
285 
286 /*
287  * bgp_path_info_mpath_get
288  *
289  * Fetch the mpath element for the given bgp_path_info. Used for
290  * doing lazy allocation.
291  */
292 static struct bgp_path_info_mpath *
bgp_path_info_mpath_get(struct bgp_path_info * path)293 bgp_path_info_mpath_get(struct bgp_path_info *path)
294 {
295 	struct bgp_path_info_mpath *mpath;
296 
297 	if (!path)
298 		return NULL;
299 
300 	if (!path->mpath) {
301 		mpath = bgp_path_info_mpath_new();
302 		if (!mpath)
303 			return NULL;
304 		path->mpath = mpath;
305 		mpath->mp_info = path;
306 	}
307 	return path->mpath;
308 }
309 
310 /*
311  * bgp_path_info_mpath_enqueue
312  *
313  * Enqueue a path onto the multipath list given the previous multipath
314  * list entry
315  */
bgp_path_info_mpath_enqueue(struct bgp_path_info * prev_info,struct bgp_path_info * path)316 static void bgp_path_info_mpath_enqueue(struct bgp_path_info *prev_info,
317 					struct bgp_path_info *path)
318 {
319 	struct bgp_path_info_mpath *prev, *mpath;
320 
321 	prev = bgp_path_info_mpath_get(prev_info);
322 	mpath = bgp_path_info_mpath_get(path);
323 	if (!prev || !mpath)
324 		return;
325 
326 	mpath->mp_next = prev->mp_next;
327 	mpath->mp_prev = prev;
328 	if (prev->mp_next)
329 		prev->mp_next->mp_prev = mpath;
330 	prev->mp_next = mpath;
331 
332 	SET_FLAG(path->flags, BGP_PATH_MULTIPATH);
333 }
334 
335 /*
336  * bgp_path_info_mpath_dequeue
337  *
338  * Remove a path from the multipath list
339  */
bgp_path_info_mpath_dequeue(struct bgp_path_info * path)340 void bgp_path_info_mpath_dequeue(struct bgp_path_info *path)
341 {
342 	struct bgp_path_info_mpath *mpath = path->mpath;
343 	if (!mpath)
344 		return;
345 	if (mpath->mp_prev)
346 		mpath->mp_prev->mp_next = mpath->mp_next;
347 	if (mpath->mp_next)
348 		mpath->mp_next->mp_prev = mpath->mp_prev;
349 	mpath->mp_next = mpath->mp_prev = NULL;
350 	UNSET_FLAG(path->flags, BGP_PATH_MULTIPATH);
351 }
352 
353 /*
354  * bgp_path_info_mpath_next
355  *
356  * Given a bgp_path_info, return the next multipath entry
357  */
bgp_path_info_mpath_next(struct bgp_path_info * path)358 struct bgp_path_info *bgp_path_info_mpath_next(struct bgp_path_info *path)
359 {
360 	if (!path->mpath || !path->mpath->mp_next)
361 		return NULL;
362 	return path->mpath->mp_next->mp_info;
363 }
364 
365 /*
366  * bgp_path_info_mpath_first
367  *
368  * Given bestpath bgp_path_info, return the first multipath entry.
369  */
bgp_path_info_mpath_first(struct bgp_path_info * path)370 struct bgp_path_info *bgp_path_info_mpath_first(struct bgp_path_info *path)
371 {
372 	return bgp_path_info_mpath_next(path);
373 }
374 
375 /*
376  * bgp_path_info_mpath_count
377  *
378  * Given the bestpath bgp_path_info, return the number of multipath entries
379  */
bgp_path_info_mpath_count(struct bgp_path_info * path)380 uint32_t bgp_path_info_mpath_count(struct bgp_path_info *path)
381 {
382 	if (!path->mpath)
383 		return 0;
384 	return path->mpath->mp_count;
385 }
386 
387 /*
388  * bgp_path_info_mpath_count_set
389  *
390  * Sets the count of multipaths into bestpath's mpath element
391  */
bgp_path_info_mpath_count_set(struct bgp_path_info * path,uint16_t count)392 static void bgp_path_info_mpath_count_set(struct bgp_path_info *path,
393 					  uint16_t count)
394 {
395 	struct bgp_path_info_mpath *mpath;
396 	if (!count && !path->mpath)
397 		return;
398 	mpath = bgp_path_info_mpath_get(path);
399 	if (!mpath)
400 		return;
401 	mpath->mp_count = count;
402 }
403 
404 /*
405  * bgp_path_info_mpath_lb_update
406  *
407  * Update cumulative info related to link-bandwidth
408  */
bgp_path_info_mpath_lb_update(struct bgp_path_info * path,bool set,bool all_paths_lb,uint64_t cum_bw)409 static void bgp_path_info_mpath_lb_update(struct bgp_path_info *path, bool set,
410 					  bool all_paths_lb, uint64_t cum_bw)
411 {
412 	struct bgp_path_info_mpath *mpath;
413 
414 	if ((mpath = path->mpath) == NULL) {
415 		if (!set || (cum_bw == 0 && !all_paths_lb))
416 			return;
417 
418 		mpath = bgp_path_info_mpath_get(path);
419 		if (!mpath)
420 			return;
421 	}
422 	if (set) {
423 		if (cum_bw)
424 			SET_FLAG(mpath->mp_flags, BGP_MP_LB_PRESENT);
425 		else
426 			UNSET_FLAG(mpath->mp_flags, BGP_MP_LB_PRESENT);
427 		if (all_paths_lb)
428 			SET_FLAG(mpath->mp_flags, BGP_MP_LB_ALL);
429 		else
430 			UNSET_FLAG(mpath->mp_flags, BGP_MP_LB_ALL);
431 		mpath->cum_bw = cum_bw;
432 	} else {
433 		mpath->mp_flags = 0;
434 		mpath->cum_bw = 0;
435 	}
436 }
437 
438 /*
439  * bgp_path_info_mpath_attr
440  *
441  * Given bestpath bgp_path_info, return aggregated attribute set used
442  * for advertising the multipath route
443  */
bgp_path_info_mpath_attr(struct bgp_path_info * path)444 struct attr *bgp_path_info_mpath_attr(struct bgp_path_info *path)
445 {
446 	if (!path->mpath)
447 		return NULL;
448 	return path->mpath->mp_attr;
449 }
450 
451 /*
452  * bgp_path_info_chkwtd
453  *
454  * Return if we should attempt to do weighted ECMP or not
455  * The path passed in is the bestpath.
456  */
bgp_path_info_mpath_chkwtd(struct bgp * bgp,struct bgp_path_info * path)457 bool bgp_path_info_mpath_chkwtd(struct bgp *bgp, struct bgp_path_info *path)
458 {
459 	/* Check if told to ignore weights or not multipath */
460 	if (bgp->lb_handling == BGP_LINK_BW_IGNORE_BW || !path->mpath)
461 		return false;
462 
463 	/* All paths in multipath should have associated weight (bandwidth)
464 	 * unless told explicitly otherwise.
465 	 */
466 	if (bgp->lb_handling != BGP_LINK_BW_SKIP_MISSING &&
467 	    bgp->lb_handling != BGP_LINK_BW_DEFWT_4_MISSING)
468 		return (path->mpath->mp_flags & BGP_MP_LB_ALL);
469 
470 	/* At least one path should have bandwidth. */
471 	return (path->mpath->mp_flags & BGP_MP_LB_PRESENT);
472 }
473 
474 /*
475  * bgp_path_info_mpath_attr
476  *
477  * Given bestpath bgp_path_info, return cumulative bandwidth
478  * computed for all multipaths with bandwidth info
479  */
bgp_path_info_mpath_cumbw(struct bgp_path_info * path)480 uint64_t bgp_path_info_mpath_cumbw(struct bgp_path_info *path)
481 {
482 	if (!path->mpath)
483 		return 0;
484 	return path->mpath->cum_bw;
485 }
486 
487 /*
488  * bgp_path_info_mpath_attr_set
489  *
490  * Sets the aggregated attribute into bestpath's mpath element
491  */
bgp_path_info_mpath_attr_set(struct bgp_path_info * path,struct attr * attr)492 static void bgp_path_info_mpath_attr_set(struct bgp_path_info *path,
493 					 struct attr *attr)
494 {
495 	struct bgp_path_info_mpath *mpath;
496 	if (!attr && !path->mpath)
497 		return;
498 	mpath = bgp_path_info_mpath_get(path);
499 	if (!mpath)
500 		return;
501 	mpath->mp_attr = attr;
502 }
503 
504 /*
505  * bgp_path_info_mpath_update
506  *
507  * Compare and sync up the multipath list with the mp_list generated by
508  * bgp_best_selection
509  */
bgp_path_info_mpath_update(struct bgp_dest * dest,struct bgp_path_info * new_best,struct bgp_path_info * old_best,struct list * mp_list,struct bgp_maxpaths_cfg * mpath_cfg)510 void bgp_path_info_mpath_update(struct bgp_dest *dest,
511 				struct bgp_path_info *new_best,
512 				struct bgp_path_info *old_best,
513 				struct list *mp_list,
514 				struct bgp_maxpaths_cfg *mpath_cfg)
515 {
516 	uint16_t maxpaths, mpath_count, old_mpath_count;
517 	uint32_t bwval;
518 	uint64_t cum_bw, old_cum_bw;
519 	struct listnode *mp_node, *mp_next_node;
520 	struct bgp_path_info *cur_mpath, *new_mpath, *next_mpath, *prev_mpath;
521 	int mpath_changed, debug;
522 	char nh_buf[2][INET6_ADDRSTRLEN];
523 	bool all_paths_lb;
524 	char path_buf[PATH_ADDPATH_STR_BUFFER];
525 
526 	mpath_changed = 0;
527 	maxpaths = multipath_num;
528 	mpath_count = 0;
529 	cur_mpath = NULL;
530 	old_mpath_count = 0;
531 	old_cum_bw = cum_bw = 0;
532 	prev_mpath = new_best;
533 	mp_node = listhead(mp_list);
534 	debug = bgp_debug_bestpath(dest);
535 
536 	if (new_best) {
537 		mpath_count++;
538 		if (new_best != old_best)
539 			bgp_path_info_mpath_dequeue(new_best);
540 		maxpaths = (new_best->peer->sort == BGP_PEER_IBGP)
541 				   ? mpath_cfg->maxpaths_ibgp
542 				   : mpath_cfg->maxpaths_ebgp;
543 	}
544 
545 	if (old_best) {
546 		cur_mpath = bgp_path_info_mpath_first(old_best);
547 		old_mpath_count = bgp_path_info_mpath_count(old_best);
548 		old_cum_bw = bgp_path_info_mpath_cumbw(old_best);
549 		bgp_path_info_mpath_count_set(old_best, 0);
550 		bgp_path_info_mpath_lb_update(old_best, false, false, 0);
551 		bgp_path_info_mpath_dequeue(old_best);
552 	}
553 
554 	if (debug)
555 		zlog_debug(
556 			"%pRN: starting mpath update, newbest %s num candidates %d old-mpath-count %d old-cum-bw u%" PRIu64,
557 			bgp_dest_to_rnode(dest),
558 			new_best ? new_best->peer->host : "NONE",
559 			mp_list ? listcount(mp_list) : 0,
560 			old_mpath_count, old_cum_bw);
561 
562 	/*
563 	 * We perform an ordered walk through both lists in parallel.
564 	 * The reason for the ordered walk is that if there are paths
565 	 * that were previously multipaths and are still multipaths, the walk
566 	 * should encounter them in both lists at the same time. Otherwise
567 	 * there will be paths that are in one list or another, and we
568 	 * will deal with these separately.
569 	 *
570 	 * Note that new_best might be somewhere in the mp_list, so we need
571 	 * to skip over it
572 	 */
573 	all_paths_lb = true; /* We'll reset if any path doesn't have LB. */
574 	while (mp_node || cur_mpath) {
575 		struct bgp_path_info *tmp_info;
576 
577 		/*
578 		 * We can bail out of this loop if all existing paths on the
579 		 * multipath list have been visited (for cleanup purposes) and
580 		 * the maxpath requirement is fulfulled
581 		 */
582 		if (!cur_mpath && (mpath_count >= maxpaths))
583 			break;
584 
585 		mp_next_node = mp_node ? listnextnode(mp_node) : NULL;
586 		next_mpath =
587 			cur_mpath ? bgp_path_info_mpath_next(cur_mpath) : NULL;
588 		tmp_info = mp_node ? listgetdata(mp_node) : NULL;
589 
590 		if (debug)
591 			zlog_debug(
592 				"%pRN: comparing candidate %s with existing mpath %s",
593 				bgp_dest_to_rnode(dest),
594 				tmp_info ? tmp_info->peer->host : "NONE",
595 				cur_mpath ? cur_mpath->peer->host : "NONE");
596 
597 		/*
598 		 * If equal, the path was a multipath and is still a multipath.
599 		 * Insert onto new multipath list if maxpaths allows.
600 		 */
601 		if (mp_node && (listgetdata(mp_node) == cur_mpath)) {
602 			list_delete_node(mp_list, mp_node);
603 			bgp_path_info_mpath_dequeue(cur_mpath);
604 			if ((mpath_count < maxpaths)
605 			    && prev_mpath
606 			    && bgp_path_info_nexthop_cmp(prev_mpath,
607 							 cur_mpath)) {
608 				bgp_path_info_mpath_enqueue(prev_mpath,
609 							    cur_mpath);
610 				prev_mpath = cur_mpath;
611 				mpath_count++;
612 				if (ecommunity_linkbw_present(
613 					cur_mpath->attr->ecommunity, &bwval))
614 					cum_bw += bwval;
615 				else
616 					all_paths_lb = false;
617 				if (debug) {
618 					bgp_path_info_path_with_addpath_rx_str(
619 						cur_mpath, path_buf);
620 					zlog_debug(
621 						"%pRN: %s is still multipath, cur count %d",
622 						bgp_dest_to_rnode(dest),
623 						path_buf, mpath_count);
624 				}
625 			} else {
626 				mpath_changed = 1;
627 				if (debug) {
628 					bgp_path_info_path_with_addpath_rx_str(
629 						cur_mpath, path_buf);
630 					zlog_debug(
631 						"%pRN: remove mpath %s nexthop %s, cur count %d",
632 						bgp_dest_to_rnode(dest),
633 						path_buf,
634 						inet_ntop(AF_INET,
635 							  &cur_mpath->attr
636 								   ->nexthop,
637 							  nh_buf[0],
638 							  sizeof(nh_buf[0])),
639 						mpath_count);
640 				}
641 			}
642 			mp_node = mp_next_node;
643 			cur_mpath = next_mpath;
644 			continue;
645 		}
646 
647 		if (cur_mpath
648 		    && (!mp_node
649 			|| (bgp_path_info_mpath_cmp(cur_mpath,
650 						    listgetdata(mp_node))
651 			    < 0))) {
652 			/*
653 			 * If here, we have an old multipath and either the
654 			 * mp_list
655 			 * is finished or the next mp_node points to a later
656 			 * multipath, so we need to purge this path from the
657 			 * multipath list
658 			 */
659 			bgp_path_info_mpath_dequeue(cur_mpath);
660 			mpath_changed = 1;
661 			if (debug) {
662 				bgp_path_info_path_with_addpath_rx_str(
663 					cur_mpath, path_buf);
664 				zlog_debug(
665 					"%pRN: remove mpath %s nexthop %s, cur count %d",
666 					bgp_dest_to_rnode(dest), path_buf,
667 					inet_ntop(AF_INET,
668 						  &cur_mpath->attr->nexthop,
669 						  nh_buf[0], sizeof(nh_buf[0])),
670 					mpath_count);
671 			}
672 			cur_mpath = next_mpath;
673 		} else {
674 			/*
675 			 * If here, we have a path on the mp_list that was not
676 			 * previously
677 			 * a multipath (due to non-equivalance or maxpaths
678 			 * exceeded),
679 			 * or the matching multipath is sorted later in the
680 			 * multipath
681 			 * list. Before we enqueue the path on the new multipath
682 			 * list,
683 			 * make sure its not on the old_best multipath list or
684 			 * referenced
685 			 * via next_mpath:
686 			 * - If next_mpath points to this new path, update
687 			 * next_mpath to
688 			 *   point to the multipath after this one
689 			 * - Dequeue the path from the multipath list just to
690 			 * make sure
691 			 */
692 			new_mpath = listgetdata(mp_node);
693 			list_delete_node(mp_list, mp_node);
694 			assert(new_mpath);
695 			assert(prev_mpath);
696 			if ((mpath_count < maxpaths) && (new_mpath != new_best)
697 			    && bgp_path_info_nexthop_cmp(prev_mpath,
698 							 new_mpath)) {
699 				bgp_path_info_mpath_dequeue(new_mpath);
700 
701 				bgp_path_info_mpath_enqueue(prev_mpath,
702 							    new_mpath);
703 				prev_mpath = new_mpath;
704 				mpath_changed = 1;
705 				mpath_count++;
706 				if (ecommunity_linkbw_present(
707 					new_mpath->attr->ecommunity, &bwval))
708 					cum_bw += bwval;
709 				else
710 					all_paths_lb = false;
711 				if (debug) {
712 					bgp_path_info_path_with_addpath_rx_str(
713 						new_mpath, path_buf);
714 					zlog_debug(
715 						"%pRN: add mpath %s nexthop %s, cur count %d",
716 						bgp_dest_to_rnode(dest),
717 						path_buf,
718 						inet_ntop(AF_INET,
719 							  &new_mpath->attr
720 								   ->nexthop,
721 							  nh_buf[0],
722 							  sizeof(nh_buf[0])),
723 						mpath_count);
724 				}
725 			}
726 			mp_node = mp_next_node;
727 		}
728 	}
729 
730 	if (new_best) {
731 		bgp_path_info_mpath_count_set(new_best, mpath_count - 1);
732 		if (mpath_count <= 1 ||
733 		    !ecommunity_linkbw_present(
734 			new_best->attr->ecommunity, &bwval))
735 			all_paths_lb = false;
736 		else
737 			cum_bw += bwval;
738 		bgp_path_info_mpath_lb_update(new_best, true,
739 					      all_paths_lb, cum_bw);
740 
741 		if (debug)
742 			zlog_debug(
743 				"%pRN: New mpath count (incl newbest) %d mpath-change %s all_paths_lb %d cum_bw u%" PRIu64,
744 				bgp_dest_to_rnode(dest), mpath_count,
745 				mpath_changed ? "YES" : "NO",
746 				all_paths_lb, cum_bw);
747 
748 		if (mpath_changed
749 		    || (bgp_path_info_mpath_count(new_best) != old_mpath_count))
750 			SET_FLAG(new_best->flags, BGP_PATH_MULTIPATH_CHG);
751 		if ((mpath_count - 1) != old_mpath_count ||
752 		    old_cum_bw != cum_bw)
753 			SET_FLAG(new_best->flags, BGP_PATH_LINK_BW_CHG);
754 	}
755 }
756 
757 /*
758  * bgp_mp_dmed_deselect
759  *
760  * Clean up multipath information for BGP_PATH_DMED_SELECTED path that
761  * is not selected as best path
762  */
bgp_mp_dmed_deselect(struct bgp_path_info * dmed_best)763 void bgp_mp_dmed_deselect(struct bgp_path_info *dmed_best)
764 {
765 	struct bgp_path_info *mpinfo, *mpnext;
766 
767 	if (!dmed_best)
768 		return;
769 
770 	for (mpinfo = bgp_path_info_mpath_first(dmed_best); mpinfo;
771 	     mpinfo = mpnext) {
772 		mpnext = bgp_path_info_mpath_next(mpinfo);
773 		bgp_path_info_mpath_dequeue(mpinfo);
774 	}
775 
776 	bgp_path_info_mpath_count_set(dmed_best, 0);
777 	UNSET_FLAG(dmed_best->flags, BGP_PATH_MULTIPATH_CHG);
778 	UNSET_FLAG(dmed_best->flags, BGP_PATH_LINK_BW_CHG);
779 	assert(bgp_path_info_mpath_first(dmed_best) == NULL);
780 }
781 
782 /*
783  * bgp_path_info_mpath_aggregate_update
784  *
785  * Set the multipath aggregate attribute. We need to see if the
786  * aggregate has changed and then set the ATTR_CHANGED flag on the
787  * bestpath info so that a peer update will be generated. The
788  * change is detected by generating the current attribute,
789  * interning it, and then comparing the interned pointer with the
790  * current value. We can skip this generate/compare step if there
791  * is no change in multipath selection and no attribute change in
792  * any multipath.
793  */
bgp_path_info_mpath_aggregate_update(struct bgp_path_info * new_best,struct bgp_path_info * old_best)794 void bgp_path_info_mpath_aggregate_update(struct bgp_path_info *new_best,
795 					  struct bgp_path_info *old_best)
796 {
797 	struct bgp_path_info *mpinfo;
798 	struct aspath *aspath;
799 	struct aspath *asmerge;
800 	struct attr *new_attr, *old_attr;
801 	uint8_t origin;
802 	struct community *community, *commerge;
803 	struct ecommunity *ecomm, *ecommerge;
804 	struct lcommunity *lcomm, *lcommerge;
805 	struct attr attr = {0};
806 
807 	if (old_best && (old_best != new_best)
808 	    && (old_attr = bgp_path_info_mpath_attr(old_best))) {
809 		bgp_attr_unintern(&old_attr);
810 		bgp_path_info_mpath_attr_set(old_best, NULL);
811 	}
812 
813 	if (!new_best)
814 		return;
815 
816 	if (!bgp_path_info_mpath_count(new_best)) {
817 		if ((new_attr = bgp_path_info_mpath_attr(new_best))) {
818 			bgp_attr_unintern(&new_attr);
819 			bgp_path_info_mpath_attr_set(new_best, NULL);
820 			SET_FLAG(new_best->flags, BGP_PATH_ATTR_CHANGED);
821 		}
822 		return;
823 	}
824 
825 	attr = *new_best->attr;
826 
827 	if (new_best->peer
828 	    && CHECK_FLAG(new_best->peer->bgp->flags,
829 			  BGP_FLAG_MULTIPATH_RELAX_AS_SET)) {
830 
831 		/* aggregate attribute from multipath constituents */
832 		aspath = aspath_dup(attr.aspath);
833 		origin = attr.origin;
834 		community =
835 			attr.community ? community_dup(attr.community) : NULL;
836 		ecomm = (attr.ecommunity) ? ecommunity_dup(attr.ecommunity)
837 					  : NULL;
838 		lcomm = (attr.lcommunity) ? lcommunity_dup(attr.lcommunity)
839 					  : NULL;
840 
841 		for (mpinfo = bgp_path_info_mpath_first(new_best); mpinfo;
842 		     mpinfo = bgp_path_info_mpath_next(mpinfo)) {
843 			asmerge =
844 				aspath_aggregate(aspath, mpinfo->attr->aspath);
845 			aspath_free(aspath);
846 			aspath = asmerge;
847 
848 			if (origin < mpinfo->attr->origin)
849 				origin = mpinfo->attr->origin;
850 
851 			if (mpinfo->attr->community) {
852 				if (community) {
853 					commerge = community_merge(
854 						community,
855 						mpinfo->attr->community);
856 					community =
857 						community_uniq_sort(commerge);
858 					community_free(&commerge);
859 				} else
860 					community = community_dup(
861 						mpinfo->attr->community);
862 			}
863 
864 			if (mpinfo->attr->ecommunity) {
865 				if (ecomm) {
866 					ecommerge = ecommunity_merge(
867 						ecomm,
868 						mpinfo->attr->ecommunity);
869 					ecomm = ecommunity_uniq_sort(ecommerge);
870 					ecommunity_free(&ecommerge);
871 				} else
872 					ecomm = ecommunity_dup(
873 						mpinfo->attr->ecommunity);
874 			}
875 			if (mpinfo->attr->lcommunity) {
876 				if (lcomm) {
877 					lcommerge = lcommunity_merge(
878 						lcomm,
879 						mpinfo->attr->lcommunity);
880 					lcomm = lcommunity_uniq_sort(lcommerge);
881 					lcommunity_free(&lcommerge);
882 				} else
883 					lcomm = lcommunity_dup(
884 						mpinfo->attr->lcommunity);
885 			}
886 		}
887 
888 		attr.aspath = aspath;
889 		attr.origin = origin;
890 		if (community) {
891 			attr.community = community;
892 			attr.flag |= ATTR_FLAG_BIT(BGP_ATTR_COMMUNITIES);
893 		}
894 		if (ecomm) {
895 			attr.ecommunity = ecomm;
896 			attr.flag |= ATTR_FLAG_BIT(BGP_ATTR_EXT_COMMUNITIES);
897 		}
898 		if (lcomm) {
899 			attr.lcommunity = lcomm;
900 			attr.flag |= ATTR_FLAG_BIT(BGP_ATTR_LARGE_COMMUNITIES);
901 		}
902 
903 		/* Zap multipath attr nexthop so we set nexthop to self */
904 		attr.nexthop.s_addr = INADDR_ANY;
905 		memset(&attr.mp_nexthop_global, 0, sizeof(struct in6_addr));
906 
907 		/* TODO: should we set ATOMIC_AGGREGATE and AGGREGATOR? */
908 	}
909 
910 	new_attr = bgp_attr_intern(&attr);
911 
912 	if (new_attr != bgp_path_info_mpath_attr(new_best)) {
913 		if ((old_attr = bgp_path_info_mpath_attr(new_best)))
914 			bgp_attr_unintern(&old_attr);
915 		bgp_path_info_mpath_attr_set(new_best, new_attr);
916 		SET_FLAG(new_best->flags, BGP_PATH_ATTR_CHANGED);
917 	} else
918 		bgp_attr_unintern(&new_attr);
919 }
920