xref: /openbsd/usr.sbin/bgpd/rde_update.c (revision 5af055cd)
1 /*	$OpenBSD: rde_update.c,v 1.82 2014/12/18 19:28:44 tedu Exp $ */
2 
3 /*
4  * Copyright (c) 2004 Claudio Jeker <claudio@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include <sys/types.h>
19 #include <sys/queue.h>
20 
21 #include <limits.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <siphash.h>
25 
26 #include "bgpd.h"
27 #include "rde.h"
28 
29 in_addr_t	up_get_nexthop(struct rde_peer *, struct rde_aspath *);
30 int		up_generate_mp_reach(struct rde_peer *, struct update_attr *,
31 		    struct rde_aspath *, u_int8_t);
32 int		up_generate_attr(struct rde_peer *, struct update_attr *,
33 		    struct rde_aspath *, u_int8_t);
34 
/* update stuff. */
struct update_prefix {
	TAILQ_ENTRY(update_prefix)	 prefix_l;	/* linkage on an attr's or a withdraw list */
	RB_ENTRY(update_prefix)		 entry;		/* linkage in peer->up_prefix lookup tree */
	struct uplist_prefix		*prefix_h;	/* head of the list we are currently on */
	struct bgpd_addr		 prefix;	/* the NLRI prefix itself */
	int				 prefixlen;
};
43 
struct update_attr {
	TAILQ_ENTRY(update_attr)	 attr_l;	/* linkage on peer->updates[aid] */
	RB_ENTRY(update_attr)		 entry;		/* linkage in peer->up_attrs lookup tree */
	struct uplist_prefix		 prefix_h;	/* prefixes sharing this attribute set */
	u_char				*attr;		/* encoded path attributes (NULL for EoR marker) */
	u_char				*mpattr;	/* encoded MP_REACH_NLRI payload, if any */
	u_int32_t			 attr_hash;	/* SipHash over attr + mpattr, see up_generate() */
	u_int16_t			 attr_len;	/* 0 marks the EoR hack, see up_generate_marker() */
	u_int16_t			 mpattr_len;
};
54 
/* internal helpers, definitions below */
void	up_clear(struct uplist_attr *, struct uplist_prefix *);
int	up_prefix_cmp(struct update_prefix *, struct update_prefix *);
int	up_attr_cmp(struct update_attr *, struct update_attr *);
int	up_add(struct rde_peer *, struct update_prefix *, struct update_attr *);

/* red-black trees used to find already queued prefixes and attribute sets */
RB_PROTOTYPE(uptree_prefix, update_prefix, entry, up_prefix_cmp)
RB_GENERATE(uptree_prefix, update_prefix, entry, up_prefix_cmp)

RB_PROTOTYPE(uptree_attr, update_attr, entry, up_attr_cmp)
RB_GENERATE(uptree_attr, update_attr, entry, up_attr_cmp)

/* key for the attribute hash, (re)seeded in up_init() */
SIPHASH_KEY uptree_key;
67 
68 void
69 up_init(struct rde_peer *peer)
70 {
71 	u_int8_t	i;
72 
73 	for (i = 0; i < AID_MAX; i++) {
74 		TAILQ_INIT(&peer->updates[i]);
75 		TAILQ_INIT(&peer->withdraws[i]);
76 	}
77 	RB_INIT(&peer->up_prefix);
78 	RB_INIT(&peer->up_attrs);
79 	peer->up_pcnt = 0;
80 	peer->up_acnt = 0;
81 	peer->up_nlricnt = 0;
82 	peer->up_wcnt = 0;
83 	arc4random_buf(&uptree_key, sizeof(uptree_key));
84 }
85 
86 void
87 up_clear(struct uplist_attr *updates, struct uplist_prefix *withdraws)
88 {
89 	struct update_attr	*ua;
90 	struct update_prefix	*up;
91 
92 	while ((ua = TAILQ_FIRST(updates)) != NULL) {
93 		TAILQ_REMOVE(updates, ua, attr_l);
94 		while ((up = TAILQ_FIRST(&ua->prefix_h)) != NULL) {
95 			TAILQ_REMOVE(&ua->prefix_h, up, prefix_l);
96 			free(up);
97 		}
98 		free(ua->attr);
99 		free(ua->mpattr);
100 		free(ua);
101 	}
102 
103 	while ((up = TAILQ_FIRST(withdraws)) != NULL) {
104 		TAILQ_REMOVE(withdraws, up, prefix_l);
105 		free(up);
106 	}
107 }
108 
109 void
110 up_down(struct rde_peer *peer)
111 {
112 	u_int8_t	i;
113 
114 	for (i = 0; i < AID_MAX; i++)
115 		up_clear(&peer->updates[i], &peer->withdraws[i]);
116 
117 	RB_INIT(&peer->up_prefix);
118 	RB_INIT(&peer->up_attrs);
119 
120 	peer->up_pcnt = 0;
121 	peer->up_acnt = 0;
122 	peer->up_nlricnt = 0;
123 	peer->up_wcnt = 0;
124 }
125 
126 int
127 up_prefix_cmp(struct update_prefix *a, struct update_prefix *b)
128 {
129 	int	i;
130 
131 	if (a->prefix.aid < b->prefix.aid)
132 		return (-1);
133 	if (a->prefix.aid > b->prefix.aid)
134 		return (1);
135 
136 	switch (a->prefix.aid) {
137 	case AID_INET:
138 		if (ntohl(a->prefix.v4.s_addr) < ntohl(b->prefix.v4.s_addr))
139 			return (-1);
140 		if (ntohl(a->prefix.v4.s_addr) > ntohl(b->prefix.v4.s_addr))
141 			return (1);
142 		break;
143 	case AID_INET6:
144 		i = memcmp(&a->prefix.v6, &b->prefix.v6,
145 		    sizeof(struct in6_addr));
146 		if (i > 0)
147 			return (1);
148 		if (i < 0)
149 			return (-1);
150 		break;
151 	case AID_VPN_IPv4:
152 		if (betoh64(a->prefix.vpn4.rd) < betoh64(b->prefix.vpn4.rd))
153 			return (-1);
154 		if (betoh64(a->prefix.vpn4.rd) > betoh64(b->prefix.vpn4.rd))
155 			return (1);
156 		if (ntohl(a->prefix.v4.s_addr) < ntohl(b->prefix.v4.s_addr))
157 			return (-1);
158 		if (ntohl(a->prefix.v4.s_addr) > ntohl(b->prefix.v4.s_addr))
159 			return (1);
160 		if (a->prefixlen < b->prefixlen)
161 			return (-1);
162 		if (a->prefixlen > b->prefixlen)
163 			return (1);
164 		if (a->prefix.vpn4.labellen < b->prefix.vpn4.labellen)
165 			return (-1);
166 		if (a->prefix.vpn4.labellen > b->prefix.vpn4.labellen)
167 			return (1);
168 		return (memcmp(a->prefix.vpn4.labelstack,
169 		    b->prefix.vpn4.labelstack, a->prefix.vpn4.labellen));
170 	default:
171 		fatalx("pt_prefix_cmp: unknown af");
172 	}
173 	if (a->prefixlen < b->prefixlen)
174 		return (-1);
175 	if (a->prefixlen > b->prefixlen)
176 		return (1);
177 	return (0);
178 }
179 
180 int
181 up_attr_cmp(struct update_attr *a, struct update_attr *b)
182 {
183 	int	r;
184 
185 	if ((r = a->attr_hash - b->attr_hash) != 0)
186 		return (r);
187 	if ((r = a->attr_len - b->attr_len) != 0)
188 		return (r);
189 	if ((r = a->mpattr_len - b->mpattr_len) != 0)
190 		return (r);
191 	if ((r = memcmp(a->mpattr, b->mpattr, a->mpattr_len)) != 0)
192 		return (r);
193 	return (memcmp(a->attr, b->attr, a->attr_len));
194 }
195 
/*
 * Queue prefix p on the peer: as an announcement sharing attribute set
 * a when a != NULL, as a withdraw when a == NULL.  Ownership of both p
 * and a passes to this function -- they end up linked into the peer's
 * trees and queues, merged with an already queued duplicate, or freed.
 * Returns 0 on success, -1 if an RB tree insert failed.
 */
int
up_add(struct rde_peer *peer, struct update_prefix *p, struct update_attr *a)
{
	struct update_attr	*na = NULL;
	struct update_prefix	*np;
	struct uplist_attr	*upl = NULL;
	struct uplist_prefix	*wdl = NULL;

	upl = &peer->updates[p->prefix.aid];
	wdl = &peer->withdraws[p->prefix.aid];

	/* 1. search for attr */
	if (a != NULL && (na = RB_FIND(uptree_attr, &peer->up_attrs, a)) ==
	    NULL) {
		/* 1.1 if not found -> add */
		TAILQ_INIT(&a->prefix_h);
		if (RB_INSERT(uptree_attr, &peer->up_attrs, a) != NULL) {
			log_warnx("uptree_attr insert failed");
			/* cleanup */
			free(a->attr);
			free(a->mpattr);
			free(a);
			free(p);
			return (-1);
		}
		TAILQ_INSERT_TAIL(upl, a, attr_l);
		peer->up_acnt++;
	} else {
		/* 1.2 if found -> use that, free a */
		if (a != NULL) {
			free(a->attr);
			free(a->mpattr);
			free(a);
			a = na;
			/* move to end of update queue */
			TAILQ_REMOVE(upl, a, attr_l);
			TAILQ_INSERT_TAIL(upl, a, attr_l);
		}
	}

	/* 2. search for prefix */
	if ((np = RB_FIND(uptree_prefix, &peer->up_prefix, p)) == NULL) {
		/* 2.1 if not found -> add */
		if (RB_INSERT(uptree_prefix, &peer->up_prefix, p) != NULL) {
			log_warnx("uptree_prefix insert failed");
			/*
			 * cleanup. But do not free a because it is already
			 * linked or NULL. up_dump_attrnlri() will remove and
			 * free the empty attribute later.
			 */
			free(p);
			return (-1);
		}
		peer->up_pcnt++;
	} else {
		/* 2.2 if found -> use that and free p */
		/* unlink the queued copy from whichever list it is on */
		TAILQ_REMOVE(np->prefix_h, np, prefix_l);
		free(p);
		p = np;
		/* undo the counter of the list it was removed from */
		if (p->prefix_h == wdl)
			peer->up_wcnt--;
		else
			peer->up_nlricnt--;
	}
	/* 3. link prefix to attr */
	if (a == NULL) {
		TAILQ_INSERT_TAIL(wdl, p, prefix_l);
		p->prefix_h = wdl;
		peer->up_wcnt++;
	} else {
		TAILQ_INSERT_TAIL(&a->prefix_h, p, prefix_l);
		p->prefix_h = &a->prefix_h;
		peer->up_nlricnt++;
	}
	return (0);
}
272 
/*
 * Check whether prefix p may be announced to peer.
 * Return values (as consumed by up_generate_updates()):
 *   1  prefix is eligible for announcement
 *   0  prefix must not be announced; withdraw any previous announcement
 *  -1  send nothing at all, not even a withdraw
 */
int
up_test_update(struct rde_peer *peer, struct prefix *p)
{
	struct bgpd_addr	 addr;
	struct attr		*attr;

	if (peer->state != PEER_UP)
		return (-1);

	if (p == NULL)
		/* no prefix available */
		return (0);

	if (peer == p->aspath->peer)
		/* Do not send routes back to sender */
		return (0);

	/* botched or looped paths must never reach the export path */
	if (p->aspath->flags & F_ATTR_PARSE_ERR)
		fatalx("try to send out a botched path");
	if (p->aspath->flags & F_ATTR_LOOP)
		fatalx("try to send out a looped path");

	pt_getaddr(p->prefix, &addr);
	/* peer did not negotiate the multiprotocol capa for this AID */
	if (peer->capa.mp[addr.aid] == 0)
		return (-1);

	if (!p->aspath->peer->conf.ebgp && !peer->conf.ebgp) {
		/*
		 * route reflector redistribution rules:
		 * 1. if announce is set                -> announce
		 * 2. old non-client, new non-client    -> no
		 * 3. old client, new non-client        -> yes
		 * 4. old non-client, new client        -> yes
		 * 5. old client, new client            -> yes
		 */
		if (p->aspath->peer->conf.reflector_client == 0 &&
		    peer->conf.reflector_client == 0 &&
		    (p->aspath->flags & F_PREFIX_ANNOUNCED) == 0)
			/* Do not redistribute updates to ibgp peers */
			return (0);
	}

	/* announce type handling */
	switch (peer->conf.announce_type) {
	case ANNOUNCE_UNDEF:
	case ANNOUNCE_NONE:
	case ANNOUNCE_DEFAULT_ROUTE:
		/*
		 * no need to withdraw old prefix as this will be
		 * filtered out as well.
		 */
		return (-1);
	case ANNOUNCE_ALL:
		break;
	case ANNOUNCE_SELF:
		/*
		 * pass only prefix that have an aspath count
		 * of zero this is equal to the ^$ regex.
		 */
		if (p->aspath->aspath->ascnt != 0)
			return (0);
		break;
	}

	/* well known communities */
	if (community_match(p->aspath,
	    COMMUNITY_WELLKNOWN, COMMUNITY_NO_ADVERTISE))
		return (0);
	if (peer->conf.ebgp && community_match(p->aspath,
	    COMMUNITY_WELLKNOWN, COMMUNITY_NO_EXPORT))
		return (0);
	if (peer->conf.ebgp && community_match(p->aspath,
	    COMMUNITY_WELLKNOWN, COMMUNITY_NO_EXPSUBCONFED))
		return (0);

	/*
	 * Don't send messages back to originator
	 * this is not specified in the RFC but seems logical.
	 */
	if ((attr = attr_optget(p->aspath, ATTR_ORIGINATOR_ID)) != NULL) {
		if (memcmp(attr->data, &peer->remote_bgpid,
		    sizeof(peer->remote_bgpid)) == 0) {
			/* would cause loop don't send */
			return (-1);
		}
	}

	return (1);
}
362 
363 int
364 up_generate(struct rde_peer *peer, struct rde_aspath *asp,
365     struct bgpd_addr *addr, u_int8_t prefixlen)
366 {
367 	struct update_attr		*ua = NULL;
368 	struct update_prefix		*up;
369 	SIPHASH_CTX			ctx;
370 
371 	if (asp) {
372 		ua = calloc(1, sizeof(struct update_attr));
373 		if (ua == NULL)
374 			fatal("up_generate");
375 
376 		if (up_generate_attr(peer, ua, asp, addr->aid) == -1) {
377 			log_warnx("generation of bgp path attributes failed");
378 			free(ua);
379 			return (-1);
380 		}
381 		/*
382 		 * use aspath_hash as attr_hash, this may be unoptimal
383 		 * but currently I don't care.
384 		 */
385 		SipHash24_Init(&ctx, &uptree_key);
386 		SipHash24_Update(&ctx, ua->attr, ua->attr_len);
387 		if (ua->mpattr)
388 			SipHash24_Update(&ctx, ua->mpattr, ua->mpattr_len);
389 		ua->attr_hash = SipHash24_End(&ctx);
390 	}
391 
392 	up = calloc(1, sizeof(struct update_prefix));
393 	if (up == NULL)
394 		fatal("up_generate");
395 	up->prefix = *addr;
396 	up->prefixlen = prefixlen;
397 
398 	if (up_add(peer, up, ua) == -1)
399 		return (-1);
400 
401 	return (0);
402 }
403 
/*
 * Generate the updates for peer after a best-path change: announce new
 * (after running the output filters) or, when new is NULL or rejected,
 * withdraw old.  Recurses at most once with new == NULL to emit the
 * withdraw.
 */
void
up_generate_updates(struct filter_head *rules, struct rde_peer *peer,
    struct prefix *new, struct prefix *old)
{
	struct rde_aspath		*asp;
	struct bgpd_addr		 addr;

	if (peer->state != PEER_UP)
		return;

	if (new == NULL) {
		/* only withdraw what was announceable before */
		if (up_test_update(peer, old) != 1)
			return;

		pt_getaddr(old->prefix, &addr);
		if (rde_filter(rules, NULL, peer, old->aspath, &addr,
		    old->prefix->prefixlen, old->aspath->peer) == ACTION_DENY)
			return;

		/* withdraw prefix */
		up_generate(peer, NULL, &addr, old->prefix->prefixlen);
	} else {
		switch (up_test_update(peer, new)) {
		case 1:
			break;
		case 0:
			/* new must not be sent: retract the old prefix */
			up_generate_updates(rules, peer, NULL, old);
			return;
		case -1:
			return;
		}

		pt_getaddr(new->prefix, &addr);
		if (rde_filter(rules, &asp, peer, new->aspath, &addr,
		    new->prefix->prefixlen, new->aspath->peer) == ACTION_DENY) {
			/* rejected: drop the filter's path copy, withdraw */
			path_put(asp);
			up_generate_updates(rules, peer, NULL, old);
			return;
		}

		/* generate update */
		if (asp != NULL) {
			/* the filters produced a modified path copy */
			up_generate(peer, asp, &addr, new->prefix->prefixlen);
			path_put(asp);
		} else
			up_generate(peer, new->aspath, &addr,
			    new->prefix->prefixlen);
	}
}
453 
454 /* send a default route to the specified peer */
455 void
456 up_generate_default(struct filter_head *rules, struct rde_peer *peer,
457     u_int8_t aid)
458 {
459 	struct rde_aspath	*asp, *fasp;
460 	struct bgpd_addr	 addr;
461 
462 	if (peer->capa.mp[aid] == 0)
463 		return;
464 
465 	asp = path_get();
466 	asp->aspath = aspath_get(NULL, 0);
467 	asp->origin = ORIGIN_IGP;
468 	/* the other default values are OK, nexthop is once again NULL */
469 
470 	/*
471 	 * XXX apply default overrides. Not yet possible, mainly a parse.y
472 	 * problem.
473 	 */
474 	/* rde_apply_set(asp, set, af, NULL ???, DIR_IN); */
475 
476 	/* filter as usual */
477 	bzero(&addr, sizeof(addr));
478 	addr.aid = aid;
479 
480 	if (rde_filter(rules, &fasp, peer, asp, &addr, 0, NULL) ==
481 	    ACTION_DENY) {
482 		path_put(fasp);
483 		path_put(asp);
484 		return;
485 	}
486 
487 	/* generate update */
488 	if (fasp != NULL)
489 		up_generate(peer, fasp, &addr, 0);
490 	else
491 		up_generate(peer, asp, &addr, 0);
492 
493 	/* no longer needed */
494 	path_put(fasp);
495 	path_put(asp);
496 }
497 
/*
 * generate a EoR marker in the update list. This is a horrible hack.
 * The marker is an all-zero update_attr (attr == NULL, attr_len == 0);
 * up_dump_attrnlri() and up_dump_mp_reach() later recognize it by the
 * zero attr_len when they dequeue it.  Returns 0 on success, -1 if the
 * RB tree insert failed.
 */
int
up_generate_marker(struct rde_peer *peer, u_int8_t aid)
{
	struct update_attr	*ua;
	struct update_attr	*na = NULL;
	struct uplist_attr	*upl = NULL;

	ua = calloc(1, sizeof(struct update_attr));
	if (ua == NULL)
		fatal("up_generate_marker");

	upl = &peer->updates[aid];

	/* 1. search for attr */
	if ((na = RB_FIND(uptree_attr, &peer->up_attrs, ua)) == NULL) {
		/* 1.1 if not found -> add */
		TAILQ_INIT(&ua->prefix_h);
		if (RB_INSERT(uptree_attr, &peer->up_attrs, ua) != NULL) {
			log_warnx("uptree_attr insert failed");
			/* cleanup */
			free(ua);
			return (-1);
		}
		TAILQ_INSERT_TAIL(upl, ua, attr_l);
		peer->up_acnt++;
	} else {
		/* 1.2 if found -> use that, free ua */
		free(ua);
		ua = na;
		/* move to end of update queue */
		TAILQ_REMOVE(upl, ua, attr_l);
		TAILQ_INSERT_TAIL(upl, ua, attr_l);
	}
	return (0);
}
534 
535 u_char	up_attr_buf[4096];
536 
537 /* only for IPv4 */
538 in_addr_t
539 up_get_nexthop(struct rde_peer *peer, struct rde_aspath *a)
540 {
541 	in_addr_t	mask;
542 
543 	/* nexthop, already network byte order */
544 	if (a->flags & F_NEXTHOP_NOMODIFY) {
545 		/* no modify flag set */
546 		if (a->nexthop == NULL)
547 			return (peer->local_v4_addr.v4.s_addr);
548 		else
549 			return (a->nexthop->exit_nexthop.v4.s_addr);
550 	} else if (a->flags & F_NEXTHOP_SELF)
551 		return (peer->local_v4_addr.v4.s_addr);
552 	else if (!peer->conf.ebgp) {
553 		/*
554 		 * If directly connected use peer->local_v4_addr
555 		 * this is only true for announced networks.
556 		 */
557 		if (a->nexthop == NULL)
558 			return (peer->local_v4_addr.v4.s_addr);
559 		else if (a->nexthop->exit_nexthop.v4.s_addr ==
560 		    peer->remote_addr.v4.s_addr)
561 			/*
562 			 * per RFC: if remote peer address is equal to
563 			 * the nexthop set the nexthop to our local address.
564 			 * This reduces the risk of routing loops.
565 			 */
566 			return (peer->local_v4_addr.v4.s_addr);
567 		else
568 			return (a->nexthop->exit_nexthop.v4.s_addr);
569 	} else if (peer->conf.distance == 1) {
570 		/* ebgp directly connected */
571 		if (a->nexthop != NULL &&
572 		    a->nexthop->flags & NEXTHOP_CONNECTED) {
573 			mask = htonl(
574 			    prefixlen2mask(a->nexthop->nexthop_netlen));
575 			if ((peer->remote_addr.v4.s_addr & mask) ==
576 			    (a->nexthop->nexthop_net.v4.s_addr & mask))
577 				/* nexthop and peer are in the same net */
578 				return (a->nexthop->exit_nexthop.v4.s_addr);
579 			else
580 				return (peer->local_v4_addr.v4.s_addr);
581 		} else
582 			return (peer->local_v4_addr.v4.s_addr);
583 	} else
584 		/* ebgp multihop */
585 		/*
586 		 * For ebgp multihop nh->flags should never have
587 		 * NEXTHOP_CONNECTED set so it should be possible to unify the
588 		 * two ebgp cases. But this is safe and RFC compliant.
589 		 */
590 		return (peer->local_v4_addr.v4.s_addr);
591 }
592 
/*
 * Build the MP_REACH_NLRI attribute payload for the given AID into
 * upa->mpattr: AFI, SAFI, nexthop length, nexthop and the reserved
 * byte.  The NLRI itself is appended later by up_dump_mp_reach().
 * Returns 0 on success, -1 for unsupported AIDs.
 */
int
up_generate_mp_reach(struct rde_peer *peer, struct update_attr *upa,
    struct rde_aspath *a, u_int8_t aid)
{
	u_int16_t	tmp;

	switch (aid) {
	case AID_INET6:
		/* layout: AFI[0-1] SAFI[2] nhlen[3] nexthop[4-19] resv[20] */
		upa->mpattr_len = 21; /* AFI + SAFI + NH LEN + NH + Reserved */
		upa->mpattr = malloc(upa->mpattr_len);
		if (upa->mpattr == NULL)
			fatal("up_generate_mp_reach");
		if (aid2afi(aid, &tmp, &upa->mpattr[2]))
			fatalx("up_generate_mp_reachi: bad AID");
		tmp = htons(tmp);
		memcpy(upa->mpattr, &tmp, sizeof(tmp));
		upa->mpattr[3] = sizeof(struct in6_addr);
		upa->mpattr[20] = 0; /* Reserved must be 0 */

		/* nexthop dance see also up_get_nexthop() */
		if (a->flags & F_NEXTHOP_NOMODIFY) {
			/* no modify flag set */
			if (a->nexthop == NULL)
				memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6,
				    sizeof(struct in6_addr));
			else
				memcpy(&upa->mpattr[4],
				    &a->nexthop->exit_nexthop.v6,
				    sizeof(struct in6_addr));
		} else if (a->flags & F_NEXTHOP_SELF)
			memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6,
			    sizeof(struct in6_addr));
		else if (!peer->conf.ebgp) {
			/* ibgp */
			if (a->nexthop == NULL ||
			    (a->nexthop->exit_nexthop.aid == AID_INET6 &&
			    !memcmp(&a->nexthop->exit_nexthop.v6,
			    &peer->remote_addr.v6, sizeof(struct in6_addr))))
				memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6,
				    sizeof(struct in6_addr));
			else
				memcpy(&upa->mpattr[4],
				    &a->nexthop->exit_nexthop.v6,
				    sizeof(struct in6_addr));
		} else if (peer->conf.distance == 1) {
			/* ebgp directly connected */
			if (a->nexthop != NULL &&
			    a->nexthop->flags & NEXTHOP_CONNECTED)
				if (prefix_compare(&peer->remote_addr,
				    &a->nexthop->nexthop_net,
				    a->nexthop->nexthop_netlen) == 0) {
					/*
					 * nexthop and peer are in the same
					 * subnet
					 */
					memcpy(&upa->mpattr[4],
					    &a->nexthop->exit_nexthop.v6,
					    sizeof(struct in6_addr));
					return (0);
				}
			memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6,
			    sizeof(struct in6_addr));
		} else
			/* ebgp multihop */
			memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6,
			    sizeof(struct in6_addr));
		return (0);
	case AID_VPN_IPv4:
		/*
		 * layout: AFI[0-1] SAFI[2] nhlen[3] RD[4-11] nexthop[12-15]
		 * resv[16]; calloc() leaves the RD and reserved byte zeroed.
		 */
		upa->mpattr_len = 17; /* AFI + SAFI + NH LEN + NH + Reserved */
		upa->mpattr = calloc(upa->mpattr_len, 1);
		if (upa->mpattr == NULL)
			fatal("up_generate_mp_reach");
		if (aid2afi(aid, &tmp, &upa->mpattr[2]))
			fatalx("up_generate_mp_reachi: bad AID");
		tmp = htons(tmp);
		memcpy(upa->mpattr, &tmp, sizeof(tmp));
		upa->mpattr[3] = sizeof(u_int64_t) + sizeof(struct in_addr);

		/* nexthop dance see also up_get_nexthop() */
		if (a->flags & F_NEXTHOP_NOMODIFY) {
			/* no modify flag set */
			if (a->nexthop == NULL)
				memcpy(&upa->mpattr[12],
				    &peer->local_v4_addr.v4,
				    sizeof(struct in_addr));
			else
				/* nexthops are stored as IPv4 addrs */
				memcpy(&upa->mpattr[12],
				    &a->nexthop->exit_nexthop.v4,
				    sizeof(struct in_addr));
		} else if (a->flags & F_NEXTHOP_SELF)
			memcpy(&upa->mpattr[12], &peer->local_v4_addr.v4,
			    sizeof(struct in_addr));
		else if (!peer->conf.ebgp) {
			/* ibgp */
			if (a->nexthop == NULL ||
			    (a->nexthop->exit_nexthop.aid == AID_INET &&
			    !memcmp(&a->nexthop->exit_nexthop.v4,
			    &peer->remote_addr.v4, sizeof(struct in_addr))))
				memcpy(&upa->mpattr[12],
				    &peer->local_v4_addr.v4,
				    sizeof(struct in_addr));
			else
				memcpy(&upa->mpattr[12],
				    &a->nexthop->exit_nexthop.v4,
				    sizeof(struct in_addr));
		} else if (peer->conf.distance == 1) {
			/* ebgp directly connected */
			if (a->nexthop != NULL &&
			    a->nexthop->flags & NEXTHOP_CONNECTED)
				if (prefix_compare(&peer->remote_addr,
				    &a->nexthop->nexthop_net,
				    a->nexthop->nexthop_netlen) == 0) {
					/*
					 * nexthop and peer are in the same
					 * subnet
					 */
					memcpy(&upa->mpattr[12],
					    &a->nexthop->exit_nexthop.v4,
					    sizeof(struct in_addr));
					return (0);
				}
			memcpy(&upa->mpattr[12], &peer->local_v4_addr.v4,
			    sizeof(struct in_addr));
		} else
			/* ebgp multihop */
			memcpy(&upa->mpattr[12], &peer->local_v4_addr.v4,
			    sizeof(struct in_addr));
		return (0);
	default:
		break;
	}
	return (-1);
}
727 
728 int
729 up_generate_attr(struct rde_peer *peer, struct update_attr *upa,
730     struct rde_aspath *a, u_int8_t aid)
731 {
732 	struct attr	*oa, *newaggr = NULL;
733 	u_char		*pdata;
734 	u_int32_t	 tmp32;
735 	in_addr_t	 nexthop;
736 	int		 flags, r, ismp = 0, neednewpath = 0;
737 	u_int16_t	 len = sizeof(up_attr_buf), wlen = 0, plen;
738 	u_int8_t	 l;
739 
740 	/* origin */
741 	if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN,
742 	    ATTR_ORIGIN, &a->origin, 1)) == -1)
743 		return (-1);
744 	wlen += r; len -= r;
745 
746 	/* aspath */
747 	if (!peer->conf.ebgp ||
748 	    peer->conf.flags & PEERFLAG_TRANS_AS)
749 		pdata = aspath_prepend(a->aspath, rde_local_as(), 0, &plen);
750 	else
751 		pdata = aspath_prepend(a->aspath, rde_local_as(), 1, &plen);
752 
753 	if (!rde_as4byte(peer))
754 		pdata = aspath_deflate(pdata, &plen, &neednewpath);
755 
756 	if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN,
757 	    ATTR_ASPATH, pdata, plen)) == -1)
758 		return (-1);
759 	wlen += r; len -= r;
760 	free(pdata);
761 
762 	switch (aid) {
763 	case AID_INET:
764 		nexthop = up_get_nexthop(peer, a);
765 		if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN,
766 		    ATTR_NEXTHOP, &nexthop, 4)) == -1)
767 			return (-1);
768 		wlen += r; len -= r;
769 		break;
770 	default:
771 		ismp = 1;
772 		break;
773 	}
774 
775 	/*
776 	 * The old MED from other peers MUST not be announced to others
777 	 * unless the MED is originating from us or the peer is an IBGP one.
778 	 * Only exception are routers with "transparent-as yes" set.
779 	 */
780 	if (a->flags & F_ATTR_MED && (!peer->conf.ebgp ||
781 	    a->flags & F_ATTR_MED_ANNOUNCE ||
782 	    peer->conf.flags & PEERFLAG_TRANS_AS)) {
783 		tmp32 = htonl(a->med);
784 		if ((r = attr_write(up_attr_buf + wlen, len, ATTR_OPTIONAL,
785 		    ATTR_MED, &tmp32, 4)) == -1)
786 			return (-1);
787 		wlen += r; len -= r;
788 	}
789 
790 	if (!peer->conf.ebgp) {
791 		/* local preference, only valid for ibgp */
792 		tmp32 = htonl(a->lpref);
793 		if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN,
794 		    ATTR_LOCALPREF, &tmp32, 4)) == -1)
795 			return (-1);
796 		wlen += r; len -= r;
797 	}
798 
799 	/*
800 	 * dump all other path attributes. Following rules apply:
801 	 *  1. well-known attrs: ATTR_ATOMIC_AGGREGATE and ATTR_AGGREGATOR
802 	 *     pass unmodified (enforce flags to correct values)
803 	 *     Actually ATTR_AGGREGATOR may be deflated for OLD 2-byte peers.
804 	 *  2. non-transitive attrs: don't re-announce to ebgp peers
805 	 *  3. transitive known attrs: announce unmodified
806 	 *  4. transitive unknown attrs: set partial bit and re-announce
807 	 */
808 	for (l = 0; l < a->others_len; l++) {
809 		if ((oa = a->others[l]) == NULL)
810 			break;
811 		switch (oa->type) {
812 		case ATTR_ATOMIC_AGGREGATE:
813 			if ((r = attr_write(up_attr_buf + wlen, len,
814 			    ATTR_WELL_KNOWN, ATTR_ATOMIC_AGGREGATE,
815 			    NULL, 0)) == -1)
816 				return (-1);
817 			break;
818 		case ATTR_AGGREGATOR:
819 			if (!rde_as4byte(peer)) {
820 				/* need to deflate the aggregator */
821 				u_int8_t	t[6];
822 				u_int16_t	tas;
823 
824 				if ((!(oa->flags & ATTR_TRANSITIVE)) &&
825 				    peer->conf.ebgp) {
826 					r = 0;
827 					break;
828 				}
829 
830 				memcpy(&tmp32, oa->data, sizeof(tmp32));
831 				if (ntohl(tmp32) > USHRT_MAX) {
832 					tas = htons(AS_TRANS);
833 					newaggr = oa;
834 				} else
835 					tas = htons(ntohl(tmp32));
836 
837 				memcpy(t, &tas, sizeof(tas));
838 				memcpy(t + sizeof(tas),
839 				    oa->data + sizeof(tmp32),
840 				    oa->len - sizeof(tmp32));
841 				if ((r = attr_write(up_attr_buf + wlen, len,
842 				    oa->flags, oa->type, &t, sizeof(t))) == -1)
843 					return (-1);
844 				break;
845 			}
846 			/* FALLTHROUGH */
847 		case ATTR_COMMUNITIES:
848 		case ATTR_ORIGINATOR_ID:
849 		case ATTR_CLUSTER_LIST:
850 			if ((!(oa->flags & ATTR_TRANSITIVE)) &&
851 			    peer->conf.ebgp) {
852 				r = 0;
853 				break;
854 			}
855 			if ((r = attr_write(up_attr_buf + wlen, len,
856 			    oa->flags, oa->type, oa->data, oa->len)) == -1)
857 				return (-1);
858 			break;
859 		default:
860 			/* unknown attribute */
861 			if (!(oa->flags & ATTR_TRANSITIVE)) {
862 				/*
863 				 * RFC 1771:
864 				 * Unrecognized non-transitive optional
865 				 * attributes must be quietly ignored and
866 				 * not passed along to other BGP peers.
867 				 */
868 				r = 0;
869 				break;
870 			}
871 			if ((r = attr_write(up_attr_buf + wlen, len,
872 			    oa->flags | ATTR_PARTIAL, oa->type,
873 			    oa->data, oa->len)) == -1)
874 				return (-1);
875 			break;
876 		}
877 		wlen += r; len -= r;
878 	}
879 
880 	/* NEW to OLD conversion when going sending stuff to a 2byte AS peer */
881 	if (neednewpath) {
882 		if (!peer->conf.ebgp ||
883 		    peer->conf.flags & PEERFLAG_TRANS_AS)
884 			pdata = aspath_prepend(a->aspath, rde_local_as(), 0,
885 			    &plen);
886 		else
887 			pdata = aspath_prepend(a->aspath, rde_local_as(), 1,
888 			    &plen);
889 		flags = ATTR_OPTIONAL|ATTR_TRANSITIVE;
890 		if (!(a->flags & F_PREFIX_ANNOUNCED))
891 			flags |= ATTR_PARTIAL;
892 		if (plen == 0)
893 			r = 0;
894 		else if ((r = attr_write(up_attr_buf + wlen, len, flags,
895 		    ATTR_AS4_PATH, pdata, plen)) == -1)
896 			return (-1);
897 		wlen += r; len -= r;
898 		free(pdata);
899 	}
900 	if (newaggr) {
901 		flags = ATTR_OPTIONAL|ATTR_TRANSITIVE;
902 		if (!(a->flags & F_PREFIX_ANNOUNCED))
903 			flags |= ATTR_PARTIAL;
904 		if ((r = attr_write(up_attr_buf + wlen, len, flags,
905 		    ATTR_AS4_AGGREGATOR, newaggr->data, newaggr->len)) == -1)
906 			return (-1);
907 		wlen += r; len -= r;
908 	}
909 
910 	/* write mp attribute to different buffer */
911 	if (ismp)
912 		if (up_generate_mp_reach(peer, upa, a, aid) == -1)
913 			return (-1);
914 
915 	/* the bgp path attributes are now stored in the global buf */
916 	upa->attr = malloc(wlen);
917 	if (upa->attr == NULL)
918 		fatal("up_generate_attr");
919 	memcpy(upa->attr, up_attr_buf, wlen);
920 	upa->attr_len = wlen;
921 	return (wlen);
922 }
923 
924 #define MIN_PREFIX_LEN	5	/* 1 byte prefix length + 4 bytes addr */
925 int
926 up_dump_prefix(u_char *buf, int len, struct uplist_prefix *prefix_head,
927     struct rde_peer *peer)
928 {
929 	struct update_prefix	*upp;
930 	int			 r, wpos = 0;
931 	u_int8_t		 i;
932 
933 	while ((upp = TAILQ_FIRST(prefix_head)) != NULL) {
934 		if ((r = prefix_write(buf + wpos, len - wpos,
935 		    &upp->prefix, upp->prefixlen)) == -1)
936 			break;
937 		wpos += r;
938 		if (RB_REMOVE(uptree_prefix, &peer->up_prefix, upp) == NULL)
939 			log_warnx("dequeuing update failed.");
940 		TAILQ_REMOVE(upp->prefix_h, upp, prefix_l);
941 		peer->up_pcnt--;
942 		for (i = 0; i < AID_MAX; i++) {
943 			if (upp->prefix_h == &peer->withdraws[i]) {
944 				peer->up_wcnt--;
945 				peer->prefix_sent_withdraw++;
946 			} else {
947 				peer->up_nlricnt--;
948 				peer->prefix_sent_update++;
949 			}
950 		}
951 		free(upp);
952 	}
953 	return (wpos);
954 }
955 
/*
 * Write one IPv4 update into buf of size len: the 2-byte total path
 * attribute length, the encoded attributes and as many NLRI prefixes
 * as fit.  Returns the number of bytes written -- 2 (a zero length
 * field) when nothing is pending or nothing fits -- or -1 when an EoR
 * marker (attr_len == 0, see up_generate_marker()) was dequeued.
 */
int
up_dump_attrnlri(u_char *buf, int len, struct rde_peer *peer)
{
	struct update_attr	*upa;
	int			 r, wpos;
	u_int16_t		 attr_len;

	/*
	 * It is possible that a queued path attribute has no nlri prefix.
	 * Ignore and remove those path attributes.
	 */
	while ((upa = TAILQ_FIRST(&peer->updates[AID_INET])) != NULL)
		if (TAILQ_EMPTY(&upa->prefix_h)) {
			attr_len = upa->attr_len;
			if (RB_REMOVE(uptree_attr, &peer->up_attrs,
			    upa) == NULL)
				log_warnx("dequeuing update failed.");
			TAILQ_REMOVE(&peer->updates[AID_INET], upa, attr_l);
			free(upa->attr);
			free(upa->mpattr);
			free(upa);
			peer->up_acnt--;
			/* XXX horrible hack,
			 * if attr_len is 0, it is a EoR marker */
			if (attr_len == 0)
				return (-1);
		} else
			break;

	if (upa == NULL || upa->attr_len + MIN_PREFIX_LEN > len) {
		/*
		 * either no packet or not enough space.
		 * The length field needs to be set to zero else it would be
		 * an invalid bgp update.
		 */
		bzero(buf, 2);
		return (2);
	}

	/* first dump the 2-byte path attribute length */
	attr_len = htons(upa->attr_len);
	memcpy(buf, &attr_len, 2);
	wpos = 2;

	/* then the path attributes themselves */
	memcpy(buf + wpos, upa->attr, upa->attr_len);
	wpos += upa->attr_len;

	/* last but not least dump the nlri */
	r = up_dump_prefix(buf + wpos, len - wpos, &upa->prefix_h, peer);
	wpos += r;

	/* now check if all prefixes were written */
	if (TAILQ_EMPTY(&upa->prefix_h)) {
		if (RB_REMOVE(uptree_attr, &peer->up_attrs, upa) == NULL)
			log_warnx("dequeuing update failed.");
		TAILQ_REMOVE(&peer->updates[AID_INET], upa, attr_l);
		free(upa->attr);
		free(upa->mpattr);
		free(upa);
		peer->up_acnt--;
	}

	return (wpos);
}
1021 
/*
 * Build an UPDATE body that carries only an MP_UNREACH_NLRI attribute
 * for the given AID.  The withdrawn prefixes are written first and the
 * headers are then prepended back to front in buf.  On success returns
 * a pointer to the start of the message body and sets *len to its total
 * length; returns NULL when the buffer is too small or no withdraw was
 * pending.
 */
u_char *
up_dump_mp_unreach(u_char *buf, u_int16_t *len, struct rde_peer *peer,
    u_int8_t aid)
{
	int		wpos;
	u_int16_t	datalen, tmp;
	u_int16_t	attrlen = 2;	/* attribute header (without len) */
	u_int8_t	flags = ATTR_OPTIONAL, safi;

	/*
	 * reserve space for withdraw len, attr len, the attribute header
	 * and the mp attribute header
	 */
	wpos = 2 + 2 + 4 + 3;

	if (*len < wpos)
		return (NULL);

	datalen = up_dump_prefix(buf + wpos, *len - wpos,
	    &peer->withdraws[aid], peer);
	if (datalen == 0)
		return (NULL);

	datalen += 3;	/* afi + safi */

	/* prepend header, need to do it reverse */
	/* safi & afi */
	if (aid2afi(aid, &tmp, &safi))
		fatalx("up_dump_mp_unreach: bad AID");
	buf[--wpos] = safi;
	wpos -= sizeof(u_int16_t);
	tmp = htons(tmp);
	memcpy(buf + wpos, &tmp, sizeof(u_int16_t));

	/* attribute length, extended when the payload exceeds one byte */
	if (datalen > 255) {
		attrlen += 2 + datalen;
		flags |= ATTR_EXTLEN;
		wpos -= sizeof(u_int16_t);
		tmp = htons(datalen);
		memcpy(buf + wpos, &tmp, sizeof(u_int16_t));
	} else {
		attrlen += 1 + datalen;
		buf[--wpos] = (u_char)datalen;
	}

	/* mp attribute */
	buf[--wpos] = (u_char)ATTR_MP_UNREACH_NLRI;
	buf[--wpos] = flags;

	/* attribute length */
	wpos -= sizeof(u_int16_t);
	tmp = htons(attrlen);
	memcpy(buf + wpos, &tmp, sizeof(u_int16_t));

	/* no IPv4 withdraws */
	wpos -= sizeof(u_int16_t);
	bzero(buf + wpos, sizeof(u_int16_t));

	if (wpos < 0)
		fatalx("up_dump_mp_unreach: buffer underflow");

	/* total length includes the two 2-bytes length fields. */
	*len = attrlen + 2 * sizeof(u_int16_t);

	return (buf + wpos);
}
1089 
/*
 * Write one multiprotocol update for the given AID into buf: the NLRI
 * is written first at a reserved offset, then the MP_REACH_NLRI payload,
 * the attribute header, the remaining path attributes and the length
 * fields are prepended back to front.  On success returns the offset of
 * the message start within buf and sets *len to the total length.
 * Returns -1 when an EoR marker (attr_len == 0) was dequeued and -2
 * when nothing is pending or the buffer is too small.
 */
int
up_dump_mp_reach(u_char *buf, u_int16_t *len, struct rde_peer *peer,
    u_int8_t aid)
{
	struct update_attr	*upa;
	int			wpos;
	u_int16_t		attr_len, datalen, tmp;
	u_int8_t		flags = ATTR_OPTIONAL;

	/*
	 * It is possible that a queued path attribute has no nlri prefix.
	 * Ignore and remove those path attributes.
	 */
	while ((upa = TAILQ_FIRST(&peer->updates[aid])) != NULL)
		if (TAILQ_EMPTY(&upa->prefix_h)) {
			attr_len = upa->attr_len;
			if (RB_REMOVE(uptree_attr, &peer->up_attrs,
			    upa) == NULL)
				log_warnx("dequeuing update failed.");
			TAILQ_REMOVE(&peer->updates[aid], upa, attr_l);
			free(upa->attr);
			free(upa->mpattr);
			free(upa);
			peer->up_acnt--;
			/* XXX horrible hack,
			 * if attr_len is 0, it is a EoR marker */
			if (attr_len == 0)
				return (-1);
		} else
			break;

	if (upa == NULL)
		return (-2);

	/*
	 * reserve space for attr len, the attributes, the
	 * mp attribute and the attribute header
	 */
	wpos = 2 + 2 + upa->attr_len + 4 + upa->mpattr_len;
	if (*len < wpos)
		return (-2);

	datalen = up_dump_prefix(buf + wpos, *len - wpos,
	    &upa->prefix_h, peer);
	if (datalen == 0)
		return (-2);

	if (upa->mpattr_len == 0 || upa->mpattr == NULL)
		fatalx("mulitprotocol update without MP attrs");

	/* the MP payload goes directly in front of the NLRI */
	datalen += upa->mpattr_len;
	wpos -= upa->mpattr_len;
	memcpy(buf + wpos, upa->mpattr, upa->mpattr_len);

	/* attribute length, extended when the payload exceeds one byte */
	if (datalen > 255) {
		wpos -= 2;
		tmp = htons(datalen);
		memcpy(buf + wpos, &tmp, sizeof(tmp));
		datalen += 4;
		flags |= ATTR_EXTLEN;
	} else {
		buf[--wpos] = (u_char)datalen;
		datalen += 3;
	}
	buf[--wpos] = (u_char)ATTR_MP_REACH_NLRI;
	buf[--wpos] = flags;

	/* the regular path attributes precede the MP attribute */
	datalen += upa->attr_len;
	wpos -= upa->attr_len;
	memcpy(buf + wpos, upa->attr, upa->attr_len);

	if (wpos < 4)
		fatalx("Grrr, mp_reach buffer fucked up");

	/* total path attribute length */
	wpos -= 2;
	tmp = htons(datalen);
	memcpy(buf + wpos, &tmp, sizeof(tmp));

	/* no IPv4 withdraws */
	wpos -= 2;
	bzero(buf + wpos, 2);

	/* now check if all prefixes were written */
	if (TAILQ_EMPTY(&upa->prefix_h)) {
		if (RB_REMOVE(uptree_attr, &peer->up_attrs, upa) == NULL)
			log_warnx("dequeuing update failed.");
		TAILQ_REMOVE(&peer->updates[aid], upa, attr_l);
		free(upa->attr);
		free(upa->mpattr);
		free(upa);
		peer->up_acnt--;
	}

	*len = datalen + 4;
	return (wpos);
}
1185