xref: /freebsd/sys/netlink/route/nexthop.c (revision 315ee00f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include "opt_netlink.h"
29 
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_route.h"
34 #include <sys/types.h>
35 #include <sys/ck.h>
36 #include <sys/epoch.h>
37 #include <sys/kernel.h>
38 #include <sys/malloc.h>
39 #include <sys/rmlock.h>
40 #include <sys/socket.h>
41 
42 #include <net/if.h>
43 #include <net/route.h>
44 #include <net/route/nhop.h>
45 #include <net/route/nhop_utils.h>
46 
47 #include <net/route/route_ctl.h>
48 #include <net/route/route_var.h>
49 #include <netinet6/scope6_var.h>
50 #include <netlink/netlink.h>
51 #include <netlink/netlink_ctl.h>
52 #include <netlink/netlink_route.h>
53 #include <netlink/route/route_var.h>
54 
55 #define	DEBUG_MOD_NAME	nl_nhop
56 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
57 #include <netlink/netlink_debug.h>
58 _DECLARE_DEBUG(LOG_INFO);
59 
60 /*
61  * This file contains the logic to maintain kernel nexthops and
62  *  nexthop groups based on the data provided by the user.
63  *
64  * The kernel stores (nearly) all of the routing data in the nexthops,
65  *  including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
66  *
67  * The Netlink API provides a higher-level abstraction for the user. Each
68  *  user-created nexthop may map to multiple kernel nexthops.
69  *
70  * The following variations require a separate kernel nexthop to be
71  *  created:
72  *  * prefix flags (NHF_HOST, NHF_DEFAULT)
73  *  * using IPv6 gateway for IPv4 routes
74  *  * different fibnum
75  *
76  * These kernel nexthops have their lifetime bound to the lifetime of
77  *  the user_nhop object. They are not collected until the user requests
78  *  deletion of the created user_nhop.
79  *
80  */
81 struct user_nhop {
82         uint32_t                        un_idx; /* Userland-provided index */
83 	uint32_t			un_fibfam; /* fibnum+af(as highest byte) */
84 	uint8_t				un_protocol; /* protocol that installed the record */
85 	struct nhop_object		*un_nhop; /* "production" nexthop */
86 	struct nhop_object		*un_nhop_src; /* nexthop to copy from */
87 	struct weightened_nhop		*un_nhgrp_src; /* nexthops for nhg */
88 	uint32_t			un_nhgrp_count; /* number of nexthops */
89         struct user_nhop		*un_next; /* next item in hash chain */
90         struct user_nhop		*un_nextchild; /* master -> children */
91 	struct epoch_context		un_epoch_ctx;	/* epoch ctl helper */
92 };
93 
94 /* produce hash value for an object */
95 #define	unhop_hash_obj(_obj)	(hash_unhop(_obj))
96 /* compare two objects */
97 #define	unhop_cmp(_one, _two)	(cmp_unhop(_one, _two))
98 /* next object accessor */
99 #define	unhop_next(_obj)	(_obj)->un_next
100 
101 CHT_SLIST_DEFINE(unhop, struct user_nhop);
102 
103 struct unhop_ctl {
104 	struct unhop_head	un_head;
105 	struct rmlock		un_lock;
106 };
107 #define	UN_LOCK_INIT(_ctl)	rm_init(&(_ctl)->un_lock, "unhop_ctl")
108 #define	UN_TRACKER		struct rm_priotracker un_tracker
109 #define	UN_RLOCK(_ctl)		rm_rlock(&((_ctl)->un_lock), &un_tracker)
110 #define	UN_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->un_lock), &un_tracker)
111 
112 #define	UN_WLOCK(_ctl)		rm_wlock(&(_ctl)->un_lock);
113 #define	UN_WUNLOCK(_ctl)	rm_wunlock(&(_ctl)->un_lock);
114 
115 VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
116 #define V_un_ctl	VNET(un_ctl)
117 
118 static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
119 static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
120 static unsigned int hash_unhop(const struct user_nhop *obj);
121 
122 static void destroy_unhop(struct user_nhop *unhop);
123 static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
124     uint32_t fibnum, int family, int nh_flags);
125 
126 static int
127 cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
128 {
129         return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam);
130 }
131 
132 /*
133  * Hash callback: calculate hash of an object
134  */
135 static unsigned int
136 hash_unhop(const struct user_nhop *obj)
137 {
138         return (obj->un_idx ^ obj->un_fibfam);
139 }
140 
141 #define	UNHOP_IS_MASTER(_unhop)	((_unhop)->un_fibfam == 0)
142 
143 /*
144  * Factory interface for creating matching kernel nexthops/nexthop groups
145  *
146  * @uidx: userland nexthop index used to create the nexthop
147  * @fibnum: fibnum nexthop will be used in
148  * @family: upper family nexthop will be used in
149  * @nh_flags: desired nexthop prefix flags
150  * @perror: pointer to store error to
151  *
152  * Returns referenced nexthop linked to @fibnum/@family rib on success.
153  */
154 struct nhop_object *
155 nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
156     int nh_flags, int *perror)
157 {
158 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
159         UN_TRACKER;
160 
161 	if (__predict_false(ctl == NULL))
162 		return (NULL);
163 
164 	struct user_nhop key= {
165 		.un_idx = uidx,
166 		.un_fibfam = fibnum  | ((uint32_t)family) << 24,
167 	};
168 	struct user_nhop *unhop;
169 
170 	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
171 
172 	if (__predict_false(family == 0))
173 		return (NULL);
174 
175 	UN_RLOCK(ctl);
176 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
177 	if (unhop != NULL) {
178 		struct nhop_object *nh = unhop->un_nhop;
179 		UN_RUNLOCK(ctl);
180 		*perror = 0;
181 		nhop_ref_any(nh);
182 		return (nh);
183 	}
184 
185 	/*
186 	 * Exact nexthop not found. Search for template nexthop to clone from.
187 	 */
188 	key.un_fibfam = 0;
189 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
190 	if (unhop == NULL) {
191 		UN_RUNLOCK(ctl);
192 		*perror = ESRCH;
193 		return (NULL);
194 	}
195 
196 	UN_RUNLOCK(ctl);
197 
198 	/* Create entry to insert first */
199 	struct user_nhop *un_new, *un_tmp;
200 	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
201 	if (un_new == NULL) {
202 		*perror = ENOMEM;
203 		return (NULL);
204 	}
205 	un_new->un_idx = uidx;
206 	un_new->un_fibfam = fibnum  | ((uint32_t)family) << 24;
207 
208 	/* Relying on epoch to protect unhop here */
209 	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
210 	if (un_new->un_nhop == NULL) {
211 		free(un_new, M_NETLINK);
212 		*perror = ENOMEM;
213 		return (NULL);
214 	}
215 
216 	/* Insert back and report */
217 	UN_WLOCK(ctl);
218 
219 	/* First, find template record once again */
220 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
221 	if (unhop == NULL) {
222 		/* Someone deleted the nexthop during the call */
223 		UN_WUNLOCK(ctl);
224 		*perror = ESRCH;
225 		destroy_unhop(un_new);
226 		return (NULL);
227 	}
228 
229 	/* Second, check the direct match */
230 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
231 	struct nhop_object *nh;
232 	if (un_tmp != NULL) {
233 		/* Another thread already created the desired nexthop, use it */
234 		nh = un_tmp->un_nhop;
235 	} else {
236 		/* Finally, insert the new nexthop and link it to the primary */
237 		nh = un_new->un_nhop;
238 		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
239 		un_new->un_nextchild = unhop->un_nextchild;
240 		unhop->un_nextchild = un_new;
241 		un_new = NULL;
242 		NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
243 	}
244 
245 	UN_WUNLOCK(ctl);
246 
247 	if (un_new != NULL)
248 		destroy_unhop(un_new);
249 
250 	*perror = 0;
251 	nhop_ref_any(nh);
252 	return (nh);
253 }
254 
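/*
 * Returns the base (fib/family-agnostic) user nexthop with index @uidx, if any.
 */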
255 static struct user_nhop *
256 nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
257 {
258 	struct user_nhop key= { .un_idx = uidx };
259 	struct user_nhop *unhop = NULL;
260 	UN_TRACKER;
261 
262 	UN_RLOCK(ctl);
263 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
264 	UN_RUNLOCK(ctl);
265 
266 	return (unhop);
267 }
268 
269 #define MAX_STACK_NHOPS	4
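/*
 * Clones the template user nexthop @unhop into a kernel nexthop (or nexthop
 *  group) bound to @fibnum/@family with prefix flags @nh_flags.
 * Group members are resolved recursively via nl_find_nhop().
 * Returns a referenced nexthop on success or NULL on failure.
 */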
270 static struct nhop_object *
271 clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
272 {
273 #ifdef ROUTE_MPATH
274 	const struct weightened_nhop *wn;
275 	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
276 	uint32_t num_nhops;
277 #endif
278 	struct nhop_object *nh = NULL;
279 	int error;
280 
281 	if (unhop->un_nhop_src != NULL) {
282 		IF_DEBUG_LEVEL(LOG_DEBUG2) {
283 			char nhbuf[NHOP_PRINT_BUFSIZE];
284 			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
285 			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
286 			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
287 			    family, nh_flags);
288 		}
289 		struct nhop_object *nh;
290 		nh = nhop_alloc(fibnum, AF_UNSPEC);
291 		if (nh == NULL)
292 			return (NULL);
293 		nhop_copy(nh, unhop->un_nhop_src);
294 		/* Check that nexthop gateway is compatible with the new family */
295 		if (!nhop_set_upper_family(nh, family)) {
296 			nhop_free(nh);
297 			return (NULL);
298 		}
299 		nhop_set_uidx(nh, unhop->un_idx);
300 		nhop_set_pxtype_flag(nh, nh_flags);
301 		return (nhop_get_nhop(nh, &error));
302 	}
303 #ifdef ROUTE_MPATH
304 	wn = unhop->un_nhgrp_src;
305 	num_nhops = unhop->un_nhgrp_count;
306 
307 	if (num_nhops > MAX_STACK_NHOPS) {
308 		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
309 		if (wn_new == NULL)
310 			return (NULL);
311 	} else
312 		wn_new = wn_base;
313 
314 	for (int i = 0; i < num_nhops; i++) {
315 		uint32_t uidx = nhop_get_uidx(wn[i].nh);
316 		MPASS(uidx != 0);
317 		wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
318 		if (error != 0)
319 			break;
320 		wn_new[i].weight = wn[i].weight;
321 	}
322 
323 	if (error == 0) {
324 		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
325 		struct nhgrp_object *nhg;
326 
327 		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
328 		nh = (struct nhop_object *)nhg;
329 	}
330 
331 	if (wn_new != wn_base)
332 		free(wn_new, M_TEMP);
333 #endif
334 	return (nh);
335 }
336 
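/*
 * Releases the kernel nexthop references held by @unhop and frees it.
 */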
337 static void
338 destroy_unhop(struct user_nhop *unhop)
339 {
340 	if (unhop->un_nhop != NULL)
341 		nhop_free_any(unhop->un_nhop);
342 	if (unhop->un_nhop_src != NULL)
343 		nhop_free_any(unhop->un_nhop_src);
344 	free(unhop, M_NETLINK);
345 }
346 
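/*
 * Epoch callback: destroys @unhop after the epoch grace period has passed.
 */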
347 static void
348 destroy_unhop_epoch(epoch_context_t ctx)
349 {
350 	struct user_nhop *unhop;
351 
352 	unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
353 
354 	destroy_unhop(unhop);
355 }
356 
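/*
 * Picks an unused userland nexthop index for auto-assigned (NHA_ID == 0)
 *  requests. Returns 0 if no spare index was found.
 */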
357 static uint32_t
358 find_spare_uidx(struct unhop_ctl *ctl)
359 {
360 	struct user_nhop *unhop, key = {};
361 	uint32_t uidx = 0;
362 	UN_TRACKER;
363 
364 	UN_RLOCK(ctl);
365 	/* This should return a spare uidx in ~99/100 cases, even with 75% of the 64K range used */
366 	for (int i = 0; i < 16; i++) {
367 		key.un_idx = (arc4random() % 65536) + 65536 * 4;
368 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
369 		if (unhop == NULL) {
370 			uidx = key.un_idx;
371 			break;
372 		}
373 	}
374 	UN_RUNLOCK(ctl);
375 
376 	return (uidx);
377 }
378 
379 
380 /*
381  * Actual netlink code
382  */
383 struct netlink_walkargs {
384 	struct nl_writer *nw;
385 	struct nlmsghdr hdr;
386 	struct nlpcb *so;
387 	int family;
388 	int error;
389 	int count;
390 	int dumped;
391 };
392 #define	ENOMEM_IF_NULL(_v)	if ((_v) == NULL) goto enomem
393 
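/*
 * Writes a nexthop-group message for the user-created group @unhop,
 *  using @hdr as the reply template.
 */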
394 static bool
395 dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
396     struct nl_writer *nw)
397 {
398 
399 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
400 		goto enomem;
401 
402 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
403 	nhm->nh_family = AF_UNSPEC;
404 	nhm->nh_scope = 0;
405 	nhm->nh_protocol = unhop->un_protocol;
406 	nhm->nh_flags = 0;
407 
408 	nlattr_add_u32(nw, NHA_ID, unhop->un_idx);
409 	nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH);
410 
411 	struct weightened_nhop *wn = unhop->un_nhgrp_src;
412 	uint32_t num_nhops = unhop->un_nhgrp_count;
413 	/* TODO: a better API? */
414 	int nla_len = sizeof(struct nlattr);
415 	nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp));
416 	struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr);
417 	if (nla == NULL)
418 		goto enomem;
419 	nla->nla_type = NHA_GROUP;
420 	nla->nla_len = nla_len;
421 	for (int i = 0; i < num_nhops; i++) {
422 		struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i];
423 		grp->id = nhop_get_uidx(wn[i].nh);
424 		grp->weight = wn[i].weight;
425 		grp->resvd1 = 0;
426 		grp->resvd2 = 0;
427 	}
428 
429         if (nlmsg_end(nw))
430 		return (true);
431 enomem:
432 	NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
433         nlmsg_abort(nw);
434 	return (false);
435 }
436 
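/*
 * Writes a nexthop message for @nh. @uidx is the userland index, or 0 when
 *  dumping kernel nexthops (in which case kernel-specific attributes such as
 *  NHAF_KID are included).
 */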
437 static bool
438 dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr,
439     struct nl_writer *nw)
440 {
441 	if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg)))
442 		goto enomem;
443 
444 	struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg);
445 	ENOMEM_IF_NULL(nhm);
446 	nhm->nh_family = nhop_get_neigh_family(nh);
447 	nhm->nh_scope = 0; // XXX: what's that?
448 	nhm->nh_protocol = nhop_get_origin(nh);
449 	nhm->nh_flags = 0;
450 
451 	if (uidx != 0)
452 		nlattr_add_u32(nw, NHA_ID, uidx);
453 	if (nh->nh_flags & NHF_BLACKHOLE) {
454 		nlattr_add_flag(nw, NHA_BLACKHOLE);
455 		goto done;
456 	}
457 	nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp));
458 
459 	switch (nh->gw_sa.sa_family) {
460 #ifdef INET
461 	case AF_INET:
462 		nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr);
463 		break;
464 #endif
465 #ifdef INET6
466 	case AF_INET6:
467 		{
468 			struct in6_addr addr = nh->gw6_sa.sin6_addr;
469 			in6_clearscope(&addr);
470 			nlattr_add(nw, NHA_GATEWAY, 16, &addr);
471 			break;
472 		}
473 #endif
474 	}
475 
476 	int off = nlattr_add_nested(nw, NHA_FREEBSD);
477 	if (off != 0) {
478 		nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp));
479 
480 		if (uidx == 0) {
481 			nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh));
482 			nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh));
483 			nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh));
484 		}
485 
486 		nlattr_set_len(nw, off);
487 	}
488 
489 done:
490         if (nlmsg_end(nw))
491 		return (true);
492 enomem:
493 	nlmsg_abort(nw);
494 	return (false);
495 }
496 
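/*
 * Dumps either the nexthop or the nexthop-group representation of @unhop.
 */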
497 static void
498 dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
499     struct nl_writer *nw)
500 {
501 	if (unhop->un_nhop_src != NULL)
502 		dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw);
503 	else
504 		dump_nhgrp(unhop, hdr, nw);
505 }
506 
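/*
 * Removes the user nexthop @uidx together with all of its per-fib/family
 *  children, notifies RTNLGRP_NEXTHOP listeners and schedules the destruction
 *  via epoch callback. Returns 0 on success or ENOENT if @uidx is not found.
 */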
507 static int
508 delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
509 {
510 	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
511 
512 	struct user_nhop key = { .un_idx = uidx };
513 
514 	UN_WLOCK(ctl);
515 
516 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
517 
518 	if (unhop_base != NULL) {
519 		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
520 		IF_DEBUG_LEVEL(LOG_DEBUG3) {
521 			char nhbuf[NHOP_PRINT_BUFSIZE];
522 			nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf));
523 			FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop,
524 			    "removed base nhop %u: %s", uidx, nhbuf);
525 		}
526 		/* Unlink all child nexthops as well, keeping the chain intact */
527 		unhop_chain = unhop_base->un_nextchild;
528 		while (unhop_chain != NULL) {
529 			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
530 			    unhop_ret);
531 			MPASS(unhop_chain == unhop_ret);
532 			IF_DEBUG_LEVEL(LOG_DEBUG3) {
533 				char nhbuf[NHOP_PRINT_BUFSIZE];
534 				nhop_print_buf_any(unhop_chain->un_nhop,
535 				    nhbuf, sizeof(nhbuf));
536 				FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop,
537 				    "removed child nhop %u: %s", uidx, nhbuf);
538 			}
539 			unhop_chain = unhop_chain->un_nextchild;
540 		}
541 	}
542 
543 	UN_WUNLOCK(ctl);
544 
545 	if (unhop_base == NULL) {
546 		NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
547 		return (ENOENT);
548 	}
549 
550 	/* Report nexthop deletion */
551 	struct netlink_walkargs wa = {
552 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
553 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
554 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
555 		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
556 	};
557 
558 	struct nl_writer nw = {};
559 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
560 		NL_LOG(LOG_DEBUG, "error allocating message writer");
561 		return (ENOMEM);
562 	}
563 
564 	dump_unhop(unhop_base, &wa.hdr, &nw);
565 	nlmsg_flush(&nw);
566 
567 	while (unhop_base != NULL) {
568 		unhop_chain = unhop_base->un_nextchild;
569 		NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx);
570 		unhop_base = unhop_chain;
571 	}
572 
573 	return (0);
574 }
575 
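/*
 * Grows the user nexthop hash to @new_size buckets if the new bucket array
 *  can be allocated; a no-op when @new_size is 0.
 */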
576 static void
577 consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
578 {
579 	void *new_ptr = NULL;
580 	size_t alloc_size;
581 
582         if (new_size == 0)
583                 return;
584 
585 	if (new_size != 0) {
586 		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size);
587 		new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
588                 if (new_ptr == NULL)
589                         return;
590 	}
591 
592 	NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
593 	UN_WLOCK(ctl);
594 	if (new_ptr != NULL) {
595 		CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size);
596 	}
597 	UN_WUNLOCK(ctl);
598 
599 
600 	if (new_ptr != NULL)
601 		free(new_ptr, M_NETLINK);
602 }
603 
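/*
 * Lazily allocates the per-VNET user nexthop hash, tolerating a racing
 *  initialization from another thread. Returns true if the hash is available.
 */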
604 static bool __noinline
605 vnet_init_unhops(void)
606 {
607         uint32_t num_buckets = 16;
608         size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
609 
610         struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
611             M_NOWAIT | M_ZERO);
612         if (ctl == NULL)
613                 return (false);
614 
615         void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
616         if (ptr == NULL) {
617 		free(ctl, M_NETLINK);
618                 return (false);
619 	}
620         CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
621 	UN_LOCK_INIT(ctl);
622 
623 	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
624                 free(ptr, M_NETLINK);
625                 free(ctl, M_NETLINK);
626 	}
627 
628 	if (atomic_load_ptr(&V_un_ctl) == NULL)
629 		return (false);
630 
631 	NL_LOG(LOG_NOTICE, "UNHOPS init done");
632 
633         return (true);
634 }
635 
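/*
 * VNET teardown handler: destroys all user nexthops and frees the hash.
 */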
636 static void
637 vnet_destroy_unhops(const void *unused __unused)
638 {
639 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
640 	struct user_nhop *unhop, *tmp;
641 
642 	if (ctl == NULL)
643 		return;
644 	V_un_ctl = NULL;
645 
646 	/* Wait till all unhop users finish their reads */
647 	NET_EPOCH_WAIT();
648 
649 	UN_WLOCK(ctl);
650 	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
651 		destroy_unhop(unhop);
652 	} CHT_SLIST_FOREACH_SAFE_END;
653 	UN_WUNLOCK(ctl);
654 
655 	free(ctl->un_head.ptr, M_NETLINK);
656 	free(ctl, M_NETLINK);
657 }
658 VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
659     vnet_destroy_unhops, NULL);
660 
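/*
 * Parser callback for NHA_GROUP: verifies that the payload is a non-empty
 *  array of struct nexthop_grp and stores the attribute pointer in @target.
 */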
661 static int
662 nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
663 {
664 	int error = 0;
665 
666 	/* Verify attribute correctness */
667 	struct nexthop_grp *grp = NLA_DATA(nla);
668 	int data_len = NLA_DATA_LEN(nla);
669 
670 	int count = data_len / sizeof(*grp);
671 	if (count == 0 || (count * sizeof(*grp) != data_len)) {
672 		NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len);
673 		return (EINVAL);
674 	}
675 
676 	*((struct nlattr **)target) = nla;
677 	return (error);
678 }
679 
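/*
 * Embeds the @ifp scope id into @sa if it is an IPv6 link-local address.
 */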
680 static void
681 set_scope6(struct sockaddr *sa, if_t ifp)
682 {
683 #ifdef INET6
684 	if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) {
685 		struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
686 
687 		if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr))
688 			in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp));
689 	}
690 #endif
691 }
692 
693 struct nl_parsed_nhop {
694 	uint32_t	nha_id;
695 	uint8_t		nha_blackhole;
696 	uint8_t		nha_groups;
697 	uint8_t		nhaf_knhops;
698 	uint8_t		nhaf_family;
699 	struct ifnet	*nha_oif;
700 	struct sockaddr	*nha_gw;
701 	struct nlattr	*nha_group;
702 	uint8_t		nh_family;
703 	uint8_t		nh_protocol;
704 	uint32_t	nhaf_table;
705 	uint32_t	nhaf_kid;
706 	uint32_t	nhaf_aif;
707 };
708 
709 #define	_IN(_field)	offsetof(struct nhmsg, _field)
710 #define	_OUT(_field)	offsetof(struct nl_parsed_nhop, _field)
711 static struct nlattr_parser nla_p_nh_fbsd[] = {
712 	{ .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag },
713 	{ .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 },
714 	{ .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 },
715 	{ .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 },
716 	{ .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 },
717 };
718 NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd);
719 
720 static const struct nlfield_parser nlf_p_nh[] = {
721 	{ .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 },
722 	{ .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 },
723 };
724 
725 static const struct nlattr_parser nla_p_nh[] = {
726 	{ .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 },
727 	{ .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg },
728 	{ .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag },
729 	{ .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp },
730 	{ .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip },
731 	{ .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag },
732 	{ .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested },
733 };
734 #undef _IN
735 #undef _OUT
736 
737 static bool
738 post_p_nh(void *_attrs, struct nl_pstate *npt)
739 {
740 	struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs;
741 
742 	set_scope6(attrs->nha_gw, attrs->nha_oif);
743 	return (true);
744 }
745 NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh);
746 
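/* Only gateway nexthops are currently allowed to be nexthop group members. */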
747 static bool
748 eligible_nhg(const struct nhop_object *nh)
749 {
750 	return (nh->nh_flags & NHF_GATEWAY);
751 }
752 
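/*
 * Builds the weighted nexthop array for a user nexthop group from the
 *  NHA_GROUP attribute. Every member must reference an existing,
 *  mpath-eligible base user nexthop; nested groups are not supported.
 */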
753 static int
754 newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
755 {
756 	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
757 	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
758 	struct weightened_nhop *wn;
759 
760 	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
761 	if (wn == NULL)
762 		return (ENOMEM);
763 
764 	for (int i = 0; i < count; i++) {
765 		struct user_nhop *unhop;
766 		unhop = nl_find_base_unhop(ctl, grp[i].id);
767 		if (unhop == NULL) {
768 			NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
769 			free(wn, M_NETLINK);
770 			return (ESRCH);
771 		} else if (unhop->un_nhop_src == NULL) {
772 			NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
773 			    grp[i].id);
774 			free(wn, M_NETLINK);
775 			return (ENOTSUP);
776 		} else if (!eligible_nhg(unhop->un_nhop_src)) {
777 			NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
778 			    grp[i].id);
779 			free(wn, M_NETLINK);
780 			return (ENOTSUP);
781 		}
782 		/*
783 		 * TODO: consider more rigid eligibility checks:
784 		 * restrict nexthops with the same gateway
785 		 */
786 		wn[i].nh = unhop->un_nhop_src;
787 		wn[i].weight = grp[i].weight;
788 	}
789 	unhop->un_nhgrp_src = wn;
790 	unhop->un_nhgrp_count = count;
791 	return (0);
792 }
793 
794 /*
795  * Sets the gateway of nexthop @nh to @gw.
796  * If the gateway is IPv6 link-local, alters @gw to include a scopeid equal
797  * to the @ifp ifindex.
798  * Returns 0 on success or errno.
799  */
800 int
801 nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp,
802     struct nl_pstate *npt)
803 {
804 #ifdef INET6
805 	if (gw->sa_family == AF_INET6) {
806 		struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw;
807 		if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) {
808 			if (ifp == NULL) {
809 				NLMSG_REPORT_ERR_MSG(npt, "interface not set");
810 				return (EINVAL);
811 			}
812 			in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp));
813 		}
814 	}
815 #endif
816 	nhop_set_gw(nh, gw, true);
817 	return (0);
818 }
819 
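/*
 * Creates an unlinked kernel nexthop (gateway/interface or blackhole) from
 *  the parsed attributes and stores it as the template nexthop of @unhop.
 */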
820 static int
821 newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt)
822 {
823 	struct ifaddr *ifa = NULL;
824 	struct nhop_object *nh;
825 	int error;
826 
827 	if (!attrs->nha_blackhole) {
828 		if (attrs->nha_gw == NULL) {
829 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY");
830 			return (EINVAL);
831 		}
832 		if (attrs->nha_oif == NULL) {
833 			NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF");
834 			return (EINVAL);
835 		}
836 		if (ifa == NULL)
837 			ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
838 		if (ifa == NULL) {
839 			NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP");
840 			return (EINVAL);
841 		}
842 	}
843 
844 	int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family;
845 
846 	nh = nhop_alloc(RT_DEFAULT_FIB, family);
847 	if (nh == NULL) {
848 		NL_LOG(LOG_DEBUG, "Unable to allocate nexthop");
849 		return (ENOMEM);
850 	}
851 	nhop_set_uidx(nh, attrs->nha_id);
852 	nhop_set_origin(nh, attrs->nh_protocol);
853 
854 	if (attrs->nha_blackhole)
855 		nhop_set_blackhole(nh, NHF_BLACKHOLE);
856 	else {
857 		error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt);
858 		if (error != 0) {
859 			nhop_free(nh);
860 			return (error);
861 		}
862 		nhop_set_transmit_ifp(nh, attrs->nha_oif);
863 		nhop_set_src(nh, ifa);
864 	}
865 
866 	error = nhop_get_unlinked(nh);
867 	if (error != 0) {
868 		NL_LOG(LOG_DEBUG, "unable to finalize nexthop");
869 		return (error);
870 	}
871 
872 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
873 		char nhbuf[NHOP_PRINT_BUFSIZE];
874 		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
875 		NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
876 	}
877 
878 	unhop->un_nhop_src = nh;
879 	return (0);
880 }
881 
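/*
 * RTM_NEWNEXTHOP handler: creates a user nexthop or nexthop group from the
 *  request, inserts it into the hash and broadcasts the addition to the
 *  RTNLGRP_NEXTHOP group.
 */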
882 static int
883 rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
884     struct nl_pstate *npt)
885 {
886 	struct user_nhop *unhop;
887 	int error;
888 
889         if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
890 		return (ENOMEM);
891 	struct unhop_ctl *ctl = V_un_ctl;
892 
893 	struct nl_parsed_nhop attrs = {};
894 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
895 	if (error != 0)
896 		return (error);
897 
898 	/*
899 	 * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
900 	 *  citizen.
901 	 */
902 	if (attrs.nha_id == 0) {
903 		attrs.nha_id = find_spare_uidx(ctl);
904 		if (attrs.nha_id == 0) {
905 			NL_LOG(LOG_DEBUG, "Unable to get spare uidx");
906 			return (ENOSPC);
907 		}
908 	}
909 
910 	NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0);
911 
912 	unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
913 	if (unhop == NULL) {
914 		NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
915 		return (ENOMEM);
916 	}
917 	unhop->un_idx = attrs.nha_id;
918 	unhop->un_protocol = attrs.nh_protocol;
919 
920 	if (attrs.nha_group)
921 		error = newnhg(ctl, &attrs, unhop);
922 	else
923 		error = newnhop(&attrs, unhop, npt);
924 
925 	if (error != 0) {
926 		free(unhop, M_NETLINK);
927 		return (error);
928 	}
929 
930 	UN_WLOCK(ctl);
931 	/* Check if uidx already exists */
932 	struct user_nhop *tmp = NULL;
933 	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
934 	if (tmp != NULL) {
935 		UN_WUNLOCK(ctl);
936 		NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
937 		destroy_unhop(unhop);
938 		return (EEXIST);
939 	}
940 	CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
941 	uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
942 	UN_WUNLOCK(ctl);
943 
944 	/* Report addition of the new nexthop */
945 	struct netlink_walkargs wa = {
946 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
947 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
948 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
949 		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
950 	};
951 
952 	struct nl_writer nw = {};
953 	if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) {
954 		NL_LOG(LOG_DEBUG, "error allocating message writer");
955 		return (ENOMEM);
956 	}
957 
958 	dump_unhop(unhop, &wa.hdr, &nw);
959 	nlmsg_flush(&nw);
960 
961 	consider_resize(ctl, num_buckets_new);
962 
963         return (0);
964 }
965 
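/*
 * RTM_DELNEXTHOP handler: deletes the user nexthop identified by NHA_ID.
 */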
966 static int
967 rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
968     struct nl_pstate *npt)
969 {
970 	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
971 	int error;
972 
973 	if (__predict_false(ctl == NULL))
974 		return (ESRCH);
975 
976 	struct nl_parsed_nhop attrs = {};
977 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
978 	if (error != 0)
979 		return (error);
980 
981 	if (attrs.nha_id == 0) {
982 		NL_LOG(LOG_DEBUG, "NHA_ID not set");
983 		return (EINVAL);
984 	}
985 
986 	error = delete_unhop(ctl, hdr, attrs.nha_id);
987 
988         return (error);
989 }
990 
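/*
 * Returns true if @unhop matches the NHA_ID/NHA_GROUPS/NHA_OIF dump filters.
 */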
991 static bool
992 match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
993 {
994 	if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id)
995 		return (false);
996 	if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL)
997 		return (false);
998 	if (attrs->nha_oif != NULL &&
999 	    (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif))
1000 		return (false);
1001 
1002 	return (true);
1003 }
1004 
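/*
 * RTM_GETNEXTHOP handler: dumps a single user nexthop by NHA_ID, a kernel
 *  nexthop by NHAF_KID, all kernel nexthops when NHAF_KNHOPS is set, or all
 *  user-created nexthops otherwise.
 */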
1005 static int
1006 rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
1007     struct nl_pstate *npt)
1008 {
1009 	struct user_nhop *unhop;
1010 	UN_TRACKER;
1011 	int error;
1012 
1013 	struct nl_parsed_nhop attrs = {};
1014 	error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs);
1015 	if (error != 0)
1016 		return (error);
1017 
1018 	struct netlink_walkargs wa = {
1019 		.nw = npt->nw,
1020 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
1021 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
1022 		.hdr.nlmsg_flags = hdr->nlmsg_flags,
1023 		.hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
1024 	};
1025 
1026 	if (attrs.nha_id != 0) {
1027 		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
1028 		struct user_nhop key = { .un_idx = attrs.nha_id };
1029 
1030 		if (__predict_false(ctl == NULL))
1031 			return (ESRCH);
1032 
1033 		NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
1034 		UN_RLOCK(ctl);
1035 		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
1036 		UN_RUNLOCK(ctl);
1037 
1038 		if (unhop == NULL)
1039 			return (ESRCH);
1040 		dump_unhop(unhop, &wa.hdr, wa.nw);
1041 		return (0);
1042 	} else if (attrs.nhaf_kid != 0) {
1043 		struct nhop_iter iter = {
1044 			.fibnum = attrs.nhaf_table,
1045 			.family = attrs.nhaf_family,
1046 		};
1047 		int error = ESRCH;
1048 
1049 		NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
1050 		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
1051 		    nh = nhops_iter_next(&iter)) {
1052 			NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh));
1053 			if (nhop_get_idx(nh) == attrs.nhaf_kid) {
1054 				dump_nhop(nh, 0, &wa.hdr, wa.nw);
1055 				error = 0;
1056 				break;
1057 			}
1058 		}
1059 		nhops_iter_stop(&iter);
1060 		return (error);
1061 	} else if (attrs.nhaf_knhops) {
1062 		struct nhop_iter iter = {
1063 			.fibnum = attrs.nhaf_table,
1064 			.family = attrs.nhaf_family,
1065 		};
1066 
1067 		NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family);
1068 		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
1069 		for (struct nhop_object *nh = nhops_iter_start(&iter); nh;
1070 		    nh = nhops_iter_next(&iter)) {
1071 			dump_nhop(nh, 0, &wa.hdr, wa.nw);
1072 		}
1073 		nhops_iter_stop(&iter);
1074 	} else {
1075 		struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
1076 
1077 		if (__predict_false(ctl == NULL))
1078 			return (ESRCH);
1079 
1080 		NL_LOG(LOG_DEBUG2, "DUMP unhops");
1081 		UN_RLOCK(ctl);
1082 		wa.hdr.nlmsg_flags |= NLM_F_MULTI;
1083 		CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
1084 			if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
1085 				dump_unhop(unhop, &wa.hdr, wa.nw);
1086 		} CHT_SLIST_FOREACH_END;
1087 		UN_RUNLOCK(ctl);
1088 	}
1089 
1090 	if (wa.error == 0) {
1091 		if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr))
1092 			return (ENOMEM);
1093 	}
1094         return (0);
1095 }
1096 
1097 static const struct rtnl_cmd_handler cmd_handlers[] = {
1098 	{
1099 		.cmd = NL_RTM_NEWNEXTHOP,
1100 		.name = "RTM_NEWNEXTHOP",
1101 		.cb = &rtnl_handle_newnhop,
1102 		.priv = PRIV_NET_ROUTE,
1103 	},
1104 	{
1105 		.cmd = NL_RTM_DELNEXTHOP,
1106 		.name = "RTM_DELNEXTHOP",
1107 		.cb = &rtnl_handle_delnhop,
1108 		.priv = PRIV_NET_ROUTE,
1109 	},
1110 	{
1111 		.cmd = NL_RTM_GETNEXTHOP,
1112 		.name = "RTM_GETNEXTHOP",
1113 		.cb = &rtnl_handle_getnhop,
1114 	}
1115 };
1116 
1117 static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser };
1118 
1119 void
1120 rtnl_nexthops_init(void)
1121 {
1122 	NL_VERIFY_PARSERS(all_parsers);
1123 	rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers));
1124 }
1125