xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision 06c3fb27)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 #include "opt_inet.h"
28 #include "opt_route.h"
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/lock.h>
33 #include <sys/rmlock.h>
34 #include <sys/malloc.h>
35 #include <sys/mbuf.h>
36 #include <sys/refcount.h>
37 #include <sys/socket.h>
38 #include <sys/sysctl.h>
39 #include <sys/kernel.h>
40 #include <sys/epoch.h>
41 
42 #include <net/if.h>
43 #include <net/if_var.h>
44 #include <net/if_private.h>
45 #include <net/route.h>
46 #include <net/route/route_ctl.h>
47 #include <net/route/route_var.h>
48 #include <net/vnet.h>
49 
50 #include <netinet/in.h>
51 #include <netinet/in_var.h>
52 #include <netinet/in_fib.h>
53 
54 #include <net/route/nhop_utils.h>
55 #include <net/route/nhop.h>
56 #include <net/route/nhop_var.h>
57 #include <net/route/nhgrp_var.h>
58 
59 #define	DEBUG_MOD_NAME	nhgrp_ctl
60 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
61 #include <net/route/route_debug.h>
62 _DECLARE_DEBUG(LOG_INFO);
63 
64 /*
65  * This file contains the supporting functions for creating multipath groups
66  *  and compiling their dataplane parts.
67  */
68 
69 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
70 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
71     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
72 /* Offset and size of flags field has to be the same for nhop/nhop groups */
73 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
74 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
75 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
76 
77 static int wn_cmp_idx(const void *a, const void *b);
78 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
79 
80 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
81     struct weightened_nhop *wn, int num_nhops, uint32_t uidx, int *perror);
82 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
83 static void destroy_nhgrp_epoch(epoch_context_t ctx);
84 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
85 
86 static int
87 wn_cmp_idx(const void *a, const void *b)
88 {
89 	const struct weightened_nhop *w_a = a;
90 	const struct weightened_nhop *w_b = b;
91 	uint32_t a_idx = w_a->nh->nh_priv->nh_idx;
92 	uint32_t b_idx = w_b->nh->nh_priv->nh_idx;
93 
94 	if (a_idx < b_idx)
95 		return (-1);
96 	else if (a_idx > b_idx)
97 		return (1);
98 	else
99 		return (0);
100 }
101 
102 /*
103  * Perform in-place sorting for array of nexthops in @wn.
104  * Sort by nexthop index ascending.
105  */
106 static void
107 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
108 {
109 
110 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp_idx);
111 }
112 
113 /*
114  * In order to determine the minimum weight difference in the array
115  * of weights, create a sorted array of weights, using spare "storage"
116  * field in the `struct weightened_nhop`.
117  * Assume weights to be (mostly) the same and use insertion sort to
118  * make it sorted.
119  */
120 static void
121 sort_weightened_nhops_weights(struct weightened_nhop *wn, int num_items)
122 {
123 	wn[0].storage = wn[0].weight;
124 	for (int i = 1, j = 0; i < num_items; i++) {
125 		uint32_t weight = wn[i].weight; // read from 'weight' as it's not reordered
126 		/* Move all weights > weight 1 position right */
127 		for (j = i - 1; j >= 0 && wn[j].storage > weight; j--)
128 			wn[j + 1].storage = wn[j].storage;
129 		wn[j + 1].storage = weight;
130 	}
131 }
132 
133 /*
134  * Calculate minimum number of slots required to fit the existing
135  * set of weights in the common use case where weights are "easily"
136  * comparable.
137  * Assumes @wn is sorted by weight ascending and each weight is > 0.
138  * Returns number of slots or 0 if precise calculation failed.
139  *
140  * Some examples:
141  * note: (i, X) pair means (nhop=i, weight=X):
142  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
143  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
144  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
145  */
146 static uint32_t
147 calc_min_mpath_slots_fast(struct weightened_nhop *wn, size_t num_items,
148     uint64_t *ptotal)
149 {
150 	uint32_t i, last, xmin;
151 	uint64_t total = 0;
152 
153 	// Get sorted array of weights in .storage field
154 	sort_weightened_nhops_weights(wn, num_items);
155 
156 	last = 0;
157 	xmin = wn[0].storage;
158 	for (i = 0; i < num_items; i++) {
159 		total += wn[i].storage;
160 		if ((wn[i].storage != last) &&
161 		    ((wn[i].storage - last < xmin) || xmin == 0)) {
162 			xmin = wn[i].storage - last;
163 		}
164 		last = wn[i].storage;
165 	}
166 	*ptotal = total;
167 	/* xmin is the minimum unit of desired capacity */
168 	if ((total % xmin) != 0)
169 		return (0);
170 	for (i = 0; i < num_items; i++) {
171 		if ((wn[i].weight % xmin) != 0)
172 			return (0);
173 	}
174 
175 	return ((uint32_t)(total / xmin));
176 }
177 
178 /*
179  * Calculate minimum number of slots required to fit the existing
180  * set of weights while maintaining weight coefficients.
181  *
182  * Assume @wn is sorted by weight ascending and each weight is > 0.
183  *
184  * Tries to find simple precise solution first and falls back to
185  *  RIB_MAX_MPATH_WIDTH in case of any failure.
186  */
187 static uint32_t
188 calc_min_mpath_slots(struct weightened_nhop *wn, size_t num_items)
189 {
190 	uint32_t v;
191 	uint64_t total;
192 
193 	v = calc_min_mpath_slots_fast(wn, num_items, &total);
194 	if (total == 0)
195 		return (0);
196 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
197 		v = RIB_MAX_MPATH_WIDTH;
198 
199 	return (v);
200 }
201 
202 /*
203  * Nexthop group data consists of
204  * 1) dataplane part, with nhgrp_object as a header followed by an
205  *   arbitrary number of nexthop pointers.
206  * 2) control plane part, with nhgrp_priv as a header, followed by
207  *   an arbirtrary number of 'struct weightened_nhop' object.
208  *
209  * Given nexthop groups are (mostly) immutable, allocate all data
210  * in one go.
211  *
212  */
213 __noinline static size_t
214 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
215 {
216 	size_t sz;
217 
218 	sz = sizeof(struct nhgrp_object);
219 	sz += nhg_size * sizeof(struct nhop_object *);
220 	sz += sizeof(struct nhgrp_priv);
221 	sz += num_nhops * sizeof(struct weightened_nhop);
222 	return (sz);
223 }
224 
225 /*
226  * Compile actual list of nexthops to be used by datapath from
227  *  the nexthop group @dst.
228  *
229  * For example, compiling control plane list of 2 nexthops
230  *  [(200, A), (100, B)] would result in the datapath array
231  *  [A, A, B]
232  */
233 static void
234 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
235     uint32_t num_slots)
236 {
237 	struct nhgrp_object *dst;
238 	int i, slot_idx, remaining_slots;
239 	uint64_t remaining_sum, nh_weight, nh_slots;
240 
241 	slot_idx  = 0;
242 	dst = dst_priv->nhg;
243 	/* Calculate sum of all weights */
244 	remaining_sum = 0;
245 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
246 		remaining_sum += x[i].weight;
247 	remaining_slots = num_slots;
248 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d",
249 	    remaining_sum, remaining_slots);
250 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
251 		/* Calculate number of slots for the current nexthop */
252 		if (remaining_sum > 0) {
253 			nh_weight = (uint64_t)x[i].weight;
254 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
255 		} else
256 			nh_slots = 0;
257 
258 		remaining_sum -= x[i].weight;
259 		remaining_slots -= nh_slots;
260 
261 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
262 		    " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d",
263 		    remaining_sum, remaining_slots, (int)nh_slots, slot_idx);
264 
265 		KASSERT((slot_idx + nh_slots <= num_slots),
266 		    ("index overflow during nhg compilation"));
267 		while (nh_slots-- > 0)
268 			dst->nhops[slot_idx++] = x[i].nh;
269 	}
270 }
271 
272 /*
273  * Allocates new nexthop group for the list of weightened nexthops.
274  * Assume sorted list.
275  * Does NOT reference any nexthops in the group.
276  * Returns group with refcount=1 or NULL.
277  */
278 static struct nhgrp_priv *
279 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
280 {
281 	uint32_t nhgrp_size;
282 	struct nhgrp_object *nhg;
283 	struct nhgrp_priv *nhg_priv;
284 
285 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
286 	if (nhgrp_size == 0) {
287 		/* Zero weights, abort */
288 		return (NULL);
289 	}
290 
291 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
292 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
293 	if (nhg == NULL) {
294 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
295 		    "unable to allocate group with num_nhops %d (compiled %u)",
296 		    num_nhops, nhgrp_size);
297 		return (NULL);
298 	}
299 
300 	/* Has to be the first to make NHGRP_PRIV() work */
301 	nhg->nhg_size = nhgrp_size;
302 	nhg->nhg_flags = MPF_MULTIPATH;
303 
304 	nhg_priv = NHGRP_PRIV(nhg);
305 	nhg_priv->nhg_nh_count = num_nhops;
306 	refcount_init(&nhg_priv->nhg_refcount, 1);
307 
308 	/* Please see nhgrp_free() comments on the initial value */
309 	refcount_init(&nhg_priv->nhg_linked, 2);
310 
311 	nhg_priv->nhg = nhg;
312 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
313 	  num_nhops * sizeof(struct weightened_nhop));
314 
315 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
316 	    num_nhops, nhgrp_size);
317 
318 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
319 
320 	return (nhg_priv);
321 }
322 
323 void
324 nhgrp_ref_object(struct nhgrp_object *nhg)
325 {
326 	struct nhgrp_priv *nhg_priv;
327 	u_int old __diagused;
328 
329 	nhg_priv = NHGRP_PRIV(nhg);
330 	old = refcount_acquire(&nhg_priv->nhg_refcount);
331 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
332 }
333 
334 void
335 nhgrp_free(struct nhgrp_object *nhg)
336 {
337 	struct nhgrp_priv *nhg_priv;
338 	struct nh_control *ctl;
339 	struct epoch_tracker et;
340 
341 	nhg_priv = NHGRP_PRIV(nhg);
342 
343 	if (!refcount_release(&nhg_priv->nhg_refcount))
344 		return;
345 
346 	/*
347 	 * group objects don't have an explicit lock attached to it.
348 	 * As groups are reclaimed based on reference count, it is possible
349 	 * that some groups will persist after vnet destruction callback
350 	 * called. Given that, handle scenario with nhgrp_free_group() being
351 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
352 	 * by using another reference counter: nhg_linked.
353 	 *
354 	 * There are only 2 places, where nhg_linked can be decreased:
355 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
356 	 * nhg_link can never be increased.
357 	 *
358 	 * Hence, use initial value of 2 to make use of
359 	 *  refcount_release_if_not_last().
360 	 *
361 	 * There can be two scenarious when calling this function:
362 	 *
363 	 * 1) nhg_linked value is 2. This means that either
364 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
365 	 *  but we are guaranteed that nh_control won't be freed in
366 	 *  this epoch. Hence, nexthop can be safely unlinked.
367 	 *
368 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
369 	 *  has been called and nhgrp unlink can be skipped.
370 	 */
371 
372 	NET_EPOCH_ENTER(et);
373 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
374 		ctl = nhg_priv->nh_control;
375 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
376 			/* Do not try to reclaim */
377 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
378 			    nhg_priv);
379 			NET_EPOCH_EXIT(et);
380 			return;
381 		}
382 		MPASS((nhg_priv->nhg_idx == 0));
383 		MPASS((nhg_priv->nhg_refcount == 0));
384 	}
385 	NET_EPOCH_EXIT(et);
386 
387 	NET_EPOCH_CALL(destroy_nhgrp_epoch, &nhg_priv->nhg_epoch_ctx);
388 }
389 
390 /*
391  * Destroys all local resources belonging to @nhg_priv.
392  */
393 __noinline static void
394 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
395 {
396 
397 	free(nhg_priv->nhg, M_NHOP);
398 }
399 
400 __noinline static void
401 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
402 {
403 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
404 		char nhgbuf[NHOP_PRINT_BUFSIZE] __unused;
405 		FIB_NH_LOG(LOG_DEBUG2, nhg_priv->nhg_nh_weights[0].nh,
406 		    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
407 		    nhgbuf, sizeof(nhgbuf)));
408 	}
409 
410 	free_nhgrp_nhops(nhg_priv);
411 	destroy_nhgrp_int(nhg_priv);
412 }
413 
414 /*
415  * Epoch callback indicating group is safe to destroy
416  */
417 static void
418 destroy_nhgrp_epoch(epoch_context_t ctx)
419 {
420 	struct nhgrp_priv *nhg_priv;
421 
422 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
423 
424 	destroy_nhgrp(nhg_priv);
425 }
426 
427 static bool
428 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
429 {
430 
431 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
432 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
433 			continue;
434 
435 		/*
436 		 * Failed to ref the nexthop, b/c it's deleted.
437 		 * Need to rollback references back.
438 		 */
439 		for (int j = 0; j < i; j++)
440 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
441 		return (false);
442 	}
443 
444 	return (true);
445 }
446 
447 static void
448 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
449 {
450 
451 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
452 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
453 }
454 
455 /*
456  * Allocate nexthop group of size @num_nhops with nexthops specified by
457  * @wn. Nexthops have to be unique and match the fibnum/family of the group.
458  * Returns unlinked nhgrp object on success or NULL and non-zero perror.
459  */
460 struct nhgrp_object *
461 nhgrp_alloc(uint32_t fibnum, int family, struct weightened_nhop *wn, int num_nhops,
462     int *perror)
463 {
464 	struct rib_head *rh = rt_tables_get_rnh(fibnum, family);
465 	struct nhgrp_priv *nhg_priv;
466 	struct nh_control *ctl;
467 
468 	if (rh == NULL) {
469 		*perror = E2BIG;
470 		return (NULL);
471 	}
472 
473 	ctl = rh->nh_control;
474 
475 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
476 		*perror = E2BIG;
477 		return (NULL);
478 	}
479 
480 	if (ctl->gr_head.hash_size == 0) {
481 		/* First multipath request. Bootstrap mpath datastructures. */
482 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
483 			*perror = ENOMEM;
484 			return (NULL);
485 		}
486 	}
487 
488 	/* Sort nexthops & check there are no duplicates */
489 	sort_weightened_nhops(wn, num_nhops);
490 	uint32_t last_id = 0;
491 	for (int i = 0; i < num_nhops; i++) {
492 		if (wn[i].nh->nh_priv->nh_control != ctl) {
493 			*perror = EINVAL;
494 			return (NULL);
495 		}
496 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
497 			*perror = EEXIST;
498 			return (NULL);
499 		}
500 		last_id = wn[i].nh->nh_priv->nh_idx;
501 	}
502 
503 	if ((nhg_priv = alloc_nhgrp(wn, num_nhops)) == NULL) {
504 		*perror = ENOMEM;
505 		return (NULL);
506 	}
507 	nhg_priv->nh_control = ctl;
508 
509 	*perror = 0;
510 	return (nhg_priv->nhg);
511 }
512 
513 /*
514  * Finds an existing group matching @nhg or links @nhg to the tree.
515  * Returns the referenced group or NULL and non-zero @perror.
516  */
517 struct nhgrp_object *
518 nhgrp_get_nhgrp(struct nhgrp_object *nhg, int *perror)
519 {
520 	struct nhgrp_priv *nhg_priv, *key = NHGRP_PRIV(nhg);
521 	struct nh_control *ctl = key->nh_control;
522 
523 	nhg_priv = find_nhgrp(ctl, key);
524 	if (nhg_priv != NULL) {
525 		/*
526 		 * Free originally-created group. As it hasn't been linked
527 		 *  and the dependent nexhops haven't been referenced, just free
528 		 *  the group.
529 		 */
530 		destroy_nhgrp_int(key);
531 		*perror = 0;
532 		return (nhg_priv->nhg);
533 	} else {
534 		/* No existing group, try to link the new one */
535 		if (!ref_nhgrp_nhops(key)) {
536 			/*
537 			 * Some of the nexthops have been scheduled for deletion.
538 			 * As the group hasn't been linked / no nexhops have been
539 			 *  referenced, call the final destructor immediately.
540 			 */
541 			destroy_nhgrp_int(key);
542 			*perror = EAGAIN;
543 			return (NULL);
544 		}
545 		if (link_nhgrp(ctl, key) == 0) {
546 			/* Unable to allocate index? */
547 			*perror = EAGAIN;
548 			free_nhgrp_nhops(key);
549 			destroy_nhgrp_int(key);
550 			return (NULL);
551 		}
552 		*perror = 0;
553 		return (nhg);
554 	}
555 
556 	/* NOTREACHED */
557 }
558 
559 /*
560  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
561  *
562  * Returns referenced nhop group or NULL, passing error code in @perror.
563  */
564 struct nhgrp_priv *
565 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
566     uint32_t uidx, int *perror)
567 {
568 	struct nhgrp_object *nhg;
569 
570 	nhg = nhgrp_alloc(ctl->ctl_rh->rib_fibnum, ctl->ctl_rh->rib_family,
571 	    wn, num_nhops, perror);
572 	if (nhg == NULL)
573 		return (NULL);
574 	nhgrp_set_uidx(nhg, uidx);
575 	nhg = nhgrp_get_nhgrp(nhg, perror);
576 	if (nhg != NULL)
577 		return (NHGRP_PRIV(nhg));
578 	return (NULL);
579 }
580 
581 
582 /*
583  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
584  *
585  * Returns referenced nexthop group or NULL. In the latter case, @perror is
586  *  filled with an error code.
587  * Note that function does NOT care if the next nexthops already exists
588  * in the @gr_orig. As a result, they will be added, resulting in the
589  * same nexthop being present multiple times in the new group.
590  */
591 static struct nhgrp_priv *
592 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
593     struct weightened_nhop *wn, int num_nhops, int *perror)
594 {
595 	char storage[64];
596 	struct weightened_nhop *pnhops;
597 	struct nhgrp_priv *nhg_priv;
598 	const struct nhgrp_priv *src_priv;
599 	size_t sz;
600 	int curr_nhops;
601 
602 	src_priv = NHGRP_PRIV_CONST(gr_orig);
603 	curr_nhops = src_priv->nhg_nh_count;
604 
605 	*perror = 0;
606 
607 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
608 	/* optimize for <= 4 paths, each path=16 bytes */
609 	if (sz <= sizeof(storage))
610 		pnhops = (struct weightened_nhop *)&storage[0];
611 	else {
612 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
613 		if (pnhops == NULL) {
614 			*perror = ENOMEM;
615 			return (NULL);
616 		}
617 	}
618 
619 	/* Copy nhops from original group first */
620 	memcpy(pnhops, src_priv->nhg_nh_weights,
621 	  curr_nhops * sizeof(struct weightened_nhop));
622 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
623 	curr_nhops += num_nhops;
624 
625 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, 0, perror);
626 
627 	if (pnhops != (struct weightened_nhop *)&storage[0])
628 		free(pnhops, M_TEMP);
629 
630 	if (nhg_priv == NULL)
631 		return (NULL);
632 
633 	return (nhg_priv);
634 }
635 
636 
637 /*
638  * Creates/finds nexthop group based on @wn and @num_nhops.
639  * Returns 0 on success with referenced group in @rnd, or
640  * errno.
641  *
642  * If the error is EAGAIN, then the operation can be retried.
643  */
644 int
645 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
646     uint32_t uidx, struct nhgrp_object **pnhg)
647 {
648 	struct nh_control *ctl = rh->nh_control;
649 	struct nhgrp_priv *nhg_priv;
650 	int error;
651 
652 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, uidx, &error);
653 	if (nhg_priv != NULL)
654 		*pnhg = nhg_priv->nhg;
655 
656 	return (error);
657 }
658 
659 /*
660  * Creates new nexthop group based on @src group without the nexthops
661  * chosen by @flt_func.
662  * Returns 0 on success, storring the reference nhop group/object in @rnd.
663  */
664 int
665 nhgrp_get_filtered_group(struct rib_head *rh, const struct rtentry *rt,
666     const struct nhgrp_object *src, rib_filter_f_t flt_func, void *flt_data,
667     struct route_nhop_data *rnd)
668 {
669 	char storage[64];
670 	struct nh_control *ctl = rh->nh_control;
671 	struct weightened_nhop *pnhops;
672 	const struct nhgrp_priv *mp_priv, *src_priv;
673 	size_t sz;
674 	int error, i, num_nhops;
675 
676 	src_priv = NHGRP_PRIV_CONST(src);
677 
678 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
679 	/* optimize for <= 4 paths, each path=16 bytes */
680 	if (sz <= sizeof(storage))
681 		pnhops = (struct weightened_nhop *)&storage[0];
682 	else {
683 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
684 			return (ENOMEM);
685 	}
686 
687 	/* Filter nexthops */
688 	error = 0;
689 	num_nhops = 0;
690 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
691 		if (flt_func(rt, src_priv->nhg_nh_weights[i].nh, flt_data))
692 			continue;
693 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
694 		  sizeof(struct weightened_nhop));
695 	}
696 
697 	if (num_nhops == 0) {
698 		rnd->rnd_nhgrp = NULL;
699 		rnd->rnd_weight = 0;
700 	} else if (num_nhops == 1) {
701 		rnd->rnd_nhop = pnhops[0].nh;
702 		rnd->rnd_weight = pnhops[0].weight;
703 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
704 			error = EAGAIN;
705 	} else {
706 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, 0, &error);
707 		if (mp_priv != NULL)
708 			rnd->rnd_nhgrp = mp_priv->nhg;
709 		rnd->rnd_weight = 0;
710 	}
711 
712 	if (pnhops != (struct weightened_nhop *)&storage[0])
713 		free(pnhops, M_TEMP);
714 
715 	return (error);
716 }
717 
718 /*
719  * Creates new multipath group based on existing group/nhop in @rnd_orig and
720  *  to-be-added nhop @wn_add.
721  * Returns 0 on success and stores result in @rnd_new.
722  */
723 int
724 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
725     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
726 {
727 	struct nh_control *ctl = rh->nh_control;
728 	struct nhgrp_priv *nhg_priv;
729 	struct weightened_nhop wn[2] = {};
730 	int error;
731 
732 	if (rnd_orig->rnd_nhop == NULL) {
733 		/* No paths to add to, just reference current nhop */
734 		*rnd_new = *rnd_add;
735 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
736 			return (EAGAIN);
737 		return (0);
738 	}
739 
740 	wn[0].nh = rnd_add->rnd_nhop;
741 	wn[0].weight = rnd_add->rnd_weight;
742 
743 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
744 		/* Simple merge of 2 non-multipath nexthops */
745 		wn[1].nh = rnd_orig->rnd_nhop;
746 		wn[1].weight = rnd_orig->rnd_weight;
747 		nhg_priv = get_nhgrp(ctl, wn, 2, 0, &error);
748 	} else {
749 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
750 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
751 		    &error);
752 	}
753 
754 	if (nhg_priv == NULL)
755 		return (error);
756 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
757 	rnd_new->rnd_weight = 0;
758 
759 	return (0);
760 }
761 
762 /*
763  * Returns pointer to array of nexthops with weights for
764  * given @nhg. Stores number of items in the array into @pnum_nhops.
765  */
766 const struct weightened_nhop *
767 nhgrp_get_nhops(const struct nhgrp_object *nhg, uint32_t *pnum_nhops)
768 {
769 	const struct nhgrp_priv *nhg_priv;
770 
771 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
772 
773 	nhg_priv = NHGRP_PRIV_CONST(nhg);
774 	*pnum_nhops = nhg_priv->nhg_nh_count;
775 
776 	return (nhg_priv->nhg_nh_weights);
777 }
778 
779 void
780 nhgrp_set_uidx(struct nhgrp_object *nhg, uint32_t uidx)
781 {
782 	struct nhgrp_priv *nhg_priv;
783 
784 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
785 
786 	nhg_priv = NHGRP_PRIV(nhg);
787 
788 	nhg_priv->nhg_uidx = uidx;
789 }
790 
791 uint32_t
792 nhgrp_get_uidx(const struct nhgrp_object *nhg)
793 {
794 	const struct nhgrp_priv *nhg_priv;
795 
796 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
797 
798 	nhg_priv = NHGRP_PRIV_CONST(nhg);
799 	return (nhg_priv->nhg_uidx);
800 }
801 
802 /*
803  * Prints nexhop group @nhg data in the provided @buf.
804  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
805  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
806  */
807 char *
808 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
809 {
810 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
811 
812 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
813 	    nhg_priv->nhg_nh_count);
814 
815 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
816 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
817 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
818 		    wn->nh->nh_priv->nh_idx, wn->weight);
819 		if (len + off + 3 >= bufsize) {
820 			int len = snprintf(&buf[off], bufsize - off, "...");
821 			off += len;
822 			break;
823 		}
824 		off += len;
825 	}
826 	if (off > 0)
827 		off--; // remove last ","
828 	if (off + 1 < bufsize)
829 		snprintf(&buf[off], bufsize - off, "]");
830 	return buf;
831 }
832 
833 __noinline static int
834 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
835     char *buffer, size_t buffer_size, struct sysctl_req *w)
836 {
837 	struct rt_msghdr *rtm;
838 	struct nhgrp_external *nhge;
839 	struct nhgrp_container *nhgc;
840 	const struct nhgrp_object *nhg;
841 	struct nhgrp_nhop_external *ext;
842 	int error;
843 	size_t sz;
844 
845 	nhg = nhg_priv->nhg;
846 
847 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
848 	/* controlplane nexthops */
849 	sz += sizeof(struct nhgrp_container);
850 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
851 	/* dataplane nexthops */
852 	sz += sizeof(struct nhgrp_container);
853 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
854 
855 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
856 
857 	bzero(buffer, sz);
858 
859 	rtm = (struct rt_msghdr *)buffer;
860 	rtm->rtm_msglen = sz;
861 	rtm->rtm_version = RTM_VERSION;
862 	rtm->rtm_type = RTM_GET;
863 
864 	nhge = (struct nhgrp_external *)(rtm + 1);
865 
866 	nhge->nhg_idx = nhg_priv->nhg_idx;
867 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
868 
869 	/* fill in control plane nexthops firs */
870 	nhgc = (struct nhgrp_container *)(nhge + 1);
871 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
872 	nhgc->nhgc_subtype = 0;
873 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
874 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
875 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
876 
877 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
878 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
879 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
880 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
881 	}
882 
883 	/* fill in dataplane nexthops */
884 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
885 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
886 	nhgc->nhgc_subtype = 0;
887 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
888 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
889 	nhgc->nhgc_count = nhg->nhg_size;
890 
891 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
892 	for (int i = 0; i < nhg->nhg_size; i++) {
893 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
894 		ext[i].nh_weight = 0;
895 	}
896 
897 	error = SYSCTL_OUT(w, buffer, sz);
898 
899 	return (error);
900 }
901 
902 uint32_t
903 nhgrp_get_idx(const struct nhgrp_object *nhg)
904 {
905 	const struct nhgrp_priv *nhg_priv;
906 
907 	nhg_priv = NHGRP_PRIV_CONST(nhg);
908 	return (nhg_priv->nhg_idx);
909 }
910 
911 uint8_t
912 nhgrp_get_origin(const struct nhgrp_object *nhg)
913 {
914 	return (NHGRP_PRIV_CONST(nhg)->nhg_origin);
915 }
916 
917 void
918 nhgrp_set_origin(struct nhgrp_object *nhg, uint8_t origin)
919 {
920 	NHGRP_PRIV(nhg)->nhg_origin = origin;
921 }
922 
923 uint32_t
924 nhgrp_get_count(struct rib_head *rh)
925 {
926 	struct nh_control *ctl;
927 	uint32_t count;
928 
929 	ctl = rh->nh_control;
930 
931 	NHOPS_RLOCK(ctl);
932 	count = ctl->gr_head.items_count;
933 	NHOPS_RUNLOCK(ctl);
934 
935 	return (count);
936 }
937 
938 int
939 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
940 {
941 	struct nh_control *ctl = rh->nh_control;
942 	struct epoch_tracker et;
943 	struct nhgrp_priv *nhg_priv;
944 	char *buffer;
945 	size_t sz;
946 	int error = 0;
947 
948 	if (ctl->gr_head.items_count == 0)
949 		return (0);
950 
951 	/* Calculate the maximum nhop group size in bytes */
952 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
953 	sz += 2 * sizeof(struct nhgrp_container);
954 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
955 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
956 	if (buffer == NULL)
957 		return (ENOMEM);
958 
959 	NET_EPOCH_ENTER(et);
960 	NHOPS_RLOCK(ctl);
961 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
962 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
963 		if (error != 0)
964 			break;
965 	} CHT_SLIST_FOREACH_END;
966 	NHOPS_RUNLOCK(ctl);
967 	NET_EPOCH_EXIT(et);
968 
969 	free(buffer, M_TEMP);
970 
971 	return (error);
972 }
973