xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision 1f1e2261)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 #include "opt_inet.h"
30 #include "opt_route.h"
31 
32 #include <sys/cdefs.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/lock.h>
36 #include <sys/rmlock.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/refcount.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/kernel.h>
43 #include <sys/epoch.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/route.h>
48 #include <net/route/route_ctl.h>
49 #include <net/route/route_var.h>
50 #include <net/vnet.h>
51 
52 #include <netinet/in.h>
53 #include <netinet/in_var.h>
54 #include <netinet/in_fib.h>
55 
56 #include <net/route/nhop_utils.h>
57 #include <net/route/nhop.h>
58 #include <net/route/nhop_var.h>
59 #include <net/route/nhgrp_var.h>
60 
61 #define	DEBUG_MOD_NAME	nhgrp_ctl
62 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
63 #include <net/route/route_debug.h>
64 _DECLARE_DEBUG(LOG_INFO);
65 
66 /*
67  * This file contains the supporting functions for creating multipath groups
68  *  and compiling their dataplane parts.
69  */
70 
71 /* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
72 _Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
73     "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
74 /* Offset and size of flags field has to be the same for nhop/nhop groups */
75 CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
76 /* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
77 CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
78 
79 static int wn_cmp(const void *a, const void *b);
80 static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
81 
82 static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
83     struct weightened_nhop *wn, int num_nhops, int *perror);
84 static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
85 static void destroy_nhgrp_epoch(epoch_context_t ctx);
86 static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
87 
88 static int
89 wn_cmp(const void *a, const void *b)
90 {
91 	const struct weightened_nhop *wa = a;
92 	const struct weightened_nhop *wb = b;
93 
94 	if (wa->weight > wb->weight)
95 		return (1);
96 	else if (wa->weight < wb->weight)
97 		return (-1);
98 
99 	/* Compare nexthops by pointer */
100 	if (wa->nh > wb->nh)
101 		return (1);
102 	else if (wa->nh < wb->nh)
103 		return (-1);
104 	else
105 		return (0);
106 }
107 
108 /*
109  * Perform in-place sorting for array of nexthops in @wn.
110  *
111  * To avoid nh groups duplication, nexthops/weights in the
112  *   @wn need to be ordered deterministically.
113  * As this sorting is needed only for the control plane functionality,
114  *  there are no specific external requirements.
115  *
116  * Sort by weight first, to ease calculation of the slot sizes.
117  */
118 static void
119 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
120 {
121 
122 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
123 }
124 
125 /*
126  * Calculate minimum number of slots required to fit the existing
127  * set of weights in the common use case where weights are "easily"
128  * comparable.
129  * Assumes @wn is sorted by weight ascending and each weight is > 0.
130  * Returns number of slots or 0 if precise calculation failed.
131  *
132  * Some examples:
133  * note: (i, X) pair means (nhop=i, weight=X):
134  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
135  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
136  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
137  */
138 static uint32_t
139 calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
140 {
141 	uint32_t i, last, xmin;
142 	uint64_t total = 0;
143 
144 	last = 0;
145 	xmin = wn[0].weight;
146 	for (i = 0; i < num_items; i++) {
147 		total += wn[i].weight;
148 		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
149 			xmin = wn[i].weight - last;
150 		last = wn[i].weight;
151 	}
152 	/* xmin is the minimum unit of desired capacity */
153 	if ((total % xmin) != 0)
154 		return (0);
155 	for (i = 0; i < num_items; i++) {
156 		if ((wn[i].weight % xmin) != 0)
157 			return (0);
158 	}
159 
160 	return ((uint32_t)(total / xmin));
161 }
162 
163 /*
164  * Calculate minimum number of slots required to fit the existing
165  * set of weights while maintaining weight coefficients.
166  *
167  * Assume @wn is sorted by weight ascending and each weight is > 0.
168  *
169  * Tries to find simple precise solution first and falls back to
170  *  RIB_MAX_MPATH_WIDTH in case of any failure.
171  */
172 static uint32_t
173 calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
174 {
175 	uint32_t v;
176 
177 	v = calc_min_mpath_slots_fast(wn, num_items);
178 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
179 		v = RIB_MAX_MPATH_WIDTH;
180 
181 	return (v);
182 }
183 
184 /*
185  * Nexthop group data consists of
186  * 1) dataplane part, with nhgrp_object as a header followed by an
187  *   arbitrary number of nexthop pointers.
188  * 2) control plane part, with nhgrp_priv as a header, followed by
189  *   an arbirtrary number of 'struct weightened_nhop' object.
190  *
191  * Given nexthop groups are (mostly) immutable, allocate all data
192  * in one go.
193  *
194  */
195 __noinline static size_t
196 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
197 {
198 	size_t sz;
199 
200 	sz = sizeof(struct nhgrp_object);
201 	sz += nhg_size * sizeof(struct nhop_object *);
202 	sz += sizeof(struct nhgrp_priv);
203 	sz += num_nhops * sizeof(struct weightened_nhop);
204 	return (sz);
205 }
206 
207 /*
208  * Compile actual list of nexthops to be used by datapath from
209  *  the nexthop group @dst.
210  *
211  * For example, compiling control plane list of 2 nexthops
212  *  [(200, A), (100, B)] would result in the datapath array
213  *  [A, A, B]
214  */
215 static void
216 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
217     uint32_t num_slots)
218 {
219 	struct nhgrp_object *dst;
220 	int i, slot_idx, remaining_slots;
221 	uint64_t remaining_sum, nh_weight, nh_slots;
222 
223 	slot_idx  = 0;
224 	dst = dst_priv->nhg;
225 	/* Calculate sum of all weights */
226 	remaining_sum = 0;
227 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
228 		remaining_sum += x[i].weight;
229 	remaining_slots = num_slots;
230 	FIB_NH_LOG(LOG_DEBUG3, x[0].nh, "sum: %lu, slots: %d",
231 	    remaining_sum, remaining_slots);
232 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
233 		/* Calculate number of slots for the current nexthop */
234 		if (remaining_sum > 0) {
235 			nh_weight = (uint64_t)x[i].weight;
236 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
237 		} else
238 			nh_slots = 0;
239 
240 		remaining_sum -= x[i].weight;
241 		remaining_slots -= nh_slots;
242 
243 		FIB_NH_LOG(LOG_DEBUG3, x[0].nh,
244 		    " rem_sum: %lu, rem_slots: %d nh_slots: %d, slot_idx: %d",
245 		    remaining_sum, remaining_slots, (int)nh_slots, slot_idx);
246 
247 		KASSERT((slot_idx + nh_slots <= num_slots),
248 		    ("index overflow during nhg compilation"));
249 		while (nh_slots-- > 0)
250 			dst->nhops[slot_idx++] = x[i].nh;
251 	}
252 }
253 
254 /*
255  * Allocates new nexthop group for the list of weightened nexthops.
256  * Assume sorted list.
257  * Does NOT reference any nexthops in the group.
258  * Returns group with refcount=1 or NULL.
259  */
260 static struct nhgrp_priv *
261 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
262 {
263 	uint32_t nhgrp_size;
264 	struct nhgrp_object *nhg;
265 	struct nhgrp_priv *nhg_priv;
266 
267 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
268 	if (nhgrp_size == 0) {
269 		/* Zero weights, abort */
270 		return (NULL);
271 	}
272 
273 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
274 	nhg = malloc(sz, M_NHOP, M_NOWAIT | M_ZERO);
275 	if (nhg == NULL) {
276 		FIB_NH_LOG(LOG_INFO, wn[0].nh,
277 		    "unable to allocate group with num_nhops %d (compiled %u)",
278 		    num_nhops, nhgrp_size);
279 		return (NULL);
280 	}
281 
282 	/* Has to be the first to make NHGRP_PRIV() work */
283 	nhg->nhg_size = nhgrp_size;
284 	nhg->nhg_flags = MPF_MULTIPATH;
285 
286 	nhg_priv = NHGRP_PRIV(nhg);
287 	nhg_priv->nhg_nh_count = num_nhops;
288 	refcount_init(&nhg_priv->nhg_refcount, 1);
289 
290 	/* Please see nhgrp_free() comments on the initial value */
291 	refcount_init(&nhg_priv->nhg_linked, 2);
292 
293 	nhg_priv->nhg = nhg;
294 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
295 	  num_nhops * sizeof(struct weightened_nhop));
296 
297 	FIB_NH_LOG(LOG_DEBUG, wn[0].nh, "num_nhops: %d, compiled_nhop: %u",
298 	    num_nhops, nhgrp_size);
299 
300 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
301 
302 	return (nhg_priv);
303 }
304 
305 void
306 nhgrp_ref_object(struct nhgrp_object *nhg)
307 {
308 	struct nhgrp_priv *nhg_priv;
309 	u_int old __diagused;
310 
311 	nhg_priv = NHGRP_PRIV(nhg);
312 	old = refcount_acquire(&nhg_priv->nhg_refcount);
313 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
314 }
315 
316 void
317 nhgrp_free(struct nhgrp_object *nhg)
318 {
319 	struct nhgrp_priv *nhg_priv;
320 	struct nh_control *ctl;
321 	struct epoch_tracker et;
322 
323 	nhg_priv = NHGRP_PRIV(nhg);
324 
325 	if (!refcount_release(&nhg_priv->nhg_refcount))
326 		return;
327 
328 	/*
329 	 * group objects don't have an explicit lock attached to it.
330 	 * As groups are reclaimed based on reference count, it is possible
331 	 * that some groups will persist after vnet destruction callback
332 	 * called. Given that, handle scenario with nhgrp_free_group() being
333 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
334 	 * by using another reference counter: nhg_linked.
335 	 *
336 	 * There are only 2 places, where nhg_linked can be decreased:
337 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
338 	 * nhg_link can never be increased.
339 	 *
340 	 * Hence, use initial value of 2 to make use of
341 	 *  refcount_release_if_not_last().
342 	 *
343 	 * There can be two scenarious when calling this function:
344 	 *
345 	 * 1) nhg_linked value is 2. This means that either
346 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
347 	 *  but we are guaranteed that nh_control won't be freed in
348 	 *  this epoch. Hence, nexthop can be safely unlinked.
349 	 *
350 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
351 	 *  has been called and nhgrp unlink can be skipped.
352 	 */
353 
354 	NET_EPOCH_ENTER(et);
355 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
356 		ctl = nhg_priv->nh_control;
357 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
358 			/* Do not try to reclaim */
359 			RT_LOG(LOG_INFO, "Failed to unlink nexhop group %p",
360 			    nhg_priv);
361 			NET_EPOCH_EXIT(et);
362 			return;
363 		}
364 	}
365 	NET_EPOCH_EXIT(et);
366 
367 	epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
368 	    &nhg_priv->nhg_epoch_ctx);
369 }
370 
371 /*
372  * Destroys all local resources belonging to @nhg_priv.
373  */
374 __noinline static void
375 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
376 {
377 
378 	free(nhg_priv->nhg, M_NHOP);
379 }
380 
381 __noinline static void
382 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
383 {
384 
385 	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
386 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
387 
388 #if DEBUG_MAX_LEVEL >= LOG_DEBUG
389 	char nhgbuf[NHOP_PRINT_BUFSIZE];
390 	FIB_NH_LOG(LOG_DEBUG, nhg_priv->nhg_nh_weights[0].nh,
391 	    "destroying %s", nhgrp_print_buf(nhg_priv->nhg,
392 	    nhgbuf, sizeof(nhgbuf)));
393 #endif
394 
395 	free_nhgrp_nhops(nhg_priv);
396 	destroy_nhgrp_int(nhg_priv);
397 }
398 
399 /*
400  * Epoch callback indicating group is safe to destroy
401  */
402 static void
403 destroy_nhgrp_epoch(epoch_context_t ctx)
404 {
405 	struct nhgrp_priv *nhg_priv;
406 
407 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
408 
409 	destroy_nhgrp(nhg_priv);
410 }
411 
412 static bool
413 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
414 {
415 
416 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
417 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
418 			continue;
419 
420 		/*
421 		 * Failed to ref the nexthop, b/c it's deleted.
422 		 * Need to rollback references back.
423 		 */
424 		for (int j = 0; j < i; j++)
425 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
426 		return (false);
427 	}
428 
429 	return (true);
430 }
431 
432 static void
433 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
434 {
435 
436 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
437 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
438 }
439 
440 /*
441  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
442  *
443  * Returns referenced nhop group or NULL, passing error code in @perror.
444  */
445 struct nhgrp_priv *
446 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
447     int *perror)
448 {
449 	struct nhgrp_priv *key, *nhg_priv;
450 
451 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
452 		*perror = E2BIG;
453 		return (NULL);
454 	}
455 
456 	if (ctl->gr_head.hash_size == 0) {
457 		/* First multipath request. Bootstrap mpath datastructures. */
458 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
459 			*perror = ENOMEM;
460 			return (NULL);
461 		}
462 	}
463 
464 	/* Sort nexthops & check there are no duplicates */
465 	sort_weightened_nhops(wn, num_nhops);
466 	uint32_t last_id = 0;
467 	for (int i = 0; i < num_nhops; i++) {
468 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
469 			*perror = EEXIST;
470 			return (NULL);
471 		}
472 		last_id = wn[i].nh->nh_priv->nh_idx;
473 	}
474 
475 	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
476 		*perror = ENOMEM;
477 		return (NULL);
478 	}
479 
480 	nhg_priv = find_nhgrp(ctl, key);
481 	if (nhg_priv != NULL) {
482 		/*
483 		 * Free originally-created group. As it hasn't been linked
484 		 *  and the dependent nexhops haven't been referenced, just free
485 		 *  the group.
486 		 */
487 		destroy_nhgrp_int(key);
488 		*perror = 0;
489 		return (nhg_priv);
490 	} else {
491 		/* No existing group, try to link the new one */
492 		if (!ref_nhgrp_nhops(key)) {
493 			/*
494 			 * Some of the nexthops have been scheduled for deletion.
495 			 * As the group hasn't been linked / no nexhops have been
496 			 *  referenced, call the final destructor immediately.
497 			 */
498 			destroy_nhgrp_int(key);
499 			*perror = EAGAIN;
500 			return (NULL);
501 		}
502 		if (link_nhgrp(ctl, key) == 0) {
503 			/* Unable to allocate index? */
504 			*perror = EAGAIN;
505 			free_nhgrp_nhops(key);
506 			destroy_nhgrp_int(key);
507 			return (NULL);
508 		}
509 		*perror = 0;
510 		return (key);
511 	}
512 
513 	/* NOTREACHED */
514 }
515 
516 /*
517  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
518  *
519  * Returns referenced nexthop group or NULL. In the latter case, @perror is
520  *  filled with an error code.
521  * Note that function does NOT care if the next nexthops already exists
522  * in the @gr_orig. As a result, they will be added, resulting in the
523  * same nexthop being present multiple times in the new group.
524  */
525 static struct nhgrp_priv *
526 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
527     struct weightened_nhop *wn, int num_nhops, int *perror)
528 {
529 	char storage[64];
530 	struct weightened_nhop *pnhops;
531 	struct nhgrp_priv *nhg_priv;
532 	const struct nhgrp_priv *src_priv;
533 	size_t sz;
534 	int curr_nhops;
535 
536 	src_priv = NHGRP_PRIV_CONST(gr_orig);
537 	curr_nhops = src_priv->nhg_nh_count;
538 
539 	*perror = 0;
540 
541 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
542 	/* optimize for <= 4 paths, each path=16 bytes */
543 	if (sz <= sizeof(storage))
544 		pnhops = (struct weightened_nhop *)&storage[0];
545 	else {
546 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
547 		if (pnhops == NULL) {
548 			*perror = ENOMEM;
549 			return (NULL);
550 		}
551 	}
552 
553 	/* Copy nhops from original group first */
554 	memcpy(pnhops, src_priv->nhg_nh_weights,
555 	  curr_nhops * sizeof(struct weightened_nhop));
556 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
557 	curr_nhops += num_nhops;
558 
559 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
560 
561 	if (pnhops != (struct weightened_nhop *)&storage[0])
562 		free(pnhops, M_TEMP);
563 
564 	if (nhg_priv == NULL)
565 		return (NULL);
566 
567 	return (nhg_priv);
568 }
569 
570 
571 /*
572  * Creates/finds nexthop group based on @wn and @num_nhops.
573  * Returns 0 on success with referenced group in @rnd, or
574  * errno.
575  *
576  * If the error is EAGAIN, then the operation can be retried.
577  */
578 int
579 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
580     struct route_nhop_data *rnd)
581 {
582 	struct nh_control *ctl = rh->nh_control;
583 	struct nhgrp_priv *nhg_priv;
584 	int error;
585 
586 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error);
587 	if (nhg_priv != NULL)
588 		rnd->rnd_nhgrp = nhg_priv->nhg;
589 	rnd->rnd_weight = 0;
590 
591 	return (error);
592 }
593 
594 /*
595  * Creates new nexthop group based on @src group without the nexthops
596  * chosen by @flt_func.
597  * Returns 0 on success, storring the reference nhop group/object in @rnd.
598  */
599 int
600 nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
601     nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
602 {
603 	char storage[64];
604 	struct nh_control *ctl = rh->nh_control;
605 	struct weightened_nhop *pnhops;
606 	const struct nhgrp_priv *mp_priv, *src_priv;
607 	size_t sz;
608 	int error, i, num_nhops;
609 
610 	src_priv = NHGRP_PRIV_CONST(src);
611 
612 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
613 	/* optimize for <= 4 paths, each path=16 bytes */
614 	if (sz <= sizeof(storage))
615 		pnhops = (struct weightened_nhop *)&storage[0];
616 	else {
617 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
618 			return (ENOMEM);
619 	}
620 
621 	/* Filter nexthops */
622 	error = 0;
623 	num_nhops = 0;
624 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
625 		if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
626 			continue;
627 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
628 		  sizeof(struct weightened_nhop));
629 	}
630 
631 	if (num_nhops == 0) {
632 		rnd->rnd_nhgrp = NULL;
633 		rnd->rnd_weight = 0;
634 	} else if (num_nhops == 1) {
635 		rnd->rnd_nhop = pnhops[0].nh;
636 		rnd->rnd_weight = pnhops[0].weight;
637 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
638 			error = EAGAIN;
639 	} else {
640 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
641 		if (mp_priv != NULL)
642 			rnd->rnd_nhgrp = mp_priv->nhg;
643 		rnd->rnd_weight = 0;
644 	}
645 
646 	if (pnhops != (struct weightened_nhop *)&storage[0])
647 		free(pnhops, M_TEMP);
648 
649 	return (error);
650 }
651 
652 /*
653  * Creates new multipath group based on existing group/nhop in @rnd_orig and
654  *  to-be-added nhop @wn_add.
655  * Returns 0 on success and stores result in @rnd_new.
656  */
657 int
658 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
659     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
660 {
661 	struct nh_control *ctl = rh->nh_control;
662 	struct nhgrp_priv *nhg_priv;
663 	struct weightened_nhop wn[2] = {};
664 	int error;
665 
666 	if (rnd_orig->rnd_nhop == NULL) {
667 		/* No paths to add to, just reference current nhop */
668 		*rnd_new = *rnd_add;
669 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
670 			return (EAGAIN);
671 		return (0);
672 	}
673 
674 	wn[0].nh = rnd_add->rnd_nhop;
675 	wn[0].weight = rnd_add->rnd_weight;
676 
677 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
678 		/* Simple merge of 2 non-multipath nexthops */
679 		wn[1].nh = rnd_orig->rnd_nhop;
680 		wn[1].weight = rnd_orig->rnd_weight;
681 		nhg_priv = get_nhgrp(ctl, wn, 2, &error);
682 	} else {
683 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
684 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
685 		    &error);
686 	}
687 
688 	if (nhg_priv == NULL)
689 		return (error);
690 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
691 	rnd_new->rnd_weight = 0;
692 
693 	return (0);
694 }
695 
696 /*
697  * Returns pointer to array of nexthops with weights for
698  * given @nhg. Stores number of items in the array into @pnum_nhops.
699  */
700 struct weightened_nhop *
701 nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
702 {
703 	struct nhgrp_priv *nhg_priv;
704 
705 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
706 
707 	nhg_priv = NHGRP_PRIV(nhg);
708 	*pnum_nhops = nhg_priv->nhg_nh_count;
709 
710 	return (nhg_priv->nhg_nh_weights);
711 }
712 
713 /*
714  * Prints nexhop group @nhg data in the provided @buf.
715  * Example: nhg#33/sz=3:[#1:100,#2:100,#3:100]
716  * Example: nhg#33/sz=5:[#1:100,#2:100,..]
717  */
718 char *
719 nhgrp_print_buf(const struct nhgrp_object *nhg, char *buf, size_t bufsize)
720 {
721 	const struct nhgrp_priv *nhg_priv = NHGRP_PRIV_CONST(nhg);
722 
723 	int off = snprintf(buf, bufsize, "nhg#%u/sz=%u:[", nhg_priv->nhg_idx,
724 	    nhg_priv->nhg_nh_count);
725 
726 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
727 		const struct weightened_nhop *wn = &nhg_priv->nhg_nh_weights[i];
728 		int len = snprintf(&buf[off], bufsize - off, "#%u:%u,",
729 		    wn->nh->nh_priv->nh_idx, wn->weight);
730 		if (len + off + 3 >= bufsize) {
731 			int len = snprintf(&buf[off], bufsize - off, "...");
732 			off += len;
733 			break;
734 		}
735 		off += len;
736 	}
737 	if (off > 0)
738 		off--; // remove last ","
739 	if (off + 1 < bufsize)
740 		snprintf(&buf[off], bufsize - off, "]");
741 	return buf;
742 }
743 
744 __noinline static int
745 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
746     char *buffer, size_t buffer_size, struct sysctl_req *w)
747 {
748 	struct rt_msghdr *rtm;
749 	struct nhgrp_external *nhge;
750 	struct nhgrp_container *nhgc;
751 	const struct nhgrp_object *nhg;
752 	struct nhgrp_nhop_external *ext;
753 	int error;
754 	size_t sz;
755 
756 	nhg = nhg_priv->nhg;
757 
758 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
759 	/* controlplane nexthops */
760 	sz += sizeof(struct nhgrp_container);
761 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
762 	/* dataplane nexthops */
763 	sz += sizeof(struct nhgrp_container);
764 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
765 
766 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
767 
768 	bzero(buffer, sz);
769 
770 	rtm = (struct rt_msghdr *)buffer;
771 	rtm->rtm_msglen = sz;
772 	rtm->rtm_version = RTM_VERSION;
773 	rtm->rtm_type = RTM_GET;
774 
775 	nhge = (struct nhgrp_external *)(rtm + 1);
776 
777 	nhge->nhg_idx = nhg_priv->nhg_idx;
778 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
779 
780 	/* fill in control plane nexthops firs */
781 	nhgc = (struct nhgrp_container *)(nhge + 1);
782 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
783 	nhgc->nhgc_subtype = 0;
784 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
785 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
786 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
787 
788 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
789 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
790 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
791 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
792 	}
793 
794 	/* fill in dataplane nexthops */
795 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
796 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
797 	nhgc->nhgc_subtype = 0;
798 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
799 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
800 	nhgc->nhgc_count = nhg->nhg_size;
801 
802 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
803 	for (int i = 0; i < nhg->nhg_size; i++) {
804 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
805 		ext[i].nh_weight = 0;
806 	}
807 
808 	error = SYSCTL_OUT(w, buffer, sz);
809 
810 	return (error);
811 }
812 
813 uint32_t
814 nhgrp_get_idx(const struct nhgrp_object *nhg)
815 {
816 	const struct nhgrp_priv *nhg_priv;
817 
818 	nhg_priv = NHGRP_PRIV_CONST(nhg);
819 	return (nhg_priv->nhg_idx);
820 }
821 
822 uint32_t
823 nhgrp_get_count(struct rib_head *rh)
824 {
825 	struct nh_control *ctl;
826 	uint32_t count;
827 
828 	ctl = rh->nh_control;
829 
830 	NHOPS_RLOCK(ctl);
831 	count = ctl->gr_head.items_count;
832 	NHOPS_RUNLOCK(ctl);
833 
834 	return (count);
835 }
836 
837 int
838 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
839 {
840 	struct nh_control *ctl = rh->nh_control;
841 	struct epoch_tracker et;
842 	struct nhgrp_priv *nhg_priv;
843 	char *buffer;
844 	size_t sz;
845 	int error = 0;
846 
847 	if (ctl->gr_head.items_count == 0)
848 		return (0);
849 
850 	/* Calculate the maximum nhop group size in bytes */
851 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
852 	sz += 2 * sizeof(struct nhgrp_container);
853 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
854 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
855 	if (buffer == NULL)
856 		return (ENOMEM);
857 
858 	NET_EPOCH_ENTER(et);
859 	NHOPS_RLOCK(ctl);
860 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
861 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
862 		if (error != 0)
863 			break;
864 	} CHT_SLIST_FOREACH_END;
865 	NHOPS_RUNLOCK(ctl);
866 	NET_EPOCH_EXIT(et);
867 
868 	free(buffer, M_TEMP);
869 
870 	return (error);
871 }
872