xref: /freebsd/sys/net/route/nhgrp_ctl.c (revision 16038816)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 #include "opt_inet.h"
30 #include "opt_route.h"
31 
32 #include <sys/cdefs.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/lock.h>
36 #include <sys/rmlock.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/refcount.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/kernel.h>
43 #include <sys/epoch.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/route.h>
48 #include <net/route/route_ctl.h>
49 #include <net/route/route_var.h>
50 #include <net/vnet.h>
51 
52 #include <netinet/in.h>
53 #include <netinet/in_var.h>
54 #include <netinet/in_fib.h>
55 
56 #include <net/route/nhop_utils.h>
57 #include <net/route/nhop.h>
58 #include <net/route/nhop_var.h>
59 #include <net/route/nhgrp_var.h>
60 
61 /*
62  * This file contains the supporting functions for creating multipath groups
63  *  and compiling their dataplane parts.
64  */
65 
/*
 * Compile-time checks tying the nexthop group layout/flags to the
 * plain nexthop ones.
 */
/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
    "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
/* Offset and size of flags field has to be the same for nhop/nhop groups */
CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);

/* Forward declarations: sorting helpers for weighted nexthop arrays */
static int wn_cmp(const void *a, const void *b);
static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);

/* Forward declarations: group lookup/creation and teardown helpers */
static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
    struct weightened_nhop *wn, int num_nhops, int *perror);
static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
static void destroy_nhgrp_epoch(epoch_context_t ctx);
static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
82 
83 static int
84 wn_cmp(const void *a, const void *b)
85 {
86 	const struct weightened_nhop *wa = a;
87 	const struct weightened_nhop *wb = b;
88 
89 	if (wa->weight > wb->weight)
90 		return (1);
91 	else if (wa->weight < wb->weight)
92 		return (-1);
93 
94 	/* Compare nexthops by pointer */
95 	if (wa->nh > wb->nh)
96 		return (1);
97 	else if (wa->nh < wb->nh)
98 		return (-1);
99 	else
100 		return (0);
101 }
102 
103 /*
104  * Perform in-place sorting for array of nexthops in @wn.
105  *
106  * To avoid nh groups duplication, nexthops/weights in the
107  *   @wn need to be ordered deterministically.
108  * As this sorting is needed only for the control plane functionality,
109  *  there are no specific external requirements.
110  *
111  * Sort by weight first, to ease calculation of the slot sizes.
112  */
113 static void
114 sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
115 {
116 
117 	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
118 }
119 
120 /*
121  * Calculate minimum number of slots required to fit the existing
122  * set of weights in the common use case where weights are "easily"
123  * comparable.
124  * Assumes @wn is sorted by weight ascending and each weight is > 0.
125  * Returns number of slots or 0 if precise calculation failed.
126  *
127  * Some examples:
128  * note: (i, X) pair means (nhop=i, weight=X):
129  * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
130  * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
131  * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3]
132  */
133 static uint32_t
134 calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
135 {
136 	uint32_t i, last, xmin;
137 	uint64_t total = 0;
138 
139 	last = 0;
140 	xmin = wn[0].weight;
141 	for (i = 0; i < num_items; i++) {
142 		total += wn[i].weight;
143 		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
144 			xmin = wn[i].weight - last;
145 		last = wn[i].weight;
146 	}
147 	/* xmin is the minimum unit of desired capacity */
148 	if ((total % xmin) != 0)
149 		return (0);
150 	for (i = 0; i < num_items; i++) {
151 		if ((wn[i].weight % xmin) != 0)
152 			return (0);
153 	}
154 
155 	return ((uint32_t)(total / xmin));
156 }
157 
158 /*
159  * Calculate minimum number of slots required to fit the existing
160  * set of weights while maintaining weight coefficients.
161  *
162  * Assume @wn is sorted by weight ascending and each weight is > 0.
163  *
164  * Tries to find simple precise solution first and falls back to
165  *  RIB_MAX_MPATH_WIDTH in case of any failure.
166  */
167 static uint32_t
168 calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
169 {
170 	uint32_t v;
171 
172 	v = calc_min_mpath_slots_fast(wn, num_items);
173 	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
174 		v = RIB_MAX_MPATH_WIDTH;
175 
176 	return (v);
177 }
178 
179 /*
180  * Nexthop group data consists of
181  * 1) dataplane part, with nhgrp_object as a header followed by an
182  *   arbitrary number of nexthop pointers.
183  * 2) control plane part, with nhgrp_priv as a header, followed by
184  *   an arbirtrary number of 'struct weightened_nhop' object.
185  *
186  * Given nexthop groups are (mostly) immutable, allocate all data
187  * in one go.
188  *
189  */
190 __noinline static size_t
191 get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
192 {
193 	size_t sz;
194 
195 	sz = sizeof(struct nhgrp_object);
196 	sz += nhg_size * sizeof(struct nhop_object *);
197 	sz += sizeof(struct nhgrp_priv);
198 	sz += num_nhops * sizeof(struct weightened_nhop);
199 	return (sz);
200 }
201 
202 /*
203  * Compile actual list of nexthops to be used by datapath from
204  *  the nexthop group @dst.
205  *
206  * For example, compiling control plane list of 2 nexthops
207  *  [(200, A), (100, B)] would result in the datapath array
208  *  [A, A, B]
209  */
210 static void
211 compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
212     uint32_t num_slots)
213 {
214 	struct nhgrp_object *dst;
215 	int i, slot_idx, remaining_slots;
216 	uint64_t remaining_sum, nh_weight, nh_slots;
217 
218 	slot_idx  = 0;
219 	dst = dst_priv->nhg;
220 	/* Calculate sum of all weights */
221 	remaining_sum = 0;
222 	for (i = 0; i < dst_priv->nhg_nh_count; i++)
223 		remaining_sum += x[i].weight;
224 	remaining_slots = num_slots;
225 	DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
226 	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
227 		/* Calculate number of slots for the current nexthop */
228 		if (remaining_sum > 0) {
229 			nh_weight = (uint64_t)x[i].weight;
230 			nh_slots = (nh_weight * remaining_slots / remaining_sum);
231 		} else
232 			nh_slots = 0;
233 
234 		remaining_sum -= x[i].weight;
235 		remaining_slots -= nh_slots;
236 
237 		DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
238 		    (uint32_t)remaining_sum, remaining_slots,
239 		    (int)nh_slots, slot_idx);
240 
241 		KASSERT((slot_idx + nh_slots <= num_slots),
242 		    ("index overflow during nhg compilation"));
243 		while (nh_slots-- > 0)
244 			dst->nhops[slot_idx++] = x[i].nh;
245 	}
246 }
247 
248 /*
249  * Allocates new nexthop group for the list of weightened nexthops.
250  * Assume sorted list.
251  * Does NOT reference any nexthops in the group.
252  * Returns group with refcount=1 or NULL.
253  */
254 static struct nhgrp_priv *
255 alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
256 {
257 	uint32_t nhgrp_size;
258 	int flags = M_NOWAIT;
259 	struct nhgrp_object *nhg;
260 	struct nhgrp_priv *nhg_priv;
261 
262 	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
263 	if (nhgrp_size == 0) {
264 		/* Zero weights, abort */
265 		return (NULL);
266 	}
267 
268 	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
269 	nhg = malloc(sz, M_NHOP, flags | M_ZERO);
270 	if (nhg == NULL) {
271 		return (NULL);
272 	}
273 
274 	/* Has to be the first to make NHGRP_PRIV() work */
275 	nhg->nhg_size = nhgrp_size;
276 	DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
277 	nhg->nhg_flags = MPF_MULTIPATH;
278 
279 	nhg_priv = NHGRP_PRIV(nhg);
280 	nhg_priv->nhg_nh_count = num_nhops;
281 	refcount_init(&nhg_priv->nhg_refcount, 1);
282 
283 	/* Please see nhgrp_free() comments on the initial value */
284 	refcount_init(&nhg_priv->nhg_linked, 2);
285 
286 	nhg_priv->nhg = nhg;
287 	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
288 	  num_nhops * sizeof(struct weightened_nhop));
289 
290 	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
291 
292 	return (nhg_priv);
293 }
294 
295 void
296 nhgrp_ref_object(struct nhgrp_object *nhg)
297 {
298 	struct nhgrp_priv *nhg_priv;
299 	u_int old;
300 
301 	nhg_priv = NHGRP_PRIV(nhg);
302 	old = refcount_acquire(&nhg_priv->nhg_refcount);
303 	KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
304 }
305 
306 void
307 nhgrp_free(struct nhgrp_object *nhg)
308 {
309 	struct nhgrp_priv *nhg_priv;
310 	struct nh_control *ctl;
311 	struct epoch_tracker et;
312 
313 	nhg_priv = NHGRP_PRIV(nhg);
314 
315 	if (!refcount_release(&nhg_priv->nhg_refcount))
316 		return;
317 
318 	/*
319 	 * group objects don't have an explicit lock attached to it.
320 	 * As groups are reclaimed based on reference count, it is possible
321 	 * that some groups will persist after vnet destruction callback
322 	 * called. Given that, handle scenario with nhgrp_free_group() being
323 	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
324 	 * by using another reference counter: nhg_linked.
325 	 *
326 	 * There are only 2 places, where nhg_linked can be decreased:
327 	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
328 	 * nhg_link can never be increased.
329 	 *
330 	 * Hence, use initial value of 2 to make use of
331 	 *  refcount_release_if_not_last().
332 	 *
333 	 * There can be two scenarious when calling this function:
334 	 *
335 	 * 1) nhg_linked value is 2. This means that either
336 	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
337 	 *  but we are guaranteed that nh_control won't be freed in
338 	 *  this epoch. Hence, nexthop can be safely unlinked.
339 	 *
340 	 * 2) nh_linked value is 1. In that case, nhgrp_ctl_unlink_all()
341 	 *  has been called and nhgrp unlink can be skipped.
342 	 */
343 
344 	NET_EPOCH_ENTER(et);
345 	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
346 		ctl = nhg_priv->nh_control;
347 		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
348 			/* Do not try to reclaim */
349 			DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
350 			NET_EPOCH_EXIT(et);
351 			return;
352 		}
353 	}
354 	NET_EPOCH_EXIT(et);
355 
356 	epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
357 	    &nhg_priv->nhg_epoch_ctx);
358 }
359 
360 /*
361  * Destroys all local resources belonging to @nhg_priv.
362  */
363 __noinline static void
364 destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
365 {
366 
367 	free(nhg_priv->nhg, M_NHOP);
368 }
369 
370 __noinline static void
371 destroy_nhgrp(struct nhgrp_priv *nhg_priv)
372 {
373 
374 	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
375 
376 	DPRINTF("DEL MPATH %p", nhg_priv);
377 
378 	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
379 
380 	free_nhgrp_nhops(nhg_priv);
381 
382 	destroy_nhgrp_int(nhg_priv);
383 }
384 
385 /*
386  * Epoch callback indicating group is safe to destroy
387  */
388 static void
389 destroy_nhgrp_epoch(epoch_context_t ctx)
390 {
391 	struct nhgrp_priv *nhg_priv;
392 
393 	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
394 
395 	destroy_nhgrp(nhg_priv);
396 }
397 
398 static bool
399 ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
400 {
401 
402 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
403 		if (nhop_try_ref_object(nhg_priv->nhg_nh_weights[i].nh) != 0)
404 			continue;
405 
406 		/*
407 		 * Failed to ref the nexthop, b/c it's deleted.
408 		 * Need to rollback references back.
409 		 */
410 		for (int j = 0; j < i; j++)
411 			nhop_free(nhg_priv->nhg_nh_weights[j].nh);
412 		return (false);
413 	}
414 
415 	return (true);
416 }
417 
418 static void
419 free_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
420 {
421 
422 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++)
423 		nhop_free(nhg_priv->nhg_nh_weights[i].nh);
424 }
425 
426 /*
427  * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
428  *
429  * Returns referenced nhop group or NULL, passing error code in @perror.
430  */
431 struct nhgrp_priv *
432 get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
433     int *perror)
434 {
435 	struct nhgrp_priv *key, *nhg_priv;
436 
437 	if (num_nhops > RIB_MAX_MPATH_WIDTH) {
438 		*perror = E2BIG;
439 		return (NULL);
440 	}
441 
442 	if (ctl->gr_head.hash_size == 0) {
443 		/* First multipath request. Bootstrap mpath datastructures. */
444 		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
445 			*perror = ENOMEM;
446 			return (NULL);
447 		}
448 	}
449 
450 	/* Sort nexthops & check there are no duplicates */
451 	sort_weightened_nhops(wn, num_nhops);
452 	uint32_t last_id = 0;
453 	for (int i = 0; i < num_nhops; i++) {
454 		if (wn[i].nh->nh_priv->nh_idx == last_id) {
455 			*perror = EEXIST;
456 			return (NULL);
457 		}
458 		last_id = wn[i].nh->nh_priv->nh_idx;
459 	}
460 
461 	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
462 		*perror = ENOMEM;
463 		return (NULL);
464 	}
465 
466 	nhg_priv = find_nhgrp(ctl, key);
467 	if (nhg_priv != NULL) {
468 		/*
469 		 * Free originally-created group. As it hasn't been linked
470 		 *  and the dependent nexhops haven't been referenced, just free
471 		 *  the group.
472 		 */
473 		destroy_nhgrp_int(key);
474 		*perror = 0;
475 		return (nhg_priv);
476 	} else {
477 		/* No existing group, try to link the new one */
478 		if (!ref_nhgrp_nhops(key)) {
479 			/*
480 			 * Some of the nexthops have been scheduled for deletion.
481 			 * As the group hasn't been linked / no nexhops have been
482 			 *  referenced, call the final destructor immediately.
483 			 */
484 			destroy_nhgrp_int(key);
485 			*perror = EAGAIN;
486 			return (NULL);
487 		}
488 		if (link_nhgrp(ctl, key) == 0) {
489 			/* Unable to allocate index? */
490 			*perror = EAGAIN;
491 			free_nhgrp_nhops(key);
492 			destroy_nhgrp_int(key);
493 			return (NULL);
494 		}
495 		*perror = 0;
496 		return (key);
497 	}
498 
499 	/* NOTREACHED */
500 }
501 
502 /*
503  * Appends one or more nexthops denoted by @wm to the nexthop group @gr_orig.
504  *
505  * Returns referenced nexthop group or NULL. In the latter case, @perror is
506  *  filled with an error code.
507  * Note that function does NOT care if the next nexthops already exists
508  * in the @gr_orig. As a result, they will be added, resulting in the
509  * same nexthop being present multiple times in the new group.
510  */
511 static struct nhgrp_priv *
512 append_nhops(struct nh_control *ctl, const struct nhgrp_object *gr_orig,
513     struct weightened_nhop *wn, int num_nhops, int *perror)
514 {
515 	char storage[64];
516 	struct weightened_nhop *pnhops;
517 	struct nhgrp_priv *nhg_priv;
518 	const struct nhgrp_priv *src_priv;
519 	size_t sz;
520 	int curr_nhops;
521 
522 	src_priv = NHGRP_PRIV_CONST(gr_orig);
523 	curr_nhops = src_priv->nhg_nh_count;
524 
525 	*perror = 0;
526 
527 	sz = (src_priv->nhg_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
528 	/* optimize for <= 4 paths, each path=16 bytes */
529 	if (sz <= sizeof(storage))
530 		pnhops = (struct weightened_nhop *)&storage[0];
531 	else {
532 		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
533 		if (pnhops == NULL) {
534 			*perror = ENOMEM;
535 			return (NULL);
536 		}
537 	}
538 
539 	/* Copy nhops from original group first */
540 	memcpy(pnhops, src_priv->nhg_nh_weights,
541 	  curr_nhops * sizeof(struct weightened_nhop));
542 	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
543 	curr_nhops += num_nhops;
544 
545 	nhg_priv = get_nhgrp(ctl, pnhops, curr_nhops, perror);
546 
547 	if (pnhops != (struct weightened_nhop *)&storage[0])
548 		free(pnhops, M_TEMP);
549 
550 	if (nhg_priv == NULL)
551 		return (NULL);
552 
553 	return (nhg_priv);
554 }
555 
556 
557 /*
558  * Creates/finds nexthop group based on @wn and @num_nhops.
559  * Returns 0 on success with referenced group in @rnd, or
560  * errno.
561  *
562  * If the error is EAGAIN, then the operation can be retried.
563  */
564 int
565 nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
566     struct route_nhop_data *rnd)
567 {
568 	struct nh_control *ctl = rh->nh_control;
569 	struct nhgrp_priv *nhg_priv;
570 	int error;
571 
572 	nhg_priv = get_nhgrp(ctl, wn, num_nhops, &error);
573 	if (nhg_priv != NULL)
574 		rnd->rnd_nhgrp = nhg_priv->nhg;
575 	rnd->rnd_weight = 0;
576 
577 	return (error);
578 }
579 
580 /*
581  * Creates new nexthop group based on @src group without the nexthops
582  * chosen by @flt_func.
583  * Returns 0 on success, storring the reference nhop group/object in @rnd.
584  */
585 int
586 nhgrp_get_filtered_group(struct rib_head *rh, const struct nhgrp_object *src,
587     nhgrp_filter_cb_t flt_func, void *flt_data, struct route_nhop_data *rnd)
588 {
589 	char storage[64];
590 	struct nh_control *ctl = rh->nh_control;
591 	struct weightened_nhop *pnhops;
592 	const struct nhgrp_priv *mp_priv, *src_priv;
593 	size_t sz;
594 	int error, i, num_nhops;
595 
596 	src_priv = NHGRP_PRIV_CONST(src);
597 
598 	sz = src_priv->nhg_nh_count * (sizeof(struct weightened_nhop));
599 	/* optimize for <= 4 paths, each path=16 bytes */
600 	if (sz <= sizeof(storage))
601 		pnhops = (struct weightened_nhop *)&storage[0];
602 	else {
603 		if ((pnhops = malloc(sz, M_TEMP, M_NOWAIT)) == NULL)
604 			return (ENOMEM);
605 	}
606 
607 	/* Filter nexthops */
608 	error = 0;
609 	num_nhops = 0;
610 	for (i = 0; i < src_priv->nhg_nh_count; i++) {
611 		if (flt_func(src_priv->nhg_nh_weights[i].nh, flt_data))
612 			continue;
613 		memcpy(&pnhops[num_nhops++], &src_priv->nhg_nh_weights[i],
614 		  sizeof(struct weightened_nhop));
615 	}
616 
617 	if (num_nhops == 0) {
618 		rnd->rnd_nhgrp = NULL;
619 		rnd->rnd_weight = 0;
620 	} else if (num_nhops == 1) {
621 		rnd->rnd_nhop = pnhops[0].nh;
622 		rnd->rnd_weight = pnhops[0].weight;
623 		if (nhop_try_ref_object(rnd->rnd_nhop) == 0)
624 			error = EAGAIN;
625 	} else {
626 		mp_priv = get_nhgrp(ctl, pnhops, num_nhops, &error);
627 		if (mp_priv != NULL)
628 			rnd->rnd_nhgrp = mp_priv->nhg;
629 		rnd->rnd_weight = 0;
630 	}
631 
632 	if (pnhops != (struct weightened_nhop *)&storage[0])
633 		free(pnhops, M_TEMP);
634 
635 	return (error);
636 }
637 
638 /*
639  * Creates new multipath group based on existing group/nhop in @rnd_orig and
640  *  to-be-added nhop @wn_add.
641  * Returns 0 on success and stores result in @rnd_new.
642  */
643 int
644 nhgrp_get_addition_group(struct rib_head *rh, struct route_nhop_data *rnd_orig,
645     struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_new)
646 {
647 	struct nh_control *ctl = rh->nh_control;
648 	struct nhgrp_priv *nhg_priv;
649 	struct weightened_nhop wn[2] = {};
650 	int error;
651 
652 	if (rnd_orig->rnd_nhop == NULL) {
653 		/* No paths to add to, just reference current nhop */
654 		*rnd_new = *rnd_add;
655 		if (nhop_try_ref_object(rnd_new->rnd_nhop) == 0)
656 			return (EAGAIN);
657 		return (0);
658 	}
659 
660 	wn[0].nh = rnd_add->rnd_nhop;
661 	wn[0].weight = rnd_add->rnd_weight;
662 
663 	if (!NH_IS_NHGRP(rnd_orig->rnd_nhop)) {
664 		/* Simple merge of 2 non-multipath nexthops */
665 		wn[1].nh = rnd_orig->rnd_nhop;
666 		wn[1].weight = rnd_orig->rnd_weight;
667 		nhg_priv = get_nhgrp(ctl, wn, 2, &error);
668 	} else {
669 		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
670 		nhg_priv = append_nhops(ctl, rnd_orig->rnd_nhgrp, &wn[0], 1,
671 		    &error);
672 	}
673 
674 	if (nhg_priv == NULL)
675 		return (error);
676 	rnd_new->rnd_nhgrp = nhg_priv->nhg;
677 	rnd_new->rnd_weight = 0;
678 
679 	return (0);
680 }
681 
682 /*
683  * Returns pointer to array of nexthops with weights for
684  * given @nhg. Stores number of items in the array into @pnum_nhops.
685  */
686 struct weightened_nhop *
687 nhgrp_get_nhops(struct nhgrp_object *nhg, uint32_t *pnum_nhops)
688 {
689 	struct nhgrp_priv *nhg_priv;
690 
691 	KASSERT(((nhg->nhg_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
692 
693 	nhg_priv = NHGRP_PRIV(nhg);
694 	*pnum_nhops = nhg_priv->nhg_nh_count;
695 
696 	return (nhg_priv->nhg_nh_weights);
697 }
698 
699 __noinline static int
700 dump_nhgrp_entry(struct rib_head *rh, const struct nhgrp_priv *nhg_priv,
701     char *buffer, size_t buffer_size, struct sysctl_req *w)
702 {
703 	struct rt_msghdr *rtm;
704 	struct nhgrp_external *nhge;
705 	struct nhgrp_container *nhgc;
706 	const struct nhgrp_object *nhg;
707 	struct nhgrp_nhop_external *ext;
708 	int error;
709 	size_t sz;
710 
711 	nhg = nhg_priv->nhg;
712 
713 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
714 	/* controlplane nexthops */
715 	sz += sizeof(struct nhgrp_container);
716 	sz += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
717 	/* dataplane nexthops */
718 	sz += sizeof(struct nhgrp_container);
719 	sz += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
720 
721 	KASSERT(sz <= buffer_size, ("increase nhgrp buffer size"));
722 
723 	bzero(buffer, sz);
724 
725 	rtm = (struct rt_msghdr *)buffer;
726 	rtm->rtm_msglen = sz;
727 	rtm->rtm_version = RTM_VERSION;
728 	rtm->rtm_type = RTM_GET;
729 
730 	nhge = (struct nhgrp_external *)(rtm + 1);
731 
732 	nhge->nhg_idx = nhg_priv->nhg_idx;
733 	nhge->nhg_refcount = nhg_priv->nhg_refcount;
734 
735 	/* fill in control plane nexthops firs */
736 	nhgc = (struct nhgrp_container *)(nhge + 1);
737 	nhgc->nhgc_type = NHG_C_TYPE_CNHOPS;
738 	nhgc->nhgc_subtype = 0;
739 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
740 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg_priv->nhg_nh_count;
741 	nhgc->nhgc_count = nhg_priv->nhg_nh_count;
742 
743 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
744 	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
745 		ext[i].nh_idx = nhg_priv->nhg_nh_weights[i].nh->nh_priv->nh_idx;
746 		ext[i].nh_weight = nhg_priv->nhg_nh_weights[i].weight;
747 	}
748 
749 	/* fill in dataplane nexthops */
750 	nhgc = (struct nhgrp_container *)(&ext[nhg_priv->nhg_nh_count]);
751 	nhgc->nhgc_type = NHG_C_TYPE_DNHOPS;
752 	nhgc->nhgc_subtype = 0;
753 	nhgc->nhgc_len = sizeof(struct nhgrp_container);
754 	nhgc->nhgc_len += sizeof(struct nhgrp_nhop_external) * nhg->nhg_size;
755 	nhgc->nhgc_count = nhg->nhg_size;
756 
757 	ext = (struct nhgrp_nhop_external *)(nhgc + 1);
758 	for (int i = 0; i < nhg->nhg_size; i++) {
759 		ext[i].nh_idx = nhg->nhops[i]->nh_priv->nh_idx;
760 		ext[i].nh_weight = 0;
761 	}
762 
763 	error = SYSCTL_OUT(w, buffer, sz);
764 
765 	return (error);
766 }
767 
768 uint32_t
769 nhgrp_get_idx(const struct nhgrp_object *nhg)
770 {
771 	const struct nhgrp_priv *nhg_priv;
772 
773 	nhg_priv = NHGRP_PRIV_CONST(nhg);
774 	return (nhg_priv->nhg_idx);
775 }
776 
777 uint32_t
778 nhgrp_get_count(struct rib_head *rh)
779 {
780 	struct nh_control *ctl;
781 	uint32_t count;
782 
783 	ctl = rh->nh_control;
784 
785 	NHOPS_RLOCK(ctl);
786 	count = ctl->gr_head.items_count;
787 	NHOPS_RUNLOCK(ctl);
788 
789 	return (count);
790 }
791 
792 int
793 nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
794 {
795 	struct nh_control *ctl = rh->nh_control;
796 	struct epoch_tracker et;
797 	struct nhgrp_priv *nhg_priv;
798 	char *buffer;
799 	size_t sz;
800 	int error = 0;
801 
802 	if (ctl->gr_head.items_count == 0)
803 		return (0);
804 
805 	/* Calculate the maximum nhop group size in bytes */
806 	sz = sizeof(struct rt_msghdr) + sizeof(struct nhgrp_external);
807 	sz += 2 * sizeof(struct nhgrp_container);
808 	sz += 2 * sizeof(struct nhgrp_nhop_external) * RIB_MAX_MPATH_WIDTH;
809 	buffer = malloc(sz, M_TEMP, M_NOWAIT);
810 	if (buffer == NULL)
811 		return (ENOMEM);
812 
813 	NET_EPOCH_ENTER(et);
814 	NHOPS_RLOCK(ctl);
815 	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
816 		error = dump_nhgrp_entry(rh, nhg_priv, buffer, sz, w);
817 		if (error != 0)
818 			break;
819 	} CHT_SLIST_FOREACH_END;
820 	NHOPS_RUNLOCK(ctl);
821 	NET_EPOCH_EXIT(et);
822 
823 	free(buffer, M_TEMP);
824 
825 	return (error);
826 }
827