xref: /freebsd/sys/net/route/fib_algo.c (revision 4d846d26)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Alexander V. Chernikov
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33 
34 #include <sys/param.h>
35 #include <sys/eventhandler.h>
36 #include <sys/kernel.h>
37 #include <sys/sbuf.h>
38 #include <sys/lock.h>
39 #include <sys/rmlock.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/module.h>
43 #include <sys/kernel.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/syslog.h>
50 #include <sys/queue.h>
51 #include <net/vnet.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 
56 #include <netinet/in.h>
57 #include <netinet/in_var.h>
58 #include <netinet/ip.h>
59 #include <netinet/ip_var.h>
60 #ifdef INET6
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #endif
64 
65 #include <net/route.h>
66 #include <net/route/nhop.h>
67 #include <net/route/route_ctl.h>
68 #include <net/route/route_var.h>
69 #include <net/route/fib_algo.h>
70 
71 #include <machine/stdarg.h>
72 
73 /*
74  * Fib lookup framework.
75  *
76  * This framework enables accelerated longest-prefix-match lookups for the
77  *  routing tables by adding the ability to dynamically attach/detach lookup
78  *  algorithms implementation to/from the datapath.
79  *
80  * flm - fib lookup modules - implementation of particular lookup algorithm
81  * fd - fib data - instance of an flm bound to specific routing table
82  *
83  * This file provides main framework functionality.
84  *
85  * The following are the features provided by the framework
86  *
87  * 1) nexhops abstraction -> provides transparent referencing, indexing
88  *   and efficient idx->ptr mappings for nexthop and nexthop groups.
89  * 2) Routing table synchronisation
90  * 3) dataplane attachment points
91  * 4) automatic algorithm selection based on the provided preference.
92  *
93  *
94  * DATAPATH
95  * For each supported address family, there is a an allocated array of fib_dp
96  *  structures, indexed by fib number. Each array entry contains callback function
97  *  and its argument. This function will be called with a family-specific lookup key,
98  *  scope and provided argument. This array gets re-created every time when new algo
99  *  instance gets created. Please take a look at the replace_rtables_family() function
100  *  for more details.
101  *
102  */
103 
104 SYSCTL_DECL(_net_route);
105 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
106     "Fib algorithm lookups");
107 
108 /* Algorithm sync policy */
109 
110 /* Time interval to bucket updates */
111 VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
112 #define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
113 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
114     &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
115 
116 /* Minimum update rate to delay sync */
117 VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
118 #define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
119 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
120     &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
121 
122 /* Max allowed delay to sync */
123 VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
124 #define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
125 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
126     &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
127 
128 
129 #ifdef INET6
130 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
131 #define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
132 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
133     "IPv6 longest prefix match lookups");
134 #endif
135 #ifdef INET
136 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
137 #define	V_algo_fixed_inet	VNET(algo_fixed_inet)
138 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
139     "IPv4 longest prefix match lookups");
140 #endif
141 
142 /* Fib instance counter */
143 static uint32_t fib_gen = 0;
144 
145 struct nhop_ref_table {
146 	uint32_t		count;
147 	int32_t			refcnt[0];
148 };
149 
150 enum fib_callout_action {
151 	FDA_NONE,	/* No callout scheduled */
152 	FDA_REBUILD,	/* Asks to rebuild algo instance */
153 	FDA_EVAL,	/* Asks to evaluate if the current algo is still be best */
154 	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
155 };
156 
157 struct fib_sync_status {
158 	struct timeval		diverge_time;	/* ts when diverged */
159 	uint32_t		num_changes;	/* number of changes since sync */
160 	uint32_t		bucket_changes;	/* num changes within the current bucket */
161 	uint64_t		bucket_id;	/* 50ms bucket # */
162 	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
163 };
164 
165 /*
166  * Data structure for the fib lookup instance tied to the particular rib.
167  */
168 struct fib_data {
169 	uint32_t		number_nhops;	/* current # of nhops */
170 	uint8_t			hit_nhops;	/* true if out of nhop limit */
171 	uint8_t			init_done;	/* true if init is competed */
172 	uint32_t		fd_dead:1;	/* Scheduled for deletion */
173 	uint32_t		fd_linked:1;	/* true if linked */
174 	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
175 	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
176 	uint8_t			fd_family;	/* family */
177 	uint32_t		fd_fibnum;	/* fibnum */
178 	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
179 	uint32_t		fd_gen;		/* instance gen# */
180 	struct callout		fd_callout;	/* rebuild callout */
181 	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
182 	void			*fd_algo_data;	/* algorithm data */
183 	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
184 	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
185 	struct rib_head		*fd_rh;		/* RIB table we're attached to */
186 	struct rib_subscription	*fd_rs;		/* storing table subscription */
187 	struct fib_dp		fd_dp;		/* fib datapath data */
188 	struct vnet		*fd_vnet;	/* vnet fib belongs to */
189 	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
190 	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
191 	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
192 	uint32_t		fd_num_changes;	/* number of changes since last callout */
193 	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
194 };
195 
196 static bool rebuild_fd(struct fib_data *fd, const char *reason);
197 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
198 static void handle_fd_callout(void *_data);
199 static void destroy_fd_instance_epoch(epoch_context_t ctx);
200 static bool is_idx_free(struct fib_data *fd, uint32_t index);
201 static void set_algo_fixed(struct rib_head *rh);
202 static bool is_algo_fixed(struct rib_head *rh);
203 
204 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
205 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
206 
207 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
208     struct fib_lookup_module *orig_flm);
209 static void fib_unref_algo(struct fib_lookup_module *flm);
210 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
211 
212 struct mtx fib_mtx;
213 #define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
214 #define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
215 #define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
216 
217 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
218 
219 /* Algorithm has to be this percent better than the current to switch */
220 #define	BEST_DIFF_PERCENT	(5 * 256 / 100)
221 /* Schedule algo re-evaluation X seconds after a change */
222 #define	ALGO_EVAL_DELAY_MS	30000
223 /* Force algo re-evaluation after X changes */
224 #define	ALGO_EVAL_NUM_ROUTES	100
225 /* Try to setup algorithm X times */
226 #define	FIB_MAX_TRIES		32
227 /* Max amount of supported nexthops */
228 #define	FIB_MAX_NHOPS		262144
229 #define	FIB_CALLOUT_DELAY_MS	50
230 
231 
232 /* Debug */
233 static int flm_debug_level = LOG_NOTICE;
234 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
235     &flm_debug_level, 0, "debuglevel");
236 #define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
237 #ifndef	LOG_DEBUG2
238 #define	LOG_DEBUG2	8
239 #endif
240 
241 #define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
242 #define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
243 	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
244 }
245 #define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
246     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
247     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
248 #define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
249     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
250 #define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
251     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
252 }
253 #define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
254 #define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
255     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
256     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
257 }
258 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
259 #define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
260 #else
261 #define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
262 #endif
263 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
264 #define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
265 #else
266 #define	FD_PRINTF_LOG_DEBUG()
267 #endif
268 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
269 #define	FD_PRINTF_LOG_INFO	_FD_PRINTF
270 #else
271 #define	FD_PRINTF_LOG_INFO()
272 #endif
273 #define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
274 #define	FD_PRINTF_LOG_ERR	_FD_PRINTF
275 #define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
276 
277 
278 /* List of all registered lookup algorithms */
279 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
280 
281 /* List of all fib lookup instances in the vnet */
282 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
283 #define	V_fib_data_list	VNET(fib_data_list)
284 
285 /* Datastructure for storing non-transient fib lookup module failures */
286 struct fib_error {
287 	int				fe_family;
288 	uint32_t			fe_fibnum;	/* failed rtable */
289 	struct fib_lookup_module	*fe_flm;	/* failed module */
290 	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
291 };
292 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
293 #define	V_fib_error_list VNET(fib_error_list)
294 
295 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
296 struct fib_dp_header {
297 	struct epoch_context	fdh_epoch_ctx;
298 	uint32_t		fdh_num_tables;
299 	struct fib_dp		fdh_idx[0];
300 };
301 
302 /*
303  * Tries to add new non-transient algorithm error to the list of
304  *  errors.
305  * Returns true on success.
306  */
307 static bool
308 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
309 {
310 	struct fib_error *fe;
311 
312 	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
313 	if (fe == NULL)
314 		return (false);
315 	fe->fe_flm = flm;
316 	fe->fe_family = flm->flm_family;
317 	fe->fe_fibnum = fibnum;
318 
319 	FIB_MOD_LOCK();
320 	/* Avoid duplicates by checking if error already exists first */
321 	if (flm_error_check(flm, fibnum)) {
322 		FIB_MOD_UNLOCK();
323 		free(fe, M_TEMP);
324 		return (true);
325 	}
326 	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
327 	FIB_MOD_UNLOCK();
328 
329 	return (true);
330 }
331 
332 /*
333  * True if non-transient error has been registered for @flm in @fibnum.
334  */
335 static bool
336 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
337 {
338 	const struct fib_error *fe;
339 
340 	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
341 		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
342 			return (true);
343 	}
344 
345 	return (false);
346 }
347 
348 /*
349  * Clear all errors of algo specified by @flm.
350  */
351 static void
352 fib_error_clear_flm(struct fib_lookup_module *flm)
353 {
354 	struct fib_error *fe, *fe_tmp;
355 
356 	FIB_MOD_LOCK_ASSERT();
357 
358 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
359 		if (fe->fe_flm == flm) {
360 			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
361 			free(fe, M_TEMP);
362 		}
363 	}
364 }
365 
366 /*
367  * Clears all errors in current VNET.
368  */
369 static void
370 fib_error_clear(void)
371 {
372 	struct fib_error *fe, *fe_tmp;
373 
374 	FIB_MOD_LOCK_ASSERT();
375 
376 	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
377 		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
378 		free(fe, M_TEMP);
379 	}
380 }
381 
382 static const char *
383 print_op_result(enum flm_op_result result)
384 {
385 	switch (result) {
386 	case FLM_SUCCESS:
387 		return "success";
388 	case FLM_REBUILD:
389 		return "rebuild";
390 	case FLM_BATCH:
391 		return "batch";
392 	case FLM_ERROR:
393 		return "error";
394 	}
395 
396 	return "unknown";
397 }
398 
399 static const char *
400 print_family(int family)
401 {
402 
403 	if (family == AF_INET)
404 		return ("inet");
405 	else if (family == AF_INET6)
406 		return ("inet6");
407 	else
408 		return ("unknown");
409 }
410 
411 /*
412  * Debug function used by lookup algorithms.
413  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
414  */
415 void
416 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
417 {
418 	char buf[128];
419 	va_list ap;
420 
421 	if (level > flm_debug_level)
422 		return;
423 
424 	va_start(ap, fmt);
425 	vsnprintf(buf, sizeof(buf), fmt, ap);
426 	va_end(ap);
427 
428 	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
429 	    fd->fd_gen, func, "%s", buf);
430 }
431 
432 /*
433  * Outputs list of algorithms supported by the provided address family.
434  */
435 static int
436 print_algos_sysctl(struct sysctl_req *req, int family)
437 {
438 	struct fib_lookup_module *flm;
439 	struct sbuf sbuf;
440 	int error, count = 0;
441 
442 	error = sysctl_wire_old_buffer(req, 0);
443 	if (error == 0) {
444 		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
445 		TAILQ_FOREACH(flm, &all_algo_list, entries) {
446 			if (flm->flm_family == family) {
447 				if (count++ > 0)
448 					sbuf_cat(&sbuf, ", ");
449 				sbuf_cat(&sbuf, flm->flm_name);
450 			}
451 		}
452 		error = sbuf_finish(&sbuf);
453 		sbuf_delete(&sbuf);
454 	}
455 	return (error);
456 }
457 
458 #ifdef INET6
459 static int
460 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
461 {
462 
463 	return (print_algos_sysctl(req, AF_INET6));
464 }
465 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
466     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
467     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
468 #endif
469 
470 #ifdef INET
471 static int
472 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
473 {
474 
475 	return (print_algos_sysctl(req, AF_INET));
476 }
477 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
478     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
479     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
480 #endif
481 
482 /*
483  * Calculate delay between repeated failures.
484  * Returns current delay in milliseconds.
485  */
486 static uint32_t
487 callout_calc_delay_ms(struct fib_data *fd)
488 {
489 	uint32_t shift;
490 
491 	if (fd->fd_failed_rebuilds > 10)
492 		shift = 10;
493 	else
494 		shift = fd->fd_failed_rebuilds;
495 
496 	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
497 }
498 
499 static void
500 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
501 {
502 
503 	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
504 	fd->fd_callout_action = action;
505 	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
506 	    handle_fd_callout, fd, 0);
507 }
508 
509 static void
510 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
511 {
512 
513 	RIB_WLOCK_ASSERT(fd->fd_rh);
514 
515 	if (!fd->fd_need_rebuild) {
516 		fd->fd_need_rebuild = true;
517 		/* Stop batch updates */
518 		fd->fd_batch = false;
519 
520 		/*
521 		 * Potentially re-schedules pending callout
522 		 *  initiated by schedule_algo_eval.
523 		 */
524 		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
525 		    reason, fd->fd_failed_rebuilds);
526 		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
527 	}
528 }
529 
530 static void
531 sync_rib_gen(struct fib_data *fd)
532 {
533 	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
534 	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
535 }
536 
537 static int64_t
538 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
539 {
540 	int64_t diff = 0;
541 
542 	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
543 	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
544 
545 	return (diff);
546 }
547 
548 static void
549 add_tv_diff_ms(struct timeval *tv, int ms)
550 {
551 	tv->tv_sec += ms / 1000;
552 	ms = ms % 1000;
553 	if (ms * 1000 + tv->tv_usec < 1000000)
554 		tv->tv_usec += ms * 1000;
555 	else {
556 		tv->tv_sec += 1;
557 		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
558 	}
559 }
560 
561 /*
562  * Marks the time when algo state diverges from the rib state.
563  */
564 static void
565 mark_diverge_time(struct fib_data *fd)
566 {
567 	struct fib_sync_status *fd_ss = &fd->fd_ss;
568 
569 	getmicrouptime(&fd_ss->diverge_time);
570 	fd_ss->bucket_id = 0;
571 	fd_ss->bucket_changes = 0;
572 }
573 
574 /*
575  * Calculates and updates the next algorithm sync time, based on the current activity.
576  *
577  * The intent is to provide reasonable balance between the update
578  *  latency and efficient batching when changing large amount of routes.
579  *
580  * High-level algorithm looks the following:
581  * 1) all changes are bucketed in 50ms intervals
582  * 2) If amount of changes within the bucket is greater than the threshold,
583  *   the update gets delayed, up to maximum delay threshold.
584  */
585 static void
586 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
587 {
588 	uint32_t bucket_id, new_delay = 0;
589 	struct timeval tv;
590 
591 	/* Fetch all variables at once to ensure consistent reads */
592 	uint32_t bucket_time_ms = V_update_bucket_time_ms;
593 	uint32_t threshold_rate = V_bucket_change_threshold_rate;
594 	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
595 
596 	if (bucket_time_ms == 0)
597 		bucket_time_ms = 50;
598 	/* calculate per-bucket threshold rate */
599 	threshold_rate = threshold_rate * bucket_time_ms / 1000;
600 
601 	getmicrouptime(&tv);
602 
603 	struct fib_sync_status *fd_ss = &fd->fd_ss;
604 
605 	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
606 
607 	if (fd_ss->bucket_id == bucket_id) {
608 		fd_ss->bucket_changes++;
609 		if (fd_ss->bucket_changes == threshold_rate) {
610 			new_delay = (bucket_id + 2) * bucket_time_ms;
611 			if (new_delay <= max_delay_ms) {
612 				FD_PRINTF(LOG_DEBUG, fd,
613 				    "hit threshold of %u routes, delay update,"
614 				    "bucket: %u, total delay: %u",
615 				    threshold_rate, bucket_id + 1, new_delay);
616 			} else {
617 				new_delay = 0;
618 				FD_PRINTF(LOG_DEBUG, fd,
619 				    "maximum sync delay (%u ms) reached", max_delay_ms);
620 			}
621 		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
622 			new_delay = bucket_time_ms;
623 	} else {
624 		fd_ss->bucket_id = bucket_id;
625 		fd_ss->bucket_changes = 1;
626 	}
627 
628 	if (new_delay > 0) {
629 		/* Calculated time has been updated */
630 		struct timeval new_tv = fd_ss->diverge_time;
631 		add_tv_diff_ms(&new_tv, new_delay);
632 
633 		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
634 		schedule_callout(fd, action, delay_ms);
635 	}
636 }
637 
638 static void
639 update_algo_state(struct fib_data *fd)
640 {
641 
642 	RIB_WLOCK_ASSERT(fd->fd_rh);
643 
644 	if (fd->fd_batch || fd->fd_need_rebuild) {
645 		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
646 		update_rebuild_delay(fd, action);
647 		return;
648 	}
649 
650 	if (fd->fd_num_changes++ == 0) {
651 		/* Start callout to consider switch */
652 		if (!callout_pending(&fd->fd_callout))
653 			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
654 	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
655 		/* Reset callout to exec immediately */
656 		if (fd->fd_callout_action == FDA_EVAL)
657 			schedule_callout(fd, FDA_EVAL, 1);
658 	}
659 }
660 
661 static bool
662 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
663 {
664 	struct nhop_object *nh;
665 
666 	/* Sync addition/removal of interface routes */
667 	switch (rc->rc_cmd) {
668 	case RTM_ADD:
669 		nh = rc->rc_nh_new;
670 		if (!NH_IS_NHGRP(nh)) {
671 			if (!(nh->nh_flags & NHF_GATEWAY))
672 				return (true);
673 			if (nhop_get_rtflags(nh) & RTF_STATIC)
674 				return (true);
675 		}
676 		break;
677 	case RTM_DELETE:
678 		nh = rc->rc_nh_old;
679 		if (!NH_IS_NHGRP(nh)) {
680 			if (!(nh->nh_flags & NHF_GATEWAY))
681 				return (true);
682 			if (nhop_get_rtflags(nh) & RTF_STATIC)
683 				return (true);
684 		}
685 		break;
686 	}
687 
688 	return (false);
689 }
690 
691 static bool
692 apply_rtable_changes(struct fib_data *fd)
693 {
694 	enum flm_op_result result;
695 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
696 
697 	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
698 
699 	if (result == FLM_SUCCESS) {
700 		sync_rib_gen(fd);
701 		for (int i = 0; i < q->count; i++)
702 			if (q->entries[i].nh_old)
703 				fib_unref_nhop(fd, q->entries[i].nh_old);
704 		q->count = 0;
705 	}
706 	fd->fd_batch = false;
707 
708 	return (result == FLM_SUCCESS);
709 }
710 
711 static bool
712 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
713 {
714 	int plen = 0;
715 
716 	switch (fd->fd_family) {
717 #ifdef INET
718 	case AF_INET:
719 		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
720 		break;
721 #endif
722 #ifdef INET6
723 	case AF_INET6:
724 		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
725 		break;
726 #endif
727 	}
728 
729 	ce->plen = plen;
730 	ce->nh_old = rc->rc_nh_old;
731 	ce->nh_new = rc->rc_nh_new;
732 	if (ce->nh_new != NULL) {
733 		if (fib_ref_nhop(fd, ce->nh_new) == 0)
734 			return (false);
735 	}
736 
737 	return (true);
738 }
739 
740 static bool
741 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
742 {
743 	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
744 
745 	if (q->count >= q->size) {
746 		uint32_t q_size;
747 
748 		if (q->size == 0)
749 			q_size = 256; /* ~18k memory */
750 		else
751 			q_size = q->size * 2;
752 
753 		size_t size = q_size * sizeof(struct fib_change_entry);
754 		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
755 		if (a == NULL) {
756 			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
757 			    q_size);
758 			return (false);
759 		}
760 		q->entries = a;
761 		q->size = q_size;
762 	}
763 
764 	return (fill_change_entry(fd, &q->entries[q->count++], rc));
765 }
766 
767 /*
768  * Rib subscription handler. Checks if the algorithm is ready to
769  *  receive updates, handles nexthop refcounting and passes change
770  *  data to the algorithm callback.
771  */
772 static void
773 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
774     void *_data)
775 {
776 	struct fib_data *fd = (struct fib_data *)_data;
777 	enum flm_op_result result;
778 
779 	RIB_WLOCK_ASSERT(rnh);
780 
781 	/*
782 	 * There is a small gap between subscribing for route changes
783 	 *  and initiating rtable dump. Avoid receiving route changes
784 	 *  prior to finishing rtable dump by checking `init_done`.
785 	 */
786 	if (!fd->init_done)
787 		return;
788 
789 	bool immediate_sync = need_immediate_sync(fd, rc);
790 
791 	/* Consider scheduling algorithm re-evaluation */
792 	update_algo_state(fd);
793 
794 	/*
795 	 * If algo requested rebuild, stop sending updates by default.
796 	 * This simplifies nexthop refcount handling logic.
797 	 */
798 	if (fd->fd_need_rebuild) {
799 		if (immediate_sync)
800 			rebuild_fd(fd, "rtable change type enforced sync");
801 		return;
802 	}
803 
804 	/*
805 	 * Algo requested updates to be delivered in batches.
806 	 * Add the current change to the queue and return.
807 	 */
808 	if (fd->fd_batch) {
809 		if (immediate_sync) {
810 			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
811 				rebuild_fd(fd, "batch sync failed");
812 		} else {
813 			if (!queue_rtable_change(fd, rc))
814 				schedule_fd_rebuild(fd, "batch queue failed");
815 		}
816 		return;
817 	}
818 
819 	/*
820 	 * Maintain guarantee that every nexthop returned by the dataplane
821 	 *  lookup has > 0 refcount, so can be safely referenced within current
822 	 *  epoch.
823 	 */
824 	if (rc->rc_nh_new != NULL) {
825 		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
826 			/* ran out of indexes */
827 			schedule_fd_rebuild(fd, "ran out of nhop indexes");
828 			return;
829 		}
830 	}
831 
832 	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
833 
834 	switch (result) {
835 	case FLM_SUCCESS:
836 		sync_rib_gen(fd);
837 		/* Unref old nexthop on success */
838 		if (rc->rc_nh_old != NULL)
839 			fib_unref_nhop(fd, rc->rc_nh_old);
840 		break;
841 	case FLM_BATCH:
842 
843 		/*
844 		 * Algo asks to batch the changes.
845 		 */
846 		if (queue_rtable_change(fd, rc)) {
847 			if (!immediate_sync) {
848 				fd->fd_batch = true;
849 				mark_diverge_time(fd);
850 				update_rebuild_delay(fd, FDA_BATCH);
851 				break;
852 			}
853 			if (apply_rtable_changes(fd))
854 				break;
855 		}
856 		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
857 
858 	case FLM_REBUILD:
859 
860 		/*
861 		 * Algo is not able to apply the update.
862 		 * Schedule algo rebuild.
863 		 */
864 		if (!immediate_sync) {
865 			mark_diverge_time(fd);
866 			schedule_fd_rebuild(fd, "algo requested rebuild");
867 			break;
868 		}
869 
870 		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
871 		rebuild_fd(fd, "rtable change type enforced sync");
872 		break;
873 	case FLM_ERROR:
874 
875 		/*
876 		 * Algo reported a non-recoverable error.
877 		 * Record the error and schedule rebuild, which will
878 		 *  trigger best algo selection.
879 		 */
880 		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
881 		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
882 			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
883 		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
884 	}
885 }
886 
887 static void
888 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
889 {
890 
891 	if (old_fd == NULL) {
892 		// TODO: read from rtable
893 		fd->number_nhops = 16;
894 		return;
895 	}
896 
897 	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
898 		fd->number_nhops = 2 * old_fd->number_nhops;
899 	else
900 		fd->number_nhops = old_fd->number_nhops;
901 }
902 
903 struct walk_cbdata {
904 	struct fib_data		*fd;
905 	flm_dump_t		*func;
906 	enum flm_op_result	result;
907 };
908 
909 /*
910  * Handler called after all rtenties have been dumped.
911  * Performs post-dump framework checks and calls
912  * algo:flm_dump_end_cb().
913  *
914  * Updates walk_cbdata result.
915  */
916 static void
917 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
918 {
919 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
920 	struct fib_data *fd = w->fd;
921 
922 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
923 
924 	if (rnh->rib_dying) {
925 		w->result = FLM_ERROR;
926 		return;
927 	}
928 
929 	if (fd->hit_nhops) {
930 		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
931 		    fd->nh_ref_table->count);
932 		if (w->result == FLM_SUCCESS)
933 			w->result = FLM_REBUILD;
934 		return;
935 	}
936 
937 	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
938 		return;
939 
940 	/* Post-dump hook, dump successful */
941 	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
942 
943 	if (w->result == FLM_SUCCESS) {
944 		/* Mark init as done to allow routing updates */
945 		fd->init_done = 1;
946 	}
947 }
948 
949 /*
950  * Callback for each entry in rib.
951  * Calls algo:flm_dump_rib_item_cb func as a part of initial
952  *  route table synchronisation.
953  */
954 static int
955 sync_algo_cb(struct rtentry *rt, void *_data)
956 {
957 	struct walk_cbdata *w = (struct walk_cbdata *)_data;
958 
959 	RIB_WLOCK_ASSERT(w->fd->fd_rh);
960 
961 	if (w->result == FLM_SUCCESS && w->func) {
962 
963 		/*
964 		 * Reference nexthops to maintain guarantee that
965 		 *  each nexthop returned by datapath has > 0 references
966 		 *  and can be safely referenced within current epoch.
967 		 */
968 		struct nhop_object *nh = rt_get_raw_nhop(rt);
969 		if (fib_ref_nhop(w->fd, nh) != 0)
970 			w->result = w->func(rt, w->fd->fd_algo_data);
971 		else
972 			w->result = FLM_REBUILD;
973 	}
974 
975 	return (0);
976 }
977 
978 /*
979  * Dump all routing table state to the algo instance.
980  */
981 static enum flm_op_result
982 sync_algo(struct fib_data *fd)
983 {
984 	struct walk_cbdata w = {
985 		.fd = fd,
986 		.func = fd->fd_flm->flm_dump_rib_item_cb,
987 		.result = FLM_SUCCESS,
988 	};
989 
990 	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
991 
992 	FD_PRINTF(LOG_INFO, fd,
993 	    "initial dump completed (rtable version: %d), result: %s",
994 	    fd->fd_rh->rnh_gen, print_op_result(w.result));
995 
996 	return (w.result);
997 }
998 
999 /*
1000  * Schedules epoch-backed @fd instance deletion.
1001  * * Unlinks @fd from the list of active algo instances.
1002  * * Removes rib subscription.
1003  * * Stops callout.
1004  * * Schedules actual deletion.
1005  *
1006  * Assume @fd is already unlinked from the datapath.
1007  */
1008 static int
1009 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
1010 {
1011 	bool is_dead;
1012 
1013 	NET_EPOCH_ASSERT();
1014 	RIB_WLOCK_ASSERT(fd->fd_rh);
1015 
1016 	FIB_MOD_LOCK();
1017 	is_dead = fd->fd_dead;
1018 	if (!is_dead)
1019 		fd->fd_dead = true;
1020 	if (fd->fd_linked) {
1021 		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1022 		fd->fd_linked = false;
1023 	}
1024 	FIB_MOD_UNLOCK();
1025 	if (is_dead)
1026 		return (0);
1027 
1028 	FD_PRINTF(LOG_INFO, fd, "DETACH");
1029 
1030 	if (fd->fd_rs != NULL)
1031 		rib_unsubscribe_locked(fd->fd_rs);
1032 
1033 	/*
1034 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
1035 	 * will be executed, hence no _new_ callout schedules will happen.
1036 	 */
1037 	callout_stop(&fd->fd_callout);
1038 
1039 	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
1040 
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Wipe all fd instances from the list matching rib specified by @rh.
1046  * If @keep_first is set, remove all but the first record.
1047  */
1048 static void
1049 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
1050 {
1051 	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
1052 	struct fib_data *fd, *fd_tmp;
1053 	struct epoch_tracker et;
1054 
1055 	FIB_MOD_LOCK();
1056 	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
1057 		if (fd->fd_rh == rh) {
1058 			if (keep_first) {
1059 				keep_first = false;
1060 				continue;
1061 			}
1062 			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1063 			fd->fd_linked = false;
1064 			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
1065 		}
1066 	}
1067 	FIB_MOD_UNLOCK();
1068 
1069 	/* Pass 2: remove each entry */
1070 	NET_EPOCH_ENTER(et);
1071 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
1072 		if (!in_callout)
1073 			RIB_WLOCK(fd->fd_rh);
1074 		schedule_destroy_fd_instance(fd, in_callout);
1075 		if (!in_callout)
1076 			RIB_WUNLOCK(fd->fd_rh);
1077 	}
1078 	NET_EPOCH_EXIT(et);
1079 }
1080 
1081 void
1082 fib_destroy_rib(struct rib_head *rh)
1083 {
1084 
1085 	/*
1086 	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
1087 	 *  sync_algo() stage, preventing new entries to be added to the list
1088 	 *  of active algos. Remove all existing entries for the particular rib.
1089 	 */
1090 	fib_cleanup_algo(rh, false, false);
1091 }
1092 
1093 /*
1094  * Finalises fd destruction by freeing all fd resources.
1095  */
1096 static void
1097 destroy_fd_instance(struct fib_data *fd)
1098 {
1099 
1100 	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
1101 
1102 	/* Call destroy callback first */
1103 	if (fd->fd_algo_data != NULL)
1104 		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
1105 
1106 	/* Nhop table */
1107 	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
1108 		for (int i = 0; i < fd->number_nhops; i++) {
1109 			if (!is_idx_free(fd, i)) {
1110 				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
1111 				    i, fd->nh_idx[i]);
1112 				nhop_free_any(fd->nh_idx[i]);
1113 			}
1114 		}
1115 		free(fd->nh_idx, M_RTABLE);
1116 	}
1117 	if (fd->nh_ref_table != NULL)
1118 		free(fd->nh_ref_table, M_RTABLE);
1119 
1120 	if (fd->fd_ss.fd_change_queue.entries != NULL)
1121 		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
1122 
1123 	fib_unref_algo(fd->fd_flm);
1124 
1125 	free(fd, M_RTABLE);
1126 }
1127 
1128 /*
1129  * Epoch callback indicating fd is safe to destroy
1130  */
1131 static void
1132 destroy_fd_instance_epoch(epoch_context_t ctx)
1133 {
1134 	struct fib_data *fd;
1135 
1136 	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
1137 
1138 	CURVNET_SET(fd->fd_vnet);
1139 	destroy_fd_instance(fd);
1140 	CURVNET_RESTORE();
1141 }
1142 
1143 /*
1144  * Tries to setup fd instance.
1145  * - Allocates fd/nhop table
1146  * - Runs algo:flm_init_cb algo init
1147  * - Subscribes fd to the rib
1148  * - Runs rtable dump
1149  * - Adds instance to the list of active instances.
1150  *
1151  * Returns: operation result. Fills in @pfd with resulting fd on success.
1152  *
1153  */
1154 static enum flm_op_result
1155 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1156     struct fib_data *old_fd, struct fib_data **pfd)
1157 {
1158 	struct fib_data *fd;
1159 	size_t size;
1160 	enum flm_op_result result;
1161 
1162 	/* Allocate */
1163 	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
1164 	if (fd == NULL)  {
1165 		*pfd = NULL;
1166 		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
1167 		return (FLM_REBUILD);
1168 	}
1169 	*pfd = fd;
1170 
1171 	estimate_nhop_scale(old_fd, fd);
1172 
1173 	fd->fd_rh = rh;
1174 	fd->fd_family = rh->rib_family;
1175 	fd->fd_fibnum = rh->rib_fibnum;
1176 	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
1177 	fd->fd_vnet = curvnet;
1178 	fd->fd_flm = flm;
1179 
1180 	FIB_MOD_LOCK();
1181 	flm->flm_refcount++;
1182 	fd->fd_gen = ++fib_gen;
1183 	FIB_MOD_UNLOCK();
1184 
1185 	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
1186 
1187 	/* Allocate nhidx -> nhop_ptr table */
1188 	size = fd->number_nhops * sizeof(void *);
1189 	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1190 	if (fd->nh_idx == NULL) {
1191 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
1192 		return (FLM_REBUILD);
1193 	}
1194 
1195 	/* Allocate nhop index refcount table */
1196 	size = sizeof(struct nhop_ref_table);
1197 	size += fd->number_nhops * sizeof(uint32_t);
1198 	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1199 	if (fd->nh_ref_table == NULL) {
1200 		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
1201 		return (FLM_REBUILD);
1202 	}
1203 	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
1204 
1205 	/* Okay, we're ready for algo init */
1206 	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
1207 	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
1208 	if (result != FLM_SUCCESS) {
1209 		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
1210 		return (result);
1211 	}
1212 
1213 	/* Try to subscribe */
1214 	if (flm->flm_change_rib_item_cb != NULL) {
1215 		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
1216 		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
1217 		if (fd->fd_rs == NULL) {
1218 			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
1219 			return (FLM_REBUILD);
1220 		}
1221 	}
1222 
1223 	/* Dump */
1224 	result = sync_algo(fd);
1225 	if (result != FLM_SUCCESS) {
1226 		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
1227 		return (result);
1228 	}
1229 	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
1230 
1231 	FIB_MOD_LOCK();
1232 	/*
1233 	 * Insert fd in the beginning of a list, to maintain invariant
1234 	 *  that first matching entry for the AF/fib is always the active
1235 	 *  one.
1236 	 */
1237 	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
1238 	fd->fd_linked = true;
1239 	FIB_MOD_UNLOCK();
1240 
1241 	return (FLM_SUCCESS);
1242 }
1243 
1244 /*
1245  * Sets up algo @flm for table @rh and links it to the datapath.
1246  *
1247  */
1248 static enum flm_op_result
1249 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1250     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
1251 {
1252 	struct fib_data *prev_fd, *new_fd;
1253 	enum flm_op_result result;
1254 
1255 	NET_EPOCH_ASSERT();
1256 	RIB_WLOCK_ASSERT(rh);
1257 
1258 	prev_fd = orig_fd;
1259 	new_fd = NULL;
1260 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
1261 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
1262 
1263 		if ((result == FLM_SUCCESS) && attach) {
1264 			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
1265 				sync_rib_gen(new_fd);
1266 			else
1267 				result = FLM_REBUILD;
1268 		}
1269 
1270 		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
1271 			schedule_destroy_fd_instance(prev_fd, false);
1272 			prev_fd = NULL;
1273 		}
1274 
1275 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
1276 		    print_op_result(result));
1277 
1278 		if (result == FLM_REBUILD) {
1279 			prev_fd = new_fd;
1280 			new_fd = NULL;
1281 			continue;
1282 		}
1283 
1284 		break;
1285 	}
1286 
1287 	if (result != FLM_SUCCESS) {
1288 		RH_PRINTF(LOG_WARNING, rh,
1289 		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1290 		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1291 		/* update failure count */
1292 		FIB_MOD_LOCK();
1293 		if (orig_fd != NULL)
1294 			orig_fd->fd_failed_rebuilds++;
1295 		FIB_MOD_UNLOCK();
1296 
1297 		/* Ban algo on non-recoverable error */
1298 		if (result == FLM_ERROR)
1299 			flm_error_add(flm, rh->rib_fibnum);
1300 
1301 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1302 			schedule_destroy_fd_instance(prev_fd, false);
1303 		if (new_fd != NULL) {
1304 			schedule_destroy_fd_instance(new_fd, false);
1305 			new_fd = NULL;
1306 		}
1307 	}
1308 
1309 	*pfd = new_fd;
1310 	return (result);
1311 }
1312 
1313 /*
1314  * Tries to sync algo with the current rtable state, either
1315  * by executing batch update or rebuilding.
1316  * Returns true on success.
1317  */
1318 static bool
1319 execute_callout_action(struct fib_data *fd)
1320 {
1321 	enum fib_callout_action action = fd->fd_callout_action;
1322 	struct fib_lookup_module *flm_new = NULL;
1323 	bool result = true;
1324 
1325 	NET_EPOCH_ASSERT();
1326 	RIB_WLOCK_ASSERT(fd->fd_rh);
1327 
1328 	fd->fd_need_rebuild = false;
1329 	fd->fd_batch = false;
1330 	fd->fd_num_changes = 0;
1331 
1332 	/* First, check if we're still OK to use this algo */
1333 	if (!is_algo_fixed(fd->fd_rh))
1334 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1335 	if (flm_new != NULL)
1336 		action = FDA_REBUILD;
1337 
1338 	if (action == FDA_BATCH) {
1339 		/* Try to sync */
1340 		if (!apply_rtable_changes(fd))
1341 			action = FDA_REBUILD;
1342 	}
1343 
1344 	if (action == FDA_REBUILD)
1345 		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1346 	if (flm_new != NULL)
1347 		fib_unref_algo(flm_new);
1348 
1349 	return (result);
1350 }
1351 
1352 /*
1353  * Callout for all scheduled fd-related work.
1354  * - Checks if the current algo is still the best algo
1355  * - Synchronises algo instance to the rtable (batch usecase)
1356  * - Creates a new instance of an algo for af/fib if desired.
1357  */
1358 static void
1359 handle_fd_callout(void *_data)
1360 {
1361 	struct fib_data *fd = (struct fib_data *)_data;
1362 	struct epoch_tracker et;
1363 
1364 	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
1365 
1366 	NET_EPOCH_ENTER(et);
1367 	CURVNET_SET(fd->fd_vnet);
1368 	execute_callout_action(fd);
1369 	CURVNET_RESTORE();
1370 	NET_EPOCH_EXIT(et);
1371 }
1372 
1373 /*
1374  * Tries to create new algo instance based on @fd data.
1375  * Returns true on success.
1376  */
1377 static bool
1378 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
1379 {
1380 	struct fib_data *fd_new, *fd_tmp = NULL;
1381 	bool result;
1382 
1383 	if (flm_new == fd->fd_flm)
1384 		fd_tmp = fd;
1385 	else
1386 		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
1387 
1388 	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1389 	if (result != FLM_SUCCESS) {
1390 		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1391 		return (false);
1392 	}
1393 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1394 
1395 	/* Remove old instance */
1396 	schedule_destroy_fd_instance(fd, true);
1397 
1398 	return (true);
1399 }
1400 
1401 static bool
1402 rebuild_fd(struct fib_data *fd, const char *reason)
1403 {
1404 	struct fib_lookup_module *flm_new = NULL;
1405 	bool result;
1406 
1407 	if (!is_algo_fixed(fd->fd_rh))
1408 		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1409 
1410 	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
1411 	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1412 	if (flm_new != NULL)
1413 		fib_unref_algo(flm_new);
1414 
1415 	if (!result) {
1416 		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
1417 		schedule_fd_rebuild(fd, "sync rebuild failed");
1418 	}
1419 
1420 	return (result);
1421 }
1422 
1423 /*
1424  * Finds algo by name/family.
1425  * Returns referenced algo or NULL.
1426  */
1427 static struct fib_lookup_module *
1428 fib_find_algo(const char *algo_name, int family)
1429 {
1430 	struct fib_lookup_module *flm;
1431 
1432 	FIB_MOD_LOCK();
1433 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1434 		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1435 		    (family == flm->flm_family)) {
1436 			flm->flm_refcount++;
1437 			FIB_MOD_UNLOCK();
1438 			return (flm);
1439 		}
1440 	}
1441 	FIB_MOD_UNLOCK();
1442 
1443 	return (NULL);
1444 }
1445 
1446 static void
1447 fib_unref_algo(struct fib_lookup_module *flm)
1448 {
1449 
1450 	FIB_MOD_LOCK();
1451 	flm->flm_refcount--;
1452 	FIB_MOD_UNLOCK();
1453 }
1454 
1455 static int
1456 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1457 {
1458 	struct fib_lookup_module *flm = NULL;
1459 	struct fib_data *fd = NULL;
1460 	char old_algo_name[32], algo_name[32];
1461 	struct rib_head *rh = NULL;
1462 	enum flm_op_result result;
1463 	struct epoch_tracker et;
1464 	int error;
1465 
1466 	/* Fetch current algo/rib for af/family */
1467 	FIB_MOD_LOCK();
1468 	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1469 		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1470 			break;
1471 	}
1472 	if (fd == NULL) {
1473 		FIB_MOD_UNLOCK();
1474 		return (ENOENT);
1475 	}
1476 	rh = fd->fd_rh;
1477 	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1478 	    sizeof(old_algo_name));
1479 	FIB_MOD_UNLOCK();
1480 
1481 	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1482 	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1483 	if (error != 0 || req->newptr == NULL)
1484 		return (error);
1485 
1486 	if (strcmp(algo_name, old_algo_name) == 0)
1487 		return (0);
1488 
1489 	/* New algorithm name is different */
1490 	flm = fib_find_algo(algo_name, family);
1491 	if (flm == NULL) {
1492 		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1493 		return (ESRCH);
1494 	}
1495 
1496 	fd = NULL;
1497 	NET_EPOCH_ENTER(et);
1498 	RIB_WLOCK(rh);
1499 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1500 	RIB_WUNLOCK(rh);
1501 	NET_EPOCH_EXIT(et);
1502 	fib_unref_algo(flm);
1503 	if (result != FLM_SUCCESS)
1504 		return (EINVAL);
1505 
1506 	/* Disable automated jumping between algos */
1507 	FIB_MOD_LOCK();
1508 	set_algo_fixed(rh);
1509 	FIB_MOD_UNLOCK();
1510 	/* Remove old instance(s) */
1511 	fib_cleanup_algo(rh, true, false);
1512 
1513 	/* Drain cb so user can unload the module after userret if so desired */
1514 	NET_EPOCH_DRAIN_CALLBACKS();
1515 
1516 	return (0);
1517 }
1518 
1519 #ifdef INET
1520 static int
1521 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1522 {
1523 
1524 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
1525 }
1526 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1527     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1528     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1529 #endif
1530 
1531 #ifdef INET6
1532 static int
1533 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1534 {
1535 
1536 	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
1537 }
1538 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1539     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1540     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1541 #endif
1542 
1543 static struct nhop_object *
1544 dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
1545 {
1546 	return (NULL);
1547 }
1548 
1549 static void
1550 destroy_fdh_epoch(epoch_context_t ctx)
1551 {
1552 	struct fib_dp_header *fdh;
1553 
1554 	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1555 	free(fdh, M_RTABLE);
1556 }
1557 
1558 static struct fib_dp_header *
1559 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1560 {
1561 	size_t sz;
1562 	struct fib_dp_header *fdh;
1563 
1564 	sz = sizeof(struct fib_dp_header);
1565 	sz += sizeof(struct fib_dp) * num_tables;
1566 	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1567 	if (fdh != NULL) {
1568 		fdh->fdh_num_tables = num_tables;
1569 		/*
1570 		 * Set dummy lookup function ptr always returning NULL, so
1571 		 * we can delay algo init.
1572 		 */
1573 		for (uint32_t i = 0; i < num_tables; i++)
1574 			fdh->fdh_idx[i].f = dummy_lookup;
1575 	}
1576 	return (fdh);
1577 }
1578 
1579 static struct fib_dp_header *
1580 get_fib_dp_header(struct fib_dp *dp)
1581 {
1582 
1583 	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1584 }
1585 
1586 /*
1587  * Replace per-family index pool @pdp with a new one which
1588  * contains updated callback/algo data from @fd.
1589  * Returns true on success.
1590  */
1591 static bool
1592 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
1593 {
1594 	struct fib_dp_header *new_fdh, *old_fdh;
1595 
1596 	NET_EPOCH_ASSERT();
1597 
1598 	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1599 	    curvnet, dp->f, dp->arg);
1600 
1601 	FIB_MOD_LOCK();
1602 	old_fdh = get_fib_dp_header(*pdp);
1603 
1604 	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
1605 		/*
1606 		 * Function is the same, data pointer needs update.
1607 		 * Perform in-line replace without reallocation.
1608 		 */
1609 		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
1610 		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
1611 		FIB_MOD_UNLOCK();
1612 		return (true);
1613 	}
1614 
1615 	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1616 	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1617 	if (new_fdh == NULL) {
1618 		FIB_MOD_UNLOCK();
1619 		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1620 		return (false);
1621 	}
1622 
1623 	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1624 	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1625 	/* Update relevant data structure for @fd */
1626 	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
1627 
1628 	/* Ensure memcpy() writes have completed */
1629 	atomic_thread_fence_rel();
1630 	/* Set new datapath pointer */
1631 	*pdp = &new_fdh->fdh_idx[0];
1632 	FIB_MOD_UNLOCK();
1633 	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1634 
1635 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1636 
1637 	return (true);
1638 }
1639 
1640 static struct fib_dp **
1641 get_family_dp_ptr(int family)
1642 {
1643 	switch (family) {
1644 #ifdef INET
1645 	case AF_INET:
1646 		return (&V_inet_dp);
1647 #endif
1648 #ifdef INET6
1649 	case AF_INET6:
1650 		return (&V_inet6_dp);
1651 #endif
1652 	}
1653 	return (NULL);
1654 }
1655 
1656 /*
1657  * Make datapath use fib instance @fd
1658  */
1659 bool
1660 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
1661 {
1662 	struct fib_dp **pdp;
1663 
1664 	pdp = get_family_dp_ptr(fd->fd_family);
1665 	return (replace_rtables_family(pdp, fd, dp));
1666 }
1667 
1668 /*
1669  * Grow datapath pointers array.
1670  * Called from sysctl handler on growing number of routing tables.
1671  */
1672 static void
1673 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1674 {
1675 	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1676 
1677 	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1678 
1679 	FIB_MOD_LOCK();
1680 	if (*pdp != NULL) {
1681 		old_fdh = get_fib_dp_header(*pdp);
1682 		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1683 		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1684 	}
1685 
1686 	/* Wait till all writes completed */
1687 	atomic_thread_fence_rel();
1688 
1689 	*pdp = &new_fdh->fdh_idx[0];
1690 	FIB_MOD_UNLOCK();
1691 
1692 	if (old_fdh != NULL)
1693 		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1694 }
1695 
1696 /*
1697  * Grows per-AF arrays of datapath pointers for each supported family.
1698  * Called from fibs resize sysctl handler.
1699  */
1700 void
1701 fib_grow_rtables(uint32_t new_num_tables)
1702 {
1703 
1704 #ifdef INET
1705 	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1706 #endif
1707 #ifdef INET6
1708 	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1709 #endif
1710 }
1711 
1712 void
1713 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1714 {
1715 
1716 	bzero(rinfo, sizeof(struct rib_rtable_info));
1717 	rinfo->num_prefixes = rh->rnh_prefixes;
1718 	rinfo->num_nhops = nhops_get_count(rh);
1719 #ifdef ROUTE_MPATH
1720 	rinfo->num_nhgrp = nhgrp_get_count(rh);
1721 #endif
1722 }
1723 
1724 /*
1725  * Updates pointer to the algo data for the @fd.
1726  */
1727 void
1728 fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
1729 {
1730 	RIB_WLOCK_ASSERT(fd->fd_rh);
1731 
1732 	fd->fd_algo_data = algo_data;
1733 }
1734 
1735 /*
1736  * Calls @callback with @ctx after the end of a current epoch.
1737  */
1738 void
1739 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
1740 {
1741 	NET_EPOCH_CALL(callback, ctx);
1742 }
1743 
1744 /*
1745  * Accessor to get rib instance @fd is attached to.
1746  */
1747 struct rib_head *
1748 fib_get_rh(struct fib_data *fd)
1749 {
1750 
1751 	return (fd->fd_rh);
1752 }
1753 
1754 /*
1755  * Accessor to export idx->nhop array
1756  */
1757 struct nhop_object **
1758 fib_get_nhop_array(struct fib_data *fd)
1759 {
1760 
1761 	return (fd->nh_idx);
1762 }
1763 
1764 static uint32_t
1765 get_nhop_idx(struct nhop_object *nh)
1766 {
1767 #ifdef ROUTE_MPATH
1768 	if (NH_IS_NHGRP(nh))
1769 		return (nhgrp_get_idx((struct nhgrp_object *)nh));
1770 	else
1771 #endif
1772 		return (nhop_get_idx(nh));
1773 }
1774 
1775 uint32_t
1776 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1777 {
1778 
1779 	return (get_nhop_idx(nh));
1780 }
1781 
1782 static bool
1783 is_idx_free(struct fib_data *fd, uint32_t index)
1784 {
1785 
1786 	return (fd->nh_ref_table->refcnt[index] == 0);
1787 }
1788 
1789 static uint32_t
1790 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1791 {
1792 	uint32_t idx = get_nhop_idx(nh);
1793 
1794 	if (idx >= fd->number_nhops) {
1795 		fd->hit_nhops = 1;
1796 		return (0);
1797 	}
1798 
1799 	if (is_idx_free(fd, idx)) {
1800 		nhop_ref_any(nh);
1801 		fd->nh_idx[idx] = nh;
1802 		fd->nh_ref_table->count++;
1803 		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1804 	}
1805 	fd->nh_ref_table->refcnt[idx]++;
1806 
1807 	return (idx);
1808 }
1809 
1810 struct nhop_release_data {
1811 	struct nhop_object	*nh;
1812 	struct epoch_context	ctx;
1813 };
1814 
1815 static void
1816 release_nhop_epoch(epoch_context_t ctx)
1817 {
1818 	struct nhop_release_data *nrd;
1819 
1820 	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1821 	nhop_free_any(nrd->nh);
1822 	free(nrd, M_TEMP);
1823 }
1824 
1825 /*
1826  * Delays nexthop refcount release.
1827  * Datapath may have the datastructures not updated yet, so the old
1828  *  nexthop may still be returned till the end of current epoch. Delay
1829  *  refcount removal, as we may be removing the last instance, which will
1830  *  trigger nexthop deletion, rendering returned nexthop invalid.
1831  */
1832 static void
1833 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1834 {
1835 	struct nhop_release_data *nrd;
1836 
1837 	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1838 	if (nrd != NULL) {
1839 		nrd->nh = nh;
1840 		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
1841 	} else {
1842 		/*
1843 		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1844 		 *  that each nhop can be referenced.
1845 		 */
1846 		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1847 	}
1848 }
1849 
1850 static void
1851 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1852 {
1853 	uint32_t idx = get_nhop_idx(nh);
1854 
1855 	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
1856 	KASSERT((nh == fd->nh_idx[idx]), ("index table contains whong nh"));
1857 
1858 	fd->nh_ref_table->refcnt[idx]--;
1859 	if (fd->nh_ref_table->refcnt[idx] == 0) {
1860 		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1861 		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1862 	}
1863 }
1864 
1865 static void
1866 set_algo_fixed(struct rib_head *rh)
1867 {
1868 	switch (rh->rib_family) {
1869 #ifdef INET
1870 	case AF_INET:
1871 		V_algo_fixed_inet = true;
1872 		break;
1873 #endif
1874 #ifdef INET6
1875 	case AF_INET6:
1876 		V_algo_fixed_inet6 = true;
1877 		break;
1878 #endif
1879 	}
1880 }
1881 
1882 static bool
1883 is_algo_fixed(struct rib_head *rh)
1884 {
1885 
1886 	switch (rh->rib_family) {
1887 #ifdef INET
1888 	case AF_INET:
1889 		return (V_algo_fixed_inet);
1890 #endif
1891 #ifdef INET6
1892 	case AF_INET6:
1893 		return (V_algo_fixed_inet6);
1894 #endif
1895 	}
1896 	return (false);
1897 }
1898 
1899 /*
1900  * Runs the check on what would be the best algo for rib @rh, assuming
1901  *  that the current algo is the one specified by @orig_flm. Note that
1902  *  it can be NULL for initial selection.
1903  *
1904  * Returns referenced new algo or NULL if the current one is the best.
1905  */
1906 static struct fib_lookup_module *
1907 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1908 {
1909 	uint8_t preference, curr_preference = 0, best_preference = 0;
1910 	struct fib_lookup_module *flm, *best_flm = NULL;
1911 	struct rib_rtable_info rinfo;
1912 	int candidate_algos = 0;
1913 
1914 	fib_get_rtable_info(rh, &rinfo);
1915 
1916 	FIB_MOD_LOCK();
1917 	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1918 		if (flm->flm_family != rh->rib_family)
1919 			continue;
1920 		candidate_algos++;
1921 		preference = flm->flm_get_pref(&rinfo);
1922 		if (preference > best_preference) {
1923 			if (!flm_error_check(flm, rh->rib_fibnum)) {
1924 				best_preference = preference;
1925 				best_flm = flm;
1926 			}
1927 		}
1928 		if (flm == orig_flm)
1929 			curr_preference = preference;
1930 	}
1931 	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1932 		best_flm->flm_refcount++;
1933 	else
1934 		best_flm = NULL;
1935 	FIB_MOD_UNLOCK();
1936 
1937 	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1938 	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1939 	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1940 	    best_preference);
1941 
1942 	return (best_flm);
1943 }
1944 
1945 /*
1946  * Called when new route table is created.
1947  * Selects, allocates and attaches fib algo for the table.
1948  */
1949 static bool
1950 fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
1951 {
1952 	struct fib_lookup_module *flm;
1953 	struct fib_data *fd = NULL;
1954 	enum flm_op_result result;
1955 	struct epoch_tracker et;
1956 
1957 	flm = fib_check_best_algo(rh, NULL);
1958 	if (flm == NULL) {
1959 		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1960 		return (false);
1961 	}
1962 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1963 
1964 	NET_EPOCH_ENTER(et);
1965 	RIB_WLOCK(rh);
1966 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1967 	RIB_WUNLOCK(rh);
1968 	NET_EPOCH_EXIT(et);
1969 
1970 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1971 	if (result == FLM_SUCCESS)
1972 		*dp = fd->fd_dp;
1973 	else
1974 		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1975 
1976 	fib_unref_algo(flm);
1977 
1978 	return (result == FLM_SUCCESS);
1979 }
1980 
1981 /*
1982  * Sets up fib algo instances for the non-initialized RIBs in the @family.
1983  * Allocates temporary datapath index to amortize datapaint index updates
1984  * with large @num_tables.
1985  */
1986 void
1987 fib_setup_family(int family, uint32_t num_tables)
1988 {
1989 	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
1990 	if (new_fdh == NULL) {
1991 		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
1992 		return;
1993 	}
1994 
1995 	for (int i = 0; i < num_tables; i++) {
1996 		struct rib_head *rh = rt_tables_get_rnh(i, family);
1997 		if (rh->rib_algo_init)
1998 			continue;
1999 		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
2000 			continue;
2001 
2002 		rh->rib_algo_init = true;
2003 	}
2004 
2005 	FIB_MOD_LOCK();
2006 	struct fib_dp **pdp = get_family_dp_ptr(family);
2007 	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
2008 
2009 	/* Update the items not touched by the new init, from the old data pointer */
2010 	for (int i = 0; i < num_tables; i++) {
2011 		if (new_fdh->fdh_idx[i].f == dummy_lookup)
2012 			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
2013 	}
2014 
2015 	/* Ensure all index writes have completed */
2016 	atomic_thread_fence_rel();
2017 	/* Set new datapath pointer */
2018 	*pdp = &new_fdh->fdh_idx[0];
2019 
2020 	FIB_MOD_UNLOCK();
2021 
2022 	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
2023 }
2024 
2025 /*
2026  * Registers fib lookup module within the subsystem.
2027  */
2028 int
2029 fib_module_register(struct fib_lookup_module *flm)
2030 {
2031 
2032 	FIB_MOD_LOCK();
2033 	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
2034 	    print_family(flm->flm_family));
2035 	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
2036 	FIB_MOD_UNLOCK();
2037 
2038 	return (0);
2039 }
2040 
2041 /*
2042  * Tries to unregister fib lookup module.
2043  *
2044  * Returns 0 on success, EBUSY if module is still used
2045  *  by some of the tables.
2046  */
2047 int
2048 fib_module_unregister(struct fib_lookup_module *flm)
2049 {
2050 
2051 	FIB_MOD_LOCK();
2052 	if (flm->flm_refcount > 0) {
2053 		FIB_MOD_UNLOCK();
2054 		return (EBUSY);
2055 	}
2056 	fib_error_clear_flm(flm);
2057 	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
2058 	    print_family(flm->flm_family));
2059 	TAILQ_REMOVE(&all_algo_list, flm, entries);
2060 	FIB_MOD_UNLOCK();
2061 
2062 	return (0);
2063 }
2064 
2065 void
2066 vnet_fib_init(void)
2067 {
2068 
2069 	TAILQ_INIT(&V_fib_data_list);
2070 }
2071 
2072 void
2073 vnet_fib_destroy(void)
2074 {
2075 
2076 	FIB_MOD_LOCK();
2077 	fib_error_clear();
2078 	FIB_MOD_UNLOCK();
2079 }
2080