xref: /openbsd/sys/net/if_pfsync.c (revision 4048fd87)
1 /*	$OpenBSD: if_pfsync.c,v 1.322 2023/10/03 10:22:10 sthen Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 2009, 2022, 2023 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44 
45 #include "bpfilter.h"
46 #include "pfsync.h"
47 #include "kstat.h"
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/time.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/socket.h>
55 #include <sys/ioctl.h>
56 #include <sys/timeout.h>
57 #include <sys/kernel.h>
58 #include <sys/sysctl.h>
59 #include <sys/pool.h>
60 #include <sys/syslog.h>
61 #include <sys/tree.h>
62 #include <sys/smr.h>
63 #include <sys/percpu.h>
64 #include <sys/refcnt.h>
65 #include <sys/kstat.h>
66 #include <sys/stdarg.h>
67 
68 #include <net/if.h>
69 #include <net/if_types.h>
70 #include <net/bpf.h>
71 #include <net/netisr.h>
72 
73 #include <netinet/in.h>
74 #include <netinet/if_ether.h>
75 #include <netinet/ip.h>
76 #include <netinet/in_var.h>
77 #include <netinet/ip_var.h>
78 #include <netinet/ip_ipsp.h>
79 #include <netinet/ip_icmp.h>
80 #include <netinet/icmp6.h>
81 #include <netinet/tcp.h>
82 #include <netinet/tcp_seq.h>
83 #include <netinet/tcp_fsm.h>
84 #include <netinet/udp.h>
85 
86 #ifdef INET6
87 #include <netinet6/in6_var.h>
88 #include <netinet/ip6.h>
89 #include <netinet6/ip6_var.h>
90 #include <netinet6/nd6.h>
91 #endif /* INET6 */
92 
93 #include "carp.h"
94 #if NCARP > 0
95 #include <netinet/ip_carp.h>
96 #endif
97 
98 #include <net/pfvar.h>
99 #include <net/pfvar_priv.h>
100 #include <net/if_pfsync.h>
101 
102 #define PFSYNC_MINPKT ( \
103 	sizeof(struct ip) + \
104 	sizeof(struct pfsync_header))
105 
106 struct pfsync_softc;
107 
108 struct pfsync_deferral {
109 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
110 	struct pf_state				*pd_st;
111 	struct mbuf				*pd_m;
112 	uint64_t				 pd_deadline;
113 };
114 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
115 
116 #define PFSYNC_DEFER_NSEC	20000000ULL
117 #define PFSYNC_DEFER_LIMIT	128
118 #define PFSYNC_BULK_SND_IVAL_MS	20
119 
120 static struct pool pfsync_deferrals_pool;
121 
122 enum pfsync_bulk_req_state {
123 	PFSYNC_BREQ_S_NONE,
124 	PFSYNC_BREQ_S_START,
125 	PFSYNC_BREQ_S_SENT,
126 	PFSYNC_BREQ_S_BULK,
127 	PFSYNC_BREQ_S_DONE,
128 };
129 
130 static const char *pfsync_bulk_req_state_names[] = {
131 	[PFSYNC_BREQ_S_NONE]		= "none",
132 	[PFSYNC_BREQ_S_START]		= "start",
133 	[PFSYNC_BREQ_S_SENT]		= "sent",
134 	[PFSYNC_BREQ_S_BULK]		= "bulk",
135 	[PFSYNC_BREQ_S_DONE]		= "done",
136 };
137 
138 enum pfsync_bulk_req_event {
139 	PFSYNC_BREQ_EVT_UP,
140 	PFSYNC_BREQ_EVT_DOWN,
141 	PFSYNC_BREQ_EVT_TMO,
142 	PFSYNC_BREQ_EVT_LINK,
143 	PFSYNC_BREQ_EVT_BUS_START,
144 	PFSYNC_BREQ_EVT_BUS_END,
145 };
146 
147 static const char *pfsync_bulk_req_event_names[] = {
148 	[PFSYNC_BREQ_EVT_UP]		= "up",
149 	[PFSYNC_BREQ_EVT_DOWN]		= "down",
150 	[PFSYNC_BREQ_EVT_TMO]		= "timeout",
151 	[PFSYNC_BREQ_EVT_LINK]		= "link",
152 	[PFSYNC_BREQ_EVT_BUS_START]	= "bus-start",
153 	[PFSYNC_BREQ_EVT_BUS_END]	= "bus-end",
154 };
155 
156 struct pfsync_slice {
157 	struct pfsync_softc	*s_pfsync;
158 	struct mutex		 s_mtx;
159 
160 	struct pf_state_queue	 s_qs[PFSYNC_S_COUNT];
161 	TAILQ_HEAD(, tdb)	 s_tdb_q;
162 	size_t			 s_len;
163 	struct mbuf_list	 s_ml;
164 
165 	struct taskq		*s_softnet;
166 	struct task		 s_task;
167 	struct timeout		 s_tmo;
168 
169 	struct mbuf_queue	 s_sendq;
170 	struct task		 s_send;
171 
172 	struct pfsync_deferrals	 s_deferrals;
173 	unsigned int		 s_deferred;
174 	struct task		 s_deferrals_task;
175 	struct timeout		 s_deferrals_tmo;
176 
177 	uint64_t		 s_stat_locks;
178 	uint64_t		 s_stat_contended;
179 	uint64_t		 s_stat_write_nop;
180 	uint64_t		 s_stat_task_add;
181 	uint64_t		 s_stat_task_run;
182 	uint64_t		 s_stat_enqueue;
183 	uint64_t		 s_stat_dequeue;
184 
185 	uint64_t		 s_stat_defer_add;
186 	uint64_t		 s_stat_defer_ack;
187 	uint64_t		 s_stat_defer_run;
188 	uint64_t		 s_stat_defer_overlimit;
189 
190 	struct kstat		*s_kstat;
191 } __aligned(CACHELINESIZE);
192 
193 #define PFSYNC_SLICE_BITS	 1
194 #define PFSYNC_NSLICES		 (1 << PFSYNC_SLICE_BITS)
195 
196 struct pfsync_softc {
197 	struct ifnet		 sc_if;
198 	unsigned int		 sc_dead;
199 	unsigned int		 sc_up;
200 	struct refcnt		 sc_refs;
201 
202 	/* config */
203 	struct in_addr		 sc_syncpeer;
204 	unsigned int		 sc_maxupdates;
205 	unsigned int		 sc_defer;
206 
207 	/* operation */
208 	unsigned int		 sc_sync_ifidx;
209 	unsigned int		 sc_sync_if_down;
210 	void			*sc_inm;
211 	struct task		 sc_ltask;
212 	struct task		 sc_dtask;
213 	struct ip		 sc_template;
214 
215 	struct pfsync_slice	 sc_slices[PFSYNC_NSLICES];
216 
217 	struct {
218 		struct rwlock			 req_lock;
219 		struct timeout			 req_tmo;
220 		enum pfsync_bulk_req_state	 req_state;
221 		unsigned int			 req_tries;
222 		unsigned int			 req_demoted;
223 	}			 sc_bulk_req;
224 
225 	struct {
226 		struct rwlock			 snd_lock;
227 		struct timeout			 snd_tmo;
228 		time_t				 snd_requested;
229 
230 		struct pf_state			*snd_next;
231 		struct pf_state			*snd_tail;
232 		unsigned int			 snd_again;
233 	}			 sc_bulk_snd;
234 };
235 
236 static struct pfsync_softc	*pfsyncif = NULL;
237 static struct cpumem		*pfsynccounters;
238 
239 static inline void
240 pfsyncstat_inc(enum pfsync_counters c)
241 {
242 	counters_inc(pfsynccounters, c);
243 }
244 
245 static int	pfsync_clone_create(struct if_clone *, int);
246 static int	pfsync_clone_destroy(struct ifnet *);
247 
248 static int	pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *,
249 		    struct rtentry *);
250 static void	pfsync_start(struct ifqueue *);
251 
252 static int	pfsync_ioctl(struct ifnet *, u_long, caddr_t);
253 static int	pfsync_up(struct pfsync_softc *);
254 static int	pfsync_down(struct pfsync_softc *);
255 
256 static int	pfsync_set_mtu(struct pfsync_softc *, unsigned int);
257 static int	pfsync_set_parent(struct pfsync_softc *,
258 		    const struct if_parent *);
259 static int	pfsync_get_parent(struct pfsync_softc *, struct if_parent *);
260 static int	pfsync_del_parent(struct pfsync_softc *);
261 
262 static int	pfsync_get_ioc(struct pfsync_softc *, struct ifreq *);
263 static int	pfsync_set_ioc(struct pfsync_softc *, struct ifreq *);
264 
265 static void	pfsync_syncif_link(void *);
266 static void	pfsync_syncif_detach(void *);
267 
268 static void	pfsync_sendout(struct pfsync_softc *, struct mbuf *);
269 static void	pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *);
270 
271 static void	pfsync_slice_tmo(void *);
272 static void	pfsync_slice_task(void *);
273 static void	pfsync_slice_sendq(void *);
274 
275 static void	pfsync_deferrals_tmo(void *);
276 static void	pfsync_deferrals_task(void *);
277 static void	pfsync_defer_output(struct pfsync_deferral *);
278 
279 static void	pfsync_bulk_req_evt(struct pfsync_softc *,
280 		    enum pfsync_bulk_req_event);
281 static void	pfsync_bulk_req_tmo(void *);
282 
283 static void	pfsync_bulk_snd_tmo(void *);
284 
285 #if NKSTAT > 0
286 struct pfsync_kstat_data {
287 	struct kstat_kv pd_locks;
288 	struct kstat_kv pd_contended;
289 	struct kstat_kv pd_write_nop;
290 	struct kstat_kv pd_task_add;
291 	struct kstat_kv pd_task_run;
292 	struct kstat_kv pd_enqueue;
293 	struct kstat_kv pd_dequeue;
294 	struct kstat_kv pd_qdrop;
295 
296 	struct kstat_kv pd_defer_len;
297 	struct kstat_kv pd_defer_add;
298 	struct kstat_kv pd_defer_ack;
299 	struct kstat_kv pd_defer_run;
300 	struct kstat_kv pd_defer_overlimit;
301 };
302 
303 static const struct pfsync_kstat_data pfsync_kstat_tpl = {
304 	KSTAT_KV_INITIALIZER("locks",		KSTAT_KV_T_COUNTER64),
305 	KSTAT_KV_INITIALIZER("contended",	KSTAT_KV_T_COUNTER64),
306 	KSTAT_KV_INITIALIZER("write-nops",	KSTAT_KV_T_COUNTER64),
307 	KSTAT_KV_INITIALIZER("send-sched",	KSTAT_KV_T_COUNTER64),
308 	KSTAT_KV_INITIALIZER("send-run",	KSTAT_KV_T_COUNTER64),
309 	KSTAT_KV_INITIALIZER("enqueues",	KSTAT_KV_T_COUNTER64),
310 	KSTAT_KV_INITIALIZER("dequeues",	KSTAT_KV_T_COUNTER64),
311 	KSTAT_KV_UNIT_INITIALIZER("qdrops",
312 	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
313 
314 	KSTAT_KV_UNIT_INITIALIZER("defer-len",
315 	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
316 	KSTAT_KV_INITIALIZER("defer-add",	KSTAT_KV_T_COUNTER64),
317 	KSTAT_KV_INITIALIZER("defer-ack",	KSTAT_KV_T_COUNTER64),
318 	KSTAT_KV_INITIALIZER("defer-run",	KSTAT_KV_T_COUNTER64),
319 	KSTAT_KV_INITIALIZER("defer-over",	KSTAT_KV_T_COUNTER64),
320 };
321 
322 static int
323 pfsync_kstat_copy(struct kstat *ks, void *dst)
324 {
325 	struct pfsync_slice *s = ks->ks_softc;
326 	struct pfsync_kstat_data *pd = dst;
327 
328 	*pd = pfsync_kstat_tpl;
329 	kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks;
330 	kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended;
331 	kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop;
332 	kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add;
333 	kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run;
334 	kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue;
335 	kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue;
336 	kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq);
337 
338 	kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred;
339 	kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add;
340 	kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack;
341 	kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run;
342 	kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit;
343 
344 	return (0);
345 }
346 #endif /* NKSTAT > 0 */
347 
348 #define PFSYNC_MAX_BULKTRIES	12
349 
350 struct if_clone	pfsync_cloner =
351     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
352 
353 void
354 pfsyncattach(int npfsync)
355 {
356 	pfsynccounters = counters_alloc(pfsyncs_ncounters);
357 	if_clone_attach(&pfsync_cloner);
358 }
359 
360 static int
361 pfsync_clone_create(struct if_clone *ifc, int unit)
362 {
363 	struct pfsync_softc *sc;
364 	struct ifnet *ifp;
365 	size_t i, q;
366 
367 	if (unit != 0)
368 		return (ENXIO);
369 
370 	if (pfsync_deferrals_pool.pr_size == 0) {
371 		pool_init(&pfsync_deferrals_pool,
372 		    sizeof(struct pfsync_deferral), 0,
373 		    IPL_MPFLOOR, 0, "pfdefer", NULL);
374 		/* pool_cache_init(&pfsync_deferrals_pool); */
375 	}
376 
377 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
378 	if (sc == NULL)
379 		return (ENOMEM);
380 
381 	/* sc_refs is "owned" by IFF_RUNNING */
382 
383 	sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
384 	sc->sc_maxupdates = 128;
385 	sc->sc_defer = 0;
386 
387 	task_set(&sc->sc_ltask, pfsync_syncif_link, sc);
388 	task_set(&sc->sc_dtask, pfsync_syncif_detach, sc);
389 
390 	rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq");
391 	/* need process context to take net lock to call ip_output */
392 	timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc);
393 
394 	rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd");
395 	/* need process context to take net lock to call ip_output */
396 	timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc);
397 
398 	ifp = &sc->sc_if;
399 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d",
400 	    ifc->ifc_name, unit);
401 	ifp->if_softc = sc;
402 	ifp->if_ioctl = pfsync_ioctl;
403 	ifp->if_output = pfsync_output;
404 	ifp->if_qstart = pfsync_start;
405 	ifp->if_type = IFT_PFSYNC;
406 	ifp->if_hdrlen = sizeof(struct pfsync_header);
407 	ifp->if_mtu = ETHERMTU;
408 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
409 
410 	for (i = 0; i < nitems(sc->sc_slices); i++) {
411 		struct pfsync_slice *s = &sc->sc_slices[i];
412 
413 		s->s_pfsync = sc;
414 
415 		mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0);
416 		s->s_softnet = net_tq(i);
417 		timeout_set(&s->s_tmo, pfsync_slice_tmo, s);
418 		task_set(&s->s_task, pfsync_slice_task, s);
419 
420 		mq_init(&s->s_sendq, 16, IPL_SOFTNET);
421 		task_set(&s->s_send, pfsync_slice_sendq, s);
422 
423 		s->s_len = PFSYNC_MINPKT;
424 		ml_init(&s->s_ml);
425 
426 		for (q = 0; q < nitems(s->s_qs); q++)
427 			TAILQ_INIT(&s->s_qs[q]);
428 		TAILQ_INIT(&s->s_tdb_q);
429 
430 		/* stupid NET_LOCK */
431 		timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s);
432 		task_set(&s->s_deferrals_task, pfsync_deferrals_task, s);
433 		TAILQ_INIT(&s->s_deferrals);
434 
435 #if NKSTAT > 0
436 		s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i,
437 		    KSTAT_T_KV, 0);
438 
439 		kstat_set_mutex(s->s_kstat, &s->s_mtx);
440 		s->s_kstat->ks_softc = s;
441 		s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl);
442 		s->s_kstat->ks_copy = pfsync_kstat_copy;
443 		kstat_install(s->s_kstat);
444 #endif
445 	}
446 
447 	if_counters_alloc(ifp);
448 	if_attach(ifp);
449 	if_alloc_sadl(ifp);
450 
451 #if NCARP > 0
452 	if_addgroup(ifp, "carp");
453 #endif
454 
455 #if NBPFILTER > 0
456 	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
457 #endif
458 
459 	return (0);
460 }
461 
462 static int
463 pfsync_clone_destroy(struct ifnet *ifp)
464 {
465 	struct pfsync_softc *sc = ifp->if_softc;
466 #if NKSTAT > 0
467 	size_t i;
468 #endif
469 
470 	NET_LOCK();
471 	sc->sc_dead = 1;
472 
473 	if (ISSET(ifp->if_flags, IFF_RUNNING))
474 		pfsync_down(sc);
475 	NET_UNLOCK();
476 
477 	if_detach(ifp);
478 
479 #if NKSTAT > 0
480 	for (i = 0; i < nitems(sc->sc_slices); i++) {
481 		struct pfsync_slice *s = &sc->sc_slices[i];
482 
483 		kstat_destroy(s->s_kstat);
484 	}
485 #endif
486 
487 	free(sc, M_DEVBUF, sizeof(*sc));
488 
489 	return (0);
490 }
491 
492 static void
493 pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...)
494 {
495 	struct ifnet *ifp = &sc->sc_if;
496 	va_list ap;
497 
498 	if (!ISSET(ifp->if_flags, IFF_DEBUG))
499 		return;
500 
501 	printf("%s: ", ifp->if_xname);
502 	va_start(ap, fmt);
503 	vprintf(fmt, ap);
504 	va_end(ap);
505 	printf("\n");
506 }
507 
508 static void
509 pfsync_syncif_link(void *arg)
510 {
511 	struct pfsync_softc *sc = arg;
512 	struct ifnet *ifp0;
513 	unsigned int sync_if_down = 1;
514 
515 	ifp0 = if_get(sc->sc_sync_ifidx);
516 	if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) {
517 		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK);
518 		sync_if_down = 0;
519 	}
520 	if_put(ifp0);
521 
522 #if NCARP > 0
523 	if (sc->sc_sync_if_down != sync_if_down) {
524 		carp_group_demote_adj(&sc->sc_if,
525 		    sync_if_down ? 1 : -1, "pfsync link");
526 	}
527 #endif
528 
529 	sc->sc_sync_if_down = sync_if_down;
530 }
531 
532 static void
533 pfsync_syncif_detach(void *arg)
534 {
535 	struct pfsync_softc *sc = arg;
536 	struct ifnet *ifp = &sc->sc_if;
537 
538 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
539 		pfsync_down(sc);
540 		if_down(ifp);
541 	}
542 
543 	sc->sc_sync_ifidx = 0;
544 }
545 
546 static int
547 pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
548     struct rtentry *rt)
549 {
550 	m_freem(m);	/* drop packet */
551 	return (EAFNOSUPPORT);
552 }
553 
554 static int
555 pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
556 {
557 	struct pfsync_softc *sc = ifp->if_softc;
558 	struct ifreq *ifr = (struct ifreq *)data;
559 	int error = ENOTTY;
560 
561 	switch (cmd) {
562 	case SIOCSIFADDR:
563 		error = EOPNOTSUPP;
564 		break;
565 
566 	case SIOCSIFFLAGS:
567 		if (ISSET(ifp->if_flags, IFF_UP)) {
568 			if (!ISSET(ifp->if_flags, IFF_RUNNING))
569 				error = pfsync_up(sc);
570 			else
571 				error = ENETRESET;
572 		} else {
573 			if (ISSET(ifp->if_flags, IFF_RUNNING))
574 				error = pfsync_down(sc);
575 		}
576 		break;
577 
578 	case SIOCSIFMTU:
579 		error = pfsync_set_mtu(sc, ifr->ifr_mtu);
580 		break;
581 
582 	case SIOCSIFPARENT:
583 		error = pfsync_set_parent(sc, (struct if_parent *)data);
584 		break;
585 	case SIOCGIFPARENT:
586 		error = pfsync_get_parent(sc, (struct if_parent *)data);
587 		break;
588 	case SIOCDIFPARENT:
589 		error = pfsync_del_parent(sc);
590 		break;
591 
592 	case SIOCSETPFSYNC:
593 		error = pfsync_set_ioc(sc, ifr);
594 		break;
595 	case SIOCGETPFSYNC:
596 		error = pfsync_get_ioc(sc, ifr);
597 		break;
598 
599 	default:
600 		break;
601 	}
602 
603 	if (error == ENETRESET)
604 		error = 0;
605 
606 	return (error);
607 }
608 
609 static int
610 pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu)
611 {
612 	struct ifnet *ifp = &sc->sc_if;
613 	struct ifnet *ifp0;
614 	int error = 0;
615 
616 	ifp0 = if_get(sc->sc_sync_ifidx);
617 	if (ifp0 == NULL)
618 		return (EINVAL);
619 
620 	if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) {
621 		error = EINVAL;
622 		goto put;
623 	}
624 
625 	/* commit */
626 	ifp->if_mtu = mtu;
627 
628 put:
629 	if_put(ifp0);
630 	return (error);
631 }
632 
633 static int
634 pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p)
635 {
636 	struct ifnet *ifp = &sc->sc_if;
637 	struct ifnet *ifp0;
638 	int error = 0;
639 
640 	ifp0 = if_unit(p->ifp_parent);
641 	if (ifp0 == NULL)
642 		return (ENXIO);
643 
644 	if (ifp0->if_index == sc->sc_sync_ifidx)
645 		goto put;
646 
647 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
648 		error = EBUSY;
649 		goto put;
650 	}
651 
652 	/* commit */
653 	sc->sc_sync_ifidx = ifp0->if_index;
654 
655 put:
656 	if_put(ifp0);
657 	return (error);
658 }
659 
660 static int
661 pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p)
662 {
663 	struct ifnet *ifp0;
664 	int error = 0;
665 
666 	ifp0 = if_get(sc->sc_sync_ifidx);
667 	if (ifp0 == NULL)
668 		error = EADDRNOTAVAIL;
669 	else
670 		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
671 	if_put(ifp0);
672 
673 	return (error);
674 }
675 
676 static int
677 pfsync_del_parent(struct pfsync_softc *sc)
678 {
679 	struct ifnet *ifp = &sc->sc_if;
680 
681 	if (ISSET(ifp->if_flags, IFF_RUNNING))
682 		return (EBUSY);
683 
684 	/* commit */
685 	sc->sc_sync_ifidx = 0;
686 
687 	return (0);
688 }
689 
690 static int
691 pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
692 {
693 	struct pfsyncreq pfsyncr;
694 	struct ifnet *ifp0;
695 
696 	memset(&pfsyncr, 0, sizeof(pfsyncr));
697 
698 	ifp0 = if_get(sc->sc_sync_ifidx);
699 	if (ifp0 != NULL) {
700 		strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname,
701 		    sizeof(pfsyncr.pfsyncr_syncdev));
702 	}
703 	if_put(ifp0);
704 
705 	pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer;
706 	pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
707 	pfsyncr.pfsyncr_defer = sc->sc_defer;
708 
709 	return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
710 }
711 
712 static int
713 pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
714 {
715 	struct ifnet *ifp = &sc->sc_if;
716 	struct pfsyncreq pfsyncr;
717 	unsigned int sync_ifidx = sc->sc_sync_ifidx;
718 	int wantdown = 0;
719 	int error;
720 
721 	error = suser(curproc);
722 	if (error != 0)
723 		return (error);
724 
725 	error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr));
726 	if (error != 0)
727 		return (error);
728 
729 	if (pfsyncr.pfsyncr_maxupdates > 255)
730 		return (EINVAL);
731 
732 	if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */
733 		struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev);
734 		if (ifp0 == NULL)
735 			return (ENXIO);
736 
737 		if (ifp0->if_index != sync_ifidx)
738 			wantdown = 1;
739 
740 		sync_ifidx = ifp0->if_index;
741 		if_put(ifp0);
742 	} else { /* del */
743 		wantdown = 1;
744 		sync_ifidx = 0;
745 	}
746 
747 	if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY)
748 		pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
749 	if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr)
750 		wantdown = 1;
751 
752 	if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING))
753 		return (EBUSY);
754 
755 	/* commit */
756 	sc->sc_sync_ifidx = sync_ifidx;
757 	sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer;
758 	sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
759 	sc->sc_defer = pfsyncr.pfsyncr_defer;
760 
761 	return (0);
762 }
763 
764 static int
765 pfsync_up(struct pfsync_softc *sc)
766 {
767 	struct ifnet *ifp = &sc->sc_if;
768 	struct ifnet *ifp0;
769 	void *inm = NULL;
770 	int error = 0;
771 	struct ip *ip;
772 
773 	NET_ASSERT_LOCKED();
774 	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
775 
776 	if (sc->sc_dead)
777 		return (ENXIO);
778 
779 	/*
780 	 * coordinate with pfsync_down(). if sc_up is still up and
781 	 * we're here then something else is tearing pfsync down.
782 	 */
783 	if (sc->sc_up)
784 		return (EBUSY);
785 
786 	if (sc->sc_syncpeer.s_addr == INADDR_ANY ||
787 	    sc->sc_syncpeer.s_addr == INADDR_BROADCAST)
788 		return (EDESTADDRREQ);
789 
790 	ifp0 = if_get(sc->sc_sync_ifidx);
791 	if (ifp0 == NULL)
792 		return (ENXIO);
793 
794 	if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) {
795 		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
796 			error = ENODEV;
797 			goto put;
798 		}
799 		inm = in_addmulti(&sc->sc_syncpeer, ifp0);
800 		if (inm == NULL) {
801 			error = ECONNABORTED;
802 			goto put;
803 		}
804 	}
805 
806 	sc->sc_up = 1;
807 
808 	ip = &sc->sc_template;
809 	memset(ip, 0, sizeof(*ip));
810 	ip->ip_v = IPVERSION;
811 	ip->ip_hl = sizeof(*ip) >> 2;
812 	ip->ip_tos = IPTOS_LOWDELAY;
813 	/* len and id are set later */
814 	ip->ip_off = htons(IP_DF);
815 	ip->ip_ttl = PFSYNC_DFLTTL;
816 	ip->ip_p = IPPROTO_PFSYNC;
817 	ip->ip_src.s_addr = INADDR_ANY;
818 	ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr;
819 
820 	/* commit */
821 	refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */
822 
823 #if NCARP > 0
824 	sc->sc_sync_if_down = 1;
825 	carp_group_demote_adj(&sc->sc_if, 1, "pfsync up");
826 #endif
827 
828 	if_linkstatehook_add(ifp0, &sc->sc_ltask);
829 	if_detachhook_add(ifp0, &sc->sc_dtask);
830 
831 	sc->sc_inm = inm;
832 	SET(ifp->if_flags, IFF_RUNNING);
833 
834 	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP);
835 
836 	refcnt_take(&sc->sc_refs); /* give one to SMR */
837 	SMR_PTR_SET_LOCKED(&pfsyncif, sc);
838 
839 	pfsync_syncif_link(sc); /* try and push the bulk req state forward */
840 
841 put:
842 	if_put(ifp0);
843 	return (error);
844 }
845 
846 static struct mbuf *
847 pfsync_encap(struct pfsync_softc *sc, struct mbuf *m)
848 {
849 	struct {
850 		struct ip		ip;
851 		struct pfsync_header	ph;
852 	} __packed __aligned(4) *h;
853 	unsigned int mlen = m->m_pkthdr.len;
854 
855 	m = m_prepend(m, sizeof(*h), M_DONTWAIT);
856 	if (m == NULL)
857 		return (NULL);
858 
859 	h = mtod(m, void *);
860 	memset(h, 0, sizeof(*h));
861 
862 	mlen += sizeof(h->ph);
863 	h->ph.version = PFSYNC_VERSION;
864 	h->ph.len = htons(mlen);
865 	/* h->ph.pfcksum */
866 
867 	mlen += sizeof(h->ip);
868 	h->ip = sc->sc_template;
869 	h->ip.ip_len = htons(mlen);
870 	h->ip.ip_id = htons(ip_randomid());
871 
872 	return (m);
873 }
874 
875 static void
876 pfsync_bulk_req_send(struct pfsync_softc *sc)
877 {
878 	struct {
879 		struct pfsync_subheader	subh;
880 		struct pfsync_upd_req	ur;
881 	} __packed __aligned(4) *h;
882 	unsigned mlen = max_linkhdr +
883 	    sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h);
884 	struct mbuf *m;
885 
886 	m = m_gethdr(M_DONTWAIT, MT_DATA);
887 	if (m == NULL)
888 		goto fail;
889 
890 	if (mlen > MHLEN) {
891 		MCLGETL(m, M_DONTWAIT, mlen);
892 		if (!ISSET(m->m_flags, M_EXT))
893 			goto drop;
894 	}
895 
896 	m_align(m, sizeof(*h));
897 	m->m_len = m->m_pkthdr.len = sizeof(*h);
898 
899 	h = mtod(m, void *);
900 	memset(h, 0, sizeof(*h));
901 
902 	h->subh.action = PFSYNC_ACT_UPD_REQ;
903 	h->subh.len = sizeof(h->ur) >> 2;
904 	h->subh.count = htons(1);
905 
906 	h->ur.id = htobe64(0);
907 	h->ur.creatorid = htobe32(0);
908 
909 	m = pfsync_encap(sc, m);
910 	if (m == NULL)
911 		goto fail;
912 
913 	pfsync_sendout(sc, m);
914 	return;
915 
916 drop:
917 	m_freem(m);
918 fail:
919 	printf("%s: unable to request bulk update\n", sc->sc_if.if_xname);
920 }
921 
922 static void
923 pfsync_bulk_req_nstate(struct pfsync_softc *sc,
924     enum pfsync_bulk_req_state nstate, int seconds)
925 {
926 	sc->sc_bulk_req.req_state = nstate;
927 	if (seconds > 0)
928 		timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds);
929 	else
930 		timeout_del(&sc->sc_bulk_req.req_tmo);
931 }
932 
933 static void
934 pfsync_bulk_req_invstate(struct pfsync_softc *sc,
935     enum pfsync_bulk_req_event evt)
936 {
937 	panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname,
938 	    pfsync_bulk_req_event_names[evt],
939 	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]);
940 }
941 
942 static void
943 pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc)
944 {
945 	/* calculate the number of packets we expect */
946 	int t = pf_pool_limits[PF_LIMIT_STATES].limit /
947 	    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
948 	     sizeof(struct pfsync_state));
949 
950 	/* turn it into seconds */
951 	t /= 1000 / PFSYNC_BULK_SND_IVAL_MS;
952 
953 	if (t == 0)
954 		t = 1;
955 
956 	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4);
957 }
958 
959 static inline void
960 pfsync_bulk_req_nstate_done(struct pfsync_softc *sc)
961 {
962 	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0);
963 
964 	KASSERT(sc->sc_bulk_req.req_demoted == 1);
965 	sc->sc_bulk_req.req_demoted = 0;
966 
967 #if NCARP > 0
968 	carp_group_demote_adj(&sc->sc_if, -32, "pfsync done");
969 #endif
970 }
971 
972 static void
973 pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt)
974 {
975 	struct ifnet *ifp = &sc->sc_if;
976 
977 	rw_enter_write(&sc->sc_bulk_req.req_lock);
978 	pfsync_dprintf(sc, "%s state %s evt %s", __func__,
979 	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state],
980 	    pfsync_bulk_req_event_names[evt]);
981 
982 	if (evt == PFSYNC_BREQ_EVT_DOWN) {
983 		/* unconditionally move down */
984 		sc->sc_bulk_req.req_tries = 0;
985 		pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0);
986 
987 		if (sc->sc_bulk_req.req_demoted) {
988 			sc->sc_bulk_req.req_demoted = 0;
989 #if NCARP > 0
990 			carp_group_demote_adj(&sc->sc_if, -32,
991 			    "pfsync down");
992 #endif
993 		}
994 	} else switch (sc->sc_bulk_req.req_state) {
995 	case PFSYNC_BREQ_S_NONE:
996 		switch (evt) {
997 		case PFSYNC_BREQ_EVT_UP:
998 			KASSERT(sc->sc_bulk_req.req_demoted == 0);
999 			sc->sc_bulk_req.req_demoted = 1;
1000 #if NCARP > 0
1001 			carp_group_demote_adj(&sc->sc_if, 32,
1002 			    "pfsync start");
1003 #endif
1004 			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30);
1005 			break;
1006 		default:
1007 			pfsync_bulk_req_invstate(sc, evt);
1008 		}
1009 
1010 		break;
1011 
1012 	case PFSYNC_BREQ_S_START:
1013 		switch (evt) {
1014 		case PFSYNC_BREQ_EVT_LINK:
1015 			pfsync_bulk_req_send(sc);
1016 			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2);
1017 			break;
1018 		case PFSYNC_BREQ_EVT_TMO:
1019 			pfsync_dprintf(sc, "timeout waiting for link");
1020 			pfsync_bulk_req_nstate_done(sc);
1021 			break;
1022 		case PFSYNC_BREQ_EVT_BUS_START:
1023 			pfsync_bulk_req_nstate_bulk(sc);
1024 			break;
1025 		case PFSYNC_BREQ_EVT_BUS_END:
1026 			/* ignore this */
1027 			break;
1028 		default:
1029 			pfsync_bulk_req_invstate(sc, evt);
1030 		}
1031 		break;
1032 
1033 	case PFSYNC_BREQ_S_SENT:
1034 		switch (evt) {
1035 		case PFSYNC_BREQ_EVT_BUS_START:
1036 			pfsync_bulk_req_nstate_bulk(sc);
1037 			break;
1038 		case PFSYNC_BREQ_EVT_BUS_END:
1039 		case PFSYNC_BREQ_EVT_LINK:
1040 			/* ignore this */
1041 			break;
1042 		case PFSYNC_BREQ_EVT_TMO:
1043 			if (++sc->sc_bulk_req.req_tries <
1044 			    PFSYNC_MAX_BULKTRIES) {
1045 				pfsync_bulk_req_send(sc);
1046 				pfsync_bulk_req_nstate(sc,
1047 				    PFSYNC_BREQ_S_SENT, 2);
1048 				break;
1049 			}
1050 
1051 			pfsync_dprintf(sc,
1052 			    "timeout waiting for bulk transfer start");
1053 			pfsync_bulk_req_nstate_done(sc);
1054 			break;
1055 		default:
1056 			pfsync_bulk_req_invstate(sc, evt);
1057 		}
1058 		break;
1059 
1060 	case PFSYNC_BREQ_S_BULK:
1061 		switch (evt) {
1062 		case PFSYNC_BREQ_EVT_BUS_START:
1063 		case PFSYNC_BREQ_EVT_LINK:
1064 			/* ignore this */
1065 			break;
1066 		case PFSYNC_BREQ_EVT_BUS_END:
1067 			pfsync_bulk_req_nstate_done(sc);
1068 			break;
1069 		case PFSYNC_BREQ_EVT_TMO:
1070 			if (++sc->sc_bulk_req.req_tries <
1071 			    PFSYNC_MAX_BULKTRIES) {
1072 				pfsync_bulk_req_send(sc);
1073 				pfsync_bulk_req_nstate(sc,
1074 				    PFSYNC_BREQ_S_SENT, 2);
1075 			}
1076 
1077 			pfsync_dprintf(sc,
1078 			    "timeout waiting for bulk transfer end");
1079 			pfsync_bulk_req_nstate_done(sc);
1080 			break;
1081 		default:
1082 			pfsync_bulk_req_invstate(sc, evt);
1083 		}
1084 		break;
1085 
1086 	case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */
1087 		switch (evt) {
1088 		case PFSYNC_BREQ_EVT_BUS_START:
1089 		case PFSYNC_BREQ_EVT_BUS_END:
1090 		case PFSYNC_BREQ_EVT_LINK:
1091 			/* nops */
1092 			break;
1093 		default:
1094 			pfsync_bulk_req_invstate(sc, evt);
1095 		}
1096 		break;
1097 
1098 	default:
1099 		panic("%s: unknown event %d", ifp->if_xname, evt);
1100 		/* NOTREACHED */
1101 	}
1102 	rw_exit_write(&sc->sc_bulk_req.req_lock);
1103 }
1104 
1105 static void
1106 pfsync_bulk_req_tmo(void *arg)
1107 {
1108 	struct pfsync_softc *sc = arg;
1109 
1110 	NET_LOCK();
1111 	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO);
1112 	NET_UNLOCK();
1113 }
1114 
1115 static int
1116 pfsync_down(struct pfsync_softc *sc)
1117 {
1118 	struct ifnet *ifp = &sc->sc_if;
1119 	struct ifnet *ifp0;
1120 	struct smr_entry smr;
1121 	size_t i;
1122 	void *inm = NULL;
1123 	unsigned int sndbar = 0;
1124 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
1125 	struct pfsync_deferral *pd;
1126 
1127 	NET_ASSERT_LOCKED();
1128 	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1129 
1130 	/*
1131 	 * tearing down pfsync involves waiting for pfsync to stop
1132 	 * running in various contexts including softnet taskqs.
1133 	 * this thread cannot hold netlock while waiting for a
1134 	 * barrier in softnet because softnet might be waiting for
1135 	 * the netlock. sc->sc_up is used to coordinate with
1136 	 * pfsync_up.
1137 	 */
1138 
1139 	CLR(ifp->if_flags, IFF_RUNNING);
1140 
1141 	ifp0 = if_get(sc->sc_sync_ifidx);
1142 	if (ifp0 != NULL) {
1143 		if_linkstatehook_del(ifp0, &sc->sc_ltask);
1144 		if_detachhook_del(ifp0, &sc->sc_dtask);
1145 	}
1146 	if_put(ifp0);
1147 
1148 #if NCARP > 0
1149 	if (sc->sc_sync_if_down)
1150 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync down");
1151 #endif
1152 
1153 	NET_UNLOCK();
1154 
1155 	KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc,
1156 	   "pfsyncif %p != sc %p", pfsyncif, sc);
1157 	SMR_PTR_SET_LOCKED(&pfsyncif, NULL);
1158 	smr_init(&smr);
1159 	smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs);
1160 
1161 	/* stop pf producing work before cleaning up the timeouts and tasks */
1162 	refcnt_finalize(&sc->sc_refs, "pfsyncfini");
1163 
1164 	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN);
1165 
1166 	rw_enter_read(&pf_state_list.pfs_rwl);
1167 	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
1168 	if (sc->sc_bulk_snd.snd_tail != NULL) {
1169 		sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo);
1170 
1171 		sc->sc_bulk_snd.snd_again = 0;
1172 		sc->sc_bulk_snd.snd_next = NULL;
1173 		sc->sc_bulk_snd.snd_tail = NULL;
1174 	}
1175 	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
1176 	rw_exit_read(&pf_state_list.pfs_rwl);
1177 
1178 	/*
1179 	 * do a single barrier for all the timeouts. because the
1180 	 * timeouts in each slice are configured the same way, the
1181 	 * barrier for one will work for all of them.
1182 	 */
1183 	for (i = 0; i < nitems(sc->sc_slices); i++) {
1184 		struct pfsync_slice *s = &sc->sc_slices[i];
1185 
1186 		timeout_del(&s->s_tmo);
1187 		task_del(s->s_softnet, &s->s_task);
1188 		task_del(s->s_softnet, &s->s_send);
1189 
1190 		timeout_del(&s->s_deferrals_tmo);
1191 		task_del(s->s_softnet, &s->s_deferrals_task);
1192 	}
1193 	timeout_barrier(&sc->sc_slices[0].s_tmo);
1194 	timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */
1195 	if (sndbar) {
1196 		/* technically the preceding barrier does the same job */
1197 		timeout_barrier(&sc->sc_bulk_snd.snd_tmo);
1198 	}
1199 	net_tq_barriers("pfsyncbar");
1200 
1201 	/* pfsync is no longer running */
1202 
1203 	if (sc->sc_inm != NULL) {
1204 		inm = sc->sc_inm;
1205 		sc->sc_inm = NULL;
1206 	}
1207 
1208 	for (i = 0; i < nitems(sc->sc_slices); i++) {
1209 		struct pfsync_slice *s = &sc->sc_slices[i];
1210 		struct pf_state *st;
1211 
1212 		pfsync_slice_drop(sc, s);
1213 		mq_purge(&s->s_sendq);
1214 
1215 		while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) {
1216 			TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
1217 
1218 			st = pd->pd_st;
1219 			st->sync_defer = NULL;
1220 
1221 			TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
1222 		}
1223 		s->s_deferred = 0;
1224 	}
1225 
1226 	NET_LOCK();
1227 	sc->sc_up = 0;
1228 
1229 	if (inm != NULL)
1230 		in_delmulti(inm);
1231 
1232 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
1233 		TAILQ_REMOVE(&pds, pd, pd_entry);
1234 
1235 		pfsync_defer_output(pd);
1236 	}
1237 
1238 	return (0);
1239 }
1240 
1241 int
1242 pfsync_is_up(void)
1243 {
1244 	int rv;
1245 
1246 	smr_read_enter();
1247 	rv = SMR_PTR_GET(&pfsyncif) != NULL;
1248 	smr_read_leave();
1249 
1250 	return (rv);
1251 }
1252 
1253 static void
1254 pfsync_start(struct ifqueue *ifq)
1255 {
1256 	ifq_purge(ifq);
1257 }
1258 
1259 struct pfsync_q {
1260 	void		(*write)(struct pf_state *, void *);
1261 	size_t		len;
1262 	u_int8_t	action;
1263 };
1264 
1265 static struct pfsync_slice *
1266 pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st)
1267 {
1268 	unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices);
1269 	struct pfsync_slice *s = &sc->sc_slices[idx];
1270 
1271 	if (!mtx_enter_try(&s->s_mtx)) {
1272 		mtx_enter(&s->s_mtx);
1273 		s->s_stat_contended++;
1274 	}
1275 	s->s_stat_locks++;
1276 
1277 	return (s);
1278 }
1279 
1280 static void
1281 pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s)
1282 {
1283 	mtx_leave(&s->s_mtx);
1284 }
1285 
1286 /* we have one of these for every PFSYNC_S_ */
1287 static void	pfsync_out_state(struct pf_state *, void *);
1288 static void	pfsync_out_iack(struct pf_state *, void *);
1289 static void	pfsync_out_upd_c(struct pf_state *, void *);
1290 static void	pfsync_out_del(struct pf_state *, void *);
1291 #if defined(IPSEC)
1292 static void	pfsync_out_tdb(struct tdb *, void *);
1293 #endif
1294 
1295 static const struct pfsync_q pfsync_qs[] = {
1296 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
1297 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
1298 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
1299 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
1300 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
1301 };
1302 
1303 static void
1304 pfsync_out_state(struct pf_state *st, void *buf)
1305 {
1306 	struct pfsync_state *sp = buf;
1307 
1308 	mtx_enter(&st->mtx);
1309 	pf_state_export(sp, st);
1310 	mtx_leave(&st->mtx);
1311 }
1312 
1313 static void
1314 pfsync_out_iack(struct pf_state *st, void *buf)
1315 {
1316 	struct pfsync_ins_ack *iack = buf;
1317 
1318 	iack->id = st->id;
1319 	iack->creatorid = st->creatorid;
1320 }
1321 
1322 static void
1323 pfsync_out_upd_c(struct pf_state *st, void *buf)
1324 {
1325 	struct pfsync_upd_c *up = buf;
1326 
1327 	memset(up, 0, sizeof(*up));
1328 	up->id = st->id;
1329 	up->creatorid = st->creatorid;
1330 
1331 	mtx_enter(&st->mtx);
1332 	pf_state_peer_hton(&st->src, &up->src);
1333 	pf_state_peer_hton(&st->dst, &up->dst);
1334 	up->timeout = st->timeout;
1335 	mtx_leave(&st->mtx);
1336 }
1337 
1338 static void
1339 pfsync_out_del(struct pf_state *st, void *buf)
1340 {
1341 	struct pfsync_del_c *dp = buf;
1342 
1343 	dp->id = st->id;
1344 	dp->creatorid = st->creatorid;
1345 
1346 	st->sync_state = PFSYNC_S_DEAD;
1347 }
1348 
1349 #if defined(IPSEC)
1350 static inline void
1351 pfsync_tdb_enter(struct tdb *tdb)
1352 {
1353 	mtx_enter(&tdb->tdb_mtx);
1354 }
1355 
1356 static inline void
1357 pfsync_tdb_leave(struct tdb *tdb)
1358 {
1359 	unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
1360 	mtx_leave(&tdb->tdb_mtx);
1361 	if (snapped)
1362 		wakeup_one(&tdb->tdb_updates);
1363 }
1364 #endif /* defined(IPSEC) */
1365 
1366 static void
1367 pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s)
1368 {
1369 	struct pf_state *st;
1370 	int q;
1371 #if defined(IPSEC)
1372 	struct tdb *tdb;
1373 #endif
1374 
1375 	for (q = 0; q < nitems(s->s_qs); q++) {
1376 		if (TAILQ_EMPTY(&s->s_qs[q]))
1377 			continue;
1378 
1379 		while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) {
1380 			TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
1381 #ifdef PFSYNC_DEBUG
1382 			KASSERT(st->sync_state == q);
1383 #endif
1384 			st->sync_state = PFSYNC_S_NONE;
1385 			pf_state_unref(st);
1386 		}
1387 	}
1388 
1389 #if defined(IPSEC)
1390 	while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
1391 		TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
1392 
1393 		pfsync_tdb_enter(tdb);
1394 		KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
1395 		CLR(tdb->tdb_flags, TDBF_PFSYNC);
1396 		pfsync_tdb_leave(tdb);
1397 	}
1398 #endif /* defined(IPSEC) */
1399 
1400 	timeout_del(&s->s_tmo);
1401 	s->s_len = PFSYNC_MINPKT;
1402 }
1403 
1404 static struct mbuf *
1405 pfsync_slice_write(struct pfsync_slice *s)
1406 {
1407 	struct pfsync_softc *sc = s->s_pfsync;
1408 	struct mbuf *m;
1409 
1410 	struct ip *ip;
1411 	struct pfsync_header *ph;
1412 	struct pfsync_subheader *subh;
1413 
1414 	unsigned int mlen = max_linkhdr + s->s_len;
1415 	unsigned int q, count;
1416 	caddr_t ptr;
1417 	size_t off;
1418 
1419 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1420 	if (s->s_len == PFSYNC_MINPKT) {
1421 		s->s_stat_write_nop++;
1422 		return (NULL);
1423 	}
1424 
1425 	task_del(s->s_softnet, &s->s_task);
1426 
1427 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1428 	if (m == NULL)
1429 		goto drop;
1430 
1431 	if (mlen > MHLEN) {
1432 		MCLGETL(m, M_DONTWAIT, mlen);
1433 		if (!ISSET(m->m_flags, M_EXT))
1434 			goto drop;
1435 	}
1436 
1437 	m_align(m, s->s_len);
1438 	m->m_len = m->m_pkthdr.len = s->s_len;
1439 
1440 	ptr = mtod(m, caddr_t);
1441 	off = 0;
1442 
1443 	ip = (struct ip *)(ptr + off);
1444 	off += sizeof(*ip);
1445 	*ip = sc->sc_template;
1446 	ip->ip_len = htons(m->m_pkthdr.len);
1447 	ip->ip_id = htons(ip_randomid());
1448 
1449 	ph = (struct pfsync_header *)(ptr + off);
1450 	off += sizeof(*ph);
1451 	memset(ph, 0, sizeof(*ph));
1452 	ph->version = PFSYNC_VERSION;
1453 	ph->len = htons(m->m_pkthdr.len - sizeof(*ip));
1454 
1455 	for (q = 0; q < nitems(s->s_qs); q++) {
1456 		struct pf_state_queue *psq = &s->s_qs[q];
1457 		struct pf_state *st;
1458 
1459 		if (TAILQ_EMPTY(psq))
1460 			continue;
1461 
1462 		subh = (struct pfsync_subheader *)(ptr + off);
1463 		off += sizeof(*subh);
1464 
1465 		count = 0;
1466 		while ((st = TAILQ_FIRST(psq)) != NULL) {
1467 			TAILQ_REMOVE(psq, st, sync_list);
1468 			count++;
1469 
1470 			KASSERT(st->sync_state == q);
1471 			/* the write handler below may override this */
1472 			st->sync_state = PFSYNC_S_NONE;
1473 
1474 			pfsync_qs[q].write(st, ptr + off);
1475 			off += pfsync_qs[q].len;
1476 
1477 			pf_state_unref(st);
1478 		}
1479 
1480 		subh->action = pfsync_qs[q].action;
1481 		subh->len = pfsync_qs[q].len >> 2;
1482 		subh->count = htons(count);
1483 	}
1484 
1485 #if defined(IPSEC)
1486 	if (!TAILQ_EMPTY(&s->s_tdb_q)) {
1487 		struct tdb *tdb;
1488 
1489 		subh = (struct pfsync_subheader *)(ptr + off);
1490 		off += sizeof(*subh);
1491 
1492 		count = 0;
1493 		while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
1494 			TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
1495 			count++;
1496 
1497 			pfsync_tdb_enter(tdb);
1498 			KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
1499 
1500 			/* get a consistent view of the counters */
1501 			pfsync_out_tdb(tdb, ptr + off);
1502 
1503 			CLR(tdb->tdb_flags, TDBF_PFSYNC);
1504 			pfsync_tdb_leave(tdb);
1505 
1506 			off += sizeof(struct pfsync_tdb);
1507 		}
1508 
1509 		subh->action = PFSYNC_ACT_TDB;
1510 		subh->len = sizeof(struct pfsync_tdb) >> 2;
1511 		subh->count = htons(count);
1512 	}
1513 #endif
1514 
1515 	timeout_del(&s->s_tmo);
1516 	s->s_len = PFSYNC_MINPKT;
1517 
1518 	return (m);
1519 drop:
1520 	m_freem(m);
1521 	pfsyncstat_inc(pfsyncs_onomem);
1522 	pfsync_slice_drop(sc, s);
1523 	return (NULL);
1524 }
1525 
1526 static void
1527 pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m)
1528 {
1529 	struct ip_moptions imo;
1530 	unsigned int len = m->m_pkthdr.len;
1531 #if NBPFILTER > 0
1532 	caddr_t if_bpf = sc->sc_if.if_bpf;
1533 	if (if_bpf)
1534 		bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT);
1535 #endif
1536 
1537 	imo.imo_ifidx = sc->sc_sync_ifidx;
1538 	imo.imo_ttl = PFSYNC_DFLTTL;
1539 	imo.imo_loop = 0;
1540 	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1541 
1542 	if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) {
1543 		counters_pkt(sc->sc_if.if_counters, ifc_opackets,
1544 		    ifc_obytes, len);
1545 		pfsyncstat_inc(pfsyncs_opackets);
1546 	} else {
1547 		counters_inc(sc->sc_if.if_counters, ifc_oerrors);
1548 		pfsyncstat_inc(pfsyncs_oerrors);
1549 	}
1550 }
1551 
1552 static void
1553 pfsync_slice_tmo(void *arg)
1554 {
1555 	struct pfsync_slice *s = arg;
1556 
1557 	task_add(s->s_softnet, &s->s_task);
1558 }
1559 
1560 static void
1561 pfsync_slice_sched(struct pfsync_slice *s)
1562 {
1563 	s->s_stat_task_add++;
1564 	task_add(s->s_softnet, &s->s_task);
1565 }
1566 
1567 static void
1568 pfsync_slice_task(void *arg)
1569 {
1570 	struct pfsync_slice *s = arg;
1571 	struct mbuf *m;
1572 
1573 	mtx_enter(&s->s_mtx);
1574 	s->s_stat_task_run++;
1575 
1576 	m = pfsync_slice_write(s);
1577 	mtx_leave(&s->s_mtx);
1578 	if (m != NULL) {
1579 		NET_LOCK();
1580 		pfsync_sendout(s->s_pfsync, m);
1581 		NET_UNLOCK();
1582 	}
1583 }
1584 
1585 static void
1586 pfsync_slice_sendq(void *arg)
1587 {
1588 	struct pfsync_slice *s = arg;
1589 	struct mbuf_list ml;
1590 	struct mbuf *m;
1591 
1592 	mq_delist(&s->s_sendq, &ml);
1593 	if (ml_empty(&ml))
1594 		return;
1595 
1596 	mtx_enter(&s->s_mtx);
1597 	s->s_stat_dequeue++;
1598 	mtx_leave(&s->s_mtx);
1599 
1600 	NET_LOCK();
1601 	while ((m = ml_dequeue(&ml)) != NULL)
1602 		pfsync_sendout(s->s_pfsync, m);
1603 	NET_UNLOCK();
1604 }
1605 
1606 static void
1607 pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q)
1608 {
1609 	size_t nlen = pfsync_qs[q].len;
1610 	struct mbuf *m = NULL;
1611 
1612 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1613 	KASSERT(st->sync_state == PFSYNC_S_NONE);
1614 	KASSERT(s->s_len >= PFSYNC_MINPKT);
1615 
1616 	if (TAILQ_EMPTY(&s->s_qs[q]))
1617 		nlen += sizeof(struct pfsync_subheader);
1618 
1619 	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
1620 		m = pfsync_slice_write(s);
1621 		if (m != NULL) {
1622 			s->s_stat_enqueue++;
1623 			if (mq_enqueue(&s->s_sendq, m) == 0)
1624 				task_add(s->s_softnet, &s->s_send);
1625 		}
1626 
1627 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1628 	}
1629 
1630 	s->s_len += nlen;
1631 	pf_state_ref(st);
1632 	TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list);
1633 	st->sync_state = q;
1634 
1635 	if (!timeout_pending(&s->s_tmo))
1636 		timeout_add_sec(&s->s_tmo, 1);
1637 }
1638 
1639 static void
1640 pfsync_q_del(struct pfsync_slice *s, struct pf_state *st)
1641 {
1642 	unsigned int q = st->sync_state;
1643 
1644 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1645 	KASSERT(st->sync_state < PFSYNC_S_NONE);
1646 
1647 	st->sync_state = PFSYNC_S_NONE;
1648 	TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
1649 	pf_state_unref(st);
1650 	s->s_len -= pfsync_qs[q].len;
1651 
1652 	if (TAILQ_EMPTY(&s->s_qs[q]))
1653 		s->s_len -= sizeof(struct pfsync_subheader);
1654 }
1655 
1656 /*
1657  * the pfsync hooks that pf calls
1658  */
1659 
1660 void
1661 pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw,
1662     const struct pf_state_key *sks, int flags)
1663 {
1664 	/* this is called before pf_state_insert */
1665 
1666 	if (skw->proto == IPPROTO_PFSYNC)
1667 		SET(st->state_flags, PFSTATE_NOSYNC);
1668 
1669 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1670 		st->sync_state = PFSYNC_S_DEAD;
1671 		return;
1672 	}
1673 
1674 	if (ISSET(flags, PFSYNC_SI_IOCTL)) {
1675 		/* all good */
1676 		return;
1677 	}
1678 
1679 	/* state came off the wire */
1680 	if (ISSET(flags, PFSYNC_SI_PFSYNC)) {
1681 		if (ISSET(st->state_flags, PFSTATE_ACK)) {
1682 			CLR(st->state_flags, PFSTATE_ACK);
1683 
1684 			/* peer wants an iack, not an insert */
1685 			st->sync_state = PFSYNC_S_SYNC;
1686 		} else
1687 			st->sync_state = PFSYNC_S_PFSYNC;
1688 	}
1689 }
1690 
1691 void
1692 pfsync_insert_state(struct pf_state *st)
1693 {
1694 	struct pfsync_softc *sc;
1695 
1696 	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1697 
1698 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1699 	    st->sync_state == PFSYNC_S_DEAD)
1700 		return;
1701 
1702 	smr_read_enter();
1703 	sc = SMR_PTR_GET(&pfsyncif);
1704 	if (sc != NULL) {
1705 		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1706 
1707 		switch (st->sync_state) {
1708 		case PFSYNC_S_UPD_C:
1709 			/* we must have lost a race after insert */
1710 			pfsync_q_del(s, st);
1711 			/* FALLTHROUGH */
1712 		case PFSYNC_S_NONE:
1713 			pfsync_q_ins(s, st, PFSYNC_S_INS);
1714 			break;
1715 		case PFSYNC_S_SYNC:
1716 			st->sync_state = PFSYNC_S_NONE; /* gross */
1717 			pfsync_q_ins(s, st, PFSYNC_S_IACK);
1718 			pfsync_slice_sched(s); /* the peer is waiting */
1719 			break;
1720 		case PFSYNC_S_PFSYNC:
1721 			/* state was just inserted by pfsync */
1722 			st->sync_state = PFSYNC_S_NONE;
1723 			break;
1724 		default:
1725 			panic("%s: state %p unexpected sync_state %d",
1726 			    __func__, st, st->sync_state);
1727 			/* NOTREACHED */
1728 		}
1729 
1730 		pfsync_slice_leave(sc, s);
1731 	}
1732 	smr_read_leave();
1733 }
1734 
1735 void
1736 pfsync_update_state(struct pf_state *st)
1737 {
1738 	struct pfsync_softc *sc;
1739 
1740 	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1741 
1742 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1743 	    st->sync_state == PFSYNC_S_DEAD)
1744 		return;
1745 
1746 	smr_read_enter();
1747 	sc = SMR_PTR_GET(&pfsyncif);
1748 	if (sc != NULL) {
1749 		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1750 		int sync = 0;
1751 
1752 		switch (st->sync_state) {
1753 		case PFSYNC_S_UPD_C:
1754 		case PFSYNC_S_UPD:
1755 			/* we're already handling it */
1756 			if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1757 				st->sync_updates++;
1758 				if (st->sync_updates >= sc->sc_maxupdates)
1759 					sync = 1;
1760 			}
1761 			/* FALLTHROUGH */
1762 		case PFSYNC_S_INS:
1763 		case PFSYNC_S_DEL:
1764 		case PFSYNC_S_DEAD:
1765 			break;
1766 
1767 		case PFSYNC_S_IACK:
1768 			pfsync_q_del(s, st);
1769 			/* FALLTHROUGH */
1770 		case PFSYNC_S_NONE:
1771 			pfsync_q_ins(s, st, PFSYNC_S_UPD_C);
1772 			st->sync_updates = 0;
1773 			break;
1774 		default:
1775 			panic("%s: state %p unexpected sync_state %d",
1776 			    __func__, st, st->sync_state);
1777 			/* NOTREACHED */
1778 		}
1779 
1780 		if (!sync && (getuptime() - st->pfsync_time) < 2)
1781 			sync = 1;
1782 
1783 		if (sync)
1784 			pfsync_slice_sched(s);
1785 		pfsync_slice_leave(sc, s);
1786 	}
1787 	smr_read_leave();
1788 }
1789 
1790 void
1791 pfsync_delete_state(struct pf_state *st)
1792 {
1793 	struct pfsync_softc *sc;
1794 
1795 	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1796 
1797 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1798 	    st->sync_state == PFSYNC_S_DEAD)
1799 		return;
1800 
1801 	smr_read_enter();
1802 	sc = SMR_PTR_GET(&pfsyncif);
1803 	if (sc != NULL) {
1804 		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1805 
1806 		switch (st->sync_state) {
1807 		case PFSYNC_S_INS:
1808 			/* let's pretend this never happened */
1809 			pfsync_q_del(s, st);
1810 			break;
1811 
1812 		case PFSYNC_S_UPD_C:
1813 		case PFSYNC_S_UPD:
1814 		case PFSYNC_S_IACK:
1815 			pfsync_q_del(s, st);
1816 			/* FALLTHROUGH */
1817 		case PFSYNC_S_NONE:
1818 			pfsync_q_ins(s, st, PFSYNC_S_DEL);
1819 			st->sync_updates = 0;
1820 			break;
1821 		case PFSYNC_S_DEL:
1822 		case PFSYNC_S_DEAD:
1823 			/* XXX we should count this */
1824 			break;
1825 		default:
1826 			panic("%s: state %p unexpected sync_state %d",
1827 			    __func__, st, st->sync_state);
1828 			/* NOTREACHED */
1829 		}
1830 
1831 		pfsync_slice_leave(sc, s);
1832 	}
1833 	smr_read_leave();
1834 }
1835 
1836 struct pfsync_subh_clr {
1837 	struct pfsync_subheader	subh;
1838 	struct pfsync_clr	clr;
1839 } __packed __aligned(4);
1840 
1841 void
1842 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1843 {
1844 	struct pfsync_softc *sc;
1845 	struct pfsync_subh_clr *h;
1846 	struct mbuf *m;
1847 	unsigned int hlen, mlen;
1848 
1849 	smr_read_enter();
1850 	sc = SMR_PTR_GET(&pfsyncif);
1851 	if (sc != NULL)
1852 		refcnt_take(&sc->sc_refs);
1853 	smr_read_leave();
1854 
1855 	if (sc == NULL)
1856 		return;
1857 
1858 	hlen = sizeof(sc->sc_template) +
1859 	    sizeof(struct pfsync_header) +
1860 	    sizeof(*h);
1861 
1862 	mlen = max_linkhdr + hlen;
1863 
1864 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1865 	if (m == NULL) {
1866 		/* count error */
1867 		goto leave;
1868 	}
1869 
1870 	if (mlen > MHLEN) {
1871 		MCLGETL(m, M_DONTWAIT, mlen);
1872 		if (!ISSET(m->m_flags, M_EXT)) {
1873 			m_freem(m);
1874 			goto leave;
1875 		}
1876 	}
1877 
1878 	m_align(m, sizeof(*h));
1879 	h = mtod(m, struct pfsync_subh_clr *);
1880 
1881 	h->subh.action = PFSYNC_ACT_CLR;
1882 	h->subh.len = sizeof(h->clr) >> 2;
1883 	h->subh.count = htons(1);
1884 
1885 	strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname));
1886 	h->clr.creatorid = creatorid;
1887 
1888 	m->m_pkthdr.len = m->m_len = sizeof(*h);
1889 	m = pfsync_encap(sc, m);
1890 	if (m == NULL)
1891 		goto leave;
1892 
1893 	pfsync_sendout(sc, m);
1894 leave:
1895 	refcnt_rele_wake(&sc->sc_refs);
1896 }
1897 
1898 int
1899 pfsync_state_in_use(struct pf_state *st)
1900 {
1901 	struct pfsync_softc *sc;
1902 	int rv = 0;
1903 
1904 	smr_read_enter();
1905 	sc = SMR_PTR_GET(&pfsyncif);
1906 	if (sc != NULL) {
1907 		/*
1908 		 * pfsync bulk sends run inside
1909 		 * rw_enter_read(&pf_state_list.pfs_rwl), and this
1910 		 * code (pfsync_state_in_use) is only called from the
1911 		 * purge code inside
1912 		 * rw_enter_write(&pf_state_list.pfs_rwl). therefore,
1913 		 * those two sections are exclusive so we can safely
1914 		 * look at the bulk send pointers.
1915 		 */
1916 		/* rw_assert_wrlock(&pf_state_list.pfs_rwl); */
1917 		if (sc->sc_bulk_snd.snd_next == st ||
1918 		    sc->sc_bulk_snd.snd_tail == st)
1919 			rv = 1;
1920 	}
1921 	smr_read_leave();
1922 
1923 	return (rv);
1924 }
1925 
1926 int
1927 pfsync_defer(struct pf_state *st, struct mbuf *m)
1928 {
1929 	struct pfsync_softc *sc;
1930 	struct pfsync_slice *s;
1931 	struct pfsync_deferral *pd;
1932 	int sched = 0;
1933 	int rv = 0;
1934 
1935 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1936 	    ISSET(m->m_flags, M_BCAST|M_MCAST))
1937 		return (0);
1938 
1939 	smr_read_enter();
1940 	sc = SMR_PTR_GET(&pfsyncif);
1941 	if (sc == NULL || !sc->sc_defer)
1942 		goto leave;
1943 
1944 	pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT);
1945 	if (pd == NULL) {
1946 		goto leave;
1947 	}
1948 
1949 	s = pfsync_slice_enter(sc, st);
1950 	s->s_stat_defer_add++;
1951 
1952 	pd->pd_st = pf_state_ref(st);
1953 	pd->pd_m = m;
1954 	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
1955 
1956 	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1957 	st->sync_defer = pd;
1958 
1959 	sched = s->s_deferred++;
1960 	TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
1961 
1962 	if (sched == 0)
1963 		timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC);
1964 	else if (sched >= PFSYNC_DEFER_LIMIT) {
1965 		s->s_stat_defer_overlimit++;
1966 		timeout_del(&s->s_deferrals_tmo);
1967 		task_add(s->s_softnet, &s->s_deferrals_task);
1968 	}
1969 
1970 	pfsync_slice_sched(s);
1971 	pfsync_slice_leave(sc, s);
1972 	rv = 1;
1973 leave:
1974 	smr_read_leave();
1975 
1976 	return (rv);
1977 }
1978 
1979 static void
1980 pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st)
1981 {
1982 	struct pfsync_slice *s;
1983 	struct pfsync_deferral *pd;
1984 
1985 	s = pfsync_slice_enter(sc, st);
1986 
1987 	pd = st->sync_defer;
1988 	if (pd != NULL) {
1989 		s->s_stat_defer_ack++;
1990 
1991 		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
1992 		s->s_deferred--;
1993 
1994 		st = pd->pd_st;
1995 		st->sync_defer = NULL;
1996 	}
1997 	pfsync_slice_leave(sc, s);
1998 
1999 	if (pd != NULL)
2000 		pfsync_defer_output(pd);
2001 }
2002 
2003 static void
2004 pfsync_deferrals_tmo(void *arg)
2005 {
2006 	struct pfsync_slice *s = arg;
2007 
2008 	if (READ_ONCE(s->s_deferred) > 0)
2009 		task_add(s->s_softnet, &s->s_deferrals_task);
2010 }
2011 
2012 static void
2013 pfsync_deferrals_task(void *arg)
2014 {
2015 	struct pfsync_slice *s = arg;
2016 	struct pfsync_deferral *pd;
2017 	struct pf_state *st;
2018 	uint64_t now, nsec = 0;
2019 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
2020 
2021 	now = getnsecuptime();
2022 
2023 	mtx_enter(&s->s_mtx);
2024 	s->s_stat_defer_run++; /* maybe move this into the loop */
2025 	for (;;) {
2026 		pd = TAILQ_FIRST(&s->s_deferrals);
2027 		if (pd == NULL)
2028 			break;
2029 
2030 		if (s->s_deferred < PFSYNC_DEFER_LIMIT &&
2031 		    now < pd->pd_deadline) {
2032 			nsec = pd->pd_deadline - now;
2033 			break;
2034 		}
2035 
2036 		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
2037 		s->s_deferred--;
2038 
2039 		/*
2040 		 * detach the pd from the state. the pd still refers
2041 		 * to the state though.
2042 		 */
2043 		st = pd->pd_st;
2044 		st->sync_defer = NULL;
2045 
2046 		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
2047 	}
2048 	mtx_leave(&s->s_mtx);
2049 
2050 	if (nsec > 0) {
2051 		/* we were looking at a pd, but it wasn't old enough */
2052 		timeout_add_nsec(&s->s_deferrals_tmo, nsec);
2053 	}
2054 
2055 	if (TAILQ_EMPTY(&pds))
2056 		return;
2057 
2058 	NET_LOCK();
2059 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
2060 		TAILQ_REMOVE(&pds, pd, pd_entry);
2061 
2062 		pfsync_defer_output(pd);
2063 	}
2064 	NET_UNLOCK();
2065 }
2066 
2067 static void
2068 pfsync_defer_output(struct pfsync_deferral *pd)
2069 {
2070 	struct pf_pdesc pdesc;
2071 	struct pf_state *st = pd->pd_st;
2072 
2073 	if (st->rt == PF_ROUTETO) {
2074 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
2075 		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
2076 			return;
2077 		switch (st->key[PF_SK_WIRE]->af) {
2078 		case AF_INET:
2079 			pf_route(&pdesc, st);
2080 			break;
2081 #ifdef INET6
2082 		case AF_INET6:
2083 			pf_route6(&pdesc, st);
2084 			break;
2085 #endif /* INET6 */
2086 		default:
2087 			unhandled_af(st->key[PF_SK_WIRE]->af);
2088 		}
2089 		pd->pd_m = pdesc.m;
2090 	} else {
2091 		switch (st->key[PF_SK_WIRE]->af) {
2092 		case AF_INET:
2093 			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
2094 			break;
2095 #ifdef INET6
2096 		case AF_INET6:
2097 			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
2098 			break;
2099 #endif /* INET6 */
2100 		default:
2101 			unhandled_af(st->key[PF_SK_WIRE]->af);
2102 		}
2103 
2104 		pd->pd_m = NULL;
2105 	}
2106 
2107 	pf_state_unref(st);
2108 	m_freem(pd->pd_m);
2109 	pool_put(&pfsync_deferrals_pool, pd);
2110 }
2111 
2112 struct pfsync_subh_bus {
2113 	struct pfsync_subheader	subh;
2114 	struct pfsync_bus	bus;
2115 } __packed __aligned(4);
2116 
2117 static unsigned int
2118 pfsync_bulk_snd_bus(struct pfsync_softc *sc,
2119     struct mbuf *m, const unsigned int space,
2120     uint32_t endtime, uint8_t status)
2121 {
2122 	struct pfsync_subh_bus *h;
2123 	unsigned int nlen;
2124 
2125 	nlen = m->m_len + sizeof(*h);
2126 	if (space < nlen)
2127 		return (0);
2128 
2129 	h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len);
2130 	memset(h, 0, sizeof(*h));
2131 
2132 	h->subh.action = PFSYNC_ACT_BUS;
2133 	h->subh.len = sizeof(h->bus) >> 2;
2134 	h->subh.count = htons(1);
2135 
2136 	h->bus.creatorid = pf_status.hostid;
2137 	h->bus.endtime = htonl(endtime);
2138 	h->bus.status = status;
2139 
2140 	m->m_len = nlen;
2141 
2142 	return (1);
2143 }
2144 
2145 static unsigned int
2146 pfsync_bulk_snd_states(struct pfsync_softc *sc,
2147     struct mbuf *m, const unsigned int space, unsigned int len)
2148 {
2149 	struct pf_state *st;
2150 	struct pfsync_state *sp;
2151 	unsigned int nlen;
2152 	unsigned int count = 0;
2153 
2154 	st = sc->sc_bulk_snd.snd_next;
2155 
2156 	for (;;) {
2157 		nlen = len + sizeof(*sp);
2158 		sp = (struct pfsync_state *)(mtod(m, caddr_t) + len);
2159 		if (space < nlen)
2160 			break;
2161 
2162 		mtx_enter(&st->mtx);
2163 		pf_state_export(sp, st);
2164 		mtx_leave(&st->mtx);
2165 
2166 		/* commit */
2167 		count++;
2168 		m->m_len = len = nlen;
2169 
2170 		if (st == sc->sc_bulk_snd.snd_tail) {
2171 			if (pfsync_bulk_snd_bus(sc, m, space,
2172 			    0, PFSYNC_BUS_END) == 0) {
2173 				/* couldn't fit the BUS */
2174 				st = NULL;
2175 				break;
2176 			}
2177 
2178 			/* this BUS is done */
2179 			pfsync_dprintf(sc, "bulk send done (%s)", __func__);
2180 			sc->sc_bulk_snd.snd_again = 0; /* XXX */
2181 			sc->sc_bulk_snd.snd_next = NULL;
2182 			sc->sc_bulk_snd.snd_tail = NULL;
2183 			return (count);
2184 		}
2185 
2186 		st = TAILQ_NEXT(st, entry_list);
2187 	}
2188 
2189 	/* there's still work to do */
2190 	sc->sc_bulk_snd.snd_next = st;
2191 	timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS);
2192 
2193 	return (count);
2194 }
2195 
2196 static unsigned int
2197 pfsync_bulk_snd_sub(struct pfsync_softc *sc,
2198     struct mbuf *m, const unsigned int space)
2199 {
2200 	struct pfsync_subheader *subh;
2201 	unsigned int count;
2202 	unsigned int len, nlen;
2203 
2204 	len = m->m_len;
2205 	nlen = len + sizeof(*subh);
2206 	if (nlen > space)
2207 		return (0);
2208 
2209 	subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len);
2210 
2211 	/*
2212 	 * pfsync_bulk_snd_states only updates m->m_len after
2213 	 * filling in a state after the offset we gave it.
2214 	 */
2215 	count = pfsync_bulk_snd_states(sc, m, space, nlen);
2216 	if (count == 0)
2217 		return (0);
2218 
2219 	subh->action = PFSYNC_ACT_UPD;
2220 	subh->len = sizeof(struct pfsync_state) >> 2;
2221 	subh->count = htons(count);
2222 
2223 	return (count);
2224 }
2225 
2226 static void
2227 pfsync_bulk_snd_start(struct pfsync_softc *sc)
2228 {
2229 	const unsigned int space = sc->sc_if.if_mtu -
2230 	    (sizeof(struct ip) + sizeof(struct pfsync_header));
2231 	struct mbuf *m;
2232 
2233 	rw_enter_read(&pf_state_list.pfs_rwl);
2234 
2235 	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
2236 	if (sc->sc_bulk_snd.snd_next != NULL) {
2237 		sc->sc_bulk_snd.snd_again = 1;
2238 		goto leave;
2239 	}
2240 
2241 	mtx_enter(&pf_state_list.pfs_mtx);
2242 	sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list);
2243 	sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list,
2244 	    pf_state_queue);
2245 	mtx_leave(&pf_state_list.pfs_mtx);
2246 
2247 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2248 	if (m == NULL)
2249 		goto leave;
2250 
2251 	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
2252 	if (!ISSET(m->m_flags, M_EXT)) {
2253 		/* some error++ */
2254 		m_freem(m); /* drop */
2255 		goto leave;
2256 	}
2257 
2258 	m_align(m, space);
2259 	m->m_len = 0;
2260 
2261 	if (sc->sc_bulk_snd.snd_tail == NULL) {
2262 		pfsync_dprintf(sc, "bulk send empty (%s)", __func__);
2263 
2264 		/* list is empty */
2265 		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
2266 			panic("%s: mtu is too low", __func__);
2267 		goto encap;
2268 	}
2269 
2270 	pfsync_dprintf(sc, "bulk send start (%s)", __func__);
2271 
2272 	/* start a bulk update. */
2273 	if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0)
2274 		panic("%s: mtu is too low", __func__);
2275 
2276 	/* fill it up with state updates. */
2277 	pfsync_bulk_snd_sub(sc, m, space);
2278 
2279 encap:
2280 	m->m_pkthdr.len = m->m_len;
2281 	m = pfsync_encap(sc, m);
2282 	if (m == NULL)
2283 		goto leave;
2284 
2285 	pfsync_sendout(sc, m);
2286 
2287 leave:
2288 	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
2289 
2290 	rw_exit_read(&pf_state_list.pfs_rwl);
2291 }
2292 
2293 static void
2294 pfsync_bulk_snd_tmo(void *arg)
2295 {
2296 	struct pfsync_softc *sc = arg;
2297 	const unsigned int space = sc->sc_if.if_mtu -
2298 	    (sizeof(struct ip) + sizeof(struct pfsync_header));
2299 	struct mbuf *m;
2300 
2301 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2302 	if (m == NULL) {
2303 		/* some error++ */
2304 		/* retry later */
2305 		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
2306 		    PFSYNC_BULK_SND_IVAL_MS);
2307 		return;
2308 	}
2309 
2310 	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
2311 	if (!ISSET(m->m_flags, M_EXT)) {
2312 		/* some error++ */
2313 		m_freem(m);
2314 		/* retry later */
2315 		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
2316 		    PFSYNC_BULK_SND_IVAL_MS);
2317 		return;
2318 	}
2319 
2320 	m_align(m, space);
2321 	m->m_len = 0;
2322 
2323 	rw_enter_read(&pf_state_list.pfs_rwl);
2324 	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
2325 
2326 	if (sc->sc_bulk_snd.snd_next == NULL) {
2327 		/* there was no space in the previous packet for a BUS END */
2328 
2329 		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
2330 			panic("%s: mtu is too low", __func__);
2331 
2332 		/* this bulk is done */
2333 		pfsync_dprintf(sc, "bulk send done (%s)", __func__);
2334 		sc->sc_bulk_snd.snd_again = 0; /* XXX */
2335 		sc->sc_bulk_snd.snd_tail = NULL;
2336 	} else {
2337 		pfsync_dprintf(sc, "bulk send again (%s)", __func__);
2338 
2339 		/* fill it up with state updates. */
2340 		pfsync_bulk_snd_sub(sc, m, space);
2341 	}
2342 
2343 	m->m_pkthdr.len = m->m_len;
2344 	m = pfsync_encap(sc, m);
2345 
2346 	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
2347 	rw_exit_read(&pf_state_list.pfs_rwl);
2348 
2349 	if (m != NULL) {
2350 		NET_LOCK();
2351 		pfsync_sendout(sc, m);
2352 		NET_UNLOCK();
2353 	}
2354 }
2355 
2356 static void
2357 pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st)
2358 {
2359 	struct pfsync_slice *s = pfsync_slice_enter(sc, st);
2360 
2361 	switch (st->sync_state) {
2362 	case PFSYNC_S_UPD_C:
2363 	case PFSYNC_S_IACK:
2364 		pfsync_q_del(s, st);
2365 		/* FALLTHROUGH */
2366 	case PFSYNC_S_NONE:
2367 		pfsync_q_ins(s, st, PFSYNC_S_UPD);
2368 		break;
2369 
2370 	case PFSYNC_S_INS:
2371 	case PFSYNC_S_UPD:
2372 	case PFSYNC_S_DEL:
2373 		/* we're already handling it */
2374 		break;
2375 	default:
2376 		panic("%s: state %p unexpected sync_state %d",
2377 		    __func__, st, st->sync_state);
2378 	}
2379 
2380 	pfsync_slice_sched(s);
2381 	pfsync_slice_leave(sc, s);
2382 }
2383 
2384 #if defined(IPSEC)
2385 static void
2386 pfsync_out_tdb(struct tdb *tdb, void *buf)
2387 {
2388 	struct pfsync_tdb *ut = buf;
2389 
2390 	memset(ut, 0, sizeof(*ut));
2391 	ut->spi = tdb->tdb_spi;
2392 	memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst));
2393 	/*
2394 	 * When a failover happens, the master's rpl is probably above
2395 	 * what we see here (we may be up to a second late), so
2396 	 * increase it a bit for outbound tdbs to manage most such
2397 	 * situations.
2398 	 *
2399 	 * For now, just add an offset that is likely to be larger
2400 	 * than the number of packets we can see in one second. The RFC
2401 	 * just says the next packet must have a higher seq value.
2402 	 *
2403 	 * XXX What is a good algorithm for this? We could use
2404 	 * a rate-determined increase, but to know it, we would have
2405 	 * to extend struct tdb.
2406 	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
2407 	 * will soon be replaced anyway. For now, just don't handle
2408 	 * this edge case.
2409 	 */
2410 #define RPL_INCR 16384
2411 	ut->rpl = htobe64(tdb->tdb_rpl +
2412 	    (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0));
2413 	ut->cur_bytes = htobe64(tdb->tdb_cur_bytes);
2414 	ut->sproto = tdb->tdb_sproto;
2415 	ut->rdomain = htons(tdb->tdb_rdomain);
2416 }
2417 
2418 static struct pfsync_slice *
2419 pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t)
2420 {
2421 	/*
2422 	 * just use the first slice for all ipsec (for now) until
2423 	 * it's more obvious what property (eg, spi) we can distribute
2424 	 * tdbs over slices with.
2425 	 */
2426 	struct pfsync_slice *s = &sc->sc_slices[0];
2427 
2428 	if (!mtx_enter_try(&s->s_mtx)) {
2429 		mtx_enter(&s->s_mtx);
2430 		s->s_stat_contended++;
2431 	}
2432 	s->s_stat_locks++;
2433 
2434 	return (s);
2435 }
2436 
2437 static void
2438 pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb)
2439 {
2440 	size_t nlen = sizeof(struct pfsync_tdb);
2441 	struct mbuf *m = NULL;
2442 
2443 	KASSERT(s->s_len >= PFSYNC_MINPKT);
2444 
2445 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
2446 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2447 
2448 	if (TAILQ_EMPTY(&s->s_tdb_q))
2449 		nlen += sizeof(struct pfsync_subheader);
2450 
2451 	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
2452 		m = pfsync_slice_write(s);
2453 		if (m != NULL) {
2454 			s->s_stat_enqueue++;
2455 			if (mq_enqueue(&s->s_sendq, m) == 0)
2456 				task_add(s->s_softnet, &s->s_send);
2457 		}
2458 
2459 		nlen = sizeof(struct pfsync_subheader) +
2460 		    sizeof(struct pfsync_tdb);
2461 	}
2462 
2463 	s->s_len += nlen;
2464 	TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry);
2465 	tdb->tdb_updates = 0;
2466 
2467 	if (!timeout_pending(&s->s_tmo))
2468 		timeout_add_sec(&s->s_tmo, 1);
2469 }
2470 
2471 static void
2472 pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb)
2473 {
2474 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
2475 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2476 
2477 	TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
2478 
2479 	s->s_len -= sizeof(struct pfsync_tdb);
2480 	if (TAILQ_EMPTY(&s->s_tdb_q))
2481 		s->s_len -= sizeof(struct pfsync_subheader);
2482 }
2483 
2484 /*
2485  * the reference that pfsync has to a tdb is accounted for by the
2486  * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is
2487  * called after all other references to a tdb are dropped (with
2488  * tdb_unref) as part of the tdb_free().
2489  *
2490  * tdb_free() needs to wait for pfsync to let go of the tdb though,
2491  * which would be best handled by a reference count, but tdb_free
2492  * needs the NET_LOCK which pfsync is already fighting with. instead
2493  * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop
2494  * with tdb_free.
2495  */
2496 
2497 void
2498 pfsync_update_tdb(struct tdb *tdb, int output)
2499 {
2500 	struct pfsync_softc *sc;
2501 
2502 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2503 
2504 	smr_read_enter();
2505 	sc = SMR_PTR_GET(&pfsyncif);
2506 	if (sc != NULL) {
2507 		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
2508 
2509 		/* TDBF_PFSYNC is only changed while the slice mtx is held */
2510 		if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2511 			mtx_enter(&tdb->tdb_mtx);
2512 			SET(tdb->tdb_flags, TDBF_PFSYNC);
2513 			mtx_leave(&tdb->tdb_mtx);
2514 
2515 			pfsync_tdb_ins(s, tdb);
2516 		} else if (++tdb->tdb_updates >= sc->sc_maxupdates)
2517 			pfsync_slice_sched(s);
2518 
2519 		/* XXX no sync timestamp on tdbs to check */
2520 
2521 		pfsync_slice_leave(sc, s);
2522 	}
2523 	smr_read_leave();
2524 }
2525 
2526 void
2527 pfsync_delete_tdb(struct tdb *tdb)
2528 {
2529 	struct pfsync_softc *sc;
2530 
2531 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2532 
2533 	smr_read_enter();
2534 	sc = SMR_PTR_GET(&pfsyncif);
2535 	if (sc != NULL) {
2536 		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
2537 
2538 		/* TDBF_PFSYNC is only changed while the slice mtx is held */
2539 		if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2540 			pfsync_tdb_del(s, tdb);
2541 
2542 			mtx_enter(&tdb->tdb_mtx);
2543 			CLR(tdb->tdb_flags, TDBF_PFSYNC);
2544 			mtx_leave(&tdb->tdb_mtx);
2545 		}
2546 
2547 		pfsync_slice_leave(sc, s);
2548 	}
2549 	smr_read_leave();
2550 
2551 	/*
2552 	 * handle pfsync_slice_drop being called from pfsync_down
2553 	 * and the smr/slice access above won't work.
2554 	 */
2555 
2556 	mtx_enter(&tdb->tdb_mtx);
2557 	SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */
2558 	while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2559 		msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT,
2560 		    "tdbfree", INFSLP);
2561 	}
2562 	mtx_leave(&tdb->tdb_mtx);
2563 }
2564 #endif /* defined(IPSEC) */
2565 
2566 struct pfsync_act {
2567 	void (*in)(struct pfsync_softc *, const caddr_t,
2568 	    unsigned int, unsigned int);
2569 	size_t len;
2570 };
2571 
2572 static void	pfsync_in_clr(struct pfsync_softc *,
2573 		    const caddr_t, unsigned int, unsigned int);
2574 static void	pfsync_in_iack(struct pfsync_softc *,
2575 		    const caddr_t, unsigned int, unsigned int);
2576 static void	pfsync_in_upd_c(struct pfsync_softc *,
2577 		    const caddr_t, unsigned int, unsigned int);
2578 static void	pfsync_in_ureq(struct pfsync_softc *,
2579 		    const caddr_t, unsigned int, unsigned int);
2580 static void	pfsync_in_del(struct pfsync_softc *,
2581 		    const caddr_t, unsigned int, unsigned int);
2582 static void	pfsync_in_del_c(struct pfsync_softc *,
2583 		    const caddr_t, unsigned int, unsigned int);
2584 static void	pfsync_in_bus(struct pfsync_softc *,
2585 		    const caddr_t, unsigned int, unsigned int);
2586 static void	pfsync_in_tdb(struct pfsync_softc *,
2587 		    const caddr_t, unsigned int, unsigned int);
2588 static void	pfsync_in_ins(struct pfsync_softc *,
2589 		    const caddr_t, unsigned int, unsigned int);
2590 static void	pfsync_in_upd(struct pfsync_softc *,
2591 		    const caddr_t, unsigned int, unsigned int);
2592 
2593 static const struct pfsync_act pfsync_acts[] = {
2594 	[PFSYNC_ACT_CLR] =
2595 	    { pfsync_in_clr,	sizeof(struct pfsync_clr) },
2596 	[PFSYNC_ACT_INS_ACK] =
2597 	    { pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
2598 	[PFSYNC_ACT_UPD_C] =
2599 	    { pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
2600 	[PFSYNC_ACT_UPD_REQ] =
2601 	    { pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
2602 	[PFSYNC_ACT_DEL] =
2603 	    { pfsync_in_del,	sizeof(struct pfsync_state) },
2604 	[PFSYNC_ACT_DEL_C] =
2605 	    { pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
2606 	[PFSYNC_ACT_BUS] =
2607 	    { pfsync_in_bus,	sizeof(struct pfsync_bus) },
2608 	[PFSYNC_ACT_INS] =
2609 	    { pfsync_in_ins,	sizeof(struct pfsync_state) },
2610 	[PFSYNC_ACT_UPD] =
2611 	    { pfsync_in_upd,	sizeof(struct pfsync_state) },
2612 	[PFSYNC_ACT_TDB] =
2613 	    { pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
2614 };
2615 
2616 static void
2617 pfsync_in_skip(struct pfsync_softc *sc,
2618     const caddr_t buf, unsigned int mlen, unsigned int count)
2619 {
2620 	/* nop */
2621 }
2622 
2623 static struct mbuf *
2624 pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen)
2625 {
2626 	struct pfsync_softc *sc;
2627 	struct pfsync_header *ph;
2628 	struct pfsync_subheader *subh;
2629 	unsigned int len;
2630 	void (*in)(struct pfsync_softc *,
2631 	    const caddr_t, unsigned int, unsigned int);
2632 
2633 	pfsyncstat_inc(pfsyncs_ipackets);
2634 
2635 	if (!pf_status.running)
2636 		return (m);
2637 
2638 	/*
2639 	 * pfsyncif is only set if it is up and running correctly.
2640 	 */
2641 	smr_read_enter();
2642 	sc = SMR_PTR_GET(&pfsyncif);
2643 	if (sc == NULL)
2644 		goto leave;
2645 
2646 	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
2647 		pfsyncstat_inc(pfsyncs_badif);
2648 		goto leave;
2649 	}
2650 
2651 	/* verify that the IP TTL is 255. */
2652 	if (ttl != PFSYNC_DFLTTL) {
2653 		pfsyncstat_inc(pfsyncs_badttl);
2654 		goto leave;
2655 	}
2656 
2657 	m_adj(m, hlen);
2658 
2659 	if (m->m_pkthdr.len < sizeof(*ph)) {
2660 		pfsyncstat_inc(pfsyncs_hdrops);
2661 		goto leave;
2662 	}
2663 	if (m->m_len < sizeof(*ph)) {
2664 		m = m_pullup(m, sizeof(*ph));
2665 		if (m == NULL)
2666 			goto leave;
2667 	}
2668 
2669 	ph = mtod(m, struct pfsync_header *);
2670 	if (ph->version != PFSYNC_VERSION) {
2671 		pfsyncstat_inc(pfsyncs_badver);
2672 		goto leave;
2673 	}
2674 
2675 	len = ntohs(ph->len);
2676 	if (m->m_pkthdr.len < len) {
2677 		pfsyncstat_inc(pfsyncs_badlen);
2678 		goto leave;
2679 	}
2680 	if (m->m_pkthdr.len > len)
2681 		m->m_pkthdr.len = len;
2682 
2683 	/* ok, it's serious now */
2684 	refcnt_take(&sc->sc_refs);
2685 	smr_read_leave();
2686 
2687 	counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len);
2688 
2689 	m_adj(m, sizeof(*ph));
2690 
2691 	while (m->m_pkthdr.len >= sizeof(*subh)) {
2692 		unsigned int action, mlen, count;
2693 
2694 		if (m->m_len < sizeof(*subh)) {
2695 			m = m_pullup(m, sizeof(*subh));
2696 			if (m == NULL)
2697 				goto rele;
2698 		}
2699 		subh = mtod(m, struct pfsync_subheader *);
2700 
2701 		action = subh->action;
2702 		mlen = subh->len << 2;
2703 		count = ntohs(subh->count);
2704 
2705 		if (action >= PFSYNC_ACT_MAX ||
2706 		    action >= nitems(pfsync_acts) ||
2707 		    mlen < pfsync_acts[subh->action].len) {
2708 			/*
2709 			 * subheaders are always followed by at least one
2710 			 * message, so if the peer is new
2711 			 * enough to tell us how big its messages are then we
2712 			 * know enough to skip them.
2713 			 */
2714 			if (count == 0 || mlen == 0) {
2715 				pfsyncstat_inc(pfsyncs_badact);
2716 				goto rele;
2717 			}
2718 
2719 			in = pfsync_in_skip;
2720 		} else {
2721 			in = pfsync_acts[action].in;
2722 			if (in == NULL)
2723 				in = pfsync_in_skip;
2724 		}
2725 
2726 		m_adj(m, sizeof(*subh));
2727 		len = mlen * count;
2728 		if (len > m->m_pkthdr.len) {
2729 			pfsyncstat_inc(pfsyncs_badlen);
2730 			goto rele;
2731 		}
2732 		if (m->m_len < len) {
2733 			m = m_pullup(m, len);
2734 			if (m == NULL)
2735 				goto rele;
2736 		}
2737 
2738 		(*in)(sc, mtod(m, caddr_t), mlen, count);
2739 		m_adj(m, len);
2740 	}
2741 
2742 rele:
2743 	refcnt_rele_wake(&sc->sc_refs);
2744 	return (m);
2745 
2746 leave:
2747 	smr_read_leave();
2748 	return (m);
2749 }
2750 
2751 static void
2752 pfsync_in_clr(struct pfsync_softc *sc,
2753     const caddr_t buf, unsigned int mlen, unsigned int count)
2754 {
2755 	const struct pfsync_clr *clr;
2756 	struct pf_state *head, *tail, *st, *next;
2757 	struct pfi_kif *kif;
2758 	uint32_t creatorid;
2759 	unsigned int i;
2760 
2761 	rw_enter_read(&pf_state_list.pfs_rwl);
2762 
2763 	/* get a view of the state list */
2764 	mtx_enter(&pf_state_list.pfs_mtx);
2765 	head = TAILQ_FIRST(&pf_state_list.pfs_list);
2766 	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
2767 	mtx_leave(&pf_state_list.pfs_mtx);
2768 
2769 	PF_LOCK();
2770 	for (i = 0; i < count; i++) {
2771 		clr = (struct pfsync_clr *)(buf + i * mlen);
2772 
2773 		creatorid = clr->creatorid;
2774 		if (clr->ifname[0] == '\0')
2775 			kif = NULL;
2776 		else {
2777 			kif = pfi_kif_find(clr->ifname);
2778 			if (kif == NULL)
2779 				continue;
2780 		}
2781 
2782 		st = NULL;
2783 		next = head;
2784 
2785 		PF_STATE_ENTER_WRITE();
2786 		while (st != tail) {
2787 			st = next;
2788 			next = TAILQ_NEXT(st, entry_list);
2789 
2790 			if (creatorid != st->creatorid)
2791 				continue;
2792 			if (kif != NULL && kif != st->kif)
2793 				continue;
2794 
2795 			mtx_enter(&st->mtx);
2796 			SET(st->state_flags, PFSTATE_NOSYNC);
2797 			mtx_leave(&st->mtx);
2798 			pf_remove_state(st);
2799 		}
2800 		PF_STATE_EXIT_WRITE();
2801 	}
2802 	PF_UNLOCK();
2803 
2804 	rw_exit_read(&pf_state_list.pfs_rwl);
2805 }
2806 
2807 static void
2808 pfsync_in_ins(struct pfsync_softc *sc,
2809     const caddr_t buf, unsigned int mlen, unsigned int count)
2810 {
2811 	const struct pfsync_state *sp;
2812 	sa_family_t af1, af2;
2813 	unsigned int i;
2814 
2815 	PF_LOCK();
2816 	for (i = 0; i < count; i++) {
2817 		sp = (struct pfsync_state *)(buf + mlen * i);
2818 		af1 = sp->key[0].af;
2819 		af2 = sp->key[1].af;
2820 
2821 		/* check for invalid values */
2822 		if (sp->timeout >= PFTM_MAX ||
2823 		    sp->src.state > PF_TCPS_PROXY_DST ||
2824 		    sp->dst.state > PF_TCPS_PROXY_DST ||
2825 		    sp->direction > PF_OUT ||
2826 		    (((af1 || af2) &&
2827 		     ((af1 != AF_INET && af1 != AF_INET6) ||
2828 		      (af2 != AF_INET && af2 != AF_INET6))) ||
2829 		     (sp->af != AF_INET && sp->af != AF_INET6))) {
2830 			pfsyncstat_inc(pfsyncs_badval);
2831 			continue;
2832 		}
2833 
2834 		if (pf_state_import(sp, PFSYNC_SI_PFSYNC) == ENOMEM) {
2835 			/* drop out, but process the rest of the actions */
2836 			break;
2837 		}
2838 	}
2839 	PF_UNLOCK();
2840 }
2841 
2842 static void
2843 pfsync_in_iack(struct pfsync_softc *sc,
2844     const caddr_t buf, unsigned int mlen, unsigned int count)
2845 {
2846 	const struct pfsync_ins_ack *ia;
2847 	struct pf_state_cmp id_key;
2848 	struct pf_state *st;
2849 	unsigned int i;
2850 
2851 	for (i = 0; i < count; i++) {
2852 		ia = (struct pfsync_ins_ack *)(buf + mlen * i);
2853 
2854 		id_key.id = ia->id;
2855 		id_key.creatorid = ia->creatorid;
2856 
2857 		PF_STATE_ENTER_READ();
2858 		st = pf_find_state_byid(&id_key);
2859 		pf_state_ref(st);
2860 		PF_STATE_EXIT_READ();
2861 		if (st == NULL)
2862 			continue;
2863 
2864 		if (READ_ONCE(st->sync_defer) != NULL)
2865 			pfsync_deferred(sc, st);
2866 
2867 		pf_state_unref(st);
2868 	}
2869 }
2870 
2871 static int
2872 pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src,
2873     const struct pfsync_state_peer *dst)
2874 {
2875 	int sync = 0;
2876 
2877 	/*
2878 	 * The state should never go backwards except
2879 	 * for syn-proxy states.  Neither should the
2880 	 * sequence window slide backwards.
2881 	 */
2882 	if ((st->src.state > src->state &&
2883 	    (st->src.state < PF_TCPS_PROXY_SRC ||
2884 	     src->state >= PF_TCPS_PROXY_SRC)) ||
2885 
2886 	    (st->src.state == src->state &&
2887 	     SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
2888 		sync++;
2889 	else
2890 		pf_state_peer_ntoh(src, &st->src);
2891 
2892 	if ((st->dst.state > dst->state) ||
2893 
2894 	    (st->dst.state >= TCPS_SYN_SENT &&
2895 	     SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
2896 		sync++;
2897 	else
2898 		pf_state_peer_ntoh(dst, &st->dst);
2899 
2900 	return (sync);
2901 }
2902 
2903 static void
2904 pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st,
2905     const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst,
2906     uint8_t timeout)
2907 {
2908 	struct pf_state_scrub *sscrub = NULL;
2909 	struct pf_state_scrub *dscrub = NULL;
2910 	int sync;
2911 
2912 	if (src->scrub.scrub_flag && st->src.scrub == NULL) {
2913 		sscrub = pf_state_scrub_get();
2914 		if (sscrub == NULL) {
2915 			/* inc error? */
2916 			goto out;
2917 		}
2918 	}
2919 	if (dst->scrub.scrub_flag && st->dst.scrub == NULL) {
2920 		dscrub = pf_state_scrub_get();
2921 		if (dscrub == NULL) {
2922 			/* inc error? */
2923 			goto out;
2924 		}
2925 	}
2926 
2927 	if (READ_ONCE(st->sync_defer) != NULL)
2928 		pfsync_deferred(sc, st);
2929 
2930 	mtx_enter(&st->mtx);
2931 
2932 	/* attach the scrub memory if needed */
2933 	if (sscrub != NULL && st->src.scrub == NULL) {
2934 		st->src.scrub = sscrub;
2935 		sscrub = NULL;
2936 	}
2937 	if (dscrub != NULL && st->dst.scrub == NULL) {
2938 		st->dst.scrub = dscrub;
2939 		dscrub = NULL;
2940 	}
2941 
2942 	if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
2943 		sync = pfsync_upd_tcp(st, src, dst);
2944 	else {
2945 		sync = 0;
2946 
2947 		/*
2948 		 * Non-TCP protocol state machine always go
2949 		 * forwards
2950 		 */
2951 		if (st->src.state > src->state)
2952 			sync++;
2953 		else
2954 			pf_state_peer_ntoh(src, &st->src);
2955 
2956 		if (st->dst.state > dst->state)
2957 			sync++;
2958 		else
2959 			pf_state_peer_ntoh(dst, &st->dst);
2960 	}
2961 
2962 	st->pfsync_time = getuptime();
2963 	if (sync < 2) {
2964 		st->expire = st->pfsync_time;
2965 		st->timeout = timeout;
2966 	}
2967 
2968 	mtx_leave(&st->mtx);
2969 
2970 	if (sync) {
2971 		pfsyncstat_inc(pfsyncs_stale);
2972 		pfsync_update_state(st);
2973 	}
2974 
2975 out:
2976 	if (sscrub != NULL)
2977 		pf_state_scrub_put(sscrub);
2978 	if (dscrub != NULL)
2979 		pf_state_scrub_put(dscrub);
2980 }
2981 
2982 
2983 static void
2984 pfsync_in_upd(struct pfsync_softc *sc,
2985     const caddr_t buf, unsigned int mlen, unsigned int count)
2986 {
2987 	const struct pfsync_state *sp;
2988 	struct pf_state_cmp id_key;
2989 	struct pf_state *st;
2990 	int error;
2991 	unsigned int i;
2992 
2993 	for (i = 0; i < count; i++) {
2994 		sp = (struct pfsync_state *)(buf + mlen * i);
2995 
2996 		/* check for invalid values */
2997 		if (sp->timeout >= PFTM_MAX ||
2998 		    sp->src.state > PF_TCPS_PROXY_DST ||
2999 		    sp->dst.state > PF_TCPS_PROXY_DST) {
3000 			pfsyncstat_inc(pfsyncs_badval);
3001 			continue;
3002 		}
3003 
3004 		id_key.id = sp->id;
3005 		id_key.creatorid = sp->creatorid;
3006 
3007 		PF_STATE_ENTER_READ();
3008 		st = pf_find_state_byid(&id_key);
3009 		pf_state_ref(st);
3010 		PF_STATE_EXIT_READ();
3011 		if (st == NULL) {
3012 			/* insert the update */
3013 			PF_LOCK();
3014 			error = pf_state_import(sp, PFSYNC_SI_PFSYNC);
3015 			if (error)
3016 				pfsyncstat_inc(pfsyncs_badstate);
3017 			PF_UNLOCK();
3018 			continue;
3019 		}
3020 
3021 		pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout);
3022 
3023 		pf_state_unref(st);
3024 	}
3025 }
3026 
3027 static struct mbuf *
3028 pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count)
3029 {
3030 	struct mbuf *m;
3031 	unsigned int mlen;
3032 
3033 	m = m_gethdr(M_DONTWAIT, MT_DATA);
3034 	if (m == NULL) {
3035 		pfsyncstat_inc(pfsyncs_onomem);
3036 		return (NULL);
3037 	}
3038 
3039 	mlen = max_linkhdr + sizeof(sc->sc_template) +
3040 	    sizeof(struct pfsync_header) +
3041 	    sizeof(struct pfsync_subheader) +
3042 	    sizeof(struct pfsync_upd_req) * count;
3043 
3044 	if (mlen > MHLEN) {
3045 		MCLGETL(m, M_DONTWAIT, mlen);
3046 		if (!ISSET(m->m_flags, M_EXT)) {
3047 			m_freem(m);
3048 			return (NULL);
3049 		}
3050 	}
3051 
3052 	m_align(m, 0);
3053 	m->m_len = 0;
3054 
3055 	return (m);
3056 }
3057 
3058 static void
3059 pfsync_in_upd_c(struct pfsync_softc *sc,
3060     const caddr_t buf, unsigned int mlen, unsigned int count)
3061 {
3062 	const struct pfsync_upd_c *up;
3063 	struct pf_state_cmp id_key;
3064 	struct pf_state *st;
3065 	unsigned int i;
3066 	struct mbuf *m = NULL;
3067 	unsigned int rcount = 0;
3068 
3069 	for (i = 0; i < count; i++) {
3070 		up = (struct pfsync_upd_c *)(buf + mlen * i);
3071 
3072 		/* check for invalid values */
3073 		if (up->timeout >= PFTM_MAX ||
3074 		    up->src.state > PF_TCPS_PROXY_DST ||
3075 		    up->dst.state > PF_TCPS_PROXY_DST) {
3076 			pfsyncstat_inc(pfsyncs_badval);
3077 			continue;
3078 		}
3079 
3080 		id_key.id = up->id;
3081 		id_key.creatorid = up->creatorid;
3082 
3083 		PF_STATE_ENTER_READ();
3084 		st = pf_find_state_byid(&id_key);
3085 		pf_state_ref(st);
3086 		PF_STATE_EXIT_READ();
3087 		if (st == NULL) {
3088 			/* We don't have this state. Ask for it. */
3089 			struct pfsync_upd_req *ur;
3090 
3091 			if (m == NULL) {
3092 				m = pfsync_upd_req_init(sc, count);
3093 				if (m == NULL) {
3094 					pfsyncstat_inc(pfsyncs_onomem);
3095 					continue;
3096 				}
3097 			}
3098 
3099 			m = m_prepend(m, sizeof(*ur), M_DONTWAIT);
3100 			if (m == NULL) {
3101 				pfsyncstat_inc(pfsyncs_onomem);
3102 				continue;
3103 			}
3104 
3105 			ur = mtod(m, struct pfsync_upd_req *);
3106 			ur->id = up->id;
3107 			ur->creatorid = up->creatorid;
3108 			rcount++;
3109 
3110 			continue;
3111 		}
3112 
3113 		pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout);
3114 
3115 		pf_state_unref(st);
3116 	}
3117 
3118 	if (m != NULL) {
3119 		struct pfsync_subheader *subh;
3120 
3121 		m = m_prepend(m, sizeof(*subh), M_DONTWAIT);
3122 		if (m == NULL) {
3123 			pfsyncstat_inc(pfsyncs_onomem);
3124 			return;
3125 		}
3126 
3127 		subh = mtod(m, struct pfsync_subheader *);
3128 		subh->action = PFSYNC_ACT_UPD_REQ;
3129 		subh->len = sizeof(struct pfsync_upd_req) >> 2;
3130 		subh->count = htons(rcount);
3131 
3132 		m = pfsync_encap(sc, m);
3133 		if (m == NULL) {
3134 			pfsyncstat_inc(pfsyncs_onomem);
3135 			return;
3136 		}
3137 
3138 		pfsync_sendout(sc, m);
3139 	}
3140 }
3141 
3142 static void
3143 pfsync_in_ureq(struct pfsync_softc *sc,
3144     const caddr_t buf, unsigned int mlen, unsigned int count)
3145 {
3146 	const struct pfsync_upd_req *ur;
3147 	struct pf_state_cmp id_key;
3148 	struct pf_state *st;
3149 	unsigned int i;
3150 
3151 	for (i = 0; i < count; i++) {
3152 		ur = (struct pfsync_upd_req *)(buf + mlen * i);
3153 
3154 		id_key.id = ur->id;
3155 		id_key.creatorid = ur->creatorid;
3156 
3157 		if (id_key.id == 0 && id_key.creatorid == 0) {
3158 			pfsync_bulk_snd_start(sc);
3159 			continue;
3160 		}
3161 
3162 		PF_STATE_ENTER_READ();
3163 		st = pf_find_state_byid(&id_key);
3164 		if (st != NULL && st->timeout < PFTM_MAX &&
3165 		    !ISSET(st->state_flags, PFSTATE_NOSYNC))
3166 			pf_state_ref(st);
3167 		else
3168 			st = NULL;
3169 		PF_STATE_EXIT_READ();
3170 		if (st == NULL) {
3171 			pfsyncstat_inc(pfsyncs_badstate);
3172 			continue;
3173 		}
3174 
3175 		pfsync_update_state_req(sc, st);
3176 
3177 		pf_state_unref(st);
3178 	}
3179 }
3180 
3181 static void
3182 pfsync_in_del(struct pfsync_softc *sc,
3183     const caddr_t buf, unsigned int mlen, unsigned int count)
3184 {
3185 	const struct pfsync_state *sp;
3186 	struct pf_state_cmp id_key;
3187 	struct pf_state *st;
3188 	unsigned int i;
3189 
3190 	PF_LOCK();
3191 	PF_STATE_ENTER_WRITE();
3192 	for (i = 0; i < count; i++) {
3193 		sp = (struct pfsync_state *)(buf + mlen * i);
3194 
3195 		id_key.id = sp->id;
3196 		id_key.creatorid = sp->creatorid;
3197 
3198 		st = pf_find_state_byid(&id_key);
3199 		if (st == NULL) {
3200 			pfsyncstat_inc(pfsyncs_badstate);
3201 			continue;
3202 		}
3203 
3204 		mtx_enter(&st->mtx);
3205 		SET(st->state_flags, PFSTATE_NOSYNC);
3206 		mtx_leave(&st->mtx);
3207 		pf_remove_state(st);
3208 	}
3209 	PF_STATE_EXIT_WRITE();
3210 	PF_UNLOCK();
3211 }
3212 
3213 static void
3214 pfsync_in_del_c(struct pfsync_softc *sc,
3215     const caddr_t buf, unsigned int mlen, unsigned int count)
3216 {
3217 	const struct pfsync_del_c *sp;
3218 	struct pf_state_cmp id_key;
3219 	struct pf_state *st;
3220 	unsigned int i;
3221 
3222 	PF_LOCK();
3223 	PF_STATE_ENTER_WRITE();
3224 	for (i = 0; i < count; i++) {
3225 		sp = (struct pfsync_del_c *)(buf + mlen * i);
3226 
3227 		id_key.id = sp->id;
3228 		id_key.creatorid = sp->creatorid;
3229 
3230 		st = pf_find_state_byid(&id_key);
3231 		if (st == NULL) {
3232 			pfsyncstat_inc(pfsyncs_badstate);
3233 			continue;
3234 		}
3235 
3236 		mtx_enter(&st->mtx);
3237 		SET(st->state_flags, PFSTATE_NOSYNC);
3238 		mtx_leave(&st->mtx);
3239 		pf_remove_state(st);
3240 	}
3241 	PF_STATE_EXIT_WRITE();
3242 	PF_UNLOCK();
3243 }
3244 
3245 static void
3246 pfsync_in_bus(struct pfsync_softc *sc,
3247     const caddr_t buf, unsigned int len, unsigned int count)
3248 {
3249 	const struct pfsync_bus *bus = (struct pfsync_bus *)buf;
3250 
3251 	switch (bus->status) {
3252 	case PFSYNC_BUS_START:
3253 		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START);
3254 		break;
3255 
3256 	case PFSYNC_BUS_END:
3257 		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END);
3258 		break;
3259 	}
3260 }
3261 
3262 #if defined(IPSEC)
3263 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
3264 static void
3265 pfsync_update_net_tdb(const struct pfsync_tdb *pt)
3266 {
3267 	struct tdb *tdb;
3268 
3269 	NET_ASSERT_LOCKED();
3270 
3271 	/* check for invalid values */
3272 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
3273 	    (pt->dst.sa.sa_family != AF_INET &&
3274 	     pt->dst.sa.sa_family != AF_INET6))
3275 		goto bad;
3276 
3277 	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
3278 	    (union sockaddr_union *)&pt->dst, pt->sproto);
3279 	if (tdb) {
3280 		uint64_t rpl = betoh64(pt->rpl);
3281 		uint64_t cur_bytes = betoh64(pt->cur_bytes);
3282 
3283 		/* Neither replay nor byte counter should ever decrease. */
3284 		mtx_enter(&tdb->tdb_mtx);
3285 		if (rpl >= tdb->tdb_rpl &&
3286 		    cur_bytes >= tdb->tdb_cur_bytes) {
3287 			tdb->tdb_rpl = rpl;
3288 			tdb->tdb_cur_bytes = cur_bytes;
3289 		}
3290 		mtx_leave(&tdb->tdb_mtx);
3291 
3292 		tdb_unref(tdb);
3293 	}
3294 	return;
3295 
3296  bad:
3297 	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
3298 	    "invalid value");
3299 	pfsyncstat_inc(pfsyncs_badstate);
3300 	return;
3301 }
3302 #endif
3303 
3304 static void
3305 pfsync_in_tdb(struct pfsync_softc *sc,
3306     const caddr_t buf, unsigned int len, unsigned int count)
3307 {
3308 #if defined(IPSEC)
3309 	const struct pfsync_tdb *tp;
3310 	unsigned int i;
3311 
3312 	for (i = 0; i < count; i++) {
3313 		tp = (const struct pfsync_tdb *)(buf + len * i);
3314 		pfsync_update_net_tdb(tp);
3315 	}
3316 #endif
3317 }
3318 
3319 int
3320 pfsync_input4(struct mbuf **mp, int *offp, int proto, int af)
3321 {
3322 	struct mbuf *m = *mp;
3323 	struct ip *ip;
3324 
3325 	ip = mtod(m, struct ip *);
3326 
3327 	m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2);
3328 
3329 	m_freem(m);
3330 	*mp = NULL;
3331 
3332 	return (IPPROTO_DONE);
3333 }
3334 
3335 int
3336 pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
3337 {
3338 	struct pfsyncstats pfsyncstat;
3339 
3340 	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
3341 	memset(&pfsyncstat, 0, sizeof pfsyncstat);
3342 	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
3343 	    pfsyncs_ncounters, NULL);
3344 	return (sysctl_rdstruct(oldp, oldlenp, newp,
3345 	    &pfsyncstat, sizeof(pfsyncstat)));
3346 }
3347 
3348 int
3349 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
3350     void *newp, size_t newlen)
3351 {
3352 	/* All sysctl names at this level are terminal. */
3353 	if (namelen != 1)
3354 		return (ENOTDIR);
3355 
3356 	switch (name[0]) {
3357 	case PFSYNCCTL_STATS:
3358 		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
3359 	default:
3360 		return (ENOPROTOOPT);
3361 	}
3362 }
3363