xref: /openbsd/sys/net/if_pfsync.c (revision 6064c65c)
1 /*	$OpenBSD: if_pfsync.c,v 1.326 2024/05/24 06:38:41 sashan Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 2009, 2022, 2023 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44 
45 #include "bpfilter.h"
46 #include "pfsync.h"
47 #include "kstat.h"
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/time.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/socket.h>
55 #include <sys/ioctl.h>
56 #include <sys/timeout.h>
57 #include <sys/kernel.h>
58 #include <sys/sysctl.h>
59 #include <sys/pool.h>
60 #include <sys/syslog.h>
61 #include <sys/tree.h>
62 #include <sys/smr.h>
63 #include <sys/percpu.h>
64 #include <sys/refcnt.h>
65 #include <sys/kstat.h>
66 #include <sys/stdarg.h>
67 
68 #include <net/if.h>
69 #include <net/if_types.h>
70 #include <net/bpf.h>
71 #include <net/netisr.h>
72 #include <net/route.h>
73 
74 #include <netinet/in.h>
75 #include <netinet/if_ether.h>
76 #include <netinet/ip.h>
77 #include <netinet/in_var.h>
78 #include <netinet/ip_var.h>
79 #include <netinet/ip_ipsp.h>
80 #include <netinet/ip_icmp.h>
81 #include <netinet/icmp6.h>
82 #include <netinet/tcp.h>
83 #include <netinet/tcp_seq.h>
84 #include <netinet/tcp_fsm.h>
85 #include <netinet/udp.h>
86 
87 #ifdef INET6
88 #include <netinet6/in6_var.h>
89 #include <netinet/ip6.h>
90 #include <netinet6/ip6_var.h>
91 #include <netinet6/nd6.h>
92 #endif /* INET6 */
93 
94 #include "carp.h"
95 #if NCARP > 0
96 #include <netinet/ip_carp.h>
97 #endif
98 
99 #include <net/pfvar.h>
100 #include <net/pfvar_priv.h>
101 #include <net/if_pfsync.h>
102 
103 #define PFSYNC_MINPKT ( \
104 	sizeof(struct ip) + \
105 	sizeof(struct pfsync_header))
106 
107 struct pfsync_softc;
108 
109 struct pfsync_deferral {
110 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
111 	struct pf_state				*pd_st;
112 	struct mbuf				*pd_m;
113 	uint64_t				 pd_deadline;
114 };
115 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
116 
117 #define PFSYNC_DEFER_NSEC	20000000ULL
118 #define PFSYNC_DEFER_LIMIT	128
119 #define PFSYNC_BULK_SND_IVAL_MS	20
120 
121 static struct pool pfsync_deferrals_pool;
122 
123 enum pfsync_bulk_req_state {
124 	PFSYNC_BREQ_S_NONE,
125 	PFSYNC_BREQ_S_START,
126 	PFSYNC_BREQ_S_SENT,
127 	PFSYNC_BREQ_S_BULK,
128 	PFSYNC_BREQ_S_DONE,
129 };
130 
131 static const char *pfsync_bulk_req_state_names[] = {
132 	[PFSYNC_BREQ_S_NONE]		= "none",
133 	[PFSYNC_BREQ_S_START]		= "start",
134 	[PFSYNC_BREQ_S_SENT]		= "sent",
135 	[PFSYNC_BREQ_S_BULK]		= "bulk",
136 	[PFSYNC_BREQ_S_DONE]		= "done",
137 };
138 
139 enum pfsync_bulk_req_event {
140 	PFSYNC_BREQ_EVT_UP,
141 	PFSYNC_BREQ_EVT_DOWN,
142 	PFSYNC_BREQ_EVT_TMO,
143 	PFSYNC_BREQ_EVT_LINK,
144 	PFSYNC_BREQ_EVT_BUS_START,
145 	PFSYNC_BREQ_EVT_BUS_END,
146 };
147 
148 static const char *pfsync_bulk_req_event_names[] = {
149 	[PFSYNC_BREQ_EVT_UP]		= "up",
150 	[PFSYNC_BREQ_EVT_DOWN]		= "down",
151 	[PFSYNC_BREQ_EVT_TMO]		= "timeout",
152 	[PFSYNC_BREQ_EVT_LINK]		= "link",
153 	[PFSYNC_BREQ_EVT_BUS_START]	= "bus-start",
154 	[PFSYNC_BREQ_EVT_BUS_END]	= "bus-end",
155 };
156 
157 struct pfsync_slice {
158 	struct pfsync_softc	*s_pfsync;
159 	struct mutex		 s_mtx;
160 
161 	struct pf_state_queue	 s_qs[PFSYNC_S_COUNT];
162 	TAILQ_HEAD(, tdb)	 s_tdb_q;
163 	size_t			 s_len;
164 	struct mbuf_list	 s_ml;
165 
166 	struct taskq		*s_softnet;
167 	struct task		 s_task;
168 	struct timeout		 s_tmo;
169 
170 	struct mbuf_queue	 s_sendq;
171 	struct task		 s_send;
172 
173 	struct pfsync_deferrals	 s_deferrals;
174 	unsigned int		 s_deferred;
175 	struct task		 s_deferrals_task;
176 	struct timeout		 s_deferrals_tmo;
177 
178 	uint64_t		 s_stat_locks;
179 	uint64_t		 s_stat_contended;
180 	uint64_t		 s_stat_write_nop;
181 	uint64_t		 s_stat_task_add;
182 	uint64_t		 s_stat_task_run;
183 	uint64_t		 s_stat_enqueue;
184 	uint64_t		 s_stat_dequeue;
185 
186 	uint64_t		 s_stat_defer_add;
187 	uint64_t		 s_stat_defer_ack;
188 	uint64_t		 s_stat_defer_run;
189 	uint64_t		 s_stat_defer_overlimit;
190 
191 	struct kstat		*s_kstat;
192 } __aligned(CACHELINESIZE);
193 
194 #define PFSYNC_SLICE_BITS	 1
195 #define PFSYNC_NSLICES		 (1 << PFSYNC_SLICE_BITS)
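
/*
 * Editor's note (not in the upstream source): pf state updates are
 * spread over PFSYNC_NSLICES (1 << PFSYNC_SLICE_BITS, i.e. 2) slices.
 * pfsync_slice_enter() picks a slice with st->key[0]->hash %
 * PFSYNC_NSLICES, so all updates for a given state serialise on that
 * slice's mutex while states hashed to different slices can be queued
 * concurrently. The __aligned(CACHELINESIZE) on the struct keeps each
 * slice's lock and counters on its own cache lines.
 */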
196 
197 struct pfsync_softc {
198 	struct ifnet		 sc_if;
199 	unsigned int		 sc_dead;
200 	unsigned int		 sc_up;
201 	struct refcnt		 sc_refs;
202 
203 	/* config */
204 	struct in_addr		 sc_syncpeer;
205 	unsigned int		 sc_maxupdates;
206 	unsigned int		 sc_defer;
207 
208 	/* operation */
209 	unsigned int		 sc_sync_ifidx;
210 	unsigned int		 sc_sync_if_down;
211 	void			*sc_inm;
212 	struct task		 sc_ltask;
213 	struct task		 sc_dtask;
214 	struct ip		 sc_template;
215 
216 	struct pfsync_slice	 sc_slices[PFSYNC_NSLICES];
217 
218 	struct {
219 		struct rwlock			 req_lock;
220 		struct timeout			 req_tmo;
221 		enum pfsync_bulk_req_state	 req_state;
222 		unsigned int			 req_tries;
223 		unsigned int			 req_demoted;
224 	}			 sc_bulk_req;
225 
226 	struct {
227 		struct rwlock			 snd_lock;
228 		struct timeout			 snd_tmo;
229 		time_t				 snd_requested;
230 
231 		struct pf_state			*snd_next;
232 		struct pf_state			*snd_tail;
233 		unsigned int			 snd_again;
234 	}			 sc_bulk_snd;
235 };
236 
237 static struct pfsync_softc	*pfsyncif = NULL;
238 static struct cpumem		*pfsynccounters;
239 
240 static inline void
241 pfsyncstat_inc(enum pfsync_counters c)
242 {
243 	counters_inc(pfsynccounters, c);
244 }
245 
246 static int	pfsync_clone_create(struct if_clone *, int);
247 static int	pfsync_clone_destroy(struct ifnet *);
248 
249 static int	pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *,
250 		    struct rtentry *);
251 static void	pfsync_start(struct ifqueue *);
252 
253 static int	pfsync_ioctl(struct ifnet *, u_long, caddr_t);
254 static int	pfsync_up(struct pfsync_softc *);
255 static int	pfsync_down(struct pfsync_softc *);
256 
257 static int	pfsync_set_mtu(struct pfsync_softc *, unsigned int);
258 static int	pfsync_set_parent(struct pfsync_softc *,
259 		    const struct if_parent *);
260 static int	pfsync_get_parent(struct pfsync_softc *, struct if_parent *);
261 static int	pfsync_del_parent(struct pfsync_softc *);
262 
263 static int	pfsync_get_ioc(struct pfsync_softc *, struct ifreq *);
264 static int	pfsync_set_ioc(struct pfsync_softc *, struct ifreq *);
265 
266 static void	pfsync_syncif_link(void *);
267 static void	pfsync_syncif_detach(void *);
268 
269 static void	pfsync_sendout(struct pfsync_softc *, struct mbuf *);
270 static void	pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *);
271 
272 static void	pfsync_slice_tmo(void *);
273 static void	pfsync_slice_task(void *);
274 static void	pfsync_slice_sendq(void *);
275 
276 static void	pfsync_deferrals_tmo(void *);
277 static void	pfsync_deferrals_task(void *);
278 static void	pfsync_defer_output(struct pfsync_deferral *);
279 
280 static void	pfsync_bulk_req_evt(struct pfsync_softc *,
281 		    enum pfsync_bulk_req_event);
282 static void	pfsync_bulk_req_tmo(void *);
283 
284 static void	pfsync_bulk_snd_tmo(void *);
285 
286 #if NKSTAT > 0
287 struct pfsync_kstat_data {
288 	struct kstat_kv pd_locks;
289 	struct kstat_kv pd_contended;
290 	struct kstat_kv pd_write_nop;
291 	struct kstat_kv pd_task_add;
292 	struct kstat_kv pd_task_run;
293 	struct kstat_kv pd_enqueue;
294 	struct kstat_kv pd_dequeue;
295 	struct kstat_kv pd_qdrop;
296 
297 	struct kstat_kv pd_defer_len;
298 	struct kstat_kv pd_defer_add;
299 	struct kstat_kv pd_defer_ack;
300 	struct kstat_kv pd_defer_run;
301 	struct kstat_kv pd_defer_overlimit;
302 };
303 
304 static const struct pfsync_kstat_data pfsync_kstat_tpl = {
305 	KSTAT_KV_INITIALIZER("locks",		KSTAT_KV_T_COUNTER64),
306 	KSTAT_KV_INITIALIZER("contended",	KSTAT_KV_T_COUNTER64),
307 	KSTAT_KV_INITIALIZER("write-nops",	KSTAT_KV_T_COUNTER64),
308 	KSTAT_KV_INITIALIZER("send-sched",	KSTAT_KV_T_COUNTER64),
309 	KSTAT_KV_INITIALIZER("send-run",	KSTAT_KV_T_COUNTER64),
310 	KSTAT_KV_INITIALIZER("enqueues",	KSTAT_KV_T_COUNTER64),
311 	KSTAT_KV_INITIALIZER("dequeues",	KSTAT_KV_T_COUNTER64),
312 	KSTAT_KV_UNIT_INITIALIZER("qdrops",
313 	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
314 
315 	KSTAT_KV_UNIT_INITIALIZER("defer-len",
316 	    KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS),
317 	KSTAT_KV_INITIALIZER("defer-add",	KSTAT_KV_T_COUNTER64),
318 	KSTAT_KV_INITIALIZER("defer-ack",	KSTAT_KV_T_COUNTER64),
319 	KSTAT_KV_INITIALIZER("defer-run",	KSTAT_KV_T_COUNTER64),
320 	KSTAT_KV_INITIALIZER("defer-over",	KSTAT_KV_T_COUNTER64),
321 };
322 
323 static int
324 pfsync_kstat_copy(struct kstat *ks, void *dst)
325 {
326 	struct pfsync_slice *s = ks->ks_softc;
327 	struct pfsync_kstat_data *pd = dst;
328 
329 	*pd = pfsync_kstat_tpl;
330 	kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks;
331 	kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended;
332 	kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop;
333 	kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add;
334 	kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run;
335 	kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue;
336 	kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue;
337 	kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq);
338 
339 	kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred;
340 	kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add;
341 	kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack;
342 	kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run;
343 	kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit;
344 
345 	return (0);
346 }
347 #endif /* NKSTAT > 0 */
348 
349 #define PFSYNC_MAX_BULKTRIES	12
350 
351 struct if_clone	pfsync_cloner =
352     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
353 
354 void
355 pfsyncattach(int npfsync)
356 {
357 	pfsynccounters = counters_alloc(pfsyncs_ncounters);
358 	if_clone_attach(&pfsync_cloner);
359 }
360 
361 static int
362 pfsync_clone_create(struct if_clone *ifc, int unit)
363 {
364 	struct pfsync_softc *sc;
365 	struct ifnet *ifp;
366 	size_t i, q;
367 
368 	if (unit != 0)
369 		return (ENXIO);
370 
371 	if (pfsync_deferrals_pool.pr_size == 0) {
372 		pool_init(&pfsync_deferrals_pool,
373 		    sizeof(struct pfsync_deferral), 0,
374 		    IPL_MPFLOOR, 0, "pfdefer", NULL);
375 		/* pool_cache_init(&pfsync_deferrals_pool); */
376 	}
377 
378 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
379 	if (sc == NULL)
380 		return (ENOMEM);
381 
382 	/* sc_refs is "owned" by IFF_RUNNING */
383 
384 	sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
385 	sc->sc_maxupdates = 128;
386 	sc->sc_defer = 0;
387 
388 	task_set(&sc->sc_ltask, pfsync_syncif_link, sc);
389 	task_set(&sc->sc_dtask, pfsync_syncif_detach, sc);
390 
391 	rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq");
392 	/* need process context to take net lock to call ip_output */
393 	timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc);
394 
395 	rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd");
396 	/* need process context to take net lock to call ip_output */
397 	timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc);
398 
399 	ifp = &sc->sc_if;
400 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d",
401 	    ifc->ifc_name, unit);
402 	ifp->if_softc = sc;
403 	ifp->if_ioctl = pfsync_ioctl;
404 	ifp->if_output = pfsync_output;
405 	ifp->if_qstart = pfsync_start;
406 	ifp->if_type = IFT_PFSYNC;
407 	ifp->if_hdrlen = sizeof(struct pfsync_header);
408 	ifp->if_mtu = ETHERMTU;
409 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
410 
411 	for (i = 0; i < nitems(sc->sc_slices); i++) {
412 		struct pfsync_slice *s = &sc->sc_slices[i];
413 
414 		s->s_pfsync = sc;
415 
416 		mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0);
417 		s->s_softnet = net_tq(i);
418 		timeout_set(&s->s_tmo, pfsync_slice_tmo, s);
419 		task_set(&s->s_task, pfsync_slice_task, s);
420 
421 		mq_init(&s->s_sendq, 16, IPL_SOFTNET);
422 		task_set(&s->s_send, pfsync_slice_sendq, s);
423 
424 		s->s_len = PFSYNC_MINPKT;
425 		ml_init(&s->s_ml);
426 
427 		for (q = 0; q < nitems(s->s_qs); q++)
428 			TAILQ_INIT(&s->s_qs[q]);
429 		TAILQ_INIT(&s->s_tdb_q);
430 
431 		/* stupid NET_LOCK */
432 		timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s);
433 		task_set(&s->s_deferrals_task, pfsync_deferrals_task, s);
434 		TAILQ_INIT(&s->s_deferrals);
435 
436 #if NKSTAT > 0
437 		s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i,
438 		    KSTAT_T_KV, 0);
439 
440 		kstat_set_mutex(s->s_kstat, &s->s_mtx);
441 		s->s_kstat->ks_softc = s;
442 		s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl);
443 		s->s_kstat->ks_copy = pfsync_kstat_copy;
444 		kstat_install(s->s_kstat);
445 #endif
446 	}
447 
448 	if_counters_alloc(ifp);
449 	if_attach(ifp);
450 	if_alloc_sadl(ifp);
451 
452 #if NCARP > 0
453 	if_addgroup(ifp, "carp");
454 #endif
455 
456 #if NBPFILTER > 0
457 	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
458 #endif
459 
460 	return (0);
461 }
462 
463 static int
464 pfsync_clone_destroy(struct ifnet *ifp)
465 {
466 	struct pfsync_softc *sc = ifp->if_softc;
467 #if NKSTAT > 0
468 	size_t i;
469 #endif
470 
471 	NET_LOCK();
472 	sc->sc_dead = 1;
473 
474 	if (ISSET(ifp->if_flags, IFF_RUNNING))
475 		pfsync_down(sc);
476 	NET_UNLOCK();
477 
478 	if_detach(ifp);
479 
480 #if NKSTAT > 0
481 	for (i = 0; i < nitems(sc->sc_slices); i++) {
482 		struct pfsync_slice *s = &sc->sc_slices[i];
483 
484 		kstat_destroy(s->s_kstat);
485 	}
486 #endif
487 
488 	free(sc, M_DEVBUF, sizeof(*sc));
489 
490 	return (0);
491 }
492 
493 static void
494 pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...)
495 {
496 	struct ifnet *ifp = &sc->sc_if;
497 	va_list ap;
498 
499 	if (!ISSET(ifp->if_flags, IFF_DEBUG))
500 		return;
501 
502 	printf("%s: ", ifp->if_xname);
503 	va_start(ap, fmt);
504 	vprintf(fmt, ap);
505 	va_end(ap);
506 	printf("\n");
507 }
508 
509 static void
510 pfsync_syncif_link(void *arg)
511 {
512 	struct pfsync_softc *sc = arg;
513 	struct ifnet *ifp0;
514 	unsigned int sync_if_down = 1;
515 
516 	ifp0 = if_get(sc->sc_sync_ifidx);
517 	if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) {
518 		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK);
519 		sync_if_down = 0;
520 	}
521 	if_put(ifp0);
522 
523 #if NCARP > 0
524 	if (sc->sc_sync_if_down != sync_if_down) {
525 		carp_group_demote_adj(&sc->sc_if,
526 		    sync_if_down ? 1 : -1, "pfsync link");
527 	}
528 #endif
529 
530 	sc->sc_sync_if_down = sync_if_down;
531 }
532 
533 static void
534 pfsync_syncif_detach(void *arg)
535 {
536 	struct pfsync_softc *sc = arg;
537 	struct ifnet *ifp = &sc->sc_if;
538 
539 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
540 		pfsync_down(sc);
541 		if_down(ifp);
542 	}
543 
544 	sc->sc_sync_ifidx = 0;
545 }
546 
547 static int
548 pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
549     struct rtentry *rt)
550 {
551 	m_freem(m);	/* drop packet */
552 	return (EAFNOSUPPORT);
553 }
554 
555 static int
556 pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
557 {
558 	struct pfsync_softc *sc = ifp->if_softc;
559 	struct ifreq *ifr = (struct ifreq *)data;
560 	int error = ENOTTY;
561 
562 	switch (cmd) {
563 	case SIOCSIFADDR:
564 		error = EOPNOTSUPP;
565 		break;
566 
567 	case SIOCSIFFLAGS:
568 		if (ISSET(ifp->if_flags, IFF_UP)) {
569 			if (!ISSET(ifp->if_flags, IFF_RUNNING))
570 				error = pfsync_up(sc);
571 			else
572 				error = ENETRESET;
573 		} else {
574 			if (ISSET(ifp->if_flags, IFF_RUNNING))
575 				error = pfsync_down(sc);
576 		}
577 		break;
578 
579 	case SIOCSIFMTU:
580 		error = pfsync_set_mtu(sc, ifr->ifr_mtu);
581 		break;
582 
583 	case SIOCSIFPARENT:
584 		error = pfsync_set_parent(sc, (struct if_parent *)data);
585 		break;
586 	case SIOCGIFPARENT:
587 		error = pfsync_get_parent(sc, (struct if_parent *)data);
588 		break;
589 	case SIOCDIFPARENT:
590 		error = pfsync_del_parent(sc);
591 		break;
592 
593 	case SIOCSETPFSYNC:
594 		error = pfsync_set_ioc(sc, ifr);
595 		break;
596 	case SIOCGETPFSYNC:
597 		error = pfsync_get_ioc(sc, ifr);
598 		break;
599 
600 	default:
601 		break;
602 	}
603 
604 	if (error == ENETRESET)
605 		error = 0;
606 
607 	return (error);
608 }
609 
610 static int
611 pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu)
612 {
613 	struct ifnet *ifp = &sc->sc_if;
614 	struct ifnet *ifp0;
615 	int error = 0;
616 
617 	ifp0 = if_get(sc->sc_sync_ifidx);
618 	if (ifp0 == NULL)
619 		return (EINVAL);
620 
621 	if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) {
622 		error = EINVAL;
623 		goto put;
624 	}
625 
626 	/* commit */
627 	ifp->if_mtu = mtu;
628 
629 put:
630 	if_put(ifp0);
631 	return (error);
632 }
633 
634 static int
635 pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p)
636 {
637 	struct ifnet *ifp = &sc->sc_if;
638 	struct ifnet *ifp0;
639 	int error = 0;
640 
641 	ifp0 = if_unit(p->ifp_parent);
642 	if (ifp0 == NULL)
643 		return (ENXIO);
644 
645 	if (ifp0->if_index == sc->sc_sync_ifidx)
646 		goto put;
647 
648 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
649 		error = EBUSY;
650 		goto put;
651 	}
652 
653 	/* commit */
654 	sc->sc_sync_ifidx = ifp0->if_index;
655 
656 put:
657 	if_put(ifp0);
658 	return (error);
659 }
660 
661 static int
662 pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p)
663 {
664 	struct ifnet *ifp0;
665 	int error = 0;
666 
667 	ifp0 = if_get(sc->sc_sync_ifidx);
668 	if (ifp0 == NULL)
669 		error = EADDRNOTAVAIL;
670 	else
671 		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
672 	if_put(ifp0);
673 
674 	return (error);
675 }
676 
677 static int
678 pfsync_del_parent(struct pfsync_softc *sc)
679 {
680 	struct ifnet *ifp = &sc->sc_if;
681 
682 	if (ISSET(ifp->if_flags, IFF_RUNNING))
683 		return (EBUSY);
684 
685 	/* commit */
686 	sc->sc_sync_ifidx = 0;
687 
688 	return (0);
689 }
690 
691 static int
692 pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
693 {
694 	struct pfsyncreq pfsyncr;
695 	struct ifnet *ifp0;
696 
697 	memset(&pfsyncr, 0, sizeof(pfsyncr));
698 
699 	ifp0 = if_get(sc->sc_sync_ifidx);
700 	if (ifp0 != NULL) {
701 		strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname,
702 		    sizeof(pfsyncr.pfsyncr_syncdev));
703 	}
704 	if_put(ifp0);
705 
706 	pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer;
707 	pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
708 	pfsyncr.pfsyncr_defer = sc->sc_defer;
709 
710 	return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
711 }
712 
713 static int
714 pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr)
715 {
716 	struct ifnet *ifp = &sc->sc_if;
717 	struct pfsyncreq pfsyncr;
718 	unsigned int sync_ifidx = sc->sc_sync_ifidx;
719 	int wantdown = 0;
720 	int error;
721 
722 	error = suser(curproc);
723 	if (error != 0)
724 		return (error);
725 
726 	error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr));
727 	if (error != 0)
728 		return (error);
729 
730 	if (pfsyncr.pfsyncr_maxupdates > 255)
731 		return (EINVAL);
732 
733 	if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */
734 		struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev);
735 		if (ifp0 == NULL)
736 			return (ENXIO);
737 
738 		if (ifp0->if_index != sync_ifidx)
739 			wantdown = 1;
740 
741 		sync_ifidx = ifp0->if_index;
742 		if_put(ifp0);
743 	} else { /* del */
744 		wantdown = 1;
745 		sync_ifidx = 0;
746 	}
747 
748 	if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY)
749 		pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP;
750 	if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr)
751 		wantdown = 1;
752 
753 	if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING))
754 		return (EBUSY);
755 
756 	/* commit */
757 	sc->sc_sync_ifidx = sync_ifidx;
758 	sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer;
759 	sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
760 	sc->sc_defer = pfsyncr.pfsyncr_defer;
761 
762 	return (0);
763 }
764 
765 static int
766 pfsync_up(struct pfsync_softc *sc)
767 {
768 	struct ifnet *ifp = &sc->sc_if;
769 	struct ifnet *ifp0;
770 	void *inm = NULL;
771 	int error = 0;
772 	struct ip *ip;
773 
774 	NET_ASSERT_LOCKED();
775 	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
776 
777 	if (sc->sc_dead)
778 		return (ENXIO);
779 
780 	/*
781 	 * coordinate with pfsync_down(). if sc_up is still up and
782 	 * we're here then something else is tearing pfsync down.
783 	 */
784 	if (sc->sc_up)
785 		return (EBUSY);
786 
787 	if (sc->sc_syncpeer.s_addr == INADDR_ANY ||
788 	    sc->sc_syncpeer.s_addr == INADDR_BROADCAST)
789 		return (EDESTADDRREQ);
790 
791 	ifp0 = if_get(sc->sc_sync_ifidx);
792 	if (ifp0 == NULL)
793 		return (ENXIO);
794 
795 	if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) {
796 		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
797 			error = ENODEV;
798 			goto put;
799 		}
800 		inm = in_addmulti(&sc->sc_syncpeer, ifp0);
801 		if (inm == NULL) {
802 			error = ECONNABORTED;
803 			goto put;
804 		}
805 	}
806 
807 	sc->sc_up = 1;
808 
809 	ip = &sc->sc_template;
810 	memset(ip, 0, sizeof(*ip));
811 	ip->ip_v = IPVERSION;
812 	ip->ip_hl = sizeof(*ip) >> 2;
813 	ip->ip_tos = IPTOS_LOWDELAY;
814 	/* len and id are set later */
815 	ip->ip_off = htons(IP_DF);
816 	ip->ip_ttl = PFSYNC_DFLTTL;
817 	ip->ip_p = IPPROTO_PFSYNC;
818 	ip->ip_src.s_addr = INADDR_ANY;
819 	ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr;
820 
821 	/* commit */
822 	refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */
823 
824 #if NCARP > 0
825 	sc->sc_sync_if_down = 1;
826 	carp_group_demote_adj(&sc->sc_if, 1, "pfsync up");
827 #endif
828 
829 	if_linkstatehook_add(ifp0, &sc->sc_ltask);
830 	if_detachhook_add(ifp0, &sc->sc_dtask);
831 
832 	sc->sc_inm = inm;
833 	SET(ifp->if_flags, IFF_RUNNING);
834 
835 	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP);
836 
837 	refcnt_take(&sc->sc_refs); /* give one to SMR */
838 	SMR_PTR_SET_LOCKED(&pfsyncif, sc);
839 
840 	pfsync_syncif_link(sc); /* try and push the bulk req state forward */
841 
842 put:
843 	if_put(ifp0);
844 	return (error);
845 }
846 
847 static struct mbuf *
848 pfsync_encap(struct pfsync_softc *sc, struct mbuf *m)
849 {
850 	struct {
851 		struct ip		ip;
852 		struct pfsync_header	ph;
853 	} __packed __aligned(4) *h;
854 	unsigned int mlen = m->m_pkthdr.len;
855 
856 	m = m_prepend(m, sizeof(*h), M_DONTWAIT);
857 	if (m == NULL)
858 		return (NULL);
859 
860 	h = mtod(m, void *);
861 	memset(h, 0, sizeof(*h));
862 
863 	mlen += sizeof(h->ph);
864 	h->ph.version = PFSYNC_VERSION;
865 	h->ph.len = htons(mlen);
866 	/* h->ph.pfcksum */
867 
868 	mlen += sizeof(h->ip);
869 	h->ip = sc->sc_template;
870 	h->ip.ip_len = htons(mlen);
871 	h->ip.ip_id = htons(ip_randomid());
872 
873 	return (m);
874 }
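
/*
 * Editor's note (not in the upstream source): pfsync_encap() prepends
 * the IP and pfsync headers to an mbuf that already holds one or more
 * subheader+payload groups, so an outgoing packet roughly looks like:
 *
 *	struct ip | struct pfsync_header | { struct pfsync_subheader |
 *	    N * entry } ...
 *
 * The pfsync_header length covers everything after the IP header;
 * ip_len covers the whole packet.
 */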
875 
876 static void
877 pfsync_bulk_req_send(struct pfsync_softc *sc)
878 {
879 	struct {
880 		struct pfsync_subheader	subh;
881 		struct pfsync_upd_req	ur;
882 	} __packed __aligned(4) *h;
883 	unsigned mlen = max_linkhdr +
884 	    sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h);
885 	struct mbuf *m;
886 
887 	m = m_gethdr(M_DONTWAIT, MT_DATA);
888 	if (m == NULL)
889 		goto fail;
890 
891 	if (mlen > MHLEN) {
892 		MCLGETL(m, M_DONTWAIT, mlen);
893 		if (!ISSET(m->m_flags, M_EXT))
894 			goto drop;
895 	}
896 
897 	m_align(m, sizeof(*h));
898 	m->m_len = m->m_pkthdr.len = sizeof(*h);
899 
900 	h = mtod(m, void *);
901 	memset(h, 0, sizeof(*h));
902 
903 	h->subh.action = PFSYNC_ACT_UPD_REQ;
904 	h->subh.len = sizeof(h->ur) >> 2;
905 	h->subh.count = htons(1);
906 
907 	h->ur.id = htobe64(0);
908 	h->ur.creatorid = htobe32(0);
909 
910 	m = pfsync_encap(sc, m);
911 	if (m == NULL)
912 		goto fail;
913 
914 	pfsync_sendout(sc, m);
915 	return;
916 
917 drop:
918 	m_freem(m);
919 fail:
920 	printf("%s: unable to request bulk update\n", sc->sc_if.if_xname);
921 }
922 
923 static void
924 pfsync_bulk_req_nstate(struct pfsync_softc *sc,
925     enum pfsync_bulk_req_state nstate, int seconds)
926 {
927 	sc->sc_bulk_req.req_state = nstate;
928 	if (seconds > 0)
929 		timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds);
930 	else
931 		timeout_del(&sc->sc_bulk_req.req_tmo);
932 }
933 
934 static void
935 pfsync_bulk_req_invstate(struct pfsync_softc *sc,
936     enum pfsync_bulk_req_event evt)
937 {
938 	panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname,
939 	    pfsync_bulk_req_event_names[evt],
940 	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]);
941 }
942 
943 static void
944 pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc)
945 {
946 	/* calculate the number of packets we expect */
947 	int t = pf_pool_limits[PF_LIMIT_STATES].limit /
948 	    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
949 	     sizeof(struct pfsync_state));
950 
951 	/* turn it into seconds */
952 	t /= 1000 / PFSYNC_BULK_SND_IVAL_MS;
953 
954 	if (t == 0)
955 		t = 1;
956 
957 	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4);
958 }
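
/*
 * Editor's note (not in the upstream source): a worked example of the
 * timeout above, using assumed numbers. With a state limit of 100000,
 * a 1500 byte MTU and a struct pfsync_state of roughly 300 bytes
 * (both values assumed, not taken from this tree), each full packet
 * carries about (1500 - PFSYNC_MINPKT) / 300 ~= 4 states, so ~25000
 * packets are expected. At one packet per PFSYNC_BULK_SND_IVAL_MS
 * (20 ms) that is 50 packets/second, i.e. t ~= 500 seconds, and the
 * timeout is armed for 4 * t as slack.
 */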
959 
960 static inline void
961 pfsync_bulk_req_nstate_done(struct pfsync_softc *sc)
962 {
963 	pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0);
964 
965 	KASSERT(sc->sc_bulk_req.req_demoted == 1);
966 	sc->sc_bulk_req.req_demoted = 0;
967 
968 #if NCARP > 0
969 	carp_group_demote_adj(&sc->sc_if, -32, "pfsync done");
970 #endif
971 }
972 
973 static void
974 pfsync_bulk_req_evt(struct pfsync_softc *sc, enum pfsync_bulk_req_event evt)
975 {
976 	struct ifnet *ifp = &sc->sc_if;
977 
978 	rw_enter_write(&sc->sc_bulk_req.req_lock);
979 	pfsync_dprintf(sc, "%s state %s evt %s", __func__,
980 	    pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state],
981 	    pfsync_bulk_req_event_names[evt]);
982 
983 	if (evt == PFSYNC_BREQ_EVT_DOWN) {
984 		/* unconditionally move down */
985 		sc->sc_bulk_req.req_tries = 0;
986 		pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0);
987 
988 		if (sc->sc_bulk_req.req_demoted) {
989 			sc->sc_bulk_req.req_demoted = 0;
990 #if NCARP > 0
991 			carp_group_demote_adj(&sc->sc_if, -32,
992 			    "pfsync down");
993 #endif
994 		}
995 	} else switch (sc->sc_bulk_req.req_state) {
996 	case PFSYNC_BREQ_S_NONE:
997 		switch (evt) {
998 		case PFSYNC_BREQ_EVT_UP:
999 			KASSERT(sc->sc_bulk_req.req_demoted == 0);
1000 			sc->sc_bulk_req.req_demoted = 1;
1001 #if NCARP > 0
1002 			carp_group_demote_adj(&sc->sc_if, 32,
1003 			    "pfsync start");
1004 #endif
1005 			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30);
1006 			break;
1007 		default:
1008 			pfsync_bulk_req_invstate(sc, evt);
1009 		}
1010 
1011 		break;
1012 
1013 	case PFSYNC_BREQ_S_START:
1014 		switch (evt) {
1015 		case PFSYNC_BREQ_EVT_LINK:
1016 			pfsync_bulk_req_send(sc);
1017 			pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2);
1018 			break;
1019 		case PFSYNC_BREQ_EVT_TMO:
1020 			pfsync_dprintf(sc, "timeout waiting for link");
1021 			pfsync_bulk_req_nstate_done(sc);
1022 			break;
1023 		case PFSYNC_BREQ_EVT_BUS_START:
1024 			pfsync_bulk_req_nstate_bulk(sc);
1025 			break;
1026 		case PFSYNC_BREQ_EVT_BUS_END:
1027 			/* ignore this */
1028 			break;
1029 		default:
1030 			pfsync_bulk_req_invstate(sc, evt);
1031 		}
1032 		break;
1033 
1034 	case PFSYNC_BREQ_S_SENT:
1035 		switch (evt) {
1036 		case PFSYNC_BREQ_EVT_BUS_START:
1037 			pfsync_bulk_req_nstate_bulk(sc);
1038 			break;
1039 		case PFSYNC_BREQ_EVT_BUS_END:
1040 		case PFSYNC_BREQ_EVT_LINK:
1041 			/* ignore this */
1042 			break;
1043 		case PFSYNC_BREQ_EVT_TMO:
1044 			if (++sc->sc_bulk_req.req_tries <
1045 			    PFSYNC_MAX_BULKTRIES) {
1046 				pfsync_bulk_req_send(sc);
1047 				pfsync_bulk_req_nstate(sc,
1048 				    PFSYNC_BREQ_S_SENT, 2);
1049 				break;
1050 			}
1051 
1052 			pfsync_dprintf(sc,
1053 			    "timeout waiting for bulk transfer start");
1054 			pfsync_bulk_req_nstate_done(sc);
1055 			break;
1056 		default:
1057 			pfsync_bulk_req_invstate(sc, evt);
1058 		}
1059 		break;
1060 
1061 	case PFSYNC_BREQ_S_BULK:
1062 		switch (evt) {
1063 		case PFSYNC_BREQ_EVT_BUS_START:
1064 		case PFSYNC_BREQ_EVT_LINK:
1065 			/* ignore this */
1066 			break;
1067 		case PFSYNC_BREQ_EVT_BUS_END:
1068 			pfsync_bulk_req_nstate_done(sc);
1069 			break;
1070 		case PFSYNC_BREQ_EVT_TMO:
1071 			if (++sc->sc_bulk_req.req_tries <
1072 			    PFSYNC_MAX_BULKTRIES) {
1073 				pfsync_bulk_req_send(sc);
1074 				pfsync_bulk_req_nstate(sc,
1075 				    PFSYNC_BREQ_S_SENT, 2);
1076 			}
1077 
1078 			pfsync_dprintf(sc,
1079 			    "timeout waiting for bulk transfer end");
1080 			pfsync_bulk_req_nstate_done(sc);
1081 			break;
1082 		default:
1083 			pfsync_bulk_req_invstate(sc, evt);
1084 		}
1085 		break;
1086 
1087 	case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */
1088 		switch (evt) {
1089 		case PFSYNC_BREQ_EVT_BUS_START:
1090 		case PFSYNC_BREQ_EVT_BUS_END:
1091 		case PFSYNC_BREQ_EVT_LINK:
1092 			/* nops */
1093 			break;
1094 		default:
1095 			pfsync_bulk_req_invstate(sc, evt);
1096 		}
1097 		break;
1098 
1099 	default:
1100 		panic("%s: unknown event %d", ifp->if_xname, evt);
1101 		/* NOTREACHED */
1102 	}
1103 	rw_exit_write(&sc->sc_bulk_req.req_lock);
1104 }
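
/*
 * Editor's note (not in the upstream source): summary of the bulk
 * request state machine driven above. "up" moves NONE->START and adds
 * a carp demotion of 32; a link event in START sends an update request
 * and moves to SENT with a 2 second timeout; a bus-start from the peer
 * moves START or SENT to BULK; bus-end moves BULK->DONE and drops the
 * demotion again. A timeout in SENT re-sends the request up to
 * PFSYNC_MAX_BULKTRIES times before giving up and moving to DONE; a
 * timeout in START or BULK also ends up in DONE. "down" resets the
 * machine to NONE from any state.
 */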
1105 
1106 static void
1107 pfsync_bulk_req_tmo(void *arg)
1108 {
1109 	struct pfsync_softc *sc = arg;
1110 
1111 	NET_LOCK();
1112 	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO);
1113 	NET_UNLOCK();
1114 }
1115 
1116 static int
1117 pfsync_down(struct pfsync_softc *sc)
1118 {
1119 	struct ifnet *ifp = &sc->sc_if;
1120 	struct ifnet *ifp0;
1121 	struct smr_entry smr;
1122 	size_t i;
1123 	void *inm = NULL;
1124 	unsigned int sndbar = 0;
1125 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
1126 	struct pfsync_deferral *pd;
1127 
1128 	NET_ASSERT_LOCKED();
1129 	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1130 
1131 	/*
1132 	 * tearing down pfsync involves waiting for pfsync to stop
1133 	 * running in various contexts including softnet taskqs.
1134 	 * this thread cannot hold netlock while waiting for a
1135 	 * barrier in softnet because softnet might be waiting for
1136 	 * the netlock. sc->sc_up is used to coordinate with
1137 	 * pfsync_up.
1138 	 */
1139 
1140 	CLR(ifp->if_flags, IFF_RUNNING);
1141 
1142 	ifp0 = if_get(sc->sc_sync_ifidx);
1143 	if (ifp0 != NULL) {
1144 		if_linkstatehook_del(ifp0, &sc->sc_ltask);
1145 		if_detachhook_del(ifp0, &sc->sc_dtask);
1146 	}
1147 	if_put(ifp0);
1148 
1149 #if NCARP > 0
1150 	if (sc->sc_sync_if_down)
1151 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync down");
1152 #endif
1153 
1154 	NET_UNLOCK();
1155 
1156 	KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc,
1157 	   "pfsyncif %p != sc %p", pfsyncif, sc);
1158 	SMR_PTR_SET_LOCKED(&pfsyncif, NULL);
1159 	smr_init(&smr);
1160 	smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs);
1161 
1162 	/* stop pf producing work before cleaning up the timeouts and tasks */
1163 	refcnt_finalize(&sc->sc_refs, "pfsyncfini");
1164 
1165 	pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN);
1166 
1167 	rw_enter_read(&pf_state_list.pfs_rwl);
1168 	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
1169 	if (sc->sc_bulk_snd.snd_tail != NULL) {
1170 		sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo);
1171 
1172 		sc->sc_bulk_snd.snd_again = 0;
1173 		sc->sc_bulk_snd.snd_next = NULL;
1174 		sc->sc_bulk_snd.snd_tail = NULL;
1175 	}
1176 	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
1177 	rw_exit_read(&pf_state_list.pfs_rwl);
1178 
1179 	/*
1180 	 * do a single barrier for all the timeouts. because the
1181 	 * timeouts in each slice are configured the same way, the
1182 	 * barrier for one will work for all of them.
1183 	 */
1184 	for (i = 0; i < nitems(sc->sc_slices); i++) {
1185 		struct pfsync_slice *s = &sc->sc_slices[i];
1186 
1187 		timeout_del(&s->s_tmo);
1188 		task_del(s->s_softnet, &s->s_task);
1189 		task_del(s->s_softnet, &s->s_send);
1190 
1191 		timeout_del(&s->s_deferrals_tmo);
1192 		task_del(s->s_softnet, &s->s_deferrals_task);
1193 	}
1194 	timeout_barrier(&sc->sc_slices[0].s_tmo);
1195 	timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */
1196 	if (sndbar) {
1197 		/* technically the preceding barrier does the same job */
1198 		timeout_barrier(&sc->sc_bulk_snd.snd_tmo);
1199 	}
1200 	net_tq_barriers("pfsyncbar");
1201 
1202 	/* pfsync is no longer running */
1203 
1204 	if (sc->sc_inm != NULL) {
1205 		inm = sc->sc_inm;
1206 		sc->sc_inm = NULL;
1207 	}
1208 
1209 	for (i = 0; i < nitems(sc->sc_slices); i++) {
1210 		struct pfsync_slice *s = &sc->sc_slices[i];
1211 		struct pf_state *st;
1212 
1213 		pfsync_slice_drop(sc, s);
1214 		mq_purge(&s->s_sendq);
1215 
1216 		while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) {
1217 			TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
1218 
1219 			st = pd->pd_st;
1220 			st->sync_defer = NULL;
1221 
1222 			TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
1223 		}
1224 		s->s_deferred = 0;
1225 	}
1226 
1227 	NET_LOCK();
1228 	sc->sc_up = 0;
1229 
1230 	if (inm != NULL)
1231 		in_delmulti(inm);
1232 
1233 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
1234 		TAILQ_REMOVE(&pds, pd, pd_entry);
1235 
1236 		pfsync_defer_output(pd);
1237 	}
1238 
1239 	return (0);
1240 }
1241 
1242 int
1243 pfsync_is_up(void)
1244 {
1245 	int rv;
1246 
1247 	smr_read_enter();
1248 	rv = SMR_PTR_GET(&pfsyncif) != NULL;
1249 	smr_read_leave();
1250 
1251 	return (rv);
1252 }
1253 
1254 static void
1255 pfsync_start(struct ifqueue *ifq)
1256 {
1257 	ifq_purge(ifq);
1258 }
1259 
1260 struct pfsync_q {
1261 	void		(*write)(struct pf_state *, void *);
1262 	size_t		len;
1263 	u_int8_t	action;
1264 };
1265 
1266 static struct pfsync_slice *
1267 pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st)
1268 {
1269 	unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices);
1270 	struct pfsync_slice *s = &sc->sc_slices[idx];
1271 
1272 	if (!mtx_enter_try(&s->s_mtx)) {
1273 		mtx_enter(&s->s_mtx);
1274 		s->s_stat_contended++;
1275 	}
1276 	s->s_stat_locks++;
1277 
1278 	return (s);
1279 }
1280 
1281 static void
1282 pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s)
1283 {
1284 	mtx_leave(&s->s_mtx);
1285 }
1286 
1287 /* we have one of these for every PFSYNC_S_ */
1288 static void	pfsync_out_state(struct pf_state *, void *);
1289 static void	pfsync_out_iack(struct pf_state *, void *);
1290 static void	pfsync_out_upd_c(struct pf_state *, void *);
1291 static void	pfsync_out_del(struct pf_state *, void *);
1292 #if defined(IPSEC)
1293 static void	pfsync_out_tdb(struct tdb *, void *);
1294 #endif
1295 
1296 static const struct pfsync_q pfsync_qs[] = {
1297 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
1298 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
1299 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
1300 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
1301 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
1302 };
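
/*
 * Editor's note (not in the upstream source): the order of pfsync_qs[]
 * presumably mirrors the PFSYNC_S_IACK, PFSYNC_S_UPD_C, PFSYNC_S_DEL,
 * PFSYNC_S_INS and PFSYNC_S_UPD values from if_pfsync.h, since the
 * queue index q is stored directly in st->sync_state by pfsync_q_ins()
 * and asserted against it in pfsync_slice_write().
 */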
1303 
1304 static void
1305 pfsync_out_state(struct pf_state *st, void *buf)
1306 {
1307 	struct pfsync_state *sp = buf;
1308 
1309 	mtx_enter(&st->mtx);
1310 	pf_state_export(sp, st);
1311 	mtx_leave(&st->mtx);
1312 }
1313 
1314 static void
1315 pfsync_out_iack(struct pf_state *st, void *buf)
1316 {
1317 	struct pfsync_ins_ack *iack = buf;
1318 
1319 	iack->id = st->id;
1320 	iack->creatorid = st->creatorid;
1321 }
1322 
1323 static void
1324 pfsync_out_upd_c(struct pf_state *st, void *buf)
1325 {
1326 	struct pfsync_upd_c *up = buf;
1327 
1328 	memset(up, 0, sizeof(*up));
1329 	up->id = st->id;
1330 	up->creatorid = st->creatorid;
1331 
1332 	mtx_enter(&st->mtx);
1333 	pf_state_peer_hton(&st->src, &up->src);
1334 	pf_state_peer_hton(&st->dst, &up->dst);
1335 	up->timeout = st->timeout;
1336 	mtx_leave(&st->mtx);
1337 }
1338 
1339 static void
1340 pfsync_out_del(struct pf_state *st, void *buf)
1341 {
1342 	struct pfsync_del_c *dp = buf;
1343 
1344 	dp->id = st->id;
1345 	dp->creatorid = st->creatorid;
1346 
1347 	st->sync_state = PFSYNC_S_DEAD;
1348 }
1349 
1350 #if defined(IPSEC)
1351 static inline void
1352 pfsync_tdb_enter(struct tdb *tdb)
1353 {
1354 	mtx_enter(&tdb->tdb_mtx);
1355 }
1356 
1357 static inline void
1358 pfsync_tdb_leave(struct tdb *tdb)
1359 {
1360 	unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
1361 	mtx_leave(&tdb->tdb_mtx);
1362 	if (snapped)
1363 		wakeup_one(&tdb->tdb_updates);
1364 }
1365 #endif /* defined(IPSEC) */
1366 
1367 static void
1368 pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s)
1369 {
1370 	struct pf_state *st;
1371 	int q;
1372 #if defined(IPSEC)
1373 	struct tdb *tdb;
1374 #endif
1375 
1376 	for (q = 0; q < nitems(s->s_qs); q++) {
1377 		if (TAILQ_EMPTY(&s->s_qs[q]))
1378 			continue;
1379 
1380 		while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) {
1381 			TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
1382 #ifdef PFSYNC_DEBUG
1383 			KASSERT(st->sync_state == q);
1384 #endif
1385 			st->sync_state = PFSYNC_S_NONE;
1386 			pf_state_unref(st);
1387 		}
1388 	}
1389 
1390 #if defined(IPSEC)
1391 	while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
1392 		TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
1393 
1394 		pfsync_tdb_enter(tdb);
1395 		KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
1396 		CLR(tdb->tdb_flags, TDBF_PFSYNC);
1397 		pfsync_tdb_leave(tdb);
1398 	}
1399 #endif /* defined(IPSEC) */
1400 
1401 	timeout_del(&s->s_tmo);
1402 	s->s_len = PFSYNC_MINPKT;
1403 }
1404 
1405 static struct mbuf *
1406 pfsync_slice_write(struct pfsync_slice *s)
1407 {
1408 	struct pfsync_softc *sc = s->s_pfsync;
1409 	struct mbuf *m;
1410 
1411 	struct ip *ip;
1412 	struct pfsync_header *ph;
1413 	struct pfsync_subheader *subh;
1414 
1415 	unsigned int mlen = max_linkhdr + s->s_len;
1416 	unsigned int q, count;
1417 	caddr_t ptr;
1418 	size_t off;
1419 
1420 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1421 	if (s->s_len == PFSYNC_MINPKT) {
1422 		s->s_stat_write_nop++;
1423 		return (NULL);
1424 	}
1425 
1426 	task_del(s->s_softnet, &s->s_task);
1427 
1428 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1429 	if (m == NULL)
1430 		goto drop;
1431 
1432 	if (mlen > MHLEN) {
1433 		MCLGETL(m, M_DONTWAIT, mlen);
1434 		if (!ISSET(m->m_flags, M_EXT))
1435 			goto drop;
1436 	}
1437 
1438 	m_align(m, s->s_len);
1439 	m->m_len = m->m_pkthdr.len = s->s_len;
1440 
1441 	ptr = mtod(m, caddr_t);
1442 	off = 0;
1443 
1444 	ip = (struct ip *)(ptr + off);
1445 	off += sizeof(*ip);
1446 	*ip = sc->sc_template;
1447 	ip->ip_len = htons(m->m_pkthdr.len);
1448 	ip->ip_id = htons(ip_randomid());
1449 
1450 	ph = (struct pfsync_header *)(ptr + off);
1451 	off += sizeof(*ph);
1452 	memset(ph, 0, sizeof(*ph));
1453 	ph->version = PFSYNC_VERSION;
1454 	ph->len = htons(m->m_pkthdr.len - sizeof(*ip));
1455 
1456 	for (q = 0; q < nitems(s->s_qs); q++) {
1457 		struct pf_state_queue *psq = &s->s_qs[q];
1458 		struct pf_state *st;
1459 
1460 		if (TAILQ_EMPTY(psq))
1461 			continue;
1462 
1463 		subh = (struct pfsync_subheader *)(ptr + off);
1464 		off += sizeof(*subh);
1465 
1466 		count = 0;
1467 		while ((st = TAILQ_FIRST(psq)) != NULL) {
1468 			TAILQ_REMOVE(psq, st, sync_list);
1469 			count++;
1470 
1471 			KASSERT(st->sync_state == q);
1472 			/* the write handler below may override this */
1473 			st->sync_state = PFSYNC_S_NONE;
1474 
1475 			pfsync_qs[q].write(st, ptr + off);
1476 			off += pfsync_qs[q].len;
1477 
1478 			pf_state_unref(st);
1479 		}
1480 
1481 		subh->action = pfsync_qs[q].action;
1482 		subh->len = pfsync_qs[q].len >> 2;
1483 		subh->count = htons(count);
1484 	}
1485 
1486 #if defined(IPSEC)
1487 	if (!TAILQ_EMPTY(&s->s_tdb_q)) {
1488 		struct tdb *tdb;
1489 
1490 		subh = (struct pfsync_subheader *)(ptr + off);
1491 		off += sizeof(*subh);
1492 
1493 		count = 0;
1494 		while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) {
1495 			TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
1496 			count++;
1497 
1498 			pfsync_tdb_enter(tdb);
1499 			KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC));
1500 
1501 			/* get a consistent view of the counters */
1502 			pfsync_out_tdb(tdb, ptr + off);
1503 
1504 			CLR(tdb->tdb_flags, TDBF_PFSYNC);
1505 			pfsync_tdb_leave(tdb);
1506 
1507 			off += sizeof(struct pfsync_tdb);
1508 		}
1509 
1510 		subh->action = PFSYNC_ACT_TDB;
1511 		subh->len = sizeof(struct pfsync_tdb) >> 2;
1512 		subh->count = htons(count);
1513 	}
1514 #endif
1515 
1516 	timeout_del(&s->s_tmo);
1517 	s->s_len = PFSYNC_MINPKT;
1518 
1519 	return (m);
1520 drop:
1521 	m_freem(m);
1522 	pfsyncstat_inc(pfsyncs_onomem);
1523 	pfsync_slice_drop(sc, s);
1524 	return (NULL);
1525 }
1526 
1527 static void
1528 pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m)
1529 {
1530 	struct ip_moptions imo;
1531 	unsigned int len = m->m_pkthdr.len;
1532 #if NBPFILTER > 0
1533 	caddr_t if_bpf = sc->sc_if.if_bpf;
1534 	if (if_bpf)
1535 		bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT);
1536 #endif
1537 
1538 	imo.imo_ifidx = sc->sc_sync_ifidx;
1539 	imo.imo_ttl = PFSYNC_DFLTTL;
1540 	imo.imo_loop = 0;
1541 	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1542 
1543 	if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) {
1544 		counters_pkt(sc->sc_if.if_counters, ifc_opackets,
1545 		    ifc_obytes, len);
1546 		pfsyncstat_inc(pfsyncs_opackets);
1547 	} else {
1548 		counters_inc(sc->sc_if.if_counters, ifc_oerrors);
1549 		pfsyncstat_inc(pfsyncs_oerrors);
1550 	}
1551 }
1552 
1553 static void
1554 pfsync_slice_tmo(void *arg)
1555 {
1556 	struct pfsync_slice *s = arg;
1557 
1558 	task_add(s->s_softnet, &s->s_task);
1559 }
1560 
1561 static void
1562 pfsync_slice_sched(struct pfsync_slice *s)
1563 {
1564 	s->s_stat_task_add++;
1565 	task_add(s->s_softnet, &s->s_task);
1566 }
1567 
1568 static void
1569 pfsync_slice_task(void *arg)
1570 {
1571 	struct pfsync_slice *s = arg;
1572 	struct mbuf *m;
1573 
1574 	mtx_enter(&s->s_mtx);
1575 	s->s_stat_task_run++;
1576 
1577 	m = pfsync_slice_write(s);
1578 	mtx_leave(&s->s_mtx);
1579 	if (m != NULL) {
1580 		NET_LOCK();
1581 		pfsync_sendout(s->s_pfsync, m);
1582 		NET_UNLOCK();
1583 	}
1584 }
1585 
1586 static void
1587 pfsync_slice_sendq(void *arg)
1588 {
1589 	struct pfsync_slice *s = arg;
1590 	struct mbuf_list ml;
1591 	struct mbuf *m;
1592 
1593 	mq_delist(&s->s_sendq, &ml);
1594 	if (ml_empty(&ml))
1595 		return;
1596 
1597 	mtx_enter(&s->s_mtx);
1598 	s->s_stat_dequeue++;
1599 	mtx_leave(&s->s_mtx);
1600 
1601 	NET_LOCK();
1602 	while ((m = ml_dequeue(&ml)) != NULL)
1603 		pfsync_sendout(s->s_pfsync, m);
1604 	NET_UNLOCK();
1605 }
1606 
1607 static void
1608 pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q)
1609 {
1610 	size_t nlen = pfsync_qs[q].len;
1611 	struct mbuf *m = NULL;
1612 
1613 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1614 	KASSERT(st->sync_state == PFSYNC_S_NONE);
1615 	KASSERT(s->s_len >= PFSYNC_MINPKT);
1616 
1617 	if (TAILQ_EMPTY(&s->s_qs[q]))
1618 		nlen += sizeof(struct pfsync_subheader);
1619 
1620 	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
1621 		m = pfsync_slice_write(s);
1622 		if (m != NULL) {
1623 			s->s_stat_enqueue++;
1624 			if (mq_enqueue(&s->s_sendq, m) == 0)
1625 				task_add(s->s_softnet, &s->s_send);
1626 		}
1627 
1628 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1629 	}
1630 
1631 	s->s_len += nlen;
1632 	pf_state_ref(st);
1633 	TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list);
1634 	st->sync_state = q;
1635 
1636 	if (!timeout_pending(&s->s_tmo))
1637 		timeout_add_sec(&s->s_tmo, 1);
1638 }
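
/*
 * Editor's note (not in the upstream source): s_len tracks the size of
 * the pfsync packet the slice is accumulating, starting at
 * PFSYNC_MINPKT and growing by one subheader each time a queue first
 * becomes non-empty plus one entry per queued state. When the next
 * entry would push s_len past the interface MTU, the queues are
 * serialised with pfsync_slice_write() and handed to the send task
 * before the new state is queued.
 */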
1639 
1640 static void
1641 pfsync_q_del(struct pfsync_slice *s, struct pf_state *st)
1642 {
1643 	unsigned int q = st->sync_state;
1644 
1645 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
1646 	KASSERT(st->sync_state < PFSYNC_S_NONE);
1647 
1648 	st->sync_state = PFSYNC_S_NONE;
1649 	TAILQ_REMOVE(&s->s_qs[q], st, sync_list);
1650 	pf_state_unref(st);
1651 	s->s_len -= pfsync_qs[q].len;
1652 
1653 	if (TAILQ_EMPTY(&s->s_qs[q]))
1654 		s->s_len -= sizeof(struct pfsync_subheader);
1655 }
1656 
1657 /*
1658  * the pfsync hooks that pf calls
1659  */
1660 
1661 void
1662 pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw,
1663     const struct pf_state_key *sks, int flags)
1664 {
1665 	/* this is called before pf_state_insert */
1666 
1667 	if (skw->proto == IPPROTO_PFSYNC)
1668 		SET(st->state_flags, PFSTATE_NOSYNC);
1669 
1670 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1671 		st->sync_state = PFSYNC_S_DEAD;
1672 		return;
1673 	}
1674 
1675 	if (ISSET(flags, PFSYNC_SI_IOCTL)) {
1676 		/* all good */
1677 		return;
1678 	}
1679 
1680 	/* state came off the wire */
1681 	if (ISSET(flags, PFSYNC_SI_PFSYNC)) {
1682 		if (ISSET(st->state_flags, PFSTATE_ACK)) {
1683 			CLR(st->state_flags, PFSTATE_ACK);
1684 
1685 			/* peer wants an iack, not an insert */
1686 			st->sync_state = PFSYNC_S_SYNC;
1687 		} else
1688 			st->sync_state = PFSYNC_S_PFSYNC;
1689 	}
1690 }
1691 
1692 void
1693 pfsync_insert_state(struct pf_state *st)
1694 {
1695 	struct pfsync_softc *sc;
1696 
1697 	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1698 
1699 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1700 	    st->sync_state == PFSYNC_S_DEAD)
1701 		return;
1702 
1703 	smr_read_enter();
1704 	sc = SMR_PTR_GET(&pfsyncif);
1705 	if (sc != NULL) {
1706 		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1707 
1708 		switch (st->sync_state) {
1709 		case PFSYNC_S_UPD_C:
1710 			/* we must have lost a race after insert */
1711 			pfsync_q_del(s, st);
1712 			/* FALLTHROUGH */
1713 		case PFSYNC_S_NONE:
1714 			pfsync_q_ins(s, st, PFSYNC_S_INS);
1715 			break;
1716 		case PFSYNC_S_SYNC:
1717 			st->sync_state = PFSYNC_S_NONE; /* gross */
1718 			pfsync_q_ins(s, st, PFSYNC_S_IACK);
1719 			pfsync_slice_sched(s); /* the peer is waiting */
1720 			break;
1721 		case PFSYNC_S_PFSYNC:
1722 			/* state was just inserted by pfsync */
1723 			st->sync_state = PFSYNC_S_NONE;
1724 			break;
1725 		default:
1726 			panic("%s: state %p unexpected sync_state %d",
1727 			    __func__, st, st->sync_state);
1728 			/* NOTREACHED */
1729 		}
1730 
1731 		pfsync_slice_leave(sc, s);
1732 	}
1733 	smr_read_leave();
1734 }
1735 
1736 void
1737 pfsync_update_state(struct pf_state *st)
1738 {
1739 	struct pfsync_softc *sc;
1740 
1741 	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1742 
1743 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1744 	    st->sync_state == PFSYNC_S_DEAD)
1745 		return;
1746 
1747 	smr_read_enter();
1748 	sc = SMR_PTR_GET(&pfsyncif);
1749 	if (sc != NULL) {
1750 		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1751 		int sync = 0;
1752 
1753 		switch (st->sync_state) {
1754 		case PFSYNC_S_UPD_C:
1755 		case PFSYNC_S_UPD:
1756 			/* we're already handling it */
1757 			if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1758 				st->sync_updates++;
1759 				if (st->sync_updates >= sc->sc_maxupdates)
1760 					sync = 1;
1761 			}
1762 			/* FALLTHROUGH */
1763 		case PFSYNC_S_INS:
1764 		case PFSYNC_S_DEL:
1765 		case PFSYNC_S_DEAD:
1766 			break;
1767 
1768 		case PFSYNC_S_IACK:
1769 			pfsync_q_del(s, st);
1770 			/* FALLTHROUGH */
1771 		case PFSYNC_S_NONE:
1772 			pfsync_q_ins(s, st, PFSYNC_S_UPD_C);
1773 			st->sync_updates = 0;
1774 			break;
1775 		default:
1776 			panic("%s: state %p unexpected sync_state %d",
1777 			    __func__, st, st->sync_state);
1778 			/* NOTREACHED */
1779 		}
1780 
1781 		if (!sync && (getuptime() - st->pfsync_time) < 2)
1782 			sync = 1;
1783 
1784 		if (sync)
1785 			pfsync_slice_sched(s);
1786 		pfsync_slice_leave(sc, s);
1787 	}
1788 	smr_read_leave();
1789 }
1790 
1791 void
1792 pfsync_delete_state(struct pf_state *st)
1793 {
1794 	struct pfsync_softc *sc;
1795 
1796 	MUTEX_ASSERT_UNLOCKED(&st->mtx);
1797 
1798 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1799 	    st->sync_state == PFSYNC_S_DEAD)
1800 		return;
1801 
1802 	smr_read_enter();
1803 	sc = SMR_PTR_GET(&pfsyncif);
1804 	if (sc != NULL) {
1805 		struct pfsync_slice *s = pfsync_slice_enter(sc, st);
1806 
1807 		switch (st->sync_state) {
1808 		case PFSYNC_S_INS:
1809 			/* let's pretend this never happened */
1810 			pfsync_q_del(s, st);
1811 			break;
1812 
1813 		case PFSYNC_S_UPD_C:
1814 		case PFSYNC_S_UPD:
1815 		case PFSYNC_S_IACK:
1816 			pfsync_q_del(s, st);
1817 			/* FALLTHROUGH */
1818 		case PFSYNC_S_NONE:
1819 			pfsync_q_ins(s, st, PFSYNC_S_DEL);
1820 			st->sync_updates = 0;
1821 			break;
1822 		case PFSYNC_S_DEL:
1823 		case PFSYNC_S_DEAD:
1824 			/* XXX we should count this */
1825 			break;
1826 		default:
1827 			panic("%s: state %p unexpected sync_state %d",
1828 			    __func__, st, st->sync_state);
1829 			/* NOTREACHED */
1830 		}
1831 
1832 		pfsync_slice_leave(sc, s);
1833 	}
1834 	smr_read_leave();
1835 }
1836 
1837 struct pfsync_subh_clr {
1838 	struct pfsync_subheader	subh;
1839 	struct pfsync_clr	clr;
1840 } __packed __aligned(4);
1841 
1842 void
1843 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1844 {
1845 	struct pfsync_softc *sc;
1846 	struct pfsync_subh_clr *h;
1847 	struct mbuf *m;
1848 	unsigned int hlen, mlen;
1849 
1850 	smr_read_enter();
1851 	sc = SMR_PTR_GET(&pfsyncif);
1852 	if (sc != NULL)
1853 		refcnt_take(&sc->sc_refs);
1854 	smr_read_leave();
1855 
1856 	if (sc == NULL)
1857 		return;
1858 
1859 	hlen = sizeof(sc->sc_template) +
1860 	    sizeof(struct pfsync_header) +
1861 	    sizeof(*h);
1862 
1863 	mlen = max_linkhdr + hlen;
1864 
1865 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1866 	if (m == NULL) {
1867 		/* count error */
1868 		goto leave;
1869 	}
1870 
1871 	if (mlen > MHLEN) {
1872 		MCLGETL(m, M_DONTWAIT, mlen);
1873 		if (!ISSET(m->m_flags, M_EXT)) {
1874 			m_freem(m);
1875 			goto leave;
1876 		}
1877 	}
1878 
1879 	m_align(m, sizeof(*h));
1880 	h = mtod(m, struct pfsync_subh_clr *);
1881 
1882 	h->subh.action = PFSYNC_ACT_CLR;
1883 	h->subh.len = sizeof(h->clr) >> 2;
1884 	h->subh.count = htons(1);
1885 
1886 	strlcpy(h->clr.ifname, ifname, sizeof(h->clr.ifname));
1887 	h->clr.creatorid = creatorid;
1888 
1889 	m->m_pkthdr.len = m->m_len = sizeof(*h);
1890 	m = pfsync_encap(sc, m);
1891 	if (m == NULL)
1892 		goto leave;
1893 
1894 	pfsync_sendout(sc, m);
1895 leave:
1896 	refcnt_rele_wake(&sc->sc_refs);
1897 }
1898 
1899 int
1900 pfsync_state_in_use(struct pf_state *st)
1901 {
1902 	struct pfsync_softc *sc;
1903 	int rv = 0;
1904 
1905 	smr_read_enter();
1906 	sc = SMR_PTR_GET(&pfsyncif);
1907 	if (sc != NULL) {
1908 		/*
1909 		 * pfsync bulk sends run inside
1910 		 * rw_enter_read(&pf_state_list.pfs_rwl), and this
1911 		 * code (pfsync_state_in_use) is only called from the
1912 		 * purge code inside
1913 		 * rw_enter_write(&pf_state_list.pfs_rwl). therefore,
1914 		 * those two sections are exclusive so we can safely
1915 		 * look at the bulk send pointers.
1916 		 */
1917 		/* rw_assert_wrlock(&pf_state_list.pfs_rwl); */
1918 		if (sc->sc_bulk_snd.snd_next == st ||
1919 		    sc->sc_bulk_snd.snd_tail == st)
1920 			rv = 1;
1921 	}
1922 	smr_read_leave();
1923 
1924 	return (rv);
1925 }
1926 
1927 int
1928 pfsync_defer(struct pf_state *st, struct mbuf *m)
1929 {
1930 	struct pfsync_softc *sc;
1931 	struct pfsync_slice *s;
1932 	struct pfsync_deferral *pd;
1933 	int sched = 0;
1934 	int rv = 0;
1935 
1936 	if (ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1937 	    ISSET(m->m_flags, M_BCAST|M_MCAST))
1938 		return (0);
1939 
1940 	smr_read_enter();
1941 	sc = SMR_PTR_GET(&pfsyncif);
1942 	if (sc == NULL || !sc->sc_defer)
1943 		goto leave;
1944 
1945 	pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT);
1946 	if (pd == NULL) {
1947 		goto leave;
1948 	}
1949 
1950 	s = pfsync_slice_enter(sc, st);
1951 	s->s_stat_defer_add++;
1952 
1953 	pd->pd_st = pf_state_ref(st);
1954 	pd->pd_m = m;
1955 	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
1956 
1957 	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1958 	st->sync_defer = pd;
1959 
1960 	sched = s->s_deferred++;
1961 	TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry);
1962 
1963 	if (sched == 0)
1964 		timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC);
1965 	else if (sched >= PFSYNC_DEFER_LIMIT) {
1966 		s->s_stat_defer_overlimit++;
1967 		timeout_del(&s->s_deferrals_tmo);
1968 		task_add(s->s_softnet, &s->s_deferrals_task);
1969 	}
1970 
1971 	pfsync_slice_sched(s);
1972 	pfsync_slice_leave(sc, s);
1973 	rv = 1;
1974 leave:
1975 	smr_read_leave();
1976 
1977 	return (rv);
1978 }
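
/*
 * Editor's note (not in the upstream source): when "defer" is enabled,
 * the packet that created a state is parked here instead of being sent
 * straight away, so the peer has a chance to learn about the state
 * (via the update queued and scheduled above) before any reply traffic
 * can arrive. The deferral is released either by pfsync_deferred(),
 * presumably once the peer acknowledges the insert, or after
 * PFSYNC_DEFER_NSEC (20 ms), or early once more than
 * PFSYNC_DEFER_LIMIT deferrals are pending on the slice.
 */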
1979 
1980 static void
1981 pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st)
1982 {
1983 	struct pfsync_slice *s;
1984 	struct pfsync_deferral *pd;
1985 
1986 	s = pfsync_slice_enter(sc, st);
1987 
1988 	pd = st->sync_defer;
1989 	if (pd != NULL) {
1990 		s->s_stat_defer_ack++;
1991 
1992 		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
1993 		s->s_deferred--;
1994 
1995 		st = pd->pd_st;
1996 		st->sync_defer = NULL;
1997 	}
1998 	pfsync_slice_leave(sc, s);
1999 
2000 	if (pd != NULL)
2001 		pfsync_defer_output(pd);
2002 }
2003 
2004 static void
2005 pfsync_deferrals_tmo(void *arg)
2006 {
2007 	struct pfsync_slice *s = arg;
2008 
2009 	if (READ_ONCE(s->s_deferred) > 0)
2010 		task_add(s->s_softnet, &s->s_deferrals_task);
2011 }
2012 
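/*
 * Descriptive note (not in the original source): pfsync_deferrals_task()
 * runs on the softnet taskq. It collects the deferrals that have hit
 * their deadline (or everything while over PFSYNC_DEFER_LIMIT), detaches
 * them from their states, and transmits the held packets under NET_LOCK
 * via pfsync_defer_output().
 */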
2013 static void
2014 pfsync_deferrals_task(void *arg)
2015 {
2016 	struct pfsync_slice *s = arg;
2017 	struct pfsync_deferral *pd;
2018 	struct pf_state *st;
2019 	uint64_t now, nsec = 0;
2020 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
2021 
2022 	now = getnsecuptime();
2023 
2024 	mtx_enter(&s->s_mtx);
2025 	s->s_stat_defer_run++; /* maybe move this into the loop */
2026 	for (;;) {
2027 		pd = TAILQ_FIRST(&s->s_deferrals);
2028 		if (pd == NULL)
2029 			break;
2030 
2031 		if (s->s_deferred < PFSYNC_DEFER_LIMIT &&
2032 		    now < pd->pd_deadline) {
2033 			nsec = pd->pd_deadline - now;
2034 			break;
2035 		}
2036 
2037 		TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry);
2038 		s->s_deferred--;
2039 
2040 		/*
2041 		 * detach the pd from the state. the pd still refers
2042 		 * to the state though.
2043 		 */
2044 		st = pd->pd_st;
2045 		st->sync_defer = NULL;
2046 
2047 		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
2048 	}
2049 	mtx_leave(&s->s_mtx);
2050 
2051 	if (nsec > 0) {
2052 		/* we were looking at a pd, but it wasn't old enough */
2053 		timeout_add_nsec(&s->s_deferrals_tmo, nsec);
2054 	}
2055 
2056 	if (TAILQ_EMPTY(&pds))
2057 		return;
2058 
2059 	NET_LOCK();
2060 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
2061 		TAILQ_REMOVE(&pds, pd, pd_entry);
2062 
2063 		pfsync_defer_output(pd);
2064 	}
2065 	NET_UNLOCK();
2066 }
2067 
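/*
 * Descriptive note (not in the original source): pfsync_defer_output()
 * finally transmits the packet that was held for a deferred state,
 * via pf_route()/pf_route6() for route-to states or directly with
 * ip_output()/ip6_output(), then drops the deferral's state reference
 * and returns the pd to the pool.
 */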
2068 static void
2069 pfsync_defer_output(struct pfsync_deferral *pd)
2070 {
2071 	struct pf_pdesc pdesc;
2072 	struct pf_state *st = pd->pd_st;
2073 
2074 	if (st->rt == PF_ROUTETO) {
2075 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
2076 		    st->direction, NULL, pd->pd_m, NULL) != PF_PASS)
2077 			return;
2078 		switch (st->key[PF_SK_WIRE]->af) {
2079 		case AF_INET:
2080 			pf_route(&pdesc, st);
2081 			break;
2082 #ifdef INET6
2083 		case AF_INET6:
2084 			pf_route6(&pdesc, st);
2085 			break;
2086 #endif /* INET6 */
2087 		default:
2088 			unhandled_af(st->key[PF_SK_WIRE]->af);
2089 		}
2090 		pd->pd_m = pdesc.m;
2091 	} else {
2092 		switch (st->key[PF_SK_WIRE]->af) {
2093 		case AF_INET:
2094 			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
2095 			break;
2096 #ifdef INET6
2097 		case AF_INET6:
2098 			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
2099 			break;
2100 #endif /* INET6 */
2101 		default:
2102 			unhandled_af(st->key[PF_SK_WIRE]->af);
2103 		}
2104 
2105 		pd->pd_m = NULL;
2106 	}
2107 
2108 	pf_state_unref(st);
2109 	m_freem(pd->pd_m);
2110 	pool_put(&pfsync_deferrals_pool, pd);
2111 }
2112 
2113 struct pfsync_subh_bus {
2114 	struct pfsync_subheader	subh;
2115 	struct pfsync_bus	bus;
2116 } __packed __aligned(4);
2117 
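/*
 * Descriptive note (not in the original source): pfsync_bulk_snd_bus()
 * appends a bulk update status (BUS) subheader and message to the
 * packet being built. Returns 1 on success, or 0 if there was not
 * enough space left in the mbuf.
 */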
2118 static unsigned int
2119 pfsync_bulk_snd_bus(struct pfsync_softc *sc,
2120     struct mbuf *m, const unsigned int space,
2121     uint32_t endtime, uint8_t status)
2122 {
2123 	struct pfsync_subh_bus *h;
2124 	unsigned int nlen;
2125 
2126 	nlen = m->m_len + sizeof(*h);
2127 	if (space < nlen)
2128 		return (0);
2129 
2130 	h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len);
2131 	memset(h, 0, sizeof(*h));
2132 
2133 	h->subh.action = PFSYNC_ACT_BUS;
2134 	h->subh.len = sizeof(h->bus) >> 2;
2135 	h->subh.count = htons(1);
2136 
2137 	h->bus.creatorid = pf_status.hostid;
2138 	h->bus.endtime = htonl(endtime);
2139 	h->bus.status = status;
2140 
2141 	m->m_len = nlen;
2142 
2143 	return (1);
2144 }
2145 
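/*
 * Descriptive note (not in the original source): pfsync_bulk_snd_states()
 * exports states starting at snd_next into the supplied mbuf until the
 * packet is full or snd_tail has been reached. When the whole list has
 * been walked the run is closed with a BUS END message; otherwise the
 * position is saved in snd_next and the send timeout is rescheduled so
 * the next packet continues where this one left off.
 */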
2146 static unsigned int
2147 pfsync_bulk_snd_states(struct pfsync_softc *sc,
2148     struct mbuf *m, const unsigned int space, unsigned int len)
2149 {
2150 	struct pf_state *st;
2151 	struct pfsync_state *sp;
2152 	unsigned int nlen;
2153 	unsigned int count = 0;
2154 
2155 	st = sc->sc_bulk_snd.snd_next;
2156 
2157 	for (;;) {
2158 		nlen = len + sizeof(*sp);
2159 		sp = (struct pfsync_state *)(mtod(m, caddr_t) + len);
2160 		if (space < nlen)
2161 			break;
2162 
2163 		mtx_enter(&st->mtx);
2164 		pf_state_export(sp, st);
2165 		mtx_leave(&st->mtx);
2166 
2167 		/* commit */
2168 		count++;
2169 		m->m_len = len = nlen;
2170 
2171 		if (st == sc->sc_bulk_snd.snd_tail) {
2172 			if (pfsync_bulk_snd_bus(sc, m, space,
2173 			    0, PFSYNC_BUS_END) == 0) {
2174 				/* couldn't fit the BUS */
2175 				st = NULL;
2176 				break;
2177 			}
2178 
2179 			/* this BUS is done */
2180 			pfsync_dprintf(sc, "bulk send done (%s)", __func__);
2181 			sc->sc_bulk_snd.snd_again = 0; /* XXX */
2182 			sc->sc_bulk_snd.snd_next = NULL;
2183 			sc->sc_bulk_snd.snd_tail = NULL;
2184 			return (count);
2185 		}
2186 
2187 		st = TAILQ_NEXT(st, entry_list);
2188 	}
2189 
2190 	/* there's still work to do */
2191 	sc->sc_bulk_snd.snd_next = st;
2192 	timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS);
2193 
2194 	return (count);
2195 }
2196 
2197 static unsigned int
2198 pfsync_bulk_snd_sub(struct pfsync_softc *sc,
2199     struct mbuf *m, const unsigned int space)
2200 {
2201 	struct pfsync_subheader *subh;
2202 	unsigned int count;
2203 	unsigned int len, nlen;
2204 
2205 	len = m->m_len;
2206 	nlen = len + sizeof(*subh);
2207 	if (nlen > space)
2208 		return (0);
2209 
2210 	subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len);
2211 
2212 	/*
2213 	 * pfsync_bulk_snd_states only updates m->m_len after
2214 	 * filling in a state after the offset we gave it.
2215 	 */
2216 	count = pfsync_bulk_snd_states(sc, m, space, nlen);
2217 	if (count == 0)
2218 		return (0);
2219 
2220 	subh->action = PFSYNC_ACT_UPD;
2221 	subh->len = sizeof(struct pfsync_state) >> 2;
2222 	subh->count = htons(count);
2223 
2224 	return (count);
2225 }
2226 
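/*
 * Descriptive note (not in the original source): pfsync_bulk_snd_start()
 * begins answering a bulk update request. It snapshots the head and
 * tail of pf_state_list, emits a BUS START message, and fills the first
 * packet with state updates. If a bulk send is already in progress,
 * snd_again is set so the run is restarted once the current one ends.
 */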
2227 static void
2228 pfsync_bulk_snd_start(struct pfsync_softc *sc)
2229 {
2230 	const unsigned int space = sc->sc_if.if_mtu -
2231 	    (sizeof(struct ip) + sizeof(struct pfsync_header));
2232 	struct mbuf *m;
2233 
2234 	rw_enter_read(&pf_state_list.pfs_rwl);
2235 
2236 	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
2237 	if (sc->sc_bulk_snd.snd_next != NULL) {
2238 		sc->sc_bulk_snd.snd_again = 1;
2239 		goto leave;
2240 	}
2241 
2242 	mtx_enter(&pf_state_list.pfs_mtx);
2243 	sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list);
2244 	sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list,
2245 	    pf_state_queue);
2246 	mtx_leave(&pf_state_list.pfs_mtx);
2247 
2248 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2249 	if (m == NULL)
2250 		goto leave;
2251 
2252 	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
2253 	if (!ISSET(m->m_flags, M_EXT)) {
2254 		/* some error++ */
2255 		m_freem(m); /* drop */
2256 		goto leave;
2257 	}
2258 
2259 	m_align(m, space);
2260 	m->m_len = 0;
2261 
2262 	if (sc->sc_bulk_snd.snd_tail == NULL) {
2263 		pfsync_dprintf(sc, "bulk send empty (%s)", __func__);
2264 
2265 		/* list is empty */
2266 		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
2267 			panic("%s: mtu is too low", __func__);
2268 		goto encap;
2269 	}
2270 
2271 	pfsync_dprintf(sc, "bulk send start (%s)", __func__);
2272 
2273 	/* start a bulk update. */
2274 	if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0)
2275 		panic("%s: mtu is too low", __func__);
2276 
2277 	/* fill it up with state updates. */
2278 	pfsync_bulk_snd_sub(sc, m, space);
2279 
2280 encap:
2281 	m->m_pkthdr.len = m->m_len;
2282 	m = pfsync_encap(sc, m);
2283 	if (m == NULL)
2284 		goto leave;
2285 
2286 	pfsync_sendout(sc, m);
2287 
2288 leave:
2289 	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
2290 
2291 	rw_exit_read(&pf_state_list.pfs_rwl);
2292 }
2293 
2294 static void
2295 pfsync_bulk_snd_tmo(void *arg)
2296 {
2297 	struct pfsync_softc *sc = arg;
2298 	const unsigned int space = sc->sc_if.if_mtu -
2299 	    (sizeof(struct ip) + sizeof(struct pfsync_header));
2300 	struct mbuf *m;
2301 
2302 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2303 	if (m == NULL) {
2304 		/* some error++ */
2305 		/* retry later */
2306 		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
2307 		    PFSYNC_BULK_SND_IVAL_MS);
2308 		return;
2309 	}
2310 
2311 	MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu);
2312 	if (!ISSET(m->m_flags, M_EXT)) {
2313 		/* some error++ */
2314 		m_freem(m);
2315 		/* retry later */
2316 		timeout_add_msec(&sc->sc_bulk_snd.snd_tmo,
2317 		    PFSYNC_BULK_SND_IVAL_MS);
2318 		return;
2319 	}
2320 
2321 	m_align(m, space);
2322 	m->m_len = 0;
2323 
2324 	rw_enter_read(&pf_state_list.pfs_rwl);
2325 	rw_enter_write(&sc->sc_bulk_snd.snd_lock);
2326 
2327 	if (sc->sc_bulk_snd.snd_next == NULL) {
2328 		/* there was no space in the previous packet for a BUS END */
2329 
2330 		if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0)
2331 			panic("%s: mtu is too low", __func__);
2332 
2333 		/* this bulk is done */
2334 		pfsync_dprintf(sc, "bulk send done (%s)", __func__);
2335 		sc->sc_bulk_snd.snd_again = 0; /* XXX */
2336 		sc->sc_bulk_snd.snd_tail = NULL;
2337 	} else {
2338 		pfsync_dprintf(sc, "bulk send again (%s)", __func__);
2339 
2340 		/* fill it up with state updates. */
2341 		pfsync_bulk_snd_sub(sc, m, space);
2342 	}
2343 
2344 	m->m_pkthdr.len = m->m_len;
2345 	m = pfsync_encap(sc, m);
2346 
2347 	rw_exit_write(&sc->sc_bulk_snd.snd_lock);
2348 	rw_exit_read(&pf_state_list.pfs_rwl);
2349 
2350 	if (m != NULL) {
2351 		NET_LOCK();
2352 		pfsync_sendout(sc, m);
2353 		NET_UNLOCK();
2354 	}
2355 }
2356 
2357 static void
2358 pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st)
2359 {
2360 	struct pfsync_slice *s = pfsync_slice_enter(sc, st);
2361 
2362 	switch (st->sync_state) {
2363 	case PFSYNC_S_UPD_C:
2364 	case PFSYNC_S_IACK:
2365 		pfsync_q_del(s, st);
2366 		/* FALLTHROUGH */
2367 	case PFSYNC_S_NONE:
2368 		pfsync_q_ins(s, st, PFSYNC_S_UPD);
2369 		break;
2370 
2371 	case PFSYNC_S_INS:
2372 	case PFSYNC_S_UPD:
2373 	case PFSYNC_S_DEL:
2374 		/* we're already handling it */
2375 		break;
2376 	default:
2377 		panic("%s: state %p unexpected sync_state %d",
2378 		    __func__, st, st->sync_state);
2379 	}
2380 
2381 	pfsync_slice_sched(s);
2382 	pfsync_slice_leave(sc, s);
2383 }
2384 
2385 #if defined(IPSEC)
2386 static void
2387 pfsync_out_tdb(struct tdb *tdb, void *buf)
2388 {
2389 	struct pfsync_tdb *ut = buf;
2390 
2391 	memset(ut, 0, sizeof(*ut));
2392 	ut->spi = tdb->tdb_spi;
2393 	memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst));
2394 	/*
2395 	 * When a failover happens, the master's rpl is probably above
2396 	 * what we see here (we may be up to a second late), so
2397 	 * increase it a bit for outbound tdbs to manage most such
2398 	 * situations.
2399 	 *
2400 	 * For now, just add an offset that is likely to be larger
2401 	 * than the number of packets we can see in one second. The RFC
2402 	 * just says the next packet must have a higher seq value.
2403 	 *
2404 	 * XXX What is a good algorithm for this? We could use
2405 	 * a rate-determined increase, but to know it, we would have
2406 	 * to extend struct tdb.
2407 	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
2408 	 * will soon be replaced anyway. For now, just don't handle
2409 	 * this edge case.
2410 	 */
2411 #define RPL_INCR 16384
2412 	ut->rpl = htobe64(tdb->tdb_rpl +
2413 	    (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0));
2414 	ut->cur_bytes = htobe64(tdb->tdb_cur_bytes);
2415 	ut->sproto = tdb->tdb_sproto;
2416 	ut->rdomain = htons(tdb->tdb_rdomain);
2417 }
2418 
2419 static struct pfsync_slice *
2420 pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t)
2421 {
2422 	/*
2423 	 * just use the first slice for all ipsec (for now) until
2424 	 * it's more obvious what property (eg, spi) we can distribute
2425 	 * tdbs over slices with.
2426 	 */
2427 	struct pfsync_slice *s = &sc->sc_slices[0];
2428 
2429 	if (!mtx_enter_try(&s->s_mtx)) {
2430 		mtx_enter(&s->s_mtx);
2431 		s->s_stat_contended++;
2432 	}
2433 	s->s_stat_locks++;
2434 
2435 	return (s);
2436 }
2437 
2438 static void
2439 pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb)
2440 {
2441 	size_t nlen = sizeof(struct pfsync_tdb);
2442 	struct mbuf *m = NULL;
2443 
2444 	KASSERT(s->s_len >= PFSYNC_MINPKT);
2445 
2446 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
2447 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2448 
2449 	if (TAILQ_EMPTY(&s->s_tdb_q))
2450 		nlen += sizeof(struct pfsync_subheader);
2451 
2452 	if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) {
2453 		m = pfsync_slice_write(s);
2454 		if (m != NULL) {
2455 			s->s_stat_enqueue++;
2456 			if (mq_enqueue(&s->s_sendq, m) == 0)
2457 				task_add(s->s_softnet, &s->s_send);
2458 		}
2459 
2460 		nlen = sizeof(struct pfsync_subheader) +
2461 		    sizeof(struct pfsync_tdb);
2462 	}
2463 
2464 	s->s_len += nlen;
2465 	TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry);
2466 	tdb->tdb_updates = 0;
2467 
2468 	if (!timeout_pending(&s->s_tmo))
2469 		timeout_add_sec(&s->s_tmo, 1);
2470 }
2471 
2472 static void
2473 pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb)
2474 {
2475 	MUTEX_ASSERT_LOCKED(&s->s_mtx);
2476 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2477 
2478 	TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry);
2479 
2480 	s->s_len -= sizeof(struct pfsync_tdb);
2481 	if (TAILQ_EMPTY(&s->s_tdb_q))
2482 		s->s_len -= sizeof(struct pfsync_subheader);
2483 }
2484 
2485 /*
2486  * the reference that pfsync has to a tdb is accounted for by the
2487  * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is
2488  * called after all other references to a tdb are dropped (with
2489  * tdb_unref) as part of the tdb_free().
2490  *
2491  * tdb_free() needs to wait for pfsync to let go of the tdb though,
2492  * which would be best handled by a reference count, but tdb_free
2493  * needs the NET_LOCK which pfsync is already fighting with. instead
2494  * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop
2495  * with tdb_free.
2496  */
2497 
2498 void
2499 pfsync_update_tdb(struct tdb *tdb, int output)
2500 {
2501 	struct pfsync_softc *sc;
2502 
2503 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2504 
2505 	smr_read_enter();
2506 	sc = SMR_PTR_GET(&pfsyncif);
2507 	if (sc != NULL) {
2508 		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
2509 
2510 		/* TDBF_PFSYNC is only changed while the slice mtx is held */
2511 		if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2512 			mtx_enter(&tdb->tdb_mtx);
2513 			SET(tdb->tdb_flags, TDBF_PFSYNC);
2514 			mtx_leave(&tdb->tdb_mtx);
2515 
2516 			pfsync_tdb_ins(s, tdb);
2517 		} else if (++tdb->tdb_updates >= sc->sc_maxupdates)
2518 			pfsync_slice_sched(s);
2519 
2520 		/* XXX no sync timestamp on tdbs to check */
2521 
2522 		pfsync_slice_leave(sc, s);
2523 	}
2524 	smr_read_leave();
2525 }
2526 
2527 void
2528 pfsync_delete_tdb(struct tdb *tdb)
2529 {
2530 	struct pfsync_softc *sc;
2531 
2532 	MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx);
2533 
2534 	smr_read_enter();
2535 	sc = SMR_PTR_GET(&pfsyncif);
2536 	if (sc != NULL) {
2537 		struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb);
2538 
2539 		/* TDBF_PFSYNC is only changed while the slice mtx is held */
2540 		if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2541 			pfsync_tdb_del(s, tdb);
2542 
2543 			mtx_enter(&tdb->tdb_mtx);
2544 			CLR(tdb->tdb_flags, TDBF_PFSYNC);
2545 			mtx_leave(&tdb->tdb_mtx);
2546 		}
2547 
2548 		pfsync_slice_leave(sc, s);
2549 	}
2550 	smr_read_leave();
2551 
2552 	/*
2553 	 * handle pfsync_slice_drop being called from pfsync_down
2554 	 * and the smr/slice access above won't work.
2555 	 */
2556 
2557 	mtx_enter(&tdb->tdb_mtx);
2558 	SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */
2559 	while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) {
2560 		msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT,
2561 		    "tdbfree", INFSLP);
2562 	}
2563 	mtx_leave(&tdb->tdb_mtx);
2564 }
2565 #endif /* defined(IPSEC) */
2566 
2567 struct pfsync_act {
2568 	void (*in)(struct pfsync_softc *, const caddr_t,
2569 	    unsigned int, unsigned int);
2570 	size_t len;
2571 };
2572 
2573 static void	pfsync_in_clr(struct pfsync_softc *,
2574 		    const caddr_t, unsigned int, unsigned int);
2575 static void	pfsync_in_iack(struct pfsync_softc *,
2576 		    const caddr_t, unsigned int, unsigned int);
2577 static void	pfsync_in_upd_c(struct pfsync_softc *,
2578 		    const caddr_t, unsigned int, unsigned int);
2579 static void	pfsync_in_ureq(struct pfsync_softc *,
2580 		    const caddr_t, unsigned int, unsigned int);
2581 static void	pfsync_in_del(struct pfsync_softc *,
2582 		    const caddr_t, unsigned int, unsigned int);
2583 static void	pfsync_in_del_c(struct pfsync_softc *,
2584 		    const caddr_t, unsigned int, unsigned int);
2585 static void	pfsync_in_bus(struct pfsync_softc *,
2586 		    const caddr_t, unsigned int, unsigned int);
2587 static void	pfsync_in_tdb(struct pfsync_softc *,
2588 		    const caddr_t, unsigned int, unsigned int);
2589 static void	pfsync_in_ins(struct pfsync_softc *,
2590 		    const caddr_t, unsigned int, unsigned int);
2591 static void	pfsync_in_upd(struct pfsync_softc *,
2592 		    const caddr_t, unsigned int, unsigned int);
2593 
2594 static const struct pfsync_act pfsync_acts[] = {
2595 	[PFSYNC_ACT_CLR] =
2596 	    { pfsync_in_clr,	sizeof(struct pfsync_clr) },
2597 	[PFSYNC_ACT_INS_ACK] =
2598 	    { pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
2599 	[PFSYNC_ACT_UPD_C] =
2600 	    { pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
2601 	[PFSYNC_ACT_UPD_REQ] =
2602 	    { pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
2603 	[PFSYNC_ACT_DEL] =
2604 	    { pfsync_in_del,	sizeof(struct pfsync_state) },
2605 	[PFSYNC_ACT_DEL_C] =
2606 	    { pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
2607 	[PFSYNC_ACT_BUS] =
2608 	    { pfsync_in_bus,	sizeof(struct pfsync_bus) },
2609 	[PFSYNC_ACT_INS] =
2610 	    { pfsync_in_ins,	sizeof(struct pfsync_state) },
2611 	[PFSYNC_ACT_UPD] =
2612 	    { pfsync_in_upd,	sizeof(struct pfsync_state) },
2613 	[PFSYNC_ACT_TDB] =
2614 	    { pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
2615 };
2616 
2617 static void
2618 pfsync_in_skip(struct pfsync_softc *sc,
2619     const caddr_t buf, unsigned int mlen, unsigned int count)
2620 {
2621 	/* nop */
2622 }
2623 
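/*
 * Descriptive note (not in the original source): pfsync_input() validates
 * a received pfsync packet (interface, TTL, header, version, length) and
 * then walks the chain of subheaders, dispatching each batch of messages
 * to the matching handler in pfsync_acts[]. Unknown actions with a usable
 * length are skipped.
 */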
2624 static struct mbuf *
2625 pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen)
2626 {
2627 	struct pfsync_softc *sc;
2628 	struct pfsync_header *ph;
2629 	struct pfsync_subheader *subh;
2630 	unsigned int len;
2631 	void (*in)(struct pfsync_softc *,
2632 	    const caddr_t, unsigned int, unsigned int);
2633 
2634 	pfsyncstat_inc(pfsyncs_ipackets);
2635 
2636 	if (!pf_status.running)
2637 		return (m);
2638 
2639 	/*
2640 	 * pfsyncif is only set if it is up and running correctly.
2641 	 */
2642 	smr_read_enter();
2643 	sc = SMR_PTR_GET(&pfsyncif);
2644 	if (sc == NULL)
2645 		goto leave;
2646 
2647 	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
2648 		pfsyncstat_inc(pfsyncs_badif);
2649 		goto leave;
2650 	}
2651 
2652 	/* verify that the IP TTL is 255. */
2653 	if (ttl != PFSYNC_DFLTTL) {
2654 		pfsyncstat_inc(pfsyncs_badttl);
2655 		goto leave;
2656 	}
2657 
2658 	m_adj(m, hlen);
2659 
2660 	if (m->m_pkthdr.len < sizeof(*ph)) {
2661 		pfsyncstat_inc(pfsyncs_hdrops);
2662 		goto leave;
2663 	}
2664 	if (m->m_len < sizeof(*ph)) {
2665 		m = m_pullup(m, sizeof(*ph));
2666 		if (m == NULL)
2667 			goto leave;
2668 	}
2669 
2670 	ph = mtod(m, struct pfsync_header *);
2671 	if (ph->version != PFSYNC_VERSION) {
2672 		pfsyncstat_inc(pfsyncs_badver);
2673 		goto leave;
2674 	}
2675 
2676 	len = ntohs(ph->len);
2677 	if (m->m_pkthdr.len < len) {
2678 		pfsyncstat_inc(pfsyncs_badlen);
2679 		goto leave;
2680 	}
2681 	if (m->m_pkthdr.len > len)
2682 		m->m_pkthdr.len = len;
2683 
2684 	/* ok, it's serious now */
2685 	refcnt_take(&sc->sc_refs);
2686 	smr_read_leave();
2687 
2688 	counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len);
2689 
2690 	m_adj(m, sizeof(*ph));
2691 
2692 	while (m->m_pkthdr.len >= sizeof(*subh)) {
2693 		unsigned int action, mlen, count;
2694 
2695 		if (m->m_len < sizeof(*subh)) {
2696 			m = m_pullup(m, sizeof(*subh));
2697 			if (m == NULL)
2698 				goto rele;
2699 		}
2700 		subh = mtod(m, struct pfsync_subheader *);
2701 
2702 		action = subh->action;
2703 		mlen = subh->len << 2;
2704 		count = ntohs(subh->count);
2705 
2706 		if (action >= PFSYNC_ACT_MAX ||
2707 		    action >= nitems(pfsync_acts) ||
2708 		    mlen < pfsync_acts[subh->action].len) {
2709 			/*
2710 			 * subheaders are always followed by at least one
2711 			 * message, so if the peer is new
2712 			 * enough to tell us how big its messages are then we
2713 			 * know enough to skip them.
2714 			 */
2715 			if (count == 0 || mlen == 0) {
2716 				pfsyncstat_inc(pfsyncs_badact);
2717 				goto rele;
2718 			}
2719 
2720 			in = pfsync_in_skip;
2721 		} else {
2722 			in = pfsync_acts[action].in;
2723 			if (in == NULL)
2724 				in = pfsync_in_skip;
2725 		}
2726 
2727 		m_adj(m, sizeof(*subh));
2728 		len = mlen * count;
2729 		if (len > m->m_pkthdr.len) {
2730 			pfsyncstat_inc(pfsyncs_badlen);
2731 			goto rele;
2732 		}
2733 		if (m->m_len < len) {
2734 			m = m_pullup(m, len);
2735 			if (m == NULL)
2736 				goto rele;
2737 		}
2738 
2739 		(*in)(sc, mtod(m, caddr_t), mlen, count);
2740 		m_adj(m, len);
2741 	}
2742 
2743 rele:
2744 	refcnt_rele_wake(&sc->sc_refs);
2745 	return (m);
2746 
2747 leave:
2748 	smr_read_leave();
2749 	return (m);
2750 }
2751 
2752 static void
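/*
 * Descriptive note (not in the original source): pfsync_in_clr() handles
 * a "clear states" message. Every local state matching the given creator
 * id (and interface, if one is named) is flagged PFSTATE_NOSYNC and
 * removed, so the deletions are not echoed back to the peer.
 */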
2753 pfsync_in_clr(struct pfsync_softc *sc,
2754     const caddr_t buf, unsigned int mlen, unsigned int count)
2755 {
2756 	const struct pfsync_clr *clr;
2757 	struct pf_state *head, *tail, *st, *next;
2758 	struct pfi_kif *kif;
2759 	uint32_t creatorid;
2760 	unsigned int i;
2761 
2762 	rw_enter_read(&pf_state_list.pfs_rwl);
2763 
2764 	/* get a view of the state list */
2765 	mtx_enter(&pf_state_list.pfs_mtx);
2766 	head = TAILQ_FIRST(&pf_state_list.pfs_list);
2767 	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
2768 	mtx_leave(&pf_state_list.pfs_mtx);
2769 
2770 	PF_LOCK();
2771 	for (i = 0; i < count; i++) {
2772 		clr = (struct pfsync_clr *)(buf + i * mlen);
2773 
2774 		creatorid = clr->creatorid;
2775 		if (clr->ifname[0] == '\0')
2776 			kif = NULL;
2777 		else {
2778 			kif = pfi_kif_find(clr->ifname);
2779 			if (kif == NULL)
2780 				continue;
2781 		}
2782 
2783 		st = NULL;
2784 		next = head;
2785 
2786 		PF_STATE_ENTER_WRITE();
2787 		while (st != tail) {
2788 			st = next;
2789 			next = TAILQ_NEXT(st, entry_list);
2790 
2791 			if (creatorid != st->creatorid)
2792 				continue;
2793 			if (kif != NULL && kif != st->kif)
2794 				continue;
2795 
2796 			mtx_enter(&st->mtx);
2797 			SET(st->state_flags, PFSTATE_NOSYNC);
2798 			mtx_leave(&st->mtx);
2799 			pf_remove_state(st);
2800 		}
2801 		PF_STATE_EXIT_WRITE();
2802 	}
2803 	PF_UNLOCK();
2804 
2805 	rw_exit_read(&pf_state_list.pfs_rwl);
2806 }
2807 
2808 static void
2809 pfsync_in_ins(struct pfsync_softc *sc,
2810     const caddr_t buf, unsigned int mlen, unsigned int count)
2811 {
2812 	const struct pfsync_state *sp;
2813 	sa_family_t af1, af2;
2814 	unsigned int i;
2815 
2816 	PF_LOCK();
2817 	for (i = 0; i < count; i++) {
2818 		sp = (struct pfsync_state *)(buf + mlen * i);
2819 		af1 = sp->key[0].af;
2820 		af2 = sp->key[1].af;
2821 
2822 		/* check for invalid values */
2823 		if (sp->timeout >= PFTM_MAX ||
2824 		    sp->src.state > PF_TCPS_PROXY_DST ||
2825 		    sp->dst.state > PF_TCPS_PROXY_DST ||
2826 		    sp->direction > PF_OUT ||
2827 		    (((af1 || af2) &&
2828 		     ((af1 != AF_INET && af1 != AF_INET6) ||
2829 		      (af2 != AF_INET && af2 != AF_INET6))) ||
2830 		     (sp->af != AF_INET && sp->af != AF_INET6))) {
2831 			pfsyncstat_inc(pfsyncs_badval);
2832 			continue;
2833 		}
2834 
2835 		if (pf_state_import(sp, PFSYNC_SI_PFSYNC) == ENOMEM) {
2836 			/* drop out, but process the rest of the actions */
2837 			break;
2838 		}
2839 	}
2840 	PF_UNLOCK();
2841 }
2842 
2843 static void
2844 pfsync_in_iack(struct pfsync_softc *sc,
2845     const caddr_t buf, unsigned int mlen, unsigned int count)
2846 {
2847 	const struct pfsync_ins_ack *ia;
2848 	struct pf_state_cmp id_key;
2849 	struct pf_state *st;
2850 	unsigned int i;
2851 
2852 	for (i = 0; i < count; i++) {
2853 		ia = (struct pfsync_ins_ack *)(buf + mlen * i);
2854 
2855 		id_key.id = ia->id;
2856 		id_key.creatorid = ia->creatorid;
2857 
2858 		PF_STATE_ENTER_READ();
2859 		st = pf_find_state_byid(&id_key);
2860 		pf_state_ref(st);
2861 		PF_STATE_EXIT_READ();
2862 		if (st == NULL)
2863 			continue;
2864 
2865 		if (READ_ONCE(st->sync_defer) != NULL)
2866 			pfsync_deferred(sc, st);
2867 
2868 		pf_state_unref(st);
2869 	}
2870 }
2871 
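/*
 * Descriptive note (not in the original source): pfsync_upd_tcp() merges
 * the peer's view of a TCP state into ours. It refuses updates that would
 * move the connection state or sequence windows backwards (except for
 * syn-proxy transitions) and returns the number of directions in which
 * the local state was newer than the update.
 */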
2872 static int
2873 pfsync_upd_tcp(struct pf_state *st, const struct pfsync_state_peer *src,
2874     const struct pfsync_state_peer *dst)
2875 {
2876 	int sync = 0;
2877 
2878 	/*
2879 	 * The state should never go backwards except
2880 	 * for syn-proxy states.  Neither should the
2881 	 * sequence window slide backwards.
2882 	 */
2883 	if ((st->src.state > src->state &&
2884 	    (st->src.state < PF_TCPS_PROXY_SRC ||
2885 	     src->state >= PF_TCPS_PROXY_SRC)) ||
2886 
2887 	    (st->src.state == src->state &&
2888 	     SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
2889 		sync++;
2890 	else
2891 		pf_state_peer_ntoh(src, &st->src);
2892 
2893 	if ((st->dst.state > dst->state) ||
2894 
2895 	    (st->dst.state == dst->state &&
2896 	     SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
2897 		sync++;
2898 	else
2899 		pf_state_peer_ntoh(dst, &st->dst);
2900 
2901 	return (sync);
2902 }
2903 
2904 static void
2905 pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st,
2906     const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst,
2907     uint8_t timeout)
2908 {
2909 	struct pf_state_scrub *sscrub = NULL;
2910 	struct pf_state_scrub *dscrub = NULL;
2911 	int sync;
2912 
2913 	if (src->scrub.scrub_flag && st->src.scrub == NULL) {
2914 		sscrub = pf_state_scrub_get();
2915 		if (sscrub == NULL) {
2916 			/* inc error? */
2917 			goto out;
2918 		}
2919 	}
2920 	if (dst->scrub.scrub_flag && st->dst.scrub == NULL) {
2921 		dscrub = pf_state_scrub_get();
2922 		if (dscrub == NULL) {
2923 			/* inc error? */
2924 			goto out;
2925 		}
2926 	}
2927 
2928 	if (READ_ONCE(st->sync_defer) != NULL)
2929 		pfsync_deferred(sc, st);
2930 
2931 	mtx_enter(&st->mtx);
2932 
2933 	/* attach the scrub memory if needed */
2934 	if (sscrub != NULL && st->src.scrub == NULL) {
2935 		st->src.scrub = sscrub;
2936 		sscrub = NULL;
2937 	}
2938 	if (dscrub != NULL && st->dst.scrub == NULL) {
2939 		st->dst.scrub = dscrub;
2940 		dscrub = NULL;
2941 	}
2942 
2943 	if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
2944 		sync = pfsync_upd_tcp(st, src, dst);
2945 	else {
2946 		sync = 0;
2947 
2948 		/*
2949 		 * Non-TCP protocol state machines always go
2950 		 * forwards
2951 		 */
2952 		if (st->src.state > src->state)
2953 			sync++;
2954 		else
2955 			pf_state_peer_ntoh(src, &st->src);
2956 
2957 		if (st->dst.state > dst->state)
2958 			sync++;
2959 		else
2960 			pf_state_peer_ntoh(dst, &st->dst);
2961 	}
2962 
2963 	st->pfsync_time = getuptime();
2964 	if (sync < 2) {
2965 		st->expire = st->pfsync_time;
2966 		st->timeout = timeout;
2967 	}
2968 
2969 	mtx_leave(&st->mtx);
2970 
2971 	if (sync) {
2972 		pfsyncstat_inc(pfsyncs_stale);
2973 		pfsync_update_state(st);
2974 	}
2975 
2976 out:
2977 	if (sscrub != NULL)
2978 		pf_state_scrub_put(sscrub);
2979 	if (dscrub != NULL)
2980 		pf_state_scrub_put(dscrub);
2981 }
2982 
2983 
2984 static void
2985 pfsync_in_upd(struct pfsync_softc *sc,
2986     const caddr_t buf, unsigned int mlen, unsigned int count)
2987 {
2988 	const struct pfsync_state *sp;
2989 	struct pf_state_cmp id_key;
2990 	struct pf_state *st;
2991 	int error;
2992 	unsigned int i;
2993 
2994 	for (i = 0; i < count; i++) {
2995 		sp = (struct pfsync_state *)(buf + mlen * i);
2996 
2997 		/* check for invalid values */
2998 		if (sp->timeout >= PFTM_MAX ||
2999 		    sp->src.state > PF_TCPS_PROXY_DST ||
3000 		    sp->dst.state > PF_TCPS_PROXY_DST) {
3001 			pfsyncstat_inc(pfsyncs_badval);
3002 			continue;
3003 		}
3004 
3005 		id_key.id = sp->id;
3006 		id_key.creatorid = sp->creatorid;
3007 
3008 		PF_STATE_ENTER_READ();
3009 		st = pf_find_state_byid(&id_key);
3010 		pf_state_ref(st);
3011 		PF_STATE_EXIT_READ();
3012 		if (st == NULL) {
3013 			/* insert the update */
3014 			PF_LOCK();
3015 			error = pf_state_import(sp, PFSYNC_SI_PFSYNC);
3016 			if (error)
3017 				pfsyncstat_inc(pfsyncs_badstate);
3018 			PF_UNLOCK();
3019 			continue;
3020 		}
3021 
3022 		pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout);
3023 
3024 		pf_state_unref(st);
3025 	}
3026 }
3027 
3028 static struct mbuf *
3029 pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count)
3030 {
3031 	struct mbuf *m;
3032 	unsigned int mlen;
3033 
3034 	m = m_gethdr(M_DONTWAIT, MT_DATA);
3035 	if (m == NULL) {
3036 		pfsyncstat_inc(pfsyncs_onomem);
3037 		return (NULL);
3038 	}
3039 
3040 	mlen = max_linkhdr + sizeof(sc->sc_template) +
3041 	    sizeof(struct pfsync_header) +
3042 	    sizeof(struct pfsync_subheader) +
3043 	    sizeof(struct pfsync_upd_req) * count;
3044 
3045 	if (mlen > MHLEN) {
3046 		MCLGETL(m, M_DONTWAIT, mlen);
3047 		if (!ISSET(m->m_flags, M_EXT)) {
3048 			m_freem(m);
3049 			return (NULL);
3050 		}
3051 	}
3052 
3053 	m_align(m, 0);
3054 	m->m_len = 0;
3055 
3056 	return (m);
3057 }
3058 
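/*
 * Descriptive note (not in the original source): pfsync_in_upd_c()
 * processes compressed state updates. Updates for states we do not know
 * about are answered with a batch of update requests
 * (PFSYNC_ACT_UPD_REQ) so the peer resends the full states.
 */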
3059 static void
3060 pfsync_in_upd_c(struct pfsync_softc *sc,
3061     const caddr_t buf, unsigned int mlen, unsigned int count)
3062 {
3063 	const struct pfsync_upd_c *up;
3064 	struct pf_state_cmp id_key;
3065 	struct pf_state *st;
3066 	unsigned int i;
3067 	struct mbuf *m = NULL;
3068 	unsigned int rcount = 0;
3069 
3070 	for (i = 0; i < count; i++) {
3071 		up = (struct pfsync_upd_c *)(buf + mlen * i);
3072 
3073 		/* check for invalid values */
3074 		if (up->timeout >= PFTM_MAX ||
3075 		    up->src.state > PF_TCPS_PROXY_DST ||
3076 		    up->dst.state > PF_TCPS_PROXY_DST) {
3077 			pfsyncstat_inc(pfsyncs_badval);
3078 			continue;
3079 		}
3080 
3081 		id_key.id = up->id;
3082 		id_key.creatorid = up->creatorid;
3083 
3084 		PF_STATE_ENTER_READ();
3085 		st = pf_find_state_byid(&id_key);
3086 		pf_state_ref(st);
3087 		PF_STATE_EXIT_READ();
3088 		if (st == NULL) {
3089 			/* We don't have this state. Ask for it. */
3090 			struct pfsync_upd_req *ur;
3091 
3092 			if (m == NULL) {
3093 				m = pfsync_upd_req_init(sc, count);
3094 				if (m == NULL) {
3095 					pfsyncstat_inc(pfsyncs_onomem);
3096 					continue;
3097 				}
3098 			}
3099 
3100 			m = m_prepend(m, sizeof(*ur), M_DONTWAIT);
3101 			if (m == NULL) {
3102 				pfsyncstat_inc(pfsyncs_onomem);
3103 				continue;
3104 			}
3105 
3106 			ur = mtod(m, struct pfsync_upd_req *);
3107 			ur->id = up->id;
3108 			ur->creatorid = up->creatorid;
3109 			rcount++;
3110 
3111 			continue;
3112 		}
3113 
3114 		pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout);
3115 
3116 		pf_state_unref(st);
3117 	}
3118 
3119 	if (m != NULL) {
3120 		struct pfsync_subheader *subh;
3121 
3122 		m = m_prepend(m, sizeof(*subh), M_DONTWAIT);
3123 		if (m == NULL) {
3124 			pfsyncstat_inc(pfsyncs_onomem);
3125 			return;
3126 		}
3127 
3128 		subh = mtod(m, struct pfsync_subheader *);
3129 		subh->action = PFSYNC_ACT_UPD_REQ;
3130 		subh->len = sizeof(struct pfsync_upd_req) >> 2;
3131 		subh->count = htons(rcount);
3132 
3133 		m = pfsync_encap(sc, m);
3134 		if (m == NULL) {
3135 			pfsyncstat_inc(pfsyncs_onomem);
3136 			return;
3137 		}
3138 
3139 		pfsync_sendout(sc, m);
3140 	}
3141 }
3142 
3143 static void
3144 pfsync_in_ureq(struct pfsync_softc *sc,
3145     const caddr_t buf, unsigned int mlen, unsigned int count)
3146 {
3147 	const struct pfsync_upd_req *ur;
3148 	struct pf_state_cmp id_key;
3149 	struct pf_state *st;
3150 	unsigned int i;
3151 
3152 	for (i = 0; i < count; i++) {
3153 		ur = (struct pfsync_upd_req *)(buf + mlen * i);
3154 
3155 		id_key.id = ur->id;
3156 		id_key.creatorid = ur->creatorid;
3157 
3158 		if (id_key.id == 0 && id_key.creatorid == 0) {
3159 			pfsync_bulk_snd_start(sc);
3160 			continue;
3161 		}
3162 
3163 		PF_STATE_ENTER_READ();
3164 		st = pf_find_state_byid(&id_key);
3165 		if (st != NULL && st->timeout < PFTM_MAX &&
3166 		    !ISSET(st->state_flags, PFSTATE_NOSYNC))
3167 			pf_state_ref(st);
3168 		else
3169 			st = NULL;
3170 		PF_STATE_EXIT_READ();
3171 		if (st == NULL) {
3172 			pfsyncstat_inc(pfsyncs_badstate);
3173 			continue;
3174 		}
3175 
3176 		pfsync_update_state_req(sc, st);
3177 
3178 		pf_state_unref(st);
3179 	}
3180 }
3181 
3182 static void
3183 pfsync_in_del(struct pfsync_softc *sc,
3184     const caddr_t buf, unsigned int mlen, unsigned int count)
3185 {
3186 	const struct pfsync_state *sp;
3187 	struct pf_state_cmp id_key;
3188 	struct pf_state *st;
3189 	unsigned int i;
3190 
3191 	PF_LOCK();
3192 	PF_STATE_ENTER_WRITE();
3193 	for (i = 0; i < count; i++) {
3194 		sp = (struct pfsync_state *)(buf + mlen * i);
3195 
3196 		id_key.id = sp->id;
3197 		id_key.creatorid = sp->creatorid;
3198 
3199 		st = pf_find_state_byid(&id_key);
3200 		if (st == NULL) {
3201 			pfsyncstat_inc(pfsyncs_badstate);
3202 			continue;
3203 		}
3204 
3205 		mtx_enter(&st->mtx);
3206 		SET(st->state_flags, PFSTATE_NOSYNC);
3207 		mtx_leave(&st->mtx);
3208 		pf_remove_state(st);
3209 	}
3210 	PF_STATE_EXIT_WRITE();
3211 	PF_UNLOCK();
3212 }
3213 
3214 static void
3215 pfsync_in_del_c(struct pfsync_softc *sc,
3216     const caddr_t buf, unsigned int mlen, unsigned int count)
3217 {
3218 	const struct pfsync_del_c *sp;
3219 	struct pf_state_cmp id_key;
3220 	struct pf_state *st;
3221 	unsigned int i;
3222 
3223 	PF_LOCK();
3224 	PF_STATE_ENTER_WRITE();
3225 	for (i = 0; i < count; i++) {
3226 		sp = (struct pfsync_del_c *)(buf + mlen * i);
3227 
3228 		id_key.id = sp->id;
3229 		id_key.creatorid = sp->creatorid;
3230 
3231 		st = pf_find_state_byid(&id_key);
3232 		if (st == NULL) {
3233 			pfsyncstat_inc(pfsyncs_badstate);
3234 			continue;
3235 		}
3236 
3237 		mtx_enter(&st->mtx);
3238 		SET(st->state_flags, PFSTATE_NOSYNC);
3239 		mtx_leave(&st->mtx);
3240 		pf_remove_state(st);
3241 	}
3242 	PF_STATE_EXIT_WRITE();
3243 	PF_UNLOCK();
3244 }
3245 
3246 static void
3247 pfsync_in_bus(struct pfsync_softc *sc,
3248     const caddr_t buf, unsigned int len, unsigned int count)
3249 {
3250 	const struct pfsync_bus *bus = (struct pfsync_bus *)buf;
3251 
3252 	switch (bus->status) {
3253 	case PFSYNC_BUS_START:
3254 		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START);
3255 		break;
3256 
3257 	case PFSYNC_BUS_END:
3258 		pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END);
3259 		break;
3260 	}
3261 }
3262 
3263 #if defined(IPSEC)
3264 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
3265 static void
3266 pfsync_update_net_tdb(const struct pfsync_tdb *pt)
3267 {
3268 	struct tdb *tdb;
3269 
3270 	NET_ASSERT_LOCKED();
3271 
3272 	/* check for invalid values */
3273 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
3274 	    (pt->dst.sa.sa_family != AF_INET &&
3275 	     pt->dst.sa.sa_family != AF_INET6))
3276 		goto bad;
3277 
3278 	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
3279 	    (union sockaddr_union *)&pt->dst, pt->sproto);
3280 	if (tdb) {
3281 		uint64_t rpl = betoh64(pt->rpl);
3282 		uint64_t cur_bytes = betoh64(pt->cur_bytes);
3283 
3284 		/* Neither replay nor byte counter should ever decrease. */
3285 		mtx_enter(&tdb->tdb_mtx);
3286 		if (rpl >= tdb->tdb_rpl &&
3287 		    cur_bytes >= tdb->tdb_cur_bytes) {
3288 			tdb->tdb_rpl = rpl;
3289 			tdb->tdb_cur_bytes = cur_bytes;
3290 		}
3291 		mtx_leave(&tdb->tdb_mtx);
3292 
3293 		tdb_unref(tdb);
3294 	}
3295 	return;
3296 
3297  bad:
3298 	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
3299 	    "invalid value");
3300 	pfsyncstat_inc(pfsyncs_badstate);
3301 	return;
3302 }
3303 #endif
3304 
3305 static void
3306 pfsync_in_tdb(struct pfsync_softc *sc,
3307     const caddr_t buf, unsigned int len, unsigned int count)
3308 {
3309 #if defined(IPSEC)
3310 	const struct pfsync_tdb *tp;
3311 	unsigned int i;
3312 
3313 	for (i = 0; i < count; i++) {
3314 		tp = (const struct pfsync_tdb *)(buf + len * i);
3315 		pfsync_update_net_tdb(tp);
3316 	}
3317 #endif
3318 }
3319 
3320 int
3321 pfsync_input4(struct mbuf **mp, int *offp, int proto, int af)
3322 {
3323 	struct mbuf *m = *mp;
3324 	struct ip *ip;
3325 
3326 	ip = mtod(m, struct ip *);
3327 
3328 	m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2);
3329 
3330 	m_freem(m);
3331 	*mp = NULL;
3332 
3333 	return (IPPROTO_DONE);
3334 }
3335 
3336 int
3337 pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
3338 {
3339 	struct pfsyncstats pfsyncstat;
3340 
3341 	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
3342 	memset(&pfsyncstat, 0, sizeof pfsyncstat);
3343 	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
3344 	    pfsyncs_ncounters, NULL);
3345 	return (sysctl_rdstruct(oldp, oldlenp, newp,
3346 	    &pfsyncstat, sizeof(pfsyncstat)));
3347 }
3348 
3349 int
3350 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
3351     void *newp, size_t newlen)
3352 {
3353 	/* All sysctl names at this level are terminal. */
3354 	if (namelen != 1)
3355 		return (ENOTDIR);
3356 
3357 	switch (name[0]) {
3358 	case PFSYNCCTL_STATS:
3359 		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
3360 	default:
3361 		return (ENOPROTOOPT);
3362 	}
3363 }
3364