xref: /dragonfly/sys/net/ipfw/ip_fw2.c (revision 7bcb6caf)
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27 
28 /*
29  * Implement IP packet firewall (new version)
30  */
31 
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53 
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58 
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79 
80 #include <net/ipfw/ip_fw2.h>
81 
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85 	if (fw_debug > 0) \
86 		kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...)	((void)0)
90 #endif
91 
92 /*
93  * Description of per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * The following graph shows how an operation on the per-CPU rule
99  * lists is performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; the current rule's duplicates
138  *    on the other CPUs can safely be accessed read-only through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of relocating the rule3's position in
148  *       netisr1 by iterating the layer3_chain in netisr1, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
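/*
 * Illustrative sketch only (the real dispatch routines live later in
 * this file): how step 2b above keeps the non-first CPUs from
 * re-iterating their layer3_chain.  example_dup_rule() and
 * example_link_rule() are hypothetical helpers.
 */
#if 0
static void
example_insert_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ip_fw *rule;

	/* Duplicate the rule and link it between the precomputed pair. */
	rule = example_dup_rule(fwmsg->ioc_rule);
	example_link_rule(rule, fwmsg->prev_rule, fwmsg->next_rule);

	/* Chain the previous cpu's duplicate to this one. */
	if (fwmsg->sibling != NULL)
		fwmsg->sibling->sibling = rule;
	fwmsg->sibling = rule;

	/*
	 * Advance prev_rule/next_rule to their duplicates on the next
	 * cpu, so the next netisr can link without any list walk.
	 */
	if (fwmsg->prev_rule != NULL)
		fwmsg->prev_rule = fwmsg->prev_rule->sibling;
	if (fwmsg->next_rule != NULL)
		fwmsg->next_rule = fwmsg->next_rule->sibling;

	/* Forward to the next netisr; replies once all cpus are done. */
	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
}
#endif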
153 
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by 'keep-state'
166  * and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states we stop creating new ones.  This is done to
170  * avoid consuming too much memory, and also too much time when
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by the 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks we stop creating new ones.  This is done to
203  * avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers: the track counter RB tree is
206  * shared between CPUs, while the track RB tree is per-cpu.  States
207  * generated by the 'limit' option are linked to the track in addition to
208  * the per-cpu state RB tree, mainly to ease expiration.  E.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
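/*
 * Put differently: each per-cpu track points at the shared counter
 * (t_count -> tc_count), which is updated atomically as states are
 * created and deleted, so a 'limit' is enforced machine-wide while
 * track lookups stay per-cpu; ipfw_trkcnt_token only serializes
 * insertion into and removal from the shared track counter RB tree.
 */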
243 
244 #define IPFW_AUTOINC_STEP_MIN	1
245 #define IPFW_AUTOINC_STEP_MAX	1000
246 #define IPFW_AUTOINC_STEP_DEF	100
247 
248 #define IPFW_TABLE_MAX_DEF	64
249 
250 #define	IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET	31	/* set number for the default rule */
252 
253 #define MATCH_REVERSE		0
254 #define MATCH_FORWARD		1
255 #define MATCH_NONE		2
256 #define MATCH_UNKNOWN		3
257 
258 #define TIME_LEQ(a, b)		((a) - (b) <= 0)
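/*
 * NOTE: TIME_LEQ() compares through a signed difference, so the
 * ordering stays correct across counter wraparound as long as the
 * two timestamps are less than half the type's range apart.
 */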
259 
260 #define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
261 #define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
262 				 (IPFW_STATE_TCPFLAGS << 8))
263 
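/*
 * st_state keeps the forward-direction TCP flags in its low byte and
 * the reverse-direction flags in the high byte; the BOTH_* masks
 * below select a flag in both direction bytes at once.
 */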
264 #define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
265 #define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
266 #define BOTH_RST		(TH_RST | (TH_RST << 8))
267 /* TH_ACK here means FIN was ACKed. */
268 #define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))
269 
270 #define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
271 				 (((s)->st_state & BOTH_RST) ||		\
272 				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
273 
274 #define O_ANCHOR		O_NOP
275 
276 #define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
277 #define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
278 				 ((struct ipfw_xlat *)(s))->xlat_invalid)
279 
280 #define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
281 #define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2
282 
283 #define IPFW_XLATE_INSERT	0x0001
284 #define IPFW_XLATE_FORWARD	0x0002
285 #define IPFW_XLATE_OUTPUT	0x0004
286 
287 struct netmsg_ipfw {
288 	struct netmsg_base	base;
289 	const struct ipfw_ioc_rule *ioc_rule;
290 	struct ip_fw		*next_rule;
291 	struct ip_fw		*prev_rule;
292 	struct ip_fw		*sibling;
293 	uint32_t		rule_flags;
294 	struct ip_fw		**cross_rules;
295 };
296 
297 struct netmsg_del {
298 	struct netmsg_base	base;
299 	struct ip_fw		*start_rule;
300 	struct ip_fw		*prev_rule;
301 	uint16_t		rulenum;
302 	uint8_t			from_set;
303 	uint8_t			to_set;
304 };
305 
306 struct netmsg_zent {
307 	struct netmsg_base	base;
308 	struct ip_fw		*start_rule;
309 	uint16_t		rulenum;
310 	uint16_t		log_only;
311 };
312 
313 struct netmsg_cpstate {
314 	struct netmsg_base	base;
315 	struct ipfw_ioc_state	*ioc_state;
316 	int			state_cntmax;
317 	int			state_cnt;
318 };
319 
320 struct netmsg_tblent {
321 	struct netmsg_base	base;
322 	struct sockaddr		*key;
323 	struct sockaddr		*netmask;
324 	struct ipfw_tblent	*sibling;
325 	int			tableid;
326 };
327 
328 struct netmsg_tblflush {
329 	struct netmsg_base	base;
330 	int			tableid;
331 	int			destroy;
332 };
333 
334 struct netmsg_tblexp {
335 	struct netmsg_base	base;
336 	time_t			expire;
337 	int			tableid;
338 	int			cnt;
339 	int			expcnt;
340 	struct radix_node_head	*rnh;
341 };
342 
343 struct ipfw_table_cp {
344 	struct ipfw_ioc_tblent	*te;
345 	int			te_idx;
346 	int			te_cnt;
347 };
348 
349 struct ip_fw_local {
350 	/*
351 	 * offset	The offset of a fragment. offset != 0 means that
352 	 *	we have a fragment at this offset of an IPv4 packet.
353 	 *	offset == 0 means that (if this is an IPv4 packet)
354 	 *	this is the first or only fragment.
355 	 */
356 	u_short			offset;
357 
358 	/*
359 	 * Local copies of addresses. They are only valid if we have
360 	 * an IP packet.
361 	 *
362 	 * proto	The protocol. Set to 0 for non-ip packets,
363 	 *	or to the protocol read from the packet otherwise.
364 	 *	proto != 0 means that we have an IPv4 packet.
365 	 *
366 	 * src_port, dst_port	port numbers, in HOST format. Only
367 	 *	valid for TCP and UDP packets.
368 	 *
369 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
370 	 *	Only valid for IPv4 packets.
371 	 */
372 	uint8_t			proto;
373 	uint16_t		src_port;	/* NOTE: host format	*/
374 	uint16_t		dst_port;	/* NOTE: host format	*/
375 	struct in_addr		src_ip;		/* NOTE: network format	*/
376 	struct in_addr		dst_ip;		/* NOTE: network format	*/
377 	uint16_t		ip_len;
378 	struct tcphdr		*tcp;
379 };
380 
381 struct ipfw_addrs {
382 	uint32_t		addr1;	/* host byte order */
383 	uint32_t		addr2;	/* host byte order */
384 };
385 
386 struct ipfw_ports {
387 	uint16_t		port1;	/* host byte order */
388 	uint16_t		port2;	/* host byte order */
389 };
390 
391 struct ipfw_key {
392 	union {
393 		struct ipfw_addrs addrs;
394 		uint64_t	value;
395 	} addr_u;
396 	union {
397 		struct ipfw_ports ports;
398 		uint32_t	value;
399 	} port_u;
400 	uint8_t			proto;
401 	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
402 	uint16_t		rsvd2;
403 };
404 
405 #define IPFW_KEY_SWAP_ADDRS	0x1
406 #define IPFW_KEY_SWAP_PORTS	0x2
407 #define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
408 
409 struct ipfw_trkcnt {
410 	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
411 	struct ipfw_key		tc_key;
412 	uintptr_t		tc_ruleid;
413 	int			tc_refs;
414 	int			tc_count;
415 	time_t			tc_expire;	/* userland get-only */
416 	uint16_t		tc_rulenum;	/* userland get-only */
417 } __cachealign;
418 
419 #define tc_addrs		tc_key.addr_u.value
420 #define tc_ports		tc_key.port_u.value
421 #define tc_proto		tc_key.proto
422 #define tc_saddr		tc_key.addr_u.addrs.addr1
423 #define tc_daddr		tc_key.addr_u.addrs.addr2
424 #define tc_sport		tc_key.port_u.ports.port1
425 #define tc_dport		tc_key.port_u.ports.port2
426 
427 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
428 
429 struct ipfw_state;
430 
431 struct ipfw_track {
432 	RB_ENTRY(ipfw_track)	t_rblink;
433 	struct ipfw_key		t_key;
434 	struct ip_fw		*t_rule;
435 	time_t			t_lastexp;
436 	LIST_HEAD(, ipfw_state)	t_state_list;
437 	time_t			t_expire;
438 	volatile int		*t_count;
439 	struct ipfw_trkcnt	*t_trkcnt;
440 	TAILQ_ENTRY(ipfw_track)	t_link;
441 };
442 
443 #define t_addrs			t_key.addr_u.value
444 #define t_ports			t_key.port_u.value
445 #define t_proto			t_key.proto
446 #define t_saddr			t_key.addr_u.addrs.addr1
447 #define t_daddr			t_key.addr_u.addrs.addr2
448 #define t_sport			t_key.port_u.ports.port1
449 #define t_dport			t_key.port_u.ports.port2
450 
451 RB_HEAD(ipfw_track_tree, ipfw_track);
452 TAILQ_HEAD(ipfw_track_list, ipfw_track);
453 
454 struct ipfw_state {
455 	RB_ENTRY(ipfw_state)	st_rblink;
456 	struct ipfw_key		st_key;
457 
458 	time_t			st_expire;	/* expire time */
459 	struct ip_fw		*st_rule;
460 
461 	uint64_t		st_pcnt;	/* packets */
462 	uint64_t		st_bcnt;	/* bytes */
463 
464 	/*
465 	 * st_state:
466 	 * State of this rule, typically a combination of TCP flags.
467 	 *
468 	 * st_ack_fwd/st_ack_rev:
469 	 * Most recent ACKs in forward and reverse direction.  They
470 	 * are used to generate keepalives.
471 	 */
472 	uint32_t		st_state;
473 	uint32_t		st_ack_fwd;	/* host byte order */
474 	uint32_t		st_seq_fwd;	/* host byte order */
475 	uint32_t		st_ack_rev;	/* host byte order */
476 	uint32_t		st_seq_rev;	/* host byte order */
477 
478 	uint16_t		st_flags;	/* IPFW_STATE_F_ */
479 	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
480 	struct ipfw_track	*st_track;
481 
482 	LIST_ENTRY(ipfw_state)	st_trklink;
483 	TAILQ_ENTRY(ipfw_state)	st_link;
484 };
485 
486 #define st_addrs		st_key.addr_u.value
487 #define st_ports		st_key.port_u.value
488 #define st_proto		st_key.proto
489 #define st_swap			st_key.swap
490 
491 #define IPFW_STATE_F_ACKFWD	0x0001
492 #define IPFW_STATE_F_SEQFWD	0x0002
493 #define IPFW_STATE_F_ACKREV	0x0004
494 #define IPFW_STATE_F_SEQREV	0x0008
495 #define IPFW_STATE_F_XLATSRC	0x0010
496 #define IPFW_STATE_F_XLATSLAVE	0x0020
497 #define IPFW_STATE_F_LINKED	0x0040
498 
499 #define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
500 				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
501 
502 /* Expired or being deleted. */
503 #define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) || \
504 				 IPFW_XLAT_INVALID((s)))
505 
506 TAILQ_HEAD(ipfw_state_list, ipfw_state);
507 RB_HEAD(ipfw_state_tree, ipfw_state);
508 
509 struct ipfw_xlat {
510 	struct ipfw_state	xlat_st;	/* MUST be the first field */
511 	uint32_t		xlat_addr;	/* network byte order */
512 	uint16_t		xlat_port;	/* network byte order */
513 	uint16_t		xlat_dir;	/* MATCH_ */
514 	struct ifnet		*xlat_ifp;	/* matching ifnet */
515 	struct ipfw_xlat	*xlat_pair;	/* paired state */
516 	int			xlat_pcpu;	/* paired cpu */
517 	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
518 	volatile uint64_t	xlat_crefs;	/* cross references */
519 	struct netmsg_base	xlat_freenm;	/* for remote free */
520 };
521 
522 #define xlat_type		xlat_st.st_type
523 #define xlat_flags		xlat_st.st_flags
524 #define xlat_rule		xlat_st.st_rule
525 #define xlat_bcnt		xlat_st.st_bcnt
526 #define xlat_pcnt		xlat_st.st_pcnt
527 
528 struct ipfw_tblent {
529 	struct radix_node	te_nodes[2];
530 	struct sockaddr_in	te_key;
531 	u_long			te_use;
532 	time_t			te_lastuse;
533 	struct ipfw_tblent	*te_sibling;
534 	volatile int		te_expired;
535 };
536 
537 struct ipfw_context {
538 	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
539 	struct ip_fw		*ipfw_default_rule;	/* default rule */
540 	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/
541 
542 	/*
543 	 * ipfw_set_disable contains one bit per set value (0..31).
544 	 * If the bit is set, all rules with the corresponding set
545  * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
546 	 * default rule and CANNOT be disabled.
547 	 */
548 	uint32_t		ipfw_set_disable;
549 
550 	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */
551 
552 	struct ip_fw		*ipfw_cont_rule;
553 	struct ipfw_xlat	*ipfw_cont_xlat;
554 
555 	struct ipfw_state_tree	ipfw_state_tree;
556 	struct ipfw_state_list	ipfw_state_list;
557 	int			ipfw_state_loosecnt;
558 	int			ipfw_state_cnt;
559 
560 	union {
561 		struct ipfw_state state;
562 		struct ipfw_track track;
563 		struct ipfw_trkcnt trkcnt;
564 	} ipfw_tmpkey;
565 
566 	struct ipfw_track_tree	ipfw_track_tree;
567 	struct ipfw_track_list	ipfw_track_list;
568 	struct ipfw_trkcnt	*ipfw_trkcnt_spare;
569 
570 	struct callout		ipfw_stateto_ch;
571 	time_t			ipfw_state_lastexp;
572 	struct netmsg_base	ipfw_stateexp_nm;
573 	struct netmsg_base	ipfw_stateexp_more;
574 	struct ipfw_state	ipfw_stateexp_anch;
575 
576 	struct callout		ipfw_trackto_ch;
577 	time_t			ipfw_track_lastexp;
578 	struct netmsg_base	ipfw_trackexp_nm;
579 	struct netmsg_base	ipfw_trackexp_more;
580 	struct ipfw_track	ipfw_trackexp_anch;
581 
582 	struct callout		ipfw_keepalive_ch;
583 	struct netmsg_base	ipfw_keepalive_nm;
584 	struct netmsg_base	ipfw_keepalive_more;
585 	struct ipfw_state	ipfw_keepalive_anch;
586 
587 	struct callout		ipfw_xlatreap_ch;
588 	struct netmsg_base	ipfw_xlatreap_nm;
589 	struct ipfw_state_list	ipfw_xlatreap;
590 
591 	/*
592 	 * Statistics
593 	 */
594 	u_long			ipfw_sts_reap;
595 	u_long			ipfw_sts_reapfailed;
596 	u_long			ipfw_sts_overflow;
597 	u_long			ipfw_sts_nomem;
598 	u_long			ipfw_sts_tcprecycled;
599 
600 	u_long			ipfw_tks_nomem;
601 	u_long			ipfw_tks_reap;
602 	u_long			ipfw_tks_reapfailed;
603 	u_long			ipfw_tks_overflow;
604 	u_long			ipfw_tks_cntnomem;
605 
606 	u_long			ipfw_frags;
607 	u_long			ipfw_defraged;
608 	u_long			ipfw_defrag_remote;
609 
610 	u_long			ipfw_xlated;
611 	u_long			ipfw_xlate_split;
612 	u_long			ipfw_xlate_conflicts;
613 	u_long			ipfw_xlate_cresolved;
614 
615 	/* Last field */
616 	struct radix_node_head	*ipfw_tables[];
617 };
618 
619 #define IPFW_FLAG_KEEPALIVE	0x01
620 #define IPFW_FLAG_STATEEXP	0x02
621 #define IPFW_FLAG_TRACKEXP	0x04
622 #define IPFW_FLAG_STATEREAP	0x08
623 #define IPFW_FLAG_TRACKREAP	0x10
624 
625 #define ipfw_state_tmpkey	ipfw_tmpkey.state
626 #define ipfw_track_tmpkey	ipfw_tmpkey.track
627 #define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt
628 
629 struct ipfw_global {
630 	int			ipfw_state_loosecnt;	/* cache aligned */
631 	time_t			ipfw_state_globexp __cachealign;
632 
633 	struct lwkt_token	ipfw_trkcnt_token __cachealign;
634 	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
635 	int			ipfw_trkcnt_cnt;
636 	time_t			ipfw_track_globexp;
637 
638 	/* Accessed in netisr0. */
639 	struct ip_fw		*ipfw_crossref_free __cachealign;
640 	struct callout		ipfw_crossref_ch;
641 	struct netmsg_base	ipfw_crossref_nm;
642 
643 #ifdef KLD_MODULE
644 	/*
645 	 * The module cannot be unloaded if there are references to
646 	 * certain rules of ipfw(4), e.g. from dummynet(4).
647 	 */
648 	int			ipfw_refcnt __cachealign;
649 #endif
650 } __cachealign;
651 
652 static struct ipfw_context	*ipfw_ctx[MAXCPU];
653 
654 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
655 
656 /*
657  * The following two global variables are accessed and updated only
658  * in netisr0.
659  */
660 static uint32_t static_count;	/* # of static rules */
661 static uint32_t static_ioc_len;	/* bytes of static rules */
662 
663 /*
664  * If 1, ipfw static rules are being flushed and
665  * ipfw_chk() will skip to the default rule.
666  */
667 static int ipfw_flushing;
668 
669 static int fw_verbose;
670 static int verbose_limit;
671 
672 static int fw_debug;
673 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
674 
675 static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;
676 
677 static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
678 static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
679 
680 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
681 
682 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
683 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
684     "Firewall statistics");
685 
686 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
687     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
688 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
689     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
690     "Rule number autoincrement step");
691 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
692     &fw_one_pass, 0,
693     "Only do a single pass through ipfw when using dummynet(4)");
694 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
695     &fw_debug, 0, "Enable printing of debug ip_fw statements");
696 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
697     &fw_verbose, 0, "Log matches to ipfw rules");
698 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
699     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
700 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
701     &ipfw_table_max, 0, "Max # of tables");
702 
703 static int	ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
704 static int	ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
705 static int	ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
706 static int	ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
707 static int	ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
708 static int	ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
709 
710 /*
711  * Timeouts for various events in handling states.
712  *
713  * NOTE:
714  * 1 == 0~1 second.
715  * 2 == 1~2 second(s).
716  *
717  * We use 2 seconds for FIN lifetime, so that the states will not be
718  * reaped prematurely.
719  */
720 static uint32_t dyn_ack_lifetime = 300;
721 static uint32_t dyn_syn_lifetime = 20;
722 static uint32_t dyn_finwait_lifetime = 20;
723 static uint32_t dyn_fin_lifetime = 2;
724 static uint32_t dyn_rst_lifetime = 2;
725 static uint32_t dyn_udp_lifetime = 10;
726 static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */
727 
728 /*
729  * Keepalives are sent if dyn_keepalive is set. They are sent every
730  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
731  * seconds of lifetime of a rule.
732  */
733 static uint32_t dyn_keepalive_interval = 20;
734 static uint32_t dyn_keepalive_period = 5;
735 static uint32_t dyn_keepalive = 1;	/* do send keepalives */
736 
737 static struct ipfw_global	ipfw_gd;
738 static int	ipfw_state_loosecnt_updthr;
739 static int	ipfw_state_max = 4096;	/* max # of states */
740 static int	ipfw_track_max = 4096;	/* max # of tracks */
741 
742 static int	ipfw_state_headroom;	/* setup at module load time */
743 static int	ipfw_state_reap_min = 8;
744 static int	ipfw_state_expire_max = 32;
745 static int	ipfw_state_scan_max = 256;
746 static int	ipfw_keepalive_max = 8;
747 static int	ipfw_track_reap_max = 4;
748 static int	ipfw_track_expire_max = 16;
749 static int	ipfw_track_scan_max = 128;
750 
751 static eventhandler_tag ipfw_ifaddr_event;
752 
753 /* Compat */
754 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
755     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
756     "Number of states and tracks");
757 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
758     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
759     "Max number of states and tracks");
760 
761 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
762     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
763     "Number of states");
764 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
765     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
766     "Max number of states");
767 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
768     &ipfw_state_headroom, 0, "headroom for state reap");
769 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
770     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
771 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
772     &ipfw_track_max, 0, "Max number of tracks");
773 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
774     &static_count, 0, "Number of static rules");
775 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
776     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
777 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
778     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
779 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
780     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
781 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
782     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
783 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
784     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
785 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
786     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
787 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
788     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
789 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
790     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
791 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
792     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
793     "I", "# of states to scan for each expire iteration");
794 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
795     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
796     "I", "# of states to expire for each expire iteration");
797 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
798     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
799     "I", "# of keepalives to send for each keepalive iteration");
800 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
801     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
802     "I", "# of states to reap for state shortage");
803 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
804     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
805     "I", "# of tracks to scan for each expire iteration");
806 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
807     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
808     "I", "# of tracks to expire for each expire iteration");
809 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
810     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
811     "I", "# of tracks to reap for track shortage");
812 
813 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
814     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
815     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
816     "LU", "# of state reaps due to state shortage");
817 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
818     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
819     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
820     "LU", "# of state reap failures");
821 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
822     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
823     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
824     "LU", "# of state overflows");
825 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
826     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
827     __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
828     "LU", "# of state allocation failures");
829 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
830     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
831     __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
832     "LU", "# of state deleted due to fast TCP port recycling");
833 
834 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
835     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
836     __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
837     "LU", "# of track allocation failures");
838 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
839     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
840     __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
841     "LU", "# of track reaps due to track shortage");
842 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
843     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
844     __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
845     "LU", "# of track reap failures");
846 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
847     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
848     __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
849     "LU", "# of track overflows");
850 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
851     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
852     __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
853     "LU", "# of track counter allocation failures");
854 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
855     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
856     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
857     "LU", "# of IP fragments defraged");
858 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
859     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
860     __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
861     "LU", "# of IP packets after defrag");
862 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
863     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
864     __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
865     "LU", "# of IP packets after defrag dispatched to remote cpus");
866 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
867     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
868     __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
869     "LU", "# of address/port translations");
870 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
871     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
872     __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
873     "LU", "# of address/port translations split between different cpus");
874 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
875     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
876     __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
877     "LU", "# of address/port translation conflicts on remote cpu");
878 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
879     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
880     __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
881     "LU", "# of address/port translation conflicts resolved on remote cpu");
882 
883 static int		ipfw_state_cmp(struct ipfw_state *,
884 			    struct ipfw_state *);
885 static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
886 			    struct ipfw_trkcnt *);
887 static int		ipfw_track_cmp(struct ipfw_track *,
888 			    struct ipfw_track *);
889 
890 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
891 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
892 
893 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
894 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
895 
896 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
897 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
898 
899 static int		ipfw_chk(struct ip_fw_args *);
900 static void		ipfw_track_expire_ipifunc(void *);
901 static void		ipfw_state_expire_ipifunc(void *);
902 static void		ipfw_keepalive(void *);
903 static int		ipfw_state_expire_start(struct ipfw_context *,
904 			    int, int);
905 static void		ipfw_crossref_timeo(void *);
906 static void		ipfw_state_remove(struct ipfw_context *,
907 			    struct ipfw_state *);
908 static void		ipfw_xlat_reap_timeo(void *);
909 static void		ipfw_defrag_redispatch(struct mbuf *, int,
910 			    struct ip_fw *);
911 
912 #define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
913 #define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
914 #define IPFW_TRKCNT_TOKINIT	\
915 	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
916 
917 static void
918 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
919     const struct sockaddr *netmask)
920 {
921 	const u_char *cp1 = (const u_char *)src;
922 	u_char *cp2 = (u_char *)dst;
923 	const u_char *cp3 = (const u_char *)netmask;
924 	u_char *cplim = cp2 + *cp3;
925 	u_char *cplim2 = cp2 + *cp1;
926 
927 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
928 	cp3 += 2;
929 	if (cplim > cplim2)
930 		cplim = cplim2;
931 	while (cp2 < cplim)
932 		*cp2++ = *cp1++ & *cp3++;
933 	if (cp2 < cplim2)
934 		bzero(cp2, cplim2 - cp2);
935 }
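/*
 * Example: a sockaddr_in holding 10.1.2.3 masked with 255.255.255.0
 * yields 10.1.2.0, and everything past the (shorter) netmask length
 * is zeroed, producing a clean key for the table radix trees.
 */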
936 
937 static __inline uint16_t
938 pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
939 {
940 	uint32_t l;
941 
942 	if (udp && !cksum)
943 		return (0x0000);
944 	l = cksum + old - new;
945 	l = (l >> 16) + (l & 65535);
946 	l = l & 65535;
947 	if (udp && !l)
948 		return (0xFFFF);
949 	return (l);
950 }
951 
952 static __inline void
953 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
954     in_addr_t daddr, uint16_t dport, uint8_t proto)
955 {
956 
957 	key->proto = proto;
958 	key->swap = 0;
959 
960 	if (saddr < daddr) {
961 		key->addr_u.addrs.addr1 = daddr;
962 		key->addr_u.addrs.addr2 = saddr;
963 		key->swap |= IPFW_KEY_SWAP_ADDRS;
964 	} else {
965 		key->addr_u.addrs.addr1 = saddr;
966 		key->addr_u.addrs.addr2 = daddr;
967 	}
968 
969 	if (sport < dport) {
970 		key->port_u.ports.port1 = dport;
971 		key->port_u.ports.port2 = sport;
972 		key->swap |= IPFW_KEY_SWAP_PORTS;
973 	} else {
974 		key->port_u.ports.port1 = sport;
975 		key->port_u.ports.port2 = dport;
976 	}
977 
978 	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
979 		key->swap |= IPFW_KEY_SWAP_PORTS;
980 	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
981 		key->swap |= IPFW_KEY_SWAP_ADDRS;
982 }
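/*
 * Both directions of a flow canonicalize to the same key.  Worked
 * example (addresses/ports as plain integers): saddr=1, daddr=2,
 * sport=1000, dport=80.
 *   forward:  addr1=2, addr2=1, swap|=ADDRS; port1=1000, port2=80
 *   reverse:  addr1=2, addr2=1;              port1=1000, port2=80, swap|=PORTS
 * The addr/port fields come out identical and the swap masks differ
 * in all bits, which ipfw_state_cmp() below treats as equal; the
 * surviving swap bits tell the caller which direction matched.
 */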
983 
984 static __inline void
985 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
986     in_addr_t *daddr, uint16_t *dport)
987 {
988 
989 	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
990 		*saddr = key->addr_u.addrs.addr2;
991 		*daddr = key->addr_u.addrs.addr1;
992 	} else {
993 		*saddr = key->addr_u.addrs.addr1;
994 		*daddr = key->addr_u.addrs.addr2;
995 	}
996 
997 	if (key->swap & IPFW_KEY_SWAP_PORTS) {
998 		*sport = key->port_u.ports.port2;
999 		*dport = key->port_u.ports.port1;
1000 	} else {
1001 		*sport = key->port_u.ports.port1;
1002 		*dport = key->port_u.ports.port2;
1003 	}
1004 }
1005 
1006 static int
1007 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1008 {
1009 
1010 	if (s1->st_proto > s2->st_proto)
1011 		return (1);
1012 	if (s1->st_proto < s2->st_proto)
1013 		return (-1);
1014 
1015 	if (s1->st_addrs > s2->st_addrs)
1016 		return (1);
1017 	if (s1->st_addrs < s2->st_addrs)
1018 		return (-1);
1019 
1020 	if (s1->st_ports > s2->st_ports)
1021 		return (1);
1022 	if (s1->st_ports < s2->st_ports)
1023 		return (-1);
1024 
1025 	if (s1->st_swap == s2->st_swap ||
1026 	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1027 		return (0);
1028 
1029 	if (s1->st_swap > s2->st_swap)
1030 		return (1);
1031 	else
1032 		return (-1);
1033 }
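/*
 * NOTE: swap masks that differ in *all* bits describe the same flow
 * seen from opposite directions, hence the keys compare equal above.
 */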
1034 
1035 static int
1036 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1037 {
1038 
1039 	if (t1->tc_proto > t2->tc_proto)
1040 		return (1);
1041 	if (t1->tc_proto < t2->tc_proto)
1042 		return (-1);
1043 
1044 	if (t1->tc_addrs > t2->tc_addrs)
1045 		return (1);
1046 	if (t1->tc_addrs < t2->tc_addrs)
1047 		return (-1);
1048 
1049 	if (t1->tc_ports > t2->tc_ports)
1050 		return (1);
1051 	if (t1->tc_ports < t2->tc_ports)
1052 		return (-1);
1053 
1054 	if (t1->tc_ruleid > t2->tc_ruleid)
1055 		return (1);
1056 	if (t1->tc_ruleid < t2->tc_ruleid)
1057 		return (-1);
1058 
1059 	return (0);
1060 }
1061 
1062 static int
1063 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1064 {
1065 
1066 	if (t1->t_proto > t2->t_proto)
1067 		return (1);
1068 	if (t1->t_proto < t2->t_proto)
1069 		return (-1);
1070 
1071 	if (t1->t_addrs > t2->t_addrs)
1072 		return (1);
1073 	if (t1->t_addrs < t2->t_addrs)
1074 		return (-1);
1075 
1076 	if (t1->t_ports > t2->t_ports)
1077 		return (1);
1078 	if (t1->t_ports < t2->t_ports)
1079 		return (-1);
1080 
1081 	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1082 		return (1);
1083 	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1084 		return (-1);
1085 
1086 	return (0);
1087 }
1088 
1089 static __inline struct ipfw_state *
1090 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1091 {
1092 	struct ipfw_state *dup;
1093 
1094 	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1095 	    ("state %p was linked", s));
1096 	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1097 	if (dup == NULL) {
1098 		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1099 		s->st_flags |= IPFW_STATE_F_LINKED;
1100 	}
1101 	return (dup);
1102 }
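/*
 * NOTE: RB_INSERT() returns the already-linked node on a key
 * collision; in that case the new state is neither listed nor marked
 * linked, and the caller decides what to do with the duplicate.
 */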
1103 
1104 static __inline void
1105 ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
1106 {
1107 
1108 	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
1109 	    ("state %p was not linked", s));
1110 	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1111 	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1112 	s->st_flags &= ~IPFW_STATE_F_LINKED;
1113 }
1114 
1115 static void
1116 ipfw_state_max_set(int state_max)
1117 {
1118 
1119 	ipfw_state_max = state_max;
1120 	/* Allow 5% state over-allocation. */
1121 	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1122 }
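/*
 * Example: with state_max = 4096 and netisr_ncpus = 4, the update
 * threshold works out to (4096 / 20) / 4 = 51 states per cpu.
 */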
1123 
1124 static __inline int
1125 ipfw_state_cntcoll(void)
1126 {
1127 	int cpu, state_cnt = 0;
1128 
1129 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1130 		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1131 	return (state_cnt);
1132 }
1133 
1134 static __inline int
1135 ipfw_state_cntsync(void)
1136 {
1137 	int state_cnt;
1138 
1139 	state_cnt = ipfw_state_cntcoll();
1140 	ipfw_gd.ipfw_state_loosecnt = state_cnt;
1141 	return (state_cnt);
1142 }
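/*
 * "Loose" counting: each cpu keeps its own ipfw_state_cnt and only
 * folds it into the global ipfw_gd.ipfw_state_loosecnt from time to
 * time (gated by ipfw_state_loosecnt_updthr), so the global limit
 * check stays cheap at the cost of a small, bounded overshoot.
 */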
1143 
1144 static __inline int
1145 ipfw_free_rule(struct ip_fw *rule)
1146 {
1147 	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1148 	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1149 	rule->refcnt--;
1150 	if (rule->refcnt == 0) {
1151 		if (rule->cross_rules != NULL)
1152 			kfree(rule->cross_rules, M_IPFW);
1153 		kfree(rule, M_IPFW);
1154 		return 1;
1155 	}
1156 	return 0;
1157 }
1158 
1159 static void
1160 ipfw_unref_rule(void *priv)
1161 {
1162 	ipfw_free_rule(priv);
1163 #ifdef KLD_MODULE
1164 	KASSERT(ipfw_gd.ipfw_refcnt > 0,
1165 	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1166 	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1167 #endif
1168 }
1169 
1170 static __inline void
1171 ipfw_ref_rule(struct ip_fw *rule)
1172 {
1173 	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1174 #ifdef KLD_MODULE
1175 	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1176 #endif
1177 	rule->refcnt++;
1178 }
1179 
1180 /*
1181  * This macro maps an ip pointer to the transport header pointer of type T
1182  */
1183 #define	L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1184 
1185 static __inline int
1186 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1187 {
1188 	int type = L3HDR(struct icmp,ip)->icmp_type;
1189 	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1190 	int idx = type / 32;
1191 
1192 	if (idx >= idx_max)
1193 		return (0);
1194 	return (cmd->d[idx] & (1 << (type % 32)));
1195 }
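/*
 * Example: 'icmptypes 0,8' sets bits 0 and 8 in cmd->d[0]; an echo
 * request (type 8) then tests cmd->d[0] & (1 << 8).
 */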
1196 
1197 static __inline int
1198 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1199 {
1200 	int code = L3HDR(struct icmp,ip)->icmp_code;
1201 	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1202 	int idx = code / 32;
1203 
1204 	if (idx >= idx_max)
1205 		return (0);
1206 	return (cmd->d[idx] & (1 << (code % 32)));
1207 }
1208 
1209 #define TT	((1 << ICMP_ECHO) | \
1210 		 (1 << ICMP_ROUTERSOLICIT) | \
1211 		 (1 << ICMP_TSTAMP) | \
1212 		 (1 << ICMP_IREQ) | \
1213 		 (1 << ICMP_MASKREQ))
1214 
1215 static int
1216 is_icmp_query(struct ip *ip)
1217 {
1218 	int type = L3HDR(struct icmp, ip)->icmp_type;
1219 
1220 	return (type < 32 && (TT & (1 << type)));
1221 }
1222 
1223 #undef TT
1224 
1225 /*
1226  * The following checks use two arrays of 8 or 16 bits to store the
1227  * bits that we want set or clear, respectively. They are in the
1228  * low and high half of cmd->arg1 or cmd->d[0].
1229  *
1230  * We scan options and store the bits we find set. We succeed if
1231  *
1232  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1233  *
1234  * The code is sometimes optimized not to store additional variables.
1235  */
1236 static int
1237 flags_match(ipfw_insn *cmd, uint8_t bits)
1238 {
1239 	u_char want_clear;
1240 	bits = ~bits;
1241 
1242 	if (((cmd->arg1 & 0xff) & bits) != 0)
1243 		return 0; /* some bits we want set were clear */
1244 
1245 	want_clear = (cmd->arg1 >> 8) & 0xff;
1246 	if ((want_clear & bits) != want_clear)
1247 		return 0; /* some bits we want clear were set */
1248 	return 1;
1249 }
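/*
 * Example: 'tcpflags syn,!ack' encodes TH_SYN in the low byte of
 * arg1 (must be set) and TH_ACK in the high byte (must be clear);
 * a bare SYN matches, while SYN|ACK fails the must-be-clear check.
 */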
1250 
1251 static int
1252 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1253 {
1254 	int optlen, bits = 0;
1255 	u_char *cp = (u_char *)(ip + 1);
1256 	int x = (ip->ip_hl << 2) - sizeof(struct ip);
1257 
1258 	for (; x > 0; x -= optlen, cp += optlen) {
1259 		int opt = cp[IPOPT_OPTVAL];
1260 
1261 		if (opt == IPOPT_EOL)
1262 			break;
1263 
1264 		if (opt == IPOPT_NOP) {
1265 			optlen = 1;
1266 		} else {
1267 			optlen = cp[IPOPT_OLEN];
1268 			if (optlen <= 0 || optlen > x)
1269 				return 0; /* invalid or truncated */
1270 		}
1271 
1272 		switch (opt) {
1273 		case IPOPT_LSRR:
1274 			bits |= IP_FW_IPOPT_LSRR;
1275 			break;
1276 
1277 		case IPOPT_SSRR:
1278 			bits |= IP_FW_IPOPT_SSRR;
1279 			break;
1280 
1281 		case IPOPT_RR:
1282 			bits |= IP_FW_IPOPT_RR;
1283 			break;
1284 
1285 		case IPOPT_TS:
1286 			bits |= IP_FW_IPOPT_TS;
1287 			break;
1288 
1289 		default:
1290 			break;
1291 		}
1292 	}
1293 	return (flags_match(cmd, bits));
1294 }
1295 
1296 static int
1297 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1298 {
1299 	int optlen, bits = 0;
1300 	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1301 	u_char *cp = (u_char *)(tcp + 1);
1302 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1303 
1304 	for (; x > 0; x -= optlen, cp += optlen) {
1305 		int opt = cp[0];
1306 
1307 		if (opt == TCPOPT_EOL)
1308 			break;
1309 
1310 		if (opt == TCPOPT_NOP) {
1311 			optlen = 1;
1312 		} else {
1313 			optlen = cp[1];
1314 			if (optlen <= 0)
1315 				break;
1316 		}
1317 
1318 		switch (opt) {
1319 		case TCPOPT_MAXSEG:
1320 			bits |= IP_FW_TCPOPT_MSS;
1321 			break;
1322 
1323 		case TCPOPT_WINDOW:
1324 			bits |= IP_FW_TCPOPT_WINDOW;
1325 			break;
1326 
1327 		case TCPOPT_SACK_PERMITTED:
1328 		case TCPOPT_SACK:
1329 			bits |= IP_FW_TCPOPT_SACK;
1330 			break;
1331 
1332 		case TCPOPT_TIMESTAMP:
1333 			bits |= IP_FW_TCPOPT_TS;
1334 			break;
1335 
1336 		case TCPOPT_CC:
1337 		case TCPOPT_CCNEW:
1338 		case TCPOPT_CCECHO:
1339 			bits |= IP_FW_TCPOPT_CC;
1340 			break;
1341 
1342 		default:
1343 			break;
1344 		}
1345 	}
1346 	return (flags_match(cmd, bits));
1347 }
1348 
1349 static int
1350 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1351 {
1352 	if (ifp == NULL)	/* no iface with this packet, match fails */
1353 		return 0;
1354 
1355 	/* Check by name or by IP address */
1356 	if (cmd->name[0] != '\0') { /* match by name */
1357 		/* Check name */
1358 		if (cmd->p.glob) {
1359 			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1360 				return(1);
1361 		} else {
1362 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1363 				return(1);
1364 		}
1365 	} else {
1366 		struct ifaddr_container *ifac;
1367 
1368 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1369 			struct ifaddr *ia = ifac->ifa;
1370 
1371 			if (ia->ifa_addr == NULL)
1372 				continue;
1373 			if (ia->ifa_addr->sa_family != AF_INET)
1374 				continue;
1375 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1376 			    (ia->ifa_addr))->sin_addr.s_addr)
1377 				return(1);	/* match */
1378 		}
1379 	}
1380 	return(0);	/* no match, fail ... */
1381 }
1382 
1383 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
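/*
 * SNPARGS(buf, len) expands to the (pointer, size) argument pair for
 * appending at offset len into buf, clamping the remaining size to 0
 * once the buffer is full, so the chained ksnprintf() calls below
 * cannot run past the end of the buffer.
 */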
1384 
1385 /*
1386  * We enter here when we have a rule with O_LOG.
1387  * XXX this function alone takes about 2Kbytes of code!
1388  */
1389 static void
1390 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1391     struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1392 {
1393 	char *action;
1394 	int limit_reached = 0;
1395 	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1396 
1397 	fragment[0] = '\0';
1398 	proto[0] = '\0';
1399 
1400 	if (f == NULL) {	/* bogus pkt */
1401 		if (verbose_limit != 0 &&
1402 		    ctx->ipfw_norule_counter >= verbose_limit)
1403 			return;
1404 		ctx->ipfw_norule_counter++;
1405 		if (ctx->ipfw_norule_counter == verbose_limit)
1406 			limit_reached = verbose_limit;
1407 		action = "Refuse";
1408 	} else {	/* O_LOG is the first action, find the real one */
1409 		ipfw_insn *cmd = ACTION_PTR(f);
1410 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1411 
1412 		if (l->max_log != 0 && l->log_left == 0)
1413 			return;
1414 		l->log_left--;
1415 		if (l->log_left == 0)
1416 			limit_reached = l->max_log;
1417 		cmd += F_LEN(cmd);	/* point to first action */
1418 		if (cmd->opcode == O_PROB)
1419 			cmd += F_LEN(cmd);
1420 
1421 		action = action2;
1422 		switch (cmd->opcode) {
1423 		case O_DENY:
1424 			action = "Deny";
1425 			break;
1426 
1427 		case O_REJECT:
1428 			if (cmd->arg1 == ICMP_REJECT_RST) {
1429 				action = "Reset";
1430 			} else if (cmd->arg1 == ICMP_UNREACH_HOST) {
1431 				action = "Reject";
1432 			} else {
1433 				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1434 					  cmd->arg1);
1435 			}
1436 			break;
1437 
1438 		case O_ACCEPT:
1439 			action = "Accept";
1440 			break;
1441 
1442 		case O_COUNT:
1443 			action = "Count";
1444 			break;
1445 
1446 		case O_DIVERT:
1447 			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1448 			break;
1449 
1450 		case O_TEE:
1451 			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1452 			break;
1453 
1454 		case O_SKIPTO:
1455 			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1456 			break;
1457 
1458 		case O_PIPE:
1459 			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1460 			break;
1461 
1462 		case O_QUEUE:
1463 			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1464 			break;
1465 
1466 		case O_FORWARD_IP:
1467 			{
1468 				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1469 				int len;
1470 
1471 				len = ksnprintf(SNPARGS(action2, 0),
1472 				    "Forward to %s",
1473 				    kinet_ntoa(sa->sa.sin_addr, abuf));
1474 				if (sa->sa.sin_port) {
1475 					ksnprintf(SNPARGS(action2, len), ":%d",
1476 						  sa->sa.sin_port);
1477 				}
1478 			}
1479 			break;
1480 
1481 		default:
1482 			action = "UNKNOWN";
1483 			break;
1484 		}
1485 	}
1486 
1487 	if (hlen == 0) {	/* non-ip */
1488 		ksnprintf(SNPARGS(proto, 0), "MAC");
1489 	} else {
1490 		struct ip *ip = mtod(m, struct ip *);
1491 		/* these three are all aliases to the same thing */
1492 		struct icmp *const icmp = L3HDR(struct icmp, ip);
1493 		struct tcphdr *const tcp = (struct tcphdr *)icmp;
1494 		struct udphdr *const udp = (struct udphdr *)icmp;
1495 
1496 		int ip_off, offset, ip_len;
1497 		int len;
1498 
1499 		if (eh != NULL) { /* layer 2 packets are as on the wire */
1500 			ip_off = ntohs(ip->ip_off);
1501 			ip_len = ntohs(ip->ip_len);
1502 		} else {
1503 			ip_off = ip->ip_off;
1504 			ip_len = ip->ip_len;
1505 		}
1506 		offset = ip_off & IP_OFFMASK;
1507 		switch (ip->ip_p) {
1508 		case IPPROTO_TCP:
1509 			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1510 					kinet_ntoa(ip->ip_src, abuf));
1511 			if (offset == 0) {
1512 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1513 					  ntohs(tcp->th_sport),
1514 					  kinet_ntoa(ip->ip_dst, abuf),
1515 					  ntohs(tcp->th_dport));
1516 			} else {
1517 				ksnprintf(SNPARGS(proto, len), " %s",
1518 					  kinet_ntoa(ip->ip_dst, abuf));
1519 			}
1520 			break;
1521 
1522 		case IPPROTO_UDP:
1523 			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1524 					kinet_ntoa(ip->ip_src, abuf));
1525 			if (offset == 0) {
1526 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1527 					  ntohs(udp->uh_sport),
1528 					  kinet_ntoa(ip->ip_dst, abuf),
1529 					  ntohs(udp->uh_dport));
1530 			} else {
1531 				ksnprintf(SNPARGS(proto, len), " %s",
1532 					  kinet_ntoa(ip->ip_dst, abuf));
1533 			}
1534 			break;
1535 
1536 		case IPPROTO_ICMP:
1537 			if (offset == 0) {
1538 				len = ksnprintf(SNPARGS(proto, 0),
1539 						"ICMP:%u.%u ",
1540 						icmp->icmp_type,
1541 						icmp->icmp_code);
1542 			} else {
1543 				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1544 			}
1545 			len += ksnprintf(SNPARGS(proto, len), "%s",
1546 					 kinet_ntoa(ip->ip_src, abuf));
1547 			ksnprintf(SNPARGS(proto, len), " %s",
1548 				  kinet_ntoa(ip->ip_dst, abuf));
1549 			break;
1550 
1551 		default:
1552 			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1553 					kinet_ntoa(ip->ip_src, abuf));
1554 			ksnprintf(SNPARGS(proto, len), " %s",
1555 				  kinet_ntoa(ip->ip_dst, abuf));
1556 			break;
1557 		}
1558 
1559 		if (ip_off & (IP_MF | IP_OFFMASK)) {
1560 			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1561 				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1562 				  offset << 3, (ip_off & IP_MF) ? "+" : "");
1563 		}
1564 	}
1565 
1566 	if (oif || m->m_pkthdr.rcvif) {
1567 		log(LOG_SECURITY | LOG_INFO,
1568 		    "ipfw: %d %s %s %s via %s%s\n",
1569 		    f ? f->rulenum : -1,
1570 		    action, proto, oif ? "out" : "in",
1571 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1572 		    fragment);
1573 	} else {
1574 		log(LOG_SECURITY | LOG_INFO,
1575 		    "ipfw: %d %s %s [no if info]%s\n",
1576 		    f ? f->rulenum : -1,
1577 		    action, proto, fragment);
1578 	}
1579 
1580 	if (limit_reached) {
1581 		log(LOG_SECURITY | LOG_NOTICE,
1582 		    "ipfw: limit %d reached on entry %d\n",
1583 		    limit_reached, f ? f->rulenum : -1);
1584 	}
1585 }
1586 
1587 #undef SNPARGS
1588 
1589 static void
1590 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1591 {
1592 	struct ip_fw *rule = slave_x->xlat_rule;
1593 
1594 	KKASSERT(rule->cpuid == mycpuid);
1595 
1596 	/* No more cross references; free this pair now. */
1597 	kfree(x, M_IPFW);
1598 	kfree(slave_x, M_IPFW);
1599 
1600 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1601 	rule->cross_refs--;
1602 }
1603 
1604 static void
1605 ipfw_xlat_reap_dispatch(netmsg_t nm)
1606 {
1607 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1608 	struct ipfw_state *s, *ns;
1609 
1610 	ASSERT_NETISR_NCPUS(mycpuid);
1611 
1612 	crit_enter();
1613 	/* Reply ASAP. */
1614 	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1615 	crit_exit();
1616 
1617 	/* TODO: limit scanning depth */
1618 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1619 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1620 		struct ipfw_xlat *slave_x = x->xlat_pair;
1621 		uint64_t crefs;
1622 
1623 		crefs = slave_x->xlat_crefs + x->xlat_crefs;
1624 		if (crefs == 0) {
1625 			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1626 			ipfw_xlat_reap(x, slave_x);
1627 		}
1628 	}
1629 	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1630 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1631 		    &ctx->ipfw_xlatreap_nm);
1632 	}
1633 }
1634 
1635 static void
1636 ipfw_xlat_reap_timeo(void *xnm)
1637 {
1638 	struct netmsg_base *nm = xnm;
1639 
1640 	KKASSERT(mycpuid < netisr_ncpus);
1641 
1642 	crit_enter();
1643 	if (nm->lmsg.ms_flags & MSGF_DONE)
1644 		netisr_sendmsg_oncpu(nm);
1645 	crit_exit();
1646 }
1647 
1648 static void
1649 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1650 {
1651 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1652 	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1653 	struct ipfw_xlat *slave_x = x->xlat_pair;
1654 	uint64_t crefs;
1655 
1656 	ASSERT_NETISR_NCPUS(mycpuid);
1657 
1658 	KKASSERT(slave_x != NULL);
1659 	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1660 
1661 	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1662 	    ("master xlat is still linked"));
1663 	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1664 		ipfw_state_unlink(ctx, &slave_x->xlat_st);
1665 
1666 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1667 	slave_x->xlat_crefs--;
1668 
1669 	crefs = slave_x->xlat_crefs + x->xlat_crefs;
1670 	if (crefs == 0) {
1671 		ipfw_xlat_reap(x, slave_x);
1672 		return;
1673 	}
1674 
1675 	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1676 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1677 		    &ctx->ipfw_xlatreap_nm);
1678 	}
1679 
1680 	/*
1681 	 * This pair is still referenced; defer its destruction.
1682 	 * YYY reuse st_link.
1683 	 */
1684 	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1685 }
1686 
1687 static __inline void
1688 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1689 {
1690 
1691 	x->xlat_invalid = 1;
1692 	x->xlat_pair->xlat_invalid = 1;
1693 }
1694 
1695 static void
1696 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1697 {
1698 	struct ipfw_xlat *x, *slave_x;
1699 	struct netmsg_base *nm;
1700 
1701 	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
1702 	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
1703 	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
1704 	    ("delete slave xlat"));
1705 
1706 	KASSERT(ctx->ipfw_state_cnt > 0,
1707 	    ("invalid state count %d", ctx->ipfw_state_cnt));
1708 	ctx->ipfw_state_cnt--;
1709 	if (ctx->ipfw_state_loosecnt > 0)
1710 		ctx->ipfw_state_loosecnt--;
1711 
1712 	/*
1713 	 * Unhook this state.
1714 	 */
1715 	if (s->st_track != NULL) {
1716 		struct ipfw_track *t = s->st_track;
1717 
1718 		KASSERT(!LIST_EMPTY(&t->t_state_list),
1719 		    ("track state list is empty"));
1720 		LIST_REMOVE(s, st_trklink);
1721 
1722 		KASSERT(*t->t_count > 0,
1723 		    ("invalid track count %d", *t->t_count));
1724 		atomic_subtract_int(t->t_count, 1);
1725 	}
1726 	ipfw_state_unlink(ctx, s);
1727 
1728 	/*
1729 	 * Free this state.  Xlat requires special processing,
1730 	 * since xlats are paired states and the two sides of a
1731 	 * pair may live on different cpus.
1732 	 */
1733 
1734 	if (!IPFW_ISXLAT(s->st_type)) {
1735 		/* Not xlat; free now. */
1736 		kfree(s, M_IPFW);
1737 		/* Done! */
1738 		return;
1739 	}
1740 	x = (struct ipfw_xlat *)s;
1741 
1742 	if (x->xlat_pair == NULL) {
1743 		/* Not setup yet; free now. */
1744 		kfree(x, M_IPFW);
1745 		/* Done! */
1746 		return;
1747 	}
1748 	slave_x = x->xlat_pair;
1749 	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);
1750 
1751 	if (x->xlat_pcpu == mycpuid) {
1752 		/*
1753 		 * Paired states are on the same cpu; delete this
1754 		 * pair now.
1755 		 */
1756 		KKASSERT(x->xlat_crefs == 0);
1757 		KKASSERT(slave_x->xlat_crefs == 0);
1758 		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1759 			ipfw_state_unlink(ctx, &slave_x->xlat_st);
1760 		kfree(x, M_IPFW);
1761 		kfree(slave_x, M_IPFW);
1762 		return;
1763 	}
1764 
1765 	/*
1766 	 * Free the paired states on the cpu owning the slave xlat.
1767 	 */
1768 
1769 	/*
1770 	 * Mark the state pair invalid; completely deleting them
1771 	 * may take some time.
1772 	 */
1773 	ipfw_xlat_invalidate(x);
1774 
1775 	nm = &x->xlat_freenm;
1776 	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
1777 	    ipfw_xlat_free_dispatch);
1778 	nm->lmsg.u.ms_resultp = x;
1779 
1780 	/* See the comment in ipfw_xlate_redispatch(). */
1781 	x->xlat_rule->cross_refs++;
1782 	x->xlat_crefs++;
1783 
1784 	netisr_sendmsg(nm, x->xlat_pcpu);
1785 }
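
/*
 * Cross-cpu xlat deletion, in brief (the path taken above together
 * with ipfw_xlat_free_dispatch() and ipfw_xlat_reap_dispatch()):
 *
 *   cpuM (master xlat)                  cpuS (slave xlat)
 *
 *   ipfw_state_del()
 *     mark both xlats invalid
 *     bump cross_refs/xlat_crefs
 *     netisr_sendmsg(cpuS) ----------> ipfw_xlat_free_dispatch()
 *                                        unlink slave, drop its cref
 *                                        crefs == 0?  reap the pair now
 *                                        else queue on ipfw_xlatreap and
 *                                        poll every 2 ticks until the
 *                                        last cref is dropped
 */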
1786 
1787 static void
1788 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1789 {
1790 
1791 	if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1792 		KKASSERT(IPFW_ISXLAT(s->st_type));
1793 		ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1794 		ipfw_state_unlink(ctx, s);
1795 		return;
1796 	}
1797 	ipfw_state_del(ctx, s);
1798 }
1799 
1800 static int
1801 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1802 {
1803 	struct ipfw_state *s, *anchor;
1804 	int expired;
1805 
1806 	if (reap_max < ipfw_state_reap_min)
1807 		reap_max = ipfw_state_reap_min;
1808 
1809 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1810 		/*
1811 		 * Kick start state expiring.  Ignore the scan limit;
1812 		 * we are short of states.
1813 		 */
1814 		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1815 		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1816 		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1817 		return (expired);
1818 	}
1819 
1820 	/*
1821 	 * States are being expired.
1822 	 */
1823 
1824 	if (ctx->ipfw_state_cnt == 0)
1825 		return (0);
1826 
1827 	expired = 0;
1828 	anchor = &ctx->ipfw_stateexp_anch;
1829 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1830 		/*
1831 		 * Ignore scan limit; we are short of states.
1832 		 */
1833 
1834 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1835 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1836 
1837 		if (IPFW_STATE_SCANSKIP(s))
1838 			continue;
1839 
1840 		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
1841 			ipfw_state_del(ctx, s);
1842 			if (++expired >= reap_max)
1843 				break;
1844 			if ((expired & 0xff) == 0 &&
1845 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1846 			    ipfw_state_max)
1847 				break;
1848 		}
1849 	}
1850 	/*
1851 	 * NOTE:
1852 	 * Leave the anchor on the list, even if the end of the list has
1853 	 * been reached.  ipfw_state_expire_more_dispatch() will handle
1854 	 * the removal.
1855 	 */
1856 	return (expired);
1857 }
1858 
1859 static void
1860 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1861 {
1862 	struct ipfw_state *s, *sn;
1863 
1864 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1865 		if (IPFW_STATE_SCANSKIP(s))
1866 			continue;
1867 		if (rule != NULL && s->st_rule != rule)
1868 			continue;
1869 		ipfw_state_del(ctx, s);
1870 	}
1871 }
1872 
1873 static void
1874 ipfw_state_expire_done(struct ipfw_context *ctx)
1875 {
1876 
1877 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1878 	    ("stateexp is not in progress"));
1879 	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1880 	callout_reset(&ctx->ipfw_stateto_ch, hz,
1881 	    ipfw_state_expire_ipifunc, NULL);
1882 }
1883 
1884 static void
1885 ipfw_state_expire_more(struct ipfw_context *ctx)
1886 {
1887 	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1888 
1889 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1890 	    ("stateexp is not in progress"));
1891 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1892 	    ("stateexp more did not finish"));
1893 	netisr_sendmsg_oncpu(nm);
1894 }
1895 
1896 static int
1897 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1898     int scan_max, int expire_max)
1899 {
1900 	struct ipfw_state *s;
1901 	int scanned = 0, expired = 0;
1902 
1903 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1904 	    ("stateexp is not in progress"));
1905 
1906 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1907 		if (scanned++ >= scan_max) {
1908 			ipfw_state_expire_more(ctx);
1909 			return (expired);
1910 		}
1911 
1912 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1913 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1914 
1915 		if (IPFW_STATE_SCANSKIP(s))
1916 			continue;
1917 
1918 		if (IPFW_STATE_ISDEAD(s) ||
1919 		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1920 		     IPFW_STATE_TCPCLOSED(s))) {
1921 			ipfw_state_del(ctx, s);
1922 			if (++expired >= expire_max) {
1923 				ipfw_state_expire_more(ctx);
1924 				return (expired);
1925 			}
1926 			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1927 			    (expired & 0xff) == 0 &&
1928 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1929 			    ipfw_state_max) {
1930 				ipfw_state_expire_more(ctx);
1931 				return (expired);
1932 			}
1933 		}
1934 	}
1935 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1936 	ipfw_state_expire_done(ctx);
1937 	return (expired);
1938 }
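
/*
 * The anchor walk above lets expiration run in bounded slices: the
 * anchor is a dummy list element that is advanced past every state
 * examined, so when a slice hits scan_max or expire_max,
 * ipfw_state_expire_more() resends a netmsg to this cpu and the next
 * dispatch resumes the scan right after the anchor instead of
 * rescanning from the head of the list.
 */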
1939 
1940 static void
1941 ipfw_state_expire_more_dispatch(netmsg_t nm)
1942 {
1943 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1944 	struct ipfw_state *anchor;
1945 
1946 	ASSERT_NETISR_NCPUS(mycpuid);
1947 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1948 	    ("stateexp is not in progress"));
1949 
1950 	/* Reply ASAP */
1951 	netisr_replymsg(&nm->base, 0);
1952 
1953 	anchor = &ctx->ipfw_stateexp_anch;
1954 	if (ctx->ipfw_state_cnt == 0) {
1955 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1956 		ipfw_state_expire_done(ctx);
1957 		return;
1958 	}
1959 	ipfw_state_expire_loop(ctx, anchor,
1960 	    ipfw_state_scan_max, ipfw_state_expire_max);
1961 }
1962 
1963 static int
1964 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1965 {
1966 	struct ipfw_state *anchor;
1967 
1968 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1969 	    ("stateexp is in progress"));
1970 	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1971 
1972 	if (ctx->ipfw_state_cnt == 0) {
1973 		ipfw_state_expire_done(ctx);
1974 		return (0);
1975 	}
1976 
1977 	/*
1978 	 * Do not expire more than once per second; it is useless.
1979 	 */
1980 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1981 	    ctx->ipfw_state_lastexp == time_uptime) {
1982 		ipfw_state_expire_done(ctx);
1983 		return (0);
1984 	}
1985 	ctx->ipfw_state_lastexp = time_uptime;
1986 
1987 	anchor = &ctx->ipfw_stateexp_anch;
1988 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1989 	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1990 }
1991 
1992 static void
1993 ipfw_state_expire_dispatch(netmsg_t nm)
1994 {
1995 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1996 
1997 	ASSERT_NETISR_NCPUS(mycpuid);
1998 
1999 	/* Reply ASAP */
2000 	crit_enter();
2001 	netisr_replymsg(&nm->base, 0);
2002 	crit_exit();
2003 
2004 	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2005 		/* Running; done. */
2006 		return;
2007 	}
2008 	ipfw_state_expire_start(ctx,
2009 	    ipfw_state_scan_max, ipfw_state_expire_max);
2010 }
2011 
2012 static void
2013 ipfw_state_expire_ipifunc(void *dummy __unused)
2014 {
2015 	struct netmsg_base *msg;
2016 
2017 	KKASSERT(mycpuid < netisr_ncpus);
2018 	msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2019 
2020 	crit_enter();
2021 	if (msg->lmsg.ms_flags & MSGF_DONE)
2022 		netisr_sendmsg_oncpu(msg);
2023 	crit_exit();
2024 }
2025 
2026 static boolean_t
2027 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2028 {
2029 	uint32_t seq = ntohl(tcp->th_seq);
2030 	uint32_t ack = ntohl(tcp->th_ack);
2031 
2032 	if (tcp->th_flags & TH_RST)
2033 		return (TRUE);
2034 
2035 	if (dir == MATCH_FORWARD) {
2036 		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2037 			s->st_flags |= IPFW_STATE_F_SEQFWD;
2038 			s->st_seq_fwd = seq;
2039 		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2040 			s->st_seq_fwd = seq;
2041 		} else {
2042 			/* Out-of-sequence; done. */
2043 			return (FALSE);
2044 		}
2045 		if (tcp->th_flags & TH_ACK) {
2046 			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2047 				s->st_flags |= IPFW_STATE_F_ACKFWD;
2048 				s->st_ack_fwd = ack;
2049 			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2050 				s->st_ack_fwd = ack;
2051 			} else {
2052 				/* Out-of-sequence; done. */
2053 				return (FALSE);
2054 			}
2055 
2056 			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2057 			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2058 				s->st_state |= (TH_ACK << 8);
2059 		}
2060 	} else {
2061 		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2062 			s->st_flags |= IPFW_STATE_F_SEQREV;
2063 			s->st_seq_rev = seq;
2064 		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2065 			s->st_seq_rev = seq;
2066 		} else {
2067 			/* Out-of-sequence; done. */
2068 			return (FALSE);
2069 		}
2070 		if (tcp->th_flags & TH_ACK) {
2071 			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2072 				s->st_flags |= IPFW_STATE_F_ACKREV;
2073 				s->st_ack_rev = ack;
2074 			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2075 				s->st_ack_rev = ack;
2076 			} else {
2077 				/* Out-of-sequence; done. */
2078 				return (FALSE);
2079 			}
2080 
2081 			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2082 			    s->st_ack_rev == s->st_seq_fwd + 1)
2083 				s->st_state |= TH_ACK;
2084 		}
2085 	}
2086 	return (TRUE);
2087 }
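
/*
 * NOTE:
 * st_state encodes the forward-direction TCP flags in its low byte and
 * the reverse-direction flags shifted left by 8.  The checks above
 * additionally require seq/ack to advance monotonically (SEQ_GEQ) in
 * each direction and return FALSE otherwise, so stale retransmissions
 * do not feed the flag/lifetime update in ipfw_state_update().
 */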
2088 
2089 static void
2090 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2091     const struct tcphdr *tcp, struct ipfw_state *s)
2092 {
2093 
2094 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2095 		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2096 
2097 		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2098 			return;
2099 
2100 		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2101 		switch (s->st_state & IPFW_STATE_TCPSTATES) {
2102 		case TH_SYN:				/* opening */
2103 			s->st_expire = time_uptime + dyn_syn_lifetime;
2104 			break;
2105 
2106 		case BOTH_SYN:			/* move to established */
2107 		case BOTH_SYN | TH_FIN:		/* one side tries to close */
2108 		case BOTH_SYN | (TH_FIN << 8):
2109 			s->st_expire = time_uptime + dyn_ack_lifetime;
2110 			break;
2111 
2112 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
2113 			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2114 				/* And both FINs were ACKed. */
2115 				s->st_expire = time_uptime + dyn_fin_lifetime;
2116 			} else {
2117 				s->st_expire = time_uptime +
2118 				    dyn_finwait_lifetime;
2119 			}
2120 			break;
2121 
2122 		default:
2123 #if 0
2124 			/*
2125 			 * reset or some invalid combination, but can also
2126 			 * occur if we use keep-state the wrong way.
2127 			 */
2128 			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2129 				kprintf("invalid state: 0x%x\n", s->st_state);
2130 #endif
2131 			s->st_expire = time_uptime + dyn_rst_lifetime;
2132 			break;
2133 		}
2134 	} else if (pkt->proto == IPPROTO_UDP) {
2135 		s->st_expire = time_uptime + dyn_udp_lifetime;
2136 	} else {
2137 		/* other protocols */
2138 		s->st_expire = time_uptime + dyn_short_lifetime;
2139 	}
2140 }
2141 
2142 /*
2143  * Look up a state.
2144  */
2145 static struct ipfw_state *
2146 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2147     int *match_direction, const struct tcphdr *tcp)
2148 {
2149 	struct ipfw_state *key, *s;
2150 	int dir = MATCH_NONE;
2151 
2152 	key = &ctx->ipfw_state_tmpkey;
2153 	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2154 	    pkt->dst_ip, pkt->dst_port, pkt->proto);
2155 	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2156 	if (s == NULL)
2157 		goto done; /* not found. */
2158 	if (IPFW_STATE_ISDEAD(s)) {
2159 		ipfw_state_remove(ctx, s);
2160 		s = NULL;
2161 		goto done;
2162 	}
2163 	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2164 		/* TCP ports recycling is too fast. */
2165 		ctx->ipfw_sts_tcprecycled++;
2166 		ipfw_state_remove(ctx, s);
2167 		s = NULL;
2168 		goto done;
2169 	}
2170 
2171 	if (s->st_swap == key->st_swap) {
2172 		dir = MATCH_FORWARD;
2173 	} else {
2174 		KASSERT((s->st_swap & key->st_swap) == 0,
2175 		    ("found mismatch state"));
2176 		dir = MATCH_REVERSE;
2177 	}
2178 
2179 	/* Update this state. */
2180 	ipfw_state_update(pkt, dir, tcp, s);
2181 
2182 	if (s->st_track != NULL) {
2183 		/* This track has been used. */
2184 		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2185 	}
2186 done:
2187 	if (match_direction)
2188 		*match_direction = dir;
2189 	return (s);
2190 }
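
/*
 * Direction resolution above presumably works as follows:
 * ipfw_key_build() stores the two address/port pairs in a canonical
 * order and records in st_swap which halves were exchanged to get
 * there, so an identical swap mask means the packet flows in the same
 * direction as the packet that created the state (MATCH_FORWARD) and
 * a disjoint mask means the reply direction (MATCH_REVERSE).
 */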
2191 
2192 static struct ipfw_state *
2193 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2194     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2195 {
2196 	struct ipfw_state *s;
2197 	size_t sz;
2198 
2199 	KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2200 	    ("invalid state type %u", type));
2201 
2202 	sz = sizeof(struct ipfw_state);
2203 	if (IPFW_ISXLAT(type))
2204 		sz = sizeof(struct ipfw_xlat);
2205 
2206 	s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2207 	if (s == NULL) {
2208 		ctx->ipfw_sts_nomem++;
2209 		return (NULL);
2210 	}
2211 
2212 	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2213 	    id->dst_ip, id->dst_port, id->proto);
2214 
2215 	s->st_rule = rule;
2216 	s->st_type = type;
2217 	if (IPFW_ISXLAT(type)) {
2218 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2219 
2220 		x->xlat_dir = MATCH_NONE;
2221 		x->xlat_pcpu = -1;
2222 	}
2223 
2224 	/*
2225 	 * Update this state:
2226 	 * Set st_expire and st_state.
2227 	 */
2228 	ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2229 
2230 	return (s);
2231 }
2232 
2233 static struct ipfw_state *
2234 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2235     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2236     const struct tcphdr *tcp)
2237 {
2238 	struct ipfw_state *s, *dup;
2239 
2240 	s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2241 	if (s == NULL)
2242 		return (NULL);
2243 
2244 	ctx->ipfw_state_cnt++;
2245 	ctx->ipfw_state_loosecnt++;
2246 	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2247 		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2248 		ctx->ipfw_state_loosecnt = 0;
2249 	}
2250 
2251 	dup = ipfw_state_link(ctx, s);
2252 	if (dup != NULL)
2253 		panic("ipfw: %u state exists %p", type, dup);
2254 
2255 	if (t != NULL) {
2256 		/* Keep the track referenced. */
2257 		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2258 		s->st_track = t;
2259 	}
2260 	return (s);
2261 }
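
/*
 * ipfw_state_loosecnt is a per-cpu shard of the global state count:
 * it is folded into ipfw_gd.ipfw_state_loosecnt only after reaching
 * ipfw_state_loosecnt_updthr, so the limit check in
 * ipfw_state_install() rarely touches shared memory, at the cost of
 * the global count lagging by up to the update threshold per cpu.
 */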
2262 
2263 static boolean_t
2264 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2265 {
2266 	struct ipfw_trkcnt *trk;
2267 	boolean_t trk_freed = FALSE;
2268 
2269 	KASSERT(t->t_count != NULL, ("track anchor"));
2270 	KASSERT(LIST_EMPTY(&t->t_state_list),
2271 	    ("invalid track is still referenced"));
2272 
2273 	trk = t->t_trkcnt;
2274 	KASSERT(trk != NULL, ("track has no trkcnt"));
2275 
2276 	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2277 	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2278 	kfree(t, M_IPFW);
2279 
2280 	/*
2281 	 * fdrop() style reference counting.
2282 	 * See kern/kern_descrip.c fdrop().
2283 	 */
2284 	for (;;) {
2285 		int refs = trk->tc_refs;
2286 
2287 		cpu_ccfence();
2288 		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2289 		if (refs == 1) {
2290 			IPFW_TRKCNT_TOKGET;
2291 			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2292 				KASSERT(trk->tc_count == 0,
2293 				    ("%d states reference this trkcnt",
2294 				     trk->tc_count));
2295 				RB_REMOVE(ipfw_trkcnt_tree,
2296 				    &ipfw_gd.ipfw_trkcnt_tree, trk);
2297 
2298 				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2299 				    ("invalid trkcnt cnt %d",
2300 				     ipfw_gd.ipfw_trkcnt_cnt));
2301 				ipfw_gd.ipfw_trkcnt_cnt--;
2302 				IPFW_TRKCNT_TOKREL;
2303 
2304 				if (ctx->ipfw_trkcnt_spare == NULL)
2305 					ctx->ipfw_trkcnt_spare = trk;
2306 				else
2307 					kfree(trk, M_IPFW);
2308 				trk_freed = TRUE;
2309 				break; /* done! */
2310 			}
2311 			IPFW_TRKCNT_TOKREL;
2312 			/* retry */
2313 		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2314 			break; /* done! */
2315 		}
2316 		/* retry */
2317 	}
2318 	return (trk_freed);
2319 }
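
/*
 * The loop above is the usual lock-free "last one out cleans up"
 * reference drop; schematically:
 *
 *	for (;;) {
 *		refs = trk->tc_refs;
 *		if (refs == 1) {
 *			take trkcnt token;
 *			if (cmpset(&trk->tc_refs, 1, 0))
 *				remove from global tree, free, done;
 *			release token and retry;	lost the 1->0 race
 *		} else if (cmpset(&trk->tc_refs, refs, refs - 1))
 *			done;			dropped a non-final ref
 *	}
 *
 * Only the final 1 -> 0 transition takes the trkcnt token, since only
 * it mutates the shared ipfw_trkcnt_tree.
 */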
2320 
2321 static void
2322 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2323 {
2324 	struct ipfw_track *t, *tn;
2325 
2326 	TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2327 		if (t->t_count == NULL) /* anchor */
2328 			continue;
2329 		if (rule != NULL && t->t_rule != rule)
2330 			continue;
2331 		ipfw_track_free(ctx, t);
2332 	}
2333 }
2334 
2335 static boolean_t
2336 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2337     boolean_t reap)
2338 {
2339 	struct ipfw_state *s, *sn;
2340 	boolean_t ret = FALSE;
2341 
2342 	KASSERT(t->t_count != NULL, ("track anchor"));
2343 
2344 	if (LIST_EMPTY(&t->t_state_list))
2345 		return (FALSE);
2346 
2347 	/*
2348 	 * Do not expire more than once per second; it is useless.
2349 	 */
2350 	if (t->t_lastexp == time_uptime)
2351 		return (FALSE);
2352 	t->t_lastexp = time_uptime;
2353 
2354 	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2355 		if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2356 			KASSERT(s->st_track == t,
2357 			    ("state track %p does not match %p",
2358 			     s->st_track, t));
2359 			ipfw_state_del(ctx, s);
2360 			ret = TRUE;
2361 		}
2362 	}
2363 	return (ret);
2364 }
2365 
2366 static __inline struct ipfw_trkcnt *
2367 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2368 {
2369 	struct ipfw_trkcnt *trk;
2370 
2371 	if (ctx->ipfw_trkcnt_spare != NULL) {
2372 		trk = ctx->ipfw_trkcnt_spare;
2373 		ctx->ipfw_trkcnt_spare = NULL;
2374 	} else {
2375 		trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2376 		    M_INTWAIT | M_NULLOK);
2377 	}
2378 	return (trk);
2379 }
2380 
2381 static void
2382 ipfw_track_expire_done(struct ipfw_context *ctx)
2383 {
2384 
2385 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2386 	    ("trackexp is not in progress"));
2387 	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2388 	callout_reset(&ctx->ipfw_trackto_ch, hz,
2389 	    ipfw_track_expire_ipifunc, NULL);
2390 }
2391 
2392 static void
2393 ipfw_track_expire_more(struct ipfw_context *ctx)
2394 {
2395 	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2396 
2397 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2398 	    ("trackexp is not in progress"));
2399 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2400 	    ("trackexp more did not finish"));
2401 	netisr_sendmsg_oncpu(nm);
2402 }
2403 
2404 static int
2405 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2406     int scan_max, int expire_max)
2407 {
2408 	struct ipfw_track *t;
2409 	int scanned = 0, expired = 0;
2410 	boolean_t reap = FALSE;
2411 
2412 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2413 	    ("trackexp is not in progress"));
2414 
2415 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2416 		reap = TRUE;
2417 
2418 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2419 		if (scanned++ >= scan_max) {
2420 			ipfw_track_expire_more(ctx);
2421 			return (expired);
2422 		}
2423 
2424 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2425 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2426 
2427 		if (t->t_count == NULL) /* anchor */
2428 			continue;
2429 
2430 		ipfw_track_state_expire(ctx, t, reap);
2431 		if (!LIST_EMPTY(&t->t_state_list)) {
2432 			/* There are states referencing this track. */
2433 			continue;
2434 		}
2435 
2436 		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2437 			/* Expired. */
2438 			if (ipfw_track_free(ctx, t)) {
2439 				if (++expired >= expire_max) {
2440 					ipfw_track_expire_more(ctx);
2441 					return (expired);
2442 				}
2443 			}
2444 		}
2445 	}
2446 	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2447 	ipfw_track_expire_done(ctx);
2448 	return (expired);
2449 }
2450 
2451 static int
2452 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2453 {
2454 	struct ipfw_track *anchor;
2455 
2456 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2457 	    ("trackexp is in progress"));
2458 	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2459 
2460 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2461 		ipfw_track_expire_done(ctx);
2462 		return (0);
2463 	}
2464 
2465 	/*
2466 	 * Do not expire more than once per second; it is useless.
2467 	 */
2468 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2469 	    ctx->ipfw_track_lastexp == time_uptime) {
2470 		ipfw_track_expire_done(ctx);
2471 		return (0);
2472 	}
2473 	ctx->ipfw_track_lastexp = time_uptime;
2474 
2475 	anchor = &ctx->ipfw_trackexp_anch;
2476 	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2477 	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2478 }
2479 
2480 static void
2481 ipfw_track_expire_more_dispatch(netmsg_t nm)
2482 {
2483 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2484 	struct ipfw_track *anchor;
2485 
2486 	ASSERT_NETISR_NCPUS(mycpuid);
2487 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2488 	    ("trackexp is not in progress"));
2489 
2490 	/* Reply ASAP */
2491 	netisr_replymsg(&nm->base, 0);
2492 
2493 	anchor = &ctx->ipfw_trackexp_anch;
2494 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2495 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2496 		ipfw_track_expire_done(ctx);
2497 		return;
2498 	}
2499 	ipfw_track_expire_loop(ctx, anchor,
2500 	    ipfw_track_scan_max, ipfw_track_expire_max);
2501 }
2502 
2503 static void
2504 ipfw_track_expire_dispatch(netmsg_t nm)
2505 {
2506 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2507 
2508 	ASSERT_NETISR_NCPUS(mycpuid);
2509 
2510 	/* Reply ASAP */
2511 	crit_enter();
2512 	netisr_replymsg(&nm->base, 0);
2513 	crit_exit();
2514 
2515 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2516 		/* Running; done. */
2517 		return;
2518 	}
2519 	ipfw_track_expire_start(ctx,
2520 	    ipfw_track_scan_max, ipfw_track_expire_max);
2521 }
2522 
2523 static void
2524 ipfw_track_expire_ipifunc(void *dummy __unused)
2525 {
2526 	struct netmsg_base *msg;
2527 
2528 	KKASSERT(mycpuid < netisr_ncpus);
2529 	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2530 
2531 	crit_enter();
2532 	if (msg->lmsg.ms_flags & MSGF_DONE)
2533 		netisr_sendmsg_oncpu(msg);
2534 	crit_exit();
2535 }
2536 
2537 static int
2538 ipfw_track_reap(struct ipfw_context *ctx)
2539 {
2540 	struct ipfw_track *t, *anchor;
2541 	int expired;
2542 
2543 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2544 		/*
2545 		 * Kick start track expiring.  Ignore the scan limit;
2546 		 * we are short of tracks.
2547 		 */
2548 		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2549 		expired = ipfw_track_expire_start(ctx, INT_MAX,
2550 		    ipfw_track_reap_max);
2551 		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2552 		return (expired);
2553 	}
2554 
2555 	/*
2556 	 * Tracks are being expired.
2557 	 */
2558 
2559 	if (RB_EMPTY(&ctx->ipfw_track_tree))
2560 		return (0);
2561 
2562 	expired = 0;
2563 	anchor = &ctx->ipfw_trackexp_anch;
2564 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2565 		/*
2566 		 * Ignore scan limit; we are short of tracks.
2567 		 */
2568 
2569 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2570 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2571 
2572 		if (t->t_count == NULL) /* anchor */
2573 			continue;
2574 
2575 		ipfw_track_state_expire(ctx, t, TRUE);
2576 		if (!LIST_EMPTY(&t->t_state_list)) {
2577 			/* There are states referencing this track. */
2578 			continue;
2579 		}
2580 
2581 		if (ipfw_track_free(ctx, t)) {
2582 			if (++expired >= ipfw_track_reap_max) {
2583 				ipfw_track_expire_more(ctx);
2584 				break;
2585 			}
2586 		}
2587 	}
2588 	/*
2589 	 * NOTE:
2590 	 * Leave the anchor on the list, even if the end of the list has
2591 	 * been reached.  ipfw_track_expire_more_dispatch() will handle
2592 	 * the removal.
2593 	 */
2594 	return (expired);
2595 }
2596 
2597 static struct ipfw_track *
2598 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2599     uint16_t limit_mask, struct ip_fw *rule)
2600 {
2601 	struct ipfw_track *key, *t, *dup;
2602 	struct ipfw_trkcnt *trk, *ret;
2603 	boolean_t do_expire = FALSE;
2604 
2605 	KASSERT(rule->track_ruleid != 0,
2606 	    ("rule %u has no track ruleid", rule->rulenum));
2607 
2608 	key = &ctx->ipfw_track_tmpkey;
2609 	key->t_proto = id->proto;
2610 	key->t_addrs = 0;
2611 	key->t_ports = 0;
2612 	key->t_rule = rule;
2613 	if (limit_mask & DYN_SRC_ADDR)
2614 		key->t_saddr = id->src_ip;
2615 	if (limit_mask & DYN_DST_ADDR)
2616 		key->t_daddr = id->dst_ip;
2617 	if (limit_mask & DYN_SRC_PORT)
2618 		key->t_sport = id->src_port;
2619 	if (limit_mask & DYN_DST_PORT)
2620 		key->t_dport = id->dst_port;
2621 
2622 	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2623 	if (t != NULL)
2624 		goto done;
2625 
2626 	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2627 	if (t == NULL) {
2628 		ctx->ipfw_tks_nomem++;
2629 		return (NULL);
2630 	}
2631 
2632 	t->t_key = key->t_key;
2633 	t->t_rule = rule;
2634 	t->t_lastexp = 0;
2635 	LIST_INIT(&t->t_state_list);
2636 
2637 	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2638 		time_t globexp, uptime;
2639 
2640 		trk = NULL;
2641 		do_expire = TRUE;
2642 
2643 		/*
2644 		 * Do not expire globally more than once per second;
2645 		 * it is useless.
2646 		 */
2647 		uptime = time_uptime;
2648 		globexp = ipfw_gd.ipfw_track_globexp;
2649 		if (globexp != uptime &&
2650 		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2651 		    globexp, uptime)) {
2652 			int cpu;
2653 
2654 			/* Expire tracks on other CPUs. */
2655 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2656 				if (cpu == mycpuid)
2657 					continue;
2658 				lwkt_send_ipiq(globaldata_find(cpu),
2659 				    ipfw_track_expire_ipifunc, NULL);
2660 			}
2661 		}
2662 	} else {
2663 		trk = ipfw_trkcnt_alloc(ctx);
2664 	}
2665 	if (trk == NULL) {
2666 		struct ipfw_trkcnt *tkey;
2667 
2668 		tkey = &ctx->ipfw_trkcnt_tmpkey;
2669 		key = NULL; /* tkey overlaps key */
2670 
2671 		tkey->tc_key = t->t_key;
2672 		tkey->tc_ruleid = rule->track_ruleid;
2673 
2674 		IPFW_TRKCNT_TOKGET;
2675 		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2676 		    tkey);
2677 		if (trk == NULL) {
2678 			IPFW_TRKCNT_TOKREL;
2679 			if (do_expire) {
2680 				ctx->ipfw_tks_reap++;
2681 				if (ipfw_track_reap(ctx) > 0) {
2682 					if (ipfw_gd.ipfw_trkcnt_cnt <
2683 					    ipfw_track_max) {
2684 						trk = ipfw_trkcnt_alloc(ctx);
2685 						if (trk != NULL)
2686 							goto install;
2687 						ctx->ipfw_tks_cntnomem++;
2688 					} else {
2689 						ctx->ipfw_tks_overflow++;
2690 					}
2691 				} else {
2692 					ctx->ipfw_tks_reapfailed++;
2693 					ctx->ipfw_tks_overflow++;
2694 				}
2695 			} else {
2696 				ctx->ipfw_tks_cntnomem++;
2697 			}
2698 			kfree(t, M_IPFW);
2699 			return (NULL);
2700 		}
2701 		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2702 		    ("invalid trkcnt refs %d", trk->tc_refs));
2703 		atomic_add_int(&trk->tc_refs, 1);
2704 		IPFW_TRKCNT_TOKREL;
2705 	} else {
2706 install:
2707 		trk->tc_key = t->t_key;
2708 		trk->tc_ruleid = rule->track_ruleid;
2709 		trk->tc_refs = 0;
2710 		trk->tc_count = 0;
2711 		trk->tc_expire = 0;
2712 		trk->tc_rulenum = rule->rulenum;
2713 
2714 		IPFW_TRKCNT_TOKGET;
2715 		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2716 		    trk);
2717 		if (ret != NULL) {
2718 			KASSERT(ret->tc_refs > 0 &&
2719 			    ret->tc_refs < netisr_ncpus,
2720 			    ("invalid trkcnt refs %d", ret->tc_refs));
2721 			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2722 			    ("trkcnt spare was installed"));
2723 			ctx->ipfw_trkcnt_spare = trk;
2724 			trk = ret;
2725 		} else {
2726 			ipfw_gd.ipfw_trkcnt_cnt++;
2727 		}
2728 		atomic_add_int(&trk->tc_refs, 1);
2729 		IPFW_TRKCNT_TOKREL;
2730 	}
2731 	t->t_count = &trk->tc_count;
2732 	t->t_trkcnt = trk;
2733 
2734 	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2735 	if (dup != NULL)
2736 		panic("ipfw: track exists");
2737 	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2738 done:
2739 	t->t_expire = time_uptime + dyn_short_lifetime;
2740 	return (t);
2741 }
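
/*
 * Track vs. trkcnt: every cpu keeps its own ipfw_track in
 * ctx->ipfw_track_tree, but all cpus limiting the same flow share a
 * single ipfw_trkcnt in the global ipfw_gd.ipfw_trkcnt_tree.  Since
 * t->t_count points at the shared tc_count, O_LIMIT is enforced
 * across all netisr cpus with one atomic counter, while lookups stay
 * per-cpu and token-free in the common case.
 */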
2742 
2743 /*
2744  * Install state for rule type cmd->o.opcode
2745  *
2746  * Returns NULL if state is not installed because of errors or because
2747  * state limits are enforced.
2748  */
2749 static struct ipfw_state *
2750 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2751     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2752 {
2753 	struct ipfw_state *s;
2754 	struct ipfw_track *t;
2755 	int count, diff;
2756 
2757 	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2758 	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2759 		boolean_t overflow = TRUE;
2760 
2761 		ctx->ipfw_sts_reap++;
2762 		if (ipfw_state_reap(ctx, diff) == 0)
2763 			ctx->ipfw_sts_reapfailed++;
2764 		if (ipfw_state_cntsync() < ipfw_state_max)
2765 			overflow = FALSE;
2766 
2767 		if (overflow) {
2768 			time_t globexp, uptime;
2769 			int cpu;
2770 
2771 			/*
2772 			 * Do not expire globally more than once per second;
2773 			 * it is useless.
2774 			 */
2775 			uptime = time_uptime;
2776 			globexp = ipfw_gd.ipfw_state_globexp;
2777 			if (globexp == uptime ||
2778 			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2779 			    globexp, uptime)) {
2780 				ctx->ipfw_sts_overflow++;
2781 				return (NULL);
2782 			}
2783 
2784 			/* Expire states on other CPUs. */
2785 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2786 				if (cpu == mycpuid)
2787 					continue;
2788 				lwkt_send_ipiq(globaldata_find(cpu),
2789 				    ipfw_state_expire_ipifunc, NULL);
2790 			}
2791 			ctx->ipfw_sts_overflow++;
2792 			return (NULL);
2793 		}
2794 	}
2795 
2796 	switch (cmd->o.opcode) {
2797 	case O_KEEP_STATE: /* bidir rule */
2798 	case O_REDIRECT:
2799 		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2800 		    tcp);
2801 		if (s == NULL)
2802 			return (NULL);
2803 		break;
2804 
2805 	case O_LIMIT: /* limit number of sessions */
2806 		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2807 		if (t == NULL)
2808 			return (NULL);
2809 
2810 		if (*t->t_count >= cmd->conn_limit) {
2811 			if (!ipfw_track_state_expire(ctx, t, TRUE))
2812 				return (NULL);
2813 		}
2814 		for (;;) {
2815 			count = *t->t_count;
2816 			if (count >= cmd->conn_limit)
2817 				return (NULL);
2818 			if (atomic_cmpset_int(t->t_count, count, count + 1))
2819 				break;
2820 		}
2821 
2822 		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2823 		if (s == NULL) {
2824 			/* Undo damage. */
2825 			atomic_subtract_int(t->t_count, 1);
2826 			return (NULL);
2827 		}
2828 		break;
2829 
2830 	default:
2831 		panic("unknown state type %u\n", cmd->o.opcode);
2832 	}
2833 
2834 	if (s->st_type == O_REDIRECT) {
2835 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2836 		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2837 
2838 		x->xlat_addr = r->addr.s_addr;
2839 		x->xlat_port = r->port;
2840 		x->xlat_ifp = args->m->m_pkthdr.rcvif;
2841 		x->xlat_dir = MATCH_FORWARD;
2842 		KKASSERT(x->xlat_ifp != NULL);
2843 	}
2844 	return (s);
2845 }
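
/*
 * For reference, the two stateful flavors handled above correspond to
 * rules such as the following (illustrative ipfw(8) syntax):
 *
 *	ipfw add 1000 allow tcp from any to me setup keep-state
 *	ipfw add 2000 allow tcp from any to me setup limit src-addr 10
 *
 * The former installs a plain O_KEEP_STATE state; the latter first
 * finds or creates an O_LIMIT track keyed by src-addr and refuses new
 * states once the shared count reaches conn_limit.
 */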
2846 
2847 static int
2848 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2849     const struct in_addr *in)
2850 {
2851 	struct radix_node_head *rnh;
2852 	struct sockaddr_in sin;
2853 	struct ipfw_tblent *te;
2854 
2855 	KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2856 	rnh = ctx->ipfw_tables[tableid];
2857 	if (rnh == NULL)
2858 		return (0); /* no match */
2859 
2860 	memset(&sin, 0, sizeof(sin));
2861 	sin.sin_family = AF_INET;
2862 	sin.sin_len = sizeof(sin);
2863 	sin.sin_addr = *in;
2864 
2865 	te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2866 	if (te == NULL)
2867 		return (0); /* no match */
2868 
2869 	te->te_use++;
2870 	te->te_lastuse = time_second;
2871 	return (1); /* match */
2872 }
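
/*
 * The radix trees consulted here are filled from userland, e.g.
 * (illustrative):
 *
 *	ipfw table 0 add 192.168.0.0/16
 *	ipfw add 100 deny ip from table(0) to any
 *
 * rnh_matchaddr() performs a longest-prefix match, and a hit also
 * refreshes the entry's use counter and last-use timestamp.
 */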
2873 
2874 /*
2875  * Transmit a TCP packet, containing either a RST or a keepalive.
2876  * When flags & TH_RST, we are sending a RST packet, because a
2877  * "reset" action matched the packet.  Otherwise we are sending a
2878  * keepalive, and flags & TH_SYN selects its direction (see below).
2879  *
2880  * Only {src,dst}_{ip,port} of "id" are used.
2881  */
2882 static void
2883 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2884 {
2885 	struct mbuf *m;
2886 	struct ip *ip;
2887 	struct tcphdr *tcp;
2888 	struct route sro;	/* fake route */
2889 
2890 	MGETHDR(m, M_NOWAIT, MT_HEADER);
2891 	if (m == NULL)
2892 		return;
2893 	m->m_pkthdr.rcvif = NULL;
2894 	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2895 	m->m_data += max_linkhdr;
2896 
2897 	ip = mtod(m, struct ip *);
2898 	bzero(ip, m->m_len);
2899 	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2900 	ip->ip_p = IPPROTO_TCP;
2901 	tcp->th_off = 5;
2902 
2903 	/*
2904 	 * Assume we are sending a RST (or a keepalive in the reverse
2905 	 * direction); swap source and destination addresses and ports.
2906 	 */
2907 	ip->ip_src.s_addr = htonl(id->dst_ip);
2908 	ip->ip_dst.s_addr = htonl(id->src_ip);
2909 	tcp->th_sport = htons(id->dst_port);
2910 	tcp->th_dport = htons(id->src_port);
2911 	if (flags & TH_RST) {	/* we are sending a RST */
2912 		if (flags & TH_ACK) {
2913 			tcp->th_seq = htonl(ack);
2914 			tcp->th_ack = htonl(0);
2915 			tcp->th_flags = TH_RST;
2916 		} else {
2917 			if (flags & TH_SYN)
2918 				seq++;
2919 			tcp->th_seq = htonl(0);
2920 			tcp->th_ack = htonl(seq);
2921 			tcp->th_flags = TH_RST | TH_ACK;
2922 		}
2923 	} else {
2924 		/*
2925 		 * We are sending a keepalive. flags & TH_SYN determines
2926 		 * the direction, forward if set, reverse if clear.
2927 		 * NOTE: seq and ack are always assumed to be correct
2928 		 * as set by the caller. This may be confusing...
2929 		 */
2930 		if (flags & TH_SYN) {
2931 			/*
2932 			 * we have to rewrite the correct addresses!
2933 			 */
2934 			ip->ip_dst.s_addr = htonl(id->dst_ip);
2935 			ip->ip_src.s_addr = htonl(id->src_ip);
2936 			tcp->th_dport = htons(id->dst_port);
2937 			tcp->th_sport = htons(id->src_port);
2938 		}
2939 		tcp->th_seq = htonl(seq);
2940 		tcp->th_ack = htonl(ack);
2941 		tcp->th_flags = TH_ACK;
2942 	}
2943 
2944 	/*
2945 	 * set ip_len to the payload size so we can compute
2946 	 * the tcp checksum on the pseudoheader
2947 	 * XXX check this, could save a couple of words?
2948 	 */
2949 	ip->ip_len = htons(sizeof(struct tcphdr));
2950 	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2951 
2952 	/*
2953 	 * now fill fields left out earlier
2954 	 */
2955 	ip->ip_ttl = ip_defttl;
2956 	ip->ip_len = m->m_pkthdr.len;
2957 
2958 	bzero(&sro, sizeof(sro));
2959 	ip_rtaddr(ip->ip_dst, &sro);
2960 
2961 	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2962 	ip_output(m, NULL, &sro, 0, NULL, NULL);
2963 	if (sro.ro_rt)
2964 		RTFREE(sro.ro_rt);
2965 }
2966 
2967 /*
2968  * Send a reject message, consuming the mbuf passed as an argument.
2969  */
2970 static void
2971 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2972 {
2973 	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2974 		/* We need the IP header in host order for icmp_error(). */
2975 		if (args->eh != NULL) {
2976 			struct ip *ip = mtod(args->m, struct ip *);
2977 
2978 			ip->ip_len = ntohs(ip->ip_len);
2979 			ip->ip_off = ntohs(ip->ip_off);
2980 		}
2981 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2982 	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2983 		struct tcphdr *const tcp =
2984 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2985 
2986 		if ((tcp->th_flags & TH_RST) == 0) {
2987 			send_pkt(&args->f_id, ntohl(tcp->th_seq),
2988 				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2989 		}
2990 		m_freem(args->m);
2991 	} else {
2992 		m_freem(args->m);
2993 	}
2994 	args->m = NULL;
2995 }
2996 
2997 /*
2998  * Given an ip_fw *, lookup_next_rule will return a pointer
2999  * to the next rule, which can be either the jump
3000  * target (for skipto instructions) or the next one in the list (in
3001  * all other cases including a missing jump target).
3002  * The result is also written in the "next_rule" field of the rule.
3003  * Backward jumps are not allowed, so start looking from the next
3004  * rule...
3005  *
3006  * This never returns NULL -- in case we do not have an exact match,
3007  * the next rule is returned. When the ruleset is changed,
3008  * pointers are flushed so we are always correct.
3009  */
3010 static struct ip_fw *
3011 lookup_next_rule(struct ip_fw *me)
3012 {
3013 	struct ip_fw *rule = NULL;
3014 	ipfw_insn *cmd;
3015 
3016 	/* look for action, in case it is a skipto */
3017 	cmd = ACTION_PTR(me);
3018 	if (cmd->opcode == O_LOG)
3019 		cmd += F_LEN(cmd);
3020 	if (cmd->opcode == O_SKIPTO) {
3021 		for (rule = me->next; rule; rule = rule->next) {
3022 			if (rule->rulenum >= cmd->arg1)
3023 				break;
3024 		}
3025 	}
3026 	if (rule == NULL)			/* failure or not a skipto */
3027 		rule = me->next;
3028 	me->next_rule = rule;
3029 	return rule;
3030 }
3031 
3032 static int
3033 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3034 		enum ipfw_opcodes opcode, uid_t uid)
3035 {
3036 	struct in_addr src_ip, dst_ip;
3037 	struct inpcbinfo *pi;
3038 	boolean_t wildcard;
3039 	struct inpcb *pcb;
3040 
3041 	if (fid->proto == IPPROTO_TCP) {
3042 		wildcard = FALSE;
3043 		pi = &tcbinfo[mycpuid];
3044 	} else if (fid->proto == IPPROTO_UDP) {
3045 		wildcard = TRUE;
3046 		pi = &udbinfo[mycpuid];
3047 	} else {
3048 		return 0;
3049 	}
3050 
3051 	/*
3052 	 * Values in 'fid' are in host byte order
3053 	 */
3054 	dst_ip.s_addr = htonl(fid->dst_ip);
3055 	src_ip.s_addr = htonl(fid->src_ip);
3056 	if (oif) {
3057 		pcb = in_pcblookup_hash(pi,
3058 			dst_ip, htons(fid->dst_port),
3059 			src_ip, htons(fid->src_port),
3060 			wildcard, oif);
3061 	} else {
3062 		pcb = in_pcblookup_hash(pi,
3063 			src_ip, htons(fid->src_port),
3064 			dst_ip, htons(fid->dst_port),
3065 			wildcard, NULL);
3066 	}
3067 	if (pcb == NULL || pcb->inp_socket == NULL)
3068 		return 0;
3069 
3070 	if (opcode == O_UID) {
3071 #define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
3072 		return !socheckuid(pcb->inp_socket, uid);
3073 #undef socheckuid
3074 	} else {
3075 		return groupmember(uid, pcb->inp_socket->so_cred);
3076 	}
3077 }
3078 
3079 static int
3080 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3081 {
3082 
3083 	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3084 		struct ifaddr_container *ifac;
3085 		struct ifnet *ifp;
3086 
3087 		ifp = ifunit_netisr(cmd->ifname);
3088 		if (ifp == NULL)
3089 			return (0);
3090 
3091 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3092 			struct ifaddr *ia = ifac->ifa;
3093 
3094 			if (ia->ifa_addr == NULL)
3095 				continue;
3096 			if (ia->ifa_addr->sa_family != AF_INET)
3097 				continue;
3098 
3099 			cmd->mask.s_addr = INADDR_ANY;
3100 			if (cmd->o.arg1 & IPFW_IFIP_NET) {
3101 				cmd->mask = ((struct sockaddr_in *)
3102 				    ia->ifa_netmask)->sin_addr;
3103 			}
3104 			if (cmd->mask.s_addr == INADDR_ANY)
3105 				cmd->mask.s_addr = INADDR_BROADCAST;
3106 
3107 			cmd->addr =
3108 			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3109 			cmd->addr.s_addr &= cmd->mask.s_addr;
3110 
3111 			cmd->o.arg1 |= IPFW_IFIP_VALID;
3112 			break;
3113 		}
3114 		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3115 			return (0);
3116 	}
3117 	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3118 }
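
/*
 * The first packet to hit an interface-address match lazily snapshots
 * the interface's address (and netmask, for IPFW_IFIP_NET) into the
 * instruction and sets IPFW_IFIP_VALID, so later packets only compare
 * against the cached copy; the flag is presumably cleared elsewhere
 * when interface addresses change, otherwise the cache would go stale.
 */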
3119 
3120 static void
3121 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3122     struct in_addr *old_addr, uint16_t *old_port)
3123 {
3124 	struct ip *ip = mtod(m, struct ip *);
3125 	struct in_addr *addr;
3126 	uint16_t *port, *csum, dlen = 0;
3127 	uint8_t udp = 0;
3128 	boolean_t pseudo = FALSE;
3129 
3130 	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3131 		addr = &ip->ip_src;
3132 		switch (ip->ip_p) {
3133 		case IPPROTO_TCP:
3134 			port = &L3HDR(struct tcphdr, ip)->th_sport;
3135 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3136 			break;
3137 		case IPPROTO_UDP:
3138 			port = &L3HDR(struct udphdr, ip)->uh_sport;
3139 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3140 			udp = 1;
3141 			break;
3142 		default:
3143 			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3144 		}
3145 	} else {
3146 		addr = &ip->ip_dst;
3147 		switch (ip->ip_p) {
3148 		case IPPROTO_TCP:
3149 			port = &L3HDR(struct tcphdr, ip)->th_dport;
3150 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3151 			break;
3152 		case IPPROTO_UDP:
3153 			port = &L3HDR(struct udphdr, ip)->uh_dport;
3154 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3155 			udp = 1;
3156 			break;
3157 		default:
3158 			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3159 		}
3160 	}
3161 	if (old_addr != NULL)
3162 		*old_addr = *addr;
3163 	if (old_port != NULL) {
3164 		if (x->xlat_port != 0)
3165 			*old_port = *port;
3166 		else
3167 			*old_port = 0;
3168 	}
3169 
3170 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3171 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3172 			dlen = ip->ip_len - (ip->ip_hl << 2);
3173 		pseudo = TRUE;
3174 	}
3175 
3176 	if (!pseudo) {
3177 		const uint16_t *oaddr, *naddr;
3178 
3179 		oaddr = (const uint16_t *)&addr->s_addr;
3180 		naddr = (const uint16_t *)&x->xlat_addr;
3181 
3182 		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3183 		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3184 		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3185 		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3186 	}
3187 	addr->s_addr = x->xlat_addr;
3188 
3189 	if (x->xlat_port != 0) {
3190 		if (!pseudo) {
3191 			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3192 			    udp);
3193 		}
3194 		*port = x->xlat_port;
3195 	}
3196 
3197 	if (pseudo) {
3198 		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3199 		    htons(dlen + ip->ip_p));
3200 	}
3201 }
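
/*
 * The checksum handling above is the RFC 1624 incremental update: for
 * every rewritten 16-bit word m -> m', the checksum becomes
 *
 *	HC' = ~(~HC + ~m + m')
 *
 * which pfil_cksum_fixup() applies to both ip_sum and the TCP/UDP
 * checksum.  When the checksum will be finalized later anyway
 * (CSUM_TCP/CSUM_UDP/CSUM_TSO), only the pseudo-header sum is
 * recomputed via in_pseudo() instead.
 */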
3202 
3203 static void
3204 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3205 {
3206 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3207 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3208 	struct mbuf *m = nm->m;
3209 	struct ipfw_xlat *x = nm->arg1;
3210 	struct ip_fw *rule = x->xlat_rule;
3211 
3212 	ASSERT_NETISR_NCPUS(mycpuid);
3213 	KASSERT(rule->cpuid == mycpuid,
3214 	    ("rule does not belong to cpu%d", mycpuid));
3215 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3216 	    ("mbuf does not have ipfw continue rule"));
3217 
3218 	KASSERT(ctx->ipfw_cont_rule == NULL,
3219 	    ("pending ipfw continue rule"));
3220 	KASSERT(ctx->ipfw_cont_xlat == NULL,
3221 	    ("pending ipfw continue xlat"));
3222 	ctx->ipfw_cont_rule = rule;
3223 	ctx->ipfw_cont_xlat = x;
3224 
3225 	if (nm->arg2 == 0)
3226 		ip_input(m);
3227 	else
3228 		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3229 
3230 	/* May not have been cleared if ipfw was unloaded/disabled. */
3231 	ctx->ipfw_cont_rule = NULL;
3232 	ctx->ipfw_cont_xlat = NULL;
3233 
3234 	/*
3235 	 * This state is no longer used; decrement its xlat_crefs,
3236 	 * so this state can be deleted.
3237 	 */
3238 	x->xlat_crefs--;
3239 	/*
3240 	 * This rule is no longer used; decrement its cross_refs,
3241 	 * so this rule can be deleted.
3242 	 *
3243 	 * NOTE:
3244 	 * Decrement cross_refs in the last step of this function,
3245 	 * so that the module could be unloaded safely.
3246 	 */
3247 	rule->cross_refs--;
3248 }
3249 
3250 static void
3251 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3252     uint32_t flags)
3253 {
3254 	struct netmsg_genpkt *nm;
3255 
3256 	KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3257 	    x->xlat_pcpu, cpuid));
3258 
3259 	/*
3260 	 * Bump cross_refs to prevent this rule and its siblings
3261 	 * from being deleted, while this mbuf is inflight.  The
3262 	 * cross_refs of the sibling rule on the target cpu will
3263 	 * be decremented, once this mbuf is going to be filtered
3264 	 * on the target cpu.
3265 	 */
3266 	x->xlat_rule->cross_refs++;
3267 	/*
3268 	 * Bump xlat_crefs to prevent this state and its paired
3269 	 * state from being deleted, while this mbuf is inflight.
3270 	 * The xlat_crefs of the paired state on the target cpu
3271 	 * will be decremented, once this mbuf is going to be
3272 	 * filtered on the target cpu.
3273 	 */
3274 	x->xlat_crefs++;
3275 
3276 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3277 	if (flags & IPFW_XLATE_INSERT)
3278 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3279 	if (flags & IPFW_XLATE_FORWARD)
3280 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3281 
3282 	if ((flags & IPFW_XLATE_OUTPUT) == 0) {
3283 		struct ip *ip = mtod(m, struct ip *);
3284 
3285 		/*
3286 		 * NOTE:
3287 		 * ip_input() expects ip_len/ip_off are in network
3288 		 * byte order.
3289 		 */
3290 		ip->ip_len = htons(ip->ip_len);
3291 		ip->ip_off = htons(ip->ip_off);
3292 	}
3293 
3294 	nm = &m->m_hdr.mh_genmsg;
3295 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3296 	    ipfw_ip_xlate_dispatch);
3297 	nm->m = m;
3298 	nm->arg1 = x->xlat_pair;
3299 	nm->arg2 = 0;
3300 	if (flags & IPFW_XLATE_OUTPUT)
3301 		nm->arg2 = 1;
3302 	netisr_sendmsg(&nm->base, cpuid);
3303 }
3304 
3305 static struct mbuf *
3306 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3307     struct ip_fw_local *local, struct ip **ip0)
3308 {
3309 	struct ip *ip = mtod(m, struct ip *);
3310 	struct tcphdr *tcp;
3311 	struct udphdr *udp;
3312 
3313 	/*
3314 	 * Collect parameters into local variables for faster matching.
3315 	 */
3316 	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
3317 		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
3318 		goto done;
3319 	}
3320 
3321 	local->proto = args->f_id.proto = ip->ip_p;
3322 	local->src_ip = ip->ip_src;
3323 	local->dst_ip = ip->ip_dst;
3324 	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
3325 		local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3326 		local->ip_len = ntohs(ip->ip_len);
3327 	} else {
3328 		local->offset = ip->ip_off & IP_OFFMASK;
3329 		local->ip_len = ip->ip_len;
3330 	}
3331 
3332 #define PULLUP_TO(len)					\
3333 do {							\
3334 	if (m->m_len < (len)) {				\
3335 		args->m = m = m_pullup(m, (len));	\
3336 		if (m == NULL) {			\
3337 			ip = NULL;			\
3338 			goto done;			\
3339 		}					\
3340 		ip = mtod(m, struct ip *);		\
3341 	}						\
3342 } while (0)
3343 
3344 	if (local->offset == 0) {
3345 		switch (local->proto) {
3346 		case IPPROTO_TCP:
3347 			PULLUP_TO(hlen + sizeof(struct tcphdr));
3348 			local->tcp = tcp = L3HDR(struct tcphdr, ip);
3349 			local->dst_port = tcp->th_dport;
3350 			local->src_port = tcp->th_sport;
3351 			args->f_id.flags = tcp->th_flags;
3352 			break;
3353 
3354 		case IPPROTO_UDP:
3355 			PULLUP_TO(hlen + sizeof(struct udphdr));
3356 			udp = L3HDR(struct udphdr, ip);
3357 			local->dst_port = udp->uh_dport;
3358 			local->src_port = udp->uh_sport;
3359 			break;
3360 
3361 		case IPPROTO_ICMP:
3362 			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
3363 			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3364 			break;
3365 
3366 		default:
3367 			break;
3368 		}
3369 	}
3370 
3371 #undef PULLUP_TO
3372 
3373 	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3374 	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3375 	args->f_id.src_port = local->src_port = ntohs(local->src_port);
3376 	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3377 done:
3378 	*ip0 = ip;
3379 	return (m);
3380 }
3381 
3382 static struct mbuf *
3383 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3384     struct ip_fw_local *local, struct ip **ip0)
3385 {
3386 	struct ip *ip = mtod(m, struct ip *);
3387 
3388 	ip->ip_len = htons(ip->ip_len);
3389 	ip->ip_off = htons(ip->ip_off);
3390 
3391 	m->m_flags &= ~M_HASH;
3392 	ip_hashfn(&m, 0);
3393 	args->m = m;
3394 	if (m == NULL) {
3395 		*ip0 = NULL;
3396 		return (NULL);
3397 	}
3398 	KASSERT(m->m_flags & M_HASH, ("no hash"));
3399 
3400 	/* 'm' might be changed by ip_hashfn(). */
3401 	ip = mtod(m, struct ip *);
3402 	ip->ip_len = ntohs(ip->ip_len);
3403 	ip->ip_off = ntohs(ip->ip_off);
3404 
3405 	return (ipfw_setup_local(m, hlen, args, local, ip0));
3406 }
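
/*
 * NOTE:
 * ip_hashfn() expects ip_len/ip_off in network byte order, while the
 * layer-3 path of ipfw_chk() keeps them in host order, so the fields
 * are swapped around the rehash; the local parameters are then rebuilt
 * because ip_hashfn() may have replaced the mbuf.
 */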
3407 
3408 /*
3409  * The main check routine for the firewall.
3410  *
3411  * All arguments are in args so we can modify them and return them
3412  * back to the caller.
3413  *
3414  * Parameters:
3415  *
3416  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
3417  *		Starts with the IP header.
3418  *	args->eh (in)	MAC header if present, or NULL for layer-3 packet.
3419  *	args->oif	Outgoing interface, or NULL if packet is incoming.
3420  *		The incoming interface is in the mbuf. (in)
3421  *
3422  *	args->rule	Pointer to the last matching rule (in/out)
3423  *	args->f_id	Addresses grabbed from the packet (out)
3424  *
3425  * Return value:
3426  *
3427  *	If the packet was denied/rejected and has been dropped, *m is equal
3428  *	to NULL upon return.
3429  *
3430  *	IP_FW_DENY	the packet must be dropped.
3431  *	IP_FW_PASS	The packet is to be accepted and routed normally.
3432  *	IP_FW_DIVERT	Divert the packet to port (args->cookie)
3433  *	IP_FW_TEE	Tee the packet to port (args->cookie)
3434  *	IP_FW_DUMMYNET	Send the packet to pipe/queue (args->cookie)
3435  *	IP_FW_CONTINUE	Continue processing on another cpu.
3436  */
3437 static int
3438 ipfw_chk(struct ip_fw_args *args)
3439 {
3440 	/*
3441 	 * Local variables hold state during the processing of a packet.
3442 	 *
3443 	 * IMPORTANT NOTE: to speed up the processing of rules, there
3444 	 * are some assumption on the values of the variables, which
3445 	 * are documented here. Should you change them, please check
3446 	 * the implementation of the various instructions to make sure
3447 	 * that they still work.
3448 	 *
3449 	 * args->eh	The MAC header.  It is non-NULL for a layer-2
3450 	 *	packet and NULL for a layer-3 packet.
3451 	 *
3452 	 * m | args->m	Pointer to the mbuf, as received from the caller.
3453 	 *	It may change if ipfw_chk() does an m_pullup, or if it
3454 	 *	consumes the packet because it calls send_reject().
3455 	 *	XXX This has to change, so that ipfw_chk() never modifies
3456 	 *	or consumes the buffer.
3457 	 * ip	is simply an alias of the value of m, and it is kept
3458 	 *	in sync with it (the packet is supposed to start with
3459 	 *	the ip header).
3460 	 */
3461 	struct mbuf *m = args->m;
3462 	struct ip *ip = mtod(m, struct ip *);
3463 
3464 	/*
3465 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
3466 	 *	inbound path (ether_input, ip_input).
3467 	 *	If non-NULL, ipfw_chk has been called on the outbound path
3468 	 *	(ether_output, ip_output).
3469 	 */
3470 	struct ifnet *oif = args->oif;
3471 
3472 	struct ip_fw *f = NULL;		/* matching rule */
3473 	int retval = IP_FW_PASS;
3474 	struct m_tag *mtag;
3475 	struct divert_info *divinfo;
3476 	struct ipfw_state *s;
3477 
3478 	/*
3479 	 * hlen	The length of the IPv4 header.
3480 	 *	hlen >0 means we have an IPv4 packet.
3481 	 */
3482 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
3483 
3484 	struct ip_fw_local lc;
3485 
3486 	/*
3487 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3488 	 * 	MATCH_NONE when checked and not matched (dyn_f = NULL),
3489 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3490 	 */
3491 	int dyn_dir = MATCH_UNKNOWN;
3492 	struct ip_fw *dyn_f = NULL;
3493 	int cpuid = mycpuid;
3494 	struct ipfw_context *ctx;
3495 
3496 	ASSERT_NETISR_NCPUS(cpuid);
3497 	ctx = ipfw_ctx[cpuid];
3498 
3499 	if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3500 		return IP_FW_PASS;	/* accept */
3501 
3502 	if (args->eh == NULL ||		/* layer 3 packet */
3503 	    (m->m_pkthdr.len >= sizeof(struct ip) &&
3504 	     ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3505 		hlen = ip->ip_hl << 2;
3506 
3507 	memset(&lc, 0, sizeof(lc));
3508 
3509 	m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3510 	if (m == NULL)
3511 		goto pullup_failed;
3512 
3513 	if (args->rule) {
3514 		/*
3515 		 * Packet has already been tagged. Look for the next rule
3516 		 * to restart processing.
3517 		 *
3518 		 * If fw_one_pass != 0 then just accept it.
3519 		 * XXX should not happen here, but optimized out in
3520 		 * the caller.
3521 		 */
3522 		if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3523 			return IP_FW_PASS;
3524 		args->flags &= ~IP_FWARG_F_CONT;
3525 
3526 		/* This rule is being/has been flushed */
3527 		if (ipfw_flushing)
3528 			return IP_FW_DENY;
3529 
3530 		KASSERT(args->rule->cpuid == cpuid,
3531 			("rule used on cpu%d", cpuid));
3532 
3533 		/* This rule was deleted */
3534 		if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3535 			return IP_FW_DENY;
3536 
3537 		if (args->xlat != NULL) {
3538 			struct ipfw_xlat *x = args->xlat;
3539 
3540 			/* This xlat is being deleted. */
3541 			if (x->xlat_invalid)
3542 				return IP_FW_DENY;
3543 
3544 			f = args->rule;
3545 
3546 			dyn_f = f;
3547 			dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3548 			    MATCH_FORWARD : MATCH_REVERSE;
3549 
3550 			if (args->flags & IP_FWARG_F_XLATINS) {
3551 				KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3552 				    ("not slave %u state", x->xlat_type));
3553 				s = ipfw_state_link(ctx, &x->xlat_st);
3554 				if (s != NULL) {
3555 					ctx->ipfw_xlate_conflicts++;
3556 					if (IPFW_STATE_ISDEAD(s)) {
3557 						ipfw_state_remove(ctx, s);
3558 						s = ipfw_state_link(ctx,
3559 						    &x->xlat_st);
3560 					}
3561 					if (s != NULL) {
3562 						if (bootverbose) {
3563 							kprintf("ipfw: "
3564 							"slave %u state "
3565 							"conflicts %u state\n",
3566 							x->xlat_type,
3567 							s->st_type);
3568 						}
3569 						ipfw_xlat_invalidate(x);
3570 						return IP_FW_DENY;
3571 					}
3572 					ctx->ipfw_xlate_cresolved++;
3573 				}
3574 			} else {
3575 				ipfw_state_update(&args->f_id, dyn_dir,
3576 				    lc.tcp, &x->xlat_st);
3577 			}
3578 		} else {
3579 			/* TODO: setup dyn_f, dyn_dir */
3580 
3581 			f = args->rule->next_rule;
3582 			if (f == NULL)
3583 				f = lookup_next_rule(args->rule);
3584 		}
3585 	} else {
3586 		/*
3587 		 * Find the starting rule. It can be either the first
3588 		 * one, or the one after divert_rule if asked so.
3589 		 */
3590 		int skipto;
3591 
3592 		KKASSERT((args->flags &
3593 		    (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3594 		KKASSERT(args->xlat == NULL);
3595 
3596 		mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3597 		if (mtag != NULL) {
3598 			divinfo = m_tag_data(mtag);
3599 			skipto = divinfo->skipto;
3600 		} else {
3601 			skipto = 0;
3602 		}
3603 
3604 		f = ctx->ipfw_layer3_chain;
3605 		if (args->eh == NULL && skipto != 0) {
3606 			/* No skipto during rule flushing */
3607 			if (ipfw_flushing)
3608 				return IP_FW_DENY;
3609 
3610 			if (skipto >= IPFW_DEFAULT_RULE)
3611 				return IP_FW_DENY; /* invalid */
3612 
3613 			while (f && f->rulenum <= skipto)
3614 				f = f->next;
3615 			if (f == NULL)	/* drop packet */
3616 				return IP_FW_DENY;
3617 		} else if (ipfw_flushing) {
3618 			/* Rules are being flushed; skip to default rule */
3619 			f = ctx->ipfw_default_rule;
3620 		}
3621 	}
3622 	if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3623 		m_tag_delete(m, mtag);
3624 
3625 	/*
3626 	 * Now scan the rules, and parse microinstructions for each rule.
3627 	 */
3628 	for (; f; f = f->next) {
3629 		int l, cmdlen;
3630 		ipfw_insn *cmd;
3631 		int skip_or; /* skip rest of OR block */
3632 
3633 again:
3634 		if (ctx->ipfw_set_disable & (1 << f->set)) {
3635 			args->xlat = NULL;
3636 			continue;
3637 		}
3638 
3639 		if (args->xlat != NULL) {
3640 			args->xlat = NULL;
3641 			l = f->cmd_len - f->act_ofs;
3642 			cmd = ACTION_PTR(f);
3643 		} else {
3644 			l = f->cmd_len;
3645 			cmd = f->cmd;
3646 		}
3647 
3648 		skip_or = 0;
3649 		for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3650 			int match;
3651 
3652 			/*
3653 			 * check_body is a jump target used when we find a
3654 			 * CHECK_STATE, and need to jump to the body of
3655 			 * the target rule.
3656 			 */
3657 check_body:
3658 			cmdlen = F_LEN(cmd);
3659 			/*
3660 			 * An OR block (insn_1 || .. || insn_n) has the
3661 			 * F_OR bit set in all but the last instruction.
3662 			 * The first match will set "skip_or", and cause
3663 			 * the following instructions to be skipped until
3664 			 * past the one with the F_OR bit clear.
3665 			 */
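			/*
			 * Illustrative layout (hypothetical rule): for
			 * "allow ip from { 1.1.1.1 or 2.2.2.2 } to any"
			 * the microinstructions could look like:
			 *
			 *   O_IP_SRC 1.1.1.1	(len has F_OR set)
			 *   O_IP_SRC 2.2.2.2	(F_OR clear, closes the block)
			 *   O_ACCEPT
			 *
			 * A match on the first O_IP_SRC sets skip_or, so
			 * the second one is skipped and the block as a
			 * whole is treated as matched.
			 */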
3666 			if (skip_or) {		/* skip this instruction */
3667 				if ((cmd->len & F_OR) == 0)
3668 					skip_or = 0;	/* next one is good */
3669 				continue;
3670 			}
3671 			match = 0; /* set to 1 if we succeed */
3672 
3673 			switch (cmd->opcode) {
3674 			/*
3675 			 * The first set of opcodes compares the packet's
3676 			 * fields with some pattern, setting 'match' if a
3677 			 * match is found. At the end of the loop there is
3678 			 * logic to deal with F_NOT and F_OR flags associated
3679 			 * with the opcode.
3680 			 */
3681 			case O_NOP:
3682 				match = 1;
3683 				break;
3684 
3685 			case O_FORWARD_MAC:
3686 				kprintf("ipfw: opcode %d unimplemented\n",
3687 					cmd->opcode);
3688 				break;
3689 
3690 			case O_GID:
3691 			case O_UID:
3692 				/*
3693 				 * We only check offset == 0 && proto != 0,
3694 				 * as this ensures that we have an IPv4
3695 				 * packet with the ports info.
3696 				 */
3697 				if (lc.offset != 0)
3698 					break;
3699 
3700 				match = ipfw_match_uid(&args->f_id, oif,
3701 					cmd->opcode,
3702 					(uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3703 				break;
3704 
3705 			case O_RECV:
3706 				match = iface_match(m->m_pkthdr.rcvif,
3707 				    (ipfw_insn_if *)cmd);
3708 				break;
3709 
3710 			case O_XMIT:
3711 				match = iface_match(oif, (ipfw_insn_if *)cmd);
3712 				break;
3713 
3714 			case O_VIA:
3715 				match = iface_match(oif ? oif :
3716 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3717 				break;
3718 
3719 			case O_MACADDR2:
3720 				if (args->eh != NULL) {	/* have MAC header */
3721 					uint32_t *want = (uint32_t *)
3722 						((ipfw_insn_mac *)cmd)->addr;
3723 					uint32_t *mask = (uint32_t *)
3724 						((ipfw_insn_mac *)cmd)->mask;
3725 					uint32_t *hdr = (uint32_t *)args->eh;
3726 
3727 					match =
3728 					(want[0] == (hdr[0] & mask[0]) &&
3729 					 want[1] == (hdr[1] & mask[1]) &&
3730 					 want[2] == (hdr[2] & mask[2]));
3731 				}
3732 				break;
3733 
3734 			case O_MAC_TYPE:
3735 				if (args->eh != NULL) {
3736 					uint16_t t =
3737 					    ntohs(args->eh->ether_type);
3738 					uint16_t *p =
3739 					    ((ipfw_insn_u16 *)cmd)->ports;
3740 					int i;
3741 
3742 					/* Special vlan handling */
3743 					if (m->m_flags & M_VLANTAG)
3744 						t = ETHERTYPE_VLAN;
3745 
3746 					for (i = cmdlen - 1; !match && i > 0;
3747 					     i--, p += 2) {
3748 						match =
3749 						(t >= p[0] && t <= p[1]);
3750 					}
3751 				}
3752 				break;
3753 
3754 			case O_FRAG:
3755 				match = (hlen > 0 && lc.offset != 0);
3756 				break;
3757 
3758 			case O_IPFRAG:
3759 				if (hlen > 0) {
3760 					uint16_t off;
3761 
3762 					if (args->eh != NULL)
3763 						off = ntohs(ip->ip_off);
3764 					else
3765 						off = ip->ip_off;
3766 					if (off & (IP_MF | IP_OFFMASK))
3767 						match = 1;
3768 				}
3769 				break;
3770 
3771 			case O_IN:	/* "out" is "not in" */
3772 				match = (oif == NULL);
3773 				break;
3774 
3775 			case O_LAYER2:
3776 				match = (args->eh != NULL);
3777 				break;
3778 
3779 			case O_PROTO:
3780 				/*
3781 				 * We do not allow an arg of 0 so the
3782 				 * check of "proto" only suffices.
3783 				 */
3784 				match = (lc.proto == cmd->arg1);
3785 				break;
3786 
3787 			case O_IP_SRC:
3788 				match = (hlen > 0 &&
3789 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3790 				    lc.src_ip.s_addr);
3791 				break;
3792 
3793 			case O_IP_SRC_MASK:
3794 				match = (hlen > 0 &&
3795 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3796 				     (lc.src_ip.s_addr &
3797 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3798 				break;
3799 
3800 			case O_IP_SRC_ME:
3801 				if (hlen > 0) {
3802 					struct ifnet *tif;
3803 
3804 					tif = INADDR_TO_IFP(&lc.src_ip);
3805 					match = (tif != NULL);
3806 				}
3807 				break;
3808 
3809 			case O_IP_SRC_TABLE:
3810 				match = ipfw_table_lookup(ctx, cmd->arg1,
3811 				    &lc.src_ip);
3812 				break;
3813 
3814 			case O_IP_SRC_IFIP:
3815 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3816 				    &lc.src_ip);
3817 				break;
3818 
3819 			case O_IP_DST_SET:
3820 			case O_IP_SRC_SET:
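				/*
				 * Layout sketch (addresses hypothetical,
				 * derived from the code below): d[0] holds
				 * the base address, arg1 the width of the
				 * set, and d[1..] a bitmap with one bit
				 * per address.  For a set based at
				 * 10.0.0.0 with arg1 == 256, address
				 * 10.0.0.9 matches iff bit 9 of the
				 * bitmap is set.
				 */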
3821 				if (hlen > 0) {
3822 					uint32_t *d = (uint32_t *)(cmd + 1);
3823 					uint32_t addr =
3824 					    cmd->opcode == O_IP_DST_SET ?
3825 						args->f_id.dst_ip :
3826 						args->f_id.src_ip;
3827 
3828 					if (addr < d[0])
3829 						break;
3830 					addr -= d[0]; /* subtract base */
3831 					match =
3832 					(addr < cmd->arg1) &&
3833 					 (d[1 + (addr >> 5)] &
3834 					  (1 << (addr & 0x1f)));
3835 				}
3836 				break;
3837 
3838 			case O_IP_DST:
3839 				match = (hlen > 0 &&
3840 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3841 				    lc.dst_ip.s_addr);
3842 				break;
3843 
3844 			case O_IP_DST_MASK:
3845 				match = (hlen > 0) &&
3846 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3847 				     (lc.dst_ip.s_addr &
3848 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3849 				break;
3850 
3851 			case O_IP_DST_ME:
3852 				if (hlen > 0) {
3853 					struct ifnet *tif;
3854 
3855 					tif = INADDR_TO_IFP(&lc.dst_ip);
3856 					match = (tif != NULL);
3857 				}
3858 				break;
3859 
3860 			case O_IP_DST_TABLE:
3861 				match = ipfw_table_lookup(ctx, cmd->arg1,
3862 				    &lc.dst_ip);
3863 				break;
3864 
3865 			case O_IP_DST_IFIP:
3866 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3867 				    &lc.dst_ip);
3868 				break;
3869 
3870 			case O_IP_SRCPORT:
3871 			case O_IP_DSTPORT:
3872 				/*
3873 				 * offset == 0 && proto != 0 is enough
3874 				 * to guarantee that we have an IPv4
3875 				 * packet with port info.
3876 				 */
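				/*
				 * Port list sketch (hypothetical values):
				 * the insn carries [low, high] pairs, so
				 * a list like "10-20,80" would be stored
				 * as the pairs (10,20) and (80,80); the
				 * loop below matches a port that falls
				 * within any pair.
				 */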
3877 				if ((lc.proto == IPPROTO_UDP ||
3878 				     lc.proto == IPPROTO_TCP)
3879 				    && lc.offset == 0) {
3880 					uint16_t x =
3881 					    (cmd->opcode == O_IP_SRCPORT) ?
3882 						lc.src_port : lc.dst_port;
3883 					uint16_t *p =
3884 					    ((ipfw_insn_u16 *)cmd)->ports;
3885 					int i;
3886 
3887 					for (i = cmdlen - 1; !match && i > 0;
3888 					     i--, p += 2) {
3889 						match =
3890 						(x >= p[0] && x <= p[1]);
3891 					}
3892 				}
3893 				break;
3894 
3895 			case O_ICMPCODE:
3896 				match = (lc.offset == 0 &&
3897 				    lc.proto == IPPROTO_ICMP &&
3898 				    icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3899 				break;
3900 
3901 			case O_ICMPTYPE:
3902 				match = (lc.offset == 0 &&
3903 				    lc.proto == IPPROTO_ICMP &&
3904 				    icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3905 				break;
3906 
3907 			case O_IPOPT:
3908 				match = (hlen > 0 && ipopts_match(ip, cmd));
3909 				break;
3910 
3911 			case O_IPVER:
3912 				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3913 				break;
3914 
3915 			case O_IPTTL:
3916 				match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3917 				break;
3918 
3919 			case O_IPID:
3920 				match = (hlen > 0 &&
3921 				    cmd->arg1 == ntohs(ip->ip_id));
3922 				break;
3923 
3924 			case O_IPLEN:
3925 				match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3926 				break;
3927 
3928 			case O_IPPRECEDENCE:
3929 				match = (hlen > 0 &&
3930 				    (cmd->arg1 == (ip->ip_tos & 0xe0)));
3931 				break;
3932 
3933 			case O_IPTOS:
3934 				match = (hlen > 0 &&
3935 				    flags_match(cmd, ip->ip_tos));
3936 				break;
3937 
3938 			case O_TCPFLAGS:
3939 				match = (lc.proto == IPPROTO_TCP &&
3940 				    lc.offset == 0 &&
3941 				    flags_match(cmd,
3942 					L3HDR(struct tcphdr,ip)->th_flags));
3943 				break;
3944 
3945 			case O_TCPOPTS:
3946 				match = (lc.proto == IPPROTO_TCP &&
3947 				    lc.offset == 0 && tcpopts_match(ip, cmd));
3948 				break;
3949 
3950 			case O_TCPSEQ:
3951 				match = (lc.proto == IPPROTO_TCP &&
3952 				    lc.offset == 0 &&
3953 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3954 					L3HDR(struct tcphdr,ip)->th_seq);
3955 				break;
3956 
3957 			case O_TCPACK:
3958 				match = (lc.proto == IPPROTO_TCP &&
3959 				    lc.offset == 0 &&
3960 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3961 					L3HDR(struct tcphdr,ip)->th_ack);
3962 				break;
3963 
3964 			case O_TCPWIN:
3965 				match = (lc.proto == IPPROTO_TCP &&
3966 				    lc.offset == 0 &&
3967 				    cmd->arg1 ==
3968 					L3HDR(struct tcphdr,ip)->th_win);
3969 				break;
3970 
3971 			case O_ESTAB:
3972 				/* reject packets which have SYN only */
3973 				/* XXX should I also check for TH_ACK? */
3974 				match = (lc.proto == IPPROTO_TCP &&
3975 				    lc.offset == 0 &&
3976 				    (L3HDR(struct tcphdr,ip)->th_flags &
3977 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3978 				break;
3979 
3980 			case O_LOG:
3981 				if (fw_verbose) {
3982 					ipfw_log(ctx, f, hlen, args->eh, m,
3983 					    oif);
3984 				}
3985 				match = 1;
3986 				break;
3987 
3988 			case O_PROB:
3989 				match = (krandom() <
3990 					((ipfw_insn_u32 *)cmd)->d[0]);
3991 				break;
3992 
3993 			/*
3994 			 * The second set of opcodes represents 'actions',
3995 			 * i.e. the terminal part of a rule once the packet
3996 			 * matches all previous patterns.
3997 			 * Typically there is only one action for each rule,
3998 			 * and the opcode is stored at the end of the rule
3999 			 * (but there are exceptions -- see below).
4000 			 *
4001 			 * In general, here we set retval and terminate the
4002 			 * outer loop (would be a 'break 3' in some language,
4003 			 * but we need to do a 'goto done').
4004 			 *
4005 			 * Exceptions:
4006 			 * O_COUNT and O_SKIPTO actions:
4007 			 *   instead of terminating, we jump to the next rule
4008 			 *   ('goto next_rule', equivalent to a 'break 2'),
4009 			 *   or to the SKIPTO target ('goto again' after
4010 			 *   having set f, cmd and l), respectively.
4011 			 *
4012 			 * O_LIMIT, O_KEEP_STATE and O_REDIRECT: these opcodes
4013 			 *   are not real 'actions', and are stored right
4014 			 *   before the 'action' part of the rule.
4015 			 *   These opcodes try to install an entry in the
4016 			 *   state tables; if successful, we continue with
4017 			 *   the next opcode (match=1; break;), otherwise
4018 			 *   the packet must be dropped ('goto done' after
4019 			 *   setting retval).  If static rules are changed
4020 			 *   during the state installation, the packet will
4021 			 *   be dropped and the rule's stats will not be updated
4022 			 *   ('return IP_FW_DENY').
4023 			 *
4024 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4025 			 *   cause a lookup of the state table, and a jump
4026 			 *   to the 'action' part of the parent rule
4027 			 *   ('goto check_body') if an entry is found, or
4028 			 *   (CHECK_STATE only) a jump to the next rule if
4029 			 *   the entry is not found ('goto next_rule').
4030 			 *   The result of the lookup is cached so that
4031 			 *   further instances of these opcodes are
4032 			 *   effectively NOPs.  If static rules are changed
4033 			 *   during the state lookup, the packet will
4034 			 *   be dropped and rule's stats will not be updated
4035 			 *   ('return IP_FW_DENY').
4036 			 */
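			/*
			 * Rule layout sketch (hypothetical "count log tcp"
			 * rule): match opcodes come first and the action
			 * last, optionally preceded by O_LOG at act_ofs:
			 *
			 *   O_PROTO tcp		(match)
			 *   O_LOG			(optional, at act_ofs)
			 *   O_COUNT			(action)
			 *
			 * ipfw_dummynet_io() below relies on this layout
			 * when it skips a leading O_LOG to reach the
			 * O_PIPE/O_QUEUE action.
			 */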
4037 			case O_REDIRECT:
4038 				if (f->cross_rules == NULL) {
4039 					/*
4040 					 * This rule was not completely set up;
4041 					 * move on to the next rule.
4042 					 */
4043 					goto next_rule;
4044 				}
4045 				/*
4046 				 * Apply redirect only on input path and
4047 				 * only to non-fragment TCP segments or
4048 				 * UDP datagrams.
4049 				 *
4050 				 * Does _not_ work with layer2 filtering.
4051 				 */
4052 				if (oif != NULL || args->eh != NULL ||
4053 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4054 				    (lc.proto != IPPROTO_TCP &&
4055 				     lc.proto != IPPROTO_UDP))
4056 					break;
4057 				/* FALL THROUGH */
4058 			case O_LIMIT:
4059 			case O_KEEP_STATE:
4060 				if (hlen == 0)
4061 					break;
4062 				s = ipfw_state_install(ctx, f,
4063 				    (ipfw_insn_limit *)cmd, args, lc.tcp);
4064 				if (s == NULL) {
4065 					retval = IP_FW_DENY;
4066 					goto done; /* error/limit violation */
4067 				}
4068 				s->st_pcnt++;
4069 				s->st_bcnt += lc.ip_len;
4070 
4071 				if (s->st_type == O_REDIRECT) {
4072 					struct in_addr oaddr;
4073 					uint16_t oport;
4074 					struct ipfw_xlat *slave_x, *x;
4075 					struct ipfw_state *dup;
4076 
4077 					x = (struct ipfw_xlat *)s;
4078 					ipfw_xlate(x, m, &oaddr, &oport);
4079 					m = ipfw_rehashm(m, hlen, args, &lc,
4080 					    &ip);
4081 					if (m == NULL) {
4082 						ipfw_state_del(ctx, s);
4083 						goto pullup_failed;
4084 					}
4085 
4086 					cpuid = netisr_hashcpu(
4087 					    m->m_pkthdr.hash);
4088 
4089 					slave_x = (struct ipfw_xlat *)
4090 					    ipfw_state_alloc(ctx, &args->f_id,
4091 					    O_REDIRECT, f->cross_rules[cpuid],
4092 					    lc.tcp);
4093 					if (slave_x == NULL) {
4094 						ipfw_state_del(ctx, s);
4095 						retval = IP_FW_DENY;
4096 						goto done;
4097 					}
4098 					slave_x->xlat_addr = oaddr.s_addr;
4099 					slave_x->xlat_port = oport;
4100 					slave_x->xlat_dir = MATCH_REVERSE;
4101 					slave_x->xlat_flags |=
4102 					    IPFW_STATE_F_XLATSRC |
4103 					    IPFW_STATE_F_XLATSLAVE;
4104 
4105 					slave_x->xlat_pair = x;
4106 					slave_x->xlat_pcpu = mycpuid;
4107 					x->xlat_pair = slave_x;
4108 					x->xlat_pcpu = cpuid;
4109 
4110 					ctx->ipfw_xlated++;
4111 					if (cpuid != mycpuid) {
4112 						ctx->ipfw_xlate_split++;
4113 						ipfw_xlate_redispatch(
4114 						    m, cpuid, x,
4115 						    IPFW_XLATE_INSERT |
4116 						    IPFW_XLATE_FORWARD);
4117 						args->m = NULL;
4118 						return (IP_FW_REDISPATCH);
4119 					}
4120 
4121 					dup = ipfw_state_link(ctx,
4122 					    &slave_x->xlat_st);
4123 					if (dup != NULL) {
4124 						ctx->ipfw_xlate_conflicts++;
4125 						if (IPFW_STATE_ISDEAD(dup)) {
4126 							ipfw_state_remove(ctx,
4127 							    dup);
4128 							dup = ipfw_state_link(
4129 							ctx, &slave_x->xlat_st);
4130 						}
4131 						if (dup != NULL) {
4132 							if (bootverbose) {
4133 							    kprintf("ipfw: "
4134 							    "slave %u state "
4135 							    "conflicts "
4136 							    "%u state\n",
4137 							    x->xlat_type,
4138 							    s->st_type);
4139 							}
4140 							ipfw_state_del(ctx, s);
4141 							return (IP_FW_DENY);
4142 						}
4143 						ctx->ipfw_xlate_cresolved++;
4144 					}
4145 				}
4146 				match = 1;
4147 				break;
4148 
4149 			case O_PROBE_STATE:
4150 			case O_CHECK_STATE:
4151 				/*
4152 				 * States are checked at the first keep-state
4153 				 * or check-state occurrence, with the result
4154 				 * being stored in dyn_dir.  The compiler
4155 				 * introduces a PROBE_STATE instruction for
4156 				 * us when we have a KEEP_STATE/LIMIT/RDR
4157 				 * (because PROBE_STATE needs to be run first).
4158 				 */
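				/*
				 * Flow sketch (hypothetical rule numbers):
				 * when a packet matching a state created
				 * by a "keep-state" rule 200 hits an
				 * O_CHECK_STATE, the lookup below finds
				 * the state and jumps straight to rule
				 * 200's action part via 'goto check_body'.
				 */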
4159 				s = NULL;
4160 				if (dyn_dir == MATCH_UNKNOWN) {
4161 					s = ipfw_state_lookup(ctx,
4162 					    &args->f_id, &dyn_dir, lc.tcp);
4163 				}
4164 				if (s == NULL ||
4165 				    (s->st_type == O_REDIRECT &&
4166 				     (args->eh != NULL ||
4167 				      (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4168 				      (lc.proto != IPPROTO_TCP &&
4169 				       lc.proto != IPPROTO_UDP)))) {
4170 					/*
4171 					 * State not found.  If CHECK_STATE,
4172 					 * skip to the next rule; if
4173 					 * PROBE_STATE, just ignore it and
4174 					 * continue with the next opcode.
4175 					 */
4176 					if (cmd->opcode == O_CHECK_STATE)
4177 						goto next_rule;
4178 					match = 1;
4179 					break;
4180 				}
4181 
4182 				s->st_pcnt++;
4183 				s->st_bcnt += lc.ip_len;
4184 
4185 				if (s->st_type == O_REDIRECT) {
4186 					struct ipfw_xlat *x =
4187 					    (struct ipfw_xlat *)s;
4188 
4189 					if (oif != NULL &&
4190 					    x->xlat_ifp == NULL) {
4191 						KASSERT(x->xlat_flags &
4192 						    IPFW_STATE_F_XLATSLAVE,
4193 						    ("master rdr state "
4194 						     "missing ifp"));
4195 						x->xlat_ifp = oif;
4196 					} else if (
4197 					    (oif != NULL && x->xlat_ifp != oif) ||
4198 					    (oif == NULL &&
4199 					     x->xlat_ifp != m->m_pkthdr.rcvif)) {
4200 						retval = IP_FW_DENY;
4201 						goto done;
4202 					}
4203 					if (x->xlat_dir != dyn_dir)
4204 						goto skip_xlate;
4205 
4206 					ipfw_xlate(x, m, NULL, NULL);
4207 					m = ipfw_rehashm(m, hlen, args, &lc,
4208 					    &ip);
4209 					if (m == NULL)
4210 						goto pullup_failed;
4211 
4212 					cpuid = netisr_hashcpu(
4213 					    m->m_pkthdr.hash);
4214 					if (cpuid != mycpuid) {
4215 						uint32_t xlate = 0;
4216 
4217 						if (oif != NULL) {
4218 							xlate |=
4219 							    IPFW_XLATE_OUTPUT;
4220 						}
4221 						if (dyn_dir == MATCH_FORWARD) {
4222 							xlate |=
4223 							    IPFW_XLATE_FORWARD;
4224 						}
4225 						ipfw_xlate_redispatch(m, cpuid,
4226 						    x, xlate);
4227 						args->m = NULL;
4228 						return (IP_FW_REDISPATCH);
4229 					}
4230 
4231 					KKASSERT(x->xlat_pcpu == mycpuid);
4232 					ipfw_state_update(&args->f_id, dyn_dir,
4233 					    lc.tcp, &x->xlat_pair->xlat_st);
4234 				}
4235 skip_xlate:
4236 				/*
4237 				 * Found a rule from a state; jump to the
4238 				 * 'action' part of the rule.
4239 				 */
4240 				f = s->st_rule;
4241 				KKASSERT(f->cpuid == mycpuid);
4242 
4243 				cmd = ACTION_PTR(f);
4244 				l = f->cmd_len - f->act_ofs;
4245 				dyn_f = f;
4246 				goto check_body;
4247 
4248 			case O_ACCEPT:
4249 				retval = IP_FW_PASS;	/* accept */
4250 				goto done;
4251 
4252 			case O_DEFRAG:
4253 				if (f->cross_rules == NULL) {
4254 					/*
4255 					 * This rule was not completely set up;
4256 					 * move on to the next rule.
4257 					 */
4258 					goto next_rule;
4259 				}
4260 
4261 				/*
4262 				 * Don't defrag for l2 packets, output packets
4263 				 * or non-fragments.
4264 				 */
4265 				if (oif != NULL || args->eh != NULL ||
4266 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
4267 					goto next_rule;
4268 
4269 				ctx->ipfw_frags++;
4270 				m = ip_reass(m);
4271 				args->m = m;
4272 				if (m == NULL) {
4273 					retval = IP_FW_PASS;
4274 					goto done;
4275 				}
4276 				ctx->ipfw_defraged++;
4277 				KASSERT((m->m_flags & M_HASH) == 0,
4278 				    ("hash not cleared"));
4279 
4280 				/* Update statistics */
4281 				f->pcnt++;
4282 				f->bcnt += lc.ip_len;
4283 				f->timestamp = time_second;
4284 
4285 				ip = mtod(m, struct ip *);
4286 				hlen = ip->ip_hl << 2;
4287 				ip->ip_len += hlen;
4288 
4289 				ip->ip_len = htons(ip->ip_len);
4290 				ip->ip_off = htons(ip->ip_off);
4291 
4292 				ip_hashfn(&m, 0);
4293 				args->m = m;
4294 				if (m == NULL)
4295 					goto pullup_failed;
4296 
4297 				KASSERT(m->m_flags & M_HASH, ("no hash"));
4298 				cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4299 				if (cpuid != mycpuid) {
4300 					/*
4301 					 * NOTE:
4302 					 * ip_len/ip_off are in network byte
4303 					 * order.
4304 					 */
4305 					ctx->ipfw_defrag_remote++;
4306 					ipfw_defrag_redispatch(m, cpuid, f);
4307 					args->m = NULL;
4308 					return (IP_FW_REDISPATCH);
4309 				}
4310 
4311 				/* 'm' might be changed by ip_hashfn(). */
4312 				ip = mtod(m, struct ip *);
4313 				ip->ip_len = ntohs(ip->ip_len);
4314 				ip->ip_off = ntohs(ip->ip_off);
4315 
4316 				m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4317 				if (m == NULL)
4318 					goto pullup_failed;
4319 
4320 				/* Move on. */
4321 				goto next_rule;
4322 
4323 			case O_PIPE:
4324 			case O_QUEUE:
4325 				args->rule = f; /* report matching rule */
4326 				args->cookie = cmd->arg1;
4327 				retval = IP_FW_DUMMYNET;
4328 				goto done;
4329 
4330 			case O_DIVERT:
4331 			case O_TEE:
4332 				if (args->eh) /* not on layer 2 */
4333 					break;
4334 
4335 				mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4336 				    sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4337 				if (mtag == NULL) {
4338 					retval = IP_FW_DENY;
4339 					goto done;
4340 				}
4341 				divinfo = m_tag_data(mtag);
4342 
4343 				divinfo->skipto = f->rulenum;
4344 				divinfo->port = cmd->arg1;
4345 				divinfo->tee = (cmd->opcode == O_TEE);
4346 				m_tag_prepend(m, mtag);
4347 
4348 				args->cookie = cmd->arg1;
4349 				retval = (cmd->opcode == O_DIVERT) ?
4350 					 IP_FW_DIVERT : IP_FW_TEE;
4351 				goto done;
4352 
4353 			case O_COUNT:
4354 			case O_SKIPTO:
4355 				f->pcnt++;	/* update stats */
4356 				f->bcnt += lc.ip_len;
4357 				f->timestamp = time_second;
4358 				if (cmd->opcode == O_COUNT)
4359 					goto next_rule;
4360 				/* handle skipto */
4361 				if (f->next_rule == NULL)
4362 					lookup_next_rule(f);
4363 				f = f->next_rule;
4364 				goto again;
4365 
4366 			case O_REJECT:
4367 				/*
4368 				 * Drop the packet and send a reject notice
4369 				 * if the packet is not ICMP (or is an ICMP
4370 				 * query), and it is not multicast/broadcast.
4371 				 */
4372 				if (hlen > 0 &&
4373 				    (lc.proto != IPPROTO_ICMP ||
4374 				     is_icmp_query(ip)) &&
4375 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
4376 				    !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4377 					send_reject(args, cmd->arg1,
4378 					    lc.offset, lc.ip_len);
4379 					retval = IP_FW_DENY;
4380 					goto done;
4381 				}
4382 				/* FALLTHROUGH */
4383 			case O_DENY:
4384 				retval = IP_FW_DENY;
4385 				goto done;
4386 
4387 			case O_FORWARD_IP:
4388 				if (args->eh)	/* not valid on layer2 pkts */
4389 					break;
4390 				if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4391 					struct sockaddr_in *sin;
4392 
4393 					mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4394 					    sizeof(*sin), M_INTWAIT | M_NULLOK);
4395 					if (mtag == NULL) {
4396 						retval = IP_FW_DENY;
4397 						goto done;
4398 					}
4399 					sin = m_tag_data(mtag);
4400 
4401 					/* Structure copy */
4402 					*sin = ((ipfw_insn_sa *)cmd)->sa;
4403 
4404 					m_tag_prepend(m, mtag);
4405 					m->m_pkthdr.fw_flags |=
4406 						IPFORWARD_MBUF_TAGGED;
4407 					m->m_pkthdr.fw_flags &=
4408 						~BRIDGE_MBUF_TAGGED;
4409 				}
4410 				retval = IP_FW_PASS;
4411 				goto done;
4412 
4413 			default:
4414 				panic("-- unknown opcode %d", cmd->opcode);
4415 			} /* end of switch() on opcodes */
4416 
4417 			if (cmd->len & F_NOT)
4418 				match = !match;
4419 
4420 			if (match) {
4421 				if (cmd->len & F_OR)
4422 					skip_or = 1;
4423 			} else {
4424 				if (!(cmd->len & F_OR)) /* not an OR block, */
4425 					break;		/* try next rule    */
4426 			}
4427 
4428 		}	/* end of inner for, scan opcodes */
4429 
4430 next_rule:;		/* try next rule		*/
4431 
4432 	}		/* end of outer for, scan rules */
4433 	kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4434 	return IP_FW_DENY;
4435 
4436 done:
4437 	/* Update statistics */
4438 	f->pcnt++;
4439 	f->bcnt += lc.ip_len;
4440 	f->timestamp = time_second;
4441 	return retval;
4442 
4443 pullup_failed:
4444 	if (fw_verbose)
4445 		kprintf("pullup failed\n");
4446 	return IP_FW_DENY;
4447 }
4448 
4449 static struct mbuf *
4450 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4451 {
4452 	struct m_tag *mtag;
4453 	struct dn_pkt *pkt;
4454 	ipfw_insn *cmd;
4455 	const struct ipfw_flow_id *id;
4456 	struct dn_flow_id *fid;
4457 
4458 	M_ASSERTPKTHDR(m);
4459 
4460 	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4461 	    M_INTWAIT | M_NULLOK);
4462 	if (mtag == NULL) {
4463 		m_freem(m);
4464 		return (NULL);
4465 	}
4466 	m_tag_prepend(m, mtag);
4467 
4468 	pkt = m_tag_data(mtag);
4469 	bzero(pkt, sizeof(*pkt));
4470 
4471 	cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4472 	if (cmd->opcode == O_LOG)
4473 		cmd += F_LEN(cmd);
4474 	KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4475 		("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4476 
4477 	pkt->dn_m = m;
4478 	pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4479 	pkt->ifp = fwa->oif;
4480 	pkt->pipe_nr = pipe_nr;
4481 
4482 	pkt->cpuid = mycpuid;
4483 	pkt->msgport = netisr_curport();
4484 
4485 	id = &fwa->f_id;
4486 	fid = &pkt->id;
4487 	fid->fid_dst_ip = id->dst_ip;
4488 	fid->fid_src_ip = id->src_ip;
4489 	fid->fid_dst_port = id->dst_port;
4490 	fid->fid_src_port = id->src_port;
4491 	fid->fid_proto = id->proto;
4492 	fid->fid_flags = id->flags;
4493 
4494 	ipfw_ref_rule(fwa->rule);
4495 	pkt->dn_priv = fwa->rule;
4496 	pkt->dn_unref_priv = ipfw_unref_rule;
4497 
4498 	if (cmd->opcode == O_PIPE)
4499 		pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4500 
4501 	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4502 	return (m);
4503 }
4504 
4505 /*
4506  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4507  * These will be reconstructed on the fly as packets are matched.
4508  */
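/*
 * Example (hypothetical rule numbers): if a "skipto 150" had cached rule
 * 200 as its target while no rule 150 existed, adding rule 150 would make
 * that cache stale; clearing next_rule forces lookup_next_rule() to
 * resolve the target again on the next match.
 */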
4509 static void
4510 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4511 {
4512 	struct ip_fw *rule;
4513 
4514 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4515 		rule->next_rule = NULL;
4516 }
4517 
4518 static void
4519 ipfw_inc_static_count(struct ip_fw *rule)
4520 {
4521 	/* Static rule's counts are updated only on CPU0 */
4522 	KKASSERT(mycpuid == 0);
4523 
4524 	static_count++;
4525 	static_ioc_len += IOC_RULESIZE(rule);
4526 }
4527 
4528 static void
4529 ipfw_dec_static_count(struct ip_fw *rule)
4530 {
4531 	int l = IOC_RULESIZE(rule);
4532 
4533 	/* Static rule's counts are updated only on CPU0 */
4534 	KKASSERT(mycpuid == 0);
4535 
4536 	KASSERT(static_count > 0, ("invalid static count %u", static_count));
4537 	static_count--;
4538 
4539 	KASSERT(static_ioc_len >= l,
4540 		("invalid static len %u", static_ioc_len));
4541 	static_ioc_len -= l;
4542 }
4543 
4544 static void
4545 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4546 {
4547 	if (fwmsg->sibling != NULL) {
4548 		KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4549 		fwmsg->sibling->sibling = rule;
4550 	}
4551 	fwmsg->sibling = rule;
4552 }
4553 
4554 static struct ip_fw *
4555 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4556 {
4557 	struct ip_fw *rule;
4558 
4559 	rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4560 
4561 	rule->act_ofs = ioc_rule->act_ofs;
4562 	rule->cmd_len = ioc_rule->cmd_len;
4563 	rule->rulenum = ioc_rule->rulenum;
4564 	rule->set = ioc_rule->set;
4565 	rule->usr_flags = ioc_rule->usr_flags;
4566 
4567 	bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4568 
4569 	rule->refcnt = 1;
4570 	rule->cpuid = mycpuid;
4571 	rule->rule_flags = rule_flags;
4572 
4573 	return rule;
4574 }
4575 
4576 static void
4577 ipfw_add_rule_dispatch(netmsg_t nmsg)
4578 {
4579 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4580 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4581 	struct ip_fw *rule;
4582 
4583 	ASSERT_NETISR_NCPUS(mycpuid);
4584 
4585 	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
4586 
4587 	/*
4588 	 * Insert rule into the pre-determined position
4589 	 */
4590 	if (fwmsg->prev_rule != NULL) {
4591 		struct ip_fw *prev, *next;
4592 
4593 		prev = fwmsg->prev_rule;
4594 		KKASSERT(prev->cpuid == mycpuid);
4595 
4596 		next = fwmsg->next_rule;
4597 		KKASSERT(next->cpuid == mycpuid);
4598 
4599 		rule->next = next;
4600 		prev->next = rule;
4601 
4602 		/*
4603 		 * Move to the position on the next CPU
4604 		 * before the msg is forwarded.
4605 		 */
4606 		fwmsg->prev_rule = prev->sibling;
4607 		fwmsg->next_rule = next->sibling;
4608 	} else {
4609 		KKASSERT(fwmsg->next_rule == NULL);
4610 		rule->next = ctx->ipfw_layer3_chain;
4611 		ctx->ipfw_layer3_chain = rule;
4612 	}
4613 
4614 	/* Link rule CPU sibling */
4615 	ipfw_link_sibling(fwmsg, rule);
4616 
4617 	ipfw_flush_rule_ptrs(ctx);
4618 
4619 	if (mycpuid == 0) {
4620 		/* Statistics only need to be updated once */
4621 		ipfw_inc_static_count(rule);
4622 
4623 		/* Return the rule on CPU0 */
4624 		nmsg->lmsg.u.ms_resultp = rule;
4625 	}
4626 
4627 	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
4628 		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
4629 
4630 	if (fwmsg->cross_rules != NULL) {
4631 		/* Save rules for later use. */
4632 		fwmsg->cross_rules[mycpuid] = rule;
4633 	}
4634 
4635 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4636 }
4637 
4638 static void
4639 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4640 {
4641 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4642 	struct ip_fw *rule = fwmsg->sibling;
4643 	int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4644 
4645 	ASSERT_NETISR_NCPUS(mycpuid);
4646 	KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4647 	    ("not crossref rule"));
4648 
4649 	rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4650 	memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4651 
4652 	fwmsg->sibling = rule->sibling;
4653 	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4654 }
4655 
4656 /*
4657  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
4658  * then possibly assign a rule number and link the rule into the list.
4659  * Update the rulenum in the input struct so the caller knows
4660  * it as well.
4661  */
4662 static void
4663 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4664 {
4665 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4666 	struct netmsg_ipfw fwmsg;
4667 	struct ip_fw *f, *prev, *rule;
4668 
4669 	ASSERT_NETISR0;
4670 
4671 	/*
4672 	 * If rulenum is 0, find the highest numbered rule before the
4673 	 * default rule, and add the auto-increment step to it.
4674 	 */
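	/*
	 * Worked example (hypothetical numbers): with autoinc_step == 100
	 * and the highest non-default rule numbered 500, a rule submitted
	 * with rulenum == 0 is installed as rule 600.
	 */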
4675 	if (ioc_rule->rulenum == 0) {
4676 		int step = autoinc_step;
4677 
4678 		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
4679 			 step <= IPFW_AUTOINC_STEP_MAX);
4680 
4681 		/*
4682 		 * Locate the highest numbered rule before default
4683 		 */
4684 		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
4685 			if (f->rulenum == IPFW_DEFAULT_RULE)
4686 				break;
4687 			ioc_rule->rulenum = f->rulenum;
4688 		}
4689 		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
4690 			ioc_rule->rulenum += step;
4691 	}
4692 	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
4693 		ioc_rule->rulenum != 0,
4694 		("invalid rule num %d", ioc_rule->rulenum));
4695 
4696 	/*
4697 	 * Now find the right place for the new rule in the sorted list.
4698 	 */
4699 	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
4700 	     prev = f, f = f->next) {
4701 		if (f->rulenum > ioc_rule->rulenum) {
4702 			/* Found the location */
4703 			break;
4704 		}
4705 	}
4706 	KASSERT(f != NULL, ("no default rule?!"));
4707 
4708 	/*
4709 	 * Duplicate the rule onto each CPU.
4710 	 * The rule duplicated on CPU0 will be returned.
4711 	 */
4712 	bzero(&fwmsg, sizeof(fwmsg));
4713 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4714 	    ipfw_add_rule_dispatch);
4715 	fwmsg.ioc_rule = ioc_rule;
4716 	fwmsg.prev_rule = prev;
4717 	fwmsg.next_rule = prev == NULL ? NULL : f;
4718 	fwmsg.rule_flags = rule_flags;
4719 	if (rule_flags & IPFW_RULE_F_CROSSREF) {
4720 		fwmsg.cross_rules = kmalloc(
4721 		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
4722 		    M_WAITOK | M_ZERO);
4723 	}
4724 
4725 	netisr_domsg_global(&fwmsg.base);
4726 	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
4727 
4728 	rule = fwmsg.base.lmsg.u.ms_resultp;
4729 	KKASSERT(rule != NULL && rule->cpuid == mycpuid);
4730 
4731 	if (fwmsg.cross_rules != NULL) {
4732 		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
4733 		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
4734 		fwmsg.sibling = rule;
4735 		netisr_domsg_global(&fwmsg.base);
4736 		KKASSERT(fwmsg.sibling == NULL);
4737 
4738 		kfree(fwmsg.cross_rules, M_TEMP);
4739 
4740 #ifdef KLD_MODULE
4741 		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
4742 #endif
4743 	}
4744 
4745 	DPRINTF("++ installed rule %d, static count now %d\n",
4746 		rule->rulenum, static_count);
4747 }
4748 
4749 /*
4750  * Free storage associated with a static rule (including derived
4751  * states/tracks).
4752  * The caller is in charge of clearing rule pointers to avoid
4753  * dangling pointers.
4754  * @return a pointer to the next entry.
4755  * Arguments are not checked, so they had better be correct.
4756  */
4757 static struct ip_fw *
4758 ipfw_delete_rule(struct ipfw_context *ctx,
4759 		 struct ip_fw *prev, struct ip_fw *rule)
4760 {
4761 	struct ip_fw *n;
4762 
4763 	n = rule->next;
4764 	if (prev == NULL)
4765 		ctx->ipfw_layer3_chain = n;
4766 	else
4767 		prev->next = n;
4768 
4769 	/* Mark the rule as invalid */
4770 	rule->rule_flags |= IPFW_RULE_F_INVALID;
4771 	rule->next_rule = NULL;
4772 	rule->sibling = NULL;
4773 #ifdef foo
4774 	/* Don't reset cpuid here; keep various assertion working */
4775 	rule->cpuid = -1;
4776 #endif
4777 
4778 	/* Statistics only need to be updated once */
4779 	if (mycpuid == 0)
4780 		ipfw_dec_static_count(rule);
4781 
4782 	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4783 		/* Try to free this rule */
4784 		ipfw_free_rule(rule);
4785 	} else {
4786 		/* TODO: check staging area. */
4787 		if (mycpuid == 0) {
4788 			rule->next = ipfw_gd.ipfw_crossref_free;
4789 			ipfw_gd.ipfw_crossref_free = rule;
4790 		}
4791 	}
4792 
4793 	/* Return the next rule */
4794 	return n;
4795 }
4796 
4797 static void
4798 ipfw_flush_dispatch(netmsg_t nmsg)
4799 {
4800 	int kill_default = nmsg->lmsg.u.ms_result;
4801 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4802 	struct ip_fw *rule;
4803 
4804 	ASSERT_NETISR_NCPUS(mycpuid);
4805 
4806 	/*
4807 	 * Flush states.
4808 	 */
4809 	ipfw_state_flush(ctx, NULL);
4810 	KASSERT(ctx->ipfw_state_cnt == 0,
4811 	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
4812 	ctx->ipfw_state_loosecnt = 0;
4813 	ctx->ipfw_state_lastexp = 0;
4814 
4815 	/*
4816 	 * Flush tracks.
4817 	 */
4818 	ipfw_track_flush(ctx, NULL);
4819 	ctx->ipfw_track_lastexp = 0;
4820 	if (ctx->ipfw_trkcnt_spare != NULL) {
4821 		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4822 		ctx->ipfw_trkcnt_spare = NULL;
4823 	}
4824 
4825 	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4826 
4827 	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4828 	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4829 		ipfw_delete_rule(ctx, NULL, rule);
4830 
4831 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4832 }
4833 
4834 /*
4835  * Deletes all rules from a chain (including the default rule
4836  * if the second argument is set).
4837  */
4838 static void
4839 ipfw_flush(int kill_default)
4840 {
4841 	struct netmsg_base nmsg;
4842 #ifdef INVARIANTS
4843 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4844 	int state_cnt;
4845 #endif
4846 
4847 	ASSERT_NETISR0;
4848 
4849 	/*
4850 	 * If 'kill_default' is set, the caller has done the necessary
4851 	 * msgport syncing; unnecessary to do it again.
4852 	 */
4853 	if (!kill_default) {
4854 		/*
4855 		 * Let ipfw_chk() know the rules are going to
4856 		 * be flushed, so it can jump directly to
4857 		 * the default rule.
4858 		 */
4859 		ipfw_flushing = 1;
4860 		/* XXX use priority sync */
4861 		netmsg_service_sync();
4862 	}
4863 
4864 	/*
4865 	 * Press the 'flush' button
4866 	 */
4867 	bzero(&nmsg, sizeof(nmsg));
4868 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4869 	    ipfw_flush_dispatch);
4870 	nmsg.lmsg.u.ms_result = kill_default;
4871 	netisr_domsg_global(&nmsg);
4872 	ipfw_gd.ipfw_state_loosecnt = 0;
4873 	ipfw_gd.ipfw_state_globexp = 0;
4874 	ipfw_gd.ipfw_track_globexp = 0;
4875 
4876 #ifdef INVARIANTS
4877 	state_cnt = ipfw_state_cntcoll();
4878 	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4879 
4880 	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4881 	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4882 
4883 	if (kill_default) {
4884 		KASSERT(static_count == 0,
4885 			("%u static rules remain", static_count));
4886 		KASSERT(static_ioc_len == 0,
4887 			("%u bytes of static rules remain", static_ioc_len));
4888 	} else {
4889 		KASSERT(static_count == 1,
4890 			("%u static rules remain", static_count));
4891 		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4892 			("%u bytes of static rules remain, should be %lu",
4893 			 static_ioc_len,
4894 			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4895 	}
4896 #endif
4897 
4898 	/* Flush is done */
4899 	ipfw_flushing = 0;
4900 }
4901 
4902 static void
4903 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4904 {
4905 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4906 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4907 	struct ip_fw *rule, *prev;
4908 
4909 	ASSERT_NETISR_NCPUS(mycpuid);
4910 
4911 	rule = dmsg->start_rule;
4912 	KKASSERT(rule->cpuid == mycpuid);
4913 	dmsg->start_rule = rule->sibling;
4914 
4915 	prev = dmsg->prev_rule;
4916 	if (prev != NULL) {
4917 		KKASSERT(prev->cpuid == mycpuid);
4918 
4919 		/*
4920 		 * Move to the position on the next CPU
4921 		 * before the msg is forwarded.
4922 		 */
4923 		dmsg->prev_rule = prev->sibling;
4924 	}
4925 
4926 	/*
4927 	 * Flush rule pointers outside the loop, then delete all matching
4928 	 * rules.  'prev' remains the same throughout the cycle.
4929 	 */
4930 	ipfw_flush_rule_ptrs(ctx);
4931 	while (rule && rule->rulenum == dmsg->rulenum) {
4932 		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4933 			/* Flush states generated by this rule. */
4934 			ipfw_state_flush(ctx, rule);
4935 		}
4936 		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4937 			/* Flush tracks generated by this rule. */
4938 			ipfw_track_flush(ctx, rule);
4939 		}
4940 		rule = ipfw_delete_rule(ctx, prev, rule);
4941 	}
4942 
4943 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4944 }
4945 
4946 static int
4947 ipfw_alt_delete_rule(uint16_t rulenum)
4948 {
4949 	struct ip_fw *prev, *rule;
4950 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4951 	struct netmsg_del dmsg;
4952 
4953 	ASSERT_NETISR0;
4954 
4955 	/*
4956 	 * Locate first rule to delete
4957 	 */
4958 	for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4959 	     rule && rule->rulenum < rulenum;
4960 	     prev = rule, rule = rule->next)
4961 		; /* EMPTY */
4962 	if (rule == NULL || rule->rulenum != rulenum)
4963 		return EINVAL;
4964 
4965 	/*
4966 	 * Get rid of the rule duplications on all CPUs
4967 	 */
4968 	bzero(&dmsg, sizeof(dmsg));
4969 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4970 	    ipfw_alt_delete_rule_dispatch);
4971 	dmsg.prev_rule = prev;
4972 	dmsg.start_rule = rule;
4973 	dmsg.rulenum = rulenum;
4974 
4975 	netisr_domsg_global(&dmsg.base);
4976 	KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4977 	return 0;
4978 }
4979 
4980 static void
4981 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4982 {
4983 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4984 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4985 	struct ip_fw *prev, *rule;
4986 #ifdef INVARIANTS
4987 	int del = 0;
4988 #endif
4989 
4990 	ASSERT_NETISR_NCPUS(mycpuid);
4991 
4992 	ipfw_flush_rule_ptrs(ctx);
4993 
4994 	prev = NULL;
4995 	rule = ctx->ipfw_layer3_chain;
4996 	while (rule != NULL) {
4997 		if (rule->set == dmsg->from_set) {
4998 			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4999 				/* Flush states generated by this rule. */
5000 				ipfw_state_flush(ctx, rule);
5001 			}
5002 			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
5003 				/* Flush tracks generated by this rule. */
5004 				ipfw_track_flush(ctx, rule);
5005 			}
5006 			rule = ipfw_delete_rule(ctx, prev, rule);
5007 #ifdef INVARIANTS
5008 			del = 1;
5009 #endif
5010 		} else {
5011 			prev = rule;
5012 			rule = rule->next;
5013 		}
5014 	}
5015 	KASSERT(del, ("no match set?!"));
5016 
5017 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5018 }
5019 
5020 static int
5021 ipfw_alt_delete_ruleset(uint8_t set)
5022 {
5023 	struct netmsg_del dmsg;
5024 	int del;
5025 	struct ip_fw *rule;
5026 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5027 
5028 	ASSERT_NETISR0;
5029 
5030 	/*
5031 	 * Check whether the 'set' exists; if no rule
5032 	 * belongs to it, there is nothing to delete,
5033 	 * so bail out early.
5034 	 */
5035 	del = 0;
5036 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5037 		if (rule->set == set)
5038 			del = 1;
5039 	}
5040 	if (!del)
5041 		return 0; /* XXX EINVAL? */
5042 
5043 	/*
5044 	 * Delete this set
5045 	 */
5046 	bzero(&dmsg, sizeof(dmsg));
5047 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5048 	    ipfw_alt_delete_ruleset_dispatch);
5049 	dmsg.from_set = set;
5050 	netisr_domsg_global(&dmsg.base);
5051 
5052 	return 0;
5053 }
5054 
5055 static void
5056 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5057 {
5058 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5059 	struct ip_fw *rule;
5060 
5061 	ASSERT_NETISR_NCPUS(mycpuid);
5062 
5063 	rule = dmsg->start_rule;
5064 	KKASSERT(rule->cpuid == mycpuid);
5065 
5066 	/*
5067 	 * Move to the position on the next CPU
5068 	 * before the msg is forwarded.
5069 	 */
5070 	dmsg->start_rule = rule->sibling;
5071 
5072 	while (rule && rule->rulenum <= dmsg->rulenum) {
5073 		if (rule->rulenum == dmsg->rulenum)
5074 			rule->set = dmsg->to_set;
5075 		rule = rule->next;
5076 	}
5077 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5078 }
5079 
5080 static int
5081 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5082 {
5083 	struct netmsg_del dmsg;
5084 	struct netmsg_base *nmsg;
5085 	struct ip_fw *rule;
5086 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5087 
5088 	ASSERT_NETISR0;
5089 
5090 	/*
5091 	 * Locate first rule to move
5092 	 */
5093 	for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5094 	     rule = rule->next) {
5095 		if (rule->rulenum == rulenum && rule->set != set)
5096 			break;
5097 	}
5098 	if (rule == NULL || rule->rulenum > rulenum)
5099 		return 0; /* XXX error? */
5100 
5101 	bzero(&dmsg, sizeof(dmsg));
5102 	nmsg = &dmsg.base;
5103 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5104 	    ipfw_alt_move_rule_dispatch);
5105 	dmsg.start_rule = rule;
5106 	dmsg.rulenum = rulenum;
5107 	dmsg.to_set = set;
5108 
5109 	netisr_domsg_global(nmsg);
5110 	KKASSERT(dmsg.start_rule == NULL);
5111 	return 0;
5112 }
5113 
5114 static void
5115 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5116 {
5117 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5118 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5119 	struct ip_fw *rule;
5120 
5121 	ASSERT_NETISR_NCPUS(mycpuid);
5122 
5123 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5124 		if (rule->set == dmsg->from_set)
5125 			rule->set = dmsg->to_set;
5126 	}
5127 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5128 }
5129 
5130 static int
5131 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5132 {
5133 	struct netmsg_del dmsg;
5134 	struct netmsg_base *nmsg;
5135 
5136 	ASSERT_NETISR0;
5137 
5138 	bzero(&dmsg, sizeof(dmsg));
5139 	nmsg = &dmsg.base;
5140 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5141 	    ipfw_alt_move_ruleset_dispatch);
5142 	dmsg.from_set = from_set;
5143 	dmsg.to_set = to_set;
5144 
5145 	netisr_domsg_global(nmsg);
5146 	return 0;
5147 }
5148 
5149 static void
5150 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5151 {
5152 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5153 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5154 	struct ip_fw *rule;
5155 
5156 	ASSERT_NETISR_NCPUS(mycpuid);
5157 
5158 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5159 		if (rule->set == dmsg->from_set)
5160 			rule->set = dmsg->to_set;
5161 		else if (rule->set == dmsg->to_set)
5162 			rule->set = dmsg->from_set;
5163 	}
5164 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5165 }
5166 
5167 static int
5168 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5169 {
5170 	struct netmsg_del dmsg;
5171 	struct netmsg_base *nmsg;
5172 
5173 	ASSERT_NETISR0;
5174 
5175 	bzero(&dmsg, sizeof(dmsg));
5176 	nmsg = &dmsg.base;
5177 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5178 	    ipfw_alt_swap_ruleset_dispatch);
5179 	dmsg.from_set = set1;
5180 	dmsg.to_set = set2;
5181 
5182 	netisr_domsg_global(nmsg);
5183 	return 0;
5184 }
5185 
5186 /*
5187  * Remove all rules with given number, and also do set manipulation.
5188  *
5189  * The argument is an uint32_t. The low 16 bit are the rule or set number,
5190  * the next 8 bits are the new set, the top 8 bits are the command:
5191  *
5192  *	0	delete rules with given number
5193  *	1	delete rules with given set number
5194  *	2	move rules with given number to new set
5195  *	3	move rules with given set number to new set
5196  *	4	swap sets with given numbers
5197  */
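/*
 * Encoding sketch (hypothetical values): to move rule 100 into set 3,
 * a caller would pass
 *
 *	arg = (2 << 24) | (3 << 16) | 100;
 *
 * which decodes below to cmd == 2, new_set == 3 and rulenum == 100.
 */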
5198 static int
5199 ipfw_ctl_alter(uint32_t arg)
5200 {
5201 	uint16_t rulenum;
5202 	uint8_t cmd, new_set;
5203 	int error = 0;
5204 
5205 	ASSERT_NETISR0;
5206 
5207 	rulenum = arg & 0xffff;
5208 	cmd = (arg >> 24) & 0xff;
5209 	new_set = (arg >> 16) & 0xff;
5210 
5211 	if (cmd > 4)
5212 		return EINVAL;
5213 	if (new_set >= IPFW_DEFAULT_SET)
5214 		return EINVAL;
5215 	if (cmd == 0 || cmd == 2) {
5216 		if (rulenum == IPFW_DEFAULT_RULE)
5217 			return EINVAL;
5218 	} else {
5219 		if (rulenum >= IPFW_DEFAULT_SET)
5220 			return EINVAL;
5221 	}
5222 
5223 	switch (cmd) {
5224 	case 0:	/* delete rules with given number */
5225 		error = ipfw_alt_delete_rule(rulenum);
5226 		break;
5227 
5228 	case 1:	/* delete all rules with given set number */
5229 		error = ipfw_alt_delete_ruleset(rulenum);
5230 		break;
5231 
5232 	case 2:	/* move rules with given number to new set */
5233 		error = ipfw_alt_move_rule(rulenum, new_set);
5234 		break;
5235 
5236 	case 3: /* move rules with given set number to new set */
5237 		error = ipfw_alt_move_ruleset(rulenum, new_set);
5238 		break;
5239 
5240 	case 4: /* swap two sets */
5241 		error = ipfw_alt_swap_ruleset(rulenum, new_set);
5242 		break;
5243 	}
5244 	return error;
5245 }
5246 
5247 /*
5248  * Clear counters for a specific rule.
5249  */
5250 static void
5251 clear_counters(struct ip_fw *rule, int log_only)
5252 {
5253 	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5254 
5255 	if (log_only == 0) {
5256 		rule->bcnt = rule->pcnt = 0;
5257 		rule->timestamp = 0;
5258 	}
5259 	if (l->o.opcode == O_LOG)
5260 		l->log_left = l->max_log;
5261 }
5262 
5263 static void
5264 ipfw_zero_entry_dispatch(netmsg_t nmsg)
5265 {
5266 	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
5267 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5268 	struct ip_fw *rule;
5269 
5270 	ASSERT_NETISR_NCPUS(mycpuid);
5271 
5272 	if (zmsg->rulenum == 0) {
5273 		KKASSERT(zmsg->start_rule == NULL);
5274 
5275 		ctx->ipfw_norule_counter = 0;
5276 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5277 			clear_counters(rule, zmsg->log_only);
5278 	} else {
5279 		struct ip_fw *start = zmsg->start_rule;
5280 
5281 		KKASSERT(start->cpuid == mycpuid);
5282 		KKASSERT(start->rulenum == zmsg->rulenum);
5283 
5284 		/*
5285 		 * We can have multiple rules with the same number, so we
5286 		 * need to clear them all.
5287 		 */
5288 		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
5289 		     rule = rule->next)
5290 			clear_counters(rule, zmsg->log_only);
5291 
5292 		/*
5293 		 * Move to the position on the next CPU
5294 		 * before the msg is forwarded.
5295 		 */
5296 		zmsg->start_rule = start->sibling;
5297 	}
5298 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5299 }
5300 
5301 /*
5302  * Reset some or all counters on firewall rules.
5303  * @arg rulenum is 0 to clear all entries, or contains a specific
5304  * rule number.
5305  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5306  */
5307 static int
5308 ipfw_ctl_zero_entry(int rulenum, int log_only)
5309 {
5310 	struct netmsg_zent zmsg;
5311 	struct netmsg_base *nmsg;
5312 	const char *msg;
5313 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5314 
5315 	ASSERT_NETISR0;
5316 
5317 	bzero(&zmsg, sizeof(zmsg));
5318 	nmsg = &zmsg.base;
5319 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5320 	    ipfw_zero_entry_dispatch);
5321 	zmsg.log_only = log_only;
5322 
5323 	if (rulenum == 0) {
5324 		msg = log_only ? "ipfw: All logging counts reset.\n"
5325 			       : "ipfw: Accounting cleared.\n";
5326 	} else {
5327 		struct ip_fw *rule;
5328 
5329 		/*
5330 		 * Locate the first rule with 'rulenum'
5331 		 */
5332 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5333 			if (rule->rulenum == rulenum)
5334 				break;
5335 		}
5336 		if (rule == NULL) /* we did not find any matching rules */
5337 			return (EINVAL);
5338 		zmsg.start_rule = rule;
5339 		zmsg.rulenum = rulenum;
5340 
5341 		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
5342 			       : "ipfw: Entry %d cleared.\n";
5343 	}
5344 	netisr_domsg_global(nmsg);
5345 	KKASSERT(zmsg.start_rule == NULL);
5346 
5347 	if (fw_verbose)
5348 		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
5349 	return (0);
5350 }
5351 
5352 /*
5353  * Check the validity of the structure before insertion.
5354  * Fortunately rules are simple, so this mostly needs to check rule sizes.
5355  */
5356 static int
5357 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5358 {
5359 	int l, cmdlen = 0;
5360 	int have_action = 0;
5361 	ipfw_insn *cmd;
5362 
5363 	*rule_flags = 0;
5364 
5365 	/* Check for valid size */
5366 	if (size < sizeof(*rule)) {
5367 		kprintf("ipfw: rule too short\n");
5368 		return EINVAL;
5369 	}
5370 	l = IOC_RULESIZE(rule);
5371 	if (l != size) {
5372 		kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5373 		return EINVAL;
5374 	}
5375 
5376 	/* Check rule number */
5377 	if (rule->rulenum == IPFW_DEFAULT_RULE) {
5378 		kprintf("ipfw: invalid rule number\n");
5379 		return EINVAL;
5380 	}
5381 
5382 	/*
5383 	 * Now go for the individual checks. Very simple ones, basically only
5384 	 * instruction sizes.
5385 	 */
5386 	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5387 	     l -= cmdlen, cmd += cmdlen) {
5388 		cmdlen = F_LEN(cmd);
5389 		if (cmdlen > l) {
5390 			kprintf("ipfw: opcode %d size truncated\n",
5391 				cmd->opcode);
5392 			return EINVAL;
5393 		}
5394 
5395 		DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5396 
5397 		if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5398 		    IPFW_ISXLAT(cmd->opcode)) {
5399 			/* This rule will generate states. */
5400 			*rule_flags |= IPFW_RULE_F_GENSTATE;
5401 			if (cmd->opcode == O_LIMIT)
5402 				*rule_flags |= IPFW_RULE_F_GENTRACK;
5403 		}
5404 		if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5405 			*rule_flags |= IPFW_RULE_F_CROSSREF;
5406 		if (cmd->opcode == O_IP_SRC_IFIP ||
5407 		    cmd->opcode == O_IP_DST_IFIP) {
5408 			*rule_flags |= IPFW_RULE_F_DYNIFADDR;
5409 			cmd->arg1 &= IPFW_IFIP_SETTINGS;
5410 		}
5411 
5412 		switch (cmd->opcode) {
5413 		case O_NOP:
5414 		case O_PROBE_STATE:
5415 		case O_KEEP_STATE:
5416 		case O_PROTO:
5417 		case O_IP_SRC_ME:
5418 		case O_IP_DST_ME:
5419 		case O_LAYER2:
5420 		case O_IN:
5421 		case O_FRAG:
5422 		case O_IPFRAG:
5423 		case O_IPOPT:
5424 		case O_IPLEN:
5425 		case O_IPID:
5426 		case O_IPTOS:
5427 		case O_IPPRECEDENCE:
5428 		case O_IPTTL:
5429 		case O_IPVER:
5430 		case O_TCPWIN:
5431 		case O_TCPFLAGS:
5432 		case O_TCPOPTS:
5433 		case O_ESTAB:
5434 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5435 				goto bad_size;
5436 			break;
5437 
5438 		case O_IP_SRC_TABLE:
5439 		case O_IP_DST_TABLE:
5440 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5441 				goto bad_size;
5442 			if (cmd->arg1 >= ipfw_table_max) {
5443 				kprintf("ipfw: invalid table id %u, max %d\n",
5444 				    cmd->arg1, ipfw_table_max);
5445 				return EINVAL;
5446 			}
5447 			break;
5448 
5449 		case O_IP_SRC_IFIP:
5450 		case O_IP_DST_IFIP:
5451 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5452 				goto bad_size;
5453 			break;
5454 
5455 		case O_ICMPCODE:
5456 		case O_ICMPTYPE:
5457 			if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5458 				goto bad_size;
5459 			break;
5460 
5461 		case O_UID:
5462 		case O_GID:
5463 		case O_IP_SRC:
5464 		case O_IP_DST:
5465 		case O_TCPSEQ:
5466 		case O_TCPACK:
5467 		case O_PROB:
5468 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5469 				goto bad_size;
5470 			break;
5471 
5472 		case O_LIMIT:
5473 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5474 				goto bad_size;
5475 			break;
5476 		case O_REDIRECT:
5477 			if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5478 				goto bad_size;
5479 			break;
5480 
5481 		case O_LOG:
5482 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5483 				goto bad_size;
5484 
5485 			((ipfw_insn_log *)cmd)->log_left =
5486 			    ((ipfw_insn_log *)cmd)->max_log;
5487 
5488 			break;
5489 
5490 		case O_IP_SRC_MASK:
5491 		case O_IP_DST_MASK:
5492 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5493 				goto bad_size;
5494 			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5495 				kprintf("ipfw: opcode %d, useless rule\n",
5496 					cmd->opcode);
5497 				return EINVAL;
5498 			}
5499 			break;
5500 
5501 		case O_IP_SRC_SET:
5502 		case O_IP_DST_SET:
5503 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5504 				kprintf("ipfw: invalid set size %d\n",
5505 					cmd->arg1);
5506 				return EINVAL;
5507 			}
5508 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5509 			    (cmd->arg1 + 31) / 32)
5510 				goto bad_size;
5511 			break;
5512 
5513 		case O_MACADDR2:
5514 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5515 				goto bad_size;
5516 			break;
5517 
5518 		case O_MAC_TYPE:
5519 		case O_IP_SRCPORT:
5520 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5521 			if (cmdlen < 2 || cmdlen > 31)
5522 				goto bad_size;
5523 			break;
5524 
5525 		case O_RECV:
5526 		case O_XMIT:
5527 		case O_VIA:
5528 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5529 				goto bad_size;
5530 			break;
5531 
5532 		case O_PIPE:
5533 		case O_QUEUE:
5534 			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5535 				goto bad_size;
5536 			goto check_action;
5537 
5538 		case O_FORWARD_IP:
5539 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5540 				goto bad_size;
5541 			} else {
5542 				in_addr_t fwd_addr;
5543 
5544 				fwd_addr = ((ipfw_insn_sa *)cmd)->
5545 					   sa.sin_addr.s_addr;
5546 				if (IN_MULTICAST(ntohl(fwd_addr))) {
5547 					kprintf("ipfw: try forwarding to "
5548 						"multicast address\n");
5549 					return EINVAL;
5550 				}
5551 			}
5552 			goto check_action;
5553 
5554 		case O_FORWARD_MAC: /* XXX not implemented yet */
5555 		case O_CHECK_STATE:
5556 		case O_COUNT:
5557 		case O_ACCEPT:
5558 		case O_DENY:
5559 		case O_REJECT:
5560 		case O_SKIPTO:
5561 		case O_DIVERT:
5562 		case O_TEE:
5563 		case O_DEFRAG:
5564 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5565 				goto bad_size;
5566 check_action:
5567 			if (have_action) {
5568 				kprintf("ipfw: opcode %d, multiple actions"
5569 					" not allowed\n",
5570 					cmd->opcode);
5571 				return EINVAL;
5572 			}
5573 			have_action = 1;
5574 			if (l != cmdlen) {
5575 				kprintf("ipfw: opcode %d, action must be"
5576 					" last opcode\n",
5577 					cmd->opcode);
5578 				return EINVAL;
5579 			}
5580 			break;
5581 		default:
5582 			kprintf("ipfw: opcode %d, unknown opcode\n",
5583 				cmd->opcode);
5584 			return EINVAL;
5585 		}
5586 	}
5587 	if (have_action == 0) {
5588 		kprintf("ipfw: missing action\n");
5589 		return EINVAL;
5590 	}
5591 	return 0;
5592 
5593 bad_size:
5594 	kprintf("ipfw: opcode %d size %d wrong\n",
5595 		cmd->opcode, cmdlen);
5596 	return EINVAL;
5597 }
5598 
5599 static int
5600 ipfw_ctl_add_rule(struct sockopt *sopt)
5601 {
5602 	struct ipfw_ioc_rule *ioc_rule;
5603 	size_t size;
5604 	uint32_t rule_flags;
5605 	int error;
5606 
5607 	ASSERT_NETISR0;
5608 
5609 	size = sopt->sopt_valsize;
5610 	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5611 	    size < sizeof(*ioc_rule)) {
5612 		return EINVAL;
5613 	}
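	/*
	 * Grow the sockopt buffer to the maximal rule size up front,
	 * so that the following check and installation can work on a
	 * full-sized rule in place.
	 */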
5614 	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5615 		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5616 					  IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5617 	}
5618 	ioc_rule = sopt->sopt_val;
5619 
5620 	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5621 	if (error)
5622 		return error;
5623 
5624 	ipfw_add_rule(ioc_rule, rule_flags);
5625 
5626 	if (sopt->sopt_dir == SOPT_GET)
5627 		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5628 	return 0;
5629 }
5630 
5631 static void *
5632 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5633     struct ipfw_ioc_rule *ioc_rule)
5634 {
5635 	const struct ip_fw *sibling;
5636 #ifdef INVARIANTS
5637 	int i;
5638 #endif
5639 
5640 	ASSERT_NETISR0;
5641 	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5642 
5643 	ioc_rule->act_ofs = rule->act_ofs;
5644 	ioc_rule->cmd_len = rule->cmd_len;
5645 	ioc_rule->rulenum = rule->rulenum;
5646 	ioc_rule->set = rule->set;
5647 	ioc_rule->usr_flags = rule->usr_flags;
5648 
5649 	ioc_rule->set_disable = ctx->ipfw_set_disable;
5650 	ioc_rule->static_count = static_count;
5651 	ioc_rule->static_len = static_ioc_len;
5652 
5653 	/*
5654 	 * Visit (read-only) all of the rule's duplications to get
5655 	 * the necessary statistics.
5656 	 */
5657 #ifdef INVARIANTS
5658 	i = 0;
5659 #endif
5660 	ioc_rule->pcnt = 0;
5661 	ioc_rule->bcnt = 0;
5662 	ioc_rule->timestamp = 0;
5663 	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5664 		ioc_rule->pcnt += sibling->pcnt;
5665 		ioc_rule->bcnt += sibling->bcnt;
5666 		if (sibling->timestamp > ioc_rule->timestamp)
5667 			ioc_rule->timestamp = sibling->timestamp;
5668 #ifdef INVARIANTS
5669 		++i;
5670 #endif
5671 	}
5672 	KASSERT(i == netisr_ncpus,
5673 	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5674 
5675 	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* words->bytes */);
5676 
5677 	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5678 }
5679 
5680 static boolean_t
5681 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5682 {
5683 	struct ipfw_ioc_flowid *ioc_id;
5684 
5685 	if (trk->tc_expire == 0) {
5686 		/* Not updated by the per-cpu track scan; skip it. */
5687 		return (FALSE);
5688 	}
5689 
5690 	ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5691 	    0 : trk->tc_expire - time_uptime;
5692 	ioc_state->pcnt = 0;
5693 	ioc_state->bcnt = 0;
5694 
5695 	ioc_state->dyn_type = O_LIMIT_PARENT;
5696 	ioc_state->count = trk->tc_count;
5697 
5698 	ioc_state->rulenum = trk->tc_rulenum;
5699 
5700 	ioc_id = &ioc_state->id;
5701 	ioc_id->type = ETHERTYPE_IP;
5702 	ioc_id->u.ip.proto = trk->tc_proto;
5703 	ioc_id->u.ip.src_ip = trk->tc_saddr;
5704 	ioc_id->u.ip.dst_ip = trk->tc_daddr;
5705 	ioc_id->u.ip.src_port = trk->tc_sport;
5706 	ioc_id->u.ip.dst_port = trk->tc_dport;
5707 
5708 	return (TRUE);
5709 }
5710 
5711 static boolean_t
5712 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5713 {
5714 	struct ipfw_ioc_flowid *ioc_id;
5715 
5716 	if (IPFW_STATE_SCANSKIP(s))
5717 		return (FALSE);
5718 
5719 	ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5720 	    0 : s->st_expire - time_uptime;
5721 	ioc_state->pcnt = s->st_pcnt;
5722 	ioc_state->bcnt = s->st_bcnt;
5723 
5724 	ioc_state->dyn_type = s->st_type;
5725 	ioc_state->count = 0;
5726 
5727 	ioc_state->rulenum = s->st_rule->rulenum;
5728 
5729 	ioc_id = &ioc_state->id;
5730 	ioc_id->type = ETHERTYPE_IP;
5731 	ioc_id->u.ip.proto = s->st_proto;
5732 	ipfw_key_4tuple(&s->st_key,
5733 	    &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5734 	    &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5735 
5736 	if (IPFW_ISXLAT(s->st_type)) {
5737 		const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5738 
5739 		if (x->xlat_port == 0)
5740 			ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5741 		else
5742 			ioc_state->xlat_port = ntohs(x->xlat_port);
5743 		ioc_state->xlat_addr = ntohl(x->xlat_addr);
5744 
5745 		ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5746 		ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5747 	}
5748 
5749 	return (TRUE);
5750 }
5751 
5752 static void
5753 ipfw_state_copy_dispatch(netmsg_t nmsg)
5754 {
5755 	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
5756 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5757 	const struct ipfw_state *s;
5758 	const struct ipfw_track *t;
5759 
5760 	ASSERT_NETISR_NCPUS(mycpuid);
5761 	KASSERT(nm->state_cnt < nm->state_cntmax,
5762 	    ("invalid state count %d, max %d",
5763 	     nm->state_cnt, nm->state_cntmax));
5764 
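	/*
	 * Copy this cpu's states into the output buffer, then pass
	 * the message on to the next cpu; the message is replied as
	 * soon as the buffer fills up or the last cpu is done.
	 */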
5765 	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
5766 		if (ipfw_state_copy(s, nm->ioc_state)) {
5767 			nm->ioc_state++;
5768 			nm->state_cnt++;
5769 			if (nm->state_cnt == nm->state_cntmax)
5770 				goto done;
5771 		}
5772 	}
5773 
5774 	/*
5775 	 * Prepare tracks in the global track tree for userland.
5776 	 */
5777 	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
5778 		struct ipfw_trkcnt *trk;
5779 
5780 		if (t->t_count == NULL) /* anchor */
5781 			continue;
5782 		trk = t->t_trkcnt;
5783 
5784 		/*
5785 		 * Only one netisr can run this function at
5786 		 * any time, and only this function accesses
5787 		 * trkcnt's tc_expire, so this is safe w/o
5788 		 * ipfw_gd.ipfw_trkcnt_token.
5789 		 */
5790 		if (trk->tc_expire > t->t_expire)
5791 			continue;
5792 		trk->tc_expire = t->t_expire;
5793 	}
5794 
5795 	/*
5796 	 * Copy tracks in the global track tree to userland in
5797 	 * the last netisr.
5798 	 */
5799 	if (mycpuid == netisr_ncpus - 1) {
5800 		struct ipfw_trkcnt *trk;
5801 
5802 		KASSERT(nm->state_cnt < nm->state_cntmax,
5803 		    ("invalid state count %d, max %d",
5804 		     nm->state_cnt, nm->state_cntmax));
5805 
5806 		IPFW_TRKCNT_TOKGET;
5807 		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5808 			if (ipfw_track_copy(trk, nm->ioc_state)) {
5809 				nm->ioc_state++;
5810 				nm->state_cnt++;
5811 				if (nm->state_cnt == nm->state_cntmax) {
5812 					IPFW_TRKCNT_TOKREL;
5813 					goto done;
5814 				}
5815 			}
5816 		}
5817 		IPFW_TRKCNT_TOKREL;
5818 	}
5819 done:
5820 	if (nm->state_cnt == nm->state_cntmax) {
5821 		/* No more space; done. */
5822 		netisr_replymsg(&nm->base, 0);
5823 	} else {
5824 		netisr_forwardmsg(&nm->base, mycpuid + 1);
5825 	}
5826 }
5827 
5828 static int
5829 ipfw_ctl_get_rules(struct sockopt *sopt)
5830 {
5831 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5832 	struct ip_fw *rule;
5833 	void *bp;
5834 	size_t size;
5835 	int state_cnt;
5836 
5837 	ASSERT_NETISR0;
5838 
5839 	/*
5840 	 * Pass up a copy of the current rules. Static rules
5841 	 * come first (the last of which has number IPFW_DEFAULT_RULE),
5842 	 * followed by a possibly empty list of states.
5843 	 */
5844 
5845 	size = static_ioc_len;	/* size of static rules */
5846 
5847 	/*
5848 	 * Size of the states.
5849 	 * XXX take tracks as state for userland compat.
5850 	 */
5851 	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5852 	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5853 	size += state_cnt * sizeof(struct ipfw_ioc_state);
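	/*
	 * The 25% headroom absorbs states and tracks created between
	 * this estimation and the per-cpu copy below; the final size
	 * is recalculated from the actual count afterwards.
	 */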
5854 
5855 	if (sopt->sopt_valsize < size) {
5856 		/* short length, no need to return incomplete rules */
5857 		/* XXX: if superuser, no need to zero buffer */
5858 		bzero(sopt->sopt_val, sopt->sopt_valsize);
5859 		return 0;
5860 	}
5861 	bp = sopt->sopt_val;
5862 
5863 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5864 		bp = ipfw_copy_rule(ctx, rule, bp);
5865 
5866 	if (state_cnt) {
5867 		struct netmsg_cpstate nm;
5868 #ifdef INVARIANTS
5869 		size_t old_size = size;
5870 #endif
5871 
5872 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5873 		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
5874 		nm.ioc_state = bp;
5875 		nm.state_cntmax = state_cnt;
5876 		nm.state_cnt = 0;
5877 		netisr_domsg_global(&nm.base);
5878 
5879 		/*
5880 		 * The # of states may have shrunk after the snapshot
5881 		 * of the state count was taken.  To give the user a
5882 		 * correct state count, nm.state_cnt is used to
5883 		 * recalculate the actual size.
5884 		 */
5885 		size = static_ioc_len +
5886 		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5887 		KKASSERT(size <= old_size);
5888 	}
5889 
5890 	sopt->sopt_valsize = size;
5891 	return 0;
5892 }
5893 
5894 static void
5895 ipfw_set_disable_dispatch(netmsg_t nmsg)
5896 {
5897 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5898 
5899 	ASSERT_NETISR_NCPUS(mycpuid);
5900 
5901 	ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5902 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5903 }
5904 
5905 static void
5906 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5907 {
5908 	struct netmsg_base nmsg;
5909 	uint32_t set_disable;
5910 
5911 	ASSERT_NETISR0;
5912 
5913 	/* IPFW_DEFAULT_SET is always enabled */
5914 	enable |= (1 << IPFW_DEFAULT_SET);
5915 	set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5916 
5917 	bzero(&nmsg, sizeof(nmsg));
5918 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5919 	    ipfw_set_disable_dispatch);
5920 	nmsg.lmsg.u.ms_result32 = set_disable;
5921 
5922 	netisr_domsg_global(&nmsg);
5923 }
5924 
5925 static void
5926 ipfw_table_create_dispatch(netmsg_t nm)
5927 {
5928 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5929 	int tblid = nm->lmsg.u.ms_result;
5930 
5931 	ASSERT_NETISR_NCPUS(mycpuid);
5932 
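	/*
	 * Table keys are sockaddr_in; the address being compared
	 * starts at bit offset 32, i.e. at sin_addr.
	 */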
5933 	if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5934 	    rn_cpumaskhead(mycpuid), 32))
5935 		panic("ipfw: create table%d failed", tblid);
5936 
5937 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5938 }
5939 
5940 static int
5941 ipfw_table_create(struct sockopt *sopt)
5942 {
5943 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5944 	struct ipfw_ioc_table *tbl;
5945 	struct netmsg_base nm;
5946 
5947 	ASSERT_NETISR0;
5948 
5949 	if (sopt->sopt_valsize != sizeof(*tbl))
5950 		return (EINVAL);
5951 
5952 	tbl = sopt->sopt_val;
5953 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5954 		return (EINVAL);
5955 
5956 	if (ctx->ipfw_tables[tbl->tableid] != NULL)
5957 		return (EEXIST);
5958 
5959 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5960 	    ipfw_table_create_dispatch);
5961 	nm.lmsg.u.ms_result = tbl->tableid;
5962 	netisr_domsg_global(&nm);
5963 
5964 	return (0);
5965 }
5966 
5967 static void
5968 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5969 {
5970 	struct radix_node *ret;
5971 
5972 	ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5973 	if (ret != rn)
5974 		panic("deleted other table entry");
5975 	kfree(ret, M_IPFW);
5976 }
5977 
5978 static int
5979 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5980 {
5981 
5982 	ipfw_table_killrn(xrnh, rn);
5983 	return (0);
5984 }
5985 
5986 static void
5987 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5988     int destroy)
5989 {
5990 	struct radix_node_head *rnh;
5991 
5992 	ASSERT_NETISR_NCPUS(mycpuid);
5993 
5994 	rnh = ctx->ipfw_tables[tableid];
5995 	rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5996 	if (destroy) {
5997 		Free(rnh);
5998 		ctx->ipfw_tables[tableid] = NULL;
5999 	}
6000 }
6001 
6002 static void
6003 ipfw_table_flush_dispatch(netmsg_t nmsg)
6004 {
6005 	struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
6006 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6007 
6008 	ASSERT_NETISR_NCPUS(mycpuid);
6009 
6010 	ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
6011 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6012 }
6013 
6014 static void
6015 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
6016 {
6017 	int i;
6018 
6019 	ASSERT_NETISR_NCPUS(mycpuid);
6020 
6021 	for (i = 0; i < ipfw_table_max; ++i) {
6022 		if (ctx->ipfw_tables[i] != NULL)
6023 			ipfw_table_flush_oncpu(ctx, i, destroy);
6024 	}
6025 }
6026 
6027 static void
6028 ipfw_table_flushall_dispatch(netmsg_t nmsg)
6029 {
6030 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6031 
6032 	ASSERT_NETISR_NCPUS(mycpuid);
6033 
6034 	ipfw_table_flushall_oncpu(ctx, 0);
6035 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6036 }
6037 
6038 static int
6039 ipfw_table_flush(struct sockopt *sopt)
6040 {
6041 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6042 	struct ipfw_ioc_table *tbl;
6043 	struct netmsg_tblflush nm;
6044 
6045 	ASSERT_NETISR0;
6046 
6047 	if (sopt->sopt_valsize != sizeof(*tbl))
6048 		return (EINVAL);
6049 
6050 	tbl = sopt->sopt_val;
6051 	if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6052 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6053 		    MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6054 		netisr_domsg_global(&nm.base);
6055 		return (0);
6056 	}
6057 
6058 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6059 		return (EINVAL);
6060 
6061 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6062 		return (ENOENT);
6063 
6064 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6065 	    ipfw_table_flush_dispatch);
6066 	nm.tableid = tbl->tableid;
6067 	nm.destroy = 0;
6068 	if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6069 		nm.destroy = 1;
6070 	netisr_domsg_global(&nm.base);
6071 
6072 	return (0);
6073 }
6074 
6075 static int
6076 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6077 {
6078 	int *cnt = xcnt;
6079 
6080 	(*cnt)++;
6081 	return (0);
6082 }
6083 
6084 static int
6085 ipfw_table_cpent(struct radix_node *rn, void *xcp)
6086 {
6087 	struct ipfw_table_cp *cp = xcp;
6088 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6089 	struct ipfw_ioc_tblent *ioc_te;
6090 #ifdef INVARIANTS
6091 	int cnt;
6092 #endif
6093 
6094 	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
6095 	    cp->te_idx, cp->te_cnt));
6096 	ioc_te = &cp->te[cp->te_idx];
6097 
6098 	if (te->te_nodes->rn_mask != NULL) {
6099 		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
6100 		    *te->te_nodes->rn_mask);
6101 	} else {
6102 		ioc_te->netmask.sin_len = 0;
6103 	}
6104 	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
6105 
6106 	ioc_te->use = te->te_use;
6107 	ioc_te->last_used = te->te_lastuse;
6108 #ifdef INVARIANTS
6109 	cnt = 1;
6110 #endif
6111 
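	/* Aggregate the counters of this entry's per-cpu siblings. */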
6112 	while ((te = te->te_sibling) != NULL) {
6113 #ifdef INVARIANTS
6114 		++cnt;
6115 #endif
6116 		ioc_te->use += te->te_use;
6117 		if (te->te_lastuse > ioc_te->last_used)
6118 			ioc_te->last_used = te->te_lastuse;
6119 	}
6120 	KASSERT(cnt == netisr_ncpus,
6121 	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
6122 
6123 	cp->te_idx++;
6124 
6125 	return (0);
6126 }
6127 
6128 static int
6129 ipfw_table_get(struct sockopt *sopt)
6130 {
6131 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6132 	struct radix_node_head *rnh;
6133 	struct ipfw_ioc_table *tbl;
6134 	struct ipfw_ioc_tblcont *cont;
6135 	struct ipfw_table_cp cp;
6136 	int cnt = 0, sz;
6137 
6138 	ASSERT_NETISR0;
6139 
6140 	if (sopt->sopt_valsize < sizeof(*tbl))
6141 		return (EINVAL);
6142 
6143 	tbl = sopt->sopt_val;
6144 	if (tbl->tableid < 0) {
6145 		struct ipfw_ioc_tbllist *list;
6146 		int i;
6147 
6148 		/*
6149 		 * List available table ids.
6150 		 */
6151 		for (i = 0; i < ipfw_table_max; ++i) {
6152 			if (ctx->ipfw_tables[i] != NULL)
6153 				++cnt;
6154 		}
6155 
6156 		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
6157 		if (sopt->sopt_valsize < sz) {
6158 			bzero(sopt->sopt_val, sopt->sopt_valsize);
6159 			return (E2BIG);
6160 		}
6161 		list = sopt->sopt_val;
6162 		list->tablecnt = cnt;
6163 
6164 		cnt = 0;
6165 		for (i = 0; i < ipfw_table_max; ++i) {
6166 			if (ctx->ipfw_tables[i] != NULL) {
6167 				KASSERT(cnt < list->tablecnt,
6168 				    ("invalid idx %d, cnt %d",
6169 				     cnt, list->tablecnt));
6170 				list->tables[cnt++] = i;
6171 			}
6172 		}
6173 		sopt->sopt_valsize = sz;
6174 		return (0);
6175 	} else if (tbl->tableid >= ipfw_table_max) {
6176 		return (EINVAL);
6177 	}
6178 
6179 	rnh = ctx->ipfw_tables[tbl->tableid];
6180 	if (rnh == NULL)
6181 		return (ENOENT);
6182 	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
6183 
6184 	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
6185 	if (sopt->sopt_valsize < sz) {
6186 		bzero(sopt->sopt_val, sopt->sopt_valsize);
6187 		return (E2BIG);
6188 	}
6189 	cont = sopt->sopt_val;
6190 	cont->entcnt = cnt;
6191 
6192 	cp.te = cont->ent;
6193 	cp.te_idx = 0;
6194 	cp.te_cnt = cnt;
6195 	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
6196 
6197 	sopt->sopt_valsize = sz;
6198 	return (0);
6199 }
6200 
6201 static void
6202 ipfw_table_add_dispatch(netmsg_t nmsg)
6203 {
6204 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6205 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6206 	struct radix_node_head *rnh;
6207 	struct ipfw_tblent *te;
6208 
6209 	ASSERT_NETISR_NCPUS(mycpuid);
6210 
6211 	rnh = ctx->ipfw_tables[nm->tableid];
6212 
6213 	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6214 	te->te_nodes->rn_key = (char *)&te->te_key;
6215 	memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6216 
6217 	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
6218 	    te->te_nodes) == NULL) {
6219 		if (mycpuid == 0) {
6220 			kfree(te, M_IPFW);
6221 			netisr_replymsg(&nm->base, EEXIST);
6222 			return;
6223 		}
6224 		panic("rnh_addaddr failed");
6225 	}
6226 
6227 	/* Link siblings. */
6228 	if (nm->sibling != NULL)
6229 		nm->sibling->te_sibling = te;
6230 	nm->sibling = te;
6231 
6232 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6233 }
6234 
6235 static void
6236 ipfw_table_del_dispatch(netmsg_t nmsg)
6237 {
6238 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6239 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6240 	struct radix_node_head *rnh;
6241 	struct radix_node *rn;
6242 
6243 	ASSERT_NETISR_NCPUS(mycpuid);
6244 
6245 	rnh = ctx->ipfw_tables[nm->tableid];
6246 	rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6247 	if (rn == NULL) {
6248 		if (mycpuid == 0) {
6249 			netisr_replymsg(&nm->base, ESRCH);
6250 			return;
6251 		}
6252 		panic("rnh_deladdr failed");
6253 	}
6254 	kfree(rn, M_IPFW);
6255 
6256 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6257 }
6258 
6259 static int
6260 ipfw_table_alt(struct sockopt *sopt)
6261 {
6262 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6263 	struct ipfw_ioc_tblcont *tbl;
6264 	struct ipfw_ioc_tblent *te;
6265 	struct sockaddr_in key0;
6266 	struct sockaddr *netmask = NULL, *key;
6267 	struct netmsg_tblent nm;
6268 
6269 	ASSERT_NETISR0;
6270 
6271 	if (sopt->sopt_valsize != sizeof(*tbl))
6272 		return (EINVAL);
6273 	tbl = sopt->sopt_val;
6274 
6275 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6276 		return (EINVAL);
6277 	if (tbl->entcnt != 1)
6278 		return (EINVAL);
6279 
6280 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6281 		return (ENOENT);
6282 	te = &tbl->ent[0];
6283 
6284 	if (te->key.sin_family != AF_INET ||
6285 	    te->key.sin_port != 0 ||
6286 	    te->key.sin_len != sizeof(struct sockaddr_in))
6287 		return (EINVAL);
6288 	key = (struct sockaddr *)&te->key;
6289 
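	/*
	 * If a netmask is supplied, pre-mask the key; the radix tree
	 * expects the stored key to have no bits set outside its mask.
	 */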
6290 	if (te->netmask.sin_len != 0) {
6291 		if (te->netmask.sin_port != 0 ||
6292 		    te->netmask.sin_len > sizeof(struct sockaddr_in))
6293 			return (EINVAL);
6294 		netmask = (struct sockaddr *)&te->netmask;
6295 		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
6296 		key = (struct sockaddr *)&key0;
6297 	}
6298 
6299 	if (sopt->sopt_name == IP_FW_TBL_ADD) {
6300 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6301 		    MSGF_PRIORITY, ipfw_table_add_dispatch);
6302 	} else {
6303 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6304 		    MSGF_PRIORITY, ipfw_table_del_dispatch);
6305 	}
6306 	nm.key = key;
6307 	nm.netmask = netmask;
6308 	nm.tableid = tbl->tableid;
6309 	nm.sibling = NULL;
6310 	return (netisr_domsg_global(&nm.base));
6311 }
6312 
6313 static int
6314 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6315 {
6316 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6317 
6318 	te->te_use = 0;
6319 	te->te_lastuse = 0;
6320 	return (0);
6321 }
6322 
6323 static void
6324 ipfw_table_zero_dispatch(netmsg_t nmsg)
6325 {
6326 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6327 	struct radix_node_head *rnh;
6328 
6329 	ASSERT_NETISR_NCPUS(mycpuid);
6330 
6331 	rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6332 	rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6333 
6334 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6335 }
6336 
6337 static void
6338 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6339 {
6340 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6341 	int i;
6342 
6343 	ASSERT_NETISR_NCPUS(mycpuid);
6344 
6345 	for (i = 0; i < ipfw_table_max; ++i) {
6346 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6347 
6348 		if (rnh != NULL)
6349 			rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6350 	}
6351 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6352 }
6353 
6354 static int
6355 ipfw_table_zero(struct sockopt *sopt)
6356 {
6357 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6358 	struct netmsg_base nm;
6359 	struct ipfw_ioc_table *tbl;
6360 
6361 	ASSERT_NETISR0;
6362 
6363 	if (sopt->sopt_valsize != sizeof(*tbl))
6364 		return (EINVAL);
6365 	tbl = sopt->sopt_val;
6366 
6367 	if (tbl->tableid < 0) {
6368 		netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6369 		    ipfw_table_zeroall_dispatch);
6370 		netisr_domsg_global(&nm);
6371 		return (0);
6372 	} else if (tbl->tableid >= ipfw_table_max) {
6373 		return (EINVAL);
6374 	} else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6375 		return (ENOENT);
6376 	}
6377 
6378 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6379 	    ipfw_table_zero_dispatch);
6380 	nm.lmsg.u.ms_result = tbl->tableid;
6381 	netisr_domsg_global(&nm);
6382 
6383 	return (0);
6384 }
6385 
6386 static int
6387 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6388 {
6389 	struct netmsg_tblexp *nm = xnm;
6390 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6391 
6392 	if (te->te_expired) {
6393 		ipfw_table_killrn(nm->rnh, rn);
6394 		nm->expcnt++;
6395 	}
6396 	return (0);
6397 }
6398 
6399 static void
6400 ipfw_table_expire_dispatch(netmsg_t nmsg)
6401 {
6402 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6403 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6404 	struct radix_node_head *rnh;
6405 
6406 	ASSERT_NETISR_NCPUS(mycpuid);
6407 
6408 	rnh = ctx->ipfw_tables[nm->tableid];
6409 	nm->rnh = rnh;
6410 	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6411 
6412 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6413 	    ("not all expired addresses (%d) were deleted (%d)",
6414 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6415 
6416 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6417 }
6418 
6419 static void
6420 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6421 {
6422 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6423 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6424 	int i;
6425 
6426 	ASSERT_NETISR_NCPUS(mycpuid);
6427 
6428 	for (i = 0; i < ipfw_table_max; ++i) {
6429 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6430 
6431 		if (rnh == NULL)
6432 			continue;
6433 		nm->rnh = rnh;
6434 		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6435 	}
6436 
6437 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6438 	    ("not all expired addresses (%d) were deleted (%d)",
6439 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6440 
6441 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6442 }
6443 
6444 static int
6445 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6446 {
6447 	struct netmsg_tblexp *nm = xnm;
6448 	struct ipfw_tblent *te;
6449 	time_t lastuse;
6450 
6451 	te = (struct ipfw_tblent *)rn;
6452 	lastuse = te->te_lastuse;
6453 
6454 	while ((te = te->te_sibling) != NULL) {
6455 		if (te->te_lastuse > lastuse)
6456 			lastuse = te->te_lastuse;
6457 	}
6458 	if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6459 		/* Not expired */
6460 		return (0);
6461 	}
6462 
6463 	te = (struct ipfw_tblent *)rn;
6464 	te->te_expired = 1;
6465 	while ((te = te->te_sibling) != NULL)
6466 		te->te_expired = 1;
6467 	nm->cnt++;
6468 
6469 	return (0);
6470 }
6471 
6472 static int
6473 ipfw_table_expire(struct sockopt *sopt)
6474 {
6475 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6476 	struct netmsg_tblexp nm;
6477 	struct ipfw_ioc_tblexp *tbl;
6478 	struct radix_node_head *rnh;
6479 
6480 	ASSERT_NETISR0;
6481 
6482 	if (sopt->sopt_valsize != sizeof(*tbl))
6483 		return (EINVAL);
6484 	tbl = sopt->sopt_val;
6485 	tbl->expcnt = 0;
6486 
6487 	nm.expcnt = 0;
6488 	nm.cnt = 0;
6489 	nm.expire = tbl->expire;
6490 
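	/*
	 * Expiration runs in two passes: mark stale entries (and
	 * their per-cpu siblings) here on netisr0 first, then let
	 * each netisr delete its marked entries.
	 */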
6491 	if (tbl->tableid < 0) {
6492 		int i;
6493 
6494 		for (i = 0; i < ipfw_table_max; ++i) {
6495 			rnh = ctx->ipfw_tables[i];
6496 			if (rnh == NULL)
6497 				continue;
6498 			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6499 		}
6500 		if (nm.cnt == 0) {
6501 			/* No addresses can be expired. */
6502 			return (0);
6503 		}
6504 		tbl->expcnt = nm.cnt;
6505 
6506 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6507 		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6508 		nm.tableid = -1;
6509 		netisr_domsg_global(&nm.base);
6510 		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6511 		    ("not all expired addresses (%d) were deleted (%d)",
6512 		     nm.cnt * netisr_ncpus, nm.expcnt));
6513 
6514 		return (0);
6515 	} else if (tbl->tableid >= ipfw_table_max) {
6516 		return (EINVAL);
6517 	}
6518 
6519 	rnh = ctx->ipfw_tables[tbl->tableid];
6520 	if (rnh == NULL)
6521 		return (ENOENT);
6522 	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6523 	if (nm.cnt == 0) {
6524 		/* No addresses can be expired. */
6525 		return (0);
6526 	}
6527 	tbl->expcnt = nm.cnt;
6528 
6529 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6530 	    ipfw_table_expire_dispatch);
6531 	nm.tableid = tbl->tableid;
6532 	netisr_domsg_global(&nm.base);
6533 	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6534 	    ("not all expired addresses (%d) were deleted (%d)",
6535 	     nm.cnt * netisr_ncpus, nm.expcnt));
6536 	return (0);
6537 }
6538 
6539 static void
6540 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6541 {
6542 	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6543 
6544 	KKASSERT((rule->rule_flags &
6545 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6546 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6547 	ipfw_free_rule(rule);
6548 
6549 	netisr_replymsg(&nmsg->base, 0);
6550 }
6551 
6552 static void
6553 ipfw_crossref_reap(void)
6554 {
6555 	struct ip_fw *rule, *prev = NULL;
6556 
6557 	ASSERT_NETISR0;
6558 
6559 	rule = ipfw_gd.ipfw_crossref_free;
6560 	while (rule != NULL) {
6561 		uint64_t inflight = 0;
6562 		int i;
6563 
6564 		for (i = 0; i < netisr_ncpus; ++i)
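		/*
		 * The rule may be freed only once no packet on any
		 * cpu still holds a reference on it or on one of its
		 * cross-cpu siblings.
		 */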
6565 			inflight += rule->cross_rules[i]->cross_refs;
6566 		if (inflight == 0) {
6567 			struct ip_fw *f = rule;
6568 
6569 			/*
6570 			 * Unlink.
6571 			 */
6572 			rule = rule->next;
6573 			if (prev != NULL)
6574 				prev->next = rule;
6575 			else
6576 				ipfw_gd.ipfw_crossref_free = rule;
6577 
6578 			/*
6579 			 * Free.
6580 			 */
6581 			for (i = 1; i < netisr_ncpus; ++i) {
6582 				struct netmsg_base nm;
6583 
6584 				netmsg_init(&nm, NULL, &curthread->td_msgport,
6585 				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6586 				nm.lmsg.u.ms_resultp = f->cross_rules[i];
6587 				netisr_domsg(&nm, i);
6588 			}
6589 			KKASSERT((f->rule_flags &
6590 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6591 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6592 			ipfw_unref_rule(f);
6593 		} else {
6594 			prev = rule;
6595 			rule = rule->next;
6596 		}
6597 	}
6598 
6599 	if (ipfw_gd.ipfw_crossref_free != NULL) {
6600 		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6601 		    ipfw_crossref_timeo, NULL);
6602 	}
6603 }
6604 
6605 /*
6606  * {set|get}sockopt parser.
6607  */
6608 static int
6609 ipfw_ctl(struct sockopt *sopt)
6610 {
6611 	int error, rulenum;
6612 	uint32_t *masks;
6613 	size_t size;
6614 
6615 	ASSERT_NETISR0;
6616 
6617 	error = 0;
6618 
6619 	switch (sopt->sopt_name) {
6620 	case IP_FW_GET:
6621 		error = ipfw_ctl_get_rules(sopt);
6622 		break;
6623 
6624 	case IP_FW_FLUSH:
6625 		ipfw_flush(0 /* keep default rule */);
6626 		break;
6627 
6628 	case IP_FW_ADD:
6629 		error = ipfw_ctl_add_rule(sopt);
6630 		break;
6631 
6632 	case IP_FW_DEL:
6633 		/*
6634 		 * IP_FW_DEL is used for deleting single rules or sets,
6635 		 * and (ab)used to atomically manipulate sets.
6636 		 * Argument size is used to distinguish between the two:
6637 		 *    sizeof(uint32_t)
6638 		 *	delete single rule or set of rules,
6639 		 *	or reassign rules (or sets) to a different set.
6640 		 *    2 * sizeof(uint32_t)
6641 		 *	atomic disable/enable sets.
6642 		 *	first uint32_t contains sets to be disabled,
6643 		 *	second uint32_t contains sets to be enabled.
6644 		 */
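		/*
		 * Userland sketch (variable names illustrative):
		 *   uint32_t masks[2] = { to_disable, to_enable };
		 *   setsockopt(s, IPPROTO_IP, IP_FW_DEL,
		 *       masks, sizeof(masks));
		 */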
6645 		masks = sopt->sopt_val;
6646 		size = sopt->sopt_valsize;
6647 		if (size == sizeof(*masks)) {
6648 			/*
6649 			 * Delete or reassign static rule
6650 			 */
6651 			error = ipfw_ctl_alter(masks[0]);
6652 		} else if (size == (2 * sizeof(*masks))) {
6653 			/*
6654 			 * Set enable/disable
6655 			 */
6656 			ipfw_ctl_set_disable(masks[0], masks[1]);
6657 		} else {
6658 			error = EINVAL;
6659 		}
6660 		break;
6661 
6662 	case IP_FW_ZERO:
6663 	case IP_FW_RESETLOG: /* argument is an int, the rule number */
6664 		rulenum = 0;
6665 
6666 		if (sopt->sopt_val != 0) {
6667 			error = soopt_to_kbuf(sopt, &rulenum,
6668 			    sizeof(int), sizeof(int));
6669 			if (error)
6670 				break;
6671 		}
6672 		error = ipfw_ctl_zero_entry(rulenum,
6673 			sopt->sopt_name == IP_FW_RESETLOG);
6674 		break;
6675 
6676 	case IP_FW_TBL_CREATE:
6677 		error = ipfw_table_create(sopt);
6678 		break;
6679 
6680 	case IP_FW_TBL_ADD:
6681 	case IP_FW_TBL_DEL:
6682 		error = ipfw_table_alt(sopt);
6683 		break;
6684 
6685 	case IP_FW_TBL_FLUSH:
6686 	case IP_FW_TBL_DESTROY:
6687 		error = ipfw_table_flush(sopt);
6688 		break;
6689 
6690 	case IP_FW_TBL_GET:
6691 		error = ipfw_table_get(sopt);
6692 		break;
6693 
6694 	case IP_FW_TBL_ZERO:
6695 		error = ipfw_table_zero(sopt);
6696 		break;
6697 
6698 	case IP_FW_TBL_EXPIRE:
6699 		error = ipfw_table_expire(sopt);
6700 		break;
6701 
6702 	default:
6703 		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6704 		error = EINVAL;
6705 	}
6706 
6707 	ipfw_crossref_reap();
6708 	return error;
6709 }
6710 
6711 static void
6712 ipfw_keepalive_done(struct ipfw_context *ctx)
6713 {
6714 
6715 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6716 	    ("keepalive is not in progress"));
6717 	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6718 	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6719 	    ipfw_keepalive, NULL);
6720 }
6721 
6722 static void
6723 ipfw_keepalive_more(struct ipfw_context *ctx)
6724 {
6725 	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6726 
6727 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6728 	    ("keepalive is not in progress"));
6729 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6730 	    ("keepalive more did not finish"));
6731 	netisr_sendmsg_oncpu(nm);
6732 }
6733 
6734 static void
6735 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6736 {
6737 	struct ipfw_state *s;
6738 	int scanned = 0, expired = 0, kept = 0;
6739 
6740 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6741 	    ("keepalive is not in progress"));
6742 
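	/*
	 * The anchor is a dummy state threaded through the state
	 * list to record the scan position; it lets this loop yield
	 * (via ipfw_keepalive_more) and resume where it left off.
	 */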
6743 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6744 		uint32_t ack_rev, ack_fwd;
6745 		struct ipfw_flow_id id;
6746 		uint8_t send_dir;
6747 
6748 		if (scanned++ >= ipfw_state_scan_max) {
6749 			ipfw_keepalive_more(ctx);
6750 			return;
6751 		}
6752 
6753 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6754 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6755 
6756 		/*
6757 		 * NOTE:
6758 		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
6759 		 * on slave xlat.
6760 		 */
6761 		if (s->st_type == O_ANCHOR)
6762 			continue;
6763 
6764 		if (IPFW_STATE_ISDEAD(s)) {
6765 			ipfw_state_remove(ctx, s);
6766 			if (++expired >= ipfw_state_expire_max) {
6767 				ipfw_keepalive_more(ctx);
6768 				return;
6769 			}
6770 			continue;
6771 		}
6772 
6773 		/*
6774 		 * Keep alive processing
6775 		 */
6776 
6777 		if (s->st_proto != IPPROTO_TCP)
6778 			continue;
6779 		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6780 			continue;
6781 		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6782 		    s->st_expire))
6783 			continue;	/* too early */
6784 
6785 		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6786 		    &id.dst_ip, &id.dst_port);
6787 		ack_rev = s->st_ack_rev;
6788 		ack_fwd = s->st_ack_fwd;
6789 
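		/*
		 * Send a keepalive segment for each direction of
		 * interest; the stale sequence numbers prod the
		 * endpoints into ACKing, which refreshes this state.
		 */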
6790 #define SEND_FWD	0x1
6791 #define SEND_REV	0x2
6792 
6793 		if (IPFW_ISXLAT(s->st_type)) {
6794 			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6795 
6796 			if (x->xlat_dir == MATCH_FORWARD)
6797 				send_dir = SEND_FWD;
6798 			else
6799 				send_dir = SEND_REV;
6800 		} else {
6801 			send_dir = SEND_FWD | SEND_REV;
6802 		}
6803 
6804 		if (send_dir & SEND_REV)
6805 			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6806 		if (send_dir & SEND_FWD)
6807 			send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6808 
6809 #undef SEND_FWD
6810 #undef SEND_REV
6811 
6812 		if (++kept >= ipfw_keepalive_max) {
6813 			ipfw_keepalive_more(ctx);
6814 			return;
6815 		}
6816 	}
6817 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6818 	ipfw_keepalive_done(ctx);
6819 }
6820 
6821 static void
6822 ipfw_keepalive_more_dispatch(netmsg_t nm)
6823 {
6824 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6825 	struct ipfw_state *anchor;
6826 
6827 	ASSERT_NETISR_NCPUS(mycpuid);
6828 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6829 	    ("keepalive is not in progress"));
6830 
6831 	/* Reply ASAP */
6832 	netisr_replymsg(&nm->base, 0);
6833 
6834 	anchor = &ctx->ipfw_keepalive_anch;
6835 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6836 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6837 		ipfw_keepalive_done(ctx);
6838 		return;
6839 	}
6840 	ipfw_keepalive_loop(ctx, anchor);
6841 }
6842 
6843 /*
6844  * Netisr handler for keepalive processing; it is scheduled by
6845  * ipfw_keepalive() every dyn_keepalive_period.
6846  */
6847 static void
6848 ipfw_keepalive_dispatch(netmsg_t nm)
6849 {
6850 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6851 	struct ipfw_state *anchor;
6852 
6853 	ASSERT_NETISR_NCPUS(mycpuid);
6854 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6855 	    ("keepalive is in progress"));
6856 	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6857 
6858 	/* Reply ASAP */
6859 	crit_enter();
6860 	netisr_replymsg(&nm->base, 0);
6861 	crit_exit();
6862 
6863 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6864 		ipfw_keepalive_done(ctx);
6865 		return;
6866 	}
6867 
6868 	anchor = &ctx->ipfw_keepalive_anch;
6869 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6870 	ipfw_keepalive_loop(ctx, anchor);
6871 }
6872 
6873 /*
6874  * This procedure is only used to handle keepalives. It is invoked
6875  * every dyn_keepalive_period.
6876  */
6877 static void
6878 ipfw_keepalive(void *dummy __unused)
6879 {
6880 	struct netmsg_base *msg;
6881 
6882 	KKASSERT(mycpuid < netisr_ncpus);
6883 	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6884 
6885 	crit_enter();
6886 	if (msg->lmsg.ms_flags & MSGF_DONE)
6887 		netisr_sendmsg_oncpu(msg);
6888 	crit_exit();
6889 }
6890 
6891 static void
6892 ipfw_ip_input_dispatch(netmsg_t nmsg)
6893 {
6894 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6895 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6896 	struct mbuf *m = nm->m;
6897 	struct ip_fw *rule = nm->arg1;
6898 
6899 	ASSERT_NETISR_NCPUS(mycpuid);
6900 	KASSERT(rule->cpuid == mycpuid,
6901 	    ("rule does not belong to cpu%d", mycpuid));
6902 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6903 	    ("mbuf does not have ipfw continue rule"));
6904 
6905 	KASSERT(ctx->ipfw_cont_rule == NULL,
6906 	    ("pending ipfw continue rule"));
6907 	ctx->ipfw_cont_rule = rule;
6908 	ip_input(m);
6909 
6910 	/* May not have been cleared, if ipfw was unloaded/disabled. */
6911 	ctx->ipfw_cont_rule = NULL;
6912 
6913 	/*
6914 	 * This rule is no longer used; decrement its cross_refs,
6915 	 * so this rule can be deleted.
6916 	 */
6917 	rule->cross_refs--;
6918 }
6919 
6920 static void
6921 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6922 {
6923 	struct netmsg_genpkt *nm;
6924 
6925 	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6926 
6927 	/*
6928 	 * NOTE:
6929 	 * Bump cross_refs to prevent this rule and its siblings
6930 	 * from being deleted, while this mbuf is inflight.  The
6931 	 * cross_refs of the sibling rule on the target cpu will
6932 	 * be decremented, once this mbuf is going to be filtered
6933 	 * on the target cpu.
6934 	 */
6935 	rule->cross_refs++;
6936 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6937 
6938 	nm = &m->m_hdr.mh_genmsg;
6939 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6940 	    ipfw_ip_input_dispatch);
6941 	nm->m = m;
6942 	nm->arg1 = rule->cross_rules[cpuid];
6943 	netisr_sendmsg(&nm->base, cpuid);
6944 }
6945 
6946 static void
6947 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6948 {
6949 
6950 	args->flags = 0;
6951 	args->rule = NULL;
6952 	args->xlat = NULL;
6953 
6954 	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6955 		struct m_tag *mtag;
6956 
6957 		/* Extract info from dummynet tag */
6958 		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6959 		KKASSERT(mtag != NULL);
6960 		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6961 		KKASSERT(args->rule != NULL);
6962 
6963 		m_tag_delete(m, mtag);
6964 		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6965 	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6966 		struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6967 
6968 		KKASSERT(ctx->ipfw_cont_rule != NULL);
6969 		args->rule = ctx->ipfw_cont_rule;
6970 		ctx->ipfw_cont_rule = NULL;
6971 
6972 		if (ctx->ipfw_cont_xlat != NULL) {
6973 			args->xlat = ctx->ipfw_cont_xlat;
6974 			ctx->ipfw_cont_xlat = NULL;
6975 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6976 				args->flags |= IP_FWARG_F_XLATINS;
6977 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6978 			}
6979 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6980 				args->flags |= IP_FWARG_F_XLATFWD;
6981 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6982 			}
6983 		}
6984 		KKASSERT((m->m_pkthdr.fw_flags &
6985 		    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6986 
6987 		args->flags |= IP_FWARG_F_CONT;
6988 		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6989 	}
6990 
6991 	args->eh = NULL;
6992 	args->oif = oif;
6993 	args->m = m;
6994 }
6995 
6996 static int
6997 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6998 {
6999 	struct ip_fw_args args;
7000 	struct mbuf *m = *m0;
7001 	int tee = 0, error = 0, ret;
7002 
7003 	ipfw_init_args(&args, m, NULL);
7004 
7005 	ret = ipfw_chk(&args);
7006 	m = args.m;
7007 	if (m == NULL) {
7008 		if (ret != IP_FW_REDISPATCH)
7009 			error = EACCES;
7010 		goto back;
7011 	}
7012 
7013 	switch (ret) {
7014 	case IP_FW_PASS:
7015 		break;
7016 
7017 	case IP_FW_DENY:
7018 		m_freem(m);
7019 		m = NULL;
7020 		error = EACCES;
7021 		break;
7022 
7023 	case IP_FW_DUMMYNET:
7024 		/* Send packet to the appropriate pipe */
7025 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
7026 		break;
7027 
7028 	case IP_FW_TEE:
7029 		tee = 1;
7030 		/* FALL THROUGH */
7031 
7032 	case IP_FW_DIVERT:
7033 		/*
7034 		 * Must clear the bridge tag before diverting.
7035 		 */
7036 		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
7037 		if (ip_divert_p != NULL) {
7038 			m = ip_divert_p(m, tee, 1);
7039 		} else {
7040 			m_freem(m);
7041 			m = NULL;
7042 			/* not sure this is the right error msg */
7043 			error = EACCES;
7044 		}
7045 		break;
7046 
7047 	default:
7048 		panic("unknown ipfw return value: %d", ret);
7049 	}
7050 back:
7051 	*m0 = m;
7052 	return error;
7053 }
7054 
7055 static int
7056 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7057 {
7058 	struct ip_fw_args args;
7059 	struct mbuf *m = *m0;
7060 	int tee = 0, error = 0, ret;
7061 
7062 	ipfw_init_args(&args, m, ifp);
7063 
7064 	ret = ipfw_chk(&args);
7065 	m = args.m;
7066 	if (m == NULL) {
7067 		if (ret != IP_FW_REDISPATCH)
7068 			error = EACCES;
7069 		goto back;
7070 	}
7071 
7072 	switch (ret) {
7073 	case IP_FW_PASS:
7074 		break;
7075 
7076 	case IP_FW_DENY:
7077 		m_freem(m);
7078 		m = NULL;
7079 		error = EACCES;
7080 		break;
7081 
7082 	case IP_FW_DUMMYNET:
7083 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7084 		break;
7085 
7086 	case IP_FW_TEE:
7087 		tee = 1;
7088 		/* FALL THROUGH */
7089 
7090 	case IP_FW_DIVERT:
7091 		if (ip_divert_p != NULL) {
7092 			m = ip_divert_p(m, tee, 0);
7093 		} else {
7094 			m_freem(m);
7095 			m = NULL;
7096 			/* not sure this is the right error msg */
7097 			error = EACCES;
7098 		}
7099 		break;
7100 
7101 	default:
7102 		panic("unknown ipfw return value: %d", ret);
7103 	}
7104 back:
7105 	*m0 = m;
7106 	return error;
7107 }
7108 
7109 static void
7110 ipfw_hook(void)
7111 {
7112 	struct pfil_head *pfh;
7113 
7114 	ASSERT_NETISR0;
7115 
7116 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7117 	if (pfh == NULL)
7118 		return;
7119 
7120 	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7121 	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7122 }
7123 
7124 static void
7125 ipfw_dehook(void)
7126 {
7127 	struct pfil_head *pfh;
7128 
7129 	ASSERT_NETISR0;
7130 
7131 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7132 	if (pfh == NULL)
7133 		return;
7134 
7135 	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7136 	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7137 }
7138 
7139 static int
7140 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7141 {
7142 	int dyn_cnt;
7143 
7144 	dyn_cnt = ipfw_state_cntcoll();
7145 	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7146 
7147 	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7148 }
7149 
7150 static int
7151 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7152 {
7153 	int state_cnt;
7154 
7155 	state_cnt = ipfw_state_cntcoll();
7156 	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7157 }
7158 
7159 static int
7160 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7161 {
7162 	int state_max, error;
7163 
7164 	state_max = ipfw_state_max;
7165 	error = sysctl_handle_int(oidp, &state_max, 0, req);
7166 	if (error || req->newptr == NULL)
7167 		return (error);
7168 
7169 	if (state_max < 1)
7170 		return (EINVAL);
7171 
7172 	ipfw_state_max_set(state_max);
7173 	return (0);
7174 }
7175 
7176 static int
7177 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7178 {
7179 	int dyn_max, error;
7180 
7181 	dyn_max = ipfw_state_max + ipfw_track_max;
7182 
7183 	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7184 	if (error || req->newptr == NULL)
7185 		return (error);
7186 
7187 	if (dyn_max < 2)
7188 		return (EINVAL);
7189 
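	/* Split the combined limit evenly between states and tracks. */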
7190 	ipfw_state_max_set(dyn_max / 2);
7191 	ipfw_track_max = dyn_max / 2;
7192 	return (0);
7193 }
7194 
7195 static void
7196 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7197 {
7198 	int enable = nmsg->lmsg.u.ms_result;
7199 
7200 	ASSERT_NETISR0;
7201 
7202 	if (fw_enable == enable)
7203 		goto reply;
7204 
7205 	fw_enable = enable;
7206 	if (fw_enable)
7207 		ipfw_hook();
7208 	else
7209 		ipfw_dehook();
7210 reply:
7211 	netisr_replymsg(&nmsg->base, 0);
7212 }
7213 
7214 static int
7215 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7216 {
7217 	struct netmsg_base nmsg;
7218 	int enable, error;
7219 
7220 	enable = fw_enable;
7221 	error = sysctl_handle_int(oidp, &enable, 0, req);
7222 	if (error || req->newptr == NULL)
7223 		return error;
7224 
7225 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7226 	    ipfw_sysctl_enable_dispatch);
7227 	nmsg.lmsg.u.ms_result = enable;
7228 
7229 	return netisr_domsg(&nmsg, 0);
7230 }
7231 
7232 static int
7233 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7234 {
7235 	return sysctl_int_range(oidp, arg1, arg2, req,
7236 	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7237 }
7238 
7239 static int
7240 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7241 {
7242 
7243 	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7244 }
7245 
7246 static int
7247 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7248 {
7249 	u_long stat = 0;
7250 	int cpu, error;
7251 
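	/* arg2 is the byte offset of the counter within ipfw_context. */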
7252 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7253 		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7254 
7255 	error = sysctl_handle_long(oidp, &stat, 0, req);
7256 	if (error || req->newptr == NULL)
7257 		return (error);
7258 
7259 	/* Zero out this stat. */
7260 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7261 		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7262 	return (0);
7263 }
7264 
7265 static void
7266 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7267 {
7268 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7269 	struct ipfw_context *ctx;
7270 	struct ip_fw *def_rule;
7271 
7272 	ASSERT_NETISR_NCPUS(mycpuid);
7273 
7274 	ctx = kmalloc(__offsetof(struct ipfw_context,
7275 	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7276 
7277 	RB_INIT(&ctx->ipfw_state_tree);
7278 	TAILQ_INIT(&ctx->ipfw_state_list);
7279 
7280 	RB_INIT(&ctx->ipfw_track_tree);
7281 	TAILQ_INIT(&ctx->ipfw_track_list);
7282 
7283 	callout_init_mp(&ctx->ipfw_stateto_ch);
7284 	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7285 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7286 	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7287 	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7288 	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7289 
7290 	callout_init_mp(&ctx->ipfw_trackto_ch);
7291 	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7292 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7293 	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7294 	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7295 
7296 	callout_init_mp(&ctx->ipfw_keepalive_ch);
7297 	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7298 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7299 	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7300 	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7301 	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7302 
7303 	callout_init_mp(&ctx->ipfw_xlatreap_ch);
7304 	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7305 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7306 	TAILQ_INIT(&ctx->ipfw_xlatreap);
7307 
7308 	ipfw_ctx[mycpuid] = ctx;
7309 
7310 	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7311 
7312 	def_rule->act_ofs = 0;
7313 	def_rule->rulenum = IPFW_DEFAULT_RULE;
7314 	def_rule->cmd_len = 1;
7315 	def_rule->set = IPFW_DEFAULT_SET;
7316 
7317 	def_rule->cmd[0].len = 1;
7318 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7319 	def_rule->cmd[0].opcode = O_ACCEPT;
7320 #else
7321 	if (filters_default_to_accept)
7322 		def_rule->cmd[0].opcode = O_ACCEPT;
7323 	else
7324 		def_rule->cmd[0].opcode = O_DENY;
7325 #endif
7326 
7327 	def_rule->refcnt = 1;
7328 	def_rule->cpuid = mycpuid;
7329 
7330 	/* Install the default rule */
7331 	ctx->ipfw_default_rule = def_rule;
7332 	ctx->ipfw_layer3_chain = def_rule;
7333 
7334 	/* Link rule CPU sibling */
7335 	ipfw_link_sibling(fwmsg, def_rule);
7336 
7337 	/* Statistics only need to be updated once */
7338 	if (mycpuid == 0)
7339 		ipfw_inc_static_count(def_rule);
7340 
7341 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7342 }
7343 
7344 static void
7345 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7346 {
7347 
7348 	crit_enter();
7349 	/* Reply ASAP */
7350 	netisr_replymsg(&nmsg->base, 0);
7351 	crit_exit();
7352 	ipfw_crossref_reap();
7353 }
7354 
7355 static void
7356 ipfw_crossref_timeo(void *dummy __unused)
7357 {
7358 	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7359 
7360 	KKASSERT(mycpuid == 0);
7361 
7362 	crit_enter();
7363 	if (msg->lmsg.ms_flags & MSGF_DONE)
7364 		netisr_sendmsg_oncpu(msg);
7365 	crit_exit();
7366 }
7367 
7368 static void
7369 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7370 {
7371 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7372 	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7373 	struct ip_fw *f;
7374 
7375 	ASSERT_NETISR_NCPUS(mycpuid);
7376 
7377 	for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7378 		int l, cmdlen;
7379 		ipfw_insn *cmd;
7380 
7381 		if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7382 			continue;
7383 
7384 		for (l = f->cmd_len, cmd = f->cmd; l > 0;
7385 		     l -= cmdlen, cmd += cmdlen) {
7386 			cmdlen = F_LEN(cmd);
7387 			if (cmd->opcode == O_IP_SRC_IFIP ||
7388 			    cmd->opcode == O_IP_DST_IFIP) {
7389 				if (strncmp(ifp->if_xname,
7390 				    ((ipfw_insn_ifip *)cmd)->ifname,
7391 				    IFNAMSIZ) == 0)
7392 					cmd->arg1 &= ~IPFW_IFIP_VALID;
7393 			}
7394 		}
7395 	}
7396 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7397 }
7398 
7399 static void
7400 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7401     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7402 {
7403 	struct netmsg_base nm;
7404 
7405 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7406 	    ipfw_ifaddr_dispatch);
7407 	nm.lmsg.u.ms_resultp = ifp;
7408 	netisr_domsg_global(&nm);
7409 }
7410 
7411 static void
7412 ipfw_init_dispatch(netmsg_t nmsg)
7413 {
7414 	struct netmsg_ipfw fwmsg;
7415 	int error = 0, cpu;
7416 
7417 	ASSERT_NETISR0;
7418 
7419 	if (IPFW_LOADED) {
7420 		kprintf("IP firewall already loaded\n");
7421 		error = EEXIST;
7422 		goto reply;
7423 	}
7424 
7425 	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
7426 		ipfw_table_max = UINT16_MAX;
7427 
7428 	/* Initialize global track tree. */
7429 	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
7430 	IPFW_TRKCNT_TOKINIT;
7431 
7432 	/* GC for freed crossref rules. */
7433 	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
7434 	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
7435 	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
7436 
7437 	ipfw_state_max_set(ipfw_state_max);
7438 	ipfw_state_headroom = 8 * netisr_ncpus;
7439 
7440 	bzero(&fwmsg, sizeof(fwmsg));
7441 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7442 	    ipfw_ctx_init_dispatch);
7443 	netisr_domsg_global(&fwmsg.base);
7444 
7445 	ip_fw_chk_ptr = ipfw_chk;
7446 	ip_fw_ctl_ptr = ipfw_ctl;
7447 	ip_fw_dn_io_ptr = ipfw_dummynet_io;
7448 
7449 	kprintf("ipfw2 initialized, default to %s, logging ",
7450 		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
7451 		O_ACCEPT ? "accept" : "deny");
7452 
7453 #ifdef IPFIREWALL_VERBOSE
7454 	fw_verbose = 1;
7455 #endif
7456 #ifdef IPFIREWALL_VERBOSE_LIMIT
7457 	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
7458 #endif
7459 	if (fw_verbose == 0) {
7460 		kprintf("disabled\n");
7461 	} else if (verbose_limit == 0) {
7462 		kprintf("unlimited\n");
7463 	} else {
7464 		kprintf("limited to %d packets/entry by default\n",
7465 			verbose_limit);
7466 	}
7467 
7468 	ip_fw_loaded = 1;
7469 	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
7470 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
7471 		    ipfw_state_expire_ipifunc, NULL, cpu);
7472 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
7473 		    ipfw_track_expire_ipifunc, NULL, cpu);
7474 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
7475 		    ipfw_keepalive, NULL, cpu);
7476 	}
7477 
7478 	if (fw_enable)
7479 		ipfw_hook();
7480 
7481 	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
7482 	    NULL, EVENTHANDLER_PRI_ANY);
7483 	if (ipfw_ifaddr_event == NULL)
7484 		kprintf("ipfw: ifaddr_event register failed\n");
7485 
7486 reply:
7487 	netisr_replymsg(&nmsg->base, error);
7488 }
7489 
7490 static int
7491 ipfw_init(void)
7492 {
7493 	struct netmsg_base smsg;
7494 
7495 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7496 	    ipfw_init_dispatch);
7497 	return netisr_domsg(&smsg, 0);
7498 }
7499 
7500 #ifdef KLD_MODULE
7501 
7502 static void
7503 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
7504 {
7505 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7506 
7507 	ASSERT_NETISR_NCPUS(mycpuid);
7508 
7509 	callout_stop_sync(&ctx->ipfw_stateto_ch);
7510 	callout_stop_sync(&ctx->ipfw_trackto_ch);
7511 	callout_stop_sync(&ctx->ipfw_keepalive_ch);
7512 	callout_stop_sync(&ctx->ipfw_xlatreap_ch);
7513 
7514 	crit_enter();
7515 	netisr_dropmsg(&ctx->ipfw_stateexp_more);
7516 	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
7517 	netisr_dropmsg(&ctx->ipfw_trackexp_more);
7518 	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
7519 	netisr_dropmsg(&ctx->ipfw_keepalive_more);
7520 	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
7521 	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
7522 	crit_exit();
7523 
7524 	ipfw_table_flushall_oncpu(ctx, 1);
7525 
7526 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7527 }
7528 
7529 static void
7530 ipfw_fini_dispatch(netmsg_t nmsg)
7531 {
7532 	struct netmsg_base nm;
7533 	int error = 0, cpu;
7534 
7535 	ASSERT_NETISR0;
7536 
7537 	ipfw_crossref_reap();
7538 
7539 	if (ipfw_gd.ipfw_refcnt != 0) {
7540 		error = EBUSY;
7541 		goto reply;
7542 	}
7543 
7544 	ip_fw_loaded = 0;
7545 	ipfw_dehook();
7546 
7547 	/* Synchronize any inflight state/track expire IPIs. */
7548 	lwkt_synchronize_ipiqs("ipfwfini");
7549 
7550 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7551 	    ipfw_ctx_fini_dispatch);
7552 	netisr_domsg_global(&nm);
7553 
7554 	callout_stop_sync(&ipfw_gd.ipfw_crossref_ch);
7555 	crit_enter();
7556 	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
7557 	crit_exit();
7558 
7559 	if (ipfw_ifaddr_event != NULL)
7560 		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
7561 
7562 	ip_fw_chk_ptr = NULL;
7563 	ip_fw_ctl_ptr = NULL;
7564 	ip_fw_dn_io_ptr = NULL;
7565 	ipfw_flush(1 /* kill default rule */);
7566 
7567 	/* Free pre-cpu context */
7568 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7569 		kfree(ipfw_ctx[cpu], M_IPFW);
7570 
7571 	kprintf("IP firewall unloaded\n");
7572 reply:
7573 	netisr_replymsg(&nmsg->base, error);
7574 }
7575 
7576 static void
7577 ipfw_fflush_dispatch(netmsg_t nmsg)
7578 {
7579 
7580 	ipfw_flush(0 /* keep default rule */);
7581 	ipfw_crossref_reap();
7582 	netisr_replymsg(&nmsg->base, 0);
7583 }
7584 
7585 static int
7586 ipfw_fini(void)
7587 {
7588 	struct netmsg_base smsg;
7589 	int i = 0;
7590 
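	/*
	 * Keep flushing until no cross-referenced rules remain in
	 * flight; only then can the contexts be torn down safely.
	 */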
7591 	for (;;) {
7592 		netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7593 		    ipfw_fflush_dispatch);
7594 		netisr_domsg(&smsg, 0);
7595 
7596 		if (ipfw_gd.ipfw_refcnt == 0)
7597 			break;
7598 		kprintf("ipfw: flush pending %d\n", ++i);
7599 		tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7600 	}
7601 
7602 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7603 	    ipfw_fini_dispatch);
7604 	return netisr_domsg(&smsg, 0);
7605 }
7606 
7607 #endif	/* KLD_MODULE */
7608 
7609 static int
7610 ipfw_modevent(module_t mod, int type, void *unused)
7611 {
7612 	int err = 0;
7613 
7614 	switch (type) {
7615 	case MOD_LOAD:
7616 		err = ipfw_init();
7617 		break;
7618 
7619 	case MOD_UNLOAD:
7620 #ifndef KLD_MODULE
7621 		kprintf("ipfw statically compiled, cannot unload\n");
7622 		err = EBUSY;
7623 #else
7624 		err = ipfw_fini();
7625 #endif
7626 		break;
7627 	default:
7628 		break;
7629 	}
7630 	return err;
7631 }
7632 
7633 static moduledata_t ipfwmod = {
7634 	"ipfw",
7635 	ipfw_modevent,
7636 	0
7637 };
7638 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
7639 MODULE_VERSION(ipfw, 1);
7640