xref: /dragonfly/sys/net/ipfw/ip_fw2.c (revision 1fe7e945)
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27 
28 /*
29  * Implement IP packet firewall (new version)
30  */
31 
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53 
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58 
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79 
80 #include <net/ipfw/ip_fw2.h>
81 
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85 	if (fw_debug > 0) \
86 		kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...)	((void)0)
90 #endif
91 
92 /*
93  * Description of per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * The following graph shows how an operation on the per-CPU rule
99  * lists is performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; the duplicates of the current
138  *    rule on the other CPUs can safely be accessed read-only through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of relocating the rule3's position in
148  *       netisr1 by iterating the layer3_chain in netisr1, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
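/*
 * Illustrative sketch (not part of the build): the shape of a rule
 * insertion netmsg handler that hops from netisr to netisr as in the
 * graphs above, rewriting its anchors through the sibling pointers.
 * The helper names are hypothetical; the real handlers appear later
 * in this file.
 *
 *	static void
 *	example_add_dispatch(netmsg_t nmsg)
 *	{
 *		struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
 *		struct ip_fw *rule;
 *
 *		rule = example_dup_rule(fwmsg->ioc_rule);
 *		example_link_rule(rule, fwmsg->prev_rule, fwmsg->next_rule);
 *
 *		(chain this CPU's copy to the previous CPU's copy)
 *		if (fwmsg->sibling != NULL)
 *			fwmsg->sibling->sibling = rule;
 *		fwmsg->sibling = rule;
 *
 *		(hop to the next CPU without iterating its chain)
 *		fwmsg->prev_rule = fwmsg->prev_rule->sibling;
 *		fwmsg->next_rule = fwmsg->next_rule->sibling;
 *		netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
 *	}
 */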
153 
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by 'keep-state'
166  * and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states, we stop creating new ones.  This is done
170  * to avoid consuming too much memory, and also too much time when
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by the 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks, we stop creating new ones.  This is done
203  * to avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers: the shared track counter RB
206  * tree and the per-cpu track RB tree.  States generated by the
207  * 'limit' option are linked to the track in addition to the per-cpu
208  * state RB tree, mainly to ease expiration.  e.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
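/*
 * Illustrative sketch (not part of the build) tying the above
 * together: the conceptual admission path of a 'limit' rule.  The
 * helper names are hypothetical; the real logic lives in the track
 * and state routines below.
 *
 *	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, &key);
 *	if (t == NULL) {
 *		IPFW_TRKCNT_TOKGET;            (shared tree, needs token)
 *		tc = example_trkcnt_find_or_create(&key);
 *		IPFW_TRKCNT_TOKREL;
 *		t = example_track_create(ctx, tc);     (per-cpu tree)
 *	}
 *	if (*t->t_count >= limit)
 *		return (deny);                 (session limit reached)
 *	atomic_add_int(t->t_count, 1);         (shared counter)
 *	LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
 */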
243 
244 #define IPFW_AUTOINC_STEP_MIN	1
245 #define IPFW_AUTOINC_STEP_MAX	1000
246 #define IPFW_AUTOINC_STEP_DEF	100
247 
248 #define IPFW_TABLE_MAX_DEF	64
249 
250 #define	IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET	31	/* set number for the default rule */
252 
253 #define MATCH_REVERSE		0
254 #define MATCH_FORWARD		1
255 #define MATCH_NONE		2
256 #define MATCH_UNKNOWN		3
257 
258 #define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
259 #define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
260 				 (IPFW_STATE_TCPFLAGS << 8))
261 
262 #define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
263 #define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
264 #define BOTH_RST		(TH_RST | (TH_RST << 8))
265 /* TH_ACK here means FIN was ACKed. */
266 #define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))
267 
268 #define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
269 				 (((s)->st_state & BOTH_RST) ||		\
270 				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
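/*
 * Example of the st_state encoding used by the macros above:
 * forward-direction TCP flags accumulate in the low byte, reverse-
 * direction flags in the high byte.  A connection that is set up and
 * then torn down cleanly passes through, e.g.:
 *
 *	TH_SYN                          0x0002  forward SYN seen
 *	| (TH_SYN << 8)                 0x0202  SYN in both directions
 *	| TH_FIN | (TH_FIN << 8)        0x0303  FIN in both directions
 *	| BOTH_FINACK                   0x1313  both FINs ACKed
 *
 * at which point IPFW_STATE_TCPCLOSED() becomes true for a TCP state.
 */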
271 
272 #define O_ANCHOR		O_NOP
273 
274 struct netmsg_ipfw {
275 	struct netmsg_base	base;
276 	const struct ipfw_ioc_rule *ioc_rule;
277 	struct ip_fw		*next_rule;
278 	struct ip_fw		*prev_rule;
279 	struct ip_fw		*sibling;
280 	uint32_t		rule_flags;
281 	struct ip_fw		**cross_rules;
282 };
283 
284 struct netmsg_del {
285 	struct netmsg_base	base;
286 	struct ip_fw		*start_rule;
287 	struct ip_fw		*prev_rule;
288 	uint16_t		rulenum;
289 	uint8_t			from_set;
290 	uint8_t			to_set;
291 };
292 
293 struct netmsg_zent {
294 	struct netmsg_base	base;
295 	struct ip_fw		*start_rule;
296 	uint16_t		rulenum;
297 	uint16_t		log_only;
298 };
299 
300 struct netmsg_cpstate {
301 	struct netmsg_base	base;
302 	struct ipfw_ioc_state	*ioc_state;
303 	int			state_cntmax;
304 	int			state_cnt;
305 };
306 
307 struct netmsg_tblent {
308 	struct netmsg_base	base;
309 	struct sockaddr		*key;
310 	struct sockaddr		*netmask;
311 	struct ipfw_tblent	*sibling;
312 	int			tableid;
313 };
314 
315 struct netmsg_tblflush {
316 	struct netmsg_base	base;
317 	int			tableid;
318 	int			destroy;
319 };
320 
321 struct netmsg_tblexp {
322 	struct netmsg_base	base;
323 	time_t			expire;
324 	int			tableid;
325 	int			cnt;
326 	int			expcnt;
327 	struct radix_node_head	*rnh;
328 };
329 
330 struct ipfw_table_cp {
331 	struct ipfw_ioc_tblent	*te;
332 	int			te_idx;
333 	int			te_cnt;
334 };
335 
336 struct ip_fw_local {
337 	/*
338 	 * offset	The offset of a fragment. offset != 0 means that
339 	 *	we have a fragment at this offset of an IPv4 packet.
340 	 *	offset == 0 means that (if this is an IPv4 packet)
341 	 *	this is the first or only fragment.
342 	 */
343 	u_short			offset;
344 
345 	/*
346 	 * Local copies of addresses. They are only valid if we have
347 	 * an IP packet.
348 	 *
349 	 * proto	The protocol. Set to 0 for non-ip packets,
350 	 *	or to the protocol read from the packet otherwise.
351 	 *	proto != 0 means that we have an IPv4 packet.
352 	 *
353 	 * src_port, dst_port	port numbers, in HOST format. Only
354 	 *	valid for TCP and UDP packets.
355 	 *
356 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
357 	 *	Only valid for IPv4 packets.
358 	 */
359 	uint8_t			proto;
360 	uint16_t		src_port;	/* NOTE: host format	*/
361 	uint16_t		dst_port;	/* NOTE: host format	*/
362 	struct in_addr		src_ip;		/* NOTE: network format	*/
363 	struct in_addr		dst_ip;		/* NOTE: network format	*/
364 	uint16_t		ip_len;
365 };
366 
367 struct ipfw_addrs {
368 	uint32_t		addr1;
369 	uint32_t		addr2;
370 };
371 
372 struct ipfw_ports {
373 	uint16_t		port1;
374 	uint16_t		port2;
375 };
376 
377 struct ipfw_key {
378 	union {
379 		struct ipfw_addrs addrs;
380 		uint64_t	value;
381 	} addr_u;
382 	union {
383 		struct ipfw_ports ports;
384 		uint32_t	value;
385 	} port_u;
386 	uint8_t			proto;
387 	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
388 	uint16_t		rsvd2;
389 };
390 
391 #define IPFW_KEY_SWAP_ADDRS	0x1
392 #define IPFW_KEY_SWAP_PORTS	0x2
393 #define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
394 
395 struct ipfw_trkcnt {
396 	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
397 	struct ipfw_key		tc_key;
398 	uintptr_t		tc_ruleid;
399 	int			tc_refs;
400 	int			tc_count;
401 	time_t			tc_expire;	/* userland get-only */
402 	uint16_t		tc_rulenum;	/* userland get-only */
403 } __cachealign;
404 
405 #define tc_addrs		tc_key.addr_u.value
406 #define tc_ports		tc_key.port_u.value
407 #define tc_proto		tc_key.proto
408 #define tc_saddr		tc_key.addr_u.addrs.addr1
409 #define tc_daddr		tc_key.addr_u.addrs.addr2
410 #define tc_sport		tc_key.port_u.ports.port1
411 #define tc_dport		tc_key.port_u.ports.port2
412 
413 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
414 
415 struct ipfw_state;
416 
417 struct ipfw_track {
418 	RB_ENTRY(ipfw_track)	t_rblink;
419 	struct ipfw_key		t_key;
420 	struct ip_fw		*t_rule;
421 	time_t			t_lastexp;
422 	LIST_HEAD(, ipfw_state)	t_state_list;
423 	time_t			t_expire;
424 	volatile int		*t_count;
425 	struct ipfw_trkcnt	*t_trkcnt;
426 	TAILQ_ENTRY(ipfw_track)	t_link;
427 };
428 
429 #define t_addrs			t_key.addr_u.value
430 #define t_ports			t_key.port_u.value
431 #define t_proto			t_key.proto
432 #define t_saddr			t_key.addr_u.addrs.addr1
433 #define t_daddr			t_key.addr_u.addrs.addr2
434 #define t_sport			t_key.port_u.ports.port1
435 #define t_dport			t_key.port_u.ports.port2
436 
437 RB_HEAD(ipfw_track_tree, ipfw_track);
438 TAILQ_HEAD(ipfw_track_list, ipfw_track);
439 
440 struct ipfw_state {
441 	RB_ENTRY(ipfw_state)	st_rblink;
442 	struct ipfw_key		st_key;
443 
444 	time_t			st_expire;	/* expire time */
445 	struct ip_fw		*st_rule;
446 
447 	uint64_t		st_pcnt;	/* packets */
448 	uint64_t		st_bcnt;	/* bytes */
449 
450 	/*
451 	 * st_state:
452 	 * State of this rule, typically a combination of TCP flags.
453 	 *
454 	 * st_ack_fwd/st_ack_rev:
455 	 * Most recent ACKs in forward and reverse direction.  They
456 	 * are used to generate keepalives.
457 	 */
458 	uint32_t		st_state;
459 	uint32_t		st_ack_fwd;
460 	uint32_t		st_seq_fwd;
461 	uint32_t		st_ack_rev;
462 	uint32_t		st_seq_rev;
463 
464 	uint16_t		st_flags;	/* IPFW_STATE_F_ */
465 	uint16_t		st_type;	/* O_KEEP_STATE/O_LIMIT */
466 	struct ipfw_track	*st_track;
467 
468 	LIST_ENTRY(ipfw_state)	st_trklink;
469 	TAILQ_ENTRY(ipfw_state)	st_link;
470 };
471 
472 #define st_addrs		st_key.addr_u.value
473 #define st_ports		st_key.port_u.value
474 #define st_proto		st_key.proto
475 #define st_swap			st_key.swap
476 
477 #define IPFW_STATE_F_ACKFWD	0x0001
478 #define IPFW_STATE_F_SEQFWD	0x0002
479 #define IPFW_STATE_F_ACKREV	0x0004
480 #define IPFW_STATE_F_SEQREV	0x0008
481 
482 TAILQ_HEAD(ipfw_state_list, ipfw_state);
483 RB_HEAD(ipfw_state_tree, ipfw_state);
484 
485 struct ipfw_tblent {
486 	struct radix_node	te_nodes[2];
487 	struct sockaddr_in	te_key;
488 	u_long			te_use;
489 	time_t			te_lastuse;
490 	struct ipfw_tblent	*te_sibling;
491 	volatile int		te_expired;
492 };
493 
494 struct ipfw_context {
495 	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
496 	struct ip_fw		*ipfw_default_rule;	/* default rule */
497 	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/
498 
499 	/*
500 	 * ipfw_set_disable contains one bit per set value (0..31).
501 	 * If the bit is set, all rules with the corresponding set
502  * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
503 	 * default rule and CANNOT be disabled.
504 	 */
505 	uint32_t		ipfw_set_disable;
506 
507 	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */
508 
509 	struct ip_fw		*ipfw_cont_rule;
510 
511 	struct ipfw_state_tree	ipfw_state_tree;
512 	struct ipfw_state_list	ipfw_state_list;
513 	int			ipfw_state_loosecnt;
514 	int			ipfw_state_cnt;
515 
516 	union {
517 		struct ipfw_state state;
518 		struct ipfw_track track;
519 		struct ipfw_trkcnt trkcnt;
520 	} ipfw_tmpkey;
521 
522 	struct ipfw_track_tree	ipfw_track_tree;
523 	struct ipfw_track_list	ipfw_track_list;
524 	struct ipfw_trkcnt	*ipfw_trkcnt_spare;
525 
526 	struct callout		ipfw_stateto_ch;
527 	time_t			ipfw_state_lastexp;
528 	struct netmsg_base	ipfw_stateexp_nm;
529 	struct netmsg_base	ipfw_stateexp_more;
530 	struct ipfw_state	ipfw_stateexp_anch;
531 
532 	struct callout		ipfw_trackto_ch;
533 	time_t			ipfw_track_lastexp;
534 	struct netmsg_base	ipfw_trackexp_nm;
535 	struct netmsg_base	ipfw_trackexp_more;
536 	struct ipfw_track	ipfw_trackexp_anch;
537 
538 	struct callout		ipfw_keepalive_ch;
539 	struct netmsg_base	ipfw_keepalive_nm;
540 	struct netmsg_base	ipfw_keepalive_more;
541 	struct ipfw_state	ipfw_keepalive_anch;
542 
543 	/*
544 	 * Statistics
545 	 */
546 	u_long			ipfw_sts_reap;
547 	u_long			ipfw_sts_reapfailed;
548 	u_long			ipfw_sts_overflow;
549 	u_long			ipfw_sts_nomem;
550 	u_long			ipfw_sts_tcprecycled;
551 
552 	u_long			ipfw_tks_nomem;
553 	u_long			ipfw_tks_reap;
554 	u_long			ipfw_tks_reapfailed;
555 	u_long			ipfw_tks_overflow;
556 	u_long			ipfw_tks_cntnomem;
557 
558 	u_long			ipfw_frags;
559 	u_long			ipfw_defraged;
560 	u_long			ipfw_defrag_remote;
561 
562 	/* Last field */
563 	struct radix_node_head	*ipfw_tables[];
564 };
565 
566 #define IPFW_FLAG_KEEPALIVE	0x01
567 #define IPFW_FLAG_STATEEXP	0x02
568 #define IPFW_FLAG_TRACKEXP	0x04
569 #define IPFW_FLAG_STATEREAP	0x08
570 #define IPFW_FLAG_TRACKREAP	0x10
571 
572 #define ipfw_state_tmpkey	ipfw_tmpkey.state
573 #define ipfw_track_tmpkey	ipfw_tmpkey.track
574 #define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt
575 
576 struct ipfw_global {
577 	int			ipfw_state_loosecnt;	/* cache aligned */
578 	time_t			ipfw_state_globexp __cachealign;
579 
580 	struct lwkt_token	ipfw_trkcnt_token __cachealign;
581 	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
582 	int			ipfw_trkcnt_cnt;
583 	time_t			ipfw_track_globexp;
584 
585 	/* Accessed in netisr0. */
586 	struct ip_fw		*ipfw_crossref_free __cachealign;
587 	struct callout		ipfw_crossref_ch;
588 	struct netmsg_base	ipfw_crossref_nm;
589 
590 #ifdef KLD_MODULE
591 	/*
592  * The module cannot be unloaded if there are references to
593  * certain rules of ipfw(4), e.g. from dummynet(4).
594 	 */
595 	int			ipfw_refcnt __cachealign;
596 #endif
597 } __cachealign;
598 
599 static struct ipfw_context	*ipfw_ctx[MAXCPU];
600 
601 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
602 
603 /*
604  * The following two global variables are accessed and updated only
605  * in netisr0.
606  */
607 static uint32_t static_count;	/* # of static rules */
608 static uint32_t static_ioc_len;	/* bytes of static rules */
609 
610 /*
611  * If 1, then ipfw static rules are being flushed and
612  * ipfw_chk() will skip to the default rule.
613  */
614 static int ipfw_flushing;
615 
616 static int fw_verbose;
617 static int verbose_limit;
618 
619 static int fw_debug;
620 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
621 
622 static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;
623 
624 static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
625 static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
626 
627 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
628 
629 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
630 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
631     "Firewall statistics");
632 
633 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
634     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
635 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
636     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
637     "Rule number autincrement step");
638 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
639     &fw_one_pass, 0,
640     "Only do a single pass through ipfw when using dummynet(4)");
641 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
642     &fw_debug, 0, "Enable printing of debug ip_fw statements");
643 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
644     &fw_verbose, 0, "Log matches to ipfw rules");
645 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
646     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
647 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
648     &ipfw_table_max, 0, "Max # of tables");
649 
650 static int	ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
651 static int	ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
652 static int	ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
653 static int	ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
654 static int	ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
655 static int	ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
656 
657 /*
658  * Timeouts for various events in handling states.
659  *
660  * NOTE:
661  * 1 == 0~1 second.
662  * 2 == 1~2 second(s).
663  *
664  * We use 2 seconds for FIN lifetime, so that the states will not be
665  * ripped prematurely.
666  */
667 static uint32_t dyn_ack_lifetime = 300;
668 static uint32_t dyn_syn_lifetime = 20;
669 static uint32_t dyn_finwait_lifetime = 20;
670 static uint32_t dyn_fin_lifetime = 2;
671 static uint32_t dyn_rst_lifetime = 2;
672 static uint32_t dyn_udp_lifetime = 10;
673 static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */
674 
675 /*
676  * Keepalives are sent if dyn_keepalive is set. They are sent every
677  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
678  * seconds of the lifetime of a state.
679  */
680 static uint32_t dyn_keepalive_interval = 20;
681 static uint32_t dyn_keepalive_period = 5;
682 static uint32_t dyn_keepalive = 1;	/* do send keepalives */
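/*
 * Worked example with the defaults above (numbers assumed for
 * illustration only): an established TCP state has dyn_ack_lifetime
 * (300) seconds of lifetime.  Once fewer than dyn_keepalive_interval
 * (20) seconds remain, keepalives are sent every dyn_keepalive_period
 * (5) seconds, i.e. at roughly 280, 285, 290 and 295 seconds after
 * the last refresh; traffic triggered by a keepalive refreshes
 * st_expire and the cycle starts over.
 */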
683 
684 static struct ipfw_global	ipfw_gd;
685 static int	ipfw_state_loosecnt_updthr;
686 static int	ipfw_state_max = 4096;	/* max # of states */
687 static int	ipfw_track_max = 4096;	/* max # of tracks */
688 
689 static int	ipfw_state_headroom;	/* setup at module load time */
690 static int	ipfw_state_reap_min = 8;
691 static int	ipfw_state_expire_max = 32;
692 static int	ipfw_state_scan_max = 256;
693 static int	ipfw_keepalive_max = 8;
694 static int	ipfw_track_reap_max = 4;
695 static int	ipfw_track_expire_max = 16;
696 static int	ipfw_track_scan_max = 128;
697 
698 static eventhandler_tag ipfw_ifaddr_event;
699 
700 /* Compat */
701 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
702     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
703     "Number of states and tracks");
704 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
705     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
706     "Max number of states and tracks");
707 
708 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
709     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
710     "Number of states");
711 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
712     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
713     "Max number of states");
714 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
715     &ipfw_state_headroom, 0, "headroom for state reap");
716 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
717     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
718 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
719     &ipfw_track_max, 0, "Max number of tracks");
720 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
721     &static_count, 0, "Number of static rules");
722 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
723     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
724 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
725     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
726 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
727     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
728 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
729     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
730 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
731     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
732 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
733     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
734 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
735     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
736 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
737     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
738 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
739     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
740     "I", "# of states to scan for each expire iteration");
741 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
742     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
743     "I", "# of states to expire for each expire iteration");
744 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
745     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
746     "I", "# of states to expire for each expire iteration");
747 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
748     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
749     "I", "# of states to reap for state shortage");
750 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
751     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
752     "I", "# of tracks to scan for each expire iteration");
753 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
754     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
755     "I", "# of tracks to expire for each expire iteration");
756 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
757     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
758     "I", "# of tracks to reap for track shortage");
759 
760 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
761     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
762     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
763     "LU", "# of state reaps due to states shortage");
764 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
765     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
766     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
767     "LU", "# of state reap failure");
768 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
769     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
770     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
771     "LU", "# of state overflow");
772 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
773     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
774     __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
775     "LU", "# of state allocation failure");
776 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
777     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
778     __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
779     "LU", "# of state deleted due to fast TCP port recycling");
780 
781 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
782     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
783     __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
784     "LU", "# of track allocation failure");
785 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
786     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
787     __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
788     "LU", "# of track reap due to tracks shortage");
789 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
790     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
791     __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
792     "LU", "# of track reap failure");
793 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
794     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
795     __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
796     "LU", "# of track overflow");
797 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
798     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
799     __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
800     "LU", "# of track counter allocation failure");
801 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
802     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
803     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
804     "LU", "# of IP fragements defraged");
805 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
806     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
807     __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
808     "LU", "# of IP packets after defrag");
809 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
810     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
811     __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
812     "LU", "# of IP packets after defrag dispatched to remote cpus");
813 
814 static int		ipfw_state_cmp(struct ipfw_state *,
815 			    struct ipfw_state *);
816 static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
817 			    struct ipfw_trkcnt *);
818 static int		ipfw_track_cmp(struct ipfw_track *,
819 			    struct ipfw_track *);
820 
821 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
822 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
823 
824 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
825 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
826 
827 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
828 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
829 
830 static ip_fw_chk_t	ipfw_chk;
831 static void		ipfw_track_expire_ipifunc(void *);
832 static void		ipfw_state_expire_ipifunc(void *);
833 static void		ipfw_keepalive(void *);
834 static int		ipfw_state_expire_start(struct ipfw_context *,
835 			    int, int);
836 static void		ipfw_crossref_timeo(void *);
837 
838 #define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
839 #define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
840 #define IPFW_TRKCNT_TOKINIT	\
841 	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
842 
843 static void
844 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
845     const struct sockaddr *netmask)
846 {
847 	const u_char *cp1 = (const u_char *)src;
848 	u_char *cp2 = (u_char *)dst;
849 	const u_char *cp3 = (const u_char *)netmask;
850 	u_char *cplim = cp2 + *cp3;
851 	u_char *cplim2 = cp2 + *cp1;
852 
853 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
854 	cp3 += 2;
855 	if (cplim > cplim2)
856 		cplim = cplim2;
857 	while (cp2 < cplim)
858 		*cp2++ = *cp1++ & *cp3++;
859 	if (cp2 < cplim2)
860 		bzero(cp2, cplim2 - cp2);
861 }
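/*
 * Usage sketch for sa_maskedcopy() (hypothetical values): copying a
 * host address through a netmask yields the network key, with the
 * tail of the destination sockaddr zeroed, e.g. 10.1.2.3 masked by
 * 255.255.255.0 becomes 10.1.2.0:
 *
 *	struct sockaddr_in host, mask, net;
 *	(host.sin_addr = 10.1.2.3, mask.sin_addr = 255.255.255.0)
 *	sa_maskedcopy((struct sockaddr *)&host,
 *	    (struct sockaddr *)&net, (struct sockaddr *)&mask);
 *	(net.sin_addr is now 10.1.2.0)
 */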
862 
863 static __inline void
864 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
865     in_addr_t daddr, uint16_t dport, uint8_t proto)
866 {
867 
868 	key->proto = proto;
869 	key->swap = 0;
870 
871 	if (saddr < daddr) {
872 		key->addr_u.addrs.addr1 = daddr;
873 		key->addr_u.addrs.addr2 = saddr;
874 		key->swap |= IPFW_KEY_SWAP_ADDRS;
875 	} else {
876 		key->addr_u.addrs.addr1 = saddr;
877 		key->addr_u.addrs.addr2 = daddr;
878 	}
879 
880 	if (sport < dport) {
881 		key->port_u.ports.port1 = dport;
882 		key->port_u.ports.port2 = sport;
883 		key->swap |= IPFW_KEY_SWAP_PORTS;
884 	} else {
885 		key->port_u.ports.port1 = sport;
886 		key->port_u.ports.port2 = dport;
887 	}
888 
889 	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
890 		key->swap |= IPFW_KEY_SWAP_PORTS;
891 	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
892 		key->swap |= IPFW_KEY_SWAP_ADDRS;
893 }
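/*
 * Example (illustrative): ipfw_key_build() canonicalizes a flow so
 * that both directions yield the same key, with the direction
 * recorded in key->swap.  For a TCP flow 10.0.0.1:12345 <-> 10.0.0.2:80
 * (addresses written symbolically):
 *
 *	ipfw_key_build(&k1, addr(10.0.0.1), 12345,
 *	    addr(10.0.0.2), 80, IPPROTO_TCP);
 *	ipfw_key_build(&k2, addr(10.0.0.2), 80,
 *	    addr(10.0.0.1), 12345, IPPROTO_TCP);
 *
 * k1 and k2 end up with identical addresses and ports (addr1 =
 * 10.0.0.2, addr2 = 10.0.0.1, port1 = 12345, port2 = 80), while
 * (k1.swap ^ k2.swap) == IPFW_KEY_SWAP_ALL, which the compare
 * functions below treat as equal; the swap bits are what distinguish
 * a forward match from a reverse match.
 */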
894 
895 static __inline void
896 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
897     in_addr_t *daddr, uint16_t *dport)
898 {
899 
900 	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
901 		*saddr = key->addr_u.addrs.addr2;
902 		*daddr = key->addr_u.addrs.addr1;
903 	} else {
904 		*saddr = key->addr_u.addrs.addr1;
905 		*daddr = key->addr_u.addrs.addr2;
906 	}
907 
908 	if (key->swap & IPFW_KEY_SWAP_PORTS) {
909 		*sport = key->port_u.ports.port2;
910 		*dport = key->port_u.ports.port1;
911 	} else {
912 		*sport = key->port_u.ports.port1;
913 		*dport = key->port_u.ports.port2;
914 	}
915 }
916 
917 static int
918 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
919 {
920 
921 	if (s1->st_proto > s2->st_proto)
922 		return (1);
923 	if (s1->st_proto < s2->st_proto)
924 		return (-1);
925 
926 	if (s1->st_addrs > s2->st_addrs)
927 		return (1);
928 	if (s1->st_addrs < s2->st_addrs)
929 		return (-1);
930 
931 	if (s1->st_ports > s2->st_ports)
932 		return (1);
933 	if (s1->st_ports < s2->st_ports)
934 		return (-1);
935 
936 	if (s1->st_swap == s2->st_swap ||
937 	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
938 		return (0);
939 
940 	if (s1->st_swap > s2->st_swap)
941 		return (1);
942 	else
943 		return (-1);
944 }
945 
946 static int
947 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
948 {
949 
950 	if (t1->tc_proto > t2->tc_proto)
951 		return (1);
952 	if (t1->tc_proto < t2->tc_proto)
953 		return (-1);
954 
955 	if (t1->tc_addrs > t2->tc_addrs)
956 		return (1);
957 	if (t1->tc_addrs < t2->tc_addrs)
958 		return (-1);
959 
960 	if (t1->tc_ports > t2->tc_ports)
961 		return (1);
962 	if (t1->tc_ports < t2->tc_ports)
963 		return (-1);
964 
965 	if (t1->tc_ruleid > t2->tc_ruleid)
966 		return (1);
967 	if (t1->tc_ruleid < t2->tc_ruleid)
968 		return (-1);
969 
970 	return (0);
971 }
972 
973 static int
974 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
975 {
976 
977 	if (t1->t_proto > t2->t_proto)
978 		return (1);
979 	if (t1->t_proto < t2->t_proto)
980 		return (-1);
981 
982 	if (t1->t_addrs > t2->t_addrs)
983 		return (1);
984 	if (t1->t_addrs < t2->t_addrs)
985 		return (-1);
986 
987 	if (t1->t_ports > t2->t_ports)
988 		return (1);
989 	if (t1->t_ports < t2->t_ports)
990 		return (-1);
991 
992 	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
993 		return (1);
994 	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
995 		return (-1);
996 
997 	return (0);
998 }
999 
1000 static void
1001 ipfw_state_max_set(int state_max)
1002 {
1003 
1004 	ipfw_state_max = state_max;
1005 	/* Allow 5% states over-allocation. */
1006 	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1007 }
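/*
 * Worked example (illustrative numbers): with the default
 * ipfw_state_max of 4096 and netisr_ncpus == 4, the threshold is
 * (4096 / 20) / 4 = 51, i.e. each CPU tolerates roughly 51 states of
 * drift (~1.25% of the global max) before the loose count is
 * resynchronized (cf. ipfw_state_cntsync() below).
 */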
1008 
1009 static __inline int
1010 ipfw_state_cntcoll(void)
1011 {
1012 	int cpu, state_cnt = 0;
1013 
1014 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1015 		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1016 	return (state_cnt);
1017 }
1018 
1019 static __inline int
1020 ipfw_state_cntsync(void)
1021 {
1022 	int state_cnt;
1023 
1024 	state_cnt = ipfw_state_cntcoll();
1025 	ipfw_gd.ipfw_state_loosecnt = state_cnt;
1026 	return (state_cnt);
1027 }
1028 
1029 static __inline int
1030 ipfw_free_rule(struct ip_fw *rule)
1031 {
1032 	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1033 	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1034 	rule->refcnt--;
1035 	if (rule->refcnt == 0) {
1036 		if (rule->cross_rules != NULL)
1037 			kfree(rule->cross_rules, M_IPFW);
1038 		kfree(rule, M_IPFW);
1039 		return 1;
1040 	}
1041 	return 0;
1042 }
1043 
1044 static void
1045 ipfw_unref_rule(void *priv)
1046 {
1047 	ipfw_free_rule(priv);
1048 #ifdef KLD_MODULE
1049 	KASSERT(ipfw_gd.ipfw_refcnt > 0,
1050 	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1051 	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1052 #endif
1053 }
1054 
1055 static __inline void
1056 ipfw_ref_rule(struct ip_fw *rule)
1057 {
1058 	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1059 #ifdef KLD_MODULE
1060 	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1061 #endif
1062 	rule->refcnt++;
1063 }
1064 
1065 /*
1066  * This macro maps an ip pointer into a layer3 header pointer of type T
1067  */
1068 #define	L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
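/*
 * Example: ip_hl counts 32-bit words, so for a 20-byte IPv4 header
 * (ip_hl == 5) the macro above advances the pointer by 5 * 4 bytes,
 * yielding the transport header that follows the IP header and any
 * options:
 *
 *	struct tcphdr *tcp = L3HDR(struct tcphdr, ip);
 */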
1069 
1070 static __inline int
1071 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1072 {
1073 	int type = L3HDR(struct icmp,ip)->icmp_type;
1074 
1075 	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
1076 }
1077 
1078 #define TT	((1 << ICMP_ECHO) | \
1079 		 (1 << ICMP_ROUTERSOLICIT) | \
1080 		 (1 << ICMP_TSTAMP) | \
1081 		 (1 << ICMP_IREQ) | \
1082 		 (1 << ICMP_MASKREQ))
1083 
1084 static int
1085 is_icmp_query(struct ip *ip)
1086 {
1087 	int type = L3HDR(struct icmp, ip)->icmp_type;
1088 
1089 	return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
1090 }
1091 
1092 #undef TT
1093 
1094 /*
1095  * The following checks use two arrays of 8 or 16 bits to store the
1096  * bits that we want set or clear, respectively. They are in the
1097  * low and high half of cmd->arg1 or cmd->d[0].
1098  *
1099  * We scan options and store the bits we find set. We succeed if
1100  *
1101  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1102  *
1103  * The code is sometimes optimized not to store additional variables.
1104  */
1105 static int
1106 flags_match(ipfw_insn *cmd, uint8_t bits)
1107 {
1108 	u_char want_clear;
1109 	bits = ~bits;
1110 
1111 	if (((cmd->arg1 & 0xff) & bits) != 0)
1112 		return 0; /* some bits we want set were clear */
1113 
1114 	want_clear = (cmd->arg1 >> 8) & 0xff;
1115 	if ((want_clear & bits) != want_clear)
1116 		return 0; /* some bits we want clear were set */
1117 	return 1;
1118 }
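/*
 * Worked example (illustrative): for a match like "tcpflags syn,!ack"
 * the instruction would carry want_set = TH_SYN (0x02) in the low
 * byte of arg1 and want_clear = TH_ACK (0x10) in the high byte, i.e.
 * arg1 == 0x1002.  A plain SYN segment (bits == 0x02) passes:
 *
 *	~bits == 0xfd; 0x02 & 0xfd == 0    -> no wanted-set bit is clear
 *	0x10 & 0xfd == 0x10 == want_clear  -> no wanted-clear bit is set
 *
 * while a SYN|ACK (bits == 0x12, ~bits == 0xed) fails the second
 * test, since 0x10 & 0xed == 0 != want_clear.
 */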
1119 
1120 static int
1121 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1122 {
1123 	int optlen, bits = 0;
1124 	u_char *cp = (u_char *)(ip + 1);
1125 	int x = (ip->ip_hl << 2) - sizeof(struct ip);
1126 
1127 	for (; x > 0; x -= optlen, cp += optlen) {
1128 		int opt = cp[IPOPT_OPTVAL];
1129 
1130 		if (opt == IPOPT_EOL)
1131 			break;
1132 
1133 		if (opt == IPOPT_NOP) {
1134 			optlen = 1;
1135 		} else {
1136 			optlen = cp[IPOPT_OLEN];
1137 			if (optlen <= 0 || optlen > x)
1138 				return 0; /* invalid or truncated */
1139 		}
1140 
1141 		switch (opt) {
1142 		case IPOPT_LSRR:
1143 			bits |= IP_FW_IPOPT_LSRR;
1144 			break;
1145 
1146 		case IPOPT_SSRR:
1147 			bits |= IP_FW_IPOPT_SSRR;
1148 			break;
1149 
1150 		case IPOPT_RR:
1151 			bits |= IP_FW_IPOPT_RR;
1152 			break;
1153 
1154 		case IPOPT_TS:
1155 			bits |= IP_FW_IPOPT_TS;
1156 			break;
1157 
1158 		default:
1159 			break;
1160 		}
1161 	}
1162 	return (flags_match(cmd, bits));
1163 }
1164 
1165 static int
1166 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1167 {
1168 	int optlen, bits = 0;
1169 	struct tcphdr *tcp = L3HDR(struct tcphdr, ip);
1170 	u_char *cp = (u_char *)(tcp + 1);
1171 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1172 
1173 	for (; x > 0; x -= optlen, cp += optlen) {
1174 		int opt = cp[0];
1175 
1176 		if (opt == TCPOPT_EOL)
1177 			break;
1178 
1179 		if (opt == TCPOPT_NOP) {
1180 			optlen = 1;
1181 		} else {
1182 			optlen = cp[1];
1183 			if (optlen <= 0)
1184 				break;
1185 		}
1186 
1187 		switch (opt) {
1188 		case TCPOPT_MAXSEG:
1189 			bits |= IP_FW_TCPOPT_MSS;
1190 			break;
1191 
1192 		case TCPOPT_WINDOW:
1193 			bits |= IP_FW_TCPOPT_WINDOW;
1194 			break;
1195 
1196 		case TCPOPT_SACK_PERMITTED:
1197 		case TCPOPT_SACK:
1198 			bits |= IP_FW_TCPOPT_SACK;
1199 			break;
1200 
1201 		case TCPOPT_TIMESTAMP:
1202 			bits |= IP_FW_TCPOPT_TS;
1203 			break;
1204 
1205 		case TCPOPT_CC:
1206 		case TCPOPT_CCNEW:
1207 		case TCPOPT_CCECHO:
1208 			bits |= IP_FW_TCPOPT_CC;
1209 			break;
1210 
1211 		default:
1212 			break;
1213 		}
1214 	}
1215 	return (flags_match(cmd, bits));
1216 }
1217 
1218 static int
1219 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1220 {
1221 	if (ifp == NULL)	/* no iface with this packet, match fails */
1222 		return 0;
1223 
1224 	/* Check by name or by IP address */
1225 	if (cmd->name[0] != '\0') { /* match by name */
1226 		/* Check name */
1227 		if (cmd->p.glob) {
1228 			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1229 				return(1);
1230 		} else {
1231 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1232 				return(1);
1233 		}
1234 	} else {
1235 		struct ifaddr_container *ifac;
1236 
1237 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1238 			struct ifaddr *ia = ifac->ifa;
1239 
1240 			if (ia->ifa_addr == NULL)
1241 				continue;
1242 			if (ia->ifa_addr->sa_family != AF_INET)
1243 				continue;
1244 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1245 			    (ia->ifa_addr))->sin_addr.s_addr)
1246 				return(1);	/* match */
1247 		}
1248 	}
1249 	return(0);	/* no match, fail ... */
1250 }
1251 
1252 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
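/*
 * SNPARGS() expands to the (pointer, size) pair for appending at
 * offset 'len', clamping the remaining size to 0 once the buffer is
 * full, so the chained writes below cannot overflow, e.g.:
 *
 *	len = ksnprintf(SNPARGS(proto, 0), "TCP %s", ...);
 *	ksnprintf(SNPARGS(proto, len), ":%d", sport);
 */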
1253 
1254 /*
1255  * We enter here when we have a rule with O_LOG.
1256  * XXX this function alone takes about 2Kbytes of code!
1257  */
1258 static void
1259 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1260     struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1261 {
1262 	char *action;
1263 	int limit_reached = 0;
1264 	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1265 
1266 	fragment[0] = '\0';
1267 	proto[0] = '\0';
1268 
1269 	if (f == NULL) {	/* bogus pkt */
1270 		if (verbose_limit != 0 &&
1271 		    ctx->ipfw_norule_counter >= verbose_limit)
1272 			return;
1273 		ctx->ipfw_norule_counter++;
1274 		if (ctx->ipfw_norule_counter == verbose_limit)
1275 			limit_reached = verbose_limit;
1276 		action = "Refuse";
1277 	} else {	/* O_LOG is the first action, find the real one */
1278 		ipfw_insn *cmd = ACTION_PTR(f);
1279 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1280 
1281 		if (l->max_log != 0 && l->log_left == 0)
1282 			return;
1283 		l->log_left--;
1284 		if (l->log_left == 0)
1285 			limit_reached = l->max_log;
1286 		cmd += F_LEN(cmd);	/* point to first action */
1287 		if (cmd->opcode == O_PROB)
1288 			cmd += F_LEN(cmd);
1289 
1290 		action = action2;
1291 		switch (cmd->opcode) {
1292 		case O_DENY:
1293 			action = "Deny";
1294 			break;
1295 
1296 		case O_REJECT:
1297 			if (cmd->arg1==ICMP_REJECT_RST) {
1298 				action = "Reset";
1299 			} else if (cmd->arg1==ICMP_UNREACH_HOST) {
1300 				action = "Reject";
1301 			} else {
1302 				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1303 					  cmd->arg1);
1304 			}
1305 			break;
1306 
1307 		case O_ACCEPT:
1308 			action = "Accept";
1309 			break;
1310 
1311 		case O_COUNT:
1312 			action = "Count";
1313 			break;
1314 
1315 		case O_DIVERT:
1316 			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1317 			break;
1318 
1319 		case O_TEE:
1320 			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1321 			break;
1322 
1323 		case O_SKIPTO:
1324 			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1325 			break;
1326 
1327 		case O_PIPE:
1328 			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1329 			break;
1330 
1331 		case O_QUEUE:
1332 			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1333 			break;
1334 
1335 		case O_FORWARD_IP:
1336 			{
1337 				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1338 				int len;
1339 
1340 				len = ksnprintf(SNPARGS(action2, 0),
1341 				    "Forward to %s",
1342 				    kinet_ntoa(sa->sa.sin_addr, abuf));
1343 				if (sa->sa.sin_port) {
1344 					ksnprintf(SNPARGS(action2, len), ":%d",
1345 						  sa->sa.sin_port);
1346 				}
1347 			}
1348 			break;
1349 
1350 		default:
1351 			action = "UNKNOWN";
1352 			break;
1353 		}
1354 	}
1355 
1356 	if (hlen == 0) {	/* non-ip */
1357 		ksnprintf(SNPARGS(proto, 0), "MAC");
1358 	} else {
1359 		struct ip *ip = mtod(m, struct ip *);
1360 		/* these three are all aliases to the same thing */
1361 		struct icmp *const icmp = L3HDR(struct icmp, ip);
1362 		struct tcphdr *const tcp = (struct tcphdr *)icmp;
1363 		struct udphdr *const udp = (struct udphdr *)icmp;
1364 
1365 		int ip_off, offset, ip_len;
1366 		int len;
1367 
1368 		if (eh != NULL) { /* layer 2 packets are as on the wire */
1369 			ip_off = ntohs(ip->ip_off);
1370 			ip_len = ntohs(ip->ip_len);
1371 		} else {
1372 			ip_off = ip->ip_off;
1373 			ip_len = ip->ip_len;
1374 		}
1375 		offset = ip_off & IP_OFFMASK;
1376 		switch (ip->ip_p) {
1377 		case IPPROTO_TCP:
1378 			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1379 					kinet_ntoa(ip->ip_src, abuf));
1380 			if (offset == 0) {
1381 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1382 					  ntohs(tcp->th_sport),
1383 					  kinet_ntoa(ip->ip_dst, abuf),
1384 					  ntohs(tcp->th_dport));
1385 			} else {
1386 				ksnprintf(SNPARGS(proto, len), " %s",
1387 					  kinet_ntoa(ip->ip_dst, abuf));
1388 			}
1389 			break;
1390 
1391 		case IPPROTO_UDP:
1392 			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1393 					kinet_ntoa(ip->ip_src, abuf));
1394 			if (offset == 0) {
1395 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1396 					  ntohs(udp->uh_sport),
1397 					  kinet_ntoa(ip->ip_dst, abuf),
1398 					  ntohs(udp->uh_dport));
1399 			} else {
1400 				ksnprintf(SNPARGS(proto, len), " %s",
1401 					  kinet_ntoa(ip->ip_dst, abuf));
1402 			}
1403 			break;
1404 
1405 		case IPPROTO_ICMP:
1406 			if (offset == 0) {
1407 				len = ksnprintf(SNPARGS(proto, 0),
1408 						"ICMP:%u.%u ",
1409 						icmp->icmp_type,
1410 						icmp->icmp_code);
1411 			} else {
1412 				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1413 			}
1414 			len += ksnprintf(SNPARGS(proto, len), "%s",
1415 					 kinet_ntoa(ip->ip_src, abuf));
1416 			ksnprintf(SNPARGS(proto, len), " %s",
1417 				  kinet_ntoa(ip->ip_dst, abuf));
1418 			break;
1419 
1420 		default:
1421 			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1422 					kinet_ntoa(ip->ip_src, abuf));
1423 			ksnprintf(SNPARGS(proto, len), " %s",
1424 				  kinet_ntoa(ip->ip_dst, abuf));
1425 			break;
1426 		}
1427 
1428 		if (ip_off & (IP_MF | IP_OFFMASK)) {
1429 			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1430 				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1431 				  offset << 3, (ip_off & IP_MF) ? "+" : "");
1432 		}
1433 	}
1434 
1435 	if (oif || m->m_pkthdr.rcvif) {
1436 		log(LOG_SECURITY | LOG_INFO,
1437 		    "ipfw: %d %s %s %s via %s%s\n",
1438 		    f ? f->rulenum : -1,
1439 		    action, proto, oif ? "out" : "in",
1440 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1441 		    fragment);
1442 	} else {
1443 		log(LOG_SECURITY | LOG_INFO,
1444 		    "ipfw: %d %s %s [no if info]%s\n",
1445 		    f ? f->rulenum : -1,
1446 		    action, proto, fragment);
1447 	}
1448 
1449 	if (limit_reached) {
1450 		log(LOG_SECURITY | LOG_NOTICE,
1451 		    "ipfw: limit %d reached on entry %d\n",
1452 		    limit_reached, f ? f->rulenum : -1);
1453 	}
1454 }
1455 
1456 #undef SNPARGS
1457 
1458 #define TIME_LEQ(a, b)	((a) - (b) <= 0)
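/*
 * TIME_LEQ() uses the signed-difference trick: (a) - (b) is compared
 * against 0 as a signed value, so the test stays correct across
 * counter wraparound as long as the two times are within half the
 * type's range of each other.  Illustration with 32-bit arithmetic:
 * TIME_LEQ(0xfffffff0, 0x00000010) evaluates (a) - (b) to -32, so an
 * expire time just before the wrap still compares as already passed
 * against a time_uptime just after it.
 */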
1459 
1460 static void
1461 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1462 {
1463 
1464 	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT,
1465 	    ("invalid state type %u", s->st_type));
1466 	KASSERT(ctx->ipfw_state_cnt > 0,
1467 	    ("invalid state count %d", ctx->ipfw_state_cnt));
1468 
1469 	if (s->st_track != NULL) {
1470 		struct ipfw_track *t = s->st_track;
1471 
1472 		KASSERT(!LIST_EMPTY(&t->t_state_list),
1473 		    ("track state list is empty"));
1474 		LIST_REMOVE(s, st_trklink);
1475 
1476 		KASSERT(*t->t_count > 0,
1477 		    ("invalid track count %d", *t->t_count));
1478 		atomic_subtract_int(t->t_count, 1);
1479 	}
1480 
1481 	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1482 	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1483 	kfree(s, M_IPFW);
1484 
1485 	ctx->ipfw_state_cnt--;
1486 	if (ctx->ipfw_state_loosecnt > 0)
1487 		ctx->ipfw_state_loosecnt--;
1488 }
1489 
1490 static int
1491 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1492 {
1493 	struct ipfw_state *s, *anchor;
1494 	int expired;
1495 
1496 	if (reap_max < ipfw_state_reap_min)
1497 		reap_max = ipfw_state_reap_min;
1498 
1499 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1500 		/*
1501 		 * Kick start state expiring.  Ignore scan limit,
1502 		 * we are short of states.
1503 		 */
1504 		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1505 		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1506 		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1507 		return (expired);
1508 	}
1509 
1510 	/*
1511 	 * States are being expired.
1512 	 */
1513 
1514 	if (ctx->ipfw_state_cnt == 0)
1515 		return (0);
1516 
1517 	expired = 0;
1518 	anchor = &ctx->ipfw_stateexp_anch;
1519 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1520 		/*
1521 		 * Ignore scan limit; we are short of states.
1522 		 */
1523 
1524 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1525 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1526 
1527 		if (s->st_type == O_ANCHOR)
1528 			continue;
1529 
1530 		if (IPFW_STATE_TCPCLOSED(s) ||
1531 		    TIME_LEQ(s->st_expire, time_uptime)) {
1532 			ipfw_state_del(ctx, s);
1533 			if (++expired >= reap_max)
1534 				break;
1535 			if ((expired & 0xff) == 0 &&
1536 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1537 			    ipfw_state_max)
1538 				break;
1539 		}
1540 	}
1541 	/*
1542 	 * NOTE:
1543 	 * Leave the anchor on the list, even if the end of the list has
1544 	 * been reached.  ipfw_state_expire_more_dispatch() will handle
1545 	 * the removal.
1546 	 */
1547 	return (expired);
1548 }
1549 
1550 static void
1551 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1552 {
1553 	struct ipfw_state *s, *sn;
1554 
1555 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1556 		if (s->st_type == O_ANCHOR)
1557 			continue;
1558 		if (rule != NULL && s->st_rule != rule)
1559 			continue;
1560 		ipfw_state_del(ctx, s);
1561 	}
1562 }
1563 
1564 static void
1565 ipfw_state_expire_done(struct ipfw_context *ctx)
1566 {
1567 
1568 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1569 	    ("stateexp is not in progress"));
1570 	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1571 	callout_reset(&ctx->ipfw_stateto_ch, hz,
1572 	    ipfw_state_expire_ipifunc, NULL);
1573 }
1574 
1575 static void
1576 ipfw_state_expire_more(struct ipfw_context *ctx)
1577 {
1578 	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1579 
1580 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1581 	    ("stateexp is not in progress"));
1582 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1583 	    ("stateexp more did not finish"));
1584 	netisr_sendmsg_oncpu(nm);
1585 }
1586 
1587 static int
1588 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1589     int scan_max, int expire_max)
1590 {
1591 	struct ipfw_state *s;
1592 	int scanned = 0, expired = 0;
1593 
1594 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1595 	    ("stateexp is not in progress"));
1596 
1597 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1598 		if (scanned++ >= scan_max) {
1599 			ipfw_state_expire_more(ctx);
1600 			return (expired);
1601 		}
1602 
1603 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1604 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1605 
1606 		if (s->st_type == O_ANCHOR)
1607 			continue;
1608 
1609 		if (TIME_LEQ(s->st_expire, time_uptime) ||
1610 		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1611 		     IPFW_STATE_TCPCLOSED(s))) {
1612 			ipfw_state_del(ctx, s);
1613 			if (++expired >= expire_max) {
1614 				ipfw_state_expire_more(ctx);
1615 				return (expired);
1616 			}
1617 			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1618 			    (expired & 0xff) == 0 &&
1619 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1620 			    ipfw_state_max) {
1621 				ipfw_state_expire_more(ctx);
1622 				return (expired);
1623 			}
1624 		}
1625 	}
1626 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1627 	ipfw_state_expire_done(ctx);
1628 	return (expired);
1629 }
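/*
 * The loop above is an instance of the anchor-walk idiom used for all
 * incremental scans in this file: a dummy entry (st_type == O_ANCHOR)
 * stays on the list and is slid past each examined state, so the scan
 * can be suspended (ipfw_state_expire_more() requeues a netmsg) and
 * later resumed from the anchor rather than from the list head.  A
 * minimal sketch of the idiom:
 *
 *	TAILQ_INSERT_HEAD(list, anchor, link);
 *	while ((s = TAILQ_NEXT(anchor, link)) != NULL) {
 *		TAILQ_REMOVE(list, anchor, link);
 *		TAILQ_INSERT_AFTER(list, s, anchor, link);
 *		if (quota exceeded)
 *			return;          (anchor marks the position)
 *		examine(s);
 *	}
 *	TAILQ_REMOVE(list, anchor, link);    (scan complete)
 */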
1630 
1631 static void
1632 ipfw_state_expire_more_dispatch(netmsg_t nm)
1633 {
1634 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1635 	struct ipfw_state *anchor;
1636 
1637 	ASSERT_NETISR_NCPUS(mycpuid);
1638 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1639 	    ("statexp is not in progress"));
1640 
1641 	/* Reply ASAP */
1642 	netisr_replymsg(&nm->base, 0);
1643 
1644 	anchor = &ctx->ipfw_stateexp_anch;
1645 	if (ctx->ipfw_state_cnt == 0) {
1646 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1647 		ipfw_state_expire_done(ctx);
1648 		return;
1649 	}
1650 	ipfw_state_expire_loop(ctx, anchor,
1651 	    ipfw_state_scan_max, ipfw_state_expire_max);
1652 }
1653 
1654 static int
1655 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1656 {
1657 	struct ipfw_state *anchor;
1658 
1659 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1660 	    ("stateexp is in progress"));
1661 	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1662 
1663 	if (ctx->ipfw_state_cnt == 0) {
1664 		ipfw_state_expire_done(ctx);
1665 		return (0);
1666 	}
1667 
1668 	/*
1669 	 * Do not expire more than once per second, it is useless.
1670 	 */
1671 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1672 	    ctx->ipfw_state_lastexp == time_uptime) {
1673 		ipfw_state_expire_done(ctx);
1674 		return (0);
1675 	}
1676 	ctx->ipfw_state_lastexp = time_uptime;
1677 
1678 	anchor = &ctx->ipfw_stateexp_anch;
1679 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1680 	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1681 }
1682 
1683 static void
1684 ipfw_state_expire_dispatch(netmsg_t nm)
1685 {
1686 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1687 
1688 	ASSERT_NETISR_NCPUS(mycpuid);
1689 
1690 	/* Reply ASAP */
1691 	crit_enter();
1692 	netisr_replymsg(&nm->base, 0);
1693 	crit_exit();
1694 
1695 	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1696 		/* Running; done. */
1697 		return;
1698 	}
1699 	ipfw_state_expire_start(ctx,
1700 	    ipfw_state_scan_max, ipfw_state_expire_max);
1701 }
1702 
1703 static void
1704 ipfw_state_expire_ipifunc(void *dummy __unused)
1705 {
1706 	struct netmsg_base *msg;
1707 
1708 	KKASSERT(mycpuid < netisr_ncpus);
1709 	msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
1710 
1711 	crit_enter();
1712 	if (msg->lmsg.ms_flags & MSGF_DONE)
1713 		netisr_sendmsg_oncpu(msg);
1714 	crit_exit();
1715 }
1716 
1717 static boolean_t
1718 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
1719 {
1720 	uint32_t seq = ntohl(tcp->th_seq);
1721 	uint32_t ack = ntohl(tcp->th_ack);
1722 
1723 	if (tcp->th_flags & TH_RST)
1724 		return (TRUE);
1725 
1726 	if (dir == MATCH_FORWARD) {
1727 		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
1728 			s->st_flags |= IPFW_STATE_F_SEQFWD;
1729 			s->st_seq_fwd = seq;
1730 		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
1731 			s->st_seq_fwd = seq;
1732 		} else {
1733 			/* Out-of-sequence; done. */
1734 			return (FALSE);
1735 		}
1736 		if (tcp->th_flags & TH_ACK) {
1737 			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
1738 				s->st_flags |= IPFW_STATE_F_ACKFWD;
1739 				s->st_ack_fwd = ack;
1740 			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
1741 				s->st_ack_fwd = ack;
1742 			} else {
1743 				/* Out-of-sequence; done. */
1744 				return (FALSE);
1745 			}
1746 
1747 			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
1748 			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
1749 				s->st_state |= (TH_ACK << 8);
1750 		}
1751 	} else {
1752 		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
1753 			s->st_flags |= IPFW_STATE_F_SEQREV;
1754 			s->st_seq_rev = seq;
1755 		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
1756 			s->st_seq_rev = seq;
1757 		} else {
1758 			/* Out-of-sequence; done. */
1759 			return (FALSE);
1760 		}
1761 		if (tcp->th_flags & TH_ACK) {
1762 			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
1763 				s->st_flags |= IPFW_STATE_F_ACKREV;
1764 				s->st_ack_rev = ack;
1765 			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
1766 				s->st_ack_rev = ack;
1767 			} else {
1768 				/* Out-of-sequence; done. */
1769 				return (FALSE);
1770 			}
1771 
1772 			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
1773 			    s->st_ack_rev == s->st_seq_fwd + 1)
1774 				s->st_state |= TH_ACK;
1775 		}
1776 	}
1777 	return (TRUE);
1778 }
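
/*
 * NOTE: The window checks above use TCP serial-number arithmetic;
 * SEQ_GEQ() from <netinet/tcp_seq.h> is essentially:
 *
 *	#define SEQ_GEQ(a, b)	((int)((a) - (b)) >= 0)
 *
 * so a sequence number that wraps past 2^32 (e.g. 0xffffff00 ->
 * 0x00000100) still compares as "later" and keeps advancing the
 * tracked st_seq_fwd/st_seq_rev values.
 */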
1779 
1780 static void
1781 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
1782     const struct tcphdr *tcp, struct ipfw_state *s)
1783 {
1784 
1785 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
1786 		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
1787 
1788 		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
1789 			return;
1790 
1791 		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
1792 		switch (s->st_state & IPFW_STATE_TCPSTATES) {
1793 		case TH_SYN:				/* opening */
1794 			s->st_expire = time_uptime + dyn_syn_lifetime;
1795 			break;
1796 
1797 		case BOTH_SYN:			/* move to established */
1798 		case BOTH_SYN | TH_FIN:		/* one side tries to close */
1799 		case BOTH_SYN | (TH_FIN << 8):
1800 			s->st_expire = time_uptime + dyn_ack_lifetime;
1801 			break;
1802 
1803 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
1804 			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
1805 				/* And both FINs were ACKed. */
1806 				s->st_expire = time_uptime + dyn_fin_lifetime;
1807 			} else {
1808 				s->st_expire = time_uptime +
1809 				    dyn_finwait_lifetime;
1810 			}
1811 			break;
1812 
1813 		default:
1814 #if 0
1815 			/*
1816 			 * reset or some invalid combination, but can also
1817 			 * occur if we use keep-state the wrong way.
1818 			 */
1819 			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
1820 				kprintf("invalid state: 0x%x\n", s->st_state);
1821 #endif
1822 			s->st_expire = time_uptime + dyn_rst_lifetime;
1823 			break;
1824 		}
1825 	} else if (pkt->proto == IPPROTO_UDP) {
1826 		s->st_expire = time_uptime + dyn_udp_lifetime;
1827 	} else {
1828 		/* other protocols */
1829 		s->st_expire = time_uptime + dyn_short_lifetime;
1830 	}
1831 }
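
/*
 * Reading aid for the lifetime selection above (the switch on
 * st_state is authoritative):
 *
 *	SYN seen from one side only        -> dyn_syn_lifetime
 *	SYNs from both sides (established) -> dyn_ack_lifetime
 *	FINs from both sides, both ACKed   -> dyn_fin_lifetime
 *	FINs from both sides, not ACKed    -> dyn_finwait_lifetime
 *	RST or invalid combination         -> dyn_rst_lifetime
 */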
1832 
1833 /*
1834  * Lookup a state.
1835  */
1836 static struct ipfw_state *
1837 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1838     int *match_direction, const struct tcphdr *tcp)
1839 {
1840 	struct ipfw_state *key, *s;
1841 	int dir = MATCH_NONE;
1842 
1843 	key = &ctx->ipfw_state_tmpkey;
1844 	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
1845 	    pkt->dst_ip, pkt->dst_port, pkt->proto);
1846 	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
1847 	if (s == NULL)
1848 		goto done; /* not found. */
1849 	if (TIME_LEQ(s->st_expire, time_uptime)) {
1850 		/* Expired. */
1851 		ipfw_state_del(ctx, s);
1852 		s = NULL;
1853 		goto done;
1854 	}
1855 	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
1856 		/* TCP ports recycling is too fast. */
1857 		ctx->ipfw_sts_tcprecycled++;
1858 		ipfw_state_del(ctx, s);
1859 		s = NULL;
1860 		goto done;
1861 	}
1862 
1863 	if (s->st_swap == key->st_swap) {
1864 		dir = MATCH_FORWARD;
1865 	} else {
1866 		KASSERT((s->st_swap & key->st_swap) == 0,
1867 		    ("found mismatch state"));
1868 		dir = MATCH_REVERSE;
1869 	}
1870 
1871 	/* Update this state. */
1872 	ipfw_state_update(pkt, dir, tcp, s);
1873 
1874 	if (s->st_track != NULL) {
1875 		/* This track has been used. */
1876 		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
1877 	}
1878 done:
1879 	if (match_direction)
1880 		*match_direction = dir;
1881 	return (s);
1882 }
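
/*
 * A note on the direction matching above: ipfw_key_build() is assumed
 * to store the address/port pairs of st_key in a canonical order and
 * to record in st_swap which fields had to be swapped to get there,
 * so that both directions of a flow build the same RB-tree key.  For
 * a flow A:1000 -> B:80, the reply B:80 -> A:1000 therefore finds the
 * same state, but with opposite swap bits, which the st_swap
 * comparison turns into MATCH_FORWARD vs MATCH_REVERSE.
 */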
1883 
1884 static __inline struct ip_fw *
1885 ipfw_state_lookup_rule(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
1886     int *match_direction, const struct tcphdr *tcp, uint16_t len)
1887 {
1888 	struct ipfw_state *s;
1889 
1890 	s = ipfw_state_lookup(ctx, pkt, match_direction, tcp);
1891 	if (s == NULL)
1892 		return (NULL);
1893 
1894 	KASSERT(s->st_rule->cpuid == mycpuid,
1895 	    ("rule %p (cpu%d) does not belong to the current cpu%d",
1896 	     s->st_rule, s->st_rule->cpuid, mycpuid));
1897 
1898 	s->st_pcnt++;
1899 	s->st_bcnt += len;
1900 
1901 	return (s->st_rule);
1902 }
1903 
1904 static struct ipfw_state *
1905 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
1906     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
1907     const struct tcphdr *tcp)
1908 {
1909 	struct ipfw_state *s, *dup;
1910 
1911 	KASSERT(type == O_KEEP_STATE || type == O_LIMIT,
1912 	    ("invalid state type %u", type));
1913 
1914 	s = kmalloc(sizeof(*s), M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
1915 	if (s == NULL) {
1916 		ctx->ipfw_sts_nomem++;
1917 		return (NULL);
1918 	}
1919 
1920 	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
1921 	    id->dst_ip, id->dst_port, id->proto);
1922 
1923 	s->st_rule = rule;
1924 	s->st_type = type;
1925 
1926 	ctx->ipfw_state_cnt++;
1927 	ctx->ipfw_state_loosecnt++;
1928 	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
1929 		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
1930 		ctx->ipfw_state_loosecnt = 0;
1931 	}
1932 
1933 	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1934 	if (dup != NULL)
1935 		panic("ipfw: state exists");
1936 	TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1937 
1938 	/*
1939 	 * Update this state:
1940 	 * Set st_expire and st_state.
1941 	 */
1942 	ipfw_state_update(id, MATCH_FORWARD, tcp, s);
1943 
1944 	if (t != NULL) {
1945 		/* Keep the track referenced. */
1946 		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
1947 		s->st_track = t;
1948 	}
1949 	return (s);
1950 }
1951 
1952 static boolean_t
1953 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
1954 {
1955 	struct ipfw_trkcnt *trk;
1956 	boolean_t trk_freed = FALSE;
1957 
1958 	KASSERT(t->t_count != NULL, ("track anchor"));
1959 	KASSERT(LIST_EMPTY(&t->t_state_list),
1960 	    ("invalid track is still referenced"));
1961 
1962 	trk = t->t_trkcnt;
1963 	KASSERT(trk != NULL, ("track has no trkcnt"));
1964 
1965 	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
1966 	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
1967 	kfree(t, M_IPFW);
1968 
1969 	/*
1970 	 * fdrop() style reference counting.
1971 	 * See kern/kern_descrip.c fdrop().
1972 	 */
1973 	for (;;) {
1974 		int refs = trk->tc_refs;
1975 
1976 		cpu_ccfence();
1977 		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
1978 		if (refs == 1) {
1979 			IPFW_TRKCNT_TOKGET;
1980 			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
1981 				KASSERT(trk->tc_count == 0,
1982 				    ("%d states reference this trkcnt",
1983 				     trk->tc_count));
1984 				RB_REMOVE(ipfw_trkcnt_tree,
1985 				    &ipfw_gd.ipfw_trkcnt_tree, trk);
1986 
1987 				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
1988 				    ("invalid trkcnt cnt %d",
1989 				     ipfw_gd.ipfw_trkcnt_cnt));
1990 				ipfw_gd.ipfw_trkcnt_cnt--;
1991 				IPFW_TRKCNT_TOKREL;
1992 
1993 				if (ctx->ipfw_trkcnt_spare == NULL)
1994 					ctx->ipfw_trkcnt_spare = trk;
1995 				else
1996 					kfree(trk, M_IPFW);
1997 				trk_freed = TRUE;
1998 				break; /* done! */
1999 			}
2000 			IPFW_TRKCNT_TOKREL;
2001 			/* retry */
2002 		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2003 			break; /* done! */
2004 		}
2005 		/* retry */
2006 	}
2007 	return (trk_freed);
2008 }
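
/*
 * The reference drop above is the lockless "drop to zero" idiom:
 * ordinary decrements race freely via atomic_cmpset_int(), and only
 * the 1 -> 0 transition takes the global token, so removal of the
 * shared trkcnt from ipfw_gd is serialized.  A minimal sketch of the
 * same pattern (hypothetical obj/refs names):
 *
 *	for (;;) {
 *		int refs = obj->refs;
 *
 *		if (refs == 1) {
 *			lock();
 *			if (atomic_cmpset_int(&obj->refs, 1, 0)) {
 *				remove_and_free(obj);
 *				unlock();
 *				break;
 *			}
 *			unlock();	(lost the race; retry)
 *		} else if (atomic_cmpset_int(&obj->refs, refs, refs - 1)) {
 *			break;
 *		}
 *	}
 */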
2009 
2010 static void
2011 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2012 {
2013 	struct ipfw_track *t, *tn;
2014 
2015 	TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2016 		if (t->t_count == NULL) /* anchor */
2017 			continue;
2018 		if (rule != NULL && t->t_rule != rule)
2019 			continue;
2020 		ipfw_track_free(ctx, t);
2021 	}
2022 }
2023 
2024 static boolean_t
2025 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2026     boolean_t reap)
2027 {
2028 	struct ipfw_state *s, *sn;
2029 	boolean_t ret = FALSE;
2030 
2031 	KASSERT(t->t_count != NULL, ("track anchor"));
2032 
2033 	if (LIST_EMPTY(&t->t_state_list))
2034 		return (FALSE);
2035 
2036 	/*
2037 	 * Do not expire more than once per second; it is useless.
2038 	 */
2039 	if (t->t_lastexp == time_uptime)
2040 		return (FALSE);
2041 	t->t_lastexp = time_uptime;
2042 
2043 	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2044 		if (TIME_LEQ(s->st_expire, time_uptime) ||
2045 		    (reap && IPFW_STATE_TCPCLOSED(s))) {
2046 			KASSERT(s->st_track == t,
2047 			    ("state track %p does not match %p",
2048 			     s->st_track, t));
2049 			ipfw_state_del(ctx, s);
2050 			ret = TRUE;
2051 		}
2052 	}
2053 	return (ret);
2054 }
2055 
2056 static __inline struct ipfw_trkcnt *
2057 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2058 {
2059 	struct ipfw_trkcnt *trk;
2060 
2061 	if (ctx->ipfw_trkcnt_spare != NULL) {
2062 		trk = ctx->ipfw_trkcnt_spare;
2063 		ctx->ipfw_trkcnt_spare = NULL;
2064 	} else {
2065 		trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2066 		    M_INTWAIT | M_NULLOK);
2067 	}
2068 	return (trk);
2069 }
2070 
2071 static void
2072 ipfw_track_expire_done(struct ipfw_context *ctx)
2073 {
2074 
2075 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2076 	    ("trackexp is not in progress"));
2077 	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2078 	callout_reset(&ctx->ipfw_trackto_ch, hz,
2079 	    ipfw_track_expire_ipifunc, NULL);
2080 }
2081 
2082 static void
2083 ipfw_track_expire_more(struct ipfw_context *ctx)
2084 {
2085 	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2086 
2087 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2088 	    ("trackexp is not in progress"));
2089 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2090 	    ("trackexp more did not finish"));
2091 	netisr_sendmsg_oncpu(nm);
2092 }
2093 
2094 static int
2095 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2096     int scan_max, int expire_max)
2097 {
2098 	struct ipfw_track *t;
2099 	int scanned = 0, expired = 0;
2100 	boolean_t reap = FALSE;
2101 
2102 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2103 	    ("trackexp is not in progress"));
2104 
2105 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2106 		reap = TRUE;
2107 
2108 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2109 		if (scanned++ >= scan_max) {
2110 			ipfw_track_expire_more(ctx);
2111 			return (expired);
2112 		}
2113 
2114 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2115 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2116 
2117 		if (t->t_count == NULL) /* anchor */
2118 			continue;
2119 
2120 		ipfw_track_state_expire(ctx, t, reap);
2121 		if (!LIST_EMPTY(&t->t_state_list)) {
2122 			/* There are states referencing this track. */
2123 			continue;
2124 		}
2125 
2126 		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2127 			/* Expired. */
2128 			if (ipfw_track_free(ctx, t)) {
2129 				if (++expired >= expire_max) {
2130 					ipfw_track_expire_more(ctx);
2131 					return (expired);
2132 				}
2133 			}
2134 		}
2135 	}
2136 	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2137 	ipfw_track_expire_done(ctx);
2138 	return (expired);
2139 }
2140 
2141 static int
2142 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2143 {
2144 	struct ipfw_track *anchor;
2145 
2146 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2147 	    ("trackexp is in progress"));
2148 	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2149 
2150 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2151 		ipfw_track_expire_done(ctx);
2152 		return (0);
2153 	}
2154 
2155 	/*
2156 	 * Do not expire more than once per second; it is useless.
2157 	 */
2158 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2159 	    ctx->ipfw_track_lastexp == time_uptime) {
2160 		ipfw_track_expire_done(ctx);
2161 		return (0);
2162 	}
2163 	ctx->ipfw_track_lastexp = time_uptime;
2164 
2165 	anchor = &ctx->ipfw_trackexp_anch;
2166 	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2167 	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2168 }
2169 
2170 static void
2171 ipfw_track_expire_more_dispatch(netmsg_t nm)
2172 {
2173 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2174 	struct ipfw_track *anchor;
2175 
2176 	ASSERT_NETISR_NCPUS(mycpuid);
2177 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2178 	    ("trackexp is not in progress"));
2179 
2180 	/* Reply ASAP */
2181 	netisr_replymsg(&nm->base, 0);
2182 
2183 	anchor = &ctx->ipfw_trackexp_anch;
2184 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2185 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2186 		ipfw_track_expire_done(ctx);
2187 		return;
2188 	}
2189 	ipfw_track_expire_loop(ctx, anchor,
2190 	    ipfw_track_scan_max, ipfw_track_expire_max);
2191 }
2192 
2193 static void
2194 ipfw_track_expire_dispatch(netmsg_t nm)
2195 {
2196 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2197 
2198 	ASSERT_NETISR_NCPUS(mycpuid);
2199 
2200 	/* Reply ASAP */
2201 	crit_enter();
2202 	netisr_replymsg(&nm->base, 0);
2203 	crit_exit();
2204 
2205 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2206 		/* Running; done. */
2207 		return;
2208 	}
2209 	ipfw_track_expire_start(ctx,
2210 	    ipfw_track_scan_max, ipfw_track_expire_max);
2211 }
2212 
2213 static void
2214 ipfw_track_expire_ipifunc(void *dummy __unused)
2215 {
2216 	struct netmsg_base *msg;
2217 
2218 	KKASSERT(mycpuid < netisr_ncpus);
2219 	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2220 
2221 	crit_enter();
2222 	if (msg->lmsg.ms_flags & MSGF_DONE)
2223 		netisr_sendmsg_oncpu(msg);
2224 	crit_exit();
2225 }
2226 
2227 static int
2228 ipfw_track_reap(struct ipfw_context *ctx)
2229 {
2230 	struct ipfw_track *t, *anchor;
2231 	int expired;
2232 
2233 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2234 		/*
2235 		 * Kick-start track expiration.  Ignore the scan limit;
2236 		 * we are short of tracks.
2237 		 */
2238 		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2239 		expired = ipfw_track_expire_start(ctx, INT_MAX,
2240 		    ipfw_track_reap_max);
2241 		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2242 		return (expired);
2243 	}
2244 
2245 	/*
2246 	 * Tracks are being expired.
2247 	 */
2248 
2249 	if (RB_EMPTY(&ctx->ipfw_track_tree))
2250 		return (0);
2251 
2252 	expired = 0;
2253 	anchor = &ctx->ipfw_trackexp_anch;
2254 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2255 		/*
2256 		 * Ignore scan limit; we are short of tracks.
2257 		 */
2258 
2259 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2260 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2261 
2262 		if (t->t_count == NULL) /* anchor */
2263 			continue;
2264 
2265 		ipfw_track_state_expire(ctx, t, TRUE);
2266 		if (!LIST_EMPTY(&t->t_state_list)) {
2267 			/* There are states referencing this track. */
2268 			continue;
2269 		}
2270 
2271 		if (ipfw_track_free(ctx, t)) {
2272 			if (++expired >= ipfw_track_reap_max) {
2273 				ipfw_track_expire_more(ctx);
2274 				break;
2275 			}
2276 		}
2277 	}
2278 	/*
2279 	 * NOTE:
2280 	 * Leave the anchor on the list, even if the end of the list has
2281 	 * been reached.  ipfw_track_expire_more_dispatch() will handle
2282 	 * the removal.
2283 	 */
2284 	return (expired);
2285 }
2286 
2287 static struct ipfw_track *
2288 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2289     uint16_t limit_mask, struct ip_fw *rule)
2290 {
2291 	struct ipfw_track *key, *t, *dup;
2292 	struct ipfw_trkcnt *trk, *ret;
2293 	boolean_t do_expire = FALSE;
2294 
2295 	KASSERT(rule->track_ruleid != 0,
2296 	    ("rule %u has no track ruleid", rule->rulenum));
2297 
2298 	key = &ctx->ipfw_track_tmpkey;
2299 	key->t_proto = id->proto;
2300 	key->t_addrs = 0;
2301 	key->t_ports = 0;
2302 	key->t_rule = rule;
2303 	if (limit_mask & DYN_SRC_ADDR)
2304 		key->t_saddr = id->src_ip;
2305 	if (limit_mask & DYN_DST_ADDR)
2306 		key->t_daddr = id->dst_ip;
2307 	if (limit_mask & DYN_SRC_PORT)
2308 		key->t_sport = id->src_port;
2309 	if (limit_mask & DYN_DST_PORT)
2310 		key->t_dport = id->dst_port;
2311 
2312 	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2313 	if (t != NULL)
2314 		goto done;
2315 
2316 	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2317 	if (t == NULL) {
2318 		ctx->ipfw_tks_nomem++;
2319 		return (NULL);
2320 	}
2321 
2322 	t->t_key = key->t_key;
2323 	t->t_rule = rule;
2324 	t->t_lastexp = 0;
2325 	LIST_INIT(&t->t_state_list);
2326 
2327 	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2328 		time_t globexp, uptime;
2329 
2330 		trk = NULL;
2331 		do_expire = TRUE;
2332 
2333 		/*
2334 		 * Do not expire globally more than once per second;
2335 		 * it is useless.
2336 		 */
2337 		uptime = time_uptime;
2338 		globexp = ipfw_gd.ipfw_track_globexp;
2339 		if (globexp != uptime &&
2340 		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2341 		    globexp, uptime)) {
2342 			int cpu;
2343 
2344 			/* Expire tracks on other CPUs. */
2345 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2346 				if (cpu == mycpuid)
2347 					continue;
2348 				lwkt_send_ipiq(globaldata_find(cpu),
2349 				    ipfw_track_expire_ipifunc, NULL);
2350 			}
2351 		}
2352 	} else {
2353 		trk = ipfw_trkcnt_alloc(ctx);
2354 	}
2355 	if (trk == NULL) {
2356 		struct ipfw_trkcnt *tkey;
2357 
2358 		tkey = &ctx->ipfw_trkcnt_tmpkey;
2359 		key = NULL; /* tkey overlaps key */
2360 
2361 		tkey->tc_key = t->t_key;
2362 		tkey->tc_ruleid = rule->track_ruleid;
2363 
2364 		IPFW_TRKCNT_TOKGET;
2365 		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2366 		    tkey);
2367 		if (trk == NULL) {
2368 			IPFW_TRKCNT_TOKREL;
2369 			if (do_expire) {
2370 				ctx->ipfw_tks_reap++;
2371 				if (ipfw_track_reap(ctx) > 0) {
2372 					if (ipfw_gd.ipfw_trkcnt_cnt <
2373 					    ipfw_track_max) {
2374 						trk = ipfw_trkcnt_alloc(ctx);
2375 						if (trk != NULL)
2376 							goto install;
2377 						ctx->ipfw_tks_cntnomem++;
2378 					} else {
2379 						ctx->ipfw_tks_overflow++;
2380 					}
2381 				} else {
2382 					ctx->ipfw_tks_reapfailed++;
2383 					ctx->ipfw_tks_overflow++;
2384 				}
2385 			} else {
2386 				ctx->ipfw_tks_cntnomem++;
2387 			}
2388 			kfree(t, M_IPFW);
2389 			return (NULL);
2390 		}
2391 		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2392 		    ("invalid trkcnt refs %d", trk->tc_refs));
2393 		atomic_add_int(&trk->tc_refs, 1);
2394 		IPFW_TRKCNT_TOKREL;
2395 	} else {
2396 install:
2397 		trk->tc_key = t->t_key;
2398 		trk->tc_ruleid = rule->track_ruleid;
2399 		trk->tc_refs = 0;
2400 		trk->tc_count = 0;
2401 		trk->tc_expire = 0;
2402 		trk->tc_rulenum = rule->rulenum;
2403 
2404 		IPFW_TRKCNT_TOKGET;
2405 		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2406 		    trk);
2407 		if (ret != NULL) {
2408 			KASSERT(ret->tc_refs > 0 &&
2409 			    ret->tc_refs < netisr_ncpus,
2410 			    ("invalid trkcnt refs %d", ret->tc_refs));
2411 			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2412 			    ("trkcnt spare was installed"));
2413 			ctx->ipfw_trkcnt_spare = trk;
2414 			trk = ret;
2415 		} else {
2416 			ipfw_gd.ipfw_trkcnt_cnt++;
2417 		}
2418 		atomic_add_int(&trk->tc_refs, 1);
2419 		IPFW_TRKCNT_TOKREL;
2420 	}
2421 	t->t_count = &trk->tc_count;
2422 	t->t_trkcnt = trk;
2423 
2424 	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2425 	if (dup != NULL)
2426 		panic("ipfw: track exists");
2427 	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2428 done:
2429 	t->t_expire = time_uptime + dyn_short_lifetime;
2430 	return (t);
2431 }
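
/*
 * Track bookkeeping above is two-level: the struct ipfw_track lives
 * in the per-CPU RB-tree/list (cheap per-packet lookup), while the
 * limit counter lives in a struct ipfw_trkcnt that is shared by all
 * CPUs through the token-protected global tree:
 *
 *	t->t_count = &trk->tc_count;	(per-CPU track -> shared counter)
 *
 * so O_LIMIT is enforced system-wide even though each CPU keeps its
 * own track objects; tc_refs counts how many CPUs reference the
 * shared counter.
 */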
2432 
2433 /*
2434  * Install state for rule type cmd->o.opcode
2435  *
2436  * Returns 1 (failure) if state is not installed because of errors or because
2437  * state limits are enforced.
2438  */
2439 static int
2440 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2441     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2442 {
2443 	struct ipfw_state *s;
2444 	struct ipfw_track *t;
2445 	int count, diff;
2446 
2447 	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2448 	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2449 		boolean_t overflow = TRUE;
2450 
2451 		ctx->ipfw_sts_reap++;
2452 		if (ipfw_state_reap(ctx, diff) == 0)
2453 			ctx->ipfw_sts_reapfailed++;
2454 		if (ipfw_state_cntsync() < ipfw_state_max)
2455 			overflow = FALSE;
2456 
2457 		if (overflow) {
2458 			time_t globexp, uptime;
2459 			int cpu;
2460 
2461 			/*
2462 			 * Do not expire globally more than once per second;
2463 			 * it is useless.
2464 			 */
2465 			uptime = time_uptime;
2466 			globexp = ipfw_gd.ipfw_state_globexp;
2467 			if (globexp == uptime ||
2468 			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2469 			    globexp, uptime)) {
2470 				ctx->ipfw_sts_overflow++;
2471 				return (1);
2472 			}
2473 
2474 			/* Expire states on other CPUs. */
2475 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2476 				if (cpu == mycpuid)
2477 					continue;
2478 				lwkt_send_ipiq(globaldata_find(cpu),
2479 				    ipfw_state_expire_ipifunc, NULL);
2480 			}
2481 			ctx->ipfw_sts_overflow++;
2482 			return (1);
2483 		}
2484 	}
2485 
2486 	switch (cmd->o.opcode) {
2487 	case O_KEEP_STATE: /* bidir rule */
2488 		s = ipfw_state_add(ctx, &args->f_id, O_KEEP_STATE, rule, NULL,
2489 		    tcp);
2490 		if (s == NULL)
2491 			return (1);
2492 		break;
2493 
2494 	case O_LIMIT: /* limit number of sessions */
2495 		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2496 		if (t == NULL)
2497 			return (1);
2498 
2499 		if (*t->t_count >= cmd->conn_limit) {
2500 			if (!ipfw_track_state_expire(ctx, t, TRUE))
2501 				return (1);
2502 		}
2503 		for (;;) {
2504 			count = *t->t_count;
2505 			if (count >= cmd->conn_limit)
2506 				return (1);
2507 			if (atomic_cmpset_int(t->t_count, count, count + 1))
2508 				break;
2509 		}
2510 
2511 		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2512 		if (s == NULL) {
2513 			/* Undo damage. */
2514 			atomic_subtract_int(t->t_count, 1);
2515 			return (1);
2516 		}
2517 		break;
2518 
2519 	default:
2520 		panic("unknown state type %u\n", cmd->o.opcode);
2521 	}
2522 	return (0);
2523 }
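
/*
 * The O_LIMIT path above bounds the shared counter with a
 * compare-and-swap loop, so concurrent CPUs can never push it past
 * conn_limit.  The shape of the idiom (hypothetical counter/limit
 * names):
 *
 *	for (;;) {
 *		count = *counter;
 *		if (count >= limit)
 *			return (1);	(over the limit; fail)
 *		if (atomic_cmpset_int(counter, count, count + 1))
 *			break;		(won the race)
 *		(another CPU changed the counter; reload and retry)
 *	}
 */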
2524 
2525 static int
2526 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2527     const struct in_addr *in)
2528 {
2529 	struct radix_node_head *rnh;
2530 	struct sockaddr_in sin;
2531 	struct ipfw_tblent *te;
2532 
2533 	KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2534 	rnh = ctx->ipfw_tables[tableid];
2535 	if (rnh == NULL)
2536 		return (0); /* no match */
2537 
2538 	memset(&sin, 0, sizeof(sin));
2539 	sin.sin_family = AF_INET;
2540 	sin.sin_len = sizeof(sin);
2541 	sin.sin_addr = *in;
2542 
2543 	te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2544 	if (te == NULL)
2545 		return (0); /* no match */
2546 
2547 	te->te_use++;
2548 	te->te_lastuse = time_second;
2549 	return (1); /* match */
2550 }
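
/*
 * Lookup tables reuse the routing radix code, so an entry such as
 * 192.168.0.0/16 matches by longest prefix.  The key handed to
 * rnh_matchaddr() is just a sockaddr_in wrapping the address under
 * test, as built above:
 *
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_addr = *in;
 */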
2551 
2552 /*
2553  * Transmit a TCP packet, containing either a RST or a keepalive.
2554  * When flags & TH_RST, we are sending a RST packet because a
2555  * "reset" action matched the packet.  Otherwise we are sending a
2556  * keepalive; flags & TH_SYN selects the direction (forward if set).
2557  *
2558  * Only {src,dst}_{ip,port} of "id" are used.
2559  */
2560 static void
2561 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2562 {
2563 	struct mbuf *m;
2564 	struct ip *ip;
2565 	struct tcphdr *tcp;
2566 	struct route sro;	/* fake route */
2567 
2568 	MGETHDR(m, M_NOWAIT, MT_HEADER);
2569 	if (m == NULL)
2570 		return;
2571 	m->m_pkthdr.rcvif = NULL;
2572 	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2573 	m->m_data += max_linkhdr;
2574 
2575 	ip = mtod(m, struct ip *);
2576 	bzero(ip, m->m_len);
2577 	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2578 	ip->ip_p = IPPROTO_TCP;
2579 	tcp->th_off = 5;
2580 
2581 	/*
2582 	 * Assume we are sending a RST (or a keepalive in the reverse
2583 	 * direction), swap src and destination addresses and ports.
2584 	 */
2585 	ip->ip_src.s_addr = htonl(id->dst_ip);
2586 	ip->ip_dst.s_addr = htonl(id->src_ip);
2587 	tcp->th_sport = htons(id->dst_port);
2588 	tcp->th_dport = htons(id->src_port);
2589 	if (flags & TH_RST) {	/* we are sending a RST */
2590 		if (flags & TH_ACK) {
2591 			tcp->th_seq = htonl(ack);
2592 			tcp->th_ack = htonl(0);
2593 			tcp->th_flags = TH_RST;
2594 		} else {
2595 			if (flags & TH_SYN)
2596 				seq++;
2597 			tcp->th_seq = htonl(0);
2598 			tcp->th_ack = htonl(seq);
2599 			tcp->th_flags = TH_RST | TH_ACK;
2600 		}
2601 	} else {
2602 		/*
2603 		 * We are sending a keepalive. flags & TH_SYN determines
2604 		 * the direction, forward if set, reverse if clear.
2605 		 * NOTE: seq and ack are always assumed to be correct
2606 		 * as set by the caller. This may be confusing...
2607 		 */
2608 		if (flags & TH_SYN) {
2609 			/*
2610 			 * we have to rewrite the correct addresses!
2611 			 */
2612 			ip->ip_dst.s_addr = htonl(id->dst_ip);
2613 			ip->ip_src.s_addr = htonl(id->src_ip);
2614 			tcp->th_dport = htons(id->dst_port);
2615 			tcp->th_sport = htons(id->src_port);
2616 		}
2617 		tcp->th_seq = htonl(seq);
2618 		tcp->th_ack = htonl(ack);
2619 		tcp->th_flags = TH_ACK;
2620 	}
2621 
2622 	/*
2623 	 * set ip_len to the payload size so we can compute
2624 	 * the tcp checksum on the pseudoheader
2625 	 * XXX check this, could save a couple of words ?
2626 	 */
2627 	ip->ip_len = htons(sizeof(struct tcphdr));
2628 	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2629 
2630 	/*
2631 	 * now fill fields left out earlier
2632 	 */
2633 	ip->ip_ttl = ip_defttl;
2634 	ip->ip_len = m->m_pkthdr.len;
2635 
2636 	bzero(&sro, sizeof(sro));
2637 	ip_rtaddr(ip->ip_dst, &sro);
2638 
2639 	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2640 	ip_output(m, NULL, &sro, 0, NULL, NULL);
2641 	if (sro.ro_rt)
2642 		RTFREE(sro.ro_rt);
2643 }
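
/*
 * The RST sequence/ack selection above follows RFC 793's reset
 * generation rules: if the offending segment carried an ACK, reply
 * with seq = its ack value and only TH_RST set; otherwise reply with
 * seq = 0, ack = its seq (+1 if it was a SYN, which occupies one
 * octet of sequence space) and TH_RST|TH_ACK.  For example, a bare
 * SYN with seq 1000 is answered by seq 0, ack 1001, RST|ACK.
 */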
2644 
2645 /*
2646  * Send a reject message, consuming the mbuf passed as an argument.
2647  */
2648 static void
2649 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2650 {
2651 	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2652 		/* We need the IP header in host order for icmp_error(). */
2653 		if (args->eh != NULL) {
2654 			struct ip *ip = mtod(args->m, struct ip *);
2655 
2656 			ip->ip_len = ntohs(ip->ip_len);
2657 			ip->ip_off = ntohs(ip->ip_off);
2658 		}
2659 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2660 	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2661 		struct tcphdr *const tcp =
2662 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2663 
2664 		if ((tcp->th_flags & TH_RST) == 0) {
2665 			send_pkt(&args->f_id, ntohl(tcp->th_seq),
2666 				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2667 		}
2668 		m_freem(args->m);
2669 	} else {
2670 		m_freem(args->m);
2671 	}
2672 	args->m = NULL;
2673 }
2674 
2675 /*
2676  * Given an ip_fw *, lookup_next_rule will return a pointer
2677  * to the next rule, which can be either the jump
2678  * target (for skipto instructions) or the next one in the list (in
2679  * all other cases including a missing jump target).
2680  * The result is also written in the "next_rule" field of the rule.
2681  * Backward jumps are not allowed, so start looking from the next
2682  * rule...
2683  *
2684  * This never returns NULL -- in case we do not have an exact match,
2685  * the next rule is returned. When the ruleset is changed,
2686  * pointers are flushed so we are always correct.
2687  */
2688 static struct ip_fw *
2689 lookup_next_rule(struct ip_fw *me)
2690 {
2691 	struct ip_fw *rule = NULL;
2692 	ipfw_insn *cmd;
2693 
2694 	/* look for action, in case it is a skipto */
2695 	cmd = ACTION_PTR(me);
2696 	if (cmd->opcode == O_LOG)
2697 		cmd += F_LEN(cmd);
2698 	if (cmd->opcode == O_SKIPTO) {
2699 		for (rule = me->next; rule; rule = rule->next) {
2700 			if (rule->rulenum >= cmd->arg1)
2701 				break;
2702 		}
2703 	}
2704 	if (rule == NULL)			/* failure or not a skipto */
2705 		rule = me->next;
2706 	me->next_rule = rule;
2707 	return rule;
2708 }
2709 
2710 static int
2711 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
2712 		enum ipfw_opcodes opcode, uid_t uid)
2713 {
2714 	struct in_addr src_ip, dst_ip;
2715 	struct inpcbinfo *pi;
2716 	boolean_t wildcard;
2717 	struct inpcb *pcb;
2718 
2719 	if (fid->proto == IPPROTO_TCP) {
2720 		wildcard = FALSE;
2721 		pi = &tcbinfo[mycpuid];
2722 	} else if (fid->proto == IPPROTO_UDP) {
2723 		wildcard = TRUE;
2724 		pi = &udbinfo[mycpuid];
2725 	} else {
2726 		return 0;
2727 	}
2728 
2729 	/*
2730 	 * Values in 'fid' are in host byte order
2731 	 */
2732 	dst_ip.s_addr = htonl(fid->dst_ip);
2733 	src_ip.s_addr = htonl(fid->src_ip);
2734 	if (oif) {
2735 		pcb = in_pcblookup_hash(pi,
2736 			dst_ip, htons(fid->dst_port),
2737 			src_ip, htons(fid->src_port),
2738 			wildcard, oif);
2739 	} else {
2740 		pcb = in_pcblookup_hash(pi,
2741 			src_ip, htons(fid->src_port),
2742 			dst_ip, htons(fid->dst_port),
2743 			wildcard, NULL);
2744 	}
2745 	if (pcb == NULL || pcb->inp_socket == NULL)
2746 		return 0;
2747 
2748 	if (opcode == O_UID) {
2749 #define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
2750 		return !socheckuid(pcb->inp_socket, uid);
2751 #undef socheckuid
2752 	} else  {
2753 		return groupmember(uid, pcb->inp_socket->so_cred);
2754 	}
2755 }
2756 
2757 static __inline int
2758 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
2759 {
2760 
2761 	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
2762 		struct ifaddr_container *ifac;
2763 		struct ifnet *ifp;
2764 
2765 		ifp = ifunit_netisr(cmd->ifname);
2766 		if (ifp == NULL)
2767 			return (0);
2768 
2769 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2770 			struct ifaddr *ia = ifac->ifa;
2771 
2772 			if (ia->ifa_addr == NULL)
2773 				continue;
2774 			if (ia->ifa_addr->sa_family != AF_INET)
2775 				continue;
2776 
2777 			cmd->mask.s_addr = INADDR_ANY;
2778 			if (cmd->o.arg1 & IPFW_IFIP_NET) {
2779 				cmd->mask = ((struct sockaddr_in *)
2780 				    ia->ifa_netmask)->sin_addr;
2781 			}
2782 			if (cmd->mask.s_addr == INADDR_ANY)
2783 				cmd->mask.s_addr = INADDR_BROADCAST;
2784 
2785 			cmd->addr =
2786 			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
2787 			cmd->addr.s_addr &= cmd->mask.s_addr;
2788 
2789 			cmd->o.arg1 |= IPFW_IFIP_VALID;
2790 			break;
2791 		}
2792 		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
2793 			return (0);
2794 	}
2795 	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
2796 }
2797 
2798 static __inline struct mbuf *
2799 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
2800     struct ip_fw_local *local, struct ip **ip0)
2801 {
2802 	struct ip *ip = mtod(m, struct ip *);
2803 	struct tcphdr *tcp;
2804 	struct udphdr *udp;
2805 
2806 	/*
2807 	 * Collect parameters into local variables for faster matching.
2808 	 */
2809 	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
2810 		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
2811 		goto done;
2812 	}
2813 
2814 	local->proto = args->f_id.proto = ip->ip_p;
2815 	local->src_ip = ip->ip_src;
2816 	local->dst_ip = ip->ip_dst;
2817 	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
2818 		local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
2819 		local->ip_len = ntohs(ip->ip_len);
2820 	} else {
2821 		local->offset = ip->ip_off & IP_OFFMASK;
2822 		local->ip_len = ip->ip_len;
2823 	}
2824 
2825 #define PULLUP_TO(len)					\
2826 do {							\
2827 	if (m->m_len < (len)) {				\
2828 		args->m = m = m_pullup(m, (len));	\
2829 		if (m == NULL) {			\
2830 			ip = NULL;			\
2831 			goto done;			\
2832 		}					\
2833 		ip = mtod(m, struct ip *);		\
2834 	}						\
2835 } while (0)
2836 
2837 	if (local->offset == 0) {
2838 		switch (local->proto) {
2839 		case IPPROTO_TCP:
2840 			PULLUP_TO(hlen + sizeof(struct tcphdr));
2841 			tcp = L3HDR(struct tcphdr, ip);
2842 			local->dst_port = tcp->th_dport;
2843 			local->src_port = tcp->th_sport;
2844 			args->f_id.flags = tcp->th_flags;
2845 			break;
2846 
2847 		case IPPROTO_UDP:
2848 			PULLUP_TO(hlen + sizeof(struct udphdr));
2849 			udp = L3HDR(struct udphdr, ip);
2850 			local->dst_port = udp->uh_dport;
2851 			local->src_port = udp->uh_sport;
2852 			break;
2853 
2854 		case IPPROTO_ICMP:
2855 			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
2856 			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
2857 			break;
2858 
2859 		default:
2860 			break;
2861 		}
2862 	}
2863 
2864 #undef PULLUP_TO
2865 
2866 	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
2867 	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
2868 	args->f_id.src_port = local->src_port = ntohs(local->src_port);
2869 	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
2870 done:
2871 	*ip0 = ip;
2872 	return (m);
2873 }
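
/*
 * Byte-order caveat captured above: for layer-2 invocations
 * (args->eh != NULL) ip_len/ip_off are still in network order as on
 * the wire, while ip_input() has already converted them to host
 * order for layer-3 calls.  That is why the two branches differ here
 * and why O_IPFRAG in ipfw_chk() re-checks args->eh before deciding
 * whether to ntohs(ip->ip_off).
 */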
2874 
2875 /*
2876  * The main check routine for the firewall.
2877  *
2878  * All arguments are in args so we can modify them and return them
2879  * to the caller.
2880  *
2881  * Parameters:
2882  *
2883  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
2884  *		Starts with the IP header.
2885  *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
2886  *	args->oif	Outgoing interface, or NULL if packet is incoming.
2887  *		The incoming interface is in the mbuf. (in)
2888  *
2889  *	args->rule	Pointer to the last matching rule (in/out)
2890  *	args->f_id	Addresses grabbed from the packet (out)
2891  *
2892  * Return value:
2893  *
2894  *	If the packet was denied/rejected and has been dropped, *m is equal
2895  *	to NULL upon return.
2896  *
2897  *	IP_FW_DENY	the packet must be dropped.
2898  *	IP_FW_PASS	The packet is to be accepted and routed normally.
2899  *	IP_FW_DIVERT	Divert the packet to port (args->cookie)
2900  *	IP_FW_TEE	Tee the packet to port (args->cookie)
2901  *	IP_FW_DUMMYNET	Send the packet to pipe/queue (args->cookie)
2902  *	IP_FW_CONTINUE	Continue processing on another cpu.
2903  */
2904 static int
2905 ipfw_chk(struct ip_fw_args *args)
2906 {
2907 	/*
2908 	 * Local variables hold state during the processing of a packet.
2909 	 *
2910 	 * IMPORTANT NOTE: to speed up the processing of rules, there
2911 	 * are some assumptions about the values of the variables, which
2912 	 * are documented here. Should you change them, please check
2913 	 * the implementation of the various instructions to make sure
2914 	 * that they still work.
2915 	 *
2916 	 * args->eh	The MAC header.  It is non-NULL for a layer-2
2917 	 *	packet and NULL for a layer-3 packet.
2918 	 *
2919 	 * m | args->m	Pointer to the mbuf, as received from the caller.
2920 	 *	It may change if ipfw_chk() does an m_pullup, or if it
2921 	 *	consumes the packet because it calls send_reject().
2922 	 *	XXX This has to change, so that ipfw_chk() never modifies
2923 	 *	or consumes the buffer.
2924 	 * ip	is simply an alias of the value of m, and it is kept
2925 	 *	in sync with it (the packet is supposed to start with
2926 	 *	the ip header).
2927 	 */
2928 	struct mbuf *m = args->m;
2929 	struct ip *ip = mtod(m, struct ip *);
2930 
2931 	/*
2932 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
2933 	 *	inbound path (ether_input, ip_input).
2934 	 *	If non-NULL, ipfw_chk has been called on the outbound path
2935 	 *	(ether_output, ip_output).
2936 	 */
2937 	struct ifnet *oif = args->oif;
2938 
2939 	struct ip_fw *f = NULL;		/* matching rule */
2940 	int retval = IP_FW_PASS;
2941 	struct m_tag *mtag;
2942 	struct divert_info *divinfo;
2943 
2944 	/*
2945 	 * hlen	The length of the IPv4 header.
2946 	 *	hlen >0 means we have an IPv4 packet.
2947 	 */
2948 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
2949 
2950 	struct ip_fw_local lc;
2951 
2952 	/*
2953 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
2954 	 * 	MATCH_NONE when checked and not matched (dyn_f = NULL),
2955 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
2956 	 */
2957 	int dyn_dir = MATCH_UNKNOWN;
2958 	struct ip_fw *dyn_f = NULL;
2959 	int cpuid = mycpuid;
2960 	struct ipfw_context *ctx;
2961 
2962 	ASSERT_NETISR_NCPUS(cpuid);
2963 	ctx = ipfw_ctx[cpuid];
2964 
2965 	if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
2966 		return IP_FW_PASS;	/* accept */
2967 
2968 	if (args->eh == NULL ||		/* layer 3 packet */
2969 	    (m->m_pkthdr.len >= sizeof(struct ip) &&
2970 	     ntohs(args->eh->ether_type) == ETHERTYPE_IP))
2971 		hlen = ip->ip_hl << 2;
2972 
2973 	memset(&lc, 0, sizeof(lc));
2974 
2975 	m = ipfw_setup_local(m, hlen, args, &lc, &ip);
2976 	if (m == NULL)
2977 		goto pullup_failed;
2978 
2979 	if (args->rule) {
2980 		/*
2981 		 * Packet has already been tagged. Look for the next rule
2982 		 * to restart processing.
2983 		 *
2984 		 * If fw_one_pass != 0 then just accept it.
2985 		 * XXX should not happen here, but optimized out in
2986 		 * the caller.
2987 		 */
2988 		if (fw_one_pass && !args->cont)
2989 			return IP_FW_PASS;
2990 		args->cont = 0;
2991 
2992 		/* This rule is being/has been flushed */
2993 		if (ipfw_flushing)
2994 			return IP_FW_DENY;
2995 
2996 		KASSERT(args->rule->cpuid == cpuid,
2997 			("rule used on cpu%d", cpuid));
2998 
2999 		/* This rule was deleted */
3000 		if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3001 			return IP_FW_DENY;
3002 
3003 		f = args->rule->next_rule;
3004 		if (f == NULL)
3005 			f = lookup_next_rule(args->rule);
3006 	} else {
3007 		/*
3008 		 * Find the starting rule. It can be either the first
3009 		 * one, or the one after divert_rule if so requested.
3010 		 */
3011 		int skipto;
3012 
3013 		KKASSERT(!args->cont);
3014 
3015 		mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3016 		if (mtag != NULL) {
3017 			divinfo = m_tag_data(mtag);
3018 			skipto = divinfo->skipto;
3019 		} else {
3020 			skipto = 0;
3021 		}
3022 
3023 		f = ctx->ipfw_layer3_chain;
3024 		if (args->eh == NULL && skipto != 0) {
3025 			/* No skipto during rule flushing */
3026 			if (ipfw_flushing)
3027 				return IP_FW_DENY;
3028 
3029 			if (skipto >= IPFW_DEFAULT_RULE)
3030 				return IP_FW_DENY; /* invalid */
3031 
3032 			while (f && f->rulenum <= skipto)
3033 				f = f->next;
3034 			if (f == NULL)	/* drop packet */
3035 				return IP_FW_DENY;
3036 		} else if (ipfw_flushing) {
3037 			/* Rules are being flushed; skip to default rule */
3038 			f = ctx->ipfw_default_rule;
3039 		}
3040 	}
3041 	if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3042 		m_tag_delete(m, mtag);
3043 
3044 	/*
3045 	 * Now scan the rules, and parse microinstructions for each rule.
3046 	 */
3047 	for (; f; f = f->next) {
3048 		int l, cmdlen;
3049 		ipfw_insn *cmd;
3050 		int skip_or; /* skip rest of OR block */
3051 
3052 again:
3053 		if (ctx->ipfw_set_disable & (1 << f->set))
3054 			continue;
3055 
3056 		skip_or = 0;
3057 		for (l = f->cmd_len, cmd = f->cmd; l > 0;
3058 		     l -= cmdlen, cmd += cmdlen) {
3059 			int match;
3060 
3061 			/*
3062 			 * check_body is a jump target used when we find a
3063 			 * CHECK_STATE, and need to jump to the body of
3064 			 * the target rule.
3065 			 */
3066 
3067 check_body:
3068 			cmdlen = F_LEN(cmd);
3069 			/*
3070 			 * An OR block (insn_1 || .. || insn_n) has the
3071 			 * F_OR bit set in all but the last instruction.
3072 			 * The first match will set "skip_or", and cause
3073 			 * the following instructions to be skipped until
3074 			 * past the one with the F_OR bit clear.
3075 			 */
3076 			if (skip_or) {		/* skip this instruction */
3077 				if ((cmd->len & F_OR) == 0)
3078 					skip_or = 0;	/* next one is good */
3079 				continue;
3080 			}
3081 			match = 0; /* set to 1 if we succeed */
3082 
3083 			switch (cmd->opcode) {
3084 			/*
3085 			 * The first set of opcodes compares the packet's
3086 			 * fields with some pattern, setting 'match' if a
3087 			 * match is found. At the end of the loop there is
3088 			 * logic to deal with F_NOT and F_OR flags associated
3089 			 * with the opcode.
3090 			 */
3091 			case O_NOP:
3092 				match = 1;
3093 				break;
3094 
3095 			case O_FORWARD_MAC:
3096 				kprintf("ipfw: opcode %d unimplemented\n",
3097 					cmd->opcode);
3098 				break;
3099 
3100 			case O_GID:
3101 			case O_UID:
3102 				/*
3103 				 * We only check offset == 0 && proto != 0,
3104 				 * as this ensures that we have an IPv4
3105 				 * packet with the ports info.
3106 				 */
3107 				if (lc.offset != 0)
3108 					break;
3109 
3110 				match = ipfw_match_uid(&args->f_id, oif,
3111 					cmd->opcode,
3112 					(uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3113 				break;
3114 
3115 			case O_RECV:
3116 				match = iface_match(m->m_pkthdr.rcvif,
3117 				    (ipfw_insn_if *)cmd);
3118 				break;
3119 
3120 			case O_XMIT:
3121 				match = iface_match(oif, (ipfw_insn_if *)cmd);
3122 				break;
3123 
3124 			case O_VIA:
3125 				match = iface_match(oif ? oif :
3126 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3127 				break;
3128 
3129 			case O_MACADDR2:
3130 				if (args->eh != NULL) {	/* have MAC header */
3131 					uint32_t *want = (uint32_t *)
3132 						((ipfw_insn_mac *)cmd)->addr;
3133 					uint32_t *mask = (uint32_t *)
3134 						((ipfw_insn_mac *)cmd)->mask;
3135 					uint32_t *hdr = (uint32_t *)args->eh;
3136 
3137 					match =
3138 					(want[0] == (hdr[0] & mask[0]) &&
3139 					 want[1] == (hdr[1] & mask[1]) &&
3140 					 want[2] == (hdr[2] & mask[2]));
3141 				}
3142 				break;
3143 
3144 			case O_MAC_TYPE:
3145 				if (args->eh != NULL) {
3146 					uint16_t t =
3147 					    ntohs(args->eh->ether_type);
3148 					uint16_t *p =
3149 					    ((ipfw_insn_u16 *)cmd)->ports;
3150 					int i;
3151 
3152 					/* Special vlan handling */
3153 					if (m->m_flags & M_VLANTAG)
3154 						t = ETHERTYPE_VLAN;
3155 
3156 					for (i = cmdlen - 1; !match && i > 0;
3157 					     i--, p += 2) {
3158 						match =
3159 						(t >= p[0] && t <= p[1]);
3160 					}
3161 				}
3162 				break;
3163 
3164 			case O_FRAG:
3165 				match = (hlen > 0 && lc.offset != 0);
3166 				break;
3167 
3168 			case O_IPFRAG:
3169 				if (hlen > 0) {
3170 					uint16_t off;
3171 
3172 					if (args->eh != NULL)
3173 						off = ntohs(ip->ip_off);
3174 					else
3175 						off = ip->ip_off;
3176 					if (off & (IP_MF | IP_OFFMASK))
3177 						match = 1;
3178 				}
3179 				break;
3180 
3181 			case O_IN:	/* "out" is "not in" */
3182 				match = (oif == NULL);
3183 				break;
3184 
3185 			case O_LAYER2:
3186 				match = (args->eh != NULL);
3187 				break;
3188 
3189 			case O_PROTO:
3190 				/*
3191 				 * We do not allow an arg of 0, so
3192 				 * checking "proto" alone suffices.
3193 				 */
3194 				match = (lc.proto == cmd->arg1);
3195 				break;
3196 
3197 			case O_IP_SRC:
3198 				match = (hlen > 0 &&
3199 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3200 				    lc.src_ip.s_addr);
3201 				break;
3202 
3203 			case O_IP_SRC_MASK:
3204 				match = (hlen > 0 &&
3205 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3206 				     (lc.src_ip.s_addr &
3207 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3208 				break;
3209 
3210 			case O_IP_SRC_ME:
3211 				if (hlen > 0) {
3212 					struct ifnet *tif;
3213 
3214 					tif = INADDR_TO_IFP(&lc.src_ip);
3215 					match = (tif != NULL);
3216 				}
3217 				break;
3218 
3219 			case O_IP_SRC_TABLE:
3220 				match = ipfw_table_lookup(ctx, cmd->arg1,
3221 				    &lc.src_ip);
3222 				break;
3223 
3224 			case O_IP_SRC_IFIP:
3225 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3226 				    &lc.src_ip);
3227 				break;
3228 
3229 			case O_IP_DST_SET:
3230 			case O_IP_SRC_SET:
3231 				if (hlen > 0) {
3232 					uint32_t *d = (uint32_t *)(cmd + 1);
3233 					uint32_t addr =
3234 					    cmd->opcode == O_IP_DST_SET ?
3235 						args->f_id.dst_ip :
3236 						args->f_id.src_ip;
3237 
3238 					if (addr < d[0])
3239 						break;
3240 					addr -= d[0]; /* subtract base */
3241 					match =
3242 					(addr < cmd->arg1) &&
3243 					 (d[1 + (addr >> 5)] &
3244 					  (1 << (addr & 0x1f)));
3245 				}
3246 				break;
3247 
3248 			case O_IP_DST:
3249 				match = (hlen > 0 &&
3250 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3251 				    lc.dst_ip.s_addr);
3252 				break;
3253 
3254 			case O_IP_DST_MASK:
3255 				match = (hlen > 0) &&
3256 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3257 				     (lc.dst_ip.s_addr &
3258 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3259 				break;
3260 
3261 			case O_IP_DST_ME:
3262 				if (hlen > 0) {
3263 					struct ifnet *tif;
3264 
3265 					tif = INADDR_TO_IFP(&lc.dst_ip);
3266 					match = (tif != NULL);
3267 				}
3268 				break;
3269 
3270 			case O_IP_DST_TABLE:
3271 				match = ipfw_table_lookup(ctx, cmd->arg1,
3272 				    &lc.dst_ip);
3273 				break;
3274 
3275 			case O_IP_DST_IFIP:
3276 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3277 				    &lc.dst_ip);
3278 				break;
3279 
3280 			case O_IP_SRCPORT:
3281 			case O_IP_DSTPORT:
3282 				/*
3283 				 * offset == 0 && proto != 0 is enough
3284 				 * to guarantee that we have an IPv4
3285 				 * packet with port info.
3286 				 */
3287 				if ((lc.proto == IPPROTO_UDP ||
3288 				     lc.proto == IPPROTO_TCP) &&
3289 				    lc.offset == 0) {
3290 					uint16_t x =
3291 					    (cmd->opcode == O_IP_SRCPORT) ?
3292 						lc.src_port : lc.dst_port;
3293 					uint16_t *p =
3294 					    ((ipfw_insn_u16 *)cmd)->ports;
3295 					int i;
3296 
3297 					for (i = cmdlen - 1; !match && i > 0;
3298 					     i--, p += 2) {
3299 						match =
3300 						(x >= p[0] && x <= p[1]);
3301 					}
3302 				}
3303 				break;
3304 
3305 			case O_ICMPTYPE:
3306 				match = (lc.offset == 0 &&
3307 				    lc.proto == IPPROTO_ICMP &&
3308 				    icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3309 				break;
3310 
3311 			case O_IPOPT:
3312 				match = (hlen > 0 && ipopts_match(ip, cmd));
3313 				break;
3314 
3315 			case O_IPVER:
3316 				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3317 				break;
3318 
3319 			case O_IPTTL:
3320 				match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3321 				break;
3322 
3323 			case O_IPID:
3324 				match = (hlen > 0 &&
3325 				    cmd->arg1 == ntohs(ip->ip_id));
3326 				break;
3327 
3328 			case O_IPLEN:
3329 				match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3330 				break;
3331 
3332 			case O_IPPRECEDENCE:
3333 				match = (hlen > 0 &&
3334 				    (cmd->arg1 == (ip->ip_tos & 0xe0)));
3335 				break;
3336 
3337 			case O_IPTOS:
3338 				match = (hlen > 0 &&
3339 				    flags_match(cmd, ip->ip_tos));
3340 				break;
3341 
3342 			case O_TCPFLAGS:
3343 				match = (lc.proto == IPPROTO_TCP &&
3344 				    lc.offset == 0 &&
3345 				    flags_match(cmd,
3346 					L3HDR(struct tcphdr,ip)->th_flags));
3347 				break;
3348 
3349 			case O_TCPOPTS:
3350 				match = (lc.proto == IPPROTO_TCP &&
3351 				    lc.offset == 0 && tcpopts_match(ip, cmd));
3352 				break;
3353 
3354 			case O_TCPSEQ:
3355 				match = (lc.proto == IPPROTO_TCP &&
3356 				    lc.offset == 0 &&
3357 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3358 					L3HDR(struct tcphdr,ip)->th_seq);
3359 				break;
3360 
3361 			case O_TCPACK:
3362 				match = (lc.proto == IPPROTO_TCP &&
3363 				    lc.offset == 0 &&
3364 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3365 					L3HDR(struct tcphdr,ip)->th_ack);
3366 				break;
3367 
3368 			case O_TCPWIN:
3369 				match = (lc.proto == IPPROTO_TCP &&
3370 				    lc.offset == 0 &&
3371 				    cmd->arg1 ==
3372 					L3HDR(struct tcphdr,ip)->th_win);
3373 				break;
3374 
3375 			case O_ESTAB:
3376 				/* reject packets which have SYN only */
3377 				/* XXX should i also check for TH_ACK ? */
3378 				match = (lc.proto == IPPROTO_TCP &&
3379 				    lc.offset == 0 &&
3380 				    (L3HDR(struct tcphdr,ip)->th_flags &
3381 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3382 				break;
3383 
3384 			case O_LOG:
3385 				if (fw_verbose) {
3386 					ipfw_log(ctx, f, hlen, args->eh, m,
3387 					    oif);
3388 				}
3389 				match = 1;
3390 				break;
3391 
3392 			case O_PROB:
3393 				match = (krandom() <
3394 					((ipfw_insn_u32 *)cmd)->d[0]);
3395 				break;
3396 
3397 			/*
3398 			 * The second set of opcodes represents 'actions',
3399 			 * i.e. the terminal part of a rule once the packet
3400 			 * matches all previous patterns.
3401 			 * Typically there is only one action for each rule,
3402 			 * and the opcode is stored at the end of the rule
3403 			 * (but there are exceptions -- see below).
3404 			 *
3405 			 * In general, here we set retval and terminate the
3406 			 * outer loop (would be a 'break 3' in some language,
3407 			 * but we need to do a 'goto done').
3408 			 *
3409 			 * Exceptions:
3410 			 * O_COUNT and O_SKIPTO actions:
3411 			 *   instead of terminating, we jump to the next rule
3412 			 *   ('goto next_rule', equivalent to a 'break 2'),
3413 			 *   or to the SKIPTO target ('goto again' after
3414 			 *   having set f, cmd and l), respectively.
3415 			 *
3416 			 * O_LIMIT and O_KEEP_STATE: these opcodes are
3417 			 *   not real 'actions', and are stored right
3418 			 *   before the 'action' part of the rule.
3419 			 *   These opcodes try to install an entry in the
3420 			 *   state tables; if successful, we continue with
3421 			 *   the next opcode (match=1; break;), otherwise
3422 			 *   the packet must be dropped ('goto done' after
3423 			 *   setting retval).  If static rules are changed
3424 			 *   during the state installation, the packet will
3425 			 *   be dropped and rule's stats will not beupdated
3426 			 *   be dropped and the rule's stats will not be updated
3427 			 *
3428 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3429 			 *   cause a lookup of the state table, and a jump
3430 			 *   to the 'action' part of the parent rule
3431 			 *   ('goto check_body') if an entry is found, or
3432 			 *   (CHECK_STATE only) a jump to the next rule if
3433 			 *   the entry is not found ('goto next_rule').
3434 			 *   The result of the lookup is cached so that
3435 			 *   further instances of these opcodes become
3436 			 *   effectively NOPs.  If static rules are changed
3437 			 *   during the state lookup, the packet will
3438 			 *   be dropped and rule's stats will not be updated
3439 			 *   ('return IP_FW_DENY').
3440 			 */
3441 			case O_LIMIT:
3442 			case O_KEEP_STATE:
3443 				if (ipfw_state_install(ctx, f,
3444 				    (ipfw_insn_limit *)cmd, args,
3445 				    (lc.offset == 0 &&
3446 				     lc.proto == IPPROTO_TCP) ?
3447 				    L3HDR(struct tcphdr, ip) : NULL)) {
3448 					retval = IP_FW_DENY;
3449 					goto done; /* error/limit violation */
3450 				}
3451 				match = 1;
3452 				break;
3453 
3454 			case O_PROBE_STATE:
3455 			case O_CHECK_STATE:
3456 				/*
3457 				 * States are checked at the first keep-state/
3458 				 * check-state occurrence, with the result
3459 				 * being stored in dyn_dir.  The compiler
3460 				 * introduces a PROBE_STATE instruction for
3461 				 * us when we have a KEEP_STATE/LIMIT (because
3462 				 * PROBE_STATE needs to be run first).
3463 				 */
3464 				if (dyn_dir == MATCH_UNKNOWN) {
3465 					dyn_f = ipfw_state_lookup_rule(ctx,
3466 					    &args->f_id, &dyn_dir,
3467 					    (lc.offset == 0 &&
3468 					     lc.proto == IPPROTO_TCP) ?
3469 					    L3HDR(struct tcphdr, ip) : NULL,
3470 					    lc.ip_len);
3471 					if (dyn_f != NULL) {
3472 						/*
3473 						 * Found a rule from a state;
3474 						 * jump to the 'action' part
3475 						 * of the rule.
3476 						 */
3477 						f = dyn_f;
3478 						cmd = ACTION_PTR(f);
3479 						l = f->cmd_len - f->act_ofs;
3480 						goto check_body;
3481 					}
3482 				}
3483 				/*
3484 				 * State not found. If CHECK_STATE, skip to
3485 				 * next rule, if PROBE_STATE just ignore and
3486 				 * continue with next opcode.
3487 				 */
3488 				if (cmd->opcode == O_CHECK_STATE)
3489 					goto next_rule;
3490 				match = 1;
3491 				break;
3492 
3493 			case O_ACCEPT:
3494 				retval = IP_FW_PASS;	/* accept */
3495 				goto done;
3496 
3497 			case O_DEFRAG:
3498 				if (f->cross_rules == NULL) {
3499 					/*
3500 					 * This rule was not completely setup;
3501 					 * move on to the next rule.
3502 					 */
3503 					goto next_rule;
3504 				}
3505 
3506 				/*
3507 				 * Don't defrag for l2 packets, output packets
3508 				 * or non-fragments.
3509 				 */
3510 				if (oif != NULL || args->eh != NULL ||
3511 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
3512 					goto next_rule;
3513 
3514 				ctx->ipfw_frags++;
3515 				m = ip_reass(m);
3516 				args->m = m;
3517 				if (m == NULL) {
3518 					retval = IP_FW_PASS;
3519 					goto done;
3520 				}
3521 				ctx->ipfw_defraged++;
3522 				KASSERT((m->m_flags & M_HASH) == 0,
3523 				    ("hash not cleared"));
3524 
3525 				/* Update statistics */
3526 				f->pcnt++;
3527 				f->bcnt += lc.ip_len;
3528 				f->timestamp = time_second;
3529 
3530 				ip = mtod(m, struct ip *);
3531 				hlen = ip->ip_hl << 2;
3532 				ip->ip_len += hlen;
3533 
3534 				ip->ip_len = htons(ip->ip_len);
3535 				ip->ip_off = htons(ip->ip_off);
3536 
3537 				ip_hashfn(&m, 0);
3538 				args->m = m;
3539 				if (m == NULL)
3540 					goto pullup_failed;
3541 
3542 				KASSERT(m->m_flags & M_HASH, ("no hash"));
3543 				cpuid = netisr_hashcpu(m->m_pkthdr.hash);
3544 				if (cpuid != mycpuid) {
3545 					/*
3546 					 * NOTE:
3547 					 * ip_len/ip_off are in network byte
3548 					 * order.
3549 					 */
3550 					ctx->ipfw_defrag_remote++;
3551 					args->rule = f;
3552 					return (IP_FW_CONTINUE);
3553 				}
3554 
3555 				/* 'm' might be changed by ip_hashfn(). */
3556 				ip = mtod(m, struct ip *);
3557 				ip->ip_len = ntohs(ip->ip_len);
3558 				ip->ip_off = ntohs(ip->ip_off);
3559 
3560 				m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3561 				if (m == NULL)
3562 					goto pullup_failed;
3563 
3564 				/* Move on. */
3565 				goto next_rule;
3566 
3567 			case O_PIPE:
3568 			case O_QUEUE:
3569 				args->rule = f; /* report matching rule */
3570 				args->cookie = cmd->arg1;
3571 				retval = IP_FW_DUMMYNET;
3572 				goto done;
3573 
3574 			case O_DIVERT:
3575 			case O_TEE:
3576 				if (args->eh) /* not on layer 2 */
3577 					break;
3578 
3579 				mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
3580 				    sizeof(*divinfo), M_INTWAIT | M_NULLOK);
3581 				if (mtag == NULL) {
3582 					retval = IP_FW_DENY;
3583 					goto done;
3584 				}
3585 				divinfo = m_tag_data(mtag);
3586 
3587 				divinfo->skipto = f->rulenum;
3588 				divinfo->port = cmd->arg1;
3589 				divinfo->tee = (cmd->opcode == O_TEE);
3590 				m_tag_prepend(m, mtag);
3591 
3592 				args->cookie = cmd->arg1;
3593 				retval = (cmd->opcode == O_DIVERT) ?
3594 					 IP_FW_DIVERT : IP_FW_TEE;
3595 				goto done;
3596 
3597 			case O_COUNT:
3598 			case O_SKIPTO:
3599 				f->pcnt++;	/* update stats */
3600 				f->bcnt += lc.ip_len;
3601 				f->timestamp = time_second;
3602 				if (cmd->opcode == O_COUNT)
3603 					goto next_rule;
3604 				/* handle skipto */
3605 				if (f->next_rule == NULL)
3606 					lookup_next_rule(f);
3607 				f = f->next_rule;
3608 				goto again;
3609 
3610 			case O_REJECT:
3611 				/*
3612 				 * Drop the packet and send a reject notice
3613 				 * if the packet is not ICMP (or is an ICMP
3614 				 * query), and it is not multicast/broadcast.
3615 				 */
3616 				if (hlen > 0 &&
3617 				    (lc.proto != IPPROTO_ICMP ||
3618 				     is_icmp_query(ip)) &&
3619 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
3620 				    !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
3621 					send_reject(args, cmd->arg1,
3622 					    lc.offset, lc.ip_len);
3623 					retval = IP_FW_DENY;
3624 					goto done;
3625 				}
3626 				/* FALLTHROUGH */
3627 			case O_DENY:
3628 				retval = IP_FW_DENY;
3629 				goto done;
3630 
3631 			case O_FORWARD_IP:
3632 				if (args->eh)	/* not valid on layer2 pkts */
3633 					break;
3634 				if (!dyn_f || dyn_dir == MATCH_FORWARD) {
3635 					struct sockaddr_in *sin;
3636 
3637 					mtag = m_tag_get(PACKET_TAG_IPFORWARD,
3638 					    sizeof(*sin), M_INTWAIT | M_NULLOK);
3639 					if (mtag == NULL) {
3640 						retval = IP_FW_DENY;
3641 						goto done;
3642 					}
3643 					sin = m_tag_data(mtag);
3644 
3645 					/* Structure copy */
3646 					*sin = ((ipfw_insn_sa *)cmd)->sa;
3647 
3648 					m_tag_prepend(m, mtag);
3649 					m->m_pkthdr.fw_flags |=
3650 						IPFORWARD_MBUF_TAGGED;
3651 					m->m_pkthdr.fw_flags &=
3652 						~BRIDGE_MBUF_TAGGED;
3653 				}
3654 				retval = IP_FW_PASS;
3655 				goto done;
3656 
3657 			default:
3658 				panic("-- unknown opcode %d", cmd->opcode);
3659 			} /* end of switch() on opcodes */
3660 
3661 			if (cmd->len & F_NOT)
3662 				match = !match;
3663 
3664 			if (match) {
3665 				if (cmd->len & F_OR)
3666 					skip_or = 1;
3667 			} else {
3668 				if (!(cmd->len & F_OR)) /* not an OR block, */
3669 					break;		/* try next rule    */
3670 			}
3671 
3672 		}	/* end of inner for, scan opcodes */
3673 
3674 next_rule:;		/* try next rule		*/
3675 
3676 	}		/* end of outer for, scan rules */
3677 	kprintf("+++ ipfw: ouch!, skipped past end of rules, denying packet\n");
3678 	return IP_FW_DENY;
3679 
3680 done:
3681 	/* Update statistics */
3682 	f->pcnt++;
3683 	f->bcnt += lc.ip_len;
3684 	f->timestamp = time_second;
3685 	return retval;
3686 
3687 pullup_failed:
3688 	if (fw_verbose)
3689 		kprintf("pullup failed\n");
3690 	return IP_FW_DENY;
3691 }
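
/*
 * Quick reference, derived from the action switch above: ipfw_chk()
 * resolves every packet to one of the following dispositions.
 *
 *	IP_FW_PASS	accept the packet
 *	IP_FW_DENY	drop the packet (also on allocation failures)
 *	IP_FW_DUMMYNET	hand the packet to dummynet; args->cookie holds
 *			the pipe/queue number
 *	IP_FW_DIVERT	divert the packet; args->cookie holds the port
 *	IP_FW_TEE	like divert, but a copy keeps flowing
 *	IP_FW_CONTINUE	a defragmented packet must be redispatched to
 *			another netisr; args->rule records the O_DEFRAG
 *			rule so processing can resume there
 */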
3692 
3693 static struct mbuf *
3694 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
3695 {
3696 	struct m_tag *mtag;
3697 	struct dn_pkt *pkt;
3698 	ipfw_insn *cmd;
3699 	const struct ipfw_flow_id *id;
3700 	struct dn_flow_id *fid;
3701 
3702 	M_ASSERTPKTHDR(m);
3703 
3704 	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
3705 	    M_INTWAIT | M_NULLOK);
3706 	if (mtag == NULL) {
3707 		m_freem(m);
3708 		return (NULL);
3709 	}
3710 	m_tag_prepend(m, mtag);
3711 
3712 	pkt = m_tag_data(mtag);
3713 	bzero(pkt, sizeof(*pkt));
3714 
3715 	cmd = fwa->rule->cmd + fwa->rule->act_ofs;
3716 	if (cmd->opcode == O_LOG)
3717 		cmd += F_LEN(cmd);
3718 	KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
3719 		("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
3720 
3721 	pkt->dn_m = m;
3722 	pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
3723 	pkt->ifp = fwa->oif;
3724 	pkt->pipe_nr = pipe_nr;
3725 
3726 	pkt->cpuid = mycpuid;
3727 	pkt->msgport = netisr_curport();
3728 
3729 	id = &fwa->f_id;
3730 	fid = &pkt->id;
3731 	fid->fid_dst_ip = id->dst_ip;
3732 	fid->fid_src_ip = id->src_ip;
3733 	fid->fid_dst_port = id->dst_port;
3734 	fid->fid_src_port = id->src_port;
3735 	fid->fid_proto = id->proto;
3736 	fid->fid_flags = id->flags;
3737 
3738 	ipfw_ref_rule(fwa->rule);
3739 	pkt->dn_priv = fwa->rule;
3740 	pkt->dn_unref_priv = ipfw_unref_rule;
3741 
3742 	if (cmd->opcode == O_PIPE)
3743 		pkt->dn_flags |= DN_FLAGS_IS_PIPE;
3744 
3745 	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
3746 	return (m);
3747 }
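
/*
 * Illustrative caller sketch (not the literal call site, which lives
 * in the pfil input/output glue; DN_TO_IP_OUT is assumed to be the
 * usual direction code from ip_dummynet.h):
 *
 *	if (ipfw_chk(&args) == IP_FW_DUMMYNET) {
 *		m = ipfw_dummynet_io(args.m, args.cookie,
 *		    DN_TO_IP_OUT, &args);
 *		if (m == NULL)
 *			return;		(mbuf was freed on failure)
 *	}
 */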
3748 
3749 /*
3750  * When a rule is added/deleted, clear the next_rule pointers in all rules.
3751  * These will be reconstructed on the fly as packets are matched.
3752  */
3753 static void
3754 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
3755 {
3756 	struct ip_fw *rule;
3757 
3758 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
3759 		rule->next_rule = NULL;
3760 }
3761 
3762 static __inline void
3763 ipfw_inc_static_count(struct ip_fw *rule)
3764 {
3765 	/* Static rule counters are updated only on CPU0 */
3766 	KKASSERT(mycpuid == 0);
3767 
3768 	static_count++;
3769 	static_ioc_len += IOC_RULESIZE(rule);
3770 }
3771 
3772 static __inline void
3773 ipfw_dec_static_count(struct ip_fw *rule)
3774 {
3775 	int l = IOC_RULESIZE(rule);
3776 
3777 	/* Static rule counters are updated only on CPU0 */
3778 	KKASSERT(mycpuid == 0);
3779 
3780 	KASSERT(static_count > 0, ("invalid static count %u", static_count));
3781 	static_count--;
3782 
3783 	KASSERT(static_ioc_len >= l,
3784 		("invalid static len %u", static_ioc_len));
3785 	static_ioc_len -= l;
3786 }
3787 
3788 static void
3789 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
3790 {
3791 	if (fwmsg->sibling != NULL) {
3792 		KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
3793 		fwmsg->sibling->sibling = rule;
3794 	}
3795 	fwmsg->sibling = rule;
3796 }
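
/*
 * Resulting topology (derived from the duplication code above): rules
 * are replicated on the netisr cpus in ascending order and each copy's
 * 'sibling' points at the copy on the next cpu, so a handler that
 * travels with a forwarded netmsg simply steps "rule = rule->sibling"
 * on every hop:
 *
 *	cpu0 copy -> cpu1 copy -> ... -> cpu(netisr_ncpus - 1) copy -> NULL
 */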
3797 
3798 static struct ip_fw *
3799 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3800 {
3801 	struct ip_fw *rule;
3802 
3803 	rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
3804 
3805 	rule->act_ofs = ioc_rule->act_ofs;
3806 	rule->cmd_len = ioc_rule->cmd_len;
3807 	rule->rulenum = ioc_rule->rulenum;
3808 	rule->set = ioc_rule->set;
3809 	rule->usr_flags = ioc_rule->usr_flags;
3810 
3811 	bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX 32-bit words */);
3812 
3813 	rule->refcnt = 1;
3814 	rule->cpuid = mycpuid;
3815 	rule->rule_flags = rule_flags;
3816 
3817 	return rule;
3818 }
3819 
3820 static void
3821 ipfw_add_rule_dispatch(netmsg_t nmsg)
3822 {
3823 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
3824 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3825 	struct ip_fw *rule;
3826 
3827 	ASSERT_NETISR_NCPUS(mycpuid);
3828 
3829 	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
3830 
3831 	/*
3832 	 * Insert rule into the pre-determined position
3833 	 */
3834 	if (fwmsg->prev_rule != NULL) {
3835 		struct ip_fw *prev, *next;
3836 
3837 		prev = fwmsg->prev_rule;
3838 		KKASSERT(prev->cpuid == mycpuid);
3839 
3840 		next = fwmsg->next_rule;
3841 		KKASSERT(next->cpuid == mycpuid);
3842 
3843 		rule->next = next;
3844 		prev->next = rule;
3845 
3846 		/*
3847 		 * Move to the position on the next CPU
3848 		 * before the msg is forwarded.
3849 		 */
3850 		fwmsg->prev_rule = prev->sibling;
3851 		fwmsg->next_rule = next->sibling;
3852 	} else {
3853 		KKASSERT(fwmsg->next_rule == NULL);
3854 		rule->next = ctx->ipfw_layer3_chain;
3855 		ctx->ipfw_layer3_chain = rule;
3856 	}
3857 
3858 	/* Link rule CPU sibling */
3859 	ipfw_link_sibling(fwmsg, rule);
3860 
3861 	ipfw_flush_rule_ptrs(ctx);
3862 
3863 	if (mycpuid == 0) {
3864 		/* Statistics only need to be updated once */
3865 		ipfw_inc_static_count(rule);
3866 
3867 		/* Return the rule on CPU0 */
3868 		nmsg->lmsg.u.ms_resultp = rule;
3869 	}
3870 
3871 	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
3872 		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
3873 
3874 	if (fwmsg->cross_rules != NULL) {
3875 		/* Save rules for later use. */
3876 		fwmsg->cross_rules[mycpuid] = rule;
3877 	}
3878 
3879 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
3880 }
3881 
3882 static void
3883 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
3884 {
3885 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
3886 	struct ip_fw *rule = fwmsg->sibling;
3887 	int sz = sizeof(struct ip_fw *) * netisr_ncpus;
3888 
3889 	ASSERT_NETISR_NCPUS(mycpuid);
3890 	KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
3891 	    ("not crossref rule"));
3892 
3893 	rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
3894 	memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
3895 
3896 	fwmsg->sibling = rule->sibling;
3897 	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
3898 }
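
/*
 * After this second pass every per-cpu copy of a CROSSREF rule (only
 * O_DEFRAG generates these, see ipfw_check_ioc_rule()) carries the
 * complete cross_rules[] array, so code running on any cpu can reach
 * the copy owned by cpu N as rule->cross_rules[N], e.g. when a
 * defragmented packet is redispatched via IP_FW_CONTINUE.
 */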
3899 
3900 /*
3901  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
3902  * then possibly assign a rule number and add the rule to the list.
3903  * Update the rulenum in the input struct so the caller knows
3904  * it as well.
3905  */
3906 static void
3907 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
3908 {
3909 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3910 	struct netmsg_ipfw fwmsg;
3911 	struct ip_fw *f, *prev, *rule;
3912 
3913 	ASSERT_NETISR0;
3914 
3915 	/*
3916 	 * If rulenum is 0, find the highest numbered rule before the
3917 	 * default rule and add the auto-increment step to it.
3918 	 */
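	/*
	 * Worked example: with autoinc_step 100 and existing rules
	 * numbered 100 and 200, a rule submitted with rulenum 0 is
	 * installed as rule 300; the step is only skipped when adding
	 * it would run into the default rule.
	 */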
3919 	if (ioc_rule->rulenum == 0) {
3920 		int step = autoinc_step;
3921 
3922 		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
3923 			 step <= IPFW_AUTOINC_STEP_MAX);
3924 
3925 		/*
3926 		 * Locate the highest numbered rule before default
3927 		 */
3928 		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
3929 			if (f->rulenum == IPFW_DEFAULT_RULE)
3930 				break;
3931 			ioc_rule->rulenum = f->rulenum;
3932 		}
3933 		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
3934 			ioc_rule->rulenum += step;
3935 	}
3936 	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
3937 		ioc_rule->rulenum != 0,
3938 		("invalid rule num %d", ioc_rule->rulenum));
3939 
3940 	/*
3941 	 * Now find the right place for the new rule in the sorted list.
3942 	 */
3943 	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
3944 	     prev = f, f = f->next) {
3945 		if (f->rulenum > ioc_rule->rulenum) {
3946 			/* Found the location */
3947 			break;
3948 		}
3949 	}
3950 	KASSERT(f != NULL, ("no default rule?!"));
3951 
3952 	/*
3953 	 * Duplicate the rule onto each CPU.
3954 	 * The rule duplicated on CPU0 will be returned.
3955 	 */
3956 	bzero(&fwmsg, sizeof(fwmsg));
3957 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
3958 	    ipfw_add_rule_dispatch);
3959 	fwmsg.ioc_rule = ioc_rule;
3960 	fwmsg.prev_rule = prev;
3961 	fwmsg.next_rule = prev == NULL ? NULL : f;
3962 	fwmsg.rule_flags = rule_flags;
3963 	if (rule_flags & IPFW_RULE_F_CROSSREF) {
3964 		fwmsg.cross_rules = kmalloc(
3965 		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
3966 		    M_WAITOK | M_ZERO);
3967 	}
3968 
3969 	netisr_domsg_global(&fwmsg.base);
3970 	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
3971 
3972 	rule = fwmsg.base.lmsg.u.ms_resultp;
3973 	KKASSERT(rule != NULL && rule->cpuid == mycpuid);
3974 
3975 	if (fwmsg.cross_rules != NULL) {
3976 		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
3977 		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
3978 		fwmsg.sibling = rule;
3979 		netisr_domsg_global(&fwmsg.base);
3980 		KKASSERT(fwmsg.sibling == NULL);
3981 
3982 		kfree(fwmsg.cross_rules, M_TEMP);
3983 
3984 #ifdef KLD_MODULE
3985 		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
3986 #endif
3987 	}
3988 
3989 	DPRINTF("++ installed rule %d, static count now %u\n",
3990 		rule->rulenum, static_count);
3991 }
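
/*
 * Net effect, derived from the dispatch handlers above: a plain rule
 * is installed in one netisr_domsg_global() pass, while a CROSSREF
 * rule takes two passes; the first links the per-cpu copies and
 * collects them in fwmsg.cross_rules[], the second hands every copy
 * the completed array.
 */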
3992 
3993 /*
3994  * Free storage associated with a static rule (including derived
3995  * states/tracks).
3996  * The caller is in charge of clearing rule pointers to avoid
3997  * dangling pointers.
3998  * @return a pointer to the next entry.
3999  * Arguments are not checked, so they must be correct.
4000  */
4001 static struct ip_fw *
4002 ipfw_delete_rule(struct ipfw_context *ctx,
4003 		 struct ip_fw *prev, struct ip_fw *rule)
4004 {
4005 	struct ip_fw *n;
4006 
4007 	n = rule->next;
4008 	if (prev == NULL)
4009 		ctx->ipfw_layer3_chain = n;
4010 	else
4011 		prev->next = n;
4012 
4013 	/* Mark the rule as invalid */
4014 	rule->rule_flags |= IPFW_RULE_F_INVALID;
4015 	rule->next_rule = NULL;
4016 	rule->sibling = NULL;
4017 #ifdef foo
4018 	/* Don't reset cpuid here; keep various assertions working */
4019 	rule->cpuid = -1;
4020 #endif
4021 
4022 	/* Statistics only need to be updated once */
4023 	if (mycpuid == 0)
4024 		ipfw_dec_static_count(rule);
4025 
4026 	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4027 		/* Try to free this rule */
4028 		ipfw_free_rule(rule);
4029 	} else {
4030 		/* TODO: check staging area. */
4031 		if (mycpuid == 0) {
4032 			rule->next = ipfw_gd.ipfw_crossref_free;
4033 			ipfw_gd.ipfw_crossref_free = rule;
4034 		}
4035 	}
4036 
4037 	/* Return the next rule */
4038 	return n;
4039 }
4040 
4041 static void
4042 ipfw_flush_dispatch(netmsg_t nmsg)
4043 {
4044 	int kill_default = nmsg->lmsg.u.ms_result;
4045 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4046 	struct ip_fw *rule;
4047 
4048 	ASSERT_NETISR_NCPUS(mycpuid);
4049 
4050 	/*
4051 	 * Flush states.
4052 	 */
4053 	ipfw_state_flush(ctx, NULL);
4054 	KASSERT(ctx->ipfw_state_cnt == 0,
4055 	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
4056 	ctx->ipfw_state_loosecnt = 0;
4057 	ctx->ipfw_state_lastexp = 0;
4058 
4059 	/*
4060 	 * Flush tracks.
4061 	 */
4062 	ipfw_track_flush(ctx, NULL);
4063 	ctx->ipfw_track_lastexp = 0;
4064 	if (ctx->ipfw_trkcnt_spare != NULL) {
4065 		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4066 		ctx->ipfw_trkcnt_spare = NULL;
4067 	}
4068 
4069 	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4070 
4071 	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4072 	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4073 		ipfw_delete_rule(ctx, NULL, rule);
4074 
4075 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4076 }
4077 
4078 /*
4079  * Deletes all rules from a chain (including the default rule
4080  * if the second argument is set).
4081  */
4082 static void
4083 ipfw_flush(int kill_default)
4084 {
4085 	struct netmsg_base nmsg;
4086 #ifdef INVARIANTS
4087 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4088 	int state_cnt;
4089 #endif
4090 
4091 	ASSERT_NETISR0;
4092 
4093 	/*
4094 	 * If 'kill_default' is set, the caller has already done the
4095 	 * necessary msgport syncing; no need to do it again.
4096 	 */
4097 	if (!kill_default) {
4098 		/*
4099 		 * Let ipfw_chk() know the rules are going to
4100 		 * be flushed, so it can jump directly to
4101 		 * the default rule.
4102 		 */
4103 		ipfw_flushing = 1;
4104 		/* XXX use priority sync */
4105 		netmsg_service_sync();
4106 	}
4107 
4108 	/*
4109 	 * Press the 'flush' button
4110 	 */
4111 	bzero(&nmsg, sizeof(nmsg));
4112 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4113 	    ipfw_flush_dispatch);
4114 	nmsg.lmsg.u.ms_result = kill_default;
4115 	netisr_domsg_global(&nmsg);
4116 	ipfw_gd.ipfw_state_loosecnt = 0;
4117 	ipfw_gd.ipfw_state_globexp = 0;
4118 	ipfw_gd.ipfw_track_globexp = 0;
4119 
4120 #ifdef INVARIANTS
4121 	state_cnt = ipfw_state_cntcoll();
4122 	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4123 
4124 	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4125 	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4126 
4127 	if (kill_default) {
4128 		KASSERT(static_count == 0,
4129 			("%u static rules remain", static_count));
4130 		KASSERT(static_ioc_len == 0,
4131 			("%u bytes of static rules remain", static_ioc_len));
4132 	} else {
4133 		KASSERT(static_count == 1,
4134 			("%u static rules remain", static_count));
4135 		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4136 			("%u bytes of static rules remain, should be %lu",
4137 			 static_ioc_len,
4138 			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4139 	}
4140 #endif
4141 
4142 	/* Flush is done */
4143 	ipfw_flushing = 0;
4144 }
4145 
4146 static void
4147 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4148 {
4149 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4150 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4151 	struct ip_fw *rule, *prev;
4152 
4153 	ASSERT_NETISR_NCPUS(mycpuid);
4154 
4155 	rule = dmsg->start_rule;
4156 	KKASSERT(rule->cpuid == mycpuid);
4157 	dmsg->start_rule = rule->sibling;
4158 
4159 	prev = dmsg->prev_rule;
4160 	if (prev != NULL) {
4161 		KKASSERT(prev->cpuid == mycpuid);
4162 
4163 		/*
4164 		 * Move to the position on the next CPU
4165 		 * before the msg is forwarded.
4166 		 */
4167 		dmsg->prev_rule = prev->sibling;
4168 	}
4169 
4170 	/*
4171 	 * Flush pointers outside the loop, then delete all matching
4172 	 * rules.  'prev' remains the same throughout the cycle.
4173 	 */
4174 	ipfw_flush_rule_ptrs(ctx);
4175 	while (rule && rule->rulenum == dmsg->rulenum) {
4176 		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4177 			/* Flush states generated by this rule. */
4178 			ipfw_state_flush(ctx, rule);
4179 		}
4180 		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4181 			/* Flush tracks generated by this rule. */
4182 			ipfw_track_flush(ctx, rule);
4183 		}
4184 		rule = ipfw_delete_rule(ctx, prev, rule);
4185 	}
4186 
4187 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4188 }
4189 
4190 static int
4191 ipfw_alt_delete_rule(uint16_t rulenum)
4192 {
4193 	struct ip_fw *prev, *rule;
4194 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4195 	struct netmsg_del dmsg;
4196 
4197 	ASSERT_NETISR0;
4198 
4199 	/*
4200 	 * Locate first rule to delete
4201 	 */
4202 	for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4203 	     rule && rule->rulenum < rulenum;
4204 	     prev = rule, rule = rule->next)
4205 		; /* EMPTY */
4206 	if (rule == NULL || rule->rulenum != rulenum)
4207 		return EINVAL;
4208 
4209 	/*
4210 	 * Get rid of the rule duplications on all CPUs
4211 	 */
4212 	bzero(&dmsg, sizeof(dmsg));
4213 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4214 	    ipfw_alt_delete_rule_dispatch);
4215 	dmsg.prev_rule = prev;
4216 	dmsg.start_rule = rule;
4217 	dmsg.rulenum = rulenum;
4218 
4219 	netisr_domsg_global(&dmsg.base);
4220 	KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4221 	return 0;
4222 }
4223 
4224 static void
4225 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4226 {
4227 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4228 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4229 	struct ip_fw *prev, *rule;
4230 #ifdef INVARIANTS
4231 	int del = 0;
4232 #endif
4233 
4234 	ASSERT_NETISR_NCPUS(mycpuid);
4235 
4236 	ipfw_flush_rule_ptrs(ctx);
4237 
4238 	prev = NULL;
4239 	rule = ctx->ipfw_layer3_chain;
4240 	while (rule != NULL) {
4241 		if (rule->set == dmsg->from_set) {
4242 			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4243 				/* Flush states generated by this rule. */
4244 				ipfw_state_flush(ctx, rule);
4245 			}
4246 			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4247 				/* Flush tracks generated by this rule. */
4248 				ipfw_track_flush(ctx, rule);
4249 			}
4250 			rule = ipfw_delete_rule(ctx, prev, rule);
4251 #ifdef INVARIANTS
4252 			del = 1;
4253 #endif
4254 		} else {
4255 			prev = rule;
4256 			rule = rule->next;
4257 		}
4258 	}
4259 	KASSERT(del, ("no match set?!"));
4260 
4261 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4262 }
4263 
4264 static int
4265 ipfw_alt_delete_ruleset(uint8_t set)
4266 {
4267 	struct netmsg_del dmsg;
4268 	int del;
4269 	struct ip_fw *rule;
4270 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4271 
4272 	ASSERT_NETISR0;
4273 
4274 	/*
4275 	 * Check whether the 'set' exists.  If it exists,
4276 	 * then check whether any rules within the set will
4277 	 * try to create states.
4278 	 */
4279 	del = 0;
4280 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4281 		if (rule->set == set)
4282 			del = 1;
4283 	}
4284 	if (!del)
4285 		return 0; /* XXX EINVAL? */
4286 
4287 	/*
4288 	 * Delete this set
4289 	 */
4290 	bzero(&dmsg, sizeof(dmsg));
4291 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4292 	    ipfw_alt_delete_ruleset_dispatch);
4293 	dmsg.from_set = set;
4294 	netisr_domsg_global(&dmsg.base);
4295 
4296 	return 0;
4297 }
4298 
4299 static void
4300 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
4301 {
4302 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4303 	struct ip_fw *rule;
4304 
4305 	ASSERT_NETISR_NCPUS(mycpuid);
4306 
4307 	rule = dmsg->start_rule;
4308 	KKASSERT(rule->cpuid == mycpuid);
4309 
4310 	/*
4311 	 * Move to the position on the next CPU
4312 	 * before the msg is forwarded.
4313 	 */
4314 	dmsg->start_rule = rule->sibling;
4315 
4316 	while (rule && rule->rulenum <= dmsg->rulenum) {
4317 		if (rule->rulenum == dmsg->rulenum)
4318 			rule->set = dmsg->to_set;
4319 		rule = rule->next;
4320 	}
4321 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4322 }
4323 
4324 static int
4325 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
4326 {
4327 	struct netmsg_del dmsg;
4328 	struct netmsg_base *nmsg;
4329 	struct ip_fw *rule;
4330 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4331 
4332 	ASSERT_NETISR0;
4333 
4334 	/*
4335 	 * Locate first rule to move
4336 	 */
4337 	for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
4338 	     rule = rule->next) {
4339 		if (rule->rulenum == rulenum && rule->set != set)
4340 			break;
4341 	}
4342 	if (rule == NULL || rule->rulenum > rulenum)
4343 		return 0; /* XXX error? */
4344 
4345 	bzero(&dmsg, sizeof(dmsg));
4346 	nmsg = &dmsg.base;
4347 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4348 	    ipfw_alt_move_rule_dispatch);
4349 	dmsg.start_rule = rule;
4350 	dmsg.rulenum = rulenum;
4351 	dmsg.to_set = set;
4352 
4353 	netisr_domsg_global(nmsg);
4354 	KKASSERT(dmsg.start_rule == NULL);
4355 	return 0;
4356 }
4357 
4358 static void
4359 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
4360 {
4361 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4362 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4363 	struct ip_fw *rule;
4364 
4365 	ASSERT_NETISR_NCPUS(mycpuid);
4366 
4367 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4368 		if (rule->set == dmsg->from_set)
4369 			rule->set = dmsg->to_set;
4370 	}
4371 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4372 }
4373 
4374 static int
4375 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
4376 {
4377 	struct netmsg_del dmsg;
4378 	struct netmsg_base *nmsg;
4379 
4380 	ASSERT_NETISR0;
4381 
4382 	bzero(&dmsg, sizeof(dmsg));
4383 	nmsg = &dmsg.base;
4384 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4385 	    ipfw_alt_move_ruleset_dispatch);
4386 	dmsg.from_set = from_set;
4387 	dmsg.to_set = to_set;
4388 
4389 	netisr_domsg_global(nmsg);
4390 	return 0;
4391 }
4392 
4393 static void
4394 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
4395 {
4396 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4397 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4398 	struct ip_fw *rule;
4399 
4400 	ASSERT_NETISR_NCPUS(mycpuid);
4401 
4402 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4403 		if (rule->set == dmsg->from_set)
4404 			rule->set = dmsg->to_set;
4405 		else if (rule->set == dmsg->to_set)
4406 			rule->set = dmsg->from_set;
4407 	}
4408 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4409 }
4410 
4411 static int
4412 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
4413 {
4414 	struct netmsg_del dmsg;
4415 	struct netmsg_base *nmsg;
4416 
4417 	ASSERT_NETISR0;
4418 
4419 	bzero(&dmsg, sizeof(dmsg));
4420 	nmsg = &dmsg.base;
4421 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4422 	    ipfw_alt_swap_ruleset_dispatch);
4423 	dmsg.from_set = set1;
4424 	dmsg.to_set = set2;
4425 
4426 	netisr_domsg_global(nmsg);
4427 	return 0;
4428 }
4429 
4430 /*
4431  * Remove all rules with given number, and also do set manipulation.
4432  *
4433  * The argument is an uint32_t. The low 16 bit are the rule or set number,
4434  * the next 8 bits are the new set, the top 8 bits are the command:
4435  *
4436  *	0	delete rules with given number
4437  *	1	delete rules with given set number
4438  *	2	move rules with given number to new set
4439  *	3	move rules with given set number to new set
4440  *	4	swap sets with given numbers
4441  */
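/*
 * Worked example (values illustrative): arg = (2 << 24) | (5 << 16) | 100
 * moves the rules numbered 100 into set 5, while arg = (1 << 24) | 3
 * deletes every rule in set 3.
 */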
4442 static int
4443 ipfw_ctl_alter(uint32_t arg)
4444 {
4445 	uint16_t rulenum;
4446 	uint8_t cmd, new_set;
4447 	int error = 0;
4448 
4449 	ASSERT_NETISR0;
4450 
4451 	rulenum = arg & 0xffff;
4452 	cmd = (arg >> 24) & 0xff;
4453 	new_set = (arg >> 16) & 0xff;
4454 
4455 	if (cmd > 4)
4456 		return EINVAL;
4457 	if (new_set >= IPFW_DEFAULT_SET)
4458 		return EINVAL;
4459 	if (cmd == 0 || cmd == 2) {
4460 		if (rulenum == IPFW_DEFAULT_RULE)
4461 			return EINVAL;
4462 	} else {
4463 		if (rulenum >= IPFW_DEFAULT_SET)
4464 			return EINVAL;
4465 	}
4466 
4467 	switch (cmd) {
4468 	case 0:	/* delete rules with given number */
4469 		error = ipfw_alt_delete_rule(rulenum);
4470 		break;
4471 
4472 	case 1:	/* delete all rules with given set number */
4473 		error = ipfw_alt_delete_ruleset(rulenum);
4474 		break;
4475 
4476 	case 2:	/* move rules with given number to new set */
4477 		error = ipfw_alt_move_rule(rulenum, new_set);
4478 		break;
4479 
4480 	case 3: /* move rules with given set number to new set */
4481 		error = ipfw_alt_move_ruleset(rulenum, new_set);
4482 		break;
4483 
4484 	case 4: /* swap two sets */
4485 		error = ipfw_alt_swap_ruleset(rulenum, new_set);
4486 		break;
4487 	}
4488 	return error;
4489 }
4490 
4491 /*
4492  * Clear counters for a specific rule.
4493  */
4494 static void
4495 clear_counters(struct ip_fw *rule, int log_only)
4496 {
4497 	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
4498 
4499 	if (log_only == 0) {
4500 		rule->bcnt = rule->pcnt = 0;
4501 		rule->timestamp = 0;
4502 	}
4503 	if (l->o.opcode == O_LOG)
4504 		l->log_left = l->max_log;
4505 }
4506 
4507 static void
4508 ipfw_zero_entry_dispatch(netmsg_t nmsg)
4509 {
4510 	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
4511 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4512 	struct ip_fw *rule;
4513 
4514 	ASSERT_NETISR_NCPUS(mycpuid);
4515 
4516 	if (zmsg->rulenum == 0) {
4517 		KKASSERT(zmsg->start_rule == NULL);
4518 
4519 		ctx->ipfw_norule_counter = 0;
4520 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4521 			clear_counters(rule, zmsg->log_only);
4522 	} else {
4523 		struct ip_fw *start = zmsg->start_rule;
4524 
4525 		KKASSERT(start->cpuid == mycpuid);
4526 		KKASSERT(start->rulenum == zmsg->rulenum);
4527 
4528 		/*
4529 		 * We can have multiple rules with the same number, so we
4530 		 * need to clear them all.
4531 		 */
4532 		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
4533 		     rule = rule->next)
4534 			clear_counters(rule, zmsg->log_only);
4535 
4536 		/*
4537 		 * Move to the position on the next CPU
4538 		 * before the msg is forwarded.
4539 		 */
4540 		zmsg->start_rule = start->sibling;
4541 	}
4542 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4543 }
4544 
4545 /*
4546  * Reset some or all counters on firewall rules.
4547  * @arg rulenum is 0 to clear all entries, or a specific
4548  * rule number.
4549  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
4550  */
4551 static int
4552 ipfw_ctl_zero_entry(int rulenum, int log_only)
4553 {
4554 	struct netmsg_zent zmsg;
4555 	struct netmsg_base *nmsg;
4556 	const char *msg;
4557 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4558 
4559 	ASSERT_NETISR0;
4560 
4561 	bzero(&zmsg, sizeof(zmsg));
4562 	nmsg = &zmsg.base;
4563 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4564 	    ipfw_zero_entry_dispatch);
4565 	zmsg.log_only = log_only;
4566 
4567 	if (rulenum == 0) {
4568 		msg = log_only ? "ipfw: All logging counts reset.\n"
4569 			       : "ipfw: Accounting cleared.\n";
4570 	} else {
4571 		struct ip_fw *rule;
4572 
4573 		/*
4574 		 * Locate the first rule with 'rulenum'
4575 		 */
4576 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4577 			if (rule->rulenum == rulenum)
4578 				break;
4579 		}
4580 		if (rule == NULL) /* we did not find any matching rules */
4581 			return (EINVAL);
4582 		zmsg.start_rule = rule;
4583 		zmsg.rulenum = rulenum;
4584 
4585 		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
4586 			       : "ipfw: Entry %d cleared.\n";
4587 	}
4588 	netisr_domsg_global(nmsg);
4589 	KKASSERT(zmsg.start_rule == NULL);
4590 
4591 	if (fw_verbose)
4592 		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
4593 	return (0);
4594 }
4595 
4596 /*
4597  * Check validity of the structure before insert.
4598  * Fortunately rules are simple, so this mostly needs to check rule sizes.
4599  */
4600 static int
4601 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
4602 {
4603 	int l, cmdlen = 0;
4604 	int have_action = 0;
4605 	ipfw_insn *cmd;
4606 
4607 	*rule_flags = 0;
4608 
4609 	/* Check for valid size */
4610 	if (size < sizeof(*rule)) {
4611 		kprintf("ipfw: rule too short\n");
4612 		return EINVAL;
4613 	}
4614 	l = IOC_RULESIZE(rule);
4615 	if (l != size) {
4616 		kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
4617 		return EINVAL;
4618 	}
4619 
4620 	/* Check rule number */
4621 	if (rule->rulenum == IPFW_DEFAULT_RULE) {
4622 		kprintf("ipfw: invalid rule number\n");
4623 		return EINVAL;
4624 	}
4625 
4626 	/*
4627 	 * Now go through the individual checks.  They are very simple,
4628 	 * basically only instruction sizes.
4629 	 */
4630 	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
4631 	     l -= cmdlen, cmd += cmdlen) {
4632 		cmdlen = F_LEN(cmd);
4633 		if (cmdlen > l) {
4634 			kprintf("ipfw: opcode %d size truncated\n",
4635 				cmd->opcode);
4636 			return EINVAL;
4637 		}
4638 
4639 		DPRINTF("ipfw: opcode %d\n", cmd->opcode);
4640 
4641 		if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT) {
4642 			/* This rule will generate states. */
4643 			*rule_flags |= IPFW_RULE_F_GENSTATE;
4644 			if (cmd->opcode == O_LIMIT)
4645 				*rule_flags |= IPFW_RULE_F_GENTRACK;
4646 		}
4647 		if (cmd->opcode == O_DEFRAG)
4648 			*rule_flags |= IPFW_RULE_F_CROSSREF;
4649 		if (cmd->opcode == O_IP_SRC_IFIP ||
4650 		    cmd->opcode == O_IP_DST_IFIP) {
4651 			*rule_flags |= IPFW_RULE_F_DYNIFADDR;
4652 			cmd->arg1 &= IPFW_IFIP_SETTINGS;
4653 		}
4654 
4655 		switch (cmd->opcode) {
4656 		case O_NOP:
4657 		case O_PROBE_STATE:
4658 		case O_KEEP_STATE:
4659 		case O_PROTO:
4660 		case O_IP_SRC_ME:
4661 		case O_IP_DST_ME:
4662 		case O_LAYER2:
4663 		case O_IN:
4664 		case O_FRAG:
4665 		case O_IPFRAG:
4666 		case O_IPOPT:
4667 		case O_IPLEN:
4668 		case O_IPID:
4669 		case O_IPTOS:
4670 		case O_IPPRECEDENCE:
4671 		case O_IPTTL:
4672 		case O_IPVER:
4673 		case O_TCPWIN:
4674 		case O_TCPFLAGS:
4675 		case O_TCPOPTS:
4676 		case O_ESTAB:
4677 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
4678 				goto bad_size;
4679 			break;
4680 
4681 		case O_IP_SRC_TABLE:
4682 		case O_IP_DST_TABLE:
4683 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
4684 				goto bad_size;
4685 			if (cmd->arg1 >= ipfw_table_max) {
4686 				kprintf("ipfw: invalid table id %u, max %d\n",
4687 				    cmd->arg1, ipfw_table_max);
4688 				return EINVAL;
4689 			}
4690 			break;
4691 
4692 		case O_IP_SRC_IFIP:
4693 		case O_IP_DST_IFIP:
4694 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
4695 				goto bad_size;
4696 			break;
4697 
4698 		case O_UID:
4699 		case O_GID:
4700 		case O_IP_SRC:
4701 		case O_IP_DST:
4702 		case O_TCPSEQ:
4703 		case O_TCPACK:
4704 		case O_PROB:
4705 		case O_ICMPTYPE:
4706 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
4707 				goto bad_size;
4708 			break;
4709 
4710 		case O_LIMIT:
4711 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
4712 				goto bad_size;
4713 			break;
4714 
4715 		case O_LOG:
4716 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
4717 				goto bad_size;
4718 
4719 			((ipfw_insn_log *)cmd)->log_left =
4720 			    ((ipfw_insn_log *)cmd)->max_log;
4721 
4722 			break;
4723 
4724 		case O_IP_SRC_MASK:
4725 		case O_IP_DST_MASK:
4726 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
4727 				goto bad_size;
4728 			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
4729 				kprintf("ipfw: opcode %d, useless rule\n",
4730 					cmd->opcode);
4731 				return EINVAL;
4732 			}
4733 			break;
4734 
4735 		case O_IP_SRC_SET:
4736 		case O_IP_DST_SET:
4737 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
4738 				kprintf("ipfw: invalid set size %d\n",
4739 					cmd->arg1);
4740 				return EINVAL;
4741 			}
4742 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
4743 			    (cmd->arg1 + 31) / 32)
4744 				goto bad_size;
4745 			break;
4746 
4747 		case O_MACADDR2:
4748 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
4749 				goto bad_size;
4750 			break;
4751 
4752 		case O_MAC_TYPE:
4753 		case O_IP_SRCPORT:
4754 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
4755 			if (cmdlen < 2 || cmdlen > 31)
4756 				goto bad_size;
4757 			break;
4758 
4759 		case O_RECV:
4760 		case O_XMIT:
4761 		case O_VIA:
4762 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
4763 				goto bad_size;
4764 			break;
4765 
4766 		case O_PIPE:
4767 		case O_QUEUE:
4768 			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
4769 				goto bad_size;
4770 			goto check_action;
4771 
4772 		case O_FORWARD_IP:
4773 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
4774 				goto bad_size;
4775 			} else {
4776 				in_addr_t fwd_addr;
4777 
4778 				fwd_addr = ((ipfw_insn_sa *)cmd)->
4779 					   sa.sin_addr.s_addr;
4780 				if (IN_MULTICAST(ntohl(fwd_addr))) {
4781 					kprintf("ipfw: cannot forward to "
4782 						"multicast address\n");
4783 					return EINVAL;
4784 				}
4785 			}
4786 			goto check_action;
4787 
4788 		case O_FORWARD_MAC: /* XXX not implemented yet */
4789 		case O_CHECK_STATE:
4790 		case O_COUNT:
4791 		case O_ACCEPT:
4792 		case O_DENY:
4793 		case O_REJECT:
4794 		case O_SKIPTO:
4795 		case O_DIVERT:
4796 		case O_TEE:
4797 		case O_DEFRAG:
4798 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
4799 				goto bad_size;
4800 check_action:
4801 			if (have_action) {
4802 				kprintf("ipfw: opcode %d, multiple actions"
4803 					" not allowed\n",
4804 					cmd->opcode);
4805 				return EINVAL;
4806 			}
4807 			have_action = 1;
4808 			if (l != cmdlen) {
4809 				kprintf("ipfw: opcode %d, action must be"
4810 					" last opcode\n",
4811 					cmd->opcode);
4812 				return EINVAL;
4813 			}
4814 			break;
4815 		default:
4816 			kprintf("ipfw: opcode %d, unknown opcode\n",
4817 				cmd->opcode);
4818 			return EINVAL;
4819 		}
4820 	}
4821 	if (have_action == 0) {
4822 		kprintf("ipfw: missing action\n");
4823 		return EINVAL;
4824 	}
4825 	return 0;
4826 
4827 bad_size:
4828 	kprintf("ipfw: opcode %d size %d wrong\n",
4829 		cmd->opcode, cmdlen);
4830 	return EINVAL;
4831 }
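
/*
 * A minimal rule body that passes the checks above (illustrative):
 * cmd[] is measured in 32-bit words, the action must be the last
 * instruction and act_ofs indexes it.
 *
 *	cmd[0]: O_PROTO,  len = F_INSN_SIZE(ipfw_insn), arg1 = IPPROTO_TCP
 *	cmd[1]: O_ACCEPT, len = F_INSN_SIZE(ipfw_insn)
 *	act_ofs = 1, cmd_len = 2
 */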
4832 
4833 static int
4834 ipfw_ctl_add_rule(struct sockopt *sopt)
4835 {
4836 	struct ipfw_ioc_rule *ioc_rule;
4837 	size_t size;
4838 	uint32_t rule_flags;
4839 	int error;
4840 
4841 	ASSERT_NETISR0;
4842 
4843 	size = sopt->sopt_valsize;
4844 	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
4845 	    size < sizeof(*ioc_rule)) {
4846 		return EINVAL;
4847 	}
4848 	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
4849 		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
4850 					  IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
4851 	}
4852 	ioc_rule = sopt->sopt_val;
4853 
4854 	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
4855 	if (error)
4856 		return error;
4857 
4858 	ipfw_add_rule(ioc_rule, rule_flags);
4859 
4860 	if (sopt->sopt_dir == SOPT_GET)
4861 		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
4862 	return 0;
4863 }
4864 
4865 static void *
4866 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
4867     struct ipfw_ioc_rule *ioc_rule)
4868 {
4869 	const struct ip_fw *sibling;
4870 #ifdef INVARIANTS
4871 	int i;
4872 #endif
4873 
4874 	ASSERT_NETISR0;
4875 	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
4876 
4877 	ioc_rule->act_ofs = rule->act_ofs;
4878 	ioc_rule->cmd_len = rule->cmd_len;
4879 	ioc_rule->rulenum = rule->rulenum;
4880 	ioc_rule->set = rule->set;
4881 	ioc_rule->usr_flags = rule->usr_flags;
4882 
4883 	ioc_rule->set_disable = ctx->ipfw_set_disable;
4884 	ioc_rule->static_count = static_count;
4885 	ioc_rule->static_len = static_ioc_len;
4886 
4887 	/*
4888 	 * Visit (read-only) all of the rule's duplications to get
4889 	 * the necessary statistics
4890 	 */
4891 #ifdef INVARIANTS
4892 	i = 0;
4893 #endif
4894 	ioc_rule->pcnt = 0;
4895 	ioc_rule->bcnt = 0;
4896 	ioc_rule->timestamp = 0;
4897 	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
4898 		ioc_rule->pcnt += sibling->pcnt;
4899 		ioc_rule->bcnt += sibling->bcnt;
4900 		if (sibling->timestamp > ioc_rule->timestamp)
4901 			ioc_rule->timestamp = sibling->timestamp;
4902 #ifdef INVARIANTS
4903 		++i;
4904 #endif
4905 	}
4906 	KASSERT(i == netisr_ncpus,
4907 	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
4908 
4909 	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX 32-bit words */);
4910 
4911 	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
4912 }
4913 
4914 static boolean_t
4915 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
4916 {
4917 	struct ipfw_ioc_flowid *ioc_id;
4918 
4919 	if (trk->tc_expire == 0) {
4920 		/* Track has not been scanned yet; skip it. */
4921 		return (FALSE);
4922 	}
4923 
4924 	ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
4925 	    0 : trk->tc_expire - time_uptime;
4926 	ioc_state->pcnt = 0;
4927 	ioc_state->bcnt = 0;
4928 
4929 	ioc_state->dyn_type = O_LIMIT_PARENT;
4930 	ioc_state->count = trk->tc_count;
4931 
4932 	ioc_state->rulenum = trk->tc_rulenum;
4933 
4934 	ioc_id = &ioc_state->id;
4935 	ioc_id->type = ETHERTYPE_IP;
4936 	ioc_id->u.ip.proto = trk->tc_proto;
4937 	ioc_id->u.ip.src_ip = trk->tc_saddr;
4938 	ioc_id->u.ip.dst_ip = trk->tc_daddr;
4939 	ioc_id->u.ip.src_port = trk->tc_sport;
4940 	ioc_id->u.ip.dst_port = trk->tc_dport;
4941 
4942 	return (TRUE);
4943 }
4944 
4945 static boolean_t
4946 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
4947 {
4948 	struct ipfw_ioc_flowid *ioc_id;
4949 
4950 	if (s->st_type == O_ANCHOR)
4951 		return (FALSE);
4952 
4953 	ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
4954 	    0 : s->st_expire - time_uptime;
4955 	ioc_state->pcnt = s->st_pcnt;
4956 	ioc_state->bcnt = s->st_bcnt;
4957 
4958 	ioc_state->dyn_type = s->st_type;
4959 	ioc_state->count = 0;
4960 
4961 	ioc_state->rulenum = s->st_rule->rulenum;
4962 
4963 	ioc_id = &ioc_state->id;
4964 	ioc_id->type = ETHERTYPE_IP;
4965 	ioc_id->u.ip.proto = s->st_proto;
4966 	ipfw_key_4tuple(&s->st_key,
4967 	    &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
4968 	    &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
4969 
4970 	return (TRUE);
4971 }
4972 
4973 static void
4974 ipfw_state_copy_dispatch(netmsg_t nmsg)
4975 {
4976 	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
4977 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4978 	const struct ipfw_state *s;
4979 	const struct ipfw_track *t;
4980 
4981 	ASSERT_NETISR_NCPUS(mycpuid);
4982 	KASSERT(nm->state_cnt < nm->state_cntmax,
4983 	    ("invalid state count %d, max %d",
4984 	     nm->state_cnt, nm->state_cntmax));
4985 
4986 	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
4987 		if (ipfw_state_copy(s, nm->ioc_state)) {
4988 			nm->ioc_state++;
4989 			nm->state_cnt++;
4990 			if (nm->state_cnt == nm->state_cntmax)
4991 				goto done;
4992 		}
4993 	}
4994 
4995 	/*
4996 	 * Prepare tracks in the global track tree for userland.
4997 	 */
4998 	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
4999 		struct ipfw_trkcnt *trk;
5000 
5001 		if (t->t_count == NULL) /* anchor */
5002 			continue;
5003 		trk = t->t_trkcnt;
5004 
5005 		/*
5006 		 * Only one netisr can run this function at
5007 		 * any time, and only this function accesses
5008 		 * trkcnt's tc_expire, so this is safe w/o
5009 		 * ipfw_gd.ipfw_trkcnt_token.
5010 		 */
5011 		if (trk->tc_expire > t->t_expire)
5012 			continue;
5013 		trk->tc_expire = t->t_expire;
5014 	}
5015 
5016 	/*
5017 	 * Copy tracks in the global track tree to userland in
5018 	 * the last netisr.
5019 	 */
5020 	if (mycpuid == netisr_ncpus - 1) {
5021 		struct ipfw_trkcnt *trk;
5022 
5023 		KASSERT(nm->state_cnt < nm->state_cntmax,
5024 		    ("invalid state count %d, max %d",
5025 		     nm->state_cnt, nm->state_cntmax));
5026 
5027 		IPFW_TRKCNT_TOKGET;
5028 		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5029 			if (ipfw_track_copy(trk, nm->ioc_state)) {
5030 				nm->ioc_state++;
5031 				nm->state_cnt++;
5032 				if (nm->state_cnt == nm->state_cntmax) {
5033 					IPFW_TRKCNT_TOKREL;
5034 					goto done;
5035 				}
5036 			}
5037 		}
5038 		IPFW_TRKCNT_TOKREL;
5039 	}
5040 done:
5041 	if (nm->state_cnt == nm->state_cntmax) {
5042 		/* No more space; done. */
5043 		netisr_replymsg(&nm->base, 0);
5044 	} else {
5045 		netisr_forwardmsg(&nm->base, mycpuid + 1);
5046 	}
5047 }
5048 
5049 static int
5050 ipfw_ctl_get_rules(struct sockopt *sopt)
5051 {
5052 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5053 	struct ip_fw *rule;
5054 	void *bp;
5055 	size_t size;
5056 	int state_cnt;
5057 
5058 	ASSERT_NETISR0;
5059 
5060 	/*
5061 	 * Pass up a copy of the current rules.  Static rules
5062 	 * come first (the last of which has number IPFW_DEFAULT_RULE),
5063 	 * followed by a possibly empty list of states.
5064 	 */
5065 
5066 	size = static_ioc_len;	/* size of static rules */
5067 
5068 	/*
5069 	 * Size of the states.
5070 	 * XXX take tracks as state for userland compat.
5071 	 */
5072 	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5073 	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5074 	size += state_cnt * sizeof(struct ipfw_ioc_state);
5075 
5076 	if (sopt->sopt_valsize < size) {
5077 		/* short length, no need to return incomplete rules */
5078 		/* XXX: if superuser, no need to zero buffer */
5079 		bzero(sopt->sopt_val, sopt->sopt_valsize);
5080 		return 0;
5081 	}
5082 	bp = sopt->sopt_val;
5083 
5084 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5085 		bp = ipfw_copy_rule(ctx, rule, bp);
5086 
5087 	if (state_cnt) {
5088 		struct netmsg_cpstate nm;
5089 #ifdef INVARIANTS
5090 		size_t old_size = size;
5091 #endif
5092 
5093 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5094 		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
5095 		nm.ioc_state = bp;
5096 		nm.state_cntmax = state_cnt;
5097 		nm.state_cnt = 0;
5098 		netisr_domsg_global(&nm.base);
5099 
5100 		/*
5101 		 * The # of states may have shrunk after the snapshot
5102 		 * of the state count was taken.  To give userland a
5103 		 * correct state count, nm.state_cnt is used to
5104 		 * recalculate the actual size.
5105 		 */
5106 		size = static_ioc_len +
5107 		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5108 		KKASSERT(size <= old_size);
5109 	}
5110 
5111 	sopt->sopt_valsize = size;
5112 	return 0;
5113 }
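
/*
 * Resulting sockopt buffer layout (as constructed above): the
 * IOC_RULESIZE()-sized ipfw_ioc_rule records are packed back to back,
 * ending with the default rule, and are immediately followed by
 * nm.state_cnt struct ipfw_ioc_state records covering both states
 * and tracks.
 */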
5114 
5115 static void
5116 ipfw_set_disable_dispatch(netmsg_t nmsg)
5117 {
5118 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5119 
5120 	ASSERT_NETISR_NCPUS(mycpuid);
5121 
5122 	ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5123 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5124 }
5125 
5126 static void
5127 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5128 {
5129 	struct netmsg_base nmsg;
5130 	uint32_t set_disable;
5131 
5132 	ASSERT_NETISR0;
5133 
5134 	/* IPFW_DEFAULT_SET is always enabled */
5135 	enable |= (1 << IPFW_DEFAULT_SET);
5136 	set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5137 
5138 	bzero(&nmsg, sizeof(nmsg));
5139 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5140 	    ipfw_set_disable_dispatch);
5141 	nmsg.lmsg.u.ms_result32 = set_disable;
5142 
5143 	netisr_domsg_global(&nmsg);
5144 }
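
/*
 * Worked example (illustrative): with sets 1 and 2 currently disabled
 * (ipfw_set_disable == 0x6), disabling set 3 while enabling set 2
 * computes
 *
 *	(0x6 | (1 << 3)) & ~((1 << 2) | (1 << IPFW_DEFAULT_SET)) == 0xa
 *
 * i.e. sets 1 and 3 end up disabled; the default set can never be
 * disabled.
 */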
5145 
5146 static void
5147 ipfw_table_create_dispatch(netmsg_t nm)
5148 {
5149 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5150 	int tblid = nm->lmsg.u.ms_result;
5151 
5152 	ASSERT_NETISR_NCPUS(mycpuid);
5153 
5154 	if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5155 	    rn_cpumaskhead(mycpuid), 32))
5156 		panic("ipfw: create table%d failed", tblid);
5157 
5158 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5159 }
5160 
5161 static int
5162 ipfw_table_create(struct sockopt *sopt)
5163 {
5164 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5165 	struct ipfw_ioc_table *tbl;
5166 	struct netmsg_base nm;
5167 
5168 	ASSERT_NETISR0;
5169 
5170 	if (sopt->sopt_valsize != sizeof(*tbl))
5171 		return (EINVAL);
5172 
5173 	tbl = sopt->sopt_val;
5174 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5175 		return (EINVAL);
5176 
5177 	if (ctx->ipfw_tables[tbl->tableid] != NULL)
5178 		return (EEXIST);
5179 
5180 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5181 	    ipfw_table_create_dispatch);
5182 	nm.lmsg.u.ms_result = tbl->tableid;
5183 	netisr_domsg_global(&nm);
5184 
5185 	return (0);
5186 }
5187 
5188 static void
5189 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5190 {
5191 	struct radix_node *ret;
5192 
5193 	ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5194 	if (ret != rn)
5195 		panic("deleted other table entry");
5196 	kfree(ret, M_IPFW);
5197 }
5198 
5199 static int
5200 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5201 {
5202 
5203 	ipfw_table_killrn(xrnh, rn);
5204 	return (0);
5205 }
5206 
5207 static void
5208 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5209     int destroy)
5210 {
5211 	struct radix_node_head *rnh;
5212 
5213 	ASSERT_NETISR_NCPUS(mycpuid);
5214 
5215 	rnh = ctx->ipfw_tables[tableid];
5216 	rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5217 	if (destroy) {
5218 		Free(rnh);
5219 		ctx->ipfw_tables[tableid] = NULL;
5220 	}
5221 }
5222 
5223 static void
5224 ipfw_table_flush_dispatch(netmsg_t nmsg)
5225 {
5226 	struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5227 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5228 
5229 	ASSERT_NETISR_NCPUS(mycpuid);
5230 
5231 	ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5232 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5233 }
5234 
5235 static void
5236 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5237 {
5238 	int i;
5239 
5240 	ASSERT_NETISR_NCPUS(mycpuid);
5241 
5242 	for (i = 0; i < ipfw_table_max; ++i) {
5243 		if (ctx->ipfw_tables[i] != NULL)
5244 			ipfw_table_flush_oncpu(ctx, i, destroy);
5245 	}
5246 }
5247 
5248 static void
5249 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5250 {
5251 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5252 
5253 	ASSERT_NETISR_NCPUS(mycpuid);
5254 
5255 	ipfw_table_flushall_oncpu(ctx, 0);
5256 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5257 }
5258 
5259 static int
5260 ipfw_table_flush(struct sockopt *sopt)
5261 {
5262 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5263 	struct ipfw_ioc_table *tbl;
5264 	struct netmsg_tblflush nm;
5265 
5266 	ASSERT_NETISR0;
5267 
5268 	if (sopt->sopt_valsize != sizeof(*tbl))
5269 		return (EINVAL);
5270 
5271 	tbl = sopt->sopt_val;
5272 	if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
5273 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5274 		    MSGF_PRIORITY, ipfw_table_flushall_dispatch);
5275 		netisr_domsg_global(&nm.base);
5276 		return (0);
5277 	}
5278 
5279 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5280 		return (EINVAL);
5281 
5282 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
5283 		return (ENOENT);
5284 
5285 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5286 	    ipfw_table_flush_dispatch);
5287 	nm.tableid = tbl->tableid;
5288 	nm.destroy = 0;
5289 	if (sopt->sopt_name == IP_FW_TBL_DESTROY)
5290 		nm.destroy = 1;
5291 	netisr_domsg_global(&nm.base);
5292 
5293 	return (0);
5294 }
5295 
5296 static int
5297 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
5298 {
5299 	int *cnt = xcnt;
5300 
5301 	(*cnt)++;
5302 	return (0);
5303 }
5304 
5305 static int
5306 ipfw_table_cpent(struct radix_node *rn, void *xcp)
5307 {
5308 	struct ipfw_table_cp *cp = xcp;
5309 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5310 	struct ipfw_ioc_tblent *ioc_te;
5311 #ifdef INVARIANTS
5312 	int cnt;
5313 #endif
5314 
5315 	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
5316 	    cp->te_idx, cp->te_cnt));
5317 	ioc_te = &cp->te[cp->te_idx];
5318 
5319 	if (te->te_nodes->rn_mask != NULL) {
5320 		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
5321 		    *te->te_nodes->rn_mask);
5322 	} else {
5323 		ioc_te->netmask.sin_len = 0;
5324 	}
5325 	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
5326 
5327 	ioc_te->use = te->te_use;
5328 	ioc_te->last_used = te->te_lastuse;
5329 #ifdef INVARIANTS
5330 	cnt = 1;
5331 #endif
5332 
5333 	while ((te = te->te_sibling) != NULL) {
5334 #ifdef INVARIANTS
5335 		++cnt;
5336 #endif
5337 		ioc_te->use += te->te_use;
5338 		if (te->te_lastuse > ioc_te->last_used)
5339 			ioc_te->last_used = te->te_lastuse;
5340 	}
5341 	KASSERT(cnt == netisr_ncpus,
5342 	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
5343 
5344 	cp->te_idx++;
5345 
5346 	return (0);
5347 }
5348 
5349 static int
5350 ipfw_table_get(struct sockopt *sopt)
5351 {
5352 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5353 	struct radix_node_head *rnh;
5354 	struct ipfw_ioc_table *tbl;
5355 	struct ipfw_ioc_tblcont *cont;
5356 	struct ipfw_table_cp cp;
5357 	int cnt = 0, sz;
5358 
5359 	ASSERT_NETISR0;
5360 
5361 	if (sopt->sopt_valsize < sizeof(*tbl))
5362 		return (EINVAL);
5363 
5364 	tbl = sopt->sopt_val;
5365 	if (tbl->tableid < 0) {
5366 		struct ipfw_ioc_tbllist *list;
5367 		int i;
5368 
5369 		/*
5370 		 * List available table ids.
5371 		 */
5372 		for (i = 0; i < ipfw_table_max; ++i) {
5373 			if (ctx->ipfw_tables[i] != NULL)
5374 				++cnt;
5375 		}
5376 
5377 		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
5378 		if (sopt->sopt_valsize < sz) {
5379 			bzero(sopt->sopt_val, sopt->sopt_valsize);
5380 			return (E2BIG);
5381 		}
5382 		list = sopt->sopt_val;
5383 		list->tablecnt = cnt;
5384 
5385 		cnt = 0;
5386 		for (i = 0; i < ipfw_table_max; ++i) {
5387 			if (ctx->ipfw_tables[i] != NULL) {
5388 				KASSERT(cnt < list->tablecnt,
5389 				    ("invalid idx %d, cnt %d",
5390 				     cnt, list->tablecnt));
5391 				list->tables[cnt++] = i;
5392 			}
5393 		}
5394 		sopt->sopt_valsize = sz;
5395 		return (0);
5396 	} else if (tbl->tableid >= ipfw_table_max) {
5397 		return (EINVAL);
5398 	}
5399 
5400 	rnh = ctx->ipfw_tables[tbl->tableid];
5401 	if (rnh == NULL)
5402 		return (ENOENT);
5403 	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
5404 
5405 	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
5406 	if (sopt->sopt_valsize < sz) {
5407 		bzero(sopt->sopt_val, sopt->sopt_valsize);
5408 		return (E2BIG);
5409 	}
5410 	cont = sopt->sopt_val;
5411 	cont->entcnt = cnt;
5412 
5413 	cp.te = cont->ent;
5414 	cp.te_idx = 0;
5415 	cp.te_cnt = cnt;
5416 	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
5417 
5418 	sopt->sopt_valsize = sz;
5419 	return (0);
5420 }
5421 
5422 static void
5423 ipfw_table_add_dispatch(netmsg_t nmsg)
5424 {
5425 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5426 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5427 	struct radix_node_head *rnh;
5428 	struct ipfw_tblent *te;
5429 
5430 	ASSERT_NETISR_NCPUS(mycpuid);
5431 
5432 	rnh = ctx->ipfw_tables[nm->tableid];
5433 
5434 	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
5435 	te->te_nodes->rn_key = (char *)&te->te_key;
5436 	memcpy(&te->te_key, nm->key, sizeof(te->te_key));
5437 
5438 	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
5439 	    te->te_nodes) == NULL) {
5440 		if (mycpuid == 0) {
5441 			kfree(te, M_IPFW);
5442 			netisr_replymsg(&nm->base, EEXIST);
5443 			return;
5444 		}
5445 		panic("rnh_addaddr failed");
5446 	}
5447 
5448 	/* Link siblings. */
5449 	if (nm->sibling != NULL)
5450 		nm->sibling->te_sibling = te;
5451 	nm->sibling = te;
5452 
5453 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5454 }
5455 
5456 static void
5457 ipfw_table_del_dispatch(netmsg_t nmsg)
5458 {
5459 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
5460 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5461 	struct radix_node_head *rnh;
5462 	struct radix_node *rn;
5463 
5464 	ASSERT_NETISR_NCPUS(mycpuid);
5465 
5466 	rnh = ctx->ipfw_tables[nm->tableid];
5467 	rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
5468 	if (rn == NULL) {
5469 		if (mycpuid == 0) {
5470 			netisr_replymsg(&nm->base, ESRCH);
5471 			return;
5472 		}
5473 		panic("rnh_deladdr failed");
5474 	}
5475 	kfree(rn, M_IPFW);
5476 
5477 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5478 }
5479 
5480 static int
5481 ipfw_table_alt(struct sockopt *sopt)
5482 {
5483 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5484 	struct ipfw_ioc_tblcont *tbl;
5485 	struct ipfw_ioc_tblent *te;
5486 	struct sockaddr_in key0;
5487 	struct sockaddr *netmask = NULL, *key;
5488 	struct netmsg_tblent nm;
5489 
5490 	ASSERT_NETISR0;
5491 
5492 	if (sopt->sopt_valsize != sizeof(*tbl))
5493 		return (EINVAL);
5494 	tbl = sopt->sopt_val;
5495 
5496 	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
5497 		return (EINVAL);
5498 	if (tbl->entcnt != 1)
5499 		return (EINVAL);
5500 
5501 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
5502 		return (ENOENT);
5503 	te = &tbl->ent[0];
5504 
5505 	if (te->key.sin_family != AF_INET ||
5506 	    te->key.sin_port != 0 ||
5507 	    te->key.sin_len != sizeof(struct sockaddr_in))
5508 		return (EINVAL);
5509 	key = (struct sockaddr *)&te->key;
5510 
5511 	if (te->netmask.sin_len != 0) {
5512 		if (te->netmask.sin_port != 0 ||
5513 		    te->netmask.sin_len > sizeof(struct sockaddr_in))
5514 			return (EINVAL);
5515 		netmask = (struct sockaddr *)&te->netmask;
5516 		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
5517 		key = (struct sockaddr *)&key0;
5518 	}
5519 
5520 	if (sopt->sopt_name == IP_FW_TBL_ADD) {
5521 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5522 		    MSGF_PRIORITY, ipfw_table_add_dispatch);
5523 	} else {
5524 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5525 		    MSGF_PRIORITY, ipfw_table_del_dispatch);
5526 	}
5527 	nm.key = key;
5528 	nm.netmask = netmask;
5529 	nm.tableid = tbl->tableid;
5530 	nm.sibling = NULL;
5531 	return (netisr_domsg_global(&nm.base));
5532 }
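
/*
 * Userland sketch (illustrative; the sockopt plumbing that reaches
 * this function lives elsewhere in this file): to add 10.0.0.0/8 to
 * table 1 through IP_FW_TBL_ADD, the single-entry ipfw_ioc_tblcont
 * would carry
 *
 *	tableid = 1, entcnt = 1,
 *	ent[0].key:	sin_family = AF_INET, sin_port = 0,
 *			sin_len = sizeof(struct sockaddr_in),
 *			sin_addr = 10.0.0.0,
 *	ent[0].netmask:	sin_len = sizeof(struct sockaddr_in),
 *			sin_port = 0, sin_addr = 255.0.0.0
 */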
5533 
5534 static int
5535 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
5536 {
5537 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5538 
5539 	te->te_use = 0;
5540 	te->te_lastuse = 0;
5541 	return (0);
5542 }
5543 
5544 static void
5545 ipfw_table_zero_dispatch(netmsg_t nmsg)
5546 {
5547 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5548 	struct radix_node_head *rnh;
5549 
5550 	ASSERT_NETISR_NCPUS(mycpuid);
5551 
5552 	rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
5553 	rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5554 
5555 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5556 }
5557 
5558 static void
5559 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
5560 {
5561 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5562 	int i;
5563 
5564 	ASSERT_NETISR_NCPUS(mycpuid);
5565 
5566 	for (i = 0; i < ipfw_table_max; ++i) {
5567 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
5568 
5569 		if (rnh != NULL)
5570 			rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
5571 	}
5572 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5573 }
5574 
5575 static int
5576 ipfw_table_zero(struct sockopt *sopt)
5577 {
5578 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5579 	struct netmsg_base nm;
5580 	struct ipfw_ioc_table *tbl;
5581 
5582 	ASSERT_NETISR0;
5583 
5584 	if (sopt->sopt_valsize != sizeof(*tbl))
5585 		return (EINVAL);
5586 	tbl = sopt->sopt_val;
5587 
5588 	if (tbl->tableid < 0) {
5589 		netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5590 		    ipfw_table_zeroall_dispatch);
5591 		netisr_domsg_global(&nm);
5592 		return (0);
5593 	} else if (tbl->tableid >= ipfw_table_max) {
5594 		return (EINVAL);
5595 	} else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
5596 		return (ENOENT);
5597 	}
5598 
5599 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5600 	    ipfw_table_zero_dispatch);
5601 	nm.lmsg.u.ms_result = tbl->tableid;
5602 	netisr_domsg_global(&nm);
5603 
5604 	return (0);
5605 }
5606 
5607 static int
5608 ipfw_table_killexp(struct radix_node *rn, void *xnm)
5609 {
5610 	struct netmsg_tblexp *nm = xnm;
5611 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
5612 
5613 	if (te->te_expired) {
5614 		ipfw_table_killrn(nm->rnh, rn);
5615 		nm->expcnt++;
5616 	}
5617 	return (0);
5618 }
5619 
5620 static void
5621 ipfw_table_expire_dispatch(netmsg_t nmsg)
5622 {
5623 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
5624 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5625 	struct radix_node_head *rnh;
5626 
5627 	ASSERT_NETISR_NCPUS(mycpuid);
5628 
5629 	rnh = ctx->ipfw_tables[nm->tableid];
5630 	nm->rnh = rnh;
5631 	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
5632 
5633 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
5634 	    ("not all expired addresses (%d) were deleted (%d)",
5635 	     nm->cnt * (mycpuid + 1), nm->expcnt));
5636 
5637 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5638 }
5639 
5640 static void
5641 ipfw_table_expireall_dispatch(netmsg_t nmsg)
5642 {
5643 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
5644 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5645 	int i;
5646 
5647 	ASSERT_NETISR_NCPUS(mycpuid);
5648 
5649 	for (i = 0; i < ipfw_table_max; ++i) {
5650 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
5651 
5652 		if (rnh == NULL)
5653 			continue;
5654 		nm->rnh = rnh;
5655 		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
5656 	}
5657 
5658 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
5659 	    ("not all expired addresses (%d) were deleted (%d)",
5660 	     nm->cnt * (mycpuid + 1), nm->expcnt));
5661 
5662 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5663 }
5664 
5665 static int
5666 ipfw_table_markexp(struct radix_node *rn, void *xnm)
5667 {
5668 	struct netmsg_tblexp *nm = xnm;
5669 	struct ipfw_tblent *te;
5670 	time_t lastuse;
5671 
5672 	te = (struct ipfw_tblent *)rn;
5673 	lastuse = te->te_lastuse;
5674 
5675 	while ((te = te->te_sibling) != NULL) {
5676 		if (te->te_lastuse > lastuse)
5677 			lastuse = te->te_lastuse;
5678 	}
5679 	if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
5680 		/* Not expired */
5681 		return (0);
5682 	}
5683 
5684 	te = (struct ipfw_tblent *)rn;
5685 	te->te_expired = 1;
5686 	while ((te = te->te_sibling) != NULL)
5687 		te->te_expired = 1;
5688 	nm->cnt++;
5689 
5690 	return (0);
5691 }
5692 
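/*
 * Worked example of the staleness test above: with nm->expire == 3600
 * and the newest te_lastuse across the sibling chain at 1000, the
 * entry becomes eligible once time_second reaches 4600, because
 * TIME_LEQ(1000 + 3600, 4600) holds.  Every sibling is marked, so the
 * following ipfw_table_killexp() pass reaps the duplicates on all
 * netisr cpus.
 */
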
5693 static int
5694 ipfw_table_expire(struct sockopt *sopt)
5695 {
5696 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5697 	struct netmsg_tblexp nm;
5698 	struct ipfw_ioc_tblexp *tbl;
5699 	struct radix_node_head *rnh;
5700 
5701 	ASSERT_NETISR0;
5702 
5703 	if (sopt->sopt_valsize != sizeof(*tbl))
5704 		return (EINVAL);
5705 	tbl = sopt->sopt_val;
5706 	tbl->expcnt = 0;
5707 
5708 	nm.expcnt = 0;
5709 	nm.cnt = 0;
5710 	nm.expire = tbl->expire;
5711 
5712 	if (tbl->tableid < 0) {
5713 		int i;
5714 
5715 		for (i = 0; i < ipfw_table_max; ++i) {
5716 			rnh = ctx->ipfw_tables[i];
5717 			if (rnh == NULL)
5718 				continue;
5719 			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
5720 		}
5721 		if (nm.cnt == 0) {
5722 			/* No addresses were eligible for expiry. */
5723 			return (0);
5724 		}
5725 		tbl->expcnt = nm.cnt;
5726 
5727 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5728 		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
5729 		nm.tableid = -1;
5730 		netisr_domsg_global(&nm.base);
5731 		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
5732 		    ("not all expired addresses (%d) were deleted (%d)",
5733 		     nm.cnt * netisr_ncpus, nm.expcnt));
5734 
5735 		return (0);
5736 	} else if (tbl->tableid >= ipfw_table_max) {
5737 		return (EINVAL);
5738 	}
5739 
5740 	rnh = ctx->ipfw_tables[tbl->tableid];
5741 	if (rnh == NULL)
5742 		return (ENOENT);
5743 	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
5744 	if (nm.cnt == 0) {
5745 		/* No addresses were eligible for expiry. */
5746 		return (0);
5747 	}
5748 	tbl->expcnt = nm.cnt;
5749 
5750 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5751 	    ipfw_table_expire_dispatch);
5752 	nm.tableid = tbl->tableid;
5753 	netisr_domsg_global(&nm.base);
5754 	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
5755 	    ("not all expired addresses (%d) were deleted (%d)",
5756 	     nm.cnt * netisr_ncpus, nm.expcnt));
5757 	return (0);
5758 }
5759 
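/*
 * A hedged userland sketch of IP_FW_TBL_EXPIRE, assuming the sockopt
 * path copies the buffer back out (the expcnt update above implies
 * it): expire addresses in table 0 that have been idle for an hour:
 *
 *	struct ipfw_ioc_tblexp tbl;
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	memset(&tbl, 0, sizeof(tbl));
 *	tbl.tableid = 0;
 *	tbl.expire = 3600;
 *	setsockopt(s, IPPROTO_IP, IP_FW_TBL_EXPIRE, &tbl, sizeof(tbl));
 *	printf("%d addresses expired\n", tbl.expcnt);
 */
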
5760 static void
5761 ipfw_crossref_free_dispatch(netmsg_t nmsg)
5762 {
5763 	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
5764 
5765 	KKASSERT((rule->rule_flags &
5766 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
5767 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
5768 	ipfw_free_rule(rule);
5769 
5770 	netisr_replymsg(&nmsg->base, 0);
5771 }
5772 
5773 static void
5774 ipfw_crossref_reap(void)
5775 {
5776 	struct ip_fw *rule, *prev = NULL;
5777 
5778 	ASSERT_NETISR0;
5779 
5780 	rule = ipfw_gd.ipfw_crossref_free;
5781 	while (rule != NULL) {
5782 		uint64_t inflight = 0;
5783 		int i;
5784 
5785 		for (i = 0; i < netisr_ncpus; ++i)
5786 			inflight += rule->cross_rules[i]->cross_refs;
5787 		if (inflight == 0) {
5788 			struct ip_fw *f = rule;
5789 
5790 			/*
5791 			 * Unlink.
5792 			 */
5793 			rule = rule->next;
5794 			if (prev != NULL)
5795 				prev->next = rule;
5796 			else
5797 				ipfw_gd.ipfw_crossref_free = rule;
5798 
5799 			/*
5800 			 * Free.
5801 			 */
5802 			for (i = 1; i < netisr_ncpus; ++i) {
5803 				struct netmsg_base nm;
5804 
5805 				netmsg_init(&nm, NULL, &curthread->td_msgport,
5806 				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
5807 				nm.lmsg.u.ms_resultp = f->cross_rules[i];
5808 				netisr_domsg(&nm, i);
5809 			}
5810 			KKASSERT((f->rule_flags &
5811 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
5812 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
5813 			ipfw_unref_rule(f);
5814 		} else {
5815 			prev = rule;
5816 			rule = rule->next;
5817 		}
5818 	}
5819 
5820 	if (ipfw_gd.ipfw_crossref_free != NULL) {
5821 		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
5822 		    ipfw_crossref_timeo, NULL);
5823 	}
5824 }
5825 
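/*
 * Example of the inflight accounting above: with netisr_ncpus == 4, a
 * rule on the crossref free list is only reclaimed after cross_refs
 * of all four sibling rules (cross_rules[0] through cross_rules[3])
 * have dropped to zero, i.e. no forwarded mbuf still references any
 * duplicate.  Until then, the one-second callout keeps rescheduling
 * ipfw_crossref_timeo() to retry the reap.
 */
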
5826 /*
5827  * {set|get}sockopt parser.
5828  */
5829 static int
5830 ipfw_ctl(struct sockopt *sopt)
5831 {
5832 	int error, rulenum;
5833 	uint32_t *masks;
5834 	size_t size;
5835 
5836 	ASSERT_NETISR0;
5837 
5838 	error = 0;
5839 
5840 	switch (sopt->sopt_name) {
5841 	case IP_FW_GET:
5842 		error = ipfw_ctl_get_rules(sopt);
5843 		break;
5844 
5845 	case IP_FW_FLUSH:
5846 		ipfw_flush(0 /* keep default rule */);
5847 		break;
5848 
5849 	case IP_FW_ADD:
5850 		error = ipfw_ctl_add_rule(sopt);
5851 		break;
5852 
5853 	case IP_FW_DEL:
5854 		/*
5855 		 * IP_FW_DEL is used for deleting single rules or sets,
5856 		 * and (ab)used to atomically manipulate sets.
5857 		 * Argument size is used to distinguish between the two:
5858 		 *    sizeof(uint32_t)
5859 		 *	delete single rule or set of rules,
5860 		 *	or reassign rules (or sets) to a different set.
5861 		 *    2 * sizeof(uint32_t)
5862 		 *	atomic disable/enable sets.
5863 		 *	first uint32_t contains sets to be disabled,
5864 		 *	second uint32_t contains sets to be enabled.
5865 		 */
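		/*
		 * For example (hedged userland sketch), deleting rule
		 * 1000 passes one uint32_t, while atomically disabling
		 * set 1 and enabling set 2 passes two:
		 *
		 *	uint32_t rule = 1000;
		 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL,
		 *	    &rule, sizeof(rule));
		 *
		 *	uint32_t masks2[2] = { 1 << 1, 1 << 2 };
		 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL,
		 *	    masks2, sizeof(masks2));
		 */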
5866 		masks = sopt->sopt_val;
5867 		size = sopt->sopt_valsize;
5868 		if (size == sizeof(*masks)) {
5869 			/*
5870 			 * Delete or reassign static rule
5871 			 */
5872 			error = ipfw_ctl_alter(masks[0]);
5873 		} else if (size == (2 * sizeof(*masks))) {
5874 			/*
5875 			 * Set enable/disable
5876 			 */
5877 			ipfw_ctl_set_disable(masks[0], masks[1]);
5878 		} else {
5879 			error = EINVAL;
5880 		}
5881 		break;
5882 
5883 	case IP_FW_ZERO:
5884 	case IP_FW_RESETLOG: /* argument is an int, the rule number */
5885 		rulenum = 0;
5886 
5887 		if (sopt->sopt_val != 0) {
5888 			error = soopt_to_kbuf(sopt, &rulenum,
5889 			    sizeof(int), sizeof(int));
5890 			if (error)
5891 				break;
5892 		}
5893 		error = ipfw_ctl_zero_entry(rulenum,
5894 			sopt->sopt_name == IP_FW_RESETLOG);
5895 		break;
5896 
5897 	case IP_FW_TBL_CREATE:
5898 		error = ipfw_table_create(sopt);
5899 		break;
5900 
5901 	case IP_FW_TBL_ADD:
5902 	case IP_FW_TBL_DEL:
5903 		error = ipfw_table_alt(sopt);
5904 		break;
5905 
5906 	case IP_FW_TBL_FLUSH:
5907 	case IP_FW_TBL_DESTROY:
5908 		error = ipfw_table_flush(sopt);
5909 		break;
5910 
5911 	case IP_FW_TBL_GET:
5912 		error = ipfw_table_get(sopt);
5913 		break;
5914 
5915 	case IP_FW_TBL_ZERO:
5916 		error = ipfw_table_zero(sopt);
5917 		break;
5918 
5919 	case IP_FW_TBL_EXPIRE:
5920 		error = ipfw_table_expire(sopt);
5921 		break;
5922 
5923 	default:
5924 		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
5925 		error = EINVAL;
5926 	}
5927 
5928 	ipfw_crossref_reap();
5929 	return error;
5930 }
5931 
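/*
 * All of the sockopts handled above ride on a raw IP socket.  A
 * hedged, minimal userland sketch (error handling omitted):
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <net/ipfw/ip_fw2.h>
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	(flush everything but the default rule)
 *	setsockopt(s, IPPROTO_IP, IP_FW_FLUSH, NULL, 0);
 */
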
5932 static void
5933 ipfw_keepalive_done(struct ipfw_context *ctx)
5934 {
5935 
5936 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5937 	    ("keepalive is not in progress"));
5938 	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
5939 	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
5940 	    ipfw_keepalive, NULL);
5941 }
5942 
5943 static void
5944 ipfw_keepalive_more(struct ipfw_context *ctx)
5945 {
5946 	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
5947 
5948 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5949 	    ("keepalive is not in progress"));
5950 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
5951 	    ("keepalive more did not finish"));
5952 	netisr_sendmsg_oncpu(nm);
5953 }
5954 
5955 static void
5956 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
5957 {
5958 	struct ipfw_state *s;
5959 	int scanned = 0, expired = 0, kept = 0;
5960 
5961 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
5962 	    ("keepalive is not in progress"));
5963 
5964 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
5965 		uint32_t ack_rev, ack_fwd;
5966 		struct ipfw_flow_id id;
5967 
5968 		if (scanned++ >= ipfw_state_scan_max) {
5969 			ipfw_keepalive_more(ctx);
5970 			return;
5971 		}
5972 
5973 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
5974 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
5975 
5976 		if (s->st_type == O_ANCHOR)
5977 			continue;
5978 
5979 		if (TIME_LEQ(s->st_expire, time_uptime)) {
5980 			/* State expired. */
5981 			ipfw_state_del(ctx, s);
5982 			if (++expired >= ipfw_state_expire_max) {
5983 				ipfw_keepalive_more(ctx);
5984 				return;
5985 			}
5986 			continue;
5987 		}
5988 
5989 		/*
5990 		 * Keep alive processing
5991 		 */
5992 
5993 		if (s->st_proto != IPPROTO_TCP)
5994 			continue;
5995 		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
5996 			continue;
5997 		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
5998 		    s->st_expire))
5999 			continue;	/* too early */
6000 
6001 		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6002 		    &id.dst_ip, &id.dst_port);
6003 		ack_rev = s->st_ack_rev;
6004 		ack_fwd = s->st_ack_fwd;
6005 
6006 		send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6007 		send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6008 
6009 		if (++kept >= ipfw_keepalive_max) {
6010 			ipfw_keepalive_more(ctx);
6011 			return;
6012 		}
6013 	}
6014 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6015 	ipfw_keepalive_done(ctx);
6016 }
6017 
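/*
 * The two send_pkt() calls above forge one keepalive segment in each
 * direction of the tracked connection, using sequence number ack - 1
 * so that the peer's TCP answers with a bare ACK; that ACK traverses
 * the firewall again and refreshes st_expire.
 */
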
6018 static void
6019 ipfw_keepalive_more_dispatch(netmsg_t nm)
6020 {
6021 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6022 	struct ipfw_state *anchor;
6023 
6024 	ASSERT_NETISR_NCPUS(mycpuid);
6025 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6026 	    ("keepalive is not in progress"));
6027 
6028 	/* Reply ASAP */
6029 	netisr_replymsg(&nm->base, 0);
6030 
6031 	anchor = &ctx->ipfw_keepalive_anch;
6032 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6033 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6034 		ipfw_keepalive_done(ctx);
6035 		return;
6036 	}
6037 	ipfw_keepalive_loop(ctx, anchor);
6038 }
6039 
6040 /*
6041  * Netisr half of the keepalive handling; it is invoked on each
6042  * netisr cpu every dyn_keepalive_period seconds.
6043  */
6044 static void
6045 ipfw_keepalive_dispatch(netmsg_t nm)
6046 {
6047 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6048 	struct ipfw_state *anchor;
6049 
6050 	ASSERT_NETISR_NCPUS(mycpuid);
6051 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6052 	    ("keepalive is in progress"));
6053 	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6054 
6055 	/* Reply ASAP */
6056 	crit_enter();
6057 	netisr_replymsg(&nm->base, 0);
6058 	crit_exit();
6059 
6060 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6061 		ipfw_keepalive_done(ctx);
6062 		return;
6063 	}
6064 
6065 	anchor = &ctx->ipfw_keepalive_anch;
6066 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6067 	ipfw_keepalive_loop(ctx, anchor);
6068 }
6069 
6070 /*
6071  * Callout half of the keepalive handling.  It is invoked every
6072  * dyn_keepalive_period seconds on each netisr cpu.
6073  */
6074 static void
6075 ipfw_keepalive(void *dummy __unused)
6076 {
6077 	struct netmsg_base *msg;
6078 
6079 	KKASSERT(mycpuid < netisr_ncpus);
6080 	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6081 
6082 	crit_enter();
6083 	if (msg->lmsg.ms_flags & MSGF_DONE)
6084 		netisr_sendmsg_oncpu(msg);
6085 	crit_exit();
6086 }
6087 
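/*
 * The MSGF_DONE test above is the usual callout-to-netisr handshake:
 * the prototype netmsg embedded in the per-cpu context is requeued
 * only if its previous run has completed, so a congested netisr never
 * accumulates duplicate keepalive requests.
 */
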
6088 static void
6089 ipfw_ip_input_dispatch(netmsg_t nmsg)
6090 {
6091 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6092 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6093 	struct mbuf *m = nm->m;
6094 	struct ip_fw *rule = nm->arg1;
6095 
6096 	ASSERT_NETISR_NCPUS(mycpuid);
6097 	KASSERT(rule->cpuid == mycpuid,
6098 	    ("rule does not belong to cpu%d", mycpuid));
6099 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6100 	    ("mbuf does not have ipfw continue rule"));
6101 
6102 	KASSERT(ctx->ipfw_cont_rule == NULL,
6103 	    ("pending ipfw continue rule"));
6104 	ctx->ipfw_cont_rule = rule;
6105 	ip_input(m);
6106 
6107 	/*
6108 	 * This rule is no longer in use; decrement its cross_refs
6109 	 * so that the rule can be deleted.
6110 	 */
6111 	rule->cross_refs--;
6112 
6113 	/* May not have been cleared if ipfw was unloaded/disabled. */
6114 	ctx->ipfw_cont_rule = NULL;
6115 }
6116 
6117 static int
6118 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6119 {
6120 	struct ip_fw_args args;
6121 	struct mbuf *m = *m0;
6122 	struct m_tag *mtag;
6123 	int tee = 0, error = 0, ret, cpuid;
6124 	struct netmsg_genpkt *nm;
6125 
6126 	args.cont = 0;
6127 	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6128 		/* Extract info from dummynet tag */
6129 		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6130 		KKASSERT(mtag != NULL);
6131 		args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6132 		KKASSERT(args.rule != NULL);
6133 
6134 		m_tag_delete(m, mtag);
6135 		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6136 	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6137 		struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6138 
6139 		KKASSERT(ctx->ipfw_cont_rule != NULL);
6140 		args.rule = ctx->ipfw_cont_rule;
6141 		ctx->ipfw_cont_rule = NULL;
6142 
6143 		args.cont = 1;
6144 		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6145 	} else {
6146 		args.rule = NULL;
6147 	}
6148 
6149 	args.eh = NULL;
6150 	args.oif = NULL;
6151 	args.m = m;
6152 	ret = ipfw_chk(&args);
6153 	m = args.m;
6154 
6155 	if (m == NULL) {
6156 		error = EACCES;
6157 		goto back;
6158 	}
6159 
6160 	switch (ret) {
6161 	case IP_FW_PASS:
6162 		break;
6163 
6164 	case IP_FW_DENY:
6165 		m_freem(m);
6166 		m = NULL;
6167 		error = EACCES;
6168 		break;
6169 
6170 	case IP_FW_DUMMYNET:
6171 		/* Send packet to the appropriate pipe */
6172 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6173 		break;
6174 
6175 	case IP_FW_TEE:
6176 		tee = 1;
6177 		/* FALL THROUGH */
6178 
6179 	case IP_FW_DIVERT:
6180 		/*
6181 		 * Must clear the bridge tag before diverting the packet.
6182 		 */
6183 		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
6184 		if (ip_divert_p != NULL) {
6185 			m = ip_divert_p(m, tee, 1);
6186 		} else {
6187 			m_freem(m);
6188 			m = NULL;
6189 			/* not sure this is the right error msg */
6190 			error = EACCES;
6191 		}
6192 		break;
6193 
6194 	case IP_FW_CONTINUE:
6195 		KASSERT(m->m_flags & M_HASH, ("no hash"));
6196 		cpuid = netisr_hashcpu(m->m_pkthdr.hash);
6197 		KASSERT(cpuid != mycpuid,
6198 		    ("continue on the same cpu%d", cpuid));
6199 
6200 		/*
6201 		 * NOTE:
6202 		 * Bump cross_refs to prevent this rule and its siblings
6203 		 * from being deleted while this mbuf is in flight.  The
6204 		 * cross_refs of the sibling rule on the target cpu will
6205 		 * be decremented once this mbuf has been filtered
6206 		 * on the target cpu.
6207 		 */
6208 		args.rule->cross_refs++;
6209 		m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6210 
6211 		nm = &m->m_hdr.mh_genmsg;
6212 		netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6213 		    ipfw_ip_input_dispatch);
6214 		nm->m = m;
6215 		nm->arg1 = args.rule->cross_rules[cpuid];
6216 		netisr_sendmsg(&nm->base, cpuid);
6217 
6218 		/* This mbuf is dispatched; no longer valid. */
6219 		m = NULL;
6220 		break;
6221 
6222 	default:
6223 		panic("unknown ipfw return value: %d", ret);
6224 	}
6225 back:
6226 	*m0 = m;
6227 	return error;
6228 }
6229 
6230 static int
6231 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6232 {
6233 	struct ip_fw_args args;
6234 	struct mbuf *m = *m0;
6235 	struct m_tag *mtag;
6236 	int tee = 0, error = 0, ret;
6237 
6238 	args.cont = 0;
6239 	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6240 		/* Extract info from dummynet tag */
6241 		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6242 		KKASSERT(mtag != NULL);
6243 		args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6244 		KKASSERT(args.rule != NULL);
6245 
6246 		m_tag_delete(m, mtag);
6247 		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6248 	} else {
6249 		args.rule = NULL;
6250 	}
6251 
6252 	args.eh = NULL;
6253 	args.m = m;
6254 	args.oif = ifp;
6255 	ret = ipfw_chk(&args);
6256 	m = args.m;
6257 
6258 	if (m == NULL) {
6259 		error = EACCES;
6260 		goto back;
6261 	}
6262 
6263 	switch (ret) {
6264 	case IP_FW_PASS:
6265 		break;
6266 
6267 	case IP_FW_DENY:
6268 		m_freem(m);
6269 		m = NULL;
6270 		error = EACCES;
6271 		break;
6272 
6273 	case IP_FW_DUMMYNET:
6274 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
6275 		break;
6276 
6277 	case IP_FW_TEE:
6278 		tee = 1;
6279 		/* FALL THROUGH */
6280 
6281 	case IP_FW_DIVERT:
6282 		if (ip_divert_p != NULL) {
6283 			m = ip_divert_p(m, tee, 0);
6284 		} else {
6285 			m_freem(m);
6286 			m = NULL;
6287 			/* not sure this is the right error msg */
6288 			error = EACCES;
6289 		}
6290 		break;
6291 
6292 	default:
6293 		panic("unknown ipfw return value: %d", ret);
6294 	}
6295 back:
6296 	*m0 = m;
6297 	return error;
6298 }
6299 
6300 static void
6301 ipfw_hook(void)
6302 {
6303 	struct pfil_head *pfh;
6304 
6305 	ASSERT_NETISR0;
6306 
6307 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
6308 	if (pfh == NULL)
6309 		return;
6310 
6311 	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
6312 	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
6313 }
6314 
6315 static void
6316 ipfw_dehook(void)
6317 {
6318 	struct pfil_head *pfh;
6319 
6320 	ASSERT_NETISR0;
6321 
6322 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
6323 	if (pfh == NULL)
6324 		return;
6325 
6326 	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
6327 	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
6328 }
6329 
6330 static int
6331 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
6332 {
6333 	int dyn_cnt;
6334 
6335 	dyn_cnt = ipfw_state_cntcoll();
6336 	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
6337 
6338 	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
6339 }
6340 
6341 static int
6342 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
6343 {
6344 	int state_cnt;
6345 
6346 	state_cnt = ipfw_state_cntcoll();
6347 	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
6348 }
6349 
6350 static int
6351 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
6352 {
6353 	int state_max, error;
6354 
6355 	state_max = ipfw_state_max;
6356 	error = sysctl_handle_int(oidp, &state_max, 0, req);
6357 	if (error || req->newptr == NULL)
6358 		return (error);
6359 
6360 	if (state_max < 1)
6361 		return (EINVAL);
6362 
6363 	ipfw_state_max_set(state_max);
6364 	return (0);
6365 }
6366 
6367 static int
6368 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
6369 {
6370 	int dyn_max, error;
6371 
6372 	dyn_max = ipfw_state_max + ipfw_track_max;
6373 
6374 	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
6375 	if (error || req->newptr == NULL)
6376 		return (error);
6377 
6378 	if (dyn_max < 2)
6379 		return (EINVAL);
6380 
6381 	ipfw_state_max_set(dyn_max / 2);
6382 	ipfw_track_max = dyn_max / 2;
6383 	return (0);
6384 }
6385 
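/*
 * Example of the legacy dyn_max handling above, assuming the knob is
 * exported as net.inet.ip.fw.dyn_max: writing 10000 splits the limit
 * evenly, i.e. ipfw_state_max becomes 5000 and ipfw_track_max 5000:
 *
 *	sysctl net.inet.ip.fw.dyn_max=10000
 */
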
6386 static void
6387 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
6388 {
6389 	int enable = nmsg->lmsg.u.ms_result;
6390 
6391 	ASSERT_NETISR0;
6392 
6393 	if (fw_enable == enable)
6394 		goto reply;
6395 
6396 	fw_enable = enable;
6397 	if (fw_enable)
6398 		ipfw_hook();
6399 	else
6400 		ipfw_dehook();
6401 reply:
6402 	netisr_replymsg(&nmsg->base, 0);
6403 }
6404 
6405 static int
6406 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
6407 {
6408 	struct netmsg_base nmsg;
6409 	int enable, error;
6410 
6411 	enable = fw_enable;
6412 	error = sysctl_handle_int(oidp, &enable, 0, req);
6413 	if (error || req->newptr == NULL)
6414 		return error;
6415 
6416 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6417 	    ipfw_sysctl_enable_dispatch);
6418 	nmsg.lmsg.u.ms_result = enable;
6419 
6420 	return netisr_domsg(&nmsg, 0);
6421 }
6422 
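/*
 * A hedged usage sketch, assuming the handler above backs a
 * net.inet.ip.fw.enable node: toggling it hooks or unhooks the
 * pfil(9) filters from netisr0:
 *
 *	sysctl net.inet.ip.fw.enable=0	(unhook; traffic bypasses ipfw)
 *	sysctl net.inet.ip.fw.enable=1	(hook ipfw back into pfil)
 */
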
6423 static int
6424 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
6425 {
6426 	return sysctl_int_range(oidp, arg1, arg2, req,
6427 	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
6428 }
6429 
6430 static int
6431 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
6432 {
6433 
6434 	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
6435 }
6436 
6437 static int
6438 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
6439 {
6440 	u_long stat = 0;
6441 	int cpu, error;
6442 
6443 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6444 		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
6445 
6446 	error = sysctl_handle_long(oidp, &stat, 0, req);
6447 	if (error || req->newptr == NULL)
6448 		return (error);
6449 
6450 	/* Zero out this stat. */
6451 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6452 		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
6453 	return (0);
6454 }
6455 
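/*
 * arg2 carries the byte offset of a u_long statistic inside struct
 * ipfw_context, so a single handler serves every per-cpu counter.  A
 * hedged sketch of declaring such a node (the field name is
 * illustrative):
 *
 *	SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, example_stat,
 *	    CTLTYPE_ULONG | CTLFLAG_RW, NULL,
 *	    __offsetof(struct ipfw_context, ipfw_example_stat),
 *	    ipfw_sysctl_stat, "LU", "Example aggregated statistic");
 */
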
6456 static void
6457 ipfw_ctx_init_dispatch(netmsg_t nmsg)
6458 {
6459 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
6460 	struct ipfw_context *ctx;
6461 	struct ip_fw *def_rule;
6462 
6463 	ASSERT_NETISR_NCPUS(mycpuid);
6464 
6465 	ctx = kmalloc(__offsetof(struct ipfw_context,
6466 	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
6467 
6468 	RB_INIT(&ctx->ipfw_state_tree);
6469 	TAILQ_INIT(&ctx->ipfw_state_list);
6470 
6471 	RB_INIT(&ctx->ipfw_track_tree);
6472 	TAILQ_INIT(&ctx->ipfw_track_list);
6473 
6474 	callout_init_mp(&ctx->ipfw_stateto_ch);
6475 	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
6476 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
6477 	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
6478 	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
6479 	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
6480 
6481 	callout_init_mp(&ctx->ipfw_trackto_ch);
6482 	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
6483 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
6484 	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
6485 	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
6486 
6487 	callout_init_mp(&ctx->ipfw_keepalive_ch);
6488 	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
6489 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
6490 	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
6491 	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
6492 	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
6493 
6494 	ipfw_ctx[mycpuid] = ctx;
6495 
6496 	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
6497 
6498 	def_rule->act_ofs = 0;
6499 	def_rule->rulenum = IPFW_DEFAULT_RULE;
6500 	def_rule->cmd_len = 1;
6501 	def_rule->set = IPFW_DEFAULT_SET;
6502 
6503 	def_rule->cmd[0].len = 1;
6504 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
6505 	def_rule->cmd[0].opcode = O_ACCEPT;
6506 #else
6507 	if (filters_default_to_accept)
6508 		def_rule->cmd[0].opcode = O_ACCEPT;
6509 	else
6510 		def_rule->cmd[0].opcode = O_DENY;
6511 #endif
6512 
6513 	def_rule->refcnt = 1;
6514 	def_rule->cpuid = mycpuid;
6515 
6516 	/* Install the default rule */
6517 	ctx->ipfw_default_rule = def_rule;
6518 	ctx->ipfw_layer3_chain = def_rule;
6519 
6520 	/* Link rule CPU sibling */
6521 	ipfw_link_sibling(fwmsg, def_rule);
6522 
6523 	/* Statistics only need to be updated once */
6524 	if (mycpuid == 0)
6525 		ipfw_inc_static_count(def_rule);
6526 
6527 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6528 }
6529 
6530 static void
6531 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
6532 {
6533 
6534 	crit_enter();
6535 	/* Reply ASAP */
6536 	netisr_replymsg(&nmsg->base, 0);
6537 	crit_exit();
6538 	ipfw_crossref_reap();
6539 }
6540 
6541 static void
6542 ipfw_crossref_timeo(void *dummy __unused)
6543 {
6544 	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
6545 
6546 	KKASSERT(mycpuid == 0);
6547 
6548 	crit_enter();
6549 	if (msg->lmsg.ms_flags & MSGF_DONE)
6550 		netisr_sendmsg_oncpu(msg);
6551 	crit_exit();
6552 }
6553 
6554 static void
6555 ipfw_ifaddr_dispatch(netmsg_t nmsg)
6556 {
6557 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6558 	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
6559 	struct ip_fw *f;
6560 
6561 	ASSERT_NETISR_NCPUS(mycpuid);
6562 
6563 	for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
6564 		int l, cmdlen;
6565 		ipfw_insn *cmd;
6566 
6567 		if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
6568 			continue;
6569 
6570 		for (l = f->cmd_len, cmd = f->cmd; l > 0;
6571 		     l -= cmdlen, cmd += cmdlen) {
6572 			cmdlen = F_LEN(cmd);
6573 			if (cmd->opcode == O_IP_SRC_IFIP ||
6574 			    cmd->opcode == O_IP_DST_IFIP) {
6575 				if (strncmp(ifp->if_xname,
6576 				    ((ipfw_insn_ifip *)cmd)->ifname,
6577 				    IFNAMSIZ) == 0)
6578 					cmd->arg1 &= ~IPFW_IFIP_VALID;
6579 			}
6580 		}
6581 	}
6582 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6583 }
6584 
6585 static void
6586 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
6587     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
6588 {
6589 	struct netmsg_base nm;
6590 
6591 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6592 	    ipfw_ifaddr_dispatch);
6593 	nm.lmsg.u.ms_resultp = ifp;
6594 	netisr_domsg_global(&nm);
6595 }
6596 
6597 static void
6598 ipfw_init_dispatch(netmsg_t nmsg)
6599 {
6600 	struct netmsg_ipfw fwmsg;
6601 	int error = 0, cpu;
6602 
6603 	ASSERT_NETISR0;
6604 
6605 	if (IPFW_LOADED) {
6606 		kprintf("IP firewall already loaded\n");
6607 		error = EEXIST;
6608 		goto reply;
6609 	}
6610 
6611 	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
6612 		ipfw_table_max = UINT16_MAX;
6613 
6614 	/* Initialize global track tree. */
6615 	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
6616 	IPFW_TRKCNT_TOKINIT;
6617 
6618 	/* GC for freed crossref rules. */
6619 	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
6620 	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
6621 	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
6622 
6623 	ipfw_state_max_set(ipfw_state_max);
6624 	ipfw_state_headroom = 8 * netisr_ncpus;
6625 
6626 	bzero(&fwmsg, sizeof(fwmsg));
6627 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6628 	    ipfw_ctx_init_dispatch);
6629 	netisr_domsg_global(&fwmsg.base);
6630 
6631 	ip_fw_chk_ptr = ipfw_chk;
6632 	ip_fw_ctl_ptr = ipfw_ctl;
6633 	ip_fw_dn_io_ptr = ipfw_dummynet_io;
6634 
6635 	kprintf("ipfw2 initialized, default to %s, logging ",
6636 		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
6637 		O_ACCEPT ? "accept" : "deny");
6638 
6639 #ifdef IPFIREWALL_VERBOSE
6640 	fw_verbose = 1;
6641 #endif
6642 #ifdef IPFIREWALL_VERBOSE_LIMIT
6643 	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
6644 #endif
6645 	if (fw_verbose == 0) {
6646 		kprintf("disabled\n");
6647 	} else if (verbose_limit == 0) {
6648 		kprintf("unlimited\n");
6649 	} else {
6650 		kprintf("limited to %d packets/entry by default\n",
6651 			verbose_limit);
6652 	}
6653 
6654 	ip_fw_loaded = 1;
6655 	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
6656 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
6657 		    ipfw_state_expire_ipifunc, NULL, cpu);
6658 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
6659 		    ipfw_track_expire_ipifunc, NULL, cpu);
6660 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
6661 		    ipfw_keepalive, NULL, cpu);
6662 	}
6663 
6664 	if (fw_enable)
6665 		ipfw_hook();
6666 
6667 	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
6668 	    NULL, EVENTHANDLER_PRI_ANY);
6669 	if (ipfw_ifaddr_event == NULL)
6670 		kprintf("ipfw: ifaddr_event register failed\n");
6671 
6672 reply:
6673 	netisr_replymsg(&nmsg->base, error);
6674 }
6675 
6676 static int
6677 ipfw_init(void)
6678 {
6679 	struct netmsg_base smsg;
6680 
6681 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6682 	    ipfw_init_dispatch);
6683 	return netisr_domsg(&smsg, 0);
6684 }
6685 
6686 #ifdef KLD_MODULE
6687 
6688 static void
6689 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
6690 {
6691 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6692 
6693 	ASSERT_NETISR_NCPUS(mycpuid);
6694 
6695 	callout_stop_sync(&ctx->ipfw_stateto_ch);
6696 	callout_stop_sync(&ctx->ipfw_trackto_ch);
6697 	callout_stop_sync(&ctx->ipfw_keepalive_ch);
6698 
6699 	crit_enter();
6700 	netisr_dropmsg(&ctx->ipfw_stateexp_more);
6701 	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
6702 	netisr_dropmsg(&ctx->ipfw_trackexp_more);
6703 	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
6704 	netisr_dropmsg(&ctx->ipfw_keepalive_more);
6705 	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
6706 	crit_exit();
6707 
6708 	ipfw_table_flushall_oncpu(ctx, 1);
6709 
6710 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6711 }
6712 
6713 static void
6714 ipfw_fini_dispatch(netmsg_t nmsg)
6715 {
6716 	struct netmsg_base nm;
6717 	int error = 0, cpu;
6718 
6719 	ASSERT_NETISR0;
6720 
6721 	ipfw_crossref_reap();
6722 
6723 	if (ipfw_gd.ipfw_refcnt != 0) {
6724 		error = EBUSY;
6725 		goto reply;
6726 	}
6727 
6728 	ip_fw_loaded = 0;
6729 	ipfw_dehook();
6730 
6731 	/* Synchronize any inflight state/track expire IPIs. */
6732 	lwkt_synchronize_ipiqs("ipfwfini");
6733 
6734 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6735 	    ipfw_ctx_fini_dispatch);
6736 	netisr_domsg_global(&nm);
6737 
6738 	callout_stop_sync(&ipfw_gd.ipfw_crossref_ch);
6739 	crit_enter();
6740 	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
6741 	crit_exit();
6742 
6743 	if (ipfw_ifaddr_event != NULL)
6744 		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
6745 
6746 	ip_fw_chk_ptr = NULL;
6747 	ip_fw_ctl_ptr = NULL;
6748 	ip_fw_dn_io_ptr = NULL;
6749 	ipfw_flush(1 /* kill default rule */);
6750 
6751 	/* Free per-cpu contexts. */
6752 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
6753 		kfree(ipfw_ctx[cpu], M_IPFW);
6754 
6755 	kprintf("IP firewall unloaded\n");
6756 reply:
6757 	netisr_replymsg(&nmsg->base, error);
6758 }
6759 
6760 static int
6761 ipfw_fini(void)
6762 {
6763 	struct netmsg_base smsg;
6764 
6765 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6766 	    ipfw_fini_dispatch);
6767 	return netisr_domsg(&smsg, 0);
6768 }
6769 
6770 #endif	/* KLD_MODULE */
6771 
6772 static int
6773 ipfw_modevent(module_t mod, int type, void *unused)
6774 {
6775 	int err = 0;
6776 
6777 	switch (type) {
6778 	case MOD_LOAD:
6779 		err = ipfw_init();
6780 		break;
6781 
6782 	case MOD_UNLOAD:
6783 #ifndef KLD_MODULE
6784 		kprintf("ipfw statically compiled, cannot unload\n");
6785 		err = EBUSY;
6786 #else
6787 		err = ipfw_fini();
6788 #endif
6789 		break;
6790 	default:
6791 		break;
6792 	}
6793 	return err;
6794 }
6795 
6796 static moduledata_t ipfwmod = {
6797 	"ipfw",
6798 	ipfw_modevent,
6799 	0
6800 };
6801 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
6802 MODULE_VERSION(ipfw, 1);
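/*
 * With the module glue above, a KLD build can be cycled the usual
 * way; MOD_LOAD runs ipfw_init() and MOD_UNLOAD runs ipfw_fini(),
 * which returns EBUSY while the module is still referenced:
 *
 *	kldload ipfw
 *	kldunload ipfw
 */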
6803