xref: /dragonfly/sys/net/ipfw/ip_fw2.c (revision f0e61bb7)
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27 
28 /*
29  * Implement IP packet firewall (new version)
30  */
31 
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53 
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58 
59 #include <sys/thread2.h>
60 #include <net/netmsg2.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in_var.h>
65 #include <netinet/in_pcb.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_icmp.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_seq.h>
71 #include <netinet/tcp_timer.h>
72 #include <netinet/tcp_var.h>
73 #include <netinet/tcpip.h>
74 #include <netinet/udp.h>
75 #include <netinet/udp_var.h>
76 #include <netinet/ip_divert.h>
77 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
78 
79 #include <net/ipfw/ip_fw2.h>
80 
81 #ifdef IPFIREWALL_DEBUG
82 #define DPRINTF(fmt, ...) \
83 do { \
84 	if (fw_debug > 0) \
85 		kprintf(fmt, __VA_ARGS__); \
86 } while (0)
87 #else
88 #define DPRINTF(fmt, ...)	((void)0)
89 #endif
90 
91 /*
92  * Description about per-CPU rule duplication:
93  *
94  * Module loading/unloading and all ioctl operations are serialized
95  * by netisr0, so we don't have any ordering or locking problems.
96  *
97  * Following graph shows how operation on per-CPU rule list is
98  * performed [2 CPU case]:
99  *
100  *   CPU0                 CPU1
101  *
102  * netisr0 <------------------------------------+
103  *  domsg                                       |
104  *    :                                         |
105  *    :(delete/add...)                          |
106  *    :                                         |
107  *    :         netmsg                          | netmsg
108  *  forwardmsg---------->netisr1                |
109  *                          :                   |
110  *                          :(delete/add...)    |
111  *                          :                   |
112  *                          :                   |
113  *                        replymsg--------------+
114  *
115  *
116  *
117  * Rule structure [2 CPU case]
118  *
119  *    CPU0               CPU1
120  *
121  * layer3_chain       layer3_chain
122  *     |                  |
123  *     V                  V
124  * +-------+ sibling  +-------+ sibling
125  * | rule1 |--------->| rule1 |--------->NULL
126  * +-------+          +-------+
127  *     |                  |
128  *     |next              |next
129  *     V                  V
130  * +-------+ sibling  +-------+ sibling
131  * | rule2 |--------->| rule2 |--------->NULL
132  * +-------+          +-------+
133  *
134  * ip_fw.sibling:
135  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
136  *    iterate layer3_chain in netisr0; the current rule's duplication
137  *    to the other CPUs could safely be read-only accessed through
138  *    ip_fw.sibling.
139  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
140  *    a) In netisr0 rule3 is determined to be inserted between rule1
141  *       and rule2.  To make this decision we need to iterate the
142  *       layer3_chain in netisr0.  The netmsg, which is used to insert
143  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
144  *       in netisr0 as next_rule.
145  *    b) After the insertion in netisr0 is done, we will move on to
146  *       netisr1.  But instead of relocating the rule3's position in
147  *       netisr1 by iterating the layer3_chain in netisr1, we set the
148  *       netmsg's prev_rule to rule1->sibling and next_rule to
149  *       rule2->sibling before the netmsg is forwarded to netisr1 from
150  *       netisr0.
151  */
152 
153 /*
154  * Description of states and tracks.
155  *
156  * Both states and tracks are stored in per-cpu RB trees instead of
157  * per-cpu hash tables to avoid the worst case hash degeneration.
158  *
159  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
160  * measured in seconds and depending on the flags.
161  *
162  * When a packet is received, its address fields are first masked with
163  * the mask defined for the rule, then matched against the entries in
164  * the per-cpu state RB tree.  States are generated by 'keep-state'
165  * and 'limit' options.
166  *
167  * The max number of states is ipfw_state_max.  When we reach the
168  * maximum number of states we do not create anymore.  This is done to
169  * avoid consuming too much memory, but also too much time when
170  * searching on each packet.
171  *
172  * Each state holds a pointer to the parent ipfw rule of the current
173  * CPU so we know what action to perform.  States are removed when the
174  * parent rule is deleted.  XXX we should make them survive.
175  *
176  * There are some limitations with states -- we do not obey the
177  * 'randomized match', and we do not do multiple passes through the
178  * firewall.  XXX check the latter!!!
179  *
180  * States grow independently on each CPU, e.g. 2 CPU case:
181  *
182  *        CPU0                     CPU1
183  * ...................      ...................
184  * :  state RB tree  :      :  state RB tree  :
185  * :                 :      :                 :
186  * : state1   state2 :      :      state3     :
187  * :     |    |      :      :        |        :
188  * :.....|....|......:      :........|........:
189  *       |    |                      |
190  *       |    |                      |st_rule
191  *       |    |                      |
192  *       V    V                      V
193  *     +-------+                 +-------+
194  *     | rule1 |                 | rule1 |
195  *     +-------+                 +-------+
196  *
197  * Tracks are used to enforce limits on the number of sessions.  Tracks
198  * are generated by 'limit' option.
199  *
200  * The max number of tracks is ipfw_track_max.  When we reach the
201  * maximum number of tracks we do not create anymore.  This is done to
202  * avoid consuming too much memory.
203  *
204  * Tracks are organized into two layers, track counter RB tree is
205  * shared between CPUs, track RB tree is per-cpu.  States generated by
206  * 'limit' option are linked to the track in addition to the per-cpu
207  * state RB tree; mainly to ease expiration.  e.g. 2 CPU case:
208  *
209  *             ..............................
210  *             :    track counter RB tree   :
211  *             :                            :
212  *             :        +-----------+       :
213  *             :        |  trkcnt1  |       :
214  *             :        |           |       :
215  *             :      +--->counter<----+    :
216  *             :      | |           |  |    :
217  *             :      | +-----------+  |    :
218  *             :......|................|....:
219  *                    |                |
220  *        CPU0        |                |         CPU1
221  * .................  |t_count         |  .................
222  * : track RB tree :  |                |  : track RB tree :
223  * :               :  |                |  :               :
224  * : +-->track1-------+                +--------track2    :
225  * : |     A       :                      :               :
226  * : |     |       :                      :               :
227  * :.|.....|.......:                      :...............:
228  *   |     +----------------+
229  *   | .................... |
230  *   | :   state RB tree  : |st_track
231  *   | :                  : |
232  *   +---state1    state2---+
233  *     :     |       |    :
234  *     :.....|.......|....:
235  *           |       |
236  *           |       |st_rule
237  *           V       V
238  *         +----------+
239  *         |   rule1  |
240  *         +----------+
241  */
242 
243 #define IPFW_AUTOINC_STEP_MIN	1
244 #define IPFW_AUTOINC_STEP_MAX	1000
245 #define IPFW_AUTOINC_STEP_DEF	100
246 
247 #define IPFW_TABLE_MAX_DEF	64
248 
249 #define	IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
250 #define IPFW_DEFAULT_SET	31	/* set number for the default rule */
251 
252 #define MATCH_REVERSE		0
253 #define MATCH_FORWARD		1
254 #define MATCH_NONE		2
255 #define MATCH_UNKNOWN		3
256 
257 #define TIME_LEQ(a, b)		((a) - (b) <= 0)
258 
259 #define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
260 #define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
261 				 (IPFW_STATE_TCPFLAGS << 8))
262 
263 #define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
264 #define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
265 #define BOTH_RST		(TH_RST | (TH_RST << 8))
266 /* TH_ACK here means FIN was ACKed. */
267 #define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))
268 
269 #define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
270 				 (((s)->st_state & BOTH_RST) ||		\
271 				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
272 
273 #define O_ANCHOR		O_NOP
274 
275 #define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
276 #define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
277 				 ((struct ipfw_xlat *)(s))->xlat_invalid)
278 
279 #define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
280 #define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2
281 
282 #define IPFW_XLATE_INSERT	0x0001
283 #define IPFW_XLATE_FORWARD	0x0002
284 #define IPFW_XLATE_OUTPUT	0x0004
285 
286 struct netmsg_ipfw {
287 	struct netmsg_base	base;
288 	const struct ipfw_ioc_rule *ioc_rule;
289 	struct ip_fw		*next_rule;
290 	struct ip_fw		*prev_rule;
291 	struct ip_fw		*sibling;
292 	uint32_t		rule_flags;
293 	struct ip_fw		**cross_rules;
294 };
295 
296 struct netmsg_del {
297 	struct netmsg_base	base;
298 	struct ip_fw		*start_rule;
299 	struct ip_fw		*prev_rule;
300 	uint16_t		rulenum;
301 	uint8_t			from_set;
302 	uint8_t			to_set;
303 };
304 
305 struct netmsg_zent {
306 	struct netmsg_base	base;
307 	struct ip_fw		*start_rule;
308 	uint16_t		rulenum;
309 	uint16_t		log_only;
310 };
311 
312 struct netmsg_cpstate {
313 	struct netmsg_base	base;
314 	struct ipfw_ioc_state	*ioc_state;
315 	int			state_cntmax;
316 	int			state_cnt;
317 };
318 
319 struct netmsg_tblent {
320 	struct netmsg_base	base;
321 	struct sockaddr		*key;
322 	struct sockaddr		*netmask;
323 	struct ipfw_tblent	*sibling;
324 	int			tableid;
325 };
326 
327 struct netmsg_tblflush {
328 	struct netmsg_base	base;
329 	int			tableid;
330 	int			destroy;
331 };
332 
333 struct netmsg_tblexp {
334 	struct netmsg_base	base;
335 	time_t			expire;
336 	int			tableid;
337 	int			cnt;
338 	int			expcnt;
339 	struct radix_node_head	*rnh;
340 };
341 
342 struct ipfw_table_cp {
343 	struct ipfw_ioc_tblent	*te;
344 	int			te_idx;
345 	int			te_cnt;
346 };
347 
348 struct ip_fw_local {
349 	/*
350 	 * offset	The offset of a fragment. offset != 0 means that
351 	 *	we have a fragment at this offset of an IPv4 packet.
352 	 *	offset == 0 means that (if this is an IPv4 packet)
353 	 *	this is the first or only fragment.
354 	 */
355 	u_short			offset;
356 
357 	/*
358 	 * Local copies of addresses. They are only valid if we have
359 	 * an IP packet.
360 	 *
361 	 * proto	The protocol. Set to 0 for non-ip packets,
362 	 *	or to the protocol read from the packet otherwise.
363 	 *	proto != 0 means that we have an IPv4 packet.
364 	 *
365 	 * src_port, dst_port	port numbers, in HOST format. Only
366 	 *	valid for TCP and UDP packets.
367 	 *
368 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
369 	 *	Only valid for IPv4 packets.
370 	 */
371 	uint8_t			proto;
372 	uint16_t		src_port;	/* NOTE: host format	*/
373 	uint16_t		dst_port;	/* NOTE: host format	*/
374 	struct in_addr		src_ip;		/* NOTE: network format	*/
375 	struct in_addr		dst_ip;		/* NOTE: network format	*/
376 	uint16_t		ip_len;		/* NOTE: host format	*/
377 	struct tcphdr		*tcp;
378 };
379 
380 struct ipfw_addrs {
381 	uint32_t		addr1;	/* host byte order */
382 	uint32_t		addr2;	/* host byte order */
383 };
384 
385 struct ipfw_ports {
386 	uint16_t		port1;	/* host byte order */
387 	uint16_t		port2;	/* host byte order */
388 };
389 
390 struct ipfw_key {
391 	union {
392 		struct ipfw_addrs addrs;
393 		uint64_t	value;
394 	} addr_u;
395 	union {
396 		struct ipfw_ports ports;
397 		uint32_t	value;
398 	} port_u;
399 	uint8_t			proto;
400 	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
401 	uint16_t		rsvd2;
402 };
403 
404 #define IPFW_KEY_SWAP_ADDRS	0x1
405 #define IPFW_KEY_SWAP_PORTS	0x2
406 #define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
407 
408 struct ipfw_trkcnt {
409 	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
410 	struct ipfw_key		tc_key;
411 	uintptr_t		tc_ruleid;
412 	int			tc_refs;
413 	int			tc_count;
414 	time_t			tc_expire;	/* userland get-only */
415 	uint16_t		tc_rulenum;	/* userland get-only */
416 } __cachealign;
417 
418 #define tc_addrs		tc_key.addr_u.value
419 #define tc_ports		tc_key.port_u.value
420 #define tc_proto		tc_key.proto
421 #define tc_saddr		tc_key.addr_u.addrs.addr1
422 #define tc_daddr		tc_key.addr_u.addrs.addr2
423 #define tc_sport		tc_key.port_u.ports.port1
424 #define tc_dport		tc_key.port_u.ports.port2
425 
426 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
427 
428 struct ipfw_state;
429 
430 struct ipfw_track {
431 	RB_ENTRY(ipfw_track)	t_rblink;
432 	struct ipfw_key		t_key;
433 	struct ip_fw		*t_rule;
434 	time_t			t_lastexp;
435 	LIST_HEAD(, ipfw_state)	t_state_list;
436 	time_t			t_expire;
437 	volatile int		*t_count;
438 	struct ipfw_trkcnt	*t_trkcnt;
439 	TAILQ_ENTRY(ipfw_track)	t_link;
440 };
441 
442 #define t_addrs			t_key.addr_u.value
443 #define t_ports			t_key.port_u.value
444 #define t_proto			t_key.proto
445 #define t_saddr			t_key.addr_u.addrs.addr1
446 #define t_daddr			t_key.addr_u.addrs.addr2
447 #define t_sport			t_key.port_u.ports.port1
448 #define t_dport			t_key.port_u.ports.port2
449 
450 RB_HEAD(ipfw_track_tree, ipfw_track);
451 TAILQ_HEAD(ipfw_track_list, ipfw_track);
452 
453 struct ipfw_state {
454 	RB_ENTRY(ipfw_state)	st_rblink;
455 	struct ipfw_key		st_key;
456 
457 	time_t			st_expire;	/* expire time */
458 	struct ip_fw		*st_rule;
459 
460 	uint64_t		st_pcnt;	/* packets */
461 	uint64_t		st_bcnt;	/* bytes */
462 
463 	/*
464 	 * st_state:
465 	 * State of this rule, typically a combination of TCP flags.
466 	 *
467 	 * st_ack_fwd/st_ack_rev:
468 	 * Most recent ACKs in forward and reverse direction.  They
469 	 * are used to generate keepalives.
470 	 */
471 	uint32_t		st_state;
472 	uint32_t		st_ack_fwd;	/* host byte order */
473 	uint32_t		st_seq_fwd;	/* host byte order */
474 	uint32_t		st_ack_rev;	/* host byte order */
475 	uint32_t		st_seq_rev;	/* host byte order */
476 
477 	uint16_t		st_flags;	/* IPFW_STATE_F_ */
478 	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
479 	struct ipfw_track	*st_track;
480 
481 	LIST_ENTRY(ipfw_state)	st_trklink;
482 	TAILQ_ENTRY(ipfw_state)	st_link;
483 };
484 
485 #define st_addrs		st_key.addr_u.value
486 #define st_ports		st_key.port_u.value
487 #define st_proto		st_key.proto
488 #define st_swap			st_key.swap
489 
490 #define IPFW_STATE_F_ACKFWD	0x0001
491 #define IPFW_STATE_F_SEQFWD	0x0002
492 #define IPFW_STATE_F_ACKREV	0x0004
493 #define IPFW_STATE_F_SEQREV	0x0008
494 #define IPFW_STATE_F_XLATSRC	0x0010
495 #define IPFW_STATE_F_XLATSLAVE	0x0020
496 #define IPFW_STATE_F_LINKED	0x0040
497 
498 #define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
499 				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
500 
501 /* Expired or being deleted. */
502 #define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) || \
503 				 IPFW_XLAT_INVALID((s)))
504 
505 TAILQ_HEAD(ipfw_state_list, ipfw_state);
506 RB_HEAD(ipfw_state_tree, ipfw_state);
507 
508 struct ipfw_xlat {
509 	struct ipfw_state	xlat_st;	/* MUST be the first field */
510 	uint32_t		xlat_addr;	/* network byte order */
511 	uint16_t		xlat_port;	/* network byte order */
512 	uint16_t		xlat_dir;	/* MATCH_ */
513 	struct ifnet		*xlat_ifp;	/* matching ifnet */
514 	struct ipfw_xlat	*xlat_pair;	/* paired state */
515 	int			xlat_pcpu;	/* paired cpu */
516 	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
517 	volatile uint64_t	xlat_crefs;	/* cross references */
518 	struct netmsg_base	xlat_freenm;	/* for remote free */
519 };
520 
521 #define xlat_type		xlat_st.st_type
522 #define xlat_flags		xlat_st.st_flags
523 #define xlat_rule		xlat_st.st_rule
524 #define xlat_bcnt		xlat_st.st_bcnt
525 #define xlat_pcnt		xlat_st.st_pcnt
526 
527 struct ipfw_tblent {
528 	struct radix_node	te_nodes[2];
529 	struct sockaddr_in	te_key;
530 	u_long			te_use;
531 	time_t			te_lastuse;
532 	struct ipfw_tblent	*te_sibling;
533 	volatile int		te_expired;
534 };
535 
536 struct ipfw_context {
537 	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
538 	struct ip_fw		*ipfw_default_rule;	/* default rule */
539 	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/
540 
541 	/*
542 	 * ipfw_set_disable contains one bit per set value (0..31).
543 	 * If the bit is set, all rules with the corresponding set
544 	 * are disabled.  Set IPDW_DEFAULT_SET is reserved for the
545 	 * default rule and CANNOT be disabled.
546 	 */
547 	uint32_t		ipfw_set_disable;
548 
549 	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */
550 
551 	struct ip_fw		*ipfw_cont_rule;
552 	struct ipfw_xlat	*ipfw_cont_xlat;
553 
554 	struct ipfw_state_tree	ipfw_state_tree;
555 	struct ipfw_state_list	ipfw_state_list;
556 	int			ipfw_state_loosecnt;
557 	int			ipfw_state_cnt;
558 
559 	union {
560 		struct ipfw_state state;
561 		struct ipfw_track track;
562 		struct ipfw_trkcnt trkcnt;
563 	} ipfw_tmpkey;
564 
565 	struct ipfw_track_tree	ipfw_track_tree;
566 	struct ipfw_track_list	ipfw_track_list;
567 	struct ipfw_trkcnt	*ipfw_trkcnt_spare;
568 
569 	struct callout		ipfw_stateto_ch;
570 	time_t			ipfw_state_lastexp;
571 	struct netmsg_base	ipfw_stateexp_nm;
572 	struct netmsg_base	ipfw_stateexp_more;
573 	struct ipfw_state	ipfw_stateexp_anch;
574 
575 	struct callout		ipfw_trackto_ch;
576 	time_t			ipfw_track_lastexp;
577 	struct netmsg_base	ipfw_trackexp_nm;
578 	struct netmsg_base	ipfw_trackexp_more;
579 	struct ipfw_track	ipfw_trackexp_anch;
580 
581 	struct callout		ipfw_keepalive_ch;
582 	struct netmsg_base	ipfw_keepalive_nm;
583 	struct netmsg_base	ipfw_keepalive_more;
584 	struct ipfw_state	ipfw_keepalive_anch;
585 
586 	struct callout		ipfw_xlatreap_ch;
587 	struct netmsg_base	ipfw_xlatreap_nm;
588 	struct ipfw_state_list	ipfw_xlatreap;
589 
590 	/*
591 	 * Statistics
592 	 */
593 	u_long			ipfw_sts_reap;
594 	u_long			ipfw_sts_reapfailed;
595 	u_long			ipfw_sts_overflow;
596 	u_long			ipfw_sts_nomem;
597 	u_long			ipfw_sts_tcprecycled;
598 
599 	u_long			ipfw_tks_nomem;
600 	u_long			ipfw_tks_reap;
601 	u_long			ipfw_tks_reapfailed;
602 	u_long			ipfw_tks_overflow;
603 	u_long			ipfw_tks_cntnomem;
604 
605 	u_long			ipfw_frags;
606 	u_long			ipfw_defraged;
607 	u_long			ipfw_defrag_remote;
608 
609 	u_long			ipfw_xlated;
610 	u_long			ipfw_xlate_split;
611 	u_long			ipfw_xlate_conflicts;
612 	u_long			ipfw_xlate_cresolved;
613 
614 	/* Last field */
615 	struct radix_node_head	*ipfw_tables[];
616 };
617 
618 #define IPFW_FLAG_KEEPALIVE	0x01
619 #define IPFW_FLAG_STATEEXP	0x02
620 #define IPFW_FLAG_TRACKEXP	0x04
621 #define IPFW_FLAG_STATEREAP	0x08
622 #define IPFW_FLAG_TRACKREAP	0x10
623 
624 #define ipfw_state_tmpkey	ipfw_tmpkey.state
625 #define ipfw_track_tmpkey	ipfw_tmpkey.track
626 #define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt
627 
628 struct ipfw_global {
629 	int			ipfw_state_loosecnt;	/* cache aligned */
630 	time_t			ipfw_state_globexp __cachealign;
631 
632 	struct lwkt_token	ipfw_trkcnt_token __cachealign;
633 	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
634 	int			ipfw_trkcnt_cnt;
635 	time_t			ipfw_track_globexp;
636 
637 	/* Accessed in netisr0. */
638 	struct ip_fw		*ipfw_crossref_free __cachealign;
639 	struct callout		ipfw_crossref_ch;
640 	struct netmsg_base	ipfw_crossref_nm;
641 
642 #ifdef KLD_MODULE
643 	/*
644 	 * Module can not be unloaded, if there are references to
645 	 * certains rules of ipfw(4), e.g. dummynet(4)
646 	 */
647 	int			ipfw_refcnt __cachealign;
648 #endif
649 } __cachealign;
650 
651 static struct ipfw_context	*ipfw_ctx[MAXCPU];
652 
653 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
654 
655 /*
656  * Following two global variables are accessed and updated only
657  * in netisr0.
658  */
659 static uint32_t static_count;	/* # of static rules */
660 static uint32_t static_ioc_len;	/* bytes of static rules */
661 
662 /*
663  * If 1, then ipfw static rules are being flushed,
664  * ipfw_chk() will skip to the default rule.
665  */
666 static int ipfw_flushing;
667 
668 static int fw_verbose;
669 static int verbose_limit;
670 
671 static int fw_debug;
672 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
673 
674 static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;
675 
676 static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
677 static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
678 
679 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
680 
681 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
682 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
683     "Firewall statistics");
684 
685 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
686     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
687 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
688     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
689     "Rule number autincrement step");
690 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW,
691     &fw_one_pass, 0,
692     "Only do a single pass through ipfw when using dummynet(4)");
693 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
694     &fw_debug, 0, "Enable printing of debug ip_fw statements");
695 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
696     &fw_verbose, 0, "Log matches to ipfw rules");
697 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
698     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
699 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
700     &ipfw_table_max, 0, "Max # of tables");
701 
702 static int	ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
703 static int	ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
704 static int	ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
705 static int	ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
706 static int	ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
707 static int	ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
708 
709 /*
710  * Timeouts for various events in handing states.
711  *
712  * NOTE:
713  * 1 == 0~1 second.
714  * 2 == 1~2 second(s).
715  *
716  * We use 2 seconds for FIN lifetime, so that the states will not be
717  * ripped prematurely.
718  */
719 static uint32_t dyn_ack_lifetime = 300;
720 static uint32_t dyn_syn_lifetime = 20;
721 static uint32_t dyn_finwait_lifetime = 20;
722 static uint32_t dyn_fin_lifetime = 2;
723 static uint32_t dyn_rst_lifetime = 2;
724 static uint32_t dyn_udp_lifetime = 10;
725 static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */
726 
727 /*
728  * Keepalives are sent if dyn_keepalive is set. They are sent every
729  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
730  * seconds of lifetime of a rule.
731  */
732 static uint32_t dyn_keepalive_interval = 20;
733 static uint32_t dyn_keepalive_period = 5;
734 static uint32_t dyn_keepalive = 1;	/* do send keepalives */
735 
736 static struct ipfw_global	ipfw_gd;
737 static int	ipfw_state_loosecnt_updthr;
738 static int	ipfw_state_max = 4096;	/* max # of states */
739 static int	ipfw_track_max = 4096;	/* max # of tracks */
740 
741 static int	ipfw_state_headroom;	/* setup at module load time */
742 static int	ipfw_state_reap_min = 8;
743 static int	ipfw_state_expire_max = 32;
744 static int	ipfw_state_scan_max = 256;
745 static int	ipfw_keepalive_max = 8;
746 static int	ipfw_track_reap_max = 4;
747 static int	ipfw_track_expire_max = 16;
748 static int	ipfw_track_scan_max = 128;
749 
750 static eventhandler_tag ipfw_ifaddr_event;
751 
752 /* Compat */
753 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
754     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
755     "Number of states and tracks");
756 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
757     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
758     "Max number of states and tracks");
759 
760 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
761     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
762     "Number of states");
763 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
764     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
765     "Max number of states");
766 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
767     &ipfw_state_headroom, 0, "headroom for state reap");
768 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
769     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
770 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
771     &ipfw_track_max, 0, "Max number of tracks");
772 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
773     &static_count, 0, "Number of static rules");
774 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
775     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
776 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
777     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
778 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
779     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
780 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
781     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
782 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
783     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
784 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
785     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
786 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
787     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
788 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
789     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
790 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
791     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
792     "I", "# of states to scan for each expire iteration");
793 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
794     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
795     "I", "# of states to expire for each expire iteration");
796 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
797     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
798     "I", "# of states to expire for each expire iteration");
799 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
800     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
801     "I", "# of states to reap for state shortage");
802 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
803     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
804     "I", "# of tracks to scan for each expire iteration");
805 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
806     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
807     "I", "# of tracks to expire for each expire iteration");
808 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
809     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
810     "I", "# of tracks to reap for track shortage");
811 
812 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
813     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
814     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
815     "LU", "# of state reaps due to states shortage");
816 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
817     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
818     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
819     "LU", "# of state reap failure");
820 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
821     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
822     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
823     "LU", "# of state overflow");
824 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
825     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
826     __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
827     "LU", "# of state allocation failure");
828 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
829     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
830     __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
831     "LU", "# of state deleted due to fast TCP port recycling");
832 
833 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
834     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
835     __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
836     "LU", "# of track allocation failure");
837 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
838     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
839     __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
840     "LU", "# of track reap due to tracks shortage");
841 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
842     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
843     __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
844     "LU", "# of track reap failure");
845 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
846     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
847     __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
848     "LU", "# of track overflow");
849 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
850     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
851     __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
852     "LU", "# of track counter allocation failure");
853 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
854     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
855     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
856     "LU", "# of IP fragements defraged");
857 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
858     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
859     __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
860     "LU", "# of IP packets after defrag");
861 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
862     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
863     __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
864     "LU", "# of IP packets after defrag dispatched to remote cpus");
865 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
866     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
867     __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
868     "LU", "# address/port translations");
869 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
870     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
871     __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
872     "LU", "# address/port translations split between different cpus");
873 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
874     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
875     __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
876     "LU", "# address/port translations conflicts on remote cpu");
877 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
878     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
879     __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
880     "LU", "# address/port translations conflicts resolved on remote cpu");
881 
882 static int		ipfw_state_cmp(struct ipfw_state *,
883 			    struct ipfw_state *);
884 static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
885 			    struct ipfw_trkcnt *);
886 static int		ipfw_track_cmp(struct ipfw_track *,
887 			    struct ipfw_track *);
888 
889 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
890 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
891 
892 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
893 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
894 
895 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
896 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
897 
898 static int		ipfw_chk(struct ip_fw_args *);
899 static void		ipfw_track_expire_ipifunc(void *);
900 static void		ipfw_state_expire_ipifunc(void *);
901 static void		ipfw_keepalive(void *);
902 static int		ipfw_state_expire_start(struct ipfw_context *,
903 			    int, int);
904 static void		ipfw_crossref_timeo(void *);
905 static void		ipfw_state_remove(struct ipfw_context *,
906 			    struct ipfw_state *);
907 static void		ipfw_xlat_reap_timeo(void *);
908 static void		ipfw_defrag_redispatch(struct mbuf *, int,
909 			    struct ip_fw *);
910 
911 #define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
912 #define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
913 #define IPFW_TRKCNT_TOKINIT	\
914 	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
915 
916 static void
917 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
918     const struct sockaddr *netmask)
919 {
920 	const u_char *cp1 = (const u_char *)src;
921 	u_char *cp2 = (u_char *)dst;
922 	const u_char *cp3 = (const u_char *)netmask;
923 	u_char *cplim = cp2 + *cp3;
924 	u_char *cplim2 = cp2 + *cp1;
925 
926 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
927 	cp3 += 2;
928 	if (cplim > cplim2)
929 		cplim = cplim2;
930 	while (cp2 < cplim)
931 		*cp2++ = *cp1++ & *cp3++;
932 	if (cp2 < cplim2)
933 		bzero(cp2, cplim2 - cp2);
934 }
935 
936 static __inline uint16_t
937 pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
938 {
939 	uint32_t l;
940 
941 	if (udp && !cksum)
942 		return (0x0000);
943 	l = cksum + old - new;
944 	l = (l >> 16) + (l & 65535);
945 	l = l & 65535;
946 	if (udp && !l)
947 		return (0xFFFF);
948 	return (l);
949 }
950 
951 static __inline void
952 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
953     in_addr_t daddr, uint16_t dport, uint8_t proto)
954 {
955 
956 	key->proto = proto;
957 	key->swap = 0;
958 
959 	if (saddr < daddr) {
960 		key->addr_u.addrs.addr1 = daddr;
961 		key->addr_u.addrs.addr2 = saddr;
962 		key->swap |= IPFW_KEY_SWAP_ADDRS;
963 	} else {
964 		key->addr_u.addrs.addr1 = saddr;
965 		key->addr_u.addrs.addr2 = daddr;
966 	}
967 
968 	if (sport < dport) {
969 		key->port_u.ports.port1 = dport;
970 		key->port_u.ports.port2 = sport;
971 		key->swap |= IPFW_KEY_SWAP_PORTS;
972 	} else {
973 		key->port_u.ports.port1 = sport;
974 		key->port_u.ports.port2 = dport;
975 	}
976 
977 	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
978 		key->swap |= IPFW_KEY_SWAP_PORTS;
979 	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
980 		key->swap |= IPFW_KEY_SWAP_ADDRS;
981 }
982 
983 static __inline void
984 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
985     in_addr_t *daddr, uint16_t *dport)
986 {
987 
988 	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
989 		*saddr = key->addr_u.addrs.addr2;
990 		*daddr = key->addr_u.addrs.addr1;
991 	} else {
992 		*saddr = key->addr_u.addrs.addr1;
993 		*daddr = key->addr_u.addrs.addr2;
994 	}
995 
996 	if (key->swap & IPFW_KEY_SWAP_PORTS) {
997 		*sport = key->port_u.ports.port2;
998 		*dport = key->port_u.ports.port1;
999 	} else {
1000 		*sport = key->port_u.ports.port1;
1001 		*dport = key->port_u.ports.port2;
1002 	}
1003 }
1004 
1005 static int
1006 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1007 {
1008 
1009 	if (s1->st_proto > s2->st_proto)
1010 		return (1);
1011 	if (s1->st_proto < s2->st_proto)
1012 		return (-1);
1013 
1014 	if (s1->st_addrs > s2->st_addrs)
1015 		return (1);
1016 	if (s1->st_addrs < s2->st_addrs)
1017 		return (-1);
1018 
1019 	if (s1->st_ports > s2->st_ports)
1020 		return (1);
1021 	if (s1->st_ports < s2->st_ports)
1022 		return (-1);
1023 
1024 	if (s1->st_swap == s2->st_swap ||
1025 	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1026 		return (0);
1027 
1028 	if (s1->st_swap > s2->st_swap)
1029 		return (1);
1030 	else
1031 		return (-1);
1032 }
1033 
1034 static int
1035 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1036 {
1037 
1038 	if (t1->tc_proto > t2->tc_proto)
1039 		return (1);
1040 	if (t1->tc_proto < t2->tc_proto)
1041 		return (-1);
1042 
1043 	if (t1->tc_addrs > t2->tc_addrs)
1044 		return (1);
1045 	if (t1->tc_addrs < t2->tc_addrs)
1046 		return (-1);
1047 
1048 	if (t1->tc_ports > t2->tc_ports)
1049 		return (1);
1050 	if (t1->tc_ports < t2->tc_ports)
1051 		return (-1);
1052 
1053 	if (t1->tc_ruleid > t2->tc_ruleid)
1054 		return (1);
1055 	if (t1->tc_ruleid < t2->tc_ruleid)
1056 		return (-1);
1057 
1058 	return (0);
1059 }
1060 
1061 static int
1062 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1063 {
1064 
1065 	if (t1->t_proto > t2->t_proto)
1066 		return (1);
1067 	if (t1->t_proto < t2->t_proto)
1068 		return (-1);
1069 
1070 	if (t1->t_addrs > t2->t_addrs)
1071 		return (1);
1072 	if (t1->t_addrs < t2->t_addrs)
1073 		return (-1);
1074 
1075 	if (t1->t_ports > t2->t_ports)
1076 		return (1);
1077 	if (t1->t_ports < t2->t_ports)
1078 		return (-1);
1079 
1080 	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1081 		return (1);
1082 	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1083 		return (-1);
1084 
1085 	return (0);
1086 }
1087 
1088 static __inline struct ipfw_state *
1089 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1090 {
1091 	struct ipfw_state *dup;
1092 
1093 	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1094 	    ("state %p was linked", s));
1095 	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1096 	if (dup == NULL) {
1097 		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1098 		s->st_flags |= IPFW_STATE_F_LINKED;
1099 	}
1100 	return (dup);
1101 }
1102 
1103 static __inline void
1104 ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
1105 {
1106 
1107 	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
1108 	    ("state %p was not linked", s));
1109 	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1110 	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1111 	s->st_flags &= ~IPFW_STATE_F_LINKED;
1112 }
1113 
1114 static void
1115 ipfw_state_max_set(int state_max)
1116 {
1117 
1118 	ipfw_state_max = state_max;
1119 	/* Allow 5% states over-allocation. */
1120 	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1121 }
1122 
1123 static __inline int
1124 ipfw_state_cntcoll(void)
1125 {
1126 	int cpu, state_cnt = 0;
1127 
1128 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1129 		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1130 	return (state_cnt);
1131 }
1132 
1133 static __inline int
1134 ipfw_state_cntsync(void)
1135 {
1136 	int state_cnt;
1137 
1138 	state_cnt = ipfw_state_cntcoll();
1139 	ipfw_gd.ipfw_state_loosecnt = state_cnt;
1140 	return (state_cnt);
1141 }
1142 
1143 static __inline int
1144 ipfw_free_rule(struct ip_fw *rule)
1145 {
1146 	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1147 	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1148 	rule->refcnt--;
1149 	if (rule->refcnt == 0) {
1150 		if (rule->cross_rules != NULL)
1151 			kfree(rule->cross_rules, M_IPFW);
1152 		kfree(rule, M_IPFW);
1153 		return 1;
1154 	}
1155 	return 0;
1156 }
1157 
1158 static void
1159 ipfw_unref_rule(void *priv)
1160 {
1161 	ipfw_free_rule(priv);
1162 #ifdef KLD_MODULE
1163 	KASSERT(ipfw_gd.ipfw_refcnt > 0,
1164 	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1165 	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1166 #endif
1167 }
1168 
1169 static __inline void
1170 ipfw_ref_rule(struct ip_fw *rule)
1171 {
1172 	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1173 #ifdef KLD_MODULE
1174 	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1175 #endif
1176 	rule->refcnt++;
1177 }
1178 
1179 /*
1180  * This macro maps an ip pointer into a layer3 header pointer of type T
1181  */
1182 #define	L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1183 
1184 static __inline int
1185 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1186 {
1187 	int type = L3HDR(struct icmp,ip)->icmp_type;
1188 	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1189 	int idx = type / 32;
1190 
1191 	if (idx >= idx_max)
1192 		return (0);
1193 	return (cmd->d[idx] & (1 << (type % 32)));
1194 }
1195 
1196 static __inline int
1197 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1198 {
1199 	int code = L3HDR(struct icmp,ip)->icmp_code;
1200 	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1201 	int idx = code / 32;
1202 
1203 	if (idx >= idx_max)
1204 		return (0);
1205 	return (cmd->d[idx] & (1 << (code % 32)));
1206 }
1207 
1208 #define TT	((1 << ICMP_ECHO) | \
1209 		 (1 << ICMP_ROUTERSOLICIT) | \
1210 		 (1 << ICMP_TSTAMP) | \
1211 		 (1 << ICMP_IREQ) | \
1212 		 (1 << ICMP_MASKREQ))
1213 
1214 static int
1215 is_icmp_query(struct ip *ip)
1216 {
1217 	int type = L3HDR(struct icmp, ip)->icmp_type;
1218 
1219 	return (type < 32 && (TT & (1 << type)));
1220 }
1221 
1222 #undef TT
1223 
1224 /*
1225  * The following checks use two arrays of 8 or 16 bits to store the
1226  * bits that we want set or clear, respectively. They are in the
1227  * low and high half of cmd->arg1 or cmd->d[0].
1228  *
1229  * We scan options and store the bits we find set. We succeed if
1230  *
1231  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1232  *
1233  * The code is sometimes optimized not to store additional variables.
1234  */
1235 static int
1236 flags_match(ipfw_insn *cmd, uint8_t bits)
1237 {
1238 	u_char want_clear;
1239 	bits = ~bits;
1240 
1241 	if (((cmd->arg1 & 0xff) & bits) != 0)
1242 		return 0; /* some bits we want set were clear */
1243 
1244 	want_clear = (cmd->arg1 >> 8) & 0xff;
1245 	if ((want_clear & bits) != want_clear)
1246 		return 0; /* some bits we want clear were set */
1247 	return 1;
1248 }
1249 
1250 static int
1251 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1252 {
1253 	int optlen, bits = 0;
1254 	u_char *cp = (u_char *)(ip + 1);
1255 	int x = (ip->ip_hl << 2) - sizeof(struct ip);
1256 
1257 	for (; x > 0; x -= optlen, cp += optlen) {
1258 		int opt = cp[IPOPT_OPTVAL];
1259 
1260 		if (opt == IPOPT_EOL)
1261 			break;
1262 
1263 		if (opt == IPOPT_NOP) {
1264 			optlen = 1;
1265 		} else {
1266 			optlen = cp[IPOPT_OLEN];
1267 			if (optlen <= 0 || optlen > x)
1268 				return 0; /* invalid or truncated */
1269 		}
1270 
1271 		switch (opt) {
1272 		case IPOPT_LSRR:
1273 			bits |= IP_FW_IPOPT_LSRR;
1274 			break;
1275 
1276 		case IPOPT_SSRR:
1277 			bits |= IP_FW_IPOPT_SSRR;
1278 			break;
1279 
1280 		case IPOPT_RR:
1281 			bits |= IP_FW_IPOPT_RR;
1282 			break;
1283 
1284 		case IPOPT_TS:
1285 			bits |= IP_FW_IPOPT_TS;
1286 			break;
1287 
1288 		default:
1289 			break;
1290 		}
1291 	}
1292 	return (flags_match(cmd, bits));
1293 }
1294 
1295 static int
1296 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1297 {
1298 	int optlen, bits = 0;
1299 	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1300 	u_char *cp = (u_char *)(tcp + 1);
1301 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1302 
1303 	for (; x > 0; x -= optlen, cp += optlen) {
1304 		int opt = cp[0];
1305 
1306 		if (opt == TCPOPT_EOL)
1307 			break;
1308 
1309 		if (opt == TCPOPT_NOP) {
1310 			optlen = 1;
1311 		} else {
1312 			optlen = cp[1];
1313 			if (optlen <= 0)
1314 				break;
1315 		}
1316 
1317 		switch (opt) {
1318 		case TCPOPT_MAXSEG:
1319 			bits |= IP_FW_TCPOPT_MSS;
1320 			break;
1321 
1322 		case TCPOPT_WINDOW:
1323 			bits |= IP_FW_TCPOPT_WINDOW;
1324 			break;
1325 
1326 		case TCPOPT_SACK_PERMITTED:
1327 		case TCPOPT_SACK:
1328 			bits |= IP_FW_TCPOPT_SACK;
1329 			break;
1330 
1331 		case TCPOPT_TIMESTAMP:
1332 			bits |= IP_FW_TCPOPT_TS;
1333 			break;
1334 
1335 		case TCPOPT_CC:
1336 		case TCPOPT_CCNEW:
1337 		case TCPOPT_CCECHO:
1338 			bits |= IP_FW_TCPOPT_CC;
1339 			break;
1340 
1341 		default:
1342 			break;
1343 		}
1344 	}
1345 	return (flags_match(cmd, bits));
1346 }
1347 
1348 static int
1349 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1350 {
1351 	if (ifp == NULL)	/* no iface with this packet, match fails */
1352 		return 0;
1353 
1354 	/* Check by name or by IP address */
1355 	if (cmd->name[0] != '\0') { /* match by name */
1356 		/* Check name */
1357 		if (cmd->p.glob) {
1358 			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1359 				return(1);
1360 		} else {
1361 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1362 				return(1);
1363 		}
1364 	} else {
1365 		struct ifaddr_container *ifac;
1366 
1367 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1368 			struct ifaddr *ia = ifac->ifa;
1369 
1370 			if (ia->ifa_addr == NULL)
1371 				continue;
1372 			if (ia->ifa_addr->sa_family != AF_INET)
1373 				continue;
1374 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1375 			    (ia->ifa_addr))->sin_addr.s_addr)
1376 				return(1);	/* match */
1377 		}
1378 	}
1379 	return(0);	/* no match, fail ... */
1380 }
1381 
1382 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
1383 
1384 /*
1385  * We enter here when we have a rule with O_LOG.
1386  * XXX this function alone takes about 2Kbytes of code!
1387  */
1388 static void
1389 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1390     struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1391 {
1392 	char *action;
1393 	int limit_reached = 0;
1394 	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1395 
1396 	fragment[0] = '\0';
1397 	proto[0] = '\0';
1398 
1399 	if (f == NULL) {	/* bogus pkt */
1400 		if (verbose_limit != 0 &&
1401 		    ctx->ipfw_norule_counter >= verbose_limit)
1402 			return;
1403 		ctx->ipfw_norule_counter++;
1404 		if (ctx->ipfw_norule_counter == verbose_limit)
1405 			limit_reached = verbose_limit;
1406 		action = "Refuse";
1407 	} else {	/* O_LOG is the first action, find the real one */
1408 		ipfw_insn *cmd = ACTION_PTR(f);
1409 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1410 
1411 		if (l->max_log != 0 && l->log_left == 0)
1412 			return;
1413 		l->log_left--;
1414 		if (l->log_left == 0)
1415 			limit_reached = l->max_log;
1416 		cmd += F_LEN(cmd);	/* point to first action */
1417 		if (cmd->opcode == O_PROB)
1418 			cmd += F_LEN(cmd);
1419 
1420 		action = action2;
1421 		switch (cmd->opcode) {
1422 		case O_DENY:
1423 			action = "Deny";
1424 			break;
1425 
1426 		case O_REJECT:
1427 			if (cmd->arg1==ICMP_REJECT_RST) {
1428 				action = "Reset";
1429 			} else if (cmd->arg1==ICMP_UNREACH_HOST) {
1430 				action = "Reject";
1431 			} else {
1432 				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1433 					  cmd->arg1);
1434 			}
1435 			break;
1436 
1437 		case O_ACCEPT:
1438 			action = "Accept";
1439 			break;
1440 
1441 		case O_COUNT:
1442 			action = "Count";
1443 			break;
1444 
1445 		case O_DIVERT:
1446 			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1447 			break;
1448 
1449 		case O_TEE:
1450 			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1451 			break;
1452 
1453 		case O_SKIPTO:
1454 			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1455 			break;
1456 
1457 		case O_PIPE:
1458 			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1459 			break;
1460 
1461 		case O_QUEUE:
1462 			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1463 			break;
1464 
1465 		case O_FORWARD_IP:
1466 			{
1467 				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1468 				int len;
1469 
1470 				len = ksnprintf(SNPARGS(action2, 0),
1471 				    "Forward to %s",
1472 				    kinet_ntoa(sa->sa.sin_addr, abuf));
1473 				if (sa->sa.sin_port) {
1474 					ksnprintf(SNPARGS(action2, len), ":%d",
1475 						  sa->sa.sin_port);
1476 				}
1477 			}
1478 			break;
1479 
1480 		default:
1481 			action = "UNKNOWN";
1482 			break;
1483 		}
1484 	}
1485 
1486 	if (hlen == 0) {	/* non-ip */
1487 		ksnprintf(SNPARGS(proto, 0), "MAC");
1488 	} else {
1489 		struct ip *ip = mtod(m, struct ip *);
1490 		/* these three are all aliases to the same thing */
1491 		struct icmp *const icmp = L3HDR(struct icmp, ip);
1492 		struct tcphdr *const tcp = (struct tcphdr *)icmp;
1493 		struct udphdr *const udp = (struct udphdr *)icmp;
1494 
1495 		int ip_off, offset, ip_len;
1496 		int len;
1497 
1498 		ip_off = ntohs(ip->ip_off);
1499 		ip_len = ntohs(ip->ip_len);
1500 		offset = ip_off & IP_OFFMASK;
1501 
1502 		switch (ip->ip_p) {
1503 		case IPPROTO_TCP:
1504 			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1505 					kinet_ntoa(ip->ip_src, abuf));
1506 			if (offset == 0) {
1507 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1508 					  ntohs(tcp->th_sport),
1509 					  kinet_ntoa(ip->ip_dst, abuf),
1510 					  ntohs(tcp->th_dport));
1511 			} else {
1512 				ksnprintf(SNPARGS(proto, len), " %s",
1513 					  kinet_ntoa(ip->ip_dst, abuf));
1514 			}
1515 			break;
1516 
1517 		case IPPROTO_UDP:
1518 			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1519 					kinet_ntoa(ip->ip_src, abuf));
1520 			if (offset == 0) {
1521 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1522 					  ntohs(udp->uh_sport),
1523 					  kinet_ntoa(ip->ip_dst, abuf),
1524 					  ntohs(udp->uh_dport));
1525 			} else {
1526 				ksnprintf(SNPARGS(proto, len), " %s",
1527 					  kinet_ntoa(ip->ip_dst, abuf));
1528 			}
1529 			break;
1530 
1531 		case IPPROTO_ICMP:
1532 			if (offset == 0) {
1533 				len = ksnprintf(SNPARGS(proto, 0),
1534 						"ICMP:%u.%u ",
1535 						icmp->icmp_type,
1536 						icmp->icmp_code);
1537 			} else {
1538 				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1539 			}
1540 			len += ksnprintf(SNPARGS(proto, len), "%s",
1541 					 kinet_ntoa(ip->ip_src, abuf));
1542 			ksnprintf(SNPARGS(proto, len), " %s",
1543 				  kinet_ntoa(ip->ip_dst, abuf));
1544 			break;
1545 
1546 		default:
1547 			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1548 					kinet_ntoa(ip->ip_src, abuf));
1549 			ksnprintf(SNPARGS(proto, len), " %s",
1550 				  kinet_ntoa(ip->ip_dst, abuf));
1551 			break;
1552 		}
1553 
1554 		if (ip_off & (IP_MF | IP_OFFMASK)) {
1555 			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1556 				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1557 				  offset << 3, (ip_off & IP_MF) ? "+" : "");
1558 		}
1559 	}
1560 
1561 	if (oif || m->m_pkthdr.rcvif) {
1562 		log(LOG_SECURITY | LOG_INFO,
1563 		    "ipfw: %d %s %s %s via %s%s\n",
1564 		    f ? f->rulenum : -1,
1565 		    action, proto, oif ? "out" : "in",
1566 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1567 		    fragment);
1568 	} else {
1569 		log(LOG_SECURITY | LOG_INFO,
1570 		    "ipfw: %d %s %s [no if info]%s\n",
1571 		    f ? f->rulenum : -1,
1572 		    action, proto, fragment);
1573 	}
1574 
1575 	if (limit_reached) {
1576 		log(LOG_SECURITY | LOG_NOTICE,
1577 		    "ipfw: limit %d reached on entry %d\n",
1578 		    limit_reached, f ? f->rulenum : -1);
1579 	}
1580 }
1581 
1582 #undef SNPARGS
1583 
1584 static void
1585 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1586 {
1587 	struct ip_fw *rule = slave_x->xlat_rule;
1588 
1589 	KKASSERT(rule->cpuid == mycpuid);
1590 
1591 	/* No more cross references; free this pair now. */
1592 	kfree(x, M_IPFW);
1593 	kfree(slave_x, M_IPFW);
1594 
1595 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1596 	rule->cross_refs--;
1597 }
1598 
1599 static void
1600 ipfw_xlat_reap_dispatch(netmsg_t nm)
1601 {
1602 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1603 	struct ipfw_state *s, *ns;
1604 
1605 	ASSERT_NETISR_NCPUS(mycpuid);
1606 
1607 	crit_enter();
1608 	/* Reply ASAP. */
1609 	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1610 	crit_exit();
1611 
1612 	/* TODO: limit scanning depth */
1613 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1614 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1615 		struct ipfw_xlat *slave_x = x->xlat_pair;
1616 		uint64_t crefs;
1617 
1618 		crefs = slave_x->xlat_crefs + x->xlat_crefs;
1619 		if (crefs == 0) {
1620 			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1621 			ipfw_xlat_reap(x, slave_x);
1622 		}
1623 	}
1624 	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1625 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1626 		    &ctx->ipfw_xlatreap_nm);
1627 	}
1628 }
1629 
1630 static void
1631 ipfw_xlat_reap_timeo(void *xnm)
1632 {
1633 	struct netmsg_base *nm = xnm;
1634 
1635 	KKASSERT(mycpuid < netisr_ncpus);
1636 
1637 	crit_enter();
1638 	if (nm->lmsg.ms_flags & MSGF_DONE)
1639 		netisr_sendmsg_oncpu(nm);
1640 	crit_exit();
1641 }
1642 
1643 static void
1644 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1645 {
1646 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1647 	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1648 	struct ipfw_xlat *slave_x = x->xlat_pair;
1649 	uint64_t crefs;
1650 
1651 	ASSERT_NETISR_NCPUS(mycpuid);
1652 
1653 	KKASSERT(slave_x != NULL);
1654 	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1655 
1656 	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1657 	    ("master xlat is still linked"));
1658 	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1659 		ipfw_state_unlink(ctx, &slave_x->xlat_st);
1660 
1661 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1662 	slave_x->xlat_crefs--;
1663 
1664 	crefs = slave_x->xlat_crefs + x->xlat_crefs;
1665 	if (crefs == 0) {
1666 		ipfw_xlat_reap(x, slave_x);
1667 		return;
1668 	}
1669 
1670 	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1671 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1672 		    &ctx->ipfw_xlatreap_nm);
1673 	}
1674 
1675 	/*
1676 	 * This pair is still referenced; defer its destruction.
1677 	 * YYY reuse st_link.
1678 	 */
1679 	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1680 }
1681 
1682 static __inline void
1683 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1684 {
1685 
1686 	x->xlat_invalid = 1;
1687 	x->xlat_pair->xlat_invalid = 1;
1688 }
1689 
1690 static void
1691 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1692 {
1693 	struct ipfw_xlat *x, *slave_x;
1694 	struct netmsg_base *nm;
1695 
1696 	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
1697 	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
1698 	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
1699 	    ("delete slave xlat"));
1700 
1701 	KASSERT(ctx->ipfw_state_cnt > 0,
1702 	    ("invalid state count %d", ctx->ipfw_state_cnt));
1703 	ctx->ipfw_state_cnt--;
1704 	if (ctx->ipfw_state_loosecnt > 0)
1705 		ctx->ipfw_state_loosecnt--;
1706 
1707 	/*
1708 	 * Unhook this state.
1709 	 */
1710 	if (s->st_track != NULL) {
1711 		struct ipfw_track *t = s->st_track;
1712 
1713 		KASSERT(!LIST_EMPTY(&t->t_state_list),
1714 		    ("track state list is empty"));
1715 		LIST_REMOVE(s, st_trklink);
1716 
1717 		KASSERT(*t->t_count > 0,
1718 		    ("invalid track count %d", *t->t_count));
1719 		atomic_subtract_int(t->t_count, 1);
1720 	}
1721 	ipfw_state_unlink(ctx, s);
1722 
1723 	/*
1724 	 * Free this state.  Xlat requires special processing,
1725 	 * since xlat are paired state and they could be on
1726 	 * different cpus.
1727 	 */
1728 
1729 	if (!IPFW_ISXLAT(s->st_type)) {
1730 		/* Not xlat; free now. */
1731 		kfree(s, M_IPFW);
1732 		/* Done! */
1733 		return;
1734 	}
1735 	x = (struct ipfw_xlat *)s;
1736 
1737 	if (x->xlat_pair == NULL) {
1738 		/* Not setup yet; free now. */
1739 		kfree(x, M_IPFW);
1740 		/* Done! */
1741 		return;
1742 	}
1743 	slave_x = x->xlat_pair;
1744 	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);
1745 
1746 	if (x->xlat_pcpu == mycpuid) {
1747 		/*
1748 		 * Paired states are on the same cpu; delete this
1749 		 * pair now.
1750 		 */
1751 		KKASSERT(x->xlat_crefs == 0);
1752 		KKASSERT(slave_x->xlat_crefs == 0);
1753 		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1754 			ipfw_state_unlink(ctx, &slave_x->xlat_st);
1755 		kfree(x, M_IPFW);
1756 		kfree(slave_x, M_IPFW);
1757 		return;
1758 	}
1759 
1760 	/*
1761 	 * Free the paired states on the cpu owning the slave xlat.
1762 	 */
1763 
1764 	/*
1765 	 * Mark the state pair invalid; completely deleting them
1766 	 * may take some time.
1767 	 */
1768 	ipfw_xlat_invalidate(x);
1769 
1770 	nm = &x->xlat_freenm;
1771 	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
1772 	    ipfw_xlat_free_dispatch);
1773 	nm->lmsg.u.ms_resultp = x;
1774 
1775 	/* See the comment in ipfw_xlate_redispatch(). */
1776 	x->xlat_rule->cross_refs++;
1777 	x->xlat_crefs++;
1778 
1779 	netisr_sendmsg(nm, x->xlat_pcpu);
1780 }
1781 
1782 static void
1783 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1784 {
1785 
1786 	if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1787 		KKASSERT(IPFW_ISXLAT(s->st_type));
1788 		ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1789 		ipfw_state_unlink(ctx, s);
1790 		return;
1791 	}
1792 	ipfw_state_del(ctx, s);
1793 }
1794 
1795 static int
1796 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1797 {
1798 	struct ipfw_state *s, *anchor;
1799 	int expired;
1800 
1801 	if (reap_max < ipfw_state_reap_min)
1802 		reap_max = ipfw_state_reap_min;
1803 
1804 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1805 		/*
1806 		 * Kick start state expiring.  Ignore scan limit,
1807 		 * we are short of states.
1808 		 */
1809 		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1810 		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1811 		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1812 		return (expired);
1813 	}
1814 
1815 	/*
1816 	 * States are being expired.
1817 	 */
1818 
1819 	if (ctx->ipfw_state_cnt == 0)
1820 		return (0);
1821 
1822 	expired = 0;
1823 	anchor = &ctx->ipfw_stateexp_anch;
1824 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1825 		/*
1826 		 * Ignore scan limit; we are short of states.
1827 		 */
1828 
1829 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1830 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1831 
1832 		if (IPFW_STATE_SCANSKIP(s))
1833 			continue;
1834 
1835 		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
1836 			ipfw_state_del(ctx, s);
1837 			if (++expired >= reap_max)
1838 				break;
1839 			if ((expired & 0xff) == 0 &&
1840 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1841 			    ipfw_state_max)
1842 				break;
1843 		}
1844 	}
1845 	/*
1846 	 * NOTE:
1847 	 * Leave the anchor on the list, even if the end of the list has
1848 	 * been reached.  ipfw_state_expire_more_dispatch() will handle
1849 	 * the removal.
1850 	 */
1851 	return (expired);
1852 }
1853 
1854 static void
1855 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1856 {
1857 	struct ipfw_state *s, *sn;
1858 
1859 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1860 		if (IPFW_STATE_SCANSKIP(s))
1861 			continue;
1862 		if (rule != NULL && s->st_rule != rule)
1863 			continue;
1864 		ipfw_state_del(ctx, s);
1865 	}
1866 }
1867 
1868 static void
1869 ipfw_state_expire_done(struct ipfw_context *ctx)
1870 {
1871 
1872 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1873 	    ("stateexp is not in progress"));
1874 	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1875 	callout_reset(&ctx->ipfw_stateto_ch, hz,
1876 	    ipfw_state_expire_ipifunc, NULL);
1877 }
1878 
1879 static void
1880 ipfw_state_expire_more(struct ipfw_context *ctx)
1881 {
1882 	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1883 
1884 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1885 	    ("stateexp is not in progress"));
1886 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1887 	    ("stateexp more did not finish"));
1888 	netisr_sendmsg_oncpu(nm);
1889 }
1890 
1891 static int
1892 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1893     int scan_max, int expire_max)
1894 {
1895 	struct ipfw_state *s;
1896 	int scanned = 0, expired = 0;
1897 
1898 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1899 	    ("stateexp is not in progress"));
1900 
1901 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1902 		if (scanned++ >= scan_max) {
1903 			ipfw_state_expire_more(ctx);
1904 			return (expired);
1905 		}
1906 
1907 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1908 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1909 
1910 		if (IPFW_STATE_SCANSKIP(s))
1911 			continue;
1912 
1913 		if (IPFW_STATE_ISDEAD(s) ||
1914 		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1915 		     IPFW_STATE_TCPCLOSED(s))) {
1916 			ipfw_state_del(ctx, s);
1917 			if (++expired >= expire_max) {
1918 				ipfw_state_expire_more(ctx);
1919 				return (expired);
1920 			}
1921 			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1922 			    (expired & 0xff) == 0 &&
1923 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1924 			    ipfw_state_max) {
1925 				ipfw_state_expire_more(ctx);
1926 				return (expired);
1927 			}
1928 		}
1929 	}
1930 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1931 	ipfw_state_expire_done(ctx);
1932 	return (expired);
1933 }
1934 
1935 static void
1936 ipfw_state_expire_more_dispatch(netmsg_t nm)
1937 {
1938 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1939 	struct ipfw_state *anchor;
1940 
1941 	ASSERT_NETISR_NCPUS(mycpuid);
1942 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1943 	    ("statexp is not in progress"));
1944 
1945 	/* Reply ASAP */
1946 	netisr_replymsg(&nm->base, 0);
1947 
1948 	anchor = &ctx->ipfw_stateexp_anch;
1949 	if (ctx->ipfw_state_cnt == 0) {
1950 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1951 		ipfw_state_expire_done(ctx);
1952 		return;
1953 	}
1954 	ipfw_state_expire_loop(ctx, anchor,
1955 	    ipfw_state_scan_max, ipfw_state_expire_max);
1956 }
1957 
1958 static int
1959 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1960 {
1961 	struct ipfw_state *anchor;
1962 
1963 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1964 	    ("stateexp is in progress"));
1965 	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1966 
1967 	if (ctx->ipfw_state_cnt == 0) {
1968 		ipfw_state_expire_done(ctx);
1969 		return (0);
1970 	}
1971 
1972 	/*
1973 	 * Do not expire more than once per second, it is useless.
1974 	 */
1975 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1976 	    ctx->ipfw_state_lastexp == time_uptime) {
1977 		ipfw_state_expire_done(ctx);
1978 		return (0);
1979 	}
1980 	ctx->ipfw_state_lastexp = time_uptime;
1981 
1982 	anchor = &ctx->ipfw_stateexp_anch;
1983 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1984 	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1985 }
1986 
1987 static void
1988 ipfw_state_expire_dispatch(netmsg_t nm)
1989 {
1990 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1991 
1992 	ASSERT_NETISR_NCPUS(mycpuid);
1993 
1994 	/* Reply ASAP */
1995 	crit_enter();
1996 	netisr_replymsg(&nm->base, 0);
1997 	crit_exit();
1998 
1999 	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2000 		/* Running; done. */
2001 		return;
2002 	}
2003 	ipfw_state_expire_start(ctx,
2004 	    ipfw_state_scan_max, ipfw_state_expire_max);
2005 }
2006 
2007 static void
2008 ipfw_state_expire_ipifunc(void *dummy __unused)
2009 {
2010 	struct netmsg_base *msg;
2011 
2012 	KKASSERT(mycpuid < netisr_ncpus);
2013 	msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2014 
2015 	crit_enter();
2016 	if (msg->lmsg.ms_flags & MSGF_DONE)
2017 		netisr_sendmsg_oncpu(msg);
2018 	crit_exit();
2019 }
2020 
2021 static boolean_t
2022 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2023 {
2024 	uint32_t seq = ntohl(tcp->th_seq);
2025 	uint32_t ack = ntohl(tcp->th_ack);
2026 
2027 	if (tcp->th_flags & TH_RST)
2028 		return (TRUE);
2029 
2030 	if (dir == MATCH_FORWARD) {
2031 		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2032 			s->st_flags |= IPFW_STATE_F_SEQFWD;
2033 			s->st_seq_fwd = seq;
2034 		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2035 			s->st_seq_fwd = seq;
2036 		} else {
2037 			/* Out-of-sequence; done. */
2038 			return (FALSE);
2039 		}
2040 		if (tcp->th_flags & TH_ACK) {
2041 			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2042 				s->st_flags |= IPFW_STATE_F_ACKFWD;
2043 				s->st_ack_fwd = ack;
2044 			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2045 				s->st_ack_fwd = ack;
2046 			} else {
2047 				/* Out-of-sequence; done. */
2048 				return (FALSE);
2049 			}
2050 
2051 			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2052 			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2053 				s->st_state |= (TH_ACK << 8);
2054 		}
2055 	} else {
2056 		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2057 			s->st_flags |= IPFW_STATE_F_SEQREV;
2058 			s->st_seq_rev = seq;
2059 		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2060 			s->st_seq_rev = seq;
2061 		} else {
2062 			/* Out-of-sequence; done. */
2063 			return (FALSE);
2064 		}
2065 		if (tcp->th_flags & TH_ACK) {
2066 			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2067 				s->st_flags |= IPFW_STATE_F_ACKREV;
2068 				s->st_ack_rev= ack;
2069 			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2070 				s->st_ack_rev = ack;
2071 			} else {
2072 				/* Out-of-sequence; done. */
2073 				return (FALSE);
2074 			}
2075 
2076 			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2077 			    s->st_ack_rev == s->st_seq_fwd + 1)
2078 				s->st_state |= TH_ACK;
2079 		}
2080 	}
2081 	return (TRUE);
2082 }
2083 
2084 static void
2085 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2086     const struct tcphdr *tcp, struct ipfw_state *s)
2087 {
2088 
2089 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2090 		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2091 
2092 		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2093 			return;
2094 
2095 		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2096 		switch (s->st_state & IPFW_STATE_TCPSTATES) {
2097 		case TH_SYN:				/* opening */
2098 			s->st_expire = time_uptime + dyn_syn_lifetime;
2099 			break;
2100 
2101 		case BOTH_SYN:			/* move to established */
2102 		case BOTH_SYN | TH_FIN:		/* one side tries to close */
2103 		case BOTH_SYN | (TH_FIN << 8):
2104 			s->st_expire = time_uptime + dyn_ack_lifetime;
2105 			break;
2106 
2107 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
2108 			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2109 				/* And both FINs were ACKed. */
2110 				s->st_expire = time_uptime + dyn_fin_lifetime;
2111 			} else {
2112 				s->st_expire = time_uptime +
2113 				    dyn_finwait_lifetime;
2114 			}
2115 			break;
2116 
2117 		default:
2118 #if 0
2119 			/*
2120 			 * reset or some invalid combination, but can also
2121 			 * occur if we use keep-state the wrong way.
2122 			 */
2123 			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2124 				kprintf("invalid state: 0x%x\n", s->st_state);
2125 #endif
2126 			s->st_expire = time_uptime + dyn_rst_lifetime;
2127 			break;
2128 		}
2129 	} else if (pkt->proto == IPPROTO_UDP) {
2130 		s->st_expire = time_uptime + dyn_udp_lifetime;
2131 	} else {
2132 		/* other protocols */
2133 		s->st_expire = time_uptime + dyn_short_lifetime;
2134 	}
2135 }
2136 
2137 /*
2138  * Lookup a state.
2139  */
2140 static struct ipfw_state *
2141 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2142     int *match_direction, const struct tcphdr *tcp)
2143 {
2144 	struct ipfw_state *key, *s;
2145 	int dir = MATCH_NONE;
2146 
2147 	key = &ctx->ipfw_state_tmpkey;
2148 	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2149 	    pkt->dst_ip, pkt->dst_port, pkt->proto);
2150 	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2151 	if (s == NULL)
2152 		goto done; /* not found. */
2153 	if (IPFW_STATE_ISDEAD(s)) {
2154 		ipfw_state_remove(ctx, s);
2155 		s = NULL;
2156 		goto done;
2157 	}
2158 	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2159 		/* TCP ports recycling is too fast. */
2160 		ctx->ipfw_sts_tcprecycled++;
2161 		ipfw_state_remove(ctx, s);
2162 		s = NULL;
2163 		goto done;
2164 	}
2165 
2166 	if (s->st_swap == key->st_swap) {
2167 		dir = MATCH_FORWARD;
2168 	} else {
2169 		KASSERT((s->st_swap & key->st_swap) == 0,
2170 		    ("found mismatch state"));
2171 		dir = MATCH_REVERSE;
2172 	}
2173 
2174 	/* Update this state. */
2175 	ipfw_state_update(pkt, dir, tcp, s);
2176 
2177 	if (s->st_track != NULL) {
2178 		/* This track has been used. */
2179 		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2180 	}
2181 done:
2182 	if (match_direction)
2183 		*match_direction = dir;
2184 	return (s);
2185 }
2186 
2187 static struct ipfw_state *
2188 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2189     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2190 {
2191 	struct ipfw_state *s;
2192 	size_t sz;
2193 
2194 	KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2195 	    ("invalid state type %u", type));
2196 
2197 	sz = sizeof(struct ipfw_state);
2198 	if (IPFW_ISXLAT(type))
2199 		sz = sizeof(struct ipfw_xlat);
2200 
2201 	s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2202 	if (s == NULL) {
2203 		ctx->ipfw_sts_nomem++;
2204 		return (NULL);
2205 	}
2206 
2207 	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2208 	    id->dst_ip, id->dst_port, id->proto);
2209 
2210 	s->st_rule = rule;
2211 	s->st_type = type;
2212 	if (IPFW_ISXLAT(type)) {
2213 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2214 
2215 		x->xlat_dir = MATCH_NONE;
2216 		x->xlat_pcpu = -1;
2217 	}
2218 
2219 	/*
2220 	 * Update this state:
2221 	 * Set st_expire and st_state.
2222 	 */
2223 	ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2224 
2225 	return (s);
2226 }
2227 
2228 static struct ipfw_state *
2229 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2230     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2231     const struct tcphdr *tcp)
2232 {
2233 	struct ipfw_state *s, *dup;
2234 
2235 	s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2236 	if (s == NULL)
2237 		return (NULL);
2238 
2239 	ctx->ipfw_state_cnt++;
2240 	ctx->ipfw_state_loosecnt++;
2241 	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2242 		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2243 		ctx->ipfw_state_loosecnt = 0;
2244 	}
2245 
2246 	dup = ipfw_state_link(ctx, s);
2247 	if (dup != NULL)
2248 		panic("ipfw: %u state exists %p", type, dup);
2249 
2250 	if (t != NULL) {
2251 		/* Keep the track referenced. */
2252 		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2253 		s->st_track = t;
2254 	}
2255 	return (s);
2256 }
2257 
2258 static boolean_t
2259 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2260 {
2261 	struct ipfw_trkcnt *trk;
2262 	boolean_t trk_freed = FALSE;
2263 
2264 	KASSERT(t->t_count != NULL, ("track anchor"));
2265 	KASSERT(LIST_EMPTY(&t->t_state_list),
2266 	    ("invalid track is still referenced"));
2267 
2268 	trk = t->t_trkcnt;
2269 	KASSERT(trk != NULL, ("track has no trkcnt"));
2270 
2271 	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2272 	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2273 	kfree(t, M_IPFW);
2274 
2275 	/*
2276 	 * fdrop() style reference counting.
2277 	 * See kern/kern_descrip.c fdrop().
2278 	 */
2279 	for (;;) {
2280 		int refs = trk->tc_refs;
2281 
2282 		cpu_ccfence();
2283 		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2284 		if (refs == 1) {
2285 			IPFW_TRKCNT_TOKGET;
2286 			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2287 				KASSERT(trk->tc_count == 0,
2288 				    ("%d states reference this trkcnt",
2289 				     trk->tc_count));
2290 				RB_REMOVE(ipfw_trkcnt_tree,
2291 				    &ipfw_gd.ipfw_trkcnt_tree, trk);
2292 
2293 				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2294 				    ("invalid trkcnt cnt %d",
2295 				     ipfw_gd.ipfw_trkcnt_cnt));
2296 				ipfw_gd.ipfw_trkcnt_cnt--;
2297 				IPFW_TRKCNT_TOKREL;
2298 
2299 				if (ctx->ipfw_trkcnt_spare == NULL)
2300 					ctx->ipfw_trkcnt_spare = trk;
2301 				else
2302 					kfree(trk, M_IPFW);
2303 				trk_freed = TRUE;
2304 				break; /* done! */
2305 			}
2306 			IPFW_TRKCNT_TOKREL;
2307 			/* retry */
2308 		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2309 			break; /* done! */
2310 		}
2311 		/* retry */
2312 	}
2313 	return (trk_freed);
2314 }
2315 
2316 static void
2317 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2318 {
2319 	struct ipfw_track *t, *tn;
2320 
2321 	TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2322 		if (t->t_count == NULL) /* anchor */
2323 			continue;
2324 		if (rule != NULL && t->t_rule != rule)
2325 			continue;
2326 		ipfw_track_free(ctx, t);
2327 	}
2328 }
2329 
2330 static boolean_t
2331 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2332     boolean_t reap)
2333 {
2334 	struct ipfw_state *s, *sn;
2335 	boolean_t ret = FALSE;
2336 
2337 	KASSERT(t->t_count != NULL, ("track anchor"));
2338 
2339 	if (LIST_EMPTY(&t->t_state_list))
2340 		return (FALSE);
2341 
2342 	/*
2343 	 * Do not expire more than once per second, it is useless.
2344 	 */
2345 	if (t->t_lastexp == time_uptime)
2346 		return (FALSE);
2347 	t->t_lastexp = time_uptime;
2348 
2349 	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2350 		if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2351 			KASSERT(s->st_track == t,
2352 			    ("state track %p does not match %p",
2353 			     s->st_track, t));
2354 			ipfw_state_del(ctx, s);
2355 			ret = TRUE;
2356 		}
2357 	}
2358 	return (ret);
2359 }
2360 
2361 static __inline struct ipfw_trkcnt *
2362 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2363 {
2364 	struct ipfw_trkcnt *trk;
2365 
2366 	if (ctx->ipfw_trkcnt_spare != NULL) {
2367 		trk = ctx->ipfw_trkcnt_spare;
2368 		ctx->ipfw_trkcnt_spare = NULL;
2369 	} else {
2370 		trk = kmalloc(sizeof(*trk), M_IPFW,
2371 			      M_INTWAIT | M_NULLOK | M_CACHEALIGN);
2372 	}
2373 	return (trk);
2374 }
2375 
2376 static void
2377 ipfw_track_expire_done(struct ipfw_context *ctx)
2378 {
2379 
2380 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2381 	    ("trackexp is not in progress"));
2382 	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2383 	callout_reset(&ctx->ipfw_trackto_ch, hz,
2384 	    ipfw_track_expire_ipifunc, NULL);
2385 }
2386 
2387 static void
2388 ipfw_track_expire_more(struct ipfw_context *ctx)
2389 {
2390 	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2391 
2392 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2393 	    ("trackexp is not in progress"));
2394 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2395 	    ("trackexp more did not finish"));
2396 	netisr_sendmsg_oncpu(nm);
2397 }
2398 
2399 static int
2400 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2401     int scan_max, int expire_max)
2402 {
2403 	struct ipfw_track *t;
2404 	int scanned = 0, expired = 0;
2405 	boolean_t reap = FALSE;
2406 
2407 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2408 	    ("trackexp is not in progress"));
2409 
2410 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2411 		reap = TRUE;
2412 
2413 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2414 		if (scanned++ >= scan_max) {
2415 			ipfw_track_expire_more(ctx);
2416 			return (expired);
2417 		}
2418 
2419 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2420 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2421 
2422 		if (t->t_count == NULL) /* anchor */
2423 			continue;
2424 
2425 		ipfw_track_state_expire(ctx, t, reap);
2426 		if (!LIST_EMPTY(&t->t_state_list)) {
2427 			/* There are states referencing this track. */
2428 			continue;
2429 		}
2430 
2431 		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2432 			/* Expired. */
2433 			if (ipfw_track_free(ctx, t)) {
2434 				if (++expired >= expire_max) {
2435 					ipfw_track_expire_more(ctx);
2436 					return (expired);
2437 				}
2438 			}
2439 		}
2440 	}
2441 	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2442 	ipfw_track_expire_done(ctx);
2443 	return (expired);
2444 }
2445 
2446 static int
2447 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2448 {
2449 	struct ipfw_track *anchor;
2450 
2451 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2452 	    ("trackexp is in progress"));
2453 	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2454 
2455 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2456 		ipfw_track_expire_done(ctx);
2457 		return (0);
2458 	}
2459 
2460 	/*
2461 	 * Do not expire more than once per second, it is useless.
2462 	 */
2463 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2464 	    ctx->ipfw_track_lastexp == time_uptime) {
2465 		ipfw_track_expire_done(ctx);
2466 		return (0);
2467 	}
2468 	ctx->ipfw_track_lastexp = time_uptime;
2469 
2470 	anchor = &ctx->ipfw_trackexp_anch;
2471 	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2472 	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2473 }
2474 
2475 static void
2476 ipfw_track_expire_more_dispatch(netmsg_t nm)
2477 {
2478 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2479 	struct ipfw_track *anchor;
2480 
2481 	ASSERT_NETISR_NCPUS(mycpuid);
2482 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2483 	    ("trackexp is not in progress"));
2484 
2485 	/* Reply ASAP */
2486 	netisr_replymsg(&nm->base, 0);
2487 
2488 	anchor = &ctx->ipfw_trackexp_anch;
2489 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2490 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2491 		ipfw_track_expire_done(ctx);
2492 		return;
2493 	}
2494 	ipfw_track_expire_loop(ctx, anchor,
2495 	    ipfw_track_scan_max, ipfw_track_expire_max);
2496 }
2497 
2498 static void
2499 ipfw_track_expire_dispatch(netmsg_t nm)
2500 {
2501 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2502 
2503 	ASSERT_NETISR_NCPUS(mycpuid);
2504 
2505 	/* Reply ASAP */
2506 	crit_enter();
2507 	netisr_replymsg(&nm->base, 0);
2508 	crit_exit();
2509 
2510 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2511 		/* Running; done. */
2512 		return;
2513 	}
2514 	ipfw_track_expire_start(ctx,
2515 	    ipfw_track_scan_max, ipfw_track_expire_max);
2516 }
2517 
2518 static void
2519 ipfw_track_expire_ipifunc(void *dummy __unused)
2520 {
2521 	struct netmsg_base *msg;
2522 
2523 	KKASSERT(mycpuid < netisr_ncpus);
2524 	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2525 
2526 	crit_enter();
2527 	if (msg->lmsg.ms_flags & MSGF_DONE)
2528 		netisr_sendmsg_oncpu(msg);
2529 	crit_exit();
2530 }
2531 
2532 static int
2533 ipfw_track_reap(struct ipfw_context *ctx)
2534 {
2535 	struct ipfw_track *t, *anchor;
2536 	int expired;
2537 
2538 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2539 		/*
2540 		 * Kick start track expiring.  Ignore scan limit,
2541 		 * we are short of tracks.
2542 		 */
2543 		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2544 		expired = ipfw_track_expire_start(ctx, INT_MAX,
2545 		    ipfw_track_reap_max);
2546 		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2547 		return (expired);
2548 	}
2549 
2550 	/*
2551 	 * Tracks are being expired.
2552 	 */
2553 
2554 	if (RB_EMPTY(&ctx->ipfw_track_tree))
2555 		return (0);
2556 
2557 	expired = 0;
2558 	anchor = &ctx->ipfw_trackexp_anch;
2559 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2560 		/*
2561 		 * Ignore scan limit; we are short of tracks.
2562 		 */
2563 
2564 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2565 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2566 
2567 		if (t->t_count == NULL) /* anchor */
2568 			continue;
2569 
2570 		ipfw_track_state_expire(ctx, t, TRUE);
2571 		if (!LIST_EMPTY(&t->t_state_list)) {
2572 			/* There are states referencing this track. */
2573 			continue;
2574 		}
2575 
2576 		if (ipfw_track_free(ctx, t)) {
2577 			if (++expired >= ipfw_track_reap_max) {
2578 				ipfw_track_expire_more(ctx);
2579 				break;
2580 			}
2581 		}
2582 	}
2583 	/*
2584 	 * NOTE:
2585 	 * Leave the anchor on the list, even if the end of the list has
2586 	 * been reached.  ipfw_track_expire_more_dispatch() will handle
2587 	 * the removal.
2588 	 */
2589 	return (expired);
2590 }
2591 
2592 static struct ipfw_track *
2593 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2594     uint16_t limit_mask, struct ip_fw *rule)
2595 {
2596 	struct ipfw_track *key, *t, *dup;
2597 	struct ipfw_trkcnt *trk, *ret;
2598 	boolean_t do_expire = FALSE;
2599 
2600 	KASSERT(rule->track_ruleid != 0,
2601 	    ("rule %u has no track ruleid", rule->rulenum));
2602 
2603 	key = &ctx->ipfw_track_tmpkey;
2604 	key->t_proto = id->proto;
2605 	key->t_addrs = 0;
2606 	key->t_ports = 0;
2607 	key->t_rule = rule;
2608 	if (limit_mask & DYN_SRC_ADDR)
2609 		key->t_saddr = id->src_ip;
2610 	if (limit_mask & DYN_DST_ADDR)
2611 		key->t_daddr = id->dst_ip;
2612 	if (limit_mask & DYN_SRC_PORT)
2613 		key->t_sport = id->src_port;
2614 	if (limit_mask & DYN_DST_PORT)
2615 		key->t_dport = id->dst_port;
2616 
2617 	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2618 	if (t != NULL)
2619 		goto done;
2620 
2621 	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2622 	if (t == NULL) {
2623 		ctx->ipfw_tks_nomem++;
2624 		return (NULL);
2625 	}
2626 
2627 	t->t_key = key->t_key;
2628 	t->t_rule = rule;
2629 	t->t_lastexp = 0;
2630 	LIST_INIT(&t->t_state_list);
2631 
2632 	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2633 		time_t globexp, uptime;
2634 
2635 		trk = NULL;
2636 		do_expire = TRUE;
2637 
2638 		/*
2639 		 * Do not expire globally more than once per second,
2640 		 * it is useless.
2641 		 */
2642 		uptime = time_uptime;
2643 		globexp = ipfw_gd.ipfw_track_globexp;
2644 		if (globexp != uptime &&
2645 		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2646 		    globexp, uptime)) {
2647 			int cpu;
2648 
2649 			/* Expire tracks on other CPUs. */
2650 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2651 				if (cpu == mycpuid)
2652 					continue;
2653 				lwkt_send_ipiq(globaldata_find(cpu),
2654 				    ipfw_track_expire_ipifunc, NULL);
2655 			}
2656 		}
2657 	} else {
2658 		trk = ipfw_trkcnt_alloc(ctx);
2659 	}
2660 	if (trk == NULL) {
2661 		struct ipfw_trkcnt *tkey;
2662 
2663 		tkey = &ctx->ipfw_trkcnt_tmpkey;
2664 		key = NULL; /* tkey overlaps key */
2665 
2666 		tkey->tc_key = t->t_key;
2667 		tkey->tc_ruleid = rule->track_ruleid;
2668 
2669 		IPFW_TRKCNT_TOKGET;
2670 		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2671 		    tkey);
2672 		if (trk == NULL) {
2673 			IPFW_TRKCNT_TOKREL;
2674 			if (do_expire) {
2675 				ctx->ipfw_tks_reap++;
2676 				if (ipfw_track_reap(ctx) > 0) {
2677 					if (ipfw_gd.ipfw_trkcnt_cnt <
2678 					    ipfw_track_max) {
2679 						trk = ipfw_trkcnt_alloc(ctx);
2680 						if (trk != NULL)
2681 							goto install;
2682 						ctx->ipfw_tks_cntnomem++;
2683 					} else {
2684 						ctx->ipfw_tks_overflow++;
2685 					}
2686 				} else {
2687 					ctx->ipfw_tks_reapfailed++;
2688 					ctx->ipfw_tks_overflow++;
2689 				}
2690 			} else {
2691 				ctx->ipfw_tks_cntnomem++;
2692 			}
2693 			kfree(t, M_IPFW);
2694 			return (NULL);
2695 		}
2696 		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2697 		    ("invalid trkcnt refs %d", trk->tc_refs));
2698 		atomic_add_int(&trk->tc_refs, 1);
2699 		IPFW_TRKCNT_TOKREL;
2700 	} else {
2701 install:
2702 		trk->tc_key = t->t_key;
2703 		trk->tc_ruleid = rule->track_ruleid;
2704 		trk->tc_refs = 0;
2705 		trk->tc_count = 0;
2706 		trk->tc_expire = 0;
2707 		trk->tc_rulenum = rule->rulenum;
2708 
2709 		IPFW_TRKCNT_TOKGET;
2710 		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2711 		    trk);
2712 		if (ret != NULL) {
2713 			KASSERT(ret->tc_refs > 0 &&
2714 			    ret->tc_refs < netisr_ncpus,
2715 			    ("invalid trkcnt refs %d", ret->tc_refs));
2716 			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2717 			    ("trkcnt spare was installed"));
2718 			ctx->ipfw_trkcnt_spare = trk;
2719 			trk = ret;
2720 		} else {
2721 			ipfw_gd.ipfw_trkcnt_cnt++;
2722 		}
2723 		atomic_add_int(&trk->tc_refs, 1);
2724 		IPFW_TRKCNT_TOKREL;
2725 	}
2726 	t->t_count = &trk->tc_count;
2727 	t->t_trkcnt = trk;
2728 
2729 	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2730 	if (dup != NULL)
2731 		panic("ipfw: track exists");
2732 	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2733 done:
2734 	t->t_expire = time_uptime + dyn_short_lifetime;
2735 	return (t);
2736 }
2737 
2738 /*
2739  * Install state for rule type cmd->o.opcode
2740  *
2741  * Returns NULL if state is not installed because of errors or because
2742  * states limitations are enforced.
2743  */
2744 static struct ipfw_state *
2745 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2746     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2747 {
2748 	struct ipfw_state *s;
2749 	struct ipfw_track *t;
2750 	int count, diff;
2751 
2752 	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2753 	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2754 		boolean_t overflow = TRUE;
2755 
2756 		ctx->ipfw_sts_reap++;
2757 		if (ipfw_state_reap(ctx, diff) == 0)
2758 			ctx->ipfw_sts_reapfailed++;
2759 		if (ipfw_state_cntsync() < ipfw_state_max)
2760 			overflow = FALSE;
2761 
2762 		if (overflow) {
2763 			time_t globexp, uptime;
2764 			int cpu;
2765 
2766 			/*
2767 			 * Do not expire globally more than once per second,
2768 			 * it is useless.
2769 			 */
2770 			uptime = time_uptime;
2771 			globexp = ipfw_gd.ipfw_state_globexp;
2772 			if (globexp == uptime ||
2773 			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2774 			    globexp, uptime)) {
2775 				ctx->ipfw_sts_overflow++;
2776 				return (NULL);
2777 			}
2778 
2779 			/* Expire states on other CPUs. */
2780 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2781 				if (cpu == mycpuid)
2782 					continue;
2783 				lwkt_send_ipiq(globaldata_find(cpu),
2784 				    ipfw_state_expire_ipifunc, NULL);
2785 			}
2786 			ctx->ipfw_sts_overflow++;
2787 			return (NULL);
2788 		}
2789 	}
2790 
2791 	switch (cmd->o.opcode) {
2792 	case O_KEEP_STATE: /* bidir rule */
2793 	case O_REDIRECT:
2794 		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2795 		    tcp);
2796 		if (s == NULL)
2797 			return (NULL);
2798 		break;
2799 
2800 	case O_LIMIT: /* limit number of sessions */
2801 		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2802 		if (t == NULL)
2803 			return (NULL);
2804 
2805 		if (*t->t_count >= cmd->conn_limit) {
2806 			if (!ipfw_track_state_expire(ctx, t, TRUE))
2807 				return (NULL);
2808 		}
2809 		for (;;) {
2810 			count = *t->t_count;
2811 			if (count >= cmd->conn_limit)
2812 				return (NULL);
2813 			if (atomic_cmpset_int(t->t_count, count, count + 1))
2814 				break;
2815 		}
2816 
2817 		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2818 		if (s == NULL) {
2819 			/* Undo damage. */
2820 			atomic_subtract_int(t->t_count, 1);
2821 			return (NULL);
2822 		}
2823 		break;
2824 
2825 	default:
2826 		panic("unknown state type %u\n", cmd->o.opcode);
2827 	}
2828 
2829 	if (s->st_type == O_REDIRECT) {
2830 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2831 		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2832 
2833 		x->xlat_addr = r->addr.s_addr;
2834 		x->xlat_port = r->port;
2835 		x->xlat_ifp = args->m->m_pkthdr.rcvif;
2836 		x->xlat_dir = MATCH_FORWARD;
2837 		KKASSERT(x->xlat_ifp != NULL);
2838 	}
2839 	return (s);
2840 }
2841 
2842 static int
2843 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2844     const struct in_addr *in)
2845 {
2846 	struct radix_node_head *rnh;
2847 	struct sockaddr_in sin;
2848 	struct ipfw_tblent *te;
2849 
2850 	KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2851 	rnh = ctx->ipfw_tables[tableid];
2852 	if (rnh == NULL)
2853 		return (0); /* no match */
2854 
2855 	memset(&sin, 0, sizeof(sin));
2856 	sin.sin_family = AF_INET;
2857 	sin.sin_len = sizeof(sin);
2858 	sin.sin_addr = *in;
2859 
2860 	te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2861 	if (te == NULL)
2862 		return (0); /* no match */
2863 
2864 	te->te_use++;
2865 	te->te_lastuse = time_second;
2866 	return (1); /* match */
2867 }
2868 
2869 /*
2870  * Transmit a TCP packet, containing either a RST or a keepalive.
2871  * When flags & TH_RST, we are sending a RST packet, because of a
2872  * "reset" action matched the packet.
2873  * Otherwise we are sending a keepalive, and flags & TH_
2874  *
2875  * Only {src,dst}_{ip,port} of "id" are used.
2876  */
2877 static void
2878 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2879 {
2880 	struct mbuf *m;
2881 	struct ip *ip;
2882 	struct tcphdr *tcp;
2883 	struct route sro;	/* fake route */
2884 
2885 	MGETHDR(m, M_NOWAIT, MT_HEADER);
2886 	if (m == NULL)
2887 		return;
2888 	m->m_pkthdr.rcvif = NULL;
2889 	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2890 	m->m_data += max_linkhdr;
2891 
2892 	ip = mtod(m, struct ip *);
2893 	bzero(ip, m->m_len);
2894 	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2895 	ip->ip_p = IPPROTO_TCP;
2896 	tcp->th_off = 5;
2897 
2898 	/*
2899 	 * Assume we are sending a RST (or a keepalive in the reverse
2900 	 * direction), swap src and destination addresses and ports.
2901 	 */
2902 	ip->ip_src.s_addr = htonl(id->dst_ip);
2903 	ip->ip_dst.s_addr = htonl(id->src_ip);
2904 	tcp->th_sport = htons(id->dst_port);
2905 	tcp->th_dport = htons(id->src_port);
2906 	if (flags & TH_RST) {	/* we are sending a RST */
2907 		if (flags & TH_ACK) {
2908 			tcp->th_seq = htonl(ack);
2909 			tcp->th_ack = htonl(0);
2910 			tcp->th_flags = TH_RST;
2911 		} else {
2912 			if (flags & TH_SYN)
2913 				seq++;
2914 			tcp->th_seq = htonl(0);
2915 			tcp->th_ack = htonl(seq);
2916 			tcp->th_flags = TH_RST | TH_ACK;
2917 		}
2918 	} else {
2919 		/*
2920 		 * We are sending a keepalive. flags & TH_SYN determines
2921 		 * the direction, forward if set, reverse if clear.
2922 		 * NOTE: seq and ack are always assumed to be correct
2923 		 * as set by the caller. This may be confusing...
2924 		 */
2925 		if (flags & TH_SYN) {
2926 			/*
2927 			 * we have to rewrite the correct addresses!
2928 			 */
2929 			ip->ip_dst.s_addr = htonl(id->dst_ip);
2930 			ip->ip_src.s_addr = htonl(id->src_ip);
2931 			tcp->th_dport = htons(id->dst_port);
2932 			tcp->th_sport = htons(id->src_port);
2933 		}
2934 		tcp->th_seq = htonl(seq);
2935 		tcp->th_ack = htonl(ack);
2936 		tcp->th_flags = TH_ACK;
2937 	}
2938 
2939 	/*
2940 	 * set ip_len to the payload size so we can compute
2941 	 * the tcp checksum on the pseudoheader
2942 	 * XXX check this, could save a couple of words ?
2943 	 */
2944 	ip->ip_len = htons(sizeof(struct tcphdr));
2945 	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2946 
2947 	/*
2948 	 * now fill fields left out earlier
2949 	 */
2950 	ip->ip_ttl = ip_defttl;
2951 	ip->ip_len = htons(m->m_pkthdr.len);
2952 
2953 	bzero(&sro, sizeof(sro));
2954 	ip_rtaddr(ip->ip_dst, &sro);
2955 
2956 	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2957 	ip_output(m, NULL, &sro, 0, NULL, NULL);
2958 	if (sro.ro_rt)
2959 		RTFREE(sro.ro_rt);
2960 }
2961 
2962 /*
2963  * Send a reject message, consuming the mbuf passed as an argument.
2964  */
2965 static void
2966 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2967 {
2968 	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2969 		/* IP header is always left in network order */
2970 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2971 	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2972 		struct tcphdr *const tcp =
2973 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2974 
2975 		if ((tcp->th_flags & TH_RST) == 0) {
2976 			send_pkt(&args->f_id, ntohl(tcp->th_seq),
2977 				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2978 		}
2979 		m_freem(args->m);
2980 	} else {
2981 		m_freem(args->m);
2982 	}
2983 	args->m = NULL;
2984 }
2985 
2986 /*
2987  * Given an ip_fw *, lookup_next_rule will return a pointer
2988  * to the next rule, which can be either the jump
2989  * target (for skipto instructions) or the next one in the list (in
2990  * all other cases including a missing jump target).
2991  * The result is also written in the "next_rule" field of the rule.
2992  * Backward jumps are not allowed, so start looking from the next
2993  * rule...
2994  *
2995  * This never returns NULL -- in case we do not have an exact match,
2996  * the next rule is returned. When the ruleset is changed,
2997  * pointers are flushed so we are always correct.
2998  */
2999 static struct ip_fw *
3000 lookup_next_rule(struct ip_fw *me)
3001 {
3002 	struct ip_fw *rule = NULL;
3003 	ipfw_insn *cmd;
3004 
3005 	/* look for action, in case it is a skipto */
3006 	cmd = ACTION_PTR(me);
3007 	if (cmd->opcode == O_LOG)
3008 		cmd += F_LEN(cmd);
3009 	if (cmd->opcode == O_SKIPTO) {
3010 		for (rule = me->next; rule; rule = rule->next) {
3011 			if (rule->rulenum >= cmd->arg1)
3012 				break;
3013 		}
3014 	}
3015 	if (rule == NULL)			/* failure or not a skipto */
3016 		rule = me->next;
3017 	me->next_rule = rule;
3018 	return rule;
3019 }
3020 
3021 static int
3022 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3023 		enum ipfw_opcodes opcode, uid_t uid)
3024 {
3025 	struct in_addr src_ip, dst_ip;
3026 	struct inpcbinfo *pi;
3027 	boolean_t wildcard;
3028 	struct inpcb *pcb;
3029 
3030 	if (fid->proto == IPPROTO_TCP) {
3031 		wildcard = FALSE;
3032 		pi = &tcbinfo[mycpuid];
3033 	} else if (fid->proto == IPPROTO_UDP) {
3034 		wildcard = TRUE;
3035 		pi = &udbinfo[mycpuid];
3036 	} else {
3037 		return 0;
3038 	}
3039 
3040 	/*
3041 	 * Values in 'fid' are in host byte order
3042 	 */
3043 	dst_ip.s_addr = htonl(fid->dst_ip);
3044 	src_ip.s_addr = htonl(fid->src_ip);
3045 	if (oif) {
3046 		pcb = in_pcblookup_hash(pi,
3047 			dst_ip, htons(fid->dst_port),
3048 			src_ip, htons(fid->src_port),
3049 			wildcard, oif);
3050 	} else {
3051 		pcb = in_pcblookup_hash(pi,
3052 			src_ip, htons(fid->src_port),
3053 			dst_ip, htons(fid->dst_port),
3054 			wildcard, NULL);
3055 	}
3056 	if (pcb == NULL || pcb->inp_socket == NULL)
3057 		return 0;
3058 
3059 	if (opcode == O_UID) {
3060 #define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
3061 		return !socheckuid(pcb->inp_socket, uid);
3062 #undef socheckuid
3063 	} else  {
3064 		return groupmember(uid, pcb->inp_socket->so_cred);
3065 	}
3066 }
3067 
3068 static int
3069 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3070 {
3071 
3072 	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3073 		struct ifaddr_container *ifac;
3074 		struct ifnet *ifp;
3075 
3076 		ifp = ifunit_netisr(cmd->ifname);
3077 		if (ifp == NULL)
3078 			return (0);
3079 
3080 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3081 			struct ifaddr *ia = ifac->ifa;
3082 
3083 			if (ia->ifa_addr == NULL)
3084 				continue;
3085 			if (ia->ifa_addr->sa_family != AF_INET)
3086 				continue;
3087 
3088 			cmd->mask.s_addr = INADDR_ANY;
3089 			if (cmd->o.arg1 & IPFW_IFIP_NET) {
3090 				cmd->mask = ((struct sockaddr_in *)
3091 				    ia->ifa_netmask)->sin_addr;
3092 			}
3093 			if (cmd->mask.s_addr == INADDR_ANY)
3094 				cmd->mask.s_addr = INADDR_BROADCAST;
3095 
3096 			cmd->addr =
3097 			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3098 			cmd->addr.s_addr &= cmd->mask.s_addr;
3099 
3100 			cmd->o.arg1 |= IPFW_IFIP_VALID;
3101 			break;
3102 		}
3103 		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3104 			return (0);
3105 	}
3106 	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3107 }
3108 
3109 static void
3110 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3111     struct in_addr *old_addr, uint16_t *old_port)
3112 {
3113 	struct ip *ip = mtod(m, struct ip *);
3114 	struct in_addr *addr;
3115 	uint16_t *port, *csum, dlen = 0;
3116 	uint8_t udp = 0;
3117 	boolean_t pseudo = FALSE;
3118 
3119 	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3120 		addr = &ip->ip_src;
3121 		switch (ip->ip_p) {
3122 		case IPPROTO_TCP:
3123 			port = &L3HDR(struct tcphdr, ip)->th_sport;
3124 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3125 			break;
3126 		case IPPROTO_UDP:
3127 			port = &L3HDR(struct udphdr, ip)->uh_sport;
3128 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3129 			udp = 1;
3130 			break;
3131 		default:
3132 			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3133 		}
3134 	} else {
3135 		addr = &ip->ip_dst;
3136 		switch (ip->ip_p) {
3137 		case IPPROTO_TCP:
3138 			port = &L3HDR(struct tcphdr, ip)->th_dport;
3139 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3140 			break;
3141 		case IPPROTO_UDP:
3142 			port = &L3HDR(struct udphdr, ip)->uh_dport;
3143 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3144 			udp = 1;
3145 			break;
3146 		default:
3147 			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3148 		}
3149 	}
3150 	if (old_addr != NULL)
3151 		*old_addr = *addr;
3152 	if (old_port != NULL) {
3153 		if (x->xlat_port != 0)
3154 			*old_port = *port;
3155 		else
3156 			*old_port = 0;
3157 	}
3158 
3159 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3160 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3161 			dlen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
3162 		pseudo = TRUE;
3163 	}
3164 
3165 	if (!pseudo) {
3166 		const uint16_t *oaddr, *naddr;
3167 
3168 		oaddr = (const uint16_t *)&addr->s_addr;
3169 		naddr = (const uint16_t *)&x->xlat_addr;
3170 
3171 		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3172 		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3173 		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3174 		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3175 	}
3176 	addr->s_addr = x->xlat_addr;
3177 
3178 	if (x->xlat_port != 0) {
3179 		if (!pseudo) {
3180 			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3181 			    udp);
3182 		}
3183 		*port = x->xlat_port;
3184 	}
3185 
3186 	if (pseudo) {
3187 		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3188 		    htons(dlen + ip->ip_p));
3189 	}
3190 }
3191 
3192 static void
3193 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3194 {
3195 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3196 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3197 	struct mbuf *m = nm->m;
3198 	struct ipfw_xlat *x = nm->arg1;
3199 	struct ip_fw *rule = x->xlat_rule;
3200 
3201 	ASSERT_NETISR_NCPUS(mycpuid);
3202 	KASSERT(rule->cpuid == mycpuid,
3203 	    ("rule does not belong to cpu%d", mycpuid));
3204 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3205 	    ("mbuf does not have ipfw continue rule"));
3206 
3207 	KASSERT(ctx->ipfw_cont_rule == NULL,
3208 	    ("pending ipfw continue rule"));
3209 	KASSERT(ctx->ipfw_cont_xlat == NULL,
3210 	    ("pending ipfw continue xlat"));
3211 	ctx->ipfw_cont_rule = rule;
3212 	ctx->ipfw_cont_xlat = x;
3213 
3214 	if (nm->arg2 == 0)
3215 		ip_input(m);
3216 	else
3217 		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3218 
3219 	/* May not be cleared, if ipfw was unload/disabled. */
3220 	ctx->ipfw_cont_rule = NULL;
3221 	ctx->ipfw_cont_xlat = NULL;
3222 
3223 	/*
3224 	 * This state is no longer used; decrement its xlat_crefs,
3225 	 * so this state can be deleted.
3226 	 */
3227 	x->xlat_crefs--;
3228 	/*
3229 	 * This rule is no longer used; decrement its cross_refs,
3230 	 * so this rule can be deleted.
3231 	 *
3232 	 * NOTE:
3233 	 * Decrement cross_refs in the last step of this function,
3234 	 * so that the module could be unloaded safely.
3235 	 */
3236 	rule->cross_refs--;
3237 }
3238 
3239 static void
3240 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3241     uint32_t flags)
3242 {
3243 	struct netmsg_genpkt *nm;
3244 
3245 	KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3246 	    x->xlat_pcpu, cpuid));
3247 
3248 	/*
3249 	 * Bump cross_refs to prevent this rule and its siblings
3250 	 * from being deleted, while this mbuf is inflight.  The
3251 	 * cross_refs of the sibling rule on the target cpu will
3252 	 * be decremented, once this mbuf is going to be filtered
3253 	 * on the target cpu.
3254 	 */
3255 	x->xlat_rule->cross_refs++;
3256 	/*
3257 	 * Bump xlat_crefs to prevent this state and its paired
3258 	 * state from being deleted, while this mbuf is inflight.
3259 	 * The xlat_crefs of the paired state on the target cpu
3260 	 * will be decremented, once this mbuf is going to be
3261 	 * filtered on the target cpu.
3262 	 */
3263 	x->xlat_crefs++;
3264 
3265 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3266 	if (flags & IPFW_XLATE_INSERT)
3267 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3268 	if (flags & IPFW_XLATE_FORWARD)
3269 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3270 
3271 	/*
3272 	 * NOTE: We always leave ip_len and ip_off in network
3273 	 *	 order across all network layers.
3274 	 */
3275 	nm = &m->m_hdr.mh_genmsg;
3276 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3277 	    ipfw_ip_xlate_dispatch);
3278 	nm->m = m;
3279 	nm->arg1 = x->xlat_pair;
3280 	nm->arg2 = 0;
3281 	if (flags & IPFW_XLATE_OUTPUT)
3282 		nm->arg2 = 1;
3283 	netisr_sendmsg(&nm->base, cpuid);
3284 }
3285 
3286 static struct mbuf *
3287 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3288     struct ip_fw_local *local, struct ip **ip0)
3289 {
3290 	struct ip *ip = mtod(m, struct ip *);
3291 	struct tcphdr *tcp;
3292 	struct udphdr *udp;
3293 
3294 	/*
3295 	 * Collect parameters into local variables for faster matching.
3296 	 */
3297 	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
3298 		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
3299 		goto done;
3300 	}
3301 
3302 	local->proto = args->f_id.proto = ip->ip_p;
3303 	local->src_ip = ip->ip_src;
3304 	local->dst_ip = ip->ip_dst;
3305 	local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3306 	local->ip_len = ntohs(ip->ip_len);
3307 
3308 #define PULLUP_TO(len)					\
3309 do {							\
3310 	if (m->m_len < (len)) {				\
3311 		args->m = m = m_pullup(m, (len));	\
3312 		if (m == NULL) {			\
3313 			ip = NULL;			\
3314 			goto done;			\
3315 		}					\
3316 		ip = mtod(m, struct ip *);		\
3317 	}						\
3318 } while (0)
3319 
3320 	if (local->offset == 0) {
3321 		switch (local->proto) {
3322 		case IPPROTO_TCP:
3323 			PULLUP_TO(hlen + sizeof(struct tcphdr));
3324 			local->tcp = tcp = L3HDR(struct tcphdr, ip);
3325 			local->dst_port = tcp->th_dport;
3326 			local->src_port = tcp->th_sport;
3327 			args->f_id.flags = tcp->th_flags;
3328 			break;
3329 
3330 		case IPPROTO_UDP:
3331 			PULLUP_TO(hlen + sizeof(struct udphdr));
3332 			udp = L3HDR(struct udphdr, ip);
3333 			local->dst_port = udp->uh_dport;
3334 			local->src_port = udp->uh_sport;
3335 			break;
3336 
3337 		case IPPROTO_ICMP:
3338 			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
3339 			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3340 			break;
3341 
3342 		default:
3343 			break;
3344 		}
3345 	}
3346 
3347 #undef PULLUP_TO
3348 
3349 	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3350 	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3351 	args->f_id.src_port = local->src_port = ntohs(local->src_port);
3352 	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3353 done:
3354 	*ip0 = ip;
3355 	return (m);
3356 }
3357 
3358 static struct mbuf *
3359 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3360     struct ip_fw_local *local, struct ip **ip0)
3361 {
3362 	m->m_flags &= ~M_HASH;
3363 	ip_hashfn(&m, 0);
3364 	args->m = m;
3365 	if (m == NULL) {
3366 		*ip0 = NULL;
3367 		return (NULL);
3368 	}
3369 	KASSERT(m->m_flags & M_HASH, ("no hash"));
3370 
3371 	/* 'm' might be changed by ip_hashfn(). */
3372 	return (ipfw_setup_local(m, hlen, args, local, ip0));
3373 }
3374 
3375 /*
3376  * The main check routine for the firewall.
3377  *
3378  * All arguments are in args so we can modify them and return them
3379  * back to the caller.
3380  *
3381  * Parameters:
3382  *
3383  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
3384  *		Starts with the IP header.
3385  *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
3386  *	args->oif	Outgoing interface, or NULL if packet is incoming.
3387  *		The incoming interface is in the mbuf. (in)
3388  *
3389  *	args->rule	Pointer to the last matching rule (in/out)
3390  *	args->f_id	Addresses grabbed from the packet (out)
3391  *
3392  * Return value:
3393  *
3394  *	If the packet was denied/rejected and has been dropped, *m is equal
3395  *	to NULL upon return.
3396  *
3397  *	IP_FW_DENY	the packet must be dropped.
3398  *	IP_FW_PASS	The packet is to be accepted and routed normally.
3399  *	IP_FW_DIVERT	Divert the packet to port (args->cookie)
3400  *	IP_FW_TEE	Tee the packet to port (args->cookie)
3401  *	IP_FW_DUMMYNET	Send the packet to pipe/queue (args->cookie)
3402  *	IP_FW_CONTINUE	Continue processing on another cpu.
3403  */
3404 static int
3405 ipfw_chk(struct ip_fw_args *args)
3406 {
3407 	/*
3408 	 * Local variables hold state during the processing of a packet.
3409 	 *
3410 	 * IMPORTANT NOTE: to speed up the processing of rules, there
3411 	 * are some assumption on the values of the variables, which
3412 	 * are documented here. Should you change them, please check
3413 	 * the implementation of the various instructions to make sure
3414 	 * that they still work.
3415 	 *
3416 	 * args->eh	The MAC header. It is non-null for a layer2
3417 	 *	packet, it is NULL for a layer-3 packet.
3418 	 *
3419 	 * m | args->m	Pointer to the mbuf, as received from the caller.
3420 	 *	It may change if ipfw_chk() does an m_pullup, or if it
3421 	 *	consumes the packet because it calls send_reject().
3422 	 *	XXX This has to change, so that ipfw_chk() never modifies
3423 	 *	or consumes the buffer.
3424 	 * ip	is simply an alias of the value of m, and it is kept
3425 	 *	in sync with it (the packet is	supposed to start with
3426 	 *	the ip header).
3427 	 */
3428 	struct mbuf *m = args->m;
3429 	struct ip *ip = mtod(m, struct ip *);
3430 
3431 	/*
3432 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
3433 	 *	inbound path (ether_input, ip_input).
3434 	 *	If non-NULL, ipfw_chk has been called on the outbound path
3435 	 *	(ether_output, ip_output).
3436 	 */
3437 	struct ifnet *oif = args->oif;
3438 
3439 	struct ip_fw *f = NULL;		/* matching rule */
3440 	int retval = IP_FW_PASS;
3441 	struct m_tag *mtag;
3442 	struct divert_info *divinfo;
3443 	struct ipfw_state *s;
3444 
3445 	/*
3446 	 * hlen	The length of the IPv4 header.
3447 	 *	hlen >0 means we have an IPv4 packet.
3448 	 */
3449 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
3450 
3451 	struct ip_fw_local lc;
3452 
3453 	/*
3454 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3455 	 * 	MATCH_NONE when checked and not matched (dyn_f = NULL),
3456 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3457 	 */
3458 	int dyn_dir = MATCH_UNKNOWN;
3459 	struct ip_fw *dyn_f = NULL;
3460 	int cpuid = mycpuid;
3461 	struct ipfw_context *ctx;
3462 
3463 	ASSERT_NETISR_NCPUS(cpuid);
3464 	ctx = ipfw_ctx[cpuid];
3465 
3466 	if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3467 		return IP_FW_PASS;	/* accept */
3468 
3469 	if (args->eh == NULL ||		/* layer 3 packet */
3470 	    (m->m_pkthdr.len >= sizeof(struct ip) &&
3471 	     ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3472 		hlen = ip->ip_hl << 2;
3473 
3474 	memset(&lc, 0, sizeof(lc));
3475 
3476 	m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3477 	if (m == NULL)
3478 		goto pullup_failed;
3479 
3480 	if (args->rule) {
3481 		/*
3482 		 * Packet has already been tagged. Look for the next rule
3483 		 * to restart processing.
3484 		 *
3485 		 * If fw_one_pass != 0 then just accept it.
3486 		 * XXX should not happen here, but optimized out in
3487 		 * the caller.
3488 		 */
3489 		if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3490 			return IP_FW_PASS;
3491 		args->flags &= ~IP_FWARG_F_CONT;
3492 
3493 		/* This rule is being/has been flushed */
3494 		if (ipfw_flushing)
3495 			return IP_FW_DENY;
3496 
3497 		KASSERT(args->rule->cpuid == cpuid,
3498 			("rule used on cpu%d", cpuid));
3499 
3500 		/* This rule was deleted */
3501 		if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3502 			return IP_FW_DENY;
3503 
3504 		if (args->xlat != NULL) {
3505 			struct ipfw_xlat *x = args->xlat;
3506 
3507 			/* This xlat is being deleted. */
3508 			if (x->xlat_invalid)
3509 				return IP_FW_DENY;
3510 
3511 			f = args->rule;
3512 
3513 			dyn_f = f;
3514 			dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3515 			    MATCH_FORWARD : MATCH_REVERSE;
3516 
3517 			if (args->flags & IP_FWARG_F_XLATINS) {
3518 				KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3519 				    ("not slave %u state", x->xlat_type));
3520 				s = ipfw_state_link(ctx, &x->xlat_st);
3521 				if (s != NULL) {
3522 					ctx->ipfw_xlate_conflicts++;
3523 					if (IPFW_STATE_ISDEAD(s)) {
3524 						ipfw_state_remove(ctx, s);
3525 						s = ipfw_state_link(ctx,
3526 						    &x->xlat_st);
3527 					}
3528 					if (s != NULL) {
3529 						if (bootverbose) {
3530 							kprintf("ipfw: "
3531 							"slave %u state "
3532 							"conflicts %u state\n",
3533 							x->xlat_type,
3534 							s->st_type);
3535 						}
3536 						ipfw_xlat_invalidate(x);
3537 						return IP_FW_DENY;
3538 					}
3539 					ctx->ipfw_xlate_cresolved++;
3540 				}
3541 			} else {
3542 				ipfw_state_update(&args->f_id, dyn_dir,
3543 				    lc.tcp, &x->xlat_st);
3544 			}
3545 		} else {
3546 			/* TODO: setup dyn_f, dyn_dir */
3547 
3548 			f = args->rule->next_rule;
3549 			if (f == NULL)
3550 				f = lookup_next_rule(args->rule);
3551 		}
3552 	} else {
3553 		/*
3554 		 * Find the starting rule. It can be either the first
3555 		 * one, or the one after divert_rule if asked so.
3556 		 */
3557 		int skipto;
3558 
3559 		KKASSERT((args->flags &
3560 		    (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3561 		KKASSERT(args->xlat == NULL);
3562 
3563 		mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3564 		if (mtag != NULL) {
3565 			divinfo = m_tag_data(mtag);
3566 			skipto = divinfo->skipto;
3567 		} else {
3568 			skipto = 0;
3569 		}
3570 
3571 		f = ctx->ipfw_layer3_chain;
3572 		if (args->eh == NULL && skipto != 0) {
3573 			/* No skipto during rule flushing */
3574 			if (ipfw_flushing)
3575 				return IP_FW_DENY;
3576 
3577 			if (skipto >= IPFW_DEFAULT_RULE)
3578 				return IP_FW_DENY; /* invalid */
3579 
3580 			while (f && f->rulenum <= skipto)
3581 				f = f->next;
3582 			if (f == NULL)	/* drop packet */
3583 				return IP_FW_DENY;
3584 		} else if (ipfw_flushing) {
3585 			/* Rules are being flushed; skip to default rule */
3586 			f = ctx->ipfw_default_rule;
3587 		}
3588 	}
3589 	if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3590 		m_tag_delete(m, mtag);
3591 
3592 	/*
3593 	 * Now scan the rules, and parse microinstructions for each rule.
3594 	 */
3595 	for (; f; f = f->next) {
3596 		int l, cmdlen;
3597 		ipfw_insn *cmd;
3598 		int skip_or; /* skip rest of OR block */
3599 
3600 again:
3601 		if (ctx->ipfw_set_disable & (1 << f->set)) {
3602 			args->xlat = NULL;
3603 			continue;
3604 		}
3605 
3606 		if (args->xlat != NULL) {
3607 			args->xlat = NULL;
3608 			l = f->cmd_len - f->act_ofs;
3609 			cmd = ACTION_PTR(f);
3610 		} else {
3611 			l = f->cmd_len;
3612 			cmd = f->cmd;
3613 		}
3614 
3615 		skip_or = 0;
3616 		for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3617 			int match;
3618 
3619 			/*
3620 			 * check_body is a jump target used when we find a
3621 			 * CHECK_STATE, and need to jump to the body of
3622 			 * the target rule.
3623 			 */
3624 check_body:
3625 			cmdlen = F_LEN(cmd);
3626 			/*
3627 			 * An OR block (insn_1 || .. || insn_n) has the
3628 			 * F_OR bit set in all but the last instruction.
3629 			 * The first match will set "skip_or", and cause
3630 			 * the following instructions to be skipped until
3631 			 * past the one with the F_OR bit clear.
3632 			 */
3633 			if (skip_or) {		/* skip this instruction */
3634 				if ((cmd->len & F_OR) == 0)
3635 					skip_or = 0;	/* next one is good */
3636 				continue;
3637 			}
3638 			match = 0; /* set to 1 if we succeed */
3639 
3640 			switch (cmd->opcode) {
3641 			/*
3642 			 * The first set of opcodes compares the packet's
3643 			 * fields with some pattern, setting 'match' if a
3644 			 * match is found. At the end of the loop there is
3645 			 * logic to deal with F_NOT and F_OR flags associated
3646 			 * with the opcode.
3647 			 */
3648 			case O_NOP:
3649 				match = 1;
3650 				break;
3651 
3652 			case O_FORWARD_MAC:
3653 				kprintf("ipfw: opcode %d unimplemented\n",
3654 					cmd->opcode);
3655 				break;
3656 
3657 			case O_GID:
3658 			case O_UID:
3659 				/*
3660 				 * We only check offset == 0 && proto != 0,
3661 				 * as this ensures that we have an IPv4
3662 				 * packet with the ports info.
3663 				 */
3664 				if (lc.offset!=0)
3665 					break;
3666 
3667 				match = ipfw_match_uid(&args->f_id, oif,
3668 					cmd->opcode,
3669 					(uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3670 				break;
3671 
3672 			case O_RECV:
3673 				match = iface_match(m->m_pkthdr.rcvif,
3674 				    (ipfw_insn_if *)cmd);
3675 				break;
3676 
3677 			case O_XMIT:
3678 				match = iface_match(oif, (ipfw_insn_if *)cmd);
3679 				break;
3680 
3681 			case O_VIA:
3682 				match = iface_match(oif ? oif :
3683 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3684 				break;
3685 
3686 			case O_MACADDR2:
3687 				if (args->eh != NULL) {	/* have MAC header */
3688 					uint32_t *want = (uint32_t *)
3689 						((ipfw_insn_mac *)cmd)->addr;
3690 					uint32_t *mask = (uint32_t *)
3691 						((ipfw_insn_mac *)cmd)->mask;
3692 					uint32_t *hdr = (uint32_t *)args->eh;
3693 
3694 					match =
3695 					(want[0] == (hdr[0] & mask[0]) &&
3696 					 want[1] == (hdr[1] & mask[1]) &&
3697 					 want[2] == (hdr[2] & mask[2]));
3698 				}
3699 				break;
3700 
3701 			case O_MAC_TYPE:
3702 				if (args->eh != NULL) {
3703 					uint16_t t =
3704 					    ntohs(args->eh->ether_type);
3705 					uint16_t *p =
3706 					    ((ipfw_insn_u16 *)cmd)->ports;
3707 					int i;
3708 
3709 					/* Special vlan handling */
3710 					if (m->m_flags & M_VLANTAG)
3711 						t = ETHERTYPE_VLAN;
3712 
3713 					for (i = cmdlen - 1; !match && i > 0;
3714 					     i--, p += 2) {
3715 						match =
3716 						(t >= p[0] && t <= p[1]);
3717 					}
3718 				}
3719 				break;
3720 
3721 			case O_FRAG:
3722 				match = (hlen > 0 && lc.offset != 0);
3723 				break;
3724 
3725 			case O_IPFRAG:
3726 				if (hlen > 0) {
3727 					uint16_t off;
3728 
3729 					off = ntohs(ip->ip_off);
3730 					if (off & (IP_MF | IP_OFFMASK))
3731 						match = 1;
3732 				}
3733 				break;
3734 
3735 			case O_IN:	/* "out" is "not in" */
3736 				match = (oif == NULL);
3737 				break;
3738 
3739 			case O_LAYER2:
3740 				match = (args->eh != NULL);
3741 				break;
3742 
3743 			case O_PROTO:
3744 				/*
3745 				 * We do not allow an arg of 0 so the
3746 				 * check of "proto" only suffices.
3747 				 */
3748 				match = (lc.proto == cmd->arg1);
3749 				break;
3750 
3751 			case O_IP_SRC:
3752 				match = (hlen > 0 &&
3753 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3754 				    lc.src_ip.s_addr);
3755 				break;
3756 
3757 			case O_IP_SRC_MASK:
3758 				match = (hlen > 0 &&
3759 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3760 				     (lc.src_ip.s_addr &
3761 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3762 				break;
3763 
3764 			case O_IP_SRC_ME:
3765 				if (hlen > 0) {
3766 					struct ifnet *tif;
3767 
3768 					tif = INADDR_TO_IFP(&lc.src_ip);
3769 					match = (tif != NULL);
3770 				}
3771 				break;
3772 
3773 			case O_IP_SRC_TABLE:
3774 				match = ipfw_table_lookup(ctx, cmd->arg1,
3775 				    &lc.src_ip);
3776 				break;
3777 
3778 			case O_IP_SRC_IFIP:
3779 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3780 				    &lc.src_ip);
3781 				break;
3782 
3783 			case O_IP_DST_SET:
3784 			case O_IP_SRC_SET:
3785 				if (hlen > 0) {
3786 					uint32_t *d = (uint32_t *)(cmd + 1);
3787 					uint32_t addr =
3788 					    cmd->opcode == O_IP_DST_SET ?
3789 						args->f_id.dst_ip :
3790 						args->f_id.src_ip;
3791 
3792 					if (addr < d[0])
3793 						break;
3794 					addr -= d[0]; /* subtract base */
3795 					match =
3796 					(addr < cmd->arg1) &&
3797 					 (d[1 + (addr >> 5)] &
3798 					  (1 << (addr & 0x1f)));
3799 				}
3800 				break;
3801 
3802 			case O_IP_DST:
3803 				match = (hlen > 0 &&
3804 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3805 				    lc.dst_ip.s_addr);
3806 				break;
3807 
3808 			case O_IP_DST_MASK:
3809 				match = (hlen > 0) &&
3810 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3811 				     (lc.dst_ip.s_addr &
3812 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3813 				break;
3814 
3815 			case O_IP_DST_ME:
3816 				if (hlen > 0) {
3817 					struct ifnet *tif;
3818 
3819 					tif = INADDR_TO_IFP(&lc.dst_ip);
3820 					match = (tif != NULL);
3821 				}
3822 				break;
3823 
3824 			case O_IP_DST_TABLE:
3825 				match = ipfw_table_lookup(ctx, cmd->arg1,
3826 				    &lc.dst_ip);
3827 				break;
3828 
3829 			case O_IP_DST_IFIP:
3830 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3831 				    &lc.dst_ip);
3832 				break;
3833 
3834 			case O_IP_SRCPORT:
3835 			case O_IP_DSTPORT:
3836 				/*
3837 				 * offset == 0 && proto != 0 is enough
3838 				 * to guarantee that we have an IPv4
3839 				 * packet with port info.
3840 				 */
3841 				if ((lc.proto==IPPROTO_UDP ||
3842 				     lc.proto==IPPROTO_TCP)
3843 				    && lc.offset == 0) {
3844 					uint16_t x =
3845 					    (cmd->opcode == O_IP_SRCPORT) ?
3846 						lc.src_port : lc.dst_port;
3847 					uint16_t *p =
3848 					    ((ipfw_insn_u16 *)cmd)->ports;
3849 					int i;
3850 
3851 					for (i = cmdlen - 1; !match && i > 0;
3852 					     i--, p += 2) {
3853 						match =
3854 						(x >= p[0] && x <= p[1]);
3855 					}
3856 				}
3857 				break;
3858 
3859 			case O_ICMPCODE:
3860 				match = (lc.offset == 0 &&
3861 				    lc.proto==IPPROTO_ICMP &&
3862 				    icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3863 				break;
3864 
3865 			case O_ICMPTYPE:
3866 				match = (lc.offset == 0 &&
3867 				    lc.proto==IPPROTO_ICMP &&
3868 				    icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3869 				break;
3870 
3871 			case O_IPOPT:
3872 				match = (hlen > 0 && ipopts_match(ip, cmd));
3873 				break;
3874 
3875 			case O_IPVER:
3876 				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3877 				break;
3878 
3879 			case O_IPTTL:
3880 				match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3881 				break;
3882 
3883 			case O_IPID:
3884 				match = (hlen > 0 &&
3885 				    cmd->arg1 == ntohs(ip->ip_id));
3886 				break;
3887 
3888 			case O_IPLEN:
3889 				match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3890 				break;
3891 
3892 			case O_IPPRECEDENCE:
3893 				match = (hlen > 0 &&
3894 				    (cmd->arg1 == (ip->ip_tos & 0xe0)));
3895 				break;
3896 
3897 			case O_IPTOS:
3898 				match = (hlen > 0 &&
3899 				    flags_match(cmd, ip->ip_tos));
3900 				break;
3901 
3902 			case O_TCPFLAGS:
3903 				match = (lc.proto == IPPROTO_TCP &&
3904 				    lc.offset == 0 &&
3905 				    flags_match(cmd,
3906 					L3HDR(struct tcphdr,ip)->th_flags));
3907 				break;
3908 
3909 			case O_TCPOPTS:
3910 				match = (lc.proto == IPPROTO_TCP &&
3911 				    lc.offset == 0 && tcpopts_match(ip, cmd));
3912 				break;
3913 
3914 			case O_TCPSEQ:
3915 				match = (lc.proto == IPPROTO_TCP &&
3916 				    lc.offset == 0 &&
3917 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3918 					L3HDR(struct tcphdr,ip)->th_seq);
3919 				break;
3920 
3921 			case O_TCPACK:
3922 				match = (lc.proto == IPPROTO_TCP &&
3923 				    lc.offset == 0 &&
3924 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3925 					L3HDR(struct tcphdr,ip)->th_ack);
3926 				break;
3927 
3928 			case O_TCPWIN:
3929 				match = (lc.proto == IPPROTO_TCP &&
3930 				    lc.offset == 0 &&
3931 				    cmd->arg1 ==
3932 					L3HDR(struct tcphdr,ip)->th_win);
3933 				break;
3934 
3935 			case O_ESTAB:
3936 				/* reject packets which have SYN only */
3937 				/* XXX should i also check for TH_ACK ? */
3938 				match = (lc.proto == IPPROTO_TCP &&
3939 				    lc.offset == 0 &&
3940 				    (L3HDR(struct tcphdr,ip)->th_flags &
3941 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3942 				break;
3943 
3944 			case O_LOG:
3945 				if (fw_verbose) {
3946 					ipfw_log(ctx, f, hlen, args->eh, m,
3947 					    oif);
3948 				}
3949 				match = 1;
3950 				break;
3951 
3952 			case O_PROB:
3953 				match = (krandom() <
3954 					((ipfw_insn_u32 *)cmd)->d[0]);
3955 				break;
3956 
3957 			/*
3958 			 * The second set of opcodes represents 'actions',
3959 			 * i.e. the terminal part of a rule once the packet
3960 			 * matches all previous patterns.
3961 			 * Typically there is only one action for each rule,
3962 			 * and the opcode is stored at the end of the rule
3963 			 * (but there are exceptions -- see below).
3964 			 *
3965 			 * In general, here we set retval and terminate the
3966 			 * outer loop (would be a 'break 3' in some language,
3967 			 * but we need to do a 'goto done').
3968 			 *
3969 			 * Exceptions:
3970 			 * O_COUNT and O_SKIPTO actions:
3971 			 *   instead of terminating, we jump to the next rule
3972 			 *   ('goto next_rule', equivalent to a 'break 2'),
3973 			 *   or to the SKIPTO target ('goto again' after
3974 			 *   having set f, cmd and l), respectively.
3975 			 *
3976 			 * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
3977 			 *   are not real 'actions', and are stored right
3978 			 *   before the 'action' part of the rule.
3979 			 *   These opcodes try to install an entry in the
3980 			 *   state tables; if successful, we continue with
3981 			 *   the next opcode (match=1; break;), otherwise
3982 			 *   the packet must be dropped ('goto done' after
3983 			 *   setting retval).  If static rules are changed
3984 			 *   during the state installation, the packet will
3985 			 *   be dropped and rule's stats will not beupdated
3986 			 *   ('return IP_FW_DENY').
3987 			 *
3988 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
3989 			 *   cause a lookup of the state table, and a jump
3990 			 *   to the 'action' part of the parent rule
3991 			 *   ('goto check_body') if an entry is found, or
3992 			 *   (CHECK_STATE only) a jump to the next rule if
3993 			 *   the entry is not found ('goto next_rule').
3994 			 *   The result of the lookup is cached to make
3995 			 *   further instances of these opcodes are
3996 			 *   effectively NOPs.  If static rules are changed
3997 			 *   during the state looking up, the packet will
3998 			 *   be dropped and rule's stats will not be updated
3999 			 *   ('return IP_FW_DENY').
4000 			 */
4001 			case O_REDIRECT:
4002 				if (f->cross_rules == NULL) {
4003 					/*
4004 					 * This rule was not completely setup;
4005 					 * move on to the next rule.
4006 					 */
4007 					goto next_rule;
4008 				}
4009 				/*
4010 				 * Apply redirect only on input path and
4011 				 * only to non-fragment TCP segments or
4012 				 * UDP datagrams.
4013 				 *
4014 				 * Does _not_ work with layer2 filtering.
4015 				 */
4016 				if (oif != NULL || args->eh != NULL ||
4017 				    (ip->ip_off & htons(IP_MF | IP_OFFMASK)) ||
4018 				    (lc.proto != IPPROTO_TCP &&
4019 				     lc.proto != IPPROTO_UDP))
4020 					break;
4021 				/* FALL THROUGH */
4022 			case O_LIMIT:
4023 			case O_KEEP_STATE:
4024 				if (hlen == 0)
4025 					break;
4026 				s = ipfw_state_install(ctx, f,
4027 				    (ipfw_insn_limit *)cmd, args, lc.tcp);
4028 				if (s == NULL) {
4029 					retval = IP_FW_DENY;
4030 					goto done; /* error/limit violation */
4031 				}
4032 				s->st_pcnt++;
4033 				s->st_bcnt += lc.ip_len;
4034 
4035 				if (s->st_type == O_REDIRECT) {
4036 					struct in_addr oaddr;
4037 					uint16_t oport;
4038 					struct ipfw_xlat *slave_x, *x;
4039 					struct ipfw_state *dup;
4040 
4041 					x = (struct ipfw_xlat *)s;
4042 					ipfw_xlate(x, m, &oaddr, &oport);
4043 					m = ipfw_rehashm(m, hlen, args, &lc,
4044 					    &ip);
4045 					if (m == NULL) {
4046 						ipfw_state_del(ctx, s);
4047 						goto pullup_failed;
4048 					}
4049 
4050 					cpuid = netisr_hashcpu(
4051 					    m->m_pkthdr.hash);
4052 
4053 					slave_x = (struct ipfw_xlat *)
4054 					    ipfw_state_alloc(ctx, &args->f_id,
4055 					    O_REDIRECT, f->cross_rules[cpuid],
4056 					    lc.tcp);
4057 					if (slave_x == NULL) {
4058 						ipfw_state_del(ctx, s);
4059 						retval = IP_FW_DENY;
4060 						goto done;
4061 					}
4062 					slave_x->xlat_addr = oaddr.s_addr;
4063 					slave_x->xlat_port = oport;
4064 					slave_x->xlat_dir = MATCH_REVERSE;
4065 					slave_x->xlat_flags |=
4066 					    IPFW_STATE_F_XLATSRC |
4067 					    IPFW_STATE_F_XLATSLAVE;
4068 
4069 					slave_x->xlat_pair = x;
4070 					slave_x->xlat_pcpu = mycpuid;
4071 					x->xlat_pair = slave_x;
4072 					x->xlat_pcpu = cpuid;
4073 
4074 					ctx->ipfw_xlated++;
4075 					if (cpuid != mycpuid) {
4076 						ctx->ipfw_xlate_split++;
4077 						ipfw_xlate_redispatch(
4078 						    m, cpuid, x,
4079 						    IPFW_XLATE_INSERT |
4080 						    IPFW_XLATE_FORWARD);
4081 						args->m = NULL;
4082 						return (IP_FW_REDISPATCH);
4083 					}
4084 
4085 					dup = ipfw_state_link(ctx,
4086 					    &slave_x->xlat_st);
4087 					if (dup != NULL) {
4088 						ctx->ipfw_xlate_conflicts++;
4089 						if (IPFW_STATE_ISDEAD(dup)) {
4090 							ipfw_state_remove(ctx,
4091 							    dup);
4092 							dup = ipfw_state_link(
4093 							ctx, &slave_x->xlat_st);
4094 						}
4095 						if (dup != NULL) {
4096 							if (bootverbose) {
4097 							    kprintf("ipfw: "
4098 							    "slave %u state "
4099 							    "conflicts "
4100 							    "%u state\n",
4101 							    x->xlat_type,
4102 							    s->st_type);
4103 							}
4104 							ipfw_state_del(ctx, s);
4105 							return (IP_FW_DENY);
4106 						}
4107 						ctx->ipfw_xlate_cresolved++;
4108 					}
4109 				}
4110 				match = 1;
4111 				break;
4112 
4113 			case O_PROBE_STATE:
4114 			case O_CHECK_STATE:
4115 				/*
4116 				 * States are checked at the first keep-state
4117 				 * check-state occurrence, with the result
4118 				 * being stored in dyn_dir.  The compiler
4119 				 * introduces a PROBE_STATE instruction for
4120 				 * us when we have a KEEP_STATE/LIMIT/RDR
4121 				 * (because PROBE_STATE needs to be run first).
4122 				 */
4123 				s = NULL;
4124 				if (dyn_dir == MATCH_UNKNOWN) {
4125 					s = ipfw_state_lookup(ctx,
4126 					    &args->f_id, &dyn_dir, lc.tcp);
4127 				}
4128 				if (s == NULL ||
4129 				    (s->st_type == O_REDIRECT &&
4130 				     (args->eh != NULL ||
4131 				      (ip->ip_off & htons(IP_MF | IP_OFFMASK)) ||
4132 				      (lc.proto != IPPROTO_TCP &&
4133 				       lc.proto != IPPROTO_UDP)))) {
4134 					/*
4135 					 * State not found. If CHECK_STATE,
4136 					 * skip to next rule, if PROBE_STATE
4137 					 * just ignore and continue with next
4138 					 * opcode.
4139 					 */
4140 					if (cmd->opcode == O_CHECK_STATE)
4141 						goto next_rule;
4142 					match = 1;
4143 					break;
4144 				}
4145 
4146 				s->st_pcnt++;
4147 				s->st_bcnt += lc.ip_len;
4148 
4149 				if (s->st_type == O_REDIRECT) {
4150 					struct ipfw_xlat *x =
4151 					    (struct ipfw_xlat *)s;
4152 
4153 					if (oif != NULL &&
4154 					    x->xlat_ifp == NULL) {
4155 						KASSERT(x->xlat_flags &
4156 						    IPFW_STATE_F_XLATSLAVE,
4157 						    ("master rdr state "
4158 						     "missing ifp"));
4159 						x->xlat_ifp = oif;
4160 					} else if (
4161 					    (oif != NULL && x->xlat_ifp!=oif) ||
4162 					    (oif == NULL &&
4163 					     x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4164 						retval = IP_FW_DENY;
4165 						goto done;
4166 					}
4167 					if (x->xlat_dir != dyn_dir)
4168 						goto skip_xlate;
4169 
4170 					ipfw_xlate(x, m, NULL, NULL);
4171 					m = ipfw_rehashm(m, hlen, args, &lc,
4172 					    &ip);
4173 					if (m == NULL)
4174 						goto pullup_failed;
4175 
4176 					cpuid = netisr_hashcpu(
4177 					    m->m_pkthdr.hash);
4178 					if (cpuid != mycpuid) {
4179 						uint32_t xlate = 0;
4180 
4181 						if (oif != NULL) {
4182 							xlate |=
4183 							    IPFW_XLATE_OUTPUT;
4184 						}
4185 						if (dyn_dir == MATCH_FORWARD) {
4186 							xlate |=
4187 							    IPFW_XLATE_FORWARD;
4188 						}
4189 						ipfw_xlate_redispatch(m, cpuid,
4190 						    x, xlate);
4191 						args->m = NULL;
4192 						return (IP_FW_REDISPATCH);
4193 					}
4194 
4195 					KKASSERT(x->xlat_pcpu == mycpuid);
4196 					ipfw_state_update(&args->f_id, dyn_dir,
4197 					    lc.tcp, &x->xlat_pair->xlat_st);
4198 				}
4199 skip_xlate:
4200 				/*
4201 				 * Found a rule from a state; jump to the
4202 				 * 'action' part of the rule.
4203 				 */
4204 				f = s->st_rule;
4205 				KKASSERT(f->cpuid == mycpuid);
4206 
4207 				cmd = ACTION_PTR(f);
4208 				l = f->cmd_len - f->act_ofs;
4209 				dyn_f = f;
4210 				goto check_body;
4211 
4212 			case O_ACCEPT:
4213 				retval = IP_FW_PASS;	/* accept */
4214 				goto done;
4215 
4216 			case O_DEFRAG:
4217 				if (f->cross_rules == NULL) {
4218 					/*
4219 					 * This rule was not completely setup;
4220 					 * move on to the next rule.
4221 					 */
4222 					goto next_rule;
4223 				}
4224 
4225 				/*
4226 				 * Don't defrag for l2 packets, output packets
4227 				 * or non-fragments.
4228 				 */
4229 				if (oif != NULL || args->eh != NULL ||
4230 				    (ip->ip_off & htons(IP_MF | IP_OFFMASK)) == 0)
4231 					goto next_rule;
4232 
4233 				ctx->ipfw_frags++;
4234 				m = ip_reass(m);
4235 				args->m = m;
4236 				if (m == NULL) {
4237 					retval = IP_FW_PASS;
4238 					goto done;
4239 				}
4240 				ctx->ipfw_defraged++;
4241 				KASSERT((m->m_flags & M_HASH) == 0,
4242 				    ("hash not cleared"));
4243 
4244 				/* Update statistics */
4245 				f->pcnt++;
4246 				f->bcnt += lc.ip_len;
4247 				f->timestamp = time_second;
4248 
4249 				ip = mtod(m, struct ip *);
4250 				hlen = ip->ip_hl << 2;
4251 				ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
4252 
4253 				ip_hashfn(&m, 0);
4254 				args->m = m;
4255 				if (m == NULL)
4256 					goto pullup_failed;
4257 
4258 				KASSERT(m->m_flags & M_HASH, ("no hash"));
4259 				cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4260 				if (cpuid != mycpuid) {
4261 					ctx->ipfw_defrag_remote++;
4262 					ipfw_defrag_redispatch(m, cpuid, f);
4263 					args->m = NULL;
4264 					return (IP_FW_REDISPATCH);
4265 				}
4266 
4267 				/* 'm' might be changed by ip_hashfn(). */
4268 				ip = mtod(m, struct ip *);
4269 
4270 				m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4271 				if (m == NULL)
4272 					goto pullup_failed;
4273 
4274 				/* Move on. */
4275 				goto next_rule;
4276 
4277 			case O_PIPE:
4278 			case O_QUEUE:
4279 				args->rule = f; /* report matching rule */
4280 				args->cookie = cmd->arg1;
4281 				retval = IP_FW_DUMMYNET;
4282 				goto done;
4283 
4284 			case O_DIVERT:
4285 			case O_TEE:
4286 				if (args->eh) /* not on layer 2 */
4287 					break;
4288 
4289 				mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4290 				    sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4291 				if (mtag == NULL) {
4292 					retval = IP_FW_DENY;
4293 					goto done;
4294 				}
4295 				divinfo = m_tag_data(mtag);
4296 
4297 				divinfo->skipto = f->rulenum;
4298 				divinfo->port = cmd->arg1;
4299 				divinfo->tee = (cmd->opcode == O_TEE);
4300 				m_tag_prepend(m, mtag);
4301 
4302 				args->cookie = cmd->arg1;
4303 				retval = (cmd->opcode == O_DIVERT) ?
4304 					 IP_FW_DIVERT : IP_FW_TEE;
4305 				goto done;
4306 
4307 			case O_COUNT:
4308 			case O_SKIPTO:
4309 				f->pcnt++;	/* update stats */
4310 				f->bcnt += lc.ip_len;
4311 				f->timestamp = time_second;
4312 				if (cmd->opcode == O_COUNT)
4313 					goto next_rule;
4314 				/* handle skipto */
4315 				if (f->next_rule == NULL)
4316 					lookup_next_rule(f);
4317 				f = f->next_rule;
4318 				goto again;
4319 
4320 			case O_REJECT:
4321 				/*
4322 				 * Drop the packet and send a reject notice
4323 				 * if the packet is not ICMP (or is an ICMP
4324 				 * query), and it is not multicast/broadcast.
4325 				 */
4326 				if (hlen > 0 &&
4327 				    (lc.proto != IPPROTO_ICMP ||
4328 				     is_icmp_query(ip)) &&
4329 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
4330 				    !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4331 					send_reject(args, cmd->arg1,
4332 					    lc.offset, lc.ip_len);
4333 					retval = IP_FW_DENY;
4334 					goto done;
4335 				}
4336 				/* FALLTHROUGH */
4337 			case O_DENY:
4338 				retval = IP_FW_DENY;
4339 				goto done;
4340 
4341 			case O_FORWARD_IP:
4342 				if (args->eh)	/* not valid on layer2 pkts */
4343 					break;
4344 				if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4345 					struct sockaddr_in *sin;
4346 
4347 					mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4348 					    sizeof(*sin), M_INTWAIT | M_NULLOK);
4349 					if (mtag == NULL) {
4350 						retval = IP_FW_DENY;
4351 						goto done;
4352 					}
4353 					sin = m_tag_data(mtag);
4354 
4355 					/* Structure copy */
4356 					*sin = ((ipfw_insn_sa *)cmd)->sa;
4357 
4358 					m_tag_prepend(m, mtag);
4359 					m->m_pkthdr.fw_flags |=
4360 						IPFORWARD_MBUF_TAGGED;
4361 					m->m_pkthdr.fw_flags &=
4362 						~BRIDGE_MBUF_TAGGED;
4363 				}
4364 				retval = IP_FW_PASS;
4365 				goto done;
4366 
4367 			default:
4368 				panic("-- unknown opcode %d", cmd->opcode);
4369 			} /* end of switch() on opcodes */
4370 
4371 			if (cmd->len & F_NOT)
4372 				match = !match;
4373 
4374 			if (match) {
4375 				if (cmd->len & F_OR)
4376 					skip_or = 1;
4377 			} else {
4378 				if (!(cmd->len & F_OR)) /* not an OR block, */
4379 					break;		/* try next rule    */
4380 			}
4381 
4382 		}	/* end of inner for, scan opcodes */
4383 
4384 next_rule:;		/* try next rule		*/
4385 
4386 	}		/* end of outer for, scan rules */
4387 	kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4388 	return IP_FW_DENY;
4389 
4390 done:
4391 	/* Update statistics */
4392 	f->pcnt++;
4393 	f->bcnt += lc.ip_len;
4394 	f->timestamp = time_second;
4395 	return retval;
4396 
4397 pullup_failed:
4398 	if (fw_verbose)
4399 		kprintf("pullup failed\n");
4400 	return IP_FW_DENY;
4401 }
4402 
4403 static struct mbuf *
4404 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4405 {
4406 	struct m_tag *mtag;
4407 	struct dn_pkt *pkt;
4408 	ipfw_insn *cmd;
4409 	const struct ipfw_flow_id *id;
4410 	struct dn_flow_id *fid;
4411 
4412 	M_ASSERTPKTHDR(m);
4413 
4414 	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4415 	    M_INTWAIT | M_NULLOK);
4416 	if (mtag == NULL) {
4417 		m_freem(m);
4418 		return (NULL);
4419 	}
4420 	m_tag_prepend(m, mtag);
4421 
4422 	pkt = m_tag_data(mtag);
4423 	bzero(pkt, sizeof(*pkt));
4424 
4425 	cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4426 	if (cmd->opcode == O_LOG)
4427 		cmd += F_LEN(cmd);
4428 	KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4429 		("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4430 
4431 	pkt->dn_m = m;
4432 	pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4433 	pkt->ifp = fwa->oif;
4434 	pkt->pipe_nr = pipe_nr;
4435 
4436 	pkt->cpuid = mycpuid;
4437 	pkt->msgport = netisr_curport();
4438 
4439 	id = &fwa->f_id;
4440 	fid = &pkt->id;
4441 	fid->fid_dst_ip = id->dst_ip;
4442 	fid->fid_src_ip = id->src_ip;
4443 	fid->fid_dst_port = id->dst_port;
4444 	fid->fid_src_port = id->src_port;
4445 	fid->fid_proto = id->proto;
4446 	fid->fid_flags = id->flags;
4447 
4448 	ipfw_ref_rule(fwa->rule);
4449 	pkt->dn_priv = fwa->rule;
4450 	pkt->dn_unref_priv = ipfw_unref_rule;
4451 
4452 	if (cmd->opcode == O_PIPE)
4453 		pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4454 
4455 	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4456 	return (m);
4457 }
4458 
4459 /*
4460  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4461  * These will be reconstructed on the fly as packets are matched.
4462  */
4463 static void
4464 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4465 {
4466 	struct ip_fw *rule;
4467 
4468 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4469 		rule->next_rule = NULL;
4470 }
4471 
4472 static void
4473 ipfw_inc_static_count(struct ip_fw *rule)
4474 {
4475 	/* Static rule's counts are updated only on CPU0 */
4476 	KKASSERT(mycpuid == 0);
4477 
4478 	static_count++;
4479 	static_ioc_len += IOC_RULESIZE(rule);
4480 }
4481 
4482 static void
4483 ipfw_dec_static_count(struct ip_fw *rule)
4484 {
4485 	int l = IOC_RULESIZE(rule);
4486 
4487 	/* Static rule's counts are updated only on CPU0 */
4488 	KKASSERT(mycpuid == 0);
4489 
4490 	KASSERT(static_count > 0, ("invalid static count %u", static_count));
4491 	static_count--;
4492 
4493 	KASSERT(static_ioc_len >= l,
4494 		("invalid static len %u", static_ioc_len));
4495 	static_ioc_len -= l;
4496 }
4497 
4498 static void
4499 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4500 {
4501 	if (fwmsg->sibling != NULL) {
4502 		KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4503 		fwmsg->sibling->sibling = rule;
4504 	}
4505 	fwmsg->sibling = rule;
4506 }
4507 
4508 static struct ip_fw *
4509 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4510 {
4511 	struct ip_fw *rule;
4512 
4513 	rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4514 
4515 	rule->act_ofs = ioc_rule->act_ofs;
4516 	rule->cmd_len = ioc_rule->cmd_len;
4517 	rule->rulenum = ioc_rule->rulenum;
4518 	rule->set = ioc_rule->set;
4519 	rule->usr_flags = ioc_rule->usr_flags;
4520 
4521 	bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4522 
4523 	rule->refcnt = 1;
4524 	rule->cpuid = mycpuid;
4525 	rule->rule_flags = rule_flags;
4526 
4527 	return rule;
4528 }
4529 
4530 static void
4531 ipfw_add_rule_dispatch(netmsg_t nmsg)
4532 {
4533 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4534 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4535 	struct ip_fw *rule;
4536 
4537 	ASSERT_NETISR_NCPUS(mycpuid);
4538 
4539 	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
4540 
4541 	/*
4542 	 * Insert rule into the pre-determined position
4543 	 */
4544 	if (fwmsg->prev_rule != NULL) {
4545 		struct ip_fw *prev, *next;
4546 
4547 		prev = fwmsg->prev_rule;
4548 		KKASSERT(prev->cpuid == mycpuid);
4549 
4550 		next = fwmsg->next_rule;
4551 		KKASSERT(next->cpuid == mycpuid);
4552 
4553 		rule->next = next;
4554 		prev->next = rule;
4555 
4556 		/*
4557 		 * Move to the position on the next CPU
4558 		 * before the msg is forwarded.
4559 		 */
4560 		fwmsg->prev_rule = prev->sibling;
4561 		fwmsg->next_rule = next->sibling;
4562 	} else {
4563 		KKASSERT(fwmsg->next_rule == NULL);
4564 		rule->next = ctx->ipfw_layer3_chain;
4565 		ctx->ipfw_layer3_chain = rule;
4566 	}
4567 
4568 	/* Link rule CPU sibling */
4569 	ipfw_link_sibling(fwmsg, rule);
4570 
4571 	ipfw_flush_rule_ptrs(ctx);
4572 
4573 	if (mycpuid == 0) {
4574 		/* Statistics only need to be updated once */
4575 		ipfw_inc_static_count(rule);
4576 
4577 		/* Return the rule on CPU0 */
4578 		nmsg->lmsg.u.ms_resultp = rule;
4579 	}
4580 
4581 	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
4582 		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
4583 
4584 	if (fwmsg->cross_rules != NULL) {
4585 		/* Save rules for later use. */
4586 		fwmsg->cross_rules[mycpuid] = rule;
4587 	}
4588 
4589 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4590 }
4591 
4592 static void
4593 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4594 {
4595 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4596 	struct ip_fw *rule = fwmsg->sibling;
4597 	int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4598 
4599 	ASSERT_NETISR_NCPUS(mycpuid);
4600 	KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4601 	    ("not crossref rule"));
4602 
4603 	rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4604 	memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4605 
4606 	fwmsg->sibling = rule->sibling;
4607 	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4608 }
4609 
4610 /*
4611  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
4612  * then possibly create a rule number and add the rule to the list.
4613  * Update the rule_number in the input struct so the caller knows
4614  * it as well.
4615  */
4616 static void
4617 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4618 {
4619 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4620 	struct netmsg_ipfw fwmsg;
4621 	struct ip_fw *f, *prev, *rule;
4622 
4623 	ASSERT_NETISR0;
4624 
4625 	/*
4626 	 * If rulenum is 0, find highest numbered rule before the
4627 	 * default rule, and add rule number incremental step.
4628 	 */
4629 	if (ioc_rule->rulenum == 0) {
4630 		int step = autoinc_step;
4631 
4632 		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
4633 			 step <= IPFW_AUTOINC_STEP_MAX);
4634 
4635 		/*
4636 		 * Locate the highest numbered rule before default
4637 		 */
4638 		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
4639 			if (f->rulenum == IPFW_DEFAULT_RULE)
4640 				break;
4641 			ioc_rule->rulenum = f->rulenum;
4642 		}
4643 		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
4644 			ioc_rule->rulenum += step;
4645 	}
4646 	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
4647 		ioc_rule->rulenum != 0,
4648 		("invalid rule num %d", ioc_rule->rulenum));
4649 
4650 	/*
4651 	 * Now find the right place for the new rule in the sorted list.
4652 	 */
4653 	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
4654 	     prev = f, f = f->next) {
4655 		if (f->rulenum > ioc_rule->rulenum) {
4656 			/* Found the location */
4657 			break;
4658 		}
4659 	}
4660 	KASSERT(f != NULL, ("no default rule?!"));
4661 
4662 	/*
4663 	 * Duplicate the rule onto each CPU.
4664 	 * The rule duplicated on CPU0 will be returned.
4665 	 */
4666 	bzero(&fwmsg, sizeof(fwmsg));
4667 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4668 	    ipfw_add_rule_dispatch);
4669 	fwmsg.ioc_rule = ioc_rule;
4670 	fwmsg.prev_rule = prev;
4671 	fwmsg.next_rule = prev == NULL ? NULL : f;
4672 	fwmsg.rule_flags = rule_flags;
4673 	if (rule_flags & IPFW_RULE_F_CROSSREF) {
4674 		fwmsg.cross_rules = kmalloc(
4675 		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
4676 		    M_WAITOK | M_ZERO);
4677 	}
4678 
4679 	netisr_domsg_global(&fwmsg.base);
4680 	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
4681 
4682 	rule = fwmsg.base.lmsg.u.ms_resultp;
4683 	KKASSERT(rule != NULL && rule->cpuid == mycpuid);
4684 
4685 	if (fwmsg.cross_rules != NULL) {
4686 		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
4687 		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
4688 		fwmsg.sibling = rule;
4689 		netisr_domsg_global(&fwmsg.base);
4690 		KKASSERT(fwmsg.sibling == NULL);
4691 
4692 		kfree(fwmsg.cross_rules, M_TEMP);
4693 
4694 #ifdef KLD_MODULE
4695 		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
4696 #endif
4697 	}
4698 
4699 	DPRINTF("++ installed rule %d, static count now %d\n",
4700 		rule->rulenum, static_count);
4701 }
4702 
4703 /*
4704  * Free storage associated with a static rule (including derived
4705  * states/tracks).
4706  * The caller is in charge of clearing rule pointers to avoid
4707  * dangling pointers.
4708  * @return a pointer to the next entry.
4709  * Arguments are not checked, so they better be correct.
4710  */
4711 static struct ip_fw *
4712 ipfw_delete_rule(struct ipfw_context *ctx,
4713 		 struct ip_fw *prev, struct ip_fw *rule)
4714 {
4715 	struct ip_fw *n;
4716 
4717 	n = rule->next;
4718 	if (prev == NULL)
4719 		ctx->ipfw_layer3_chain = n;
4720 	else
4721 		prev->next = n;
4722 
4723 	/* Mark the rule as invalid */
4724 	rule->rule_flags |= IPFW_RULE_F_INVALID;
4725 	rule->next_rule = NULL;
4726 	rule->sibling = NULL;
4727 #ifdef foo
4728 	/* Don't reset cpuid here; keep various assertion working */
4729 	rule->cpuid = -1;
4730 #endif
4731 
4732 	/* Statistics only need to be updated once */
4733 	if (mycpuid == 0)
4734 		ipfw_dec_static_count(rule);
4735 
4736 	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4737 		/* Try to free this rule */
4738 		ipfw_free_rule(rule);
4739 	} else {
4740 		/* TODO: check staging area. */
4741 		if (mycpuid == 0) {
4742 			rule->next = ipfw_gd.ipfw_crossref_free;
4743 			ipfw_gd.ipfw_crossref_free = rule;
4744 		}
4745 	}
4746 
4747 	/* Return the next rule */
4748 	return n;
4749 }
4750 
4751 static void
4752 ipfw_flush_dispatch(netmsg_t nmsg)
4753 {
4754 	int kill_default = nmsg->lmsg.u.ms_result;
4755 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4756 	struct ip_fw *rule;
4757 
4758 	ASSERT_NETISR_NCPUS(mycpuid);
4759 
4760 	/*
4761 	 * Flush states.
4762 	 */
4763 	ipfw_state_flush(ctx, NULL);
4764 	KASSERT(ctx->ipfw_state_cnt == 0,
4765 	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
4766 	ctx->ipfw_state_loosecnt = 0;
4767 	ctx->ipfw_state_lastexp = 0;
4768 
4769 	/*
4770 	 * Flush tracks.
4771 	 */
4772 	ipfw_track_flush(ctx, NULL);
4773 	ctx->ipfw_track_lastexp = 0;
4774 	if (ctx->ipfw_trkcnt_spare != NULL) {
4775 		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4776 		ctx->ipfw_trkcnt_spare = NULL;
4777 	}
4778 
4779 	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4780 
4781 	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4782 	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4783 		ipfw_delete_rule(ctx, NULL, rule);
4784 
4785 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4786 }
4787 
4788 /*
4789  * Deletes all rules from a chain (including the default rule
4790  * if the second argument is set).
4791  */
4792 static void
4793 ipfw_flush(int kill_default)
4794 {
4795 	struct netmsg_base nmsg;
4796 #ifdef INVARIANTS
4797 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4798 	int state_cnt;
4799 #endif
4800 
4801 	ASSERT_NETISR0;
4802 
4803 	/*
4804 	 * If 'kill_default' then caller has done the necessary
4805 	 * msgport syncing; unnecessary to do it again.
4806 	 */
4807 	if (!kill_default) {
4808 		/*
4809 		 * Let ipfw_chk() know the rules are going to
4810 		 * be flushed, so it could jump directly to
4811 		 * the default rule.
4812 		 */
4813 		ipfw_flushing = 1;
4814 		/* XXX use priority sync */
4815 		netmsg_service_sync();
4816 	}
4817 
4818 	/*
4819 	 * Press the 'flush' button
4820 	 */
4821 	bzero(&nmsg, sizeof(nmsg));
4822 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4823 	    ipfw_flush_dispatch);
4824 	nmsg.lmsg.u.ms_result = kill_default;
4825 	netisr_domsg_global(&nmsg);
4826 	ipfw_gd.ipfw_state_loosecnt = 0;
4827 	ipfw_gd.ipfw_state_globexp = 0;
4828 	ipfw_gd.ipfw_track_globexp = 0;
4829 
4830 #ifdef INVARIANTS
4831 	state_cnt = ipfw_state_cntcoll();
4832 	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4833 
4834 	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4835 	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4836 
4837 	if (kill_default) {
4838 		KASSERT(static_count == 0,
4839 			("%u static rules remain", static_count));
4840 		KASSERT(static_ioc_len == 0,
4841 			("%u bytes of static rules remain", static_ioc_len));
4842 	} else {
4843 		KASSERT(static_count == 1,
4844 			("%u static rules remain", static_count));
4845 		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4846 			("%u bytes of static rules remain, should be %lu",
4847 			 static_ioc_len,
4848 			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4849 	}
4850 #endif
4851 
4852 	/* Flush is done */
4853 	ipfw_flushing = 0;
4854 }
4855 
4856 static void
4857 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4858 {
4859 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4860 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4861 	struct ip_fw *rule, *prev;
4862 
4863 	ASSERT_NETISR_NCPUS(mycpuid);
4864 
4865 	rule = dmsg->start_rule;
4866 	KKASSERT(rule->cpuid == mycpuid);
4867 	dmsg->start_rule = rule->sibling;
4868 
4869 	prev = dmsg->prev_rule;
4870 	if (prev != NULL) {
4871 		KKASSERT(prev->cpuid == mycpuid);
4872 
4873 		/*
4874 		 * Move to the position on the next CPU
4875 		 * before the msg is forwarded.
4876 		 */
4877 		dmsg->prev_rule = prev->sibling;
4878 	}
4879 
4880 	/*
4881 	 * flush pointers outside the loop, then delete all matching
4882 	 * rules.  'prev' remains the same throughout the cycle.
4883 	 */
4884 	ipfw_flush_rule_ptrs(ctx);
4885 	while (rule && rule->rulenum == dmsg->rulenum) {
4886 		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4887 			/* Flush states generated by this rule. */
4888 			ipfw_state_flush(ctx, rule);
4889 		}
4890 		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4891 			/* Flush tracks generated by this rule. */
4892 			ipfw_track_flush(ctx, rule);
4893 		}
4894 		rule = ipfw_delete_rule(ctx, prev, rule);
4895 	}
4896 
4897 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4898 }
4899 
4900 static int
4901 ipfw_alt_delete_rule(uint16_t rulenum)
4902 {
4903 	struct ip_fw *prev, *rule;
4904 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4905 	struct netmsg_del dmsg;
4906 
4907 	ASSERT_NETISR0;
4908 
4909 	/*
4910 	 * Locate first rule to delete
4911 	 */
4912 	for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4913 	     rule && rule->rulenum < rulenum;
4914 	     prev = rule, rule = rule->next)
4915 		; /* EMPTY */
4916 	if (rule->rulenum != rulenum)
4917 		return EINVAL;
4918 
4919 	/*
4920 	 * Get rid of the rule duplications on all CPUs
4921 	 */
4922 	bzero(&dmsg, sizeof(dmsg));
4923 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4924 	    ipfw_alt_delete_rule_dispatch);
4925 	dmsg.prev_rule = prev;
4926 	dmsg.start_rule = rule;
4927 	dmsg.rulenum = rulenum;
4928 
4929 	netisr_domsg_global(&dmsg.base);
4930 	KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4931 	return 0;
4932 }
4933 
4934 static void
4935 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4936 {
4937 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4938 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4939 	struct ip_fw *prev, *rule;
4940 #ifdef INVARIANTS
4941 	int del = 0;
4942 #endif
4943 
4944 	ASSERT_NETISR_NCPUS(mycpuid);
4945 
4946 	ipfw_flush_rule_ptrs(ctx);
4947 
4948 	prev = NULL;
4949 	rule = ctx->ipfw_layer3_chain;
4950 	while (rule != NULL) {
4951 		if (rule->set == dmsg->from_set) {
4952 			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4953 				/* Flush states generated by this rule. */
4954 				ipfw_state_flush(ctx, rule);
4955 			}
4956 			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4957 				/* Flush tracks generated by this rule. */
4958 				ipfw_track_flush(ctx, rule);
4959 			}
4960 			rule = ipfw_delete_rule(ctx, prev, rule);
4961 #ifdef INVARIANTS
4962 			del = 1;
4963 #endif
4964 		} else {
4965 			prev = rule;
4966 			rule = rule->next;
4967 		}
4968 	}
4969 	KASSERT(del, ("no match set?!"));
4970 
4971 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4972 }
4973 
4974 static int
4975 ipfw_alt_delete_ruleset(uint8_t set)
4976 {
4977 	struct netmsg_del dmsg;
4978 	int del;
4979 	struct ip_fw *rule;
4980 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4981 
4982 	ASSERT_NETISR0;
4983 
4984 	/*
4985 	 * Check whether the 'set' exists.  If it exists,
4986 	 * then check whether any rules within the set will
4987 	 * try to create states.
4988 	 */
4989 	del = 0;
4990 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
4991 		if (rule->set == set)
4992 			del = 1;
4993 	}
4994 	if (!del)
4995 		return 0; /* XXX EINVAL? */
4996 
4997 	/*
4998 	 * Delete this set
4999 	 */
5000 	bzero(&dmsg, sizeof(dmsg));
5001 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5002 	    ipfw_alt_delete_ruleset_dispatch);
5003 	dmsg.from_set = set;
5004 	netisr_domsg_global(&dmsg.base);
5005 
5006 	return 0;
5007 }
5008 
5009 static void
5010 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5011 {
5012 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5013 	struct ip_fw *rule;
5014 
5015 	ASSERT_NETISR_NCPUS(mycpuid);
5016 
5017 	rule = dmsg->start_rule;
5018 	KKASSERT(rule->cpuid == mycpuid);
5019 
5020 	/*
5021 	 * Move to the position on the next CPU
5022 	 * before the msg is forwarded.
5023 	 */
5024 	dmsg->start_rule = rule->sibling;
5025 
5026 	while (rule && rule->rulenum <= dmsg->rulenum) {
5027 		if (rule->rulenum == dmsg->rulenum)
5028 			rule->set = dmsg->to_set;
5029 		rule = rule->next;
5030 	}
5031 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5032 }
5033 
5034 static int
5035 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5036 {
5037 	struct netmsg_del dmsg;
5038 	struct netmsg_base *nmsg;
5039 	struct ip_fw *rule;
5040 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5041 
5042 	ASSERT_NETISR0;
5043 
5044 	/*
5045 	 * Locate first rule to move
5046 	 */
5047 	for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5048 	     rule = rule->next) {
5049 		if (rule->rulenum == rulenum && rule->set != set)
5050 			break;
5051 	}
5052 	if (rule == NULL || rule->rulenum > rulenum)
5053 		return 0; /* XXX error? */
5054 
5055 	bzero(&dmsg, sizeof(dmsg));
5056 	nmsg = &dmsg.base;
5057 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5058 	    ipfw_alt_move_rule_dispatch);
5059 	dmsg.start_rule = rule;
5060 	dmsg.rulenum = rulenum;
5061 	dmsg.to_set = set;
5062 
5063 	netisr_domsg_global(nmsg);
5064 	KKASSERT(dmsg.start_rule == NULL);
5065 	return 0;
5066 }
5067 
5068 static void
5069 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5070 {
5071 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5072 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5073 	struct ip_fw *rule;
5074 
5075 	ASSERT_NETISR_NCPUS(mycpuid);
5076 
5077 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5078 		if (rule->set == dmsg->from_set)
5079 			rule->set = dmsg->to_set;
5080 	}
5081 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5082 }
5083 
5084 static int
5085 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5086 {
5087 	struct netmsg_del dmsg;
5088 	struct netmsg_base *nmsg;
5089 
5090 	ASSERT_NETISR0;
5091 
5092 	bzero(&dmsg, sizeof(dmsg));
5093 	nmsg = &dmsg.base;
5094 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5095 	    ipfw_alt_move_ruleset_dispatch);
5096 	dmsg.from_set = from_set;
5097 	dmsg.to_set = to_set;
5098 
5099 	netisr_domsg_global(nmsg);
5100 	return 0;
5101 }
5102 
5103 static void
5104 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5105 {
5106 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5107 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5108 	struct ip_fw *rule;
5109 
5110 	ASSERT_NETISR_NCPUS(mycpuid);
5111 
5112 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5113 		if (rule->set == dmsg->from_set)
5114 			rule->set = dmsg->to_set;
5115 		else if (rule->set == dmsg->to_set)
5116 			rule->set = dmsg->from_set;
5117 	}
5118 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5119 }
5120 
5121 static int
5122 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5123 {
5124 	struct netmsg_del dmsg;
5125 	struct netmsg_base *nmsg;
5126 
5127 	ASSERT_NETISR0;
5128 
5129 	bzero(&dmsg, sizeof(dmsg));
5130 	nmsg = &dmsg.base;
5131 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5132 	    ipfw_alt_swap_ruleset_dispatch);
5133 	dmsg.from_set = set1;
5134 	dmsg.to_set = set2;
5135 
5136 	netisr_domsg_global(nmsg);
5137 	return 0;
5138 }
5139 
5140 /*
5141  * Remove all rules with given number, and also do set manipulation.
5142  *
5143  * The argument is an uint32_t. The low 16 bit are the rule or set number,
5144  * the next 8 bits are the new set, the top 8 bits are the command:
5145  *
5146  *	0	delete rules with given number
5147  *	1	delete rules with given set number
5148  *	2	move rules with given number to new set
5149  *	3	move rules with given set number to new set
5150  *	4	swap sets with given numbers
5151  */
5152 static int
5153 ipfw_ctl_alter(uint32_t arg)
5154 {
5155 	uint16_t rulenum;
5156 	uint8_t cmd, new_set;
5157 	int error = 0;
5158 
5159 	ASSERT_NETISR0;
5160 
5161 	rulenum = arg & 0xffff;
5162 	cmd = (arg >> 24) & 0xff;
5163 	new_set = (arg >> 16) & 0xff;
5164 
5165 	if (cmd > 4)
5166 		return EINVAL;
5167 	if (new_set >= IPFW_DEFAULT_SET)
5168 		return EINVAL;
5169 	if (cmd == 0 || cmd == 2) {
5170 		if (rulenum == IPFW_DEFAULT_RULE)
5171 			return EINVAL;
5172 	} else {
5173 		if (rulenum >= IPFW_DEFAULT_SET)
5174 			return EINVAL;
5175 	}
5176 
5177 	switch (cmd) {
5178 	case 0:	/* delete rules with given number */
5179 		error = ipfw_alt_delete_rule(rulenum);
5180 		break;
5181 
5182 	case 1:	/* delete all rules with given set number */
5183 		error = ipfw_alt_delete_ruleset(rulenum);
5184 		break;
5185 
5186 	case 2:	/* move rules with given number to new set */
5187 		error = ipfw_alt_move_rule(rulenum, new_set);
5188 		break;
5189 
5190 	case 3: /* move rules with given set number to new set */
5191 		error = ipfw_alt_move_ruleset(rulenum, new_set);
5192 		break;
5193 
5194 	case 4: /* swap two sets */
5195 		error = ipfw_alt_swap_ruleset(rulenum, new_set);
5196 		break;
5197 	}
5198 	return error;
5199 }
5200 
5201 /*
5202  * Clear counters for a specific rule.
5203  */
5204 static void
5205 clear_counters(struct ip_fw *rule, int log_only)
5206 {
5207 	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5208 
5209 	if (log_only == 0) {
5210 		rule->bcnt = rule->pcnt = 0;
5211 		rule->timestamp = 0;
5212 	}
5213 	if (l->o.opcode == O_LOG)
5214 		l->log_left = l->max_log;
5215 }
5216 
5217 static void
5218 ipfw_zero_entry_dispatch(netmsg_t nmsg)
5219 {
5220 	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
5221 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5222 	struct ip_fw *rule;
5223 
5224 	ASSERT_NETISR_NCPUS(mycpuid);
5225 
5226 	if (zmsg->rulenum == 0) {
5227 		KKASSERT(zmsg->start_rule == NULL);
5228 
5229 		ctx->ipfw_norule_counter = 0;
5230 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5231 			clear_counters(rule, zmsg->log_only);
5232 	} else {
5233 		struct ip_fw *start = zmsg->start_rule;
5234 
5235 		KKASSERT(start->cpuid == mycpuid);
5236 		KKASSERT(start->rulenum == zmsg->rulenum);
5237 
5238 		/*
5239 		 * We can have multiple rules with the same number, so we
5240 		 * need to clear them all.
5241 		 */
5242 		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
5243 		     rule = rule->next)
5244 			clear_counters(rule, zmsg->log_only);
5245 
5246 		/*
5247 		 * Move to the position on the next CPU
5248 		 * before the msg is forwarded.
5249 		 */
5250 		zmsg->start_rule = start->sibling;
5251 	}
5252 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5253 }
5254 
5255 /*
5256  * Reset some or all counters on firewall rules.
5257  * @arg frwl is null to clear all entries, or contains a specific
5258  * rule number.
5259  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5260  */
5261 static int
5262 ipfw_ctl_zero_entry(int rulenum, int log_only)
5263 {
5264 	struct netmsg_zent zmsg;
5265 	struct netmsg_base *nmsg;
5266 	const char *msg;
5267 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5268 
5269 	ASSERT_NETISR0;
5270 
5271 	bzero(&zmsg, sizeof(zmsg));
5272 	nmsg = &zmsg.base;
5273 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5274 	    ipfw_zero_entry_dispatch);
5275 	zmsg.log_only = log_only;
5276 
5277 	if (rulenum == 0) {
5278 		msg = log_only ? "ipfw: All logging counts reset.\n"
5279 			       : "ipfw: Accounting cleared.\n";
5280 	} else {
5281 		struct ip_fw *rule;
5282 
5283 		/*
5284 		 * Locate the first rule with 'rulenum'
5285 		 */
5286 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5287 			if (rule->rulenum == rulenum)
5288 				break;
5289 		}
5290 		if (rule == NULL) /* we did not find any matching rules */
5291 			return (EINVAL);
5292 		zmsg.start_rule = rule;
5293 		zmsg.rulenum = rulenum;
5294 
5295 		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
5296 			       : "ipfw: Entry %d cleared.\n";
5297 	}
5298 	netisr_domsg_global(nmsg);
5299 	KKASSERT(zmsg.start_rule == NULL);
5300 
5301 	if (fw_verbose)
5302 		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
5303 	return (0);
5304 }
5305 
5306 /*
5307  * Check validity of the structure before insert.
5308  * Fortunately rules are simple, so this mostly need to check rule sizes.
5309  */
5310 static int
5311 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5312 {
5313 	int l, cmdlen = 0;
5314 	int have_action = 0;
5315 	ipfw_insn *cmd;
5316 
5317 	*rule_flags = 0;
5318 
5319 	/* Check for valid size */
5320 	if (size < sizeof(*rule)) {
5321 		kprintf("ipfw: rule too short\n");
5322 		return EINVAL;
5323 	}
5324 	l = IOC_RULESIZE(rule);
5325 	if (l != size) {
5326 		kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5327 		return EINVAL;
5328 	}
5329 
5330 	/* Check rule number */
5331 	if (rule->rulenum == IPFW_DEFAULT_RULE) {
5332 		kprintf("ipfw: invalid rule number\n");
5333 		return EINVAL;
5334 	}
5335 
5336 	/*
5337 	 * Now go for the individual checks. Very simple ones, basically only
5338 	 * instruction sizes.
5339 	 */
5340 	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5341 	     l -= cmdlen, cmd += cmdlen) {
5342 		cmdlen = F_LEN(cmd);
5343 		if (cmdlen > l) {
5344 			kprintf("ipfw: opcode %d size truncated\n",
5345 				cmd->opcode);
5346 			return EINVAL;
5347 		}
5348 
5349 		DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5350 
5351 		if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5352 		    IPFW_ISXLAT(cmd->opcode)) {
5353 			/* This rule will generate states. */
5354 			*rule_flags |= IPFW_RULE_F_GENSTATE;
5355 			if (cmd->opcode == O_LIMIT)
5356 				*rule_flags |= IPFW_RULE_F_GENTRACK;
5357 		}
5358 		if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5359 			*rule_flags |= IPFW_RULE_F_CROSSREF;
5360 		if (cmd->opcode == O_IP_SRC_IFIP ||
5361 		    cmd->opcode == O_IP_DST_IFIP) {
5362 			*rule_flags |= IPFW_RULE_F_DYNIFADDR;
5363 			cmd->arg1 &= IPFW_IFIP_SETTINGS;
5364 		}
5365 
5366 		switch (cmd->opcode) {
5367 		case O_NOP:
5368 		case O_PROBE_STATE:
5369 		case O_KEEP_STATE:
5370 		case O_PROTO:
5371 		case O_IP_SRC_ME:
5372 		case O_IP_DST_ME:
5373 		case O_LAYER2:
5374 		case O_IN:
5375 		case O_FRAG:
5376 		case O_IPFRAG:
5377 		case O_IPOPT:
5378 		case O_IPLEN:
5379 		case O_IPID:
5380 		case O_IPTOS:
5381 		case O_IPPRECEDENCE:
5382 		case O_IPTTL:
5383 		case O_IPVER:
5384 		case O_TCPWIN:
5385 		case O_TCPFLAGS:
5386 		case O_TCPOPTS:
5387 		case O_ESTAB:
5388 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5389 				goto bad_size;
5390 			break;
5391 
5392 		case O_IP_SRC_TABLE:
5393 		case O_IP_DST_TABLE:
5394 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5395 				goto bad_size;
5396 			if (cmd->arg1 >= ipfw_table_max) {
5397 				kprintf("ipfw: invalid table id %u, max %d\n",
5398 				    cmd->arg1, ipfw_table_max);
5399 				return EINVAL;
5400 			}
5401 			break;
5402 
5403 		case O_IP_SRC_IFIP:
5404 		case O_IP_DST_IFIP:
5405 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5406 				goto bad_size;
5407 			break;
5408 
5409 		case O_ICMPCODE:
5410 		case O_ICMPTYPE:
5411 			if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5412 				goto bad_size;
5413 			break;
5414 
5415 		case O_UID:
5416 		case O_GID:
5417 		case O_IP_SRC:
5418 		case O_IP_DST:
5419 		case O_TCPSEQ:
5420 		case O_TCPACK:
5421 		case O_PROB:
5422 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5423 				goto bad_size;
5424 			break;
5425 
5426 		case O_LIMIT:
5427 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5428 				goto bad_size;
5429 			break;
5430 		case O_REDIRECT:
5431 			if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5432 				goto bad_size;
5433 			break;
5434 
5435 		case O_LOG:
5436 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5437 				goto bad_size;
5438 
5439 			((ipfw_insn_log *)cmd)->log_left =
5440 			    ((ipfw_insn_log *)cmd)->max_log;
5441 
5442 			break;
5443 
5444 		case O_IP_SRC_MASK:
5445 		case O_IP_DST_MASK:
5446 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5447 				goto bad_size;
5448 			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5449 				kprintf("ipfw: opcode %d, useless rule\n",
5450 					cmd->opcode);
5451 				return EINVAL;
5452 			}
5453 			break;
5454 
5455 		case O_IP_SRC_SET:
5456 		case O_IP_DST_SET:
5457 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5458 				kprintf("ipfw: invalid set size %d\n",
5459 					cmd->arg1);
5460 				return EINVAL;
5461 			}
5462 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5463 			    (cmd->arg1+31)/32 )
5464 				goto bad_size;
5465 			break;
5466 
5467 		case O_MACADDR2:
5468 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5469 				goto bad_size;
5470 			break;
5471 
5472 		case O_MAC_TYPE:
5473 		case O_IP_SRCPORT:
5474 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5475 			if (cmdlen < 2 || cmdlen > 31)
5476 				goto bad_size;
5477 			break;
5478 
5479 		case O_RECV:
5480 		case O_XMIT:
5481 		case O_VIA:
5482 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5483 				goto bad_size;
5484 			break;
5485 
5486 		case O_PIPE:
5487 		case O_QUEUE:
5488 			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5489 				goto bad_size;
5490 			goto check_action;
5491 
5492 		case O_FORWARD_IP:
5493 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5494 				goto bad_size;
5495 			} else {
5496 				in_addr_t fwd_addr;
5497 
5498 				fwd_addr = ((ipfw_insn_sa *)cmd)->
5499 					   sa.sin_addr.s_addr;
5500 				if (IN_MULTICAST(ntohl(fwd_addr))) {
5501 					kprintf("ipfw: try forwarding to "
5502 						"multicast address\n");
5503 					return EINVAL;
5504 				}
5505 			}
5506 			goto check_action;
5507 
5508 		case O_FORWARD_MAC: /* XXX not implemented yet */
5509 		case O_CHECK_STATE:
5510 		case O_COUNT:
5511 		case O_ACCEPT:
5512 		case O_DENY:
5513 		case O_REJECT:
5514 		case O_SKIPTO:
5515 		case O_DIVERT:
5516 		case O_TEE:
5517 		case O_DEFRAG:
5518 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5519 				goto bad_size;
5520 check_action:
5521 			if (have_action) {
5522 				kprintf("ipfw: opcode %d, multiple actions"
5523 					" not allowed\n",
5524 					cmd->opcode);
5525 				return EINVAL;
5526 			}
5527 			have_action = 1;
5528 			if (l != cmdlen) {
5529 				kprintf("ipfw: opcode %d, action must be"
5530 					" last opcode\n",
5531 					cmd->opcode);
5532 				return EINVAL;
5533 			}
5534 			break;
5535 		default:
5536 			kprintf("ipfw: opcode %d, unknown opcode\n",
5537 				cmd->opcode);
5538 			return EINVAL;
5539 		}
5540 	}
5541 	if (have_action == 0) {
5542 		kprintf("ipfw: missing action\n");
5543 		return EINVAL;
5544 	}
5545 	return 0;
5546 
5547 bad_size:
5548 	kprintf("ipfw: opcode %d size %d wrong\n",
5549 		cmd->opcode, cmdlen);
5550 	return EINVAL;
5551 }
5552 
5553 static int
5554 ipfw_ctl_add_rule(struct sockopt *sopt)
5555 {
5556 	struct ipfw_ioc_rule *ioc_rule;
5557 	size_t size;
5558 	uint32_t rule_flags;
5559 	int error;
5560 
5561 	ASSERT_NETISR0;
5562 
5563 	size = sopt->sopt_valsize;
5564 	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5565 	    size < sizeof(*ioc_rule)) {
5566 		return EINVAL;
5567 	}
5568 	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5569 		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5570 					  IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5571 	}
5572 	ioc_rule = sopt->sopt_val;
5573 
5574 	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5575 	if (error)
5576 		return error;
5577 
5578 	ipfw_add_rule(ioc_rule, rule_flags);
5579 
5580 	if (sopt->sopt_dir == SOPT_GET)
5581 		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5582 	return 0;
5583 }
5584 
5585 static void *
5586 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5587     struct ipfw_ioc_rule *ioc_rule)
5588 {
5589 	const struct ip_fw *sibling;
5590 #ifdef INVARIANTS
5591 	int i;
5592 #endif
5593 
5594 	ASSERT_NETISR0;
5595 	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5596 
5597 	ioc_rule->act_ofs = rule->act_ofs;
5598 	ioc_rule->cmd_len = rule->cmd_len;
5599 	ioc_rule->rulenum = rule->rulenum;
5600 	ioc_rule->set = rule->set;
5601 	ioc_rule->usr_flags = rule->usr_flags;
5602 
5603 	ioc_rule->set_disable = ctx->ipfw_set_disable;
5604 	ioc_rule->static_count = static_count;
5605 	ioc_rule->static_len = static_ioc_len;
5606 
5607 	/*
5608 	 * Visit (read-only) all of the rule's duplications to get
5609 	 * the necessary statistics
5610 	 */
5611 #ifdef INVARIANTS
5612 	i = 0;
5613 #endif
5614 	ioc_rule->pcnt = 0;
5615 	ioc_rule->bcnt = 0;
5616 	ioc_rule->timestamp = 0;
5617 	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5618 		ioc_rule->pcnt += sibling->pcnt;
5619 		ioc_rule->bcnt += sibling->bcnt;
5620 		if (sibling->timestamp > ioc_rule->timestamp)
5621 			ioc_rule->timestamp = sibling->timestamp;
5622 #ifdef INVARIANTS
5623 		++i;
5624 #endif
5625 	}
5626 	KASSERT(i == netisr_ncpus,
5627 	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5628 
5629 	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
5630 
5631 	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5632 }
5633 
5634 static boolean_t
5635 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5636 {
5637 	struct ipfw_ioc_flowid *ioc_id;
5638 
5639 	if (trk->tc_expire == 0) {
5640 		/* Not a scanned one. */
5641 		return (FALSE);
5642 	}
5643 
5644 	ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5645 	    0 : trk->tc_expire - time_uptime;
5646 	ioc_state->pcnt = 0;
5647 	ioc_state->bcnt = 0;
5648 
5649 	ioc_state->dyn_type = O_LIMIT_PARENT;
5650 	ioc_state->count = trk->tc_count;
5651 
5652 	ioc_state->rulenum = trk->tc_rulenum;
5653 
5654 	ioc_id = &ioc_state->id;
5655 	ioc_id->type = ETHERTYPE_IP;
5656 	ioc_id->u.ip.proto = trk->tc_proto;
5657 	ioc_id->u.ip.src_ip = trk->tc_saddr;
5658 	ioc_id->u.ip.dst_ip = trk->tc_daddr;
5659 	ioc_id->u.ip.src_port = trk->tc_sport;
5660 	ioc_id->u.ip.dst_port = trk->tc_dport;
5661 
5662 	return (TRUE);
5663 }
5664 
5665 static boolean_t
5666 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5667 {
5668 	struct ipfw_ioc_flowid *ioc_id;
5669 
5670 	if (IPFW_STATE_SCANSKIP(s))
5671 		return (FALSE);
5672 
5673 	ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5674 	    0 : s->st_expire - time_uptime;
5675 	ioc_state->pcnt = s->st_pcnt;
5676 	ioc_state->bcnt = s->st_bcnt;
5677 
5678 	ioc_state->dyn_type = s->st_type;
5679 	ioc_state->count = 0;
5680 
5681 	ioc_state->rulenum = s->st_rule->rulenum;
5682 
5683 	ioc_id = &ioc_state->id;
5684 	ioc_id->type = ETHERTYPE_IP;
5685 	ioc_id->u.ip.proto = s->st_proto;
5686 	ipfw_key_4tuple(&s->st_key,
5687 	    &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5688 	    &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5689 
5690 	if (IPFW_ISXLAT(s->st_type)) {
5691 		const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5692 
5693 		if (x->xlat_port == 0)
5694 			ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5695 		else
5696 			ioc_state->xlat_port = ntohs(x->xlat_port);
5697 		ioc_state->xlat_addr = ntohl(x->xlat_addr);
5698 
5699 		ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5700 		ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5701 	}
5702 
5703 	return (TRUE);
5704 }
5705 
5706 static void
5707 ipfw_state_copy_dispatch(netmsg_t nmsg)
5708 {
5709 	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
5710 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5711 	const struct ipfw_state *s;
5712 	const struct ipfw_track *t;
5713 
5714 	ASSERT_NETISR_NCPUS(mycpuid);
5715 	KASSERT(nm->state_cnt < nm->state_cntmax,
5716 	    ("invalid state count %d, max %d",
5717 	     nm->state_cnt, nm->state_cntmax));
5718 
5719 	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
5720 		if (ipfw_state_copy(s, nm->ioc_state)) {
5721 			nm->ioc_state++;
5722 			nm->state_cnt++;
5723 			if (nm->state_cnt == nm->state_cntmax)
5724 				goto done;
5725 		}
5726 	}
5727 
5728 	/*
5729 	 * Prepare tracks in the global track tree for userland.
5730 	 */
5731 	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
5732 		struct ipfw_trkcnt *trk;
5733 
5734 		if (t->t_count == NULL) /* anchor */
5735 			continue;
5736 		trk = t->t_trkcnt;
5737 
5738 		/*
5739 		 * Only one netisr can run this function at
5740 		 * any time, and only this function accesses
5741 		 * trkcnt's tc_expire, so this is safe w/o
5742 		 * ipfw_gd.ipfw_trkcnt_token.
5743 		 */
5744 		if (trk->tc_expire > t->t_expire)
5745 			continue;
5746 		trk->tc_expire = t->t_expire;
5747 	}
5748 
5749 	/*
5750 	 * Copy tracks in the global track tree to userland in
5751 	 * the last netisr.
5752 	 */
5753 	if (mycpuid == netisr_ncpus - 1) {
5754 		struct ipfw_trkcnt *trk;
5755 
5756 		KASSERT(nm->state_cnt < nm->state_cntmax,
5757 		    ("invalid state count %d, max %d",
5758 		     nm->state_cnt, nm->state_cntmax));
5759 
5760 		IPFW_TRKCNT_TOKGET;
5761 		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5762 			if (ipfw_track_copy(trk, nm->ioc_state)) {
5763 				nm->ioc_state++;
5764 				nm->state_cnt++;
5765 				if (nm->state_cnt == nm->state_cntmax) {
5766 					IPFW_TRKCNT_TOKREL;
5767 					goto done;
5768 				}
5769 			}
5770 		}
5771 		IPFW_TRKCNT_TOKREL;
5772 	}
5773 done:
5774 	if (nm->state_cnt == nm->state_cntmax) {
5775 		/* No more space; done. */
5776 		netisr_replymsg(&nm->base, 0);
5777 	} else {
5778 		netisr_forwardmsg(&nm->base, mycpuid + 1);
5779 	}
5780 }
5781 
5782 static int
5783 ipfw_ctl_get_rules(struct sockopt *sopt)
5784 {
5785 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5786 	struct ip_fw *rule;
5787 	void *bp;
5788 	size_t size;
5789 	int state_cnt;
5790 
5791 	ASSERT_NETISR0;
5792 
5793 	/*
5794 	 * pass up a copy of the current rules. Static rules
5795 	 * come first (the last of which has number IPFW_DEFAULT_RULE),
5796 	 * followed by a possibly empty list of states.
5797 	 */
5798 
5799 	size = static_ioc_len;	/* size of static rules */
5800 
5801 	/*
5802 	 * Size of the states.
5803 	 * XXX take tracks as state for userland compat.
5804 	 */
5805 	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5806 	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5807 	size += state_cnt * sizeof(struct ipfw_ioc_state);
5808 
5809 	if (sopt->sopt_valsize < size) {
5810 		/* short length, no need to return incomplete rules */
5811 		/* XXX: if superuser, no need to zero buffer */
5812 		bzero(sopt->sopt_val, sopt->sopt_valsize);
5813 		return 0;
5814 	}
5815 	bp = sopt->sopt_val;
5816 
5817 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5818 		bp = ipfw_copy_rule(ctx, rule, bp);
5819 
5820 	if (state_cnt) {
5821 		struct netmsg_cpstate nm;
5822 #ifdef INVARIANTS
5823 		size_t old_size = size;
5824 #endif
5825 
5826 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5827 		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
5828 		nm.ioc_state = bp;
5829 		nm.state_cntmax = state_cnt;
5830 		nm.state_cnt = 0;
5831 		netisr_domsg_global(&nm.base);
5832 
5833 		/*
5834 		 * The # of states may be shrinked after the snapshot
5835 		 * of the state count was taken.  To give user a correct
5836 		 * state count, nm->state_cnt is used to recalculate
5837 		 * the actual size.
5838 		 */
5839 		size = static_ioc_len +
5840 		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5841 		KKASSERT(size <= old_size);
5842 	}
5843 
5844 	sopt->sopt_valsize = size;
5845 	return 0;
5846 }
5847 
5848 static void
5849 ipfw_set_disable_dispatch(netmsg_t nmsg)
5850 {
5851 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5852 
5853 	ASSERT_NETISR_NCPUS(mycpuid);
5854 
5855 	ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5856 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5857 }
5858 
5859 static void
5860 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5861 {
5862 	struct netmsg_base nmsg;
5863 	uint32_t set_disable;
5864 
5865 	ASSERT_NETISR0;
5866 
5867 	/* IPFW_DEFAULT_SET is always enabled */
5868 	enable |= (1 << IPFW_DEFAULT_SET);
5869 	set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5870 
5871 	bzero(&nmsg, sizeof(nmsg));
5872 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5873 	    ipfw_set_disable_dispatch);
5874 	nmsg.lmsg.u.ms_result32 = set_disable;
5875 
5876 	netisr_domsg_global(&nmsg);
5877 }
5878 
5879 static void
5880 ipfw_table_create_dispatch(netmsg_t nm)
5881 {
5882 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5883 	int tblid = nm->lmsg.u.ms_result;
5884 
5885 	ASSERT_NETISR_NCPUS(mycpuid);
5886 
5887 	if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5888 	    rn_cpumaskhead(mycpuid), 32))
5889 		panic("ipfw: create table%d failed", tblid);
5890 
5891 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5892 }
5893 
5894 static int
5895 ipfw_table_create(struct sockopt *sopt)
5896 {
5897 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5898 	struct ipfw_ioc_table *tbl;
5899 	struct netmsg_base nm;
5900 
5901 	ASSERT_NETISR0;
5902 
5903 	if (sopt->sopt_valsize != sizeof(*tbl))
5904 		return (EINVAL);
5905 
5906 	tbl = sopt->sopt_val;
5907 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5908 		return (EINVAL);
5909 
5910 	if (ctx->ipfw_tables[tbl->tableid] != NULL)
5911 		return (EEXIST);
5912 
5913 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5914 	    ipfw_table_create_dispatch);
5915 	nm.lmsg.u.ms_result = tbl->tableid;
5916 	netisr_domsg_global(&nm);
5917 
5918 	return (0);
5919 }
5920 
5921 static void
5922 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5923 {
5924 	struct radix_node *ret;
5925 
5926 	ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5927 	if (ret != rn)
5928 		panic("deleted other table entry");
5929 	kfree(ret, M_IPFW);
5930 }
5931 
5932 static int
5933 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5934 {
5935 
5936 	ipfw_table_killrn(xrnh, rn);
5937 	return (0);
5938 }
5939 
5940 static void
5941 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5942     int destroy)
5943 {
5944 	struct radix_node_head *rnh;
5945 
5946 	ASSERT_NETISR_NCPUS(mycpuid);
5947 
5948 	rnh = ctx->ipfw_tables[tableid];
5949 	rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5950 	if (destroy) {
5951 		Free(rnh);
5952 		ctx->ipfw_tables[tableid] = NULL;
5953 	}
5954 }
5955 
5956 static void
5957 ipfw_table_flush_dispatch(netmsg_t nmsg)
5958 {
5959 	struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5960 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5961 
5962 	ASSERT_NETISR_NCPUS(mycpuid);
5963 
5964 	ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5965 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5966 }
5967 
5968 static void
5969 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5970 {
5971 	int i;
5972 
5973 	ASSERT_NETISR_NCPUS(mycpuid);
5974 
5975 	for (i = 0; i < ipfw_table_max; ++i) {
5976 		if (ctx->ipfw_tables[i] != NULL)
5977 			ipfw_table_flush_oncpu(ctx, i, destroy);
5978 	}
5979 }
5980 
5981 static void
5982 ipfw_table_flushall_dispatch(netmsg_t nmsg)
5983 {
5984 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5985 
5986 	ASSERT_NETISR_NCPUS(mycpuid);
5987 
5988 	ipfw_table_flushall_oncpu(ctx, 0);
5989 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5990 }
5991 
5992 static int
5993 ipfw_table_flush(struct sockopt *sopt)
5994 {
5995 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5996 	struct ipfw_ioc_table *tbl;
5997 	struct netmsg_tblflush nm;
5998 
5999 	ASSERT_NETISR0;
6000 
6001 	if (sopt->sopt_valsize != sizeof(*tbl))
6002 		return (EINVAL);
6003 
6004 	tbl = sopt->sopt_val;
6005 	if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6006 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6007 		    MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6008 		netisr_domsg_global(&nm.base);
6009 		return (0);
6010 	}
6011 
6012 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6013 		return (EINVAL);
6014 
6015 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6016 		return (ENOENT);
6017 
6018 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6019 	    ipfw_table_flush_dispatch);
6020 	nm.tableid = tbl->tableid;
6021 	nm.destroy = 0;
6022 	if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6023 		nm.destroy = 1;
6024 	netisr_domsg_global(&nm.base);
6025 
6026 	return (0);
6027 }
6028 
6029 static int
6030 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6031 {
6032 	int *cnt = xcnt;
6033 
6034 	(*cnt)++;
6035 	return (0);
6036 }
6037 
6038 static int
6039 ipfw_table_cpent(struct radix_node *rn, void *xcp)
6040 {
6041 	struct ipfw_table_cp *cp = xcp;
6042 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6043 	struct ipfw_ioc_tblent *ioc_te;
6044 #ifdef INVARIANTS
6045 	int cnt;
6046 #endif
6047 
6048 	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
6049 	    cp->te_idx, cp->te_cnt));
6050 	ioc_te = &cp->te[cp->te_idx];
6051 
6052 	if (te->te_nodes->rn_mask != NULL) {
6053 		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
6054 		    *te->te_nodes->rn_mask);
6055 	} else {
6056 		ioc_te->netmask.sin_len = 0;
6057 	}
6058 	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
6059 
6060 	ioc_te->use = te->te_use;
6061 	ioc_te->last_used = te->te_lastuse;
6062 #ifdef INVARIANTS
6063 	cnt = 1;
6064 #endif
6065 
6066 	while ((te = te->te_sibling) != NULL) {
6067 #ifdef INVARIANTS
6068 		++cnt;
6069 #endif
6070 		ioc_te->use += te->te_use;
6071 		if (te->te_lastuse > ioc_te->last_used)
6072 			ioc_te->last_used = te->te_lastuse;
6073 	}
6074 	KASSERT(cnt == netisr_ncpus,
6075 	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
6076 
6077 	cp->te_idx++;
6078 
6079 	return (0);
6080 }
6081 
6082 static int
6083 ipfw_table_get(struct sockopt *sopt)
6084 {
6085 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6086 	struct radix_node_head *rnh;
6087 	struct ipfw_ioc_table *tbl;
6088 	struct ipfw_ioc_tblcont *cont;
6089 	struct ipfw_table_cp cp;
6090 	int cnt = 0, sz;
6091 
6092 	ASSERT_NETISR0;
6093 
6094 	if (sopt->sopt_valsize < sizeof(*tbl))
6095 		return (EINVAL);
6096 
6097 	tbl = sopt->sopt_val;
6098 	if (tbl->tableid < 0) {
6099 		struct ipfw_ioc_tbllist *list;
6100 		int i;
6101 
6102 		/*
6103 		 * List available table ids.
6104 		 */
6105 		for (i = 0; i < ipfw_table_max; ++i) {
6106 			if (ctx->ipfw_tables[i] != NULL)
6107 				++cnt;
6108 		}
6109 
6110 		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
6111 		if (sopt->sopt_valsize < sz) {
6112 			bzero(sopt->sopt_val, sopt->sopt_valsize);
6113 			return (E2BIG);
6114 		}
6115 		list = sopt->sopt_val;
6116 		list->tablecnt = cnt;
6117 
6118 		cnt = 0;
6119 		for (i = 0; i < ipfw_table_max; ++i) {
6120 			if (ctx->ipfw_tables[i] != NULL) {
6121 				KASSERT(cnt < list->tablecnt,
6122 				    ("invalid idx %d, cnt %d",
6123 				     cnt, list->tablecnt));
6124 				list->tables[cnt++] = i;
6125 			}
6126 		}
6127 		sopt->sopt_valsize = sz;
6128 		return (0);
6129 	} else if (tbl->tableid >= ipfw_table_max) {
6130 		return (EINVAL);
6131 	}
6132 
6133 	rnh = ctx->ipfw_tables[tbl->tableid];
6134 	if (rnh == NULL)
6135 		return (ENOENT);
6136 	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
6137 
6138 	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
6139 	if (sopt->sopt_valsize < sz) {
6140 		bzero(sopt->sopt_val, sopt->sopt_valsize);
6141 		return (E2BIG);
6142 	}
6143 	cont = sopt->sopt_val;
6144 	cont->entcnt = cnt;
6145 
6146 	cp.te = cont->ent;
6147 	cp.te_idx = 0;
6148 	cp.te_cnt = cnt;
6149 	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
6150 
6151 	sopt->sopt_valsize = sz;
6152 	return (0);
6153 }
6154 
6155 static void
6156 ipfw_table_add_dispatch(netmsg_t nmsg)
6157 {
6158 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6159 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6160 	struct radix_node_head *rnh;
6161 	struct ipfw_tblent *te;
6162 
6163 	ASSERT_NETISR_NCPUS(mycpuid);
6164 
6165 	rnh = ctx->ipfw_tables[nm->tableid];
6166 
6167 	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6168 	te->te_nodes->rn_key = (char *)&te->te_key;
6169 	memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6170 
6171 	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
6172 	    te->te_nodes) == NULL) {
6173 		if (mycpuid == 0) {
6174 			kfree(te, M_IPFW);
6175 			netisr_replymsg(&nm->base, EEXIST);
6176 			return;
6177 		}
6178 		panic("rnh_addaddr failed");
6179 	}
6180 
6181 	/* Link siblings. */
6182 	if (nm->sibling != NULL)
6183 		nm->sibling->te_sibling = te;
6184 	nm->sibling = te;
6185 
6186 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6187 }
6188 
6189 static void
6190 ipfw_table_del_dispatch(netmsg_t nmsg)
6191 {
6192 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6193 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6194 	struct radix_node_head *rnh;
6195 	struct radix_node *rn;
6196 
6197 	ASSERT_NETISR_NCPUS(mycpuid);
6198 
6199 	rnh = ctx->ipfw_tables[nm->tableid];
6200 	rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6201 	if (rn == NULL) {
6202 		if (mycpuid == 0) {
6203 			netisr_replymsg(&nm->base, ESRCH);
6204 			return;
6205 		}
6206 		panic("rnh_deladdr failed");
6207 	}
6208 	kfree(rn, M_IPFW);
6209 
6210 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6211 }
6212 
6213 static int
6214 ipfw_table_alt(struct sockopt *sopt)
6215 {
6216 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6217 	struct ipfw_ioc_tblcont *tbl;
6218 	struct ipfw_ioc_tblent *te;
6219 	struct sockaddr_in key0;
6220 	struct sockaddr *netmask = NULL, *key;
6221 	struct netmsg_tblent nm;
6222 
6223 	ASSERT_NETISR0;
6224 
6225 	if (sopt->sopt_valsize != sizeof(*tbl))
6226 		return (EINVAL);
6227 	tbl = sopt->sopt_val;
6228 
6229 	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
6230 		return (EINVAL);
6231 	if (tbl->entcnt != 1)
6232 		return (EINVAL);
6233 
6234 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6235 		return (ENOENT);
6236 	te = &tbl->ent[0];
6237 
6238 	if (te->key.sin_family != AF_INET ||
6239 	    te->key.sin_port != 0 ||
6240 	    te->key.sin_len != sizeof(struct sockaddr_in))
6241 		return (EINVAL);
6242 	key = (struct sockaddr *)&te->key;
6243 
6244 	if (te->netmask.sin_len != 0) {
6245 		if (te->netmask.sin_port != 0 ||
6246 		    te->netmask.sin_len > sizeof(struct sockaddr_in))
6247 			return (EINVAL);
6248 		netmask = (struct sockaddr *)&te->netmask;
6249 		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
6250 		key = (struct sockaddr *)&key0;
6251 	}
6252 
6253 	if (sopt->sopt_name == IP_FW_TBL_ADD) {
6254 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6255 		    MSGF_PRIORITY, ipfw_table_add_dispatch);
6256 	} else {
6257 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6258 		    MSGF_PRIORITY, ipfw_table_del_dispatch);
6259 	}
6260 	nm.key = key;
6261 	nm.netmask = netmask;
6262 	nm.tableid = tbl->tableid;
6263 	nm.sibling = NULL;
6264 	return (netisr_domsg_global(&nm.base));
6265 }
6266 
6267 static int
6268 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6269 {
6270 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6271 
6272 	te->te_use = 0;
6273 	te->te_lastuse = 0;
6274 	return (0);
6275 }
6276 
6277 static void
6278 ipfw_table_zero_dispatch(netmsg_t nmsg)
6279 {
6280 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6281 	struct radix_node_head *rnh;
6282 
6283 	ASSERT_NETISR_NCPUS(mycpuid);
6284 
6285 	rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6286 	rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6287 
6288 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6289 }
6290 
6291 static void
6292 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6293 {
6294 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6295 	int i;
6296 
6297 	ASSERT_NETISR_NCPUS(mycpuid);
6298 
6299 	for (i = 0; i < ipfw_table_max; ++i) {
6300 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6301 
6302 		if (rnh != NULL)
6303 			rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6304 	}
6305 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6306 }
6307 
6308 static int
6309 ipfw_table_zero(struct sockopt *sopt)
6310 {
6311 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6312 	struct netmsg_base nm;
6313 	struct ipfw_ioc_table *tbl;
6314 
6315 	ASSERT_NETISR0;
6316 
6317 	if (sopt->sopt_valsize != sizeof(*tbl))
6318 		return (EINVAL);
6319 	tbl = sopt->sopt_val;
6320 
6321 	if (tbl->tableid < 0) {
6322 		netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6323 		    ipfw_table_zeroall_dispatch);
6324 		netisr_domsg_global(&nm);
6325 		return (0);
6326 	} else if (tbl->tableid >= ipfw_table_max) {
6327 		return (EINVAL);
6328 	} else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6329 		return (ENOENT);
6330 	}
6331 
6332 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6333 	    ipfw_table_zero_dispatch);
6334 	nm.lmsg.u.ms_result = tbl->tableid;
6335 	netisr_domsg_global(&nm);
6336 
6337 	return (0);
6338 }
6339 
6340 static int
6341 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6342 {
6343 	struct netmsg_tblexp *nm = xnm;
6344 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6345 
6346 	if (te->te_expired) {
6347 		ipfw_table_killrn(nm->rnh, rn);
6348 		nm->expcnt++;
6349 	}
6350 	return (0);
6351 }
6352 
6353 static void
6354 ipfw_table_expire_dispatch(netmsg_t nmsg)
6355 {
6356 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6357 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6358 	struct radix_node_head *rnh;
6359 
6360 	ASSERT_NETISR_NCPUS(mycpuid);
6361 
6362 	rnh = ctx->ipfw_tables[nm->tableid];
6363 	nm->rnh = rnh;
6364 	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6365 
6366 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6367 	    ("not all expired addresses (%d) were deleted (%d)",
6368 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6369 
6370 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6371 }
6372 
6373 static void
6374 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6375 {
6376 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6377 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6378 	int i;
6379 
6380 	ASSERT_NETISR_NCPUS(mycpuid);
6381 
6382 	for (i = 0; i < ipfw_table_max; ++i) {
6383 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6384 
6385 		if (rnh == NULL)
6386 			continue;
6387 		nm->rnh = rnh;
6388 		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6389 	}
6390 
6391 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6392 	    ("not all expired addresses (%d) were deleted (%d)",
6393 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6394 
6395 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6396 }
6397 
6398 static int
6399 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6400 {
6401 	struct netmsg_tblexp *nm = xnm;
6402 	struct ipfw_tblent *te;
6403 	time_t lastuse;
6404 
6405 	te = (struct ipfw_tblent *)rn;
6406 	lastuse = te->te_lastuse;
6407 
6408 	while ((te = te->te_sibling) != NULL) {
6409 		if (te->te_lastuse > lastuse)
6410 			lastuse = te->te_lastuse;
6411 	}
6412 	if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6413 		/* Not expired */
6414 		return (0);
6415 	}
6416 
6417 	te = (struct ipfw_tblent *)rn;
6418 	te->te_expired = 1;
6419 	while ((te = te->te_sibling) != NULL)
6420 		te->te_expired = 1;
6421 	nm->cnt++;
6422 
6423 	return (0);
6424 }
6425 
6426 static int
6427 ipfw_table_expire(struct sockopt *sopt)
6428 {
6429 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6430 	struct netmsg_tblexp nm;
6431 	struct ipfw_ioc_tblexp *tbl;
6432 	struct radix_node_head *rnh;
6433 
6434 	ASSERT_NETISR0;
6435 
6436 	if (sopt->sopt_valsize != sizeof(*tbl))
6437 		return (EINVAL);
6438 	tbl = sopt->sopt_val;
6439 	tbl->expcnt = 0;
6440 
6441 	nm.expcnt = 0;
6442 	nm.cnt = 0;
6443 	nm.expire = tbl->expire;
6444 
6445 	if (tbl->tableid < 0) {
6446 		int i;
6447 
6448 		for (i = 0; i < ipfw_table_max; ++i) {
6449 			rnh = ctx->ipfw_tables[i];
6450 			if (rnh == NULL)
6451 				continue;
6452 			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6453 		}
6454 		if (nm.cnt == 0) {
6455 			/* No addresses can be expired. */
6456 			return (0);
6457 		}
6458 		tbl->expcnt = nm.cnt;
6459 
6460 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6461 		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6462 		nm.tableid = -1;
6463 		netisr_domsg_global(&nm.base);
6464 		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6465 		    ("not all expired addresses (%d) were deleted (%d)",
6466 		     nm.cnt * netisr_ncpus, nm.expcnt));
6467 
6468 		return (0);
6469 	} else if (tbl->tableid >= ipfw_table_max) {
6470 		return (EINVAL);
6471 	}
6472 
6473 	rnh = ctx->ipfw_tables[tbl->tableid];
6474 	if (rnh == NULL)
6475 		return (ENOENT);
6476 	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6477 	if (nm.cnt == 0) {
6478 		/* No addresses can be expired. */
6479 		return (0);
6480 	}
6481 	tbl->expcnt = nm.cnt;
6482 
6483 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6484 	    ipfw_table_expire_dispatch);
6485 	nm.tableid = tbl->tableid;
6486 	netisr_domsg_global(&nm.base);
6487 	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6488 	    ("not all expired addresses (%d) were deleted (%d)",
6489 	     nm.cnt * netisr_ncpus, nm.expcnt));
6490 	return (0);
6491 }
6492 
6493 static void
6494 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6495 {
6496 	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6497 
6498 	KKASSERT((rule->rule_flags &
6499 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6500 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6501 	ipfw_free_rule(rule);
6502 
6503 	netisr_replymsg(&nmsg->base, 0);
6504 }
6505 
6506 static void
6507 ipfw_crossref_reap(void)
6508 {
6509 	struct ip_fw *rule, *prev = NULL;
6510 
6511 	ASSERT_NETISR0;
6512 
6513 	rule = ipfw_gd.ipfw_crossref_free;
6514 	while (rule != NULL) {
6515 		uint64_t inflight = 0;
6516 		int i;
6517 
6518 		for (i = 0; i < netisr_ncpus; ++i)
6519 			inflight += rule->cross_rules[i]->cross_refs;
6520 		if (inflight == 0) {
6521 			struct ip_fw *f = rule;
6522 
6523 			/*
6524 			 * Unlink.
6525 			 */
6526 			rule = rule->next;
6527 			if (prev != NULL)
6528 				prev->next = rule;
6529 			else
6530 				ipfw_gd.ipfw_crossref_free = rule;
6531 
6532 			/*
6533 			 * Free.
6534 			 */
6535 			for (i = 1; i < netisr_ncpus; ++i) {
6536 				struct netmsg_base nm;
6537 
6538 				netmsg_init(&nm, NULL, &curthread->td_msgport,
6539 				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6540 				nm.lmsg.u.ms_resultp = f->cross_rules[i];
6541 				netisr_domsg(&nm, i);
6542 			}
6543 			KKASSERT((f->rule_flags &
6544 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6545 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6546 			ipfw_unref_rule(f);
6547 		} else {
6548 			prev = rule;
6549 			rule = rule->next;
6550 		}
6551 	}
6552 
6553 	if (ipfw_gd.ipfw_crossref_free != NULL) {
6554 		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6555 		    ipfw_crossref_timeo, NULL);
6556 	}
6557 }
6558 
6559 /*
6560  * {set|get}sockopt parser.
6561  */
6562 static int
6563 ipfw_ctl(struct sockopt *sopt)
6564 {
6565 	int error, rulenum;
6566 	uint32_t *masks;
6567 	size_t size;
6568 
6569 	ASSERT_NETISR0;
6570 
6571 	error = 0;
6572 
6573 	switch (sopt->sopt_name) {
6574 	case IP_FW_GET:
6575 		error = ipfw_ctl_get_rules(sopt);
6576 		break;
6577 
6578 	case IP_FW_FLUSH:
6579 		ipfw_flush(0 /* keep default rule */);
6580 		break;
6581 
6582 	case IP_FW_ADD:
6583 		error = ipfw_ctl_add_rule(sopt);
6584 		break;
6585 
6586 	case IP_FW_DEL:
6587 		/*
6588 		 * IP_FW_DEL is used for deleting single rules or sets,
6589 		 * and (ab)used to atomically manipulate sets.
6590 		 * Argument size is used to distinguish between the two:
6591 		 *    sizeof(uint32_t)
6592 		 *	delete single rule or set of rules,
6593 		 *	or reassign rules (or sets) to a different set.
6594 		 *    2 * sizeof(uint32_t)
6595 		 *	atomic disable/enable sets.
6596 		 *	first uint32_t contains sets to be disabled,
6597 		 *	second uint32_t contains sets to be enabled.
6598 		 */
6599 		masks = sopt->sopt_val;
6600 		size = sopt->sopt_valsize;
6601 		if (size == sizeof(*masks)) {
6602 			/*
6603 			 * Delete or reassign static rule
6604 			 */
6605 			error = ipfw_ctl_alter(masks[0]);
6606 		} else if (size == (2 * sizeof(*masks))) {
6607 			/*
6608 			 * Set enable/disable
6609 			 */
6610 			ipfw_ctl_set_disable(masks[0], masks[1]);
6611 		} else {
6612 			error = EINVAL;
6613 		}
6614 		break;
6615 
6616 	case IP_FW_ZERO:
6617 	case IP_FW_RESETLOG: /* argument is an int, the rule number */
6618 		rulenum = 0;
6619 
6620 		if (sopt->sopt_val != 0) {
6621 		    error = soopt_to_kbuf(sopt, &rulenum,
6622 			    sizeof(int), sizeof(int));
6623 		    if (error)
6624 			break;
6625 		}
6626 		error = ipfw_ctl_zero_entry(rulenum,
6627 			sopt->sopt_name == IP_FW_RESETLOG);
6628 		break;
6629 
6630 	case IP_FW_TBL_CREATE:
6631 		error = ipfw_table_create(sopt);
6632 		break;
6633 
6634 	case IP_FW_TBL_ADD:
6635 	case IP_FW_TBL_DEL:
6636 		error = ipfw_table_alt(sopt);
6637 		break;
6638 
6639 	case IP_FW_TBL_FLUSH:
6640 	case IP_FW_TBL_DESTROY:
6641 		error = ipfw_table_flush(sopt);
6642 		break;
6643 
6644 	case IP_FW_TBL_GET:
6645 		error = ipfw_table_get(sopt);
6646 		break;
6647 
6648 	case IP_FW_TBL_ZERO:
6649 		error = ipfw_table_zero(sopt);
6650 		break;
6651 
6652 	case IP_FW_TBL_EXPIRE:
6653 		error = ipfw_table_expire(sopt);
6654 		break;
6655 
6656 	default:
6657 		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6658 		error = EINVAL;
6659 	}
6660 
6661 	ipfw_crossref_reap();
6662 	return error;
6663 }
6664 
6665 static void
6666 ipfw_keepalive_done(struct ipfw_context *ctx)
6667 {
6668 
6669 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6670 	    ("keepalive is not in progress"));
6671 	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6672 	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6673 	    ipfw_keepalive, NULL);
6674 }
6675 
6676 static void
6677 ipfw_keepalive_more(struct ipfw_context *ctx)
6678 {
6679 	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6680 
6681 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6682 	    ("keepalive is not in progress"));
6683 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6684 	    ("keepalive more did not finish"));
6685 	netisr_sendmsg_oncpu(nm);
6686 }
6687 
6688 static void
6689 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6690 {
6691 	struct ipfw_state *s;
6692 	int scanned = 0, expired = 0, kept = 0;
6693 
6694 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6695 	    ("keepalive is not in progress"));
6696 
6697 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6698 		uint32_t ack_rev, ack_fwd;
6699 		struct ipfw_flow_id id;
6700 		uint8_t send_dir;
6701 
6702 		if (scanned++ >= ipfw_state_scan_max) {
6703 			ipfw_keepalive_more(ctx);
6704 			return;
6705 		}
6706 
6707 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6708 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6709 
6710 		/*
6711 		 * NOTE:
6712 		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
6713 		 * on slave xlat.
6714 		 */
6715 		if (s->st_type == O_ANCHOR)
6716 			continue;
6717 
6718 		if (IPFW_STATE_ISDEAD(s)) {
6719 			ipfw_state_remove(ctx, s);
6720 			if (++expired >= ipfw_state_expire_max) {
6721 				ipfw_keepalive_more(ctx);
6722 				return;
6723 			}
6724 			continue;
6725 		}
6726 
6727 		/*
6728 		 * Keep alive processing
6729 		 */
6730 
6731 		if (s->st_proto != IPPROTO_TCP)
6732 			continue;
6733 		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6734 			continue;
6735 		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6736 		    s->st_expire))
6737 			continue;	/* too early */
6738 
6739 		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6740 		    &id.dst_ip, &id.dst_port);
6741 		ack_rev = s->st_ack_rev;
6742 		ack_fwd = s->st_ack_fwd;
6743 
6744 #define SEND_FWD	0x1
6745 #define SEND_REV	0x2
6746 
6747 		if (IPFW_ISXLAT(s->st_type)) {
6748 			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6749 
6750 			if (x->xlat_dir == MATCH_FORWARD)
6751 				send_dir = SEND_FWD;
6752 			else
6753 				send_dir = SEND_REV;
6754 		} else {
6755 			send_dir = SEND_FWD | SEND_REV;
6756 		}
6757 
6758 		if (send_dir & SEND_REV)
6759 			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6760 		if (send_dir & SEND_FWD)
6761 			send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6762 
6763 #undef SEND_FWD
6764 #undef SEND_REV
6765 
6766 		if (++kept >= ipfw_keepalive_max) {
6767 			ipfw_keepalive_more(ctx);
6768 			return;
6769 		}
6770 	}
6771 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6772 	ipfw_keepalive_done(ctx);
6773 }
6774 
6775 static void
6776 ipfw_keepalive_more_dispatch(netmsg_t nm)
6777 {
6778 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6779 	struct ipfw_state *anchor;
6780 
6781 	ASSERT_NETISR_NCPUS(mycpuid);
6782 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6783 	    ("keepalive is not in progress"));
6784 
6785 	/* Reply ASAP */
6786 	netisr_replymsg(&nm->base, 0);
6787 
6788 	anchor = &ctx->ipfw_keepalive_anch;
6789 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6790 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6791 		ipfw_keepalive_done(ctx);
6792 		return;
6793 	}
6794 	ipfw_keepalive_loop(ctx, anchor);
6795 }
6796 
6797 /*
6798  * This procedure is only used to handle keepalives. It is invoked
6799  * every dyn_keepalive_period
6800  */
6801 static void
6802 ipfw_keepalive_dispatch(netmsg_t nm)
6803 {
6804 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6805 	struct ipfw_state *anchor;
6806 
6807 	ASSERT_NETISR_NCPUS(mycpuid);
6808 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6809 	    ("keepalive is in progress"));
6810 	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6811 
6812 	/* Reply ASAP */
6813 	crit_enter();
6814 	netisr_replymsg(&nm->base, 0);
6815 	crit_exit();
6816 
6817 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6818 		ipfw_keepalive_done(ctx);
6819 		return;
6820 	}
6821 
6822 	anchor = &ctx->ipfw_keepalive_anch;
6823 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6824 	ipfw_keepalive_loop(ctx, anchor);
6825 }
6826 
6827 /*
6828  * This procedure is only used to handle keepalives. It is invoked
6829  * every dyn_keepalive_period
6830  */
6831 static void
6832 ipfw_keepalive(void *dummy __unused)
6833 {
6834 	struct netmsg_base *msg;
6835 
6836 	KKASSERT(mycpuid < netisr_ncpus);
6837 	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6838 
6839 	crit_enter();
6840 	if (msg->lmsg.ms_flags & MSGF_DONE)
6841 		netisr_sendmsg_oncpu(msg);
6842 	crit_exit();
6843 }
6844 
6845 static void
6846 ipfw_ip_input_dispatch(netmsg_t nmsg)
6847 {
6848 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6849 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6850 	struct mbuf *m = nm->m;
6851 	struct ip_fw *rule = nm->arg1;
6852 
6853 	ASSERT_NETISR_NCPUS(mycpuid);
6854 	KASSERT(rule->cpuid == mycpuid,
6855 	    ("rule does not belong to cpu%d", mycpuid));
6856 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6857 	    ("mbuf does not have ipfw continue rule"));
6858 
6859 	KASSERT(ctx->ipfw_cont_rule == NULL,
6860 	    ("pending ipfw continue rule"));
6861 	ctx->ipfw_cont_rule = rule;
6862 	ip_input(m);
6863 
6864 	/* May not be cleared, if ipfw was unload/disabled. */
6865 	ctx->ipfw_cont_rule = NULL;
6866 
6867 	/*
6868 	 * This rule is no longer used; decrement its cross_refs,
6869 	 * so this rule can be deleted.
6870 	 */
6871 	rule->cross_refs--;
6872 }
6873 
6874 static void
6875 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6876 {
6877 	struct netmsg_genpkt *nm;
6878 
6879 	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6880 
6881 	/*
6882 	 * NOTE:
6883 	 * Bump cross_refs to prevent this rule and its siblings
6884 	 * from being deleted, while this mbuf is inflight.  The
6885 	 * cross_refs of the sibling rule on the target cpu will
6886 	 * be decremented, once this mbuf is going to be filtered
6887 	 * on the target cpu.
6888 	 */
6889 	rule->cross_refs++;
6890 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6891 
6892 	nm = &m->m_hdr.mh_genmsg;
6893 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6894 	    ipfw_ip_input_dispatch);
6895 	nm->m = m;
6896 	nm->arg1 = rule->cross_rules[cpuid];
6897 	netisr_sendmsg(&nm->base, cpuid);
6898 }
6899 
6900 static void
6901 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6902 {
6903 
6904 	args->flags = 0;
6905 	args->rule = NULL;
6906 	args->xlat = NULL;
6907 
6908 	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6909 		struct m_tag *mtag;
6910 
6911 		/* Extract info from dummynet tag */
6912 		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6913 		KKASSERT(mtag != NULL);
6914 		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6915 		KKASSERT(args->rule != NULL);
6916 
6917 		m_tag_delete(m, mtag);
6918 		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6919 	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6920 		struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6921 
6922 		KKASSERT(ctx->ipfw_cont_rule != NULL);
6923 		args->rule = ctx->ipfw_cont_rule;
6924 		ctx->ipfw_cont_rule = NULL;
6925 
6926 		if (ctx->ipfw_cont_xlat != NULL) {
6927 			args->xlat = ctx->ipfw_cont_xlat;
6928 			ctx->ipfw_cont_xlat = NULL;
6929 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6930 				args->flags |= IP_FWARG_F_XLATINS;
6931 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6932 			}
6933 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6934 				args->flags |= IP_FWARG_F_XLATFWD;
6935 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6936 			}
6937 		}
6938 		KKASSERT((m->m_pkthdr.fw_flags &
6939 		    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6940 
6941 		args->flags |= IP_FWARG_F_CONT;
6942 		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6943 	}
6944 
6945 	args->eh = NULL;
6946 	args->oif = oif;
6947 	args->m = m;
6948 }
6949 
6950 static int
6951 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6952 {
6953 	struct ip_fw_args args;
6954 	struct mbuf *m = *m0;
6955 	int tee = 0, error = 0, ret;
6956 
6957 	ipfw_init_args(&args, m, NULL);
6958 
6959 	ret = ipfw_chk(&args);
6960 	m = args.m;
6961 	if (m == NULL) {
6962 		if (ret != IP_FW_REDISPATCH)
6963 			error = EACCES;
6964 		goto back;
6965 	}
6966 
6967 	switch (ret) {
6968 	case IP_FW_PASS:
6969 		break;
6970 
6971 	case IP_FW_DENY:
6972 		m_freem(m);
6973 		m = NULL;
6974 		error = EACCES;
6975 		break;
6976 
6977 	case IP_FW_DUMMYNET:
6978 		/* Send packet to the appropriate pipe */
6979 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6980 		break;
6981 
6982 	case IP_FW_TEE:
6983 		tee = 1;
6984 		/* FALL THROUGH */
6985 
6986 	case IP_FW_DIVERT:
6987 		/*
6988 		 * Must clear bridge tag when changing
6989 		 */
6990 		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
6991 		if (ip_divert_p != NULL) {
6992 			m = ip_divert_p(m, tee, 1);
6993 		} else {
6994 			m_freem(m);
6995 			m = NULL;
6996 			/* not sure this is the right error msg */
6997 			error = EACCES;
6998 		}
6999 		break;
7000 
7001 	default:
7002 		panic("unknown ipfw return value: %d", ret);
7003 	}
7004 back:
7005 	*m0 = m;
7006 	return error;
7007 }
7008 
7009 static int
7010 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7011 {
7012 	struct ip_fw_args args;
7013 	struct mbuf *m = *m0;
7014 	int tee = 0, error = 0, ret;
7015 
7016 	ipfw_init_args(&args, m, ifp);
7017 
7018 	ret = ipfw_chk(&args);
7019 	m = args.m;
7020 	if (m == NULL) {
7021 		if (ret != IP_FW_REDISPATCH)
7022 			error = EACCES;
7023 		goto back;
7024 	}
7025 
7026 	switch (ret) {
7027 	case IP_FW_PASS:
7028 		break;
7029 
7030 	case IP_FW_DENY:
7031 		m_freem(m);
7032 		m = NULL;
7033 		error = EACCES;
7034 		break;
7035 
7036 	case IP_FW_DUMMYNET:
7037 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7038 		break;
7039 
7040 	case IP_FW_TEE:
7041 		tee = 1;
7042 		/* FALL THROUGH */
7043 
7044 	case IP_FW_DIVERT:
7045 		if (ip_divert_p != NULL) {
7046 			m = ip_divert_p(m, tee, 0);
7047 		} else {
7048 			m_freem(m);
7049 			m = NULL;
7050 			/* not sure this is the right error msg */
7051 			error = EACCES;
7052 		}
7053 		break;
7054 
7055 	default:
7056 		panic("unknown ipfw return value: %d", ret);
7057 	}
7058 back:
7059 	*m0 = m;
7060 	return error;
7061 }
7062 
7063 static void
7064 ipfw_hook(void)
7065 {
7066 	struct pfil_head *pfh;
7067 
7068 	ASSERT_NETISR0;
7069 
7070 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7071 	if (pfh == NULL)
7072 		return;
7073 
7074 	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7075 	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7076 }
7077 
7078 static void
7079 ipfw_dehook(void)
7080 {
7081 	struct pfil_head *pfh;
7082 
7083 	ASSERT_NETISR0;
7084 
7085 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7086 	if (pfh == NULL)
7087 		return;
7088 
7089 	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7090 	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7091 }
7092 
7093 static int
7094 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7095 {
7096 	int dyn_cnt;
7097 
7098 	dyn_cnt = ipfw_state_cntcoll();
7099 	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7100 
7101 	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7102 }
7103 
7104 static int
7105 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7106 {
7107 	int state_cnt;
7108 
7109 	state_cnt = ipfw_state_cntcoll();
7110 	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7111 }
7112 
7113 static int
7114 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7115 {
7116 	int state_max, error;
7117 
7118 	state_max = ipfw_state_max;
7119 	error = sysctl_handle_int(oidp, &state_max, 0, req);
7120 	if (error || req->newptr == NULL)
7121 		return (error);
7122 
7123 	if (state_max < 1)
7124 		return (EINVAL);
7125 
7126 	ipfw_state_max_set(state_max);
7127 	return (0);
7128 }
7129 
7130 static int
7131 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7132 {
7133 	int dyn_max, error;
7134 
7135 	dyn_max = ipfw_state_max + ipfw_track_max;
7136 
7137 	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7138 	if (error || req->newptr == NULL)
7139 		return (error);
7140 
7141 	if (dyn_max < 2)
7142 		return (EINVAL);
7143 
7144 	ipfw_state_max_set(dyn_max / 2);
7145 	ipfw_track_max = dyn_max / 2;
7146 	return (0);
7147 }
7148 
7149 static void
7150 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7151 {
7152 	int enable = nmsg->lmsg.u.ms_result;
7153 
7154 	ASSERT_NETISR0;
7155 
7156 	if (fw_enable == enable)
7157 		goto reply;
7158 
7159 	fw_enable = enable;
7160 	if (fw_enable)
7161 		ipfw_hook();
7162 	else
7163 		ipfw_dehook();
7164 reply:
7165 	netisr_replymsg(&nmsg->base, 0);
7166 }
7167 
7168 static int
7169 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7170 {
7171 	struct netmsg_base nmsg;
7172 	int enable, error;
7173 
7174 	enable = fw_enable;
7175 	error = sysctl_handle_int(oidp, &enable, 0, req);
7176 	if (error || req->newptr == NULL)
7177 		return error;
7178 
7179 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7180 	    ipfw_sysctl_enable_dispatch);
7181 	nmsg.lmsg.u.ms_result = enable;
7182 
7183 	return netisr_domsg(&nmsg, 0);
7184 }
7185 
7186 static int
7187 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7188 {
7189 	return sysctl_int_range(oidp, arg1, arg2, req,
7190 	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7191 }
7192 
7193 static int
7194 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7195 {
7196 
7197 	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7198 }
7199 
7200 static int
7201 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7202 {
7203 	u_long stat = 0;
7204 	int cpu, error;
7205 
7206 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7207 		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7208 
7209 	error = sysctl_handle_long(oidp, &stat, 0, req);
7210 	if (error || req->newptr == NULL)
7211 		return (error);
7212 
7213 	/* Zero out this stat. */
7214 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7215 		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7216 	return (0);
7217 }
7218 
7219 static void
7220 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7221 {
7222 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7223 	struct ipfw_context *ctx;
7224 	struct ip_fw *def_rule;
7225 
7226 	ASSERT_NETISR_NCPUS(mycpuid);
7227 
7228 	ctx = kmalloc(__offsetof(struct ipfw_context,
7229 	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7230 
7231 	RB_INIT(&ctx->ipfw_state_tree);
7232 	TAILQ_INIT(&ctx->ipfw_state_list);
7233 
7234 	RB_INIT(&ctx->ipfw_track_tree);
7235 	TAILQ_INIT(&ctx->ipfw_track_list);
7236 
7237 	callout_init_mp(&ctx->ipfw_stateto_ch);
7238 	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7239 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7240 	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7241 	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7242 	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7243 
7244 	callout_init_mp(&ctx->ipfw_trackto_ch);
7245 	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7246 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7247 	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7248 	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7249 
7250 	callout_init_mp(&ctx->ipfw_keepalive_ch);
7251 	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7252 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7253 	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7254 	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7255 	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7256 
7257 	callout_init_mp(&ctx->ipfw_xlatreap_ch);
7258 	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7259 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7260 	TAILQ_INIT(&ctx->ipfw_xlatreap);
7261 
7262 	ipfw_ctx[mycpuid] = ctx;
7263 
7264 	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7265 
7266 	def_rule->act_ofs = 0;
7267 	def_rule->rulenum = IPFW_DEFAULT_RULE;
7268 	def_rule->cmd_len = 1;
7269 	def_rule->set = IPFW_DEFAULT_SET;
7270 
7271 	def_rule->cmd[0].len = 1;
7272 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7273 	def_rule->cmd[0].opcode = O_ACCEPT;
7274 #else
7275 	if (filters_default_to_accept)
7276 		def_rule->cmd[0].opcode = O_ACCEPT;
7277 	else
7278 		def_rule->cmd[0].opcode = O_DENY;
7279 #endif
7280 
7281 	def_rule->refcnt = 1;
7282 	def_rule->cpuid = mycpuid;
7283 
7284 	/* Install the default rule */
7285 	ctx->ipfw_default_rule = def_rule;
7286 	ctx->ipfw_layer3_chain = def_rule;
7287 
7288 	/* Link rule CPU sibling */
7289 	ipfw_link_sibling(fwmsg, def_rule);
7290 
7291 	/* Statistics only need to be updated once */
7292 	if (mycpuid == 0)
7293 		ipfw_inc_static_count(def_rule);
7294 
7295 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7296 }
7297 
7298 static void
7299 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7300 {
7301 
7302 	crit_enter();
7303 	/* Reply ASAP */
7304 	netisr_replymsg(&nmsg->base, 0);
7305 	crit_exit();
7306 	ipfw_crossref_reap();
7307 }
7308 
7309 static void
7310 ipfw_crossref_timeo(void *dummy __unused)
7311 {
7312 	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7313 
7314 	KKASSERT(mycpuid == 0);
7315 
7316 	crit_enter();
7317 	if (msg->lmsg.ms_flags & MSGF_DONE)
7318 		netisr_sendmsg_oncpu(msg);
7319 	crit_exit();
7320 }
7321 
7322 static void
7323 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7324 {
7325 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7326 	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7327 	struct ip_fw *f;
7328 
7329 	ASSERT_NETISR_NCPUS(mycpuid);
7330 
7331 	for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7332 		int l, cmdlen;
7333 		ipfw_insn *cmd;
7334 
7335 		if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7336 			continue;
7337 
7338 		for (l = f->cmd_len, cmd = f->cmd; l > 0;
7339 		     l -= cmdlen, cmd += cmdlen) {
7340 			cmdlen = F_LEN(cmd);
7341 			if (cmd->opcode == O_IP_SRC_IFIP ||
7342 			    cmd->opcode == O_IP_DST_IFIP) {
7343 				if (strncmp(ifp->if_xname,
7344 				    ((ipfw_insn_ifip *)cmd)->ifname,
7345 				    IFNAMSIZ) == 0)
7346 					cmd->arg1 &= ~IPFW_IFIP_VALID;
7347 			}
7348 		}
7349 	}
7350 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7351 }
7352 
7353 static void
7354 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7355     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7356 {
7357 	struct netmsg_base nm;
7358 
7359 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7360 	    ipfw_ifaddr_dispatch);
7361 	nm.lmsg.u.ms_resultp = ifp;
7362 	netisr_domsg_global(&nm);
7363 }
7364 
7365 static void
7366 ipfw_init_dispatch(netmsg_t nmsg)
7367 {
7368 	struct netmsg_ipfw fwmsg;
7369 	int error = 0, cpu;
7370 
7371 	ASSERT_NETISR0;
7372 
7373 	if (IPFW_LOADED) {
7374 		kprintf("IP firewall already loaded\n");
7375 		error = EEXIST;
7376 		goto reply;
7377 	}
7378 
7379 	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
7380 		ipfw_table_max = UINT16_MAX;
7381 
7382 	/* Initialize global track tree. */
7383 	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
7384 	IPFW_TRKCNT_TOKINIT;
7385 
7386 	/* GC for freed crossref rules. */
7387 	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
7388 	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
7389 	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
7390 
7391 	ipfw_state_max_set(ipfw_state_max);
7392 	ipfw_state_headroom = 8 * netisr_ncpus;
7393 
7394 	bzero(&fwmsg, sizeof(fwmsg));
7395 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7396 	    ipfw_ctx_init_dispatch);
7397 	netisr_domsg_global(&fwmsg.base);
7398 
7399 	ip_fw_chk_ptr = ipfw_chk;
7400 	ip_fw_ctl_ptr = ipfw_ctl;
7401 	ip_fw_dn_io_ptr = ipfw_dummynet_io;
7402 
7403 	kprintf("ipfw2 initialized, default to %s, logging ",
7404 		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
7405 		O_ACCEPT ? "accept" : "deny");
7406 
7407 #ifdef IPFIREWALL_VERBOSE
7408 	fw_verbose = 1;
7409 #endif
7410 #ifdef IPFIREWALL_VERBOSE_LIMIT
7411 	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
7412 #endif
7413 	if (fw_verbose == 0) {
7414 		kprintf("disabled\n");
7415 	} else if (verbose_limit == 0) {
7416 		kprintf("unlimited\n");
7417 	} else {
7418 		kprintf("limited to %d packets/entry by default\n",
7419 			verbose_limit);
7420 	}
7421 
7422 	ip_fw_loaded = 1;
7423 	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
7424 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
7425 		    ipfw_state_expire_ipifunc, NULL, cpu);
7426 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
7427 		    ipfw_track_expire_ipifunc, NULL, cpu);
7428 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
7429 		    ipfw_keepalive, NULL, cpu);
7430 	}
7431 
7432 	if (fw_enable)
7433 		ipfw_hook();
7434 
7435 	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
7436 	    NULL, EVENTHANDLER_PRI_ANY);
7437 	if (ipfw_ifaddr_event == NULL)
7438 		kprintf("ipfw: ifaddr_event register failed\n");
7439 
7440 reply:
7441 	netisr_replymsg(&nmsg->base, error);
7442 }
7443 
7444 static int
7445 ipfw_init(void)
7446 {
7447 	struct netmsg_base smsg;
7448 
7449 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7450 	    ipfw_init_dispatch);
7451 	return netisr_domsg(&smsg, 0);
7452 }
7453 
7454 #ifdef KLD_MODULE
7455 
7456 static void
7457 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
7458 {
7459 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7460 
7461 	ASSERT_NETISR_NCPUS(mycpuid);
7462 
7463 	callout_cancel(&ctx->ipfw_stateto_ch);
7464 	callout_cancel(&ctx->ipfw_trackto_ch);
7465 	callout_cancel(&ctx->ipfw_keepalive_ch);
7466 	callout_cancel(&ctx->ipfw_xlatreap_ch);
7467 
7468 	crit_enter();
7469 	netisr_dropmsg(&ctx->ipfw_stateexp_more);
7470 	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
7471 	netisr_dropmsg(&ctx->ipfw_trackexp_more);
7472 	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
7473 	netisr_dropmsg(&ctx->ipfw_keepalive_more);
7474 	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
7475 	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
7476 	crit_exit();
7477 
7478 	ipfw_table_flushall_oncpu(ctx, 1);
7479 
7480 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7481 }
7482 
7483 static void
7484 ipfw_fini_dispatch(netmsg_t nmsg)
7485 {
7486 	struct netmsg_base nm;
7487 	int error = 0, cpu;
7488 
7489 	ASSERT_NETISR0;
7490 
7491 	ipfw_crossref_reap();
7492 
7493 	if (ipfw_gd.ipfw_refcnt != 0) {
7494 		error = EBUSY;
7495 		goto reply;
7496 	}
7497 
7498 	ip_fw_loaded = 0;
7499 	ipfw_dehook();
7500 
7501 	/* Synchronize any inflight state/track expire IPIs. */
7502 	lwkt_synchronize_ipiqs("ipfwfini");
7503 
7504 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7505 	    ipfw_ctx_fini_dispatch);
7506 	netisr_domsg_global(&nm);
7507 
7508 	callout_cancel(&ipfw_gd.ipfw_crossref_ch);
7509 	crit_enter();
7510 	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
7511 	crit_exit();
7512 
7513 	if (ipfw_ifaddr_event != NULL)
7514 		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
7515 
7516 	ip_fw_chk_ptr = NULL;
7517 	ip_fw_ctl_ptr = NULL;
7518 	ip_fw_dn_io_ptr = NULL;
7519 	ipfw_flush(1 /* kill default rule */);
7520 
7521 	/* Free pre-cpu context */
7522 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7523 		kfree(ipfw_ctx[cpu], M_IPFW);
7524 
7525 	kprintf("IP firewall unloaded\n");
7526 reply:
7527 	netisr_replymsg(&nmsg->base, error);
7528 }
7529 
7530 static void
7531 ipfw_fflush_dispatch(netmsg_t nmsg)
7532 {
7533 
7534 	ipfw_flush(0 /* keep default rule */);
7535 	ipfw_crossref_reap();
7536 	netisr_replymsg(&nmsg->base, 0);
7537 }
7538 
7539 static int
7540 ipfw_fini(void)
7541 {
7542 	struct netmsg_base smsg;
7543 	int i = 0;
7544 
7545 	for (;;) {
7546 		netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7547 		    ipfw_fflush_dispatch);
7548 		netisr_domsg(&smsg, 0);
7549 
7550 		if (ipfw_gd.ipfw_refcnt == 0)
7551 			break;
7552 		kprintf("ipfw: flush pending %d\n", ++i);
7553 		tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7554 	}
7555 
7556 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7557 	    ipfw_fini_dispatch);
7558 	return netisr_domsg(&smsg, 0);
7559 }
7560 
7561 #endif	/* KLD_MODULE */
7562 
7563 static int
7564 ipfw_modevent(module_t mod, int type, void *unused)
7565 {
7566 	int err = 0;
7567 
7568 	switch (type) {
7569 	case MOD_LOAD:
7570 		err = ipfw_init();
7571 		break;
7572 
7573 	case MOD_UNLOAD:
7574 #ifndef KLD_MODULE
7575 		kprintf("ipfw statically compiled, cannot unload\n");
7576 		err = EBUSY;
7577 #else
7578 		err = ipfw_fini();
7579 #endif
7580 		break;
7581 	default:
7582 		break;
7583 	}
7584 	return err;
7585 }
7586 
7587 static moduledata_t ipfwmod = {
7588 	"ipfw",
7589 	ipfw_modevent,
7590 	0
7591 };
7592 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
7593 MODULE_VERSION(ipfw, 1);
7594