xref: /dragonfly/sys/net/ipfw/ip_fw2.c (revision e6e77800)
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27 
28 /*
29  * Implement IP packet firewall (new version)
30  */
31 
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53 
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58 
59 #include <sys/thread2.h>
60 #include <sys/mplock2.h>
61 #include <net/netmsg2.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/in_systm.h>
65 #include <netinet/in_var.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/ip.h>
68 #include <netinet/ip_var.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcpip.h>
75 #include <netinet/udp.h>
76 #include <netinet/udp_var.h>
77 #include <netinet/ip_divert.h>
78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
79 
80 #include <net/ipfw/ip_fw2.h>
81 
82 #ifdef IPFIREWALL_DEBUG
83 #define DPRINTF(fmt, ...) \
84 do { \
85 	if (fw_debug > 0) \
86 		kprintf(fmt, __VA_ARGS__); \
87 } while (0)
88 #else
89 #define DPRINTF(fmt, ...)	((void)0)
90 #endif
91 
92 /*
93  * Description of per-CPU rule duplication:
94  *
95  * Module loading/unloading and all ioctl operations are serialized
96  * by netisr0, so we don't have any ordering or locking problems.
97  *
98  * The following graph shows how an operation on the per-CPU rule
99  * lists is performed [2 CPU case]:
100  *
101  *   CPU0                 CPU1
102  *
103  * netisr0 <------------------------------------+
104  *  domsg                                       |
105  *    :                                         |
106  *    :(delete/add...)                          |
107  *    :                                         |
108  *    :         netmsg                          | netmsg
109  *  forwardmsg---------->netisr1                |
110  *                          :                   |
111  *                          :(delete/add...)    |
112  *                          :                   |
113  *                          :                   |
114  *                        replymsg--------------+
115  *
116  *
117  *
118  * Rule structure [2 CPU case]
119  *
120  *    CPU0               CPU1
121  *
122  * layer3_chain       layer3_chain
123  *     |                  |
124  *     V                  V
125  * +-------+ sibling  +-------+ sibling
126  * | rule1 |--------->| rule1 |--------->NULL
127  * +-------+          +-------+
128  *     |                  |
129  *     |next              |next
130  *     V                  V
131  * +-------+ sibling  +-------+ sibling
132  * | rule2 |--------->| rule2 |--------->NULL
133  * +-------+          +-------+
134  *
135  * ip_fw.sibling:
136  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
137  *    iterate layer3_chain in netisr0; a rule's duplicates on the
138  *    other CPUs can safely be accessed read-only through
139  *    ip_fw.sibling.
140  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
141  *    a) In netisr0 rule3 is determined to be inserted between rule1
142  *       and rule2.  To make this decision we need to iterate the
143  *       layer3_chain in netisr0.  The netmsg, which is used to insert
144  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
145  *       in netisr0 as next_rule.
146  *    b) After the insertion in netisr0 is done, we will move on to
147  *       netisr1.  But instead of locating rule3's position in
148  *       netisr1 by iterating the layer3_chain in netisr1, we set the
149  *       netmsg's prev_rule to rule1->sibling and next_rule to
150  *       rule2->sibling before the netmsg is forwarded to netisr1 from
151  *       netisr0.
152  */
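
/*
 * Illustrative sketch of insertion step b) above (not compiled):
 * once the rule duplicate has been linked on the current cpu, the
 * netmsg's neighbor pointers are advanced through ip_fw.sibling
 * before the netmsg is forwarded to the next netisr, so that netisr
 * never has to iterate its own layer3_chain.  The helper name is
 * made up for illustration; prev_rule/next_rule are from struct
 * netmsg_ipfw and sibling is from struct ip_fw, both defined below.
 */
#if 0
static void
ipfw_insert_advance_sketch(struct netmsg_ipfw *fwmsg)
{
	if (fwmsg->prev_rule != NULL)
		fwmsg->prev_rule = fwmsg->prev_rule->sibling;
	if (fwmsg->next_rule != NULL)
		fwmsg->next_rule = fwmsg->next_rule->sibling;
	/* ... then forward the netmsg to the next netisr. */
}
#endif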
153 
154 /*
155  * Description of states and tracks.
156  *
157  * Both states and tracks are stored in per-cpu RB trees instead of
158  * per-cpu hash tables to avoid the worst case hash degeneration.
159  *
160  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
161  * measured in seconds and depending on the flags.
162  *
163  * When a packet is received, its address fields are first masked with
164  * the mask defined for the rule, then matched against the entries in
165  * the per-cpu state RB tree.  States are generated by the
166  * 'keep-state' and 'limit' options.
167  *
168  * The max number of states is ipfw_state_max.  When we reach the
169  * maximum number of states, we do not create any more.  This is done
170  * to avoid consuming too much memory, but also too much time when
171  * searching on each packet.
172  *
173  * Each state holds a pointer to the parent ipfw rule of the current
174  * CPU so we know what action to perform.  States are removed when the
175  * parent rule is deleted.  XXX we should make them survive.
176  *
177  * There are some limitations with states -- we do not obey the
178  * 'randomized match', and we do not do multiple passes through the
179  * firewall.  XXX check the latter!!!
180  *
181  * States grow independently on each CPU, e.g. 2 CPU case:
182  *
183  *        CPU0                     CPU1
184  * ...................      ...................
185  * :  state RB tree  :      :  state RB tree  :
186  * :                 :      :                 :
187  * : state1   state2 :      :      state3     :
188  * :     |    |      :      :        |        :
189  * :.....|....|......:      :........|........:
190  *       |    |                      |
191  *       |    |                      |st_rule
192  *       |    |                      |
193  *       V    V                      V
194  *     +-------+                 +-------+
195  *     | rule1 |                 | rule1 |
196  *     +-------+                 +-------+
197  *
198  * Tracks are used to enforce limits on the number of sessions.  Tracks
199  * are generated by the 'limit' option.
200  *
201  * The max number of tracks is ipfw_track_max.  When we reach the
202  * maximum number of tracks, we do not create any more.  This is done
203  * to avoid consuming too much memory.
204  *
205  * Tracks are organized into two layers: the track counter RB tree is
206  * shared between CPUs, while the track RB tree is per-cpu.  States
207  * generated by the 'limit' option are linked to the track in addition
208  * to the per-cpu state RB tree, mainly to ease expiration.  e.g. 2 CPU case:
209  *
210  *             ..............................
211  *             :    track counter RB tree   :
212  *             :                            :
213  *             :        +-----------+       :
214  *             :        |  trkcnt1  |       :
215  *             :        |           |       :
216  *             :      +--->counter<----+    :
217  *             :      | |           |  |    :
218  *             :      | +-----------+  |    :
219  *             :......|................|....:
220  *                    |                |
221  *        CPU0        |                |         CPU1
222  * .................  |t_count         |  .................
223  * : track RB tree :  |                |  : track RB tree :
224  * :               :  |                |  :               :
225  * : +-->track1-------+                +--------track2    :
226  * : |     A       :                      :               :
227  * : |     |       :                      :               :
228  * :.|.....|.......:                      :...............:
229  *   |     +----------------+
230  *   | .................... |
231  *   | :   state RB tree  : |st_track
232  *   | :                  : |
233  *   +---state1    state2---+
234  *     :     |       |    :
235  *     :.....|.......|....:
236  *           |       |
237  *           |       |st_rule
238  *           V       V
239  *         +----------+
240  *         |   rule1  |
241  *         +----------+
242  */
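
/*
 * Illustrative sketch (not compiled) of how the shared counter could
 * gate state creation for a 'limit' rule.  The helper name and the
 * 'limit' argument are made up for illustration; t_count points into
 * the shared ipfw_trkcnt (defined below), hence the atomic update.
 * The check is deliberately loose: a small overshoot across cpus is
 * assumed to be acceptable.
 */
#if 0
static int
ipfw_track_limit_sketch(struct ipfw_track *t, int limit)
{
	if (*t->t_count >= limit)
		return (0);		/* session limit reached */
	atomic_add_int(t->t_count, 1);	/* account the new state */
	return (1);
}
#endif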
243 
244 #define IPFW_AUTOINC_STEP_MIN	1
245 #define IPFW_AUTOINC_STEP_MAX	1000
246 #define IPFW_AUTOINC_STEP_DEF	100
247 
248 #define IPFW_TABLE_MAX_DEF	64
249 
250 #define	IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
251 #define IPFW_DEFAULT_SET	31	/* set number for the default rule */
252 
253 #define MATCH_REVERSE		0
254 #define MATCH_FORWARD		1
255 #define MATCH_NONE		2
256 #define MATCH_UNKNOWN		3
257 
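/* Wrap-safe "(a) <= (b)" for timestamps, via signed subtraction. */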
258 #define TIME_LEQ(a, b)		((a) - (b) <= 0)
259 
260 #define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
261 #define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
262 				 (IPFW_STATE_TCPFLAGS << 8))
263 
264 #define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
265 #define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
266 #define BOTH_RST		(TH_RST | (TH_RST << 8))
267 /* TH_ACK here means FIN was ACKed. */
268 #define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))
269 
270 #define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
271 				 (((s)->st_state & BOTH_RST) ||		\
272 				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
273 
274 #define O_ANCHOR		O_NOP
275 
276 #define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
277 #define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
278 				 ((struct ipfw_xlat *)(s))->xlat_invalid)
279 
280 #define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
281 #define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2
282 
283 #define IPFW_XLATE_INSERT	0x0001
284 #define IPFW_XLATE_FORWARD	0x0002
285 #define IPFW_XLATE_OUTPUT	0x0004
286 
287 struct netmsg_ipfw {
288 	struct netmsg_base	base;
289 	const struct ipfw_ioc_rule *ioc_rule;
290 	struct ip_fw		*next_rule;
291 	struct ip_fw		*prev_rule;
292 	struct ip_fw		*sibling;
293 	uint32_t		rule_flags;
294 	struct ip_fw		**cross_rules;
295 };
296 
297 struct netmsg_del {
298 	struct netmsg_base	base;
299 	struct ip_fw		*start_rule;
300 	struct ip_fw		*prev_rule;
301 	uint16_t		rulenum;
302 	uint8_t			from_set;
303 	uint8_t			to_set;
304 };
305 
306 struct netmsg_zent {
307 	struct netmsg_base	base;
308 	struct ip_fw		*start_rule;
309 	uint16_t		rulenum;
310 	uint16_t		log_only;
311 };
312 
313 struct netmsg_cpstate {
314 	struct netmsg_base	base;
315 	struct ipfw_ioc_state	*ioc_state;
316 	int			state_cntmax;
317 	int			state_cnt;
318 };
319 
320 struct netmsg_tblent {
321 	struct netmsg_base	base;
322 	struct sockaddr		*key;
323 	struct sockaddr		*netmask;
324 	struct ipfw_tblent	*sibling;
325 	int			tableid;
326 };
327 
328 struct netmsg_tblflush {
329 	struct netmsg_base	base;
330 	int			tableid;
331 	int			destroy;
332 };
333 
334 struct netmsg_tblexp {
335 	struct netmsg_base	base;
336 	time_t			expire;
337 	int			tableid;
338 	int			cnt;
339 	int			expcnt;
340 	struct radix_node_head	*rnh;
341 };
342 
343 struct ipfw_table_cp {
344 	struct ipfw_ioc_tblent	*te;
345 	int			te_idx;
346 	int			te_cnt;
347 };
348 
349 struct ip_fw_local {
350 	/*
351 	 * offset	The offset of a fragment. offset != 0 means that
352 	 *	we have a fragment at this offset of an IPv4 packet.
353 	 *	offset == 0 means that (if this is an IPv4 packet)
354 	 *	this is the first or only fragment.
355 	 */
356 	u_short			offset;
357 
358 	/*
359 	 * Local copies of addresses. They are only valid if we have
360 	 * an IP packet.
361 	 *
362 	 * proto	The protocol. Set to 0 for non-ip packets,
363 	 *	or to the protocol read from the packet otherwise.
364 	 *	proto != 0 means that we have an IPv4 packet.
365 	 *
366 	 * src_port, dst_port	port numbers, in HOST format. Only
367 	 *	valid for TCP and UDP packets.
368 	 *
369 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
370 	 *	Only valid for IPv4 packets.
371 	 */
372 	uint8_t			proto;
373 	uint16_t		src_port;	/* NOTE: host format	*/
374 	uint16_t		dst_port;	/* NOTE: host format	*/
375 	struct in_addr		src_ip;		/* NOTE: network format	*/
376 	struct in_addr		dst_ip;		/* NOTE: network format	*/
377 	uint16_t		ip_len;
378 	struct tcphdr		*tcp;
379 };
380 
381 struct ipfw_addrs {
382 	uint32_t		addr1;	/* host byte order */
383 	uint32_t		addr2;	/* host byte order */
384 };
385 
386 struct ipfw_ports {
387 	uint16_t		port1;	/* host byte order */
388 	uint16_t		port2;	/* host byte order */
389 };
390 
391 struct ipfw_key {
392 	union {
393 		struct ipfw_addrs addrs;
394 		uint64_t	value;
395 	} addr_u;
396 	union {
397 		struct ipfw_ports ports;
398 		uint32_t	value;
399 	} port_u;
400 	uint8_t			proto;
401 	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
402 	uint16_t		rsvd2;
403 };
404 
405 #define IPFW_KEY_SWAP_ADDRS	0x1
406 #define IPFW_KEY_SWAP_PORTS	0x2
407 #define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
408 
409 struct ipfw_trkcnt {
410 	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
411 	struct ipfw_key		tc_key;
412 	uintptr_t		tc_ruleid;
413 	int			tc_refs;
414 	int			tc_count;
415 	time_t			tc_expire;	/* userland get-only */
416 	uint16_t		tc_rulenum;	/* userland get-only */
417 } __cachealign;
418 
419 #define tc_addrs		tc_key.addr_u.value
420 #define tc_ports		tc_key.port_u.value
421 #define tc_proto		tc_key.proto
422 #define tc_saddr		tc_key.addr_u.addrs.addr1
423 #define tc_daddr		tc_key.addr_u.addrs.addr2
424 #define tc_sport		tc_key.port_u.ports.port1
425 #define tc_dport		tc_key.port_u.ports.port2
426 
427 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
428 
429 struct ipfw_state;
430 
431 struct ipfw_track {
432 	RB_ENTRY(ipfw_track)	t_rblink;
433 	struct ipfw_key		t_key;
434 	struct ip_fw		*t_rule;
435 	time_t			t_lastexp;
436 	LIST_HEAD(, ipfw_state)	t_state_list;
437 	time_t			t_expire;
438 	volatile int		*t_count;
439 	struct ipfw_trkcnt	*t_trkcnt;
440 	TAILQ_ENTRY(ipfw_track)	t_link;
441 };
442 
443 #define t_addrs			t_key.addr_u.value
444 #define t_ports			t_key.port_u.value
445 #define t_proto			t_key.proto
446 #define t_saddr			t_key.addr_u.addrs.addr1
447 #define t_daddr			t_key.addr_u.addrs.addr2
448 #define t_sport			t_key.port_u.ports.port1
449 #define t_dport			t_key.port_u.ports.port2
450 
451 RB_HEAD(ipfw_track_tree, ipfw_track);
452 TAILQ_HEAD(ipfw_track_list, ipfw_track);
453 
454 struct ipfw_state {
455 	RB_ENTRY(ipfw_state)	st_rblink;
456 	struct ipfw_key		st_key;
457 
458 	time_t			st_expire;	/* expire time */
459 	struct ip_fw		*st_rule;
460 
461 	uint64_t		st_pcnt;	/* packets */
462 	uint64_t		st_bcnt;	/* bytes */
463 
464 	/*
465 	 * st_state:
466 	 * State of this rule, typically a combination of TCP flags.
467 	 *
468 	 * st_ack_fwd/st_ack_rev:
469 	 * Most recent ACKs in forward and reverse direction.  They
470 	 * are used to generate keepalives.
471 	 */
472 	uint32_t		st_state;
473 	uint32_t		st_ack_fwd;	/* host byte order */
474 	uint32_t		st_seq_fwd;	/* host byte order */
475 	uint32_t		st_ack_rev;	/* host byte order */
476 	uint32_t		st_seq_rev;	/* host byte order */
477 
478 	uint16_t		st_flags;	/* IPFW_STATE_F_ */
479 	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
480 	struct ipfw_track	*st_track;
481 
482 	LIST_ENTRY(ipfw_state)	st_trklink;
483 	TAILQ_ENTRY(ipfw_state)	st_link;
484 };
485 
486 #define st_addrs		st_key.addr_u.value
487 #define st_ports		st_key.port_u.value
488 #define st_proto		st_key.proto
489 #define st_swap			st_key.swap
490 
491 #define IPFW_STATE_F_ACKFWD	0x0001
492 #define IPFW_STATE_F_SEQFWD	0x0002
493 #define IPFW_STATE_F_ACKREV	0x0004
494 #define IPFW_STATE_F_SEQREV	0x0008
495 #define IPFW_STATE_F_XLATSRC	0x0010
496 #define IPFW_STATE_F_XLATSLAVE	0x0020
497 #define IPFW_STATE_F_LINKED	0x0040
498 
499 #define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
500 				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
501 
502 /* Expired or being deleted. */
503 #define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) || \
504 				 IPFW_XLAT_INVALID((s)))
505 
506 TAILQ_HEAD(ipfw_state_list, ipfw_state);
507 RB_HEAD(ipfw_state_tree, ipfw_state);
508 
509 struct ipfw_xlat {
510 	struct ipfw_state	xlat_st;	/* MUST be the first field */
511 	uint32_t		xlat_addr;	/* network byte order */
512 	uint16_t		xlat_port;	/* network byte order */
513 	uint16_t		xlat_dir;	/* MATCH_ */
514 	struct ifnet		*xlat_ifp;	/* matching ifnet */
515 	struct ipfw_xlat	*xlat_pair;	/* paired state */
516 	int			xlat_pcpu;	/* paired cpu */
517 	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
518 	volatile uint64_t	xlat_crefs;	/* cross references */
519 	struct netmsg_base	xlat_freenm;	/* for remote free */
520 };
521 
522 #define xlat_type		xlat_st.st_type
523 #define xlat_flags		xlat_st.st_flags
524 #define xlat_rule		xlat_st.st_rule
525 #define xlat_bcnt		xlat_st.st_bcnt
526 #define xlat_pcnt		xlat_st.st_pcnt
527 
528 struct ipfw_tblent {
529 	struct radix_node	te_nodes[2];
530 	struct sockaddr_in	te_key;
531 	u_long			te_use;
532 	time_t			te_lastuse;
533 	struct ipfw_tblent	*te_sibling;
534 	volatile int		te_expired;
535 };
536 
537 struct ipfw_context {
538 	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
539 	struct ip_fw		*ipfw_default_rule;	/* default rule */
540 	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/
541 
542 	/*
543 	 * ipfw_set_disable contains one bit per set value (0..31).
544 	 * If the bit is set, all rules in the corresponding set
545 	 * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
546 	 * default rule and CANNOT be disabled.
547 	 */
548 	uint32_t		ipfw_set_disable;
549 
550 	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */
551 
552 	struct ip_fw		*ipfw_cont_rule;
553 	struct ipfw_xlat	*ipfw_cont_xlat;
554 
555 	struct ipfw_state_tree	ipfw_state_tree;
556 	struct ipfw_state_list	ipfw_state_list;
557 	int			ipfw_state_loosecnt;
558 	int			ipfw_state_cnt;
559 
560 	union {
561 		struct ipfw_state state;
562 		struct ipfw_track track;
563 		struct ipfw_trkcnt trkcnt;
564 	} ipfw_tmpkey;
565 
566 	struct ipfw_track_tree	ipfw_track_tree;
567 	struct ipfw_track_list	ipfw_track_list;
568 	struct ipfw_trkcnt	*ipfw_trkcnt_spare;
569 
570 	struct callout		ipfw_stateto_ch;
571 	time_t			ipfw_state_lastexp;
572 	struct netmsg_base	ipfw_stateexp_nm;
573 	struct netmsg_base	ipfw_stateexp_more;
574 	struct ipfw_state	ipfw_stateexp_anch;
575 
576 	struct callout		ipfw_trackto_ch;
577 	time_t			ipfw_track_lastexp;
578 	struct netmsg_base	ipfw_trackexp_nm;
579 	struct netmsg_base	ipfw_trackexp_more;
580 	struct ipfw_track	ipfw_trackexp_anch;
581 
582 	struct callout		ipfw_keepalive_ch;
583 	struct netmsg_base	ipfw_keepalive_nm;
584 	struct netmsg_base	ipfw_keepalive_more;
585 	struct ipfw_state	ipfw_keepalive_anch;
586 
587 	struct callout		ipfw_xlatreap_ch;
588 	struct netmsg_base	ipfw_xlatreap_nm;
589 	struct ipfw_state_list	ipfw_xlatreap;
590 
591 	/*
592 	 * Statistics
593 	 */
594 	u_long			ipfw_sts_reap;
595 	u_long			ipfw_sts_reapfailed;
596 	u_long			ipfw_sts_overflow;
597 	u_long			ipfw_sts_nomem;
598 	u_long			ipfw_sts_tcprecycled;
599 
600 	u_long			ipfw_tks_nomem;
601 	u_long			ipfw_tks_reap;
602 	u_long			ipfw_tks_reapfailed;
603 	u_long			ipfw_tks_overflow;
604 	u_long			ipfw_tks_cntnomem;
605 
606 	u_long			ipfw_frags;
607 	u_long			ipfw_defraged;
608 	u_long			ipfw_defrag_remote;
609 
610 	u_long			ipfw_xlated;
611 	u_long			ipfw_xlate_split;
612 	u_long			ipfw_xlate_conflicts;
613 	u_long			ipfw_xlate_cresolved;
614 
615 	/* Last field */
616 	struct radix_node_head	*ipfw_tables[];
617 };
618 
619 #define IPFW_FLAG_KEEPALIVE	0x01
620 #define IPFW_FLAG_STATEEXP	0x02
621 #define IPFW_FLAG_TRACKEXP	0x04
622 #define IPFW_FLAG_STATEREAP	0x08
623 #define IPFW_FLAG_TRACKREAP	0x10
624 
625 #define ipfw_state_tmpkey	ipfw_tmpkey.state
626 #define ipfw_track_tmpkey	ipfw_tmpkey.track
627 #define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt
628 
629 struct ipfw_global {
630 	int			ipfw_state_loosecnt;	/* cache aligned */
631 	time_t			ipfw_state_globexp __cachealign;
632 
633 	struct lwkt_token	ipfw_trkcnt_token __cachealign;
634 	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
635 	int			ipfw_trkcnt_cnt;
636 	time_t			ipfw_track_globexp;
637 
638 	/* Accessed in netisr0. */
639 	struct ip_fw		*ipfw_crossref_free __cachealign;
640 	struct callout		ipfw_crossref_ch;
641 	struct netmsg_base	ipfw_crossref_nm;
642 
643 #ifdef KLD_MODULE
644 	/*
645 	 * Module can not be unloaded, if there are references to
646 	 * certains rules of ipfw(4), e.g. dummynet(4)
647 	 */
648 	int			ipfw_refcnt __cachealign;
649 #endif
650 } __cachealign;
651 
652 static struct ipfw_context	*ipfw_ctx[MAXCPU];
653 
654 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
655 
656 /*
657  * The following two global variables are accessed and updated only
658  * in netisr0.
659  */
660 static uint32_t static_count;	/* # of static rules */
661 static uint32_t static_ioc_len;	/* bytes of static rules */
662 
663 /*
664  * If 1, ipfw static rules are being flushed and
665  * ipfw_chk() will skip to the default rule.
666  */
667 static int ipfw_flushing;
668 
669 static int fw_verbose;
670 static int verbose_limit;
671 
672 static int fw_debug;
673 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
674 
675 static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;
676 
677 static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
678 static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
679 
680 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
681 
682 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
683 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
684     "Firewall statistics");
685 
686 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
687     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
688 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
689     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
690     "Rule number autoincrement step");
691 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
692     &fw_one_pass, 0,
693     "Only do a single pass through ipfw when using dummynet(4)");
694 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
695     &fw_debug, 0, "Enable printing of debug ip_fw statements");
696 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
697     &fw_verbose, 0, "Log matches to ipfw rules");
698 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
699     &verbose_limit, 0, "Upper limit of matched ipfw rules that are logged");
700 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
701     &ipfw_table_max, 0, "Max # of tables");
702 
703 static int	ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
704 static int	ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
705 static int	ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
706 static int	ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
707 static int	ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
708 static int	ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
709 
710 /*
711  * Timeouts for various events in handling states.
712  *
713  * NOTE:
714  * 1 == 0~1 second.
715  * 2 == 1~2 second(s).
716  *
717  * We use 2 seconds for the FIN lifetime, so that the states will
718  * not be reaped prematurely.
719  */
720 static uint32_t dyn_ack_lifetime = 300;
721 static uint32_t dyn_syn_lifetime = 20;
722 static uint32_t dyn_finwait_lifetime = 20;
723 static uint32_t dyn_fin_lifetime = 2;
724 static uint32_t dyn_rst_lifetime = 2;
725 static uint32_t dyn_udp_lifetime = 10;
726 static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */
727 
728 /*
729  * Keepalives are sent if dyn_keepalive is set. They are sent every
730  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
731  * seconds of a state's lifetime.
732  */
733 static uint32_t dyn_keepalive_interval = 20;
734 static uint32_t dyn_keepalive_period = 5;
735 static uint32_t dyn_keepalive = 1;	/* do send keepalives */
736 
737 static struct ipfw_global	ipfw_gd;
738 static int	ipfw_state_loosecnt_updthr;
739 static int	ipfw_state_max = 4096;	/* max # of states */
740 static int	ipfw_track_max = 4096;	/* max # of tracks */
741 
742 static int	ipfw_state_headroom;	/* setup at module load time */
743 static int	ipfw_state_reap_min = 8;
744 static int	ipfw_state_expire_max = 32;
745 static int	ipfw_state_scan_max = 256;
746 static int	ipfw_keepalive_max = 8;
747 static int	ipfw_track_reap_max = 4;
748 static int	ipfw_track_expire_max = 16;
749 static int	ipfw_track_scan_max = 128;
750 
751 static eventhandler_tag ipfw_ifaddr_event;
752 
753 /* Compat */
754 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
755     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
756     "Number of states and tracks");
757 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
758     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
759     "Max number of states and tracks");
760 
761 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
762     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
763     "Number of states");
764 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
765     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
766     "Max number of states");
767 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
768     &ipfw_state_headroom, 0, "headroom for state reap");
769 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
770     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
771 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
772     &ipfw_track_max, 0, "Max number of tracks");
773 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
774     &static_count, 0, "Number of static rules");
775 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
776     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
777 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
778     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
779 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
780     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
781 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
782     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
783 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
784     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
785 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
786     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
787 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
788     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
789 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
790     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
791 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
792     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
793     "I", "# of states to scan for each expire iteration");
794 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
795     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
796     "I", "# of states to expire for each expire iteration");
797 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
798     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
799     "I", "# of keepalives to send for each keepalive iteration");
800 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
801     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
802     "I", "# of states to reap for state shortage");
803 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
804     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
805     "I", "# of tracks to scan for each expire iteration");
806 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
807     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
808     "I", "# of tracks to expire for each expire iteration");
809 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
810     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
811     "I", "# of tracks to reap for track shortage");
812 
813 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
814     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
815     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
816     "LU", "# of state reaps due to state shortage");
817 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
818     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
819     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
820     "LU", "# of state reap failures");
821 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
822     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
823     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
824     "LU", "# of state overflows");
825 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
826     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
827     __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
828     "LU", "# of state allocation failures");
829 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
830     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
831     __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
832     "LU", "# of states deleted due to fast TCP port recycling");
833 
834 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
835     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
836     __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
837     "LU", "# of track allocation failures");
838 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
839     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
840     __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
841     "LU", "# of track reaps due to track shortage");
842 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
843     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
844     __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
845     "LU", "# of track reap failures");
846 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
847     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
848     __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
849     "LU", "# of track overflows");
850 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
851     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
852     __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
853     "LU", "# of track counter allocation failures");
854 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
855     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
856     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
857     "LU", "# of IP fragments defragmented");
858 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
859     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
860     __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
861     "LU", "# of IP packets after defrag");
862 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
863     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
864     __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
865     "LU", "# of IP packets after defrag dispatched to remote cpus");
866 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
867     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
868     __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
869     "LU", "# of address/port translations");
870 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
871     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
872     __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
873     "LU", "# of address/port translations split between different cpus");
874 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
875     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
876     __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
877     "LU", "# of address/port translation conflicts on remote cpu");
878 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
879     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
880     __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
881     "LU", "# of address/port translation conflicts resolved on remote cpu");
882 
883 static int		ipfw_state_cmp(struct ipfw_state *,
884 			    struct ipfw_state *);
885 static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
886 			    struct ipfw_trkcnt *);
887 static int		ipfw_track_cmp(struct ipfw_track *,
888 			    struct ipfw_track *);
889 
890 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
891 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
892 
893 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
894 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
895 
896 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
897 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
898 
899 static int		ipfw_chk(struct ip_fw_args *);
900 static void		ipfw_track_expire_ipifunc(void *);
901 static void		ipfw_state_expire_ipifunc(void *);
902 static void		ipfw_keepalive(void *);
903 static int		ipfw_state_expire_start(struct ipfw_context *,
904 			    int, int);
905 static void		ipfw_crossref_timeo(void *);
906 static void		ipfw_state_remove(struct ipfw_context *,
907 			    struct ipfw_state *);
908 static void		ipfw_xlat_reap_timeo(void *);
909 static void		ipfw_defrag_redispatch(struct mbuf *, int,
910 			    struct ip_fw *);
911 
912 #define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
913 #define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
914 #define IPFW_TRKCNT_TOKINIT	\
915 	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
916 
917 static void
918 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
919     const struct sockaddr *netmask)
920 {
921 	const u_char *cp1 = (const u_char *)src;
922 	u_char *cp2 = (u_char *)dst;
923 	const u_char *cp3 = (const u_char *)netmask;
924 	u_char *cplim = cp2 + *cp3;
925 	u_char *cplim2 = cp2 + *cp1;
926 
927 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
928 	cp3 += 2;
929 	if (cplim > cplim2)
930 		cplim = cplim2;
931 	while (cp2 < cplim)
932 		*cp2++ = *cp1++ & *cp3++;
933 	if (cp2 < cplim2)
934 		bzero(cp2, cplim2 - cp2);
935 }
936 
937 static __inline uint16_t
938 pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
939 {
940 	uint32_t l;
941 
942 	if (udp && !cksum)
943 		return (0x0000);
944 	l = cksum + old - new;
945 	l = (l >> 16) + (l & 65535);
946 	l = l & 65535;
947 	if (udp && !l)
948 		return (0xFFFF);
949 	return (l);
950 }
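
/*
 * Usage sketch (not compiled; function and variable names are made
 * up): incrementally patch a UDP checksum after rewriting the
 * destination port, instead of recomputing the checksum over the
 * whole datagram.
 */
#if 0
static void
udp_dport_rewrite_sketch(struct udphdr *uh, uint16_t new_dport)
{
	/* Old and new values are both in network byte order. */
	uh->uh_sum = pfil_cksum_fixup(uh->uh_sum, uh->uh_dport,
	    new_dport, 1 /* udp */);
	uh->uh_dport = new_dport;
}
#endif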
951 
952 static __inline void
953 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
954     in_addr_t daddr, uint16_t dport, uint8_t proto)
955 {
956 
957 	key->proto = proto;
958 	key->swap = 0;
959 
960 	if (saddr < daddr) {
961 		key->addr_u.addrs.addr1 = daddr;
962 		key->addr_u.addrs.addr2 = saddr;
963 		key->swap |= IPFW_KEY_SWAP_ADDRS;
964 	} else {
965 		key->addr_u.addrs.addr1 = saddr;
966 		key->addr_u.addrs.addr2 = daddr;
967 	}
968 
969 	if (sport < dport) {
970 		key->port_u.ports.port1 = dport;
971 		key->port_u.ports.port2 = sport;
972 		key->swap |= IPFW_KEY_SWAP_PORTS;
973 	} else {
974 		key->port_u.ports.port1 = sport;
975 		key->port_u.ports.port2 = dport;
976 	}
977 
978 	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
979 		key->swap |= IPFW_KEY_SWAP_PORTS;
980 	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
981 		key->swap |= IPFW_KEY_SWAP_ADDRS;
982 }
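
/*
 * Illustrative sketch (not compiled): both directions of the same
 * connection build an identical canonical key, differing only in the
 * swap flags (which XOR to IPFW_KEY_SWAP_ALL), so one RB tree lookup
 * matches either direction.  The addresses and ports are arbitrary
 * examples.
 */
#if 0
static void
ipfw_key_build_example(void)
{
	struct ipfw_key fwd, rev;

	ipfw_key_build(&fwd, htonl(0x0a000001), 12345,
	    htonl(0x0a000002), 80, IPPROTO_TCP);
	ipfw_key_build(&rev, htonl(0x0a000002), 80,
	    htonl(0x0a000001), 12345, IPPROTO_TCP);
	KKASSERT(fwd.addr_u.value == rev.addr_u.value &&
	    fwd.port_u.value == rev.port_u.value &&
	    (fwd.swap ^ rev.swap) == IPFW_KEY_SWAP_ALL);
}
#endif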
983 
984 static __inline void
985 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
986     in_addr_t *daddr, uint16_t *dport)
987 {
988 
989 	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
990 		*saddr = key->addr_u.addrs.addr2;
991 		*daddr = key->addr_u.addrs.addr1;
992 	} else {
993 		*saddr = key->addr_u.addrs.addr1;
994 		*daddr = key->addr_u.addrs.addr2;
995 	}
996 
997 	if (key->swap & IPFW_KEY_SWAP_PORTS) {
998 		*sport = key->port_u.ports.port2;
999 		*dport = key->port_u.ports.port1;
1000 	} else {
1001 		*sport = key->port_u.ports.port1;
1002 		*dport = key->port_u.ports.port2;
1003 	}
1004 }
1005 
1006 static int
1007 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1008 {
1009 
1010 	if (s1->st_proto > s2->st_proto)
1011 		return (1);
1012 	if (s1->st_proto < s2->st_proto)
1013 		return (-1);
1014 
1015 	if (s1->st_addrs > s2->st_addrs)
1016 		return (1);
1017 	if (s1->st_addrs < s2->st_addrs)
1018 		return (-1);
1019 
1020 	if (s1->st_ports > s2->st_ports)
1021 		return (1);
1022 	if (s1->st_ports < s2->st_ports)
1023 		return (-1);
1024 
1025 	if (s1->st_swap == s2->st_swap ||
1026 	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1027 		return (0);
1028 
1029 	if (s1->st_swap > s2->st_swap)
1030 		return (1);
1031 	else
1032 		return (-1);
1033 }
1034 
1035 static int
1036 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1037 {
1038 
1039 	if (t1->tc_proto > t2->tc_proto)
1040 		return (1);
1041 	if (t1->tc_proto < t2->tc_proto)
1042 		return (-1);
1043 
1044 	if (t1->tc_addrs > t2->tc_addrs)
1045 		return (1);
1046 	if (t1->tc_addrs < t2->tc_addrs)
1047 		return (-1);
1048 
1049 	if (t1->tc_ports > t2->tc_ports)
1050 		return (1);
1051 	if (t1->tc_ports < t2->tc_ports)
1052 		return (-1);
1053 
1054 	if (t1->tc_ruleid > t2->tc_ruleid)
1055 		return (1);
1056 	if (t1->tc_ruleid < t2->tc_ruleid)
1057 		return (-1);
1058 
1059 	return (0);
1060 }
1061 
1062 static int
1063 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1064 {
1065 
1066 	if (t1->t_proto > t2->t_proto)
1067 		return (1);
1068 	if (t1->t_proto < t2->t_proto)
1069 		return (-1);
1070 
1071 	if (t1->t_addrs > t2->t_addrs)
1072 		return (1);
1073 	if (t1->t_addrs < t2->t_addrs)
1074 		return (-1);
1075 
1076 	if (t1->t_ports > t2->t_ports)
1077 		return (1);
1078 	if (t1->t_ports < t2->t_ports)
1079 		return (-1);
1080 
1081 	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1082 		return (1);
1083 	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1084 		return (-1);
1085 
1086 	return (0);
1087 }
1088 
1089 static __inline struct ipfw_state *
1090 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1091 {
1092 	struct ipfw_state *dup;
1093 
1094 	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1095 	    ("state %p was linked", s));
1096 	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1097 	if (dup == NULL) {
1098 		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1099 		s->st_flags |= IPFW_STATE_F_LINKED;
1100 	}
1101 	return (dup);
1102 }
1103 
1104 static __inline void
1105 ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
1106 {
1107 
1108 	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
1109 	    ("state %p was not linked", s));
1110 	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1111 	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1112 	s->st_flags &= ~IPFW_STATE_F_LINKED;
1113 }
1114 
1115 static void
1116 ipfw_state_max_set(int state_max)
1117 {
1118 
1119 	ipfw_state_max = state_max;
1120 	/* Allow 5% states over-allocation. */
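	/* e.g. state_max 4096 with 4 netisr cpus: (4096 / 20) / 4 = 51. */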
1121 	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1122 }
1123 
1124 static __inline int
1125 ipfw_state_cntcoll(void)
1126 {
1127 	int cpu, state_cnt = 0;
1128 
1129 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1130 		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1131 	return (state_cnt);
1132 }
1133 
1134 static __inline int
1135 ipfw_state_cntsync(void)
1136 {
1137 	int state_cnt;
1138 
1139 	state_cnt = ipfw_state_cntcoll();
1140 	ipfw_gd.ipfw_state_loosecnt = state_cnt;
1141 	return (state_cnt);
1142 }
1143 
1144 static __inline int
1145 ipfw_free_rule(struct ip_fw *rule)
1146 {
1147 	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1148 	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1149 	rule->refcnt--;
1150 	if (rule->refcnt == 0) {
1151 		if (rule->cross_rules != NULL)
1152 			kfree(rule->cross_rules, M_IPFW);
1153 		kfree(rule, M_IPFW);
1154 		return 1;
1155 	}
1156 	return 0;
1157 }
1158 
1159 static void
1160 ipfw_unref_rule(void *priv)
1161 {
1162 	ipfw_free_rule(priv);
1163 #ifdef KLD_MODULE
1164 	KASSERT(ipfw_gd.ipfw_refcnt > 0,
1165 	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1166 	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1167 #endif
1168 }
1169 
1170 static __inline void
1171 ipfw_ref_rule(struct ip_fw *rule)
1172 {
1173 	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1174 #ifdef KLD_MODULE
1175 	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1176 #endif
1177 	rule->refcnt++;
1178 }
1179 
1180 /*
1181  * This macro maps an ip pointer into a layer3 header pointer of type T
1182  */
1183 #define	L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1184 
1185 static __inline int
1186 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1187 {
1188 	int type = L3HDR(struct icmp,ip)->icmp_type;
1189 
1190 	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
1191 }
1192 
1193 #define TT	((1 << ICMP_ECHO) | \
1194 		 (1 << ICMP_ROUTERSOLICIT) | \
1195 		 (1 << ICMP_TSTAMP) | \
1196 		 (1 << ICMP_IREQ) | \
1197 		 (1 << ICMP_MASKREQ))
1198 
1199 static int
1200 is_icmp_query(struct ip *ip)
1201 {
1202 	int type = L3HDR(struct icmp, ip)->icmp_type;
1203 
1204 	return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
1205 }
1206 
1207 #undef TT
1208 
1209 /*
1210  * The following checks use two arrays of 8 or 16 bits to store the
1211  * bits that we want set or clear, respectively. They are in the
1212  * low and high half of cmd->arg1 or cmd->d[0].
1213  *
1214  * We scan options and store the bits we find set. We succeed if
1215  *
1216  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1217  *
1218  * The code is sometimes optimized not to store additional variables.
1219  */
1220 static int
1221 flags_match(ipfw_insn *cmd, uint8_t bits)
1222 {
1223 	u_char want_clear;
1224 	bits = ~bits;
1225 
1226 	if (((cmd->arg1 & 0xff) & bits) != 0)
1227 		return 0; /* some bits we want set were clear */
1228 
1229 	want_clear = (cmd->arg1 >> 8) & 0xff;
1230 	if ((want_clear & bits) != want_clear)
1231 		return 0; /* some bits we want clear were set */
1232 	return 1;
1233 }
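
/*
 * Worked example: for a match like "tcpflags syn,!ack", the low byte
 * of cmd->arg1 holds TH_SYN (want set) and the high byte holds TH_ACK
 * (want clear).  A bare SYN segment matches; SYN|ACK fails the
 * want-clear check; a bare ACK fails the want-set check.
 */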
1234 
1235 static int
1236 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1237 {
1238 	int optlen, bits = 0;
1239 	u_char *cp = (u_char *)(ip + 1);
1240 	int x = (ip->ip_hl << 2) - sizeof(struct ip);
1241 
1242 	for (; x > 0; x -= optlen, cp += optlen) {
1243 		int opt = cp[IPOPT_OPTVAL];
1244 
1245 		if (opt == IPOPT_EOL)
1246 			break;
1247 
1248 		if (opt == IPOPT_NOP) {
1249 			optlen = 1;
1250 		} else {
1251 			optlen = cp[IPOPT_OLEN];
1252 			if (optlen <= 0 || optlen > x)
1253 				return 0; /* invalid or truncated */
1254 		}
1255 
1256 		switch (opt) {
1257 		case IPOPT_LSRR:
1258 			bits |= IP_FW_IPOPT_LSRR;
1259 			break;
1260 
1261 		case IPOPT_SSRR:
1262 			bits |= IP_FW_IPOPT_SSRR;
1263 			break;
1264 
1265 		case IPOPT_RR:
1266 			bits |= IP_FW_IPOPT_RR;
1267 			break;
1268 
1269 		case IPOPT_TS:
1270 			bits |= IP_FW_IPOPT_TS;
1271 			break;
1272 
1273 		default:
1274 			break;
1275 		}
1276 	}
1277 	return (flags_match(cmd, bits));
1278 }
1279 
1280 static int
1281 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1282 {
1283 	int optlen, bits = 0;
1284 	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
1285 	u_char *cp = (u_char *)(tcp + 1);
1286 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1287 
1288 	for (; x > 0; x -= optlen, cp += optlen) {
1289 		int opt = cp[0];
1290 
1291 		if (opt == TCPOPT_EOL)
1292 			break;
1293 
1294 		if (opt == TCPOPT_NOP) {
1295 			optlen = 1;
1296 		} else {
1297 			optlen = cp[1];
1298 			if (optlen <= 0)
1299 				break;
1300 		}
1301 
1302 		switch (opt) {
1303 		case TCPOPT_MAXSEG:
1304 			bits |= IP_FW_TCPOPT_MSS;
1305 			break;
1306 
1307 		case TCPOPT_WINDOW:
1308 			bits |= IP_FW_TCPOPT_WINDOW;
1309 			break;
1310 
1311 		case TCPOPT_SACK_PERMITTED:
1312 		case TCPOPT_SACK:
1313 			bits |= IP_FW_TCPOPT_SACK;
1314 			break;
1315 
1316 		case TCPOPT_TIMESTAMP:
1317 			bits |= IP_FW_TCPOPT_TS;
1318 			break;
1319 
1320 		case TCPOPT_CC:
1321 		case TCPOPT_CCNEW:
1322 		case TCPOPT_CCECHO:
1323 			bits |= IP_FW_TCPOPT_CC;
1324 			break;
1325 
1326 		default:
1327 			break;
1328 		}
1329 	}
1330 	return (flags_match(cmd, bits));
1331 }
1332 
1333 static int
1334 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1335 {
1336 	if (ifp == NULL)	/* no iface with this packet, match fails */
1337 		return 0;
1338 
1339 	/* Check by name or by IP address */
1340 	if (cmd->name[0] != '\0') { /* match by name */
1341 		/* Check name */
1342 		if (cmd->p.glob) {
1343 			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1344 				return(1);
1345 		} else {
1346 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1347 				return(1);
1348 		}
1349 	} else {
1350 		struct ifaddr_container *ifac;
1351 
1352 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1353 			struct ifaddr *ia = ifac->ifa;
1354 
1355 			if (ia->ifa_addr == NULL)
1356 				continue;
1357 			if (ia->ifa_addr->sa_family != AF_INET)
1358 				continue;
1359 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1360 			    (ia->ifa_addr))->sin_addr.s_addr)
1361 				return(1);	/* match */
1362 		}
1363 	}
1364 	return(0);	/* no match, fail ... */
1365 }
1366 
1367 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
1368 
1369 /*
1370  * We enter here when we have a rule with O_LOG.
1371  * XXX this function alone takes about 2Kbytes of code!
1372  */
1373 static void
1374 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1375     struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1376 {
1377 	char *action;
1378 	int limit_reached = 0;
1379 	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1380 
1381 	fragment[0] = '\0';
1382 	proto[0] = '\0';
1383 
1384 	if (f == NULL) {	/* bogus pkt */
1385 		if (verbose_limit != 0 &&
1386 		    ctx->ipfw_norule_counter >= verbose_limit)
1387 			return;
1388 		ctx->ipfw_norule_counter++;
1389 		if (ctx->ipfw_norule_counter == verbose_limit)
1390 			limit_reached = verbose_limit;
1391 		action = "Refuse";
1392 	} else {	/* O_LOG is the first action, find the real one */
1393 		ipfw_insn *cmd = ACTION_PTR(f);
1394 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1395 
1396 		if (l->max_log != 0 && l->log_left == 0)
1397 			return;
1398 		l->log_left--;
1399 		if (l->log_left == 0)
1400 			limit_reached = l->max_log;
1401 		cmd += F_LEN(cmd);	/* point to first action */
1402 		if (cmd->opcode == O_PROB)
1403 			cmd += F_LEN(cmd);
1404 
1405 		action = action2;
1406 		switch (cmd->opcode) {
1407 		case O_DENY:
1408 			action = "Deny";
1409 			break;
1410 
1411 		case O_REJECT:
1412 			if (cmd->arg1==ICMP_REJECT_RST) {
1413 				action = "Reset";
1414 			} else if (cmd->arg1==ICMP_UNREACH_HOST) {
1415 				action = "Reject";
1416 			} else {
1417 				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1418 					  cmd->arg1);
1419 			}
1420 			break;
1421 
1422 		case O_ACCEPT:
1423 			action = "Accept";
1424 			break;
1425 
1426 		case O_COUNT:
1427 			action = "Count";
1428 			break;
1429 
1430 		case O_DIVERT:
1431 			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1432 			break;
1433 
1434 		case O_TEE:
1435 			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1436 			break;
1437 
1438 		case O_SKIPTO:
1439 			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1440 			break;
1441 
1442 		case O_PIPE:
1443 			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1444 			break;
1445 
1446 		case O_QUEUE:
1447 			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1448 			break;
1449 
1450 		case O_FORWARD_IP:
1451 			{
1452 				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1453 				int len;
1454 
1455 				len = ksnprintf(SNPARGS(action2, 0),
1456 				    "Forward to %s",
1457 				    kinet_ntoa(sa->sa.sin_addr, abuf));
1458 				if (sa->sa.sin_port) {
1459 					ksnprintf(SNPARGS(action2, len), ":%d",
1460 						  sa->sa.sin_port);
1461 				}
1462 			}
1463 			break;
1464 
1465 		default:
1466 			action = "UNKNOWN";
1467 			break;
1468 		}
1469 	}
1470 
1471 	if (hlen == 0) {	/* non-ip */
1472 		ksnprintf(SNPARGS(proto, 0), "MAC");
1473 	} else {
1474 		struct ip *ip = mtod(m, struct ip *);
1475 		/* these three are all aliases to the same thing */
1476 		struct icmp *const icmp = L3HDR(struct icmp, ip);
1477 		struct tcphdr *const tcp = (struct tcphdr *)icmp;
1478 		struct udphdr *const udp = (struct udphdr *)icmp;
1479 
1480 		int ip_off, offset, ip_len;
1481 		int len;
1482 
1483 		if (eh != NULL) { /* layer 2 packets are as on the wire */
1484 			ip_off = ntohs(ip->ip_off);
1485 			ip_len = ntohs(ip->ip_len);
1486 		} else {
1487 			ip_off = ip->ip_off;
1488 			ip_len = ip->ip_len;
1489 		}
1490 		offset = ip_off & IP_OFFMASK;
1491 		switch (ip->ip_p) {
1492 		case IPPROTO_TCP:
1493 			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1494 					kinet_ntoa(ip->ip_src, abuf));
1495 			if (offset == 0) {
1496 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1497 					  ntohs(tcp->th_sport),
1498 					  kinet_ntoa(ip->ip_dst, abuf),
1499 					  ntohs(tcp->th_dport));
1500 			} else {
1501 				ksnprintf(SNPARGS(proto, len), " %s",
1502 					  kinet_ntoa(ip->ip_dst, abuf));
1503 			}
1504 			break;
1505 
1506 		case IPPROTO_UDP:
1507 			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1508 					kinet_ntoa(ip->ip_src, abuf));
1509 			if (offset == 0) {
1510 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1511 					  ntohs(udp->uh_sport),
1512 					  kinet_ntoa(ip->ip_dst, abuf),
1513 					  ntohs(udp->uh_dport));
1514 			} else {
1515 				ksnprintf(SNPARGS(proto, len), " %s",
1516 					  kinet_ntoa(ip->ip_dst, abuf));
1517 			}
1518 			break;
1519 
1520 		case IPPROTO_ICMP:
1521 			if (offset == 0) {
1522 				len = ksnprintf(SNPARGS(proto, 0),
1523 						"ICMP:%u.%u ",
1524 						icmp->icmp_type,
1525 						icmp->icmp_code);
1526 			} else {
1527 				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1528 			}
1529 			len += ksnprintf(SNPARGS(proto, len), "%s",
1530 					 kinet_ntoa(ip->ip_src, abuf));
1531 			ksnprintf(SNPARGS(proto, len), " %s",
1532 				  kinet_ntoa(ip->ip_dst, abuf));
1533 			break;
1534 
1535 		default:
1536 			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1537 					kinet_ntoa(ip->ip_src, abuf));
1538 			ksnprintf(SNPARGS(proto, len), " %s",
1539 				  kinet_ntoa(ip->ip_dst, abuf));
1540 			break;
1541 		}
1542 
1543 		if (ip_off & (IP_MF | IP_OFFMASK)) {
1544 			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1545 				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1546 				  offset << 3, (ip_off & IP_MF) ? "+" : "");
1547 		}
1548 	}
1549 
1550 	if (oif || m->m_pkthdr.rcvif) {
1551 		log(LOG_SECURITY | LOG_INFO,
1552 		    "ipfw: %d %s %s %s via %s%s\n",
1553 		    f ? f->rulenum : -1,
1554 		    action, proto, oif ? "out" : "in",
1555 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1556 		    fragment);
1557 	} else {
1558 		log(LOG_SECURITY | LOG_INFO,
1559 		    "ipfw: %d %s %s [no if info]%s\n",
1560 		    f ? f->rulenum : -1,
1561 		    action, proto, fragment);
1562 	}
1563 
1564 	if (limit_reached) {
1565 		log(LOG_SECURITY | LOG_NOTICE,
1566 		    "ipfw: limit %d reached on entry %d\n",
1567 		    limit_reached, f ? f->rulenum : -1);
1568 	}
1569 }
1570 
1571 #undef SNPARGS
1572 
1573 static void
1574 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1575 {
1576 	struct ip_fw *rule = slave_x->xlat_rule;
1577 
1578 	KKASSERT(rule->cpuid == mycpuid);
1579 
1580 	/* No more cross references; free this pair now. */
1581 	kfree(x, M_IPFW);
1582 	kfree(slave_x, M_IPFW);
1583 
1584 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1585 	rule->cross_refs--;
1586 }
1587 
1588 static void
1589 ipfw_xlat_reap_dispatch(netmsg_t nm)
1590 {
1591 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1592 	struct ipfw_state *s, *ns;
1593 
1594 	ASSERT_NETISR_NCPUS(mycpuid);
1595 
1596 	crit_enter();
1597 	/* Reply ASAP. */
1598 	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1599 	crit_exit();
1600 
1601 	/* TODO: limit scanning depth */
1602 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1603 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1604 		struct ipfw_xlat *slave_x = x->xlat_pair;
1605 		uint64_t crefs;
1606 
1607 		crefs = slave_x->xlat_crefs + x->xlat_crefs;
1608 		if (crefs == 0) {
1609 			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1610 			ipfw_xlat_reap(x, slave_x);
1611 		}
1612 	}
1613 	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1614 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1615 		    &ctx->ipfw_xlatreap_nm);
1616 	}
1617 }
1618 
1619 static void
1620 ipfw_xlat_reap_timeo(void *xnm)
1621 {
1622 	struct netmsg_base *nm = xnm;
1623 
1624 	KKASSERT(mycpuid < netisr_ncpus);
1625 
1626 	crit_enter();
1627 	if (nm->lmsg.ms_flags & MSGF_DONE)
1628 		netisr_sendmsg_oncpu(nm);
1629 	crit_exit();
1630 }
1631 
1632 static void
1633 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1634 {
1635 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1636 	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1637 	struct ipfw_xlat *slave_x = x->xlat_pair;
1638 	uint64_t crefs;
1639 
1640 	ASSERT_NETISR_NCPUS(mycpuid);
1641 
1642 	KKASSERT(slave_x != NULL);
1643 	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1644 
1645 	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1646 	    ("master xlat is still linked"));
1647 	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1648 		ipfw_state_unlink(ctx, &slave_x->xlat_st);
1649 
1650 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1651 	slave_x->xlat_crefs--;
1652 
1653 	crefs = slave_x->xlat_crefs + x->xlat_crefs;
1654 	if (crefs == 0) {
1655 		ipfw_xlat_reap(x, slave_x);
1656 		return;
1657 	}
1658 
1659 	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1660 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1661 		    &ctx->ipfw_xlatreap_nm);
1662 	}
1663 
1664 	/*
1665 	 * This pair is still referenced; defer its destruction.
1666 	 * YYY reuse st_link.
1667 	 */
1668 	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1669 }
1670 
1671 static __inline void
1672 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1673 {
1674 
1675 	x->xlat_invalid = 1;
1676 	x->xlat_pair->xlat_invalid = 1;
1677 }
1678 
1679 static void
1680 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1681 {
1682 	struct ipfw_xlat *x, *slave_x;
1683 	struct netmsg_base *nm;
1684 
1685 	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
1686 	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
1687 	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
1688 	    ("delete slave xlat"));
1689 
1690 	KASSERT(ctx->ipfw_state_cnt > 0,
1691 	    ("invalid state count %d", ctx->ipfw_state_cnt));
1692 	ctx->ipfw_state_cnt--;
1693 	if (ctx->ipfw_state_loosecnt > 0)
1694 		ctx->ipfw_state_loosecnt--;
1695 
1696 	/*
1697 	 * Unhook this state.
1698 	 */
1699 	if (s->st_track != NULL) {
1700 		struct ipfw_track *t = s->st_track;
1701 
1702 		KASSERT(!LIST_EMPTY(&t->t_state_list),
1703 		    ("track state list is empty"));
1704 		LIST_REMOVE(s, st_trklink);
1705 
1706 		KASSERT(*t->t_count > 0,
1707 		    ("invalid track count %d", *t->t_count));
1708 		atomic_subtract_int(t->t_count, 1);
1709 	}
1710 	ipfw_state_unlink(ctx, s);
1711 
1712 	/*
1713 	 * Free this state.  Xlat requires special processing,
1714 	 * since xlats are paired states and the pair could be on
1715 	 * different cpus.
1716 	 */
1717 
1718 	if (!IPFW_ISXLAT(s->st_type)) {
1719 		/* Not xlat; free now. */
1720 		kfree(s, M_IPFW);
1721 		/* Done! */
1722 		return;
1723 	}
1724 	x = (struct ipfw_xlat *)s;
1725 
1726 	if (x->xlat_pair == NULL) {
1727 		/* Not setup yet; free now. */
1728 		kfree(x, M_IPFW);
1729 		/* Done! */
1730 		return;
1731 	}
1732 	slave_x = x->xlat_pair;
1733 	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);
1734 
1735 	if (x->xlat_pcpu == mycpuid) {
1736 		/*
1737 		 * Paired states are on the same cpu; delete this
1738 		 * pair now.
1739 		 */
1740 		KKASSERT(x->xlat_crefs == 0);
1741 		KKASSERT(slave_x->xlat_crefs == 0);
1742 		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1743 			ipfw_state_unlink(ctx, &slave_x->xlat_st);
1744 		kfree(x, M_IPFW);
1745 		kfree(slave_x, M_IPFW);
1746 		return;
1747 	}
1748 
1749 	/*
1750 	 * Free the paired states on the cpu owning the slave xlat.
1751 	 */
1752 
1753 	/*
1754 	 * Mark the state pair invalid; completely deleting them
1755 	 * may take some time.
1756 	 */
1757 	ipfw_xlat_invalidate(x);
1758 
1759 	nm = &x->xlat_freenm;
1760 	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
1761 	    ipfw_xlat_free_dispatch);
1762 	nm->lmsg.u.ms_resultp = x;
1763 
1764 	/* See the comment in ipfw_xlate_redispatch(). */
1765 	x->xlat_rule->cross_refs++;
1766 	x->xlat_crefs++;
1767 
1768 	netisr_sendmsg(nm, x->xlat_pcpu);
1769 }
1770 
1771 static void
1772 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1773 {
1774 
1775 	if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1776 		KKASSERT(IPFW_ISXLAT(s->st_type));
1777 		ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1778 		ipfw_state_unlink(ctx, s);
1779 		return;
1780 	}
1781 	ipfw_state_del(ctx, s);
1782 }
1783 
1784 static int
1785 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1786 {
1787 	struct ipfw_state *s, *anchor;
1788 	int expired;
1789 
1790 	if (reap_max < ipfw_state_reap_min)
1791 		reap_max = ipfw_state_reap_min;
1792 
1793 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1794 		/*
1795 		 * Kick-start state expiring.  Ignore the scan limit;
1796 		 * we are short of states.
1797 		 */
1798 		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1799 		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1800 		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1801 		return (expired);
1802 	}
1803 
1804 	/*
1805 	 * States are being expired.
1806 	 */
1807 
1808 	if (ctx->ipfw_state_cnt == 0)
1809 		return (0);
1810 
1811 	expired = 0;
1812 	anchor = &ctx->ipfw_stateexp_anch;
1813 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1814 		/*
1815 		 * Ignore scan limit; we are short of states.
1816 		 */
1817 
1818 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1819 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1820 
1821 		if (IPFW_STATE_SCANSKIP(s))
1822 			continue;
1823 
1824 		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
1825 			ipfw_state_del(ctx, s);
1826 			if (++expired >= reap_max)
1827 				break;
1828 			if ((expired & 0xff) == 0 &&
1829 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1830 			    ipfw_state_max)
1831 				break;
1832 		}
1833 	}
1834 	/*
1835 	 * NOTE:
1836 	 * Leave the anchor on the list, even if the end of the list has
1837 	 * been reached.  ipfw_state_expire_more_dispatch() will handle
1838 	 * the removal.
1839 	 */
1840 	return (expired);
1841 }
1842 
1843 static void
1844 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1845 {
1846 	struct ipfw_state *s, *sn;
1847 
1848 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1849 		if (IPFW_STATE_SCANSKIP(s))
1850 			continue;
1851 		if (rule != NULL && s->st_rule != rule)
1852 			continue;
1853 		ipfw_state_del(ctx, s);
1854 	}
1855 }
1856 
1857 static void
1858 ipfw_state_expire_done(struct ipfw_context *ctx)
1859 {
1860 
1861 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1862 	    ("stateexp is not in progress"));
1863 	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1864 	callout_reset(&ctx->ipfw_stateto_ch, hz,
1865 	    ipfw_state_expire_ipifunc, NULL);
1866 }
1867 
1868 static void
1869 ipfw_state_expire_more(struct ipfw_context *ctx)
1870 {
1871 	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1872 
1873 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1874 	    ("stateexp is not in progress"));
1875 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1876 	    ("stateexp more did not finish"));
1877 	netisr_sendmsg_oncpu(nm);
1878 }
1879 
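/*
 * Scan the state list for expired states.  The 'anchor' is a marker
 * that is moved along the list as we scan, so the walk can be
 * suspended once the scan/expire limits are hit and resumed later by
 * ipfw_state_expire_more_dispatch() without holding pointers to
 * states that may be freed in between.
 */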
1880 static int
1881 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1882     int scan_max, int expire_max)
1883 {
1884 	struct ipfw_state *s;
1885 	int scanned = 0, expired = 0;
1886 
1887 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1888 	    ("stateexp is not in progress"));
1889 
1890 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1891 		if (scanned++ >= scan_max) {
1892 			ipfw_state_expire_more(ctx);
1893 			return (expired);
1894 		}
1895 
1896 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1897 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1898 
1899 		if (IPFW_STATE_SCANSKIP(s))
1900 			continue;
1901 
1902 		if (IPFW_STATE_ISDEAD(s) ||
1903 		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1904 		     IPFW_STATE_TCPCLOSED(s))) {
1905 			ipfw_state_del(ctx, s);
1906 			if (++expired >= expire_max) {
1907 				ipfw_state_expire_more(ctx);
1908 				return (expired);
1909 			}
1910 			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1911 			    (expired & 0xff) == 0 &&
1912 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1913 			    ipfw_state_max) {
1914 				ipfw_state_expire_more(ctx);
1915 				return (expired);
1916 			}
1917 		}
1918 	}
1919 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1920 	ipfw_state_expire_done(ctx);
1921 	return (expired);
1922 }
1923 
1924 static void
1925 ipfw_state_expire_more_dispatch(netmsg_t nm)
1926 {
1927 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1928 	struct ipfw_state *anchor;
1929 
1930 	ASSERT_NETISR_NCPUS(mycpuid);
1931 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1932 	    ("stateexp is not in progress"));
1933 
1934 	/* Reply ASAP */
1935 	netisr_replymsg(&nm->base, 0);
1936 
1937 	anchor = &ctx->ipfw_stateexp_anch;
1938 	if (ctx->ipfw_state_cnt == 0) {
1939 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1940 		ipfw_state_expire_done(ctx);
1941 		return;
1942 	}
1943 	ipfw_state_expire_loop(ctx, anchor,
1944 	    ipfw_state_scan_max, ipfw_state_expire_max);
1945 }
1946 
1947 static int
1948 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1949 {
1950 	struct ipfw_state *anchor;
1951 
1952 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1953 	    ("stateexp is in progress"));
1954 	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1955 
1956 	if (ctx->ipfw_state_cnt == 0) {
1957 		ipfw_state_expire_done(ctx);
1958 		return (0);
1959 	}
1960 
1961 	/*
1962 	 * Do not expire more than once per second; it is useless.
1963 	 */
1964 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1965 	    ctx->ipfw_state_lastexp == time_uptime) {
1966 		ipfw_state_expire_done(ctx);
1967 		return (0);
1968 	}
1969 	ctx->ipfw_state_lastexp = time_uptime;
1970 
1971 	anchor = &ctx->ipfw_stateexp_anch;
1972 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1973 	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1974 }
1975 
1976 static void
1977 ipfw_state_expire_dispatch(netmsg_t nm)
1978 {
1979 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1980 
1981 	ASSERT_NETISR_NCPUS(mycpuid);
1982 
1983 	/* Reply ASAP */
1984 	crit_enter();
1985 	netisr_replymsg(&nm->base, 0);
1986 	crit_exit();
1987 
1988 	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
1989 		/* Running; done. */
1990 		return;
1991 	}
1992 	ipfw_state_expire_start(ctx,
1993 	    ipfw_state_scan_max, ipfw_state_expire_max);
1994 }
1995 
1996 static void
1997 ipfw_state_expire_ipifunc(void *dummy __unused)
1998 {
1999 	struct netmsg_base *msg;
2000 
2001 	KKASSERT(mycpuid < netisr_ncpus);
2002 	msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2003 
2004 	crit_enter();
2005 	if (msg->lmsg.ms_flags & MSGF_DONE)
2006 		netisr_sendmsg_oncpu(msg);
2007 	crit_exit();
2008 }
2009 
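/*
 * Track the TCP sequence space of a state: in each direction, seq
 * and ack may only advance (SEQ_GEQ); out-of-sequence updates are
 * rejected, while RSTs are always accepted.  Also record the
 * FIN-has-been-ACKed bits, which are used to recognize a fully
 * closed connection.
 */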
2010 static boolean_t
2011 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2012 {
2013 	uint32_t seq = ntohl(tcp->th_seq);
2014 	uint32_t ack = ntohl(tcp->th_ack);
2015 
2016 	if (tcp->th_flags & TH_RST)
2017 		return (TRUE);
2018 
2019 	if (dir == MATCH_FORWARD) {
2020 		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2021 			s->st_flags |= IPFW_STATE_F_SEQFWD;
2022 			s->st_seq_fwd = seq;
2023 		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2024 			s->st_seq_fwd = seq;
2025 		} else {
2026 			/* Out-of-sequence; done. */
2027 			return (FALSE);
2028 		}
2029 		if (tcp->th_flags & TH_ACK) {
2030 			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2031 				s->st_flags |= IPFW_STATE_F_ACKFWD;
2032 				s->st_ack_fwd = ack;
2033 			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2034 				s->st_ack_fwd = ack;
2035 			} else {
2036 				/* Out-of-sequence; done. */
2037 				return (FALSE);
2038 			}
2039 
2040 			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2041 			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2042 				s->st_state |= (TH_ACK << 8);
2043 		}
2044 	} else {
2045 		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2046 			s->st_flags |= IPFW_STATE_F_SEQREV;
2047 			s->st_seq_rev = seq;
2048 		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2049 			s->st_seq_rev = seq;
2050 		} else {
2051 			/* Out-of-sequence; done. */
2052 			return (FALSE);
2053 		}
2054 		if (tcp->th_flags & TH_ACK) {
2055 			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2056 				s->st_flags |= IPFW_STATE_F_ACKREV;
2057 				s->st_ack_rev = ack;
2058 			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2059 				s->st_ack_rev = ack;
2060 			} else {
2061 				/* Out-of-sequence; done. */
2062 				return (FALSE);
2063 			}
2064 
2065 			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2066 			    s->st_ack_rev == s->st_seq_fwd + 1)
2067 				s->st_state |= TH_ACK;
2068 		}
2069 	}
2070 	return (TRUE);
2071 }
2072 
2073 static void
2074 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2075     const struct tcphdr *tcp, struct ipfw_state *s)
2076 {
2077 
2078 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2079 		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2080 
2081 		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2082 			return;
2083 
2084 		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2085 		switch (s->st_state & IPFW_STATE_TCPSTATES) {
2086 		case TH_SYN:				/* opening */
2087 			s->st_expire = time_uptime + dyn_syn_lifetime;
2088 			break;
2089 
2090 		case BOTH_SYN:			/* move to established */
2091 		case BOTH_SYN | TH_FIN:		/* one side tries to close */
2092 		case BOTH_SYN | (TH_FIN << 8):
2093 			s->st_expire = time_uptime + dyn_ack_lifetime;
2094 			break;
2095 
2096 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
2097 			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2098 				/* And both FINs were ACKed. */
2099 				s->st_expire = time_uptime + dyn_fin_lifetime;
2100 			} else {
2101 				s->st_expire = time_uptime +
2102 				    dyn_finwait_lifetime;
2103 			}
2104 			break;
2105 
2106 		default:
2107 #if 0
2108 			/*
2109 			 * reset or some invalid combination, but can also
2110 			 * occur if we use keep-state the wrong way.
2111 			 */
2112 			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2113 				kprintf("invalid state: 0x%x\n", s->st_state);
2114 #endif
2115 			s->st_expire = time_uptime + dyn_rst_lifetime;
2116 			break;
2117 		}
2118 	} else if (pkt->proto == IPPROTO_UDP) {
2119 		s->st_expire = time_uptime + dyn_udp_lifetime;
2120 	} else {
2121 		/* other protocols */
2122 		s->st_expire = time_uptime + dyn_short_lifetime;
2123 	}
2124 }
2125 
2126 /*
2127  * Look up a state.
2128  */
2129 static struct ipfw_state *
2130 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2131     int *match_direction, const struct tcphdr *tcp)
2132 {
2133 	struct ipfw_state *key, *s;
2134 	int dir = MATCH_NONE;
2135 
2136 	key = &ctx->ipfw_state_tmpkey;
2137 	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2138 	    pkt->dst_ip, pkt->dst_port, pkt->proto);
2139 	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2140 	if (s == NULL)
2141 		goto done; /* not found. */
2142 	if (IPFW_STATE_ISDEAD(s)) {
2143 		ipfw_state_remove(ctx, s);
2144 		s = NULL;
2145 		goto done;
2146 	}
2147 	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2148 		/* TCP ports recycling is too fast. */
2149 		/* TCP port recycling is too fast. */
2150 		ipfw_state_remove(ctx, s);
2151 		s = NULL;
2152 		goto done;
2153 	}
2154 
2155 	if (s->st_swap == key->st_swap) {
2156 		dir = MATCH_FORWARD;
2157 	} else {
2158 		KASSERT((s->st_swap & key->st_swap) == 0,
2159 		    ("found mismatch state"));
2160 		dir = MATCH_REVERSE;
2161 	}
2162 
2163 	/* Update this state. */
2164 	ipfw_state_update(pkt, dir, tcp, s);
2165 
2166 	if (s->st_track != NULL) {
2167 		/* This track has been used. */
2168 		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2169 	}
2170 done:
2171 	if (match_direction)
2172 		*match_direction = dir;
2173 	return (s);
2174 }
2175 
2176 static struct ipfw_state *
2177 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2178     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2179 {
2180 	struct ipfw_state *s;
2181 	size_t sz;
2182 
2183 	KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2184 	    ("invalid state type %u", type));
2185 
2186 	sz = sizeof(struct ipfw_state);
2187 	if (IPFW_ISXLAT(type))
2188 		sz = sizeof(struct ipfw_xlat);
2189 
2190 	s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2191 	if (s == NULL) {
2192 		ctx->ipfw_sts_nomem++;
2193 		return (NULL);
2194 	}
2195 
2196 	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2197 	    id->dst_ip, id->dst_port, id->proto);
2198 
2199 	s->st_rule = rule;
2200 	s->st_type = type;
2201 	if (IPFW_ISXLAT(type)) {
2202 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2203 
2204 		x->xlat_dir = MATCH_NONE;
2205 		x->xlat_pcpu = -1;
2206 	}
2207 
2208 	/*
2209 	 * Update this state:
2210 	 * Set st_expire and st_state.
2211 	 */
2212 	ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2213 
2214 	return (s);
2215 }
2216 
2217 static struct ipfw_state *
2218 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2219     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2220     const struct tcphdr *tcp)
2221 {
2222 	struct ipfw_state *s, *dup;
2223 
2224 	s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2225 	if (s == NULL)
2226 		return (NULL);
2227 
2228 	ctx->ipfw_state_cnt++;
2229 	ctx->ipfw_state_loosecnt++;
2230 	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2231 		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2232 		ctx->ipfw_state_loosecnt = 0;
2233 	}
2234 
2235 	dup = ipfw_state_link(ctx, s);
2236 	if (dup != NULL)
2237 		panic("ipfw: %u state exists %p", type, dup);
2238 
2239 	if (t != NULL) {
2240 		/* Keep the track referenced. */
2241 		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2242 		s->st_track = t;
2243 	}
2244 	return (s);
2245 }
2246 
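/*
 * Free a track.  The shared trkcnt is reference counted across all
 * cpus; the last reference holder removes it from the global tree
 * and either caches it as this cpu's spare or frees it.
 */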
2247 static boolean_t
2248 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2249 {
2250 	struct ipfw_trkcnt *trk;
2251 	boolean_t trk_freed = FALSE;
2252 
2253 	KASSERT(t->t_count != NULL, ("track anchor"));
2254 	KASSERT(LIST_EMPTY(&t->t_state_list),
2255 	    ("invalid track is still referenced"));
2256 
2257 	trk = t->t_trkcnt;
2258 	KASSERT(trk != NULL, ("track has no trkcnt"));
2259 
2260 	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2261 	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2262 	kfree(t, M_IPFW);
2263 
2264 	/*
2265 	 * fdrop() style reference counting.
2266 	 * See kern/kern_descrip.c fdrop().
2267 	 */
2268 	for (;;) {
2269 		int refs = trk->tc_refs;
2270 
2271 		cpu_ccfence();
2272 		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2273 		if (refs == 1) {
2274 			IPFW_TRKCNT_TOKGET;
2275 			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2276 				KASSERT(trk->tc_count == 0,
2277 				    ("%d states reference this trkcnt",
2278 				     trk->tc_count));
2279 				RB_REMOVE(ipfw_trkcnt_tree,
2280 				    &ipfw_gd.ipfw_trkcnt_tree, trk);
2281 
2282 				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2283 				    ("invalid trkcnt cnt %d",
2284 				     ipfw_gd.ipfw_trkcnt_cnt));
2285 				ipfw_gd.ipfw_trkcnt_cnt--;
2286 				IPFW_TRKCNT_TOKREL;
2287 
2288 				if (ctx->ipfw_trkcnt_spare == NULL)
2289 					ctx->ipfw_trkcnt_spare = trk;
2290 				else
2291 					kfree(trk, M_IPFW);
2292 				trk_freed = TRUE;
2293 				break; /* done! */
2294 			}
2295 			IPFW_TRKCNT_TOKREL;
2296 			/* retry */
2297 		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2298 			break; /* done! */
2299 		}
2300 		/* retry */
2301 	}
2302 	return (trk_freed);
2303 }
2304 
2305 static void
2306 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2307 {
2308 	struct ipfw_track *t, *tn;
2309 
2310 	TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2311 		if (t->t_count == NULL) /* anchor */
2312 			continue;
2313 		if (rule != NULL && t->t_rule != rule)
2314 			continue;
2315 		ipfw_track_free(ctx, t);
2316 	}
2317 }
2318 
2319 static boolean_t
2320 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2321     boolean_t reap)
2322 {
2323 	struct ipfw_state *s, *sn;
2324 	boolean_t ret = FALSE;
2325 
2326 	KASSERT(t->t_count != NULL, ("track anchor"));
2327 
2328 	if (LIST_EMPTY(&t->t_state_list))
2329 		return (FALSE);
2330 
2331 	/*
2332 	 * Do not expire more than once per second; it is useless.
2333 	 */
2334 	if (t->t_lastexp == time_uptime)
2335 		return (FALSE);
2336 	t->t_lastexp = time_uptime;
2337 
2338 	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2339 		if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2340 			KASSERT(s->st_track == t,
2341 			    ("state track %p does not match %p",
2342 			     s->st_track, t));
2343 			ipfw_state_del(ctx, s);
2344 			ret = TRUE;
2345 		}
2346 	}
2347 	return (ret);
2348 }
2349 
2350 static __inline struct ipfw_trkcnt *
2351 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2352 {
2353 	struct ipfw_trkcnt *trk;
2354 
2355 	if (ctx->ipfw_trkcnt_spare != NULL) {
2356 		trk = ctx->ipfw_trkcnt_spare;
2357 		ctx->ipfw_trkcnt_spare = NULL;
2358 	} else {
2359 		trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
2360 		    M_INTWAIT | M_NULLOK);
2361 	}
2362 	return (trk);
2363 }
2364 
2365 static void
2366 ipfw_track_expire_done(struct ipfw_context *ctx)
2367 {
2368 
2369 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2370 	    ("trackexp is not in progress"));
2371 	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2372 	callout_reset(&ctx->ipfw_trackto_ch, hz,
2373 	    ipfw_track_expire_ipifunc, NULL);
2374 }
2375 
2376 static void
2377 ipfw_track_expire_more(struct ipfw_context *ctx)
2378 {
2379 	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2380 
2381 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2382 	    ("trackexp is not in progress"));
2383 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2384 	    ("trackexp more did not finish"));
2385 	netisr_sendmsg_oncpu(nm);
2386 }
2387 
2388 static int
2389 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2390     int scan_max, int expire_max)
2391 {
2392 	struct ipfw_track *t;
2393 	int scanned = 0, expired = 0;
2394 	boolean_t reap = FALSE;
2395 
2396 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2397 	    ("trackexp is not in progress"));
2398 
2399 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2400 		reap = TRUE;
2401 
2402 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2403 		if (scanned++ >= scan_max) {
2404 			ipfw_track_expire_more(ctx);
2405 			return (expired);
2406 		}
2407 
2408 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2409 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2410 
2411 		if (t->t_count == NULL) /* anchor */
2412 			continue;
2413 
2414 		ipfw_track_state_expire(ctx, t, reap);
2415 		if (!LIST_EMPTY(&t->t_state_list)) {
2416 			/* There are states referencing this track. */
2417 			continue;
2418 		}
2419 
2420 		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2421 			/* Expired. */
2422 			if (ipfw_track_free(ctx, t)) {
2423 				if (++expired >= expire_max) {
2424 					ipfw_track_expire_more(ctx);
2425 					return (expired);
2426 				}
2427 			}
2428 		}
2429 	}
2430 	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2431 	ipfw_track_expire_done(ctx);
2432 	return (expired);
2433 }
2434 
2435 static int
2436 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2437 {
2438 	struct ipfw_track *anchor;
2439 
2440 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2441 	    ("trackexp is in progress"));
2442 	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2443 
2444 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2445 		ipfw_track_expire_done(ctx);
2446 		return (0);
2447 	}
2448 
2449 	/*
2450 	 * Do not expire more than once per second; it is useless.
2451 	 */
2452 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2453 	    ctx->ipfw_track_lastexp == time_uptime) {
2454 		ipfw_track_expire_done(ctx);
2455 		return (0);
2456 	}
2457 	ctx->ipfw_track_lastexp = time_uptime;
2458 
2459 	anchor = &ctx->ipfw_trackexp_anch;
2460 	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2461 	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2462 }
2463 
2464 static void
2465 ipfw_track_expire_more_dispatch(netmsg_t nm)
2466 {
2467 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2468 	struct ipfw_track *anchor;
2469 
2470 	ASSERT_NETISR_NCPUS(mycpuid);
2471 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2472 	    ("trackexp is not in progress"));
2473 
2474 	/* Reply ASAP */
2475 	netisr_replymsg(&nm->base, 0);
2476 
2477 	anchor = &ctx->ipfw_trackexp_anch;
2478 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2479 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2480 		ipfw_track_expire_done(ctx);
2481 		return;
2482 	}
2483 	ipfw_track_expire_loop(ctx, anchor,
2484 	    ipfw_track_scan_max, ipfw_track_expire_max);
2485 }
2486 
2487 static void
2488 ipfw_track_expire_dispatch(netmsg_t nm)
2489 {
2490 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2491 
2492 	ASSERT_NETISR_NCPUS(mycpuid);
2493 
2494 	/* Reply ASAP */
2495 	crit_enter();
2496 	netisr_replymsg(&nm->base, 0);
2497 	crit_exit();
2498 
2499 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2500 		/* Running; done. */
2501 		return;
2502 	}
2503 	ipfw_track_expire_start(ctx,
2504 	    ipfw_track_scan_max, ipfw_track_expire_max);
2505 }
2506 
2507 static void
2508 ipfw_track_expire_ipifunc(void *dummy __unused)
2509 {
2510 	struct netmsg_base *msg;
2511 
2512 	KKASSERT(mycpuid < netisr_ncpus);
2513 	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2514 
2515 	crit_enter();
2516 	if (msg->lmsg.ms_flags & MSGF_DONE)
2517 		netisr_sendmsg_oncpu(msg);
2518 	crit_exit();
2519 }
2520 
2521 static int
2522 ipfw_track_reap(struct ipfw_context *ctx)
2523 {
2524 	struct ipfw_track *t, *anchor;
2525 	int expired;
2526 
2527 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2528 		/*
2529 		 * Kick-start track expiring.  Ignore the scan limit;
2530 		 * we are short of tracks.
2531 		 */
2532 		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2533 		expired = ipfw_track_expire_start(ctx, INT_MAX,
2534 		    ipfw_track_reap_max);
2535 		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2536 		return (expired);
2537 	}
2538 
2539 	/*
2540 	 * Tracks are being expired.
2541 	 */
2542 
2543 	if (RB_EMPTY(&ctx->ipfw_track_tree))
2544 		return (0);
2545 
2546 	expired = 0;
2547 	anchor = &ctx->ipfw_trackexp_anch;
2548 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2549 		/*
2550 		 * Ignore scan limit; we are short of tracks.
2551 		 */
2552 
2553 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2554 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2555 
2556 		if (t->t_count == NULL) /* anchor */
2557 			continue;
2558 
2559 		ipfw_track_state_expire(ctx, t, TRUE);
2560 		if (!LIST_EMPTY(&t->t_state_list)) {
2561 			/* There are states referencing this track. */
2562 			continue;
2563 		}
2564 
2565 		if (ipfw_track_free(ctx, t)) {
2566 			if (++expired >= ipfw_track_reap_max) {
2567 				ipfw_track_expire_more(ctx);
2568 				break;
2569 			}
2570 		}
2571 	}
2572 	/*
2573 	 * NOTE:
2574 	 * Leave the anchor on the list, even if the end of the list has
2575 	 * been reached.  ipfw_track_expire_more_dispatch() will handle
2576 	 * the removal.
2577 	 */
2578 	return (expired);
2579 }
2580 
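/*
 * Look up or create the track for this flow under the rule's limit
 * mask.  A track points at a global trkcnt, which carries the
 * cross-cpu connection counter; the trkcnt is found or inserted in
 * the global RB tree under the token, and a per-cpu spare is kept
 * around to resolve insertion races cheaply.
 */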
2581 static struct ipfw_track *
2582 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2583     uint16_t limit_mask, struct ip_fw *rule)
2584 {
2585 	struct ipfw_track *key, *t, *dup;
2586 	struct ipfw_trkcnt *trk, *ret;
2587 	boolean_t do_expire = FALSE;
2588 
2589 	KASSERT(rule->track_ruleid != 0,
2590 	    ("rule %u has no track ruleid", rule->rulenum));
2591 
2592 	key = &ctx->ipfw_track_tmpkey;
2593 	key->t_proto = id->proto;
2594 	key->t_addrs = 0;
2595 	key->t_ports = 0;
2596 	key->t_rule = rule;
2597 	if (limit_mask & DYN_SRC_ADDR)
2598 		key->t_saddr = id->src_ip;
2599 	if (limit_mask & DYN_DST_ADDR)
2600 		key->t_daddr = id->dst_ip;
2601 	if (limit_mask & DYN_SRC_PORT)
2602 		key->t_sport = id->src_port;
2603 	if (limit_mask & DYN_DST_PORT)
2604 		key->t_dport = id->dst_port;
2605 
2606 	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2607 	if (t != NULL)
2608 		goto done;
2609 
2610 	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2611 	if (t == NULL) {
2612 		ctx->ipfw_tks_nomem++;
2613 		return (NULL);
2614 	}
2615 
2616 	t->t_key = key->t_key;
2617 	t->t_rule = rule;
2618 	t->t_lastexp = 0;
2619 	LIST_INIT(&t->t_state_list);
2620 
2621 	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2622 		time_t globexp, uptime;
2623 
2624 		trk = NULL;
2625 		do_expire = TRUE;
2626 
2627 		/*
2628 		 * Do not expire globally more than once per second;
2629 		 * it is useless.
2630 		 */
2631 		uptime = time_uptime;
2632 		globexp = ipfw_gd.ipfw_track_globexp;
2633 		if (globexp != uptime &&
2634 		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2635 		    globexp, uptime)) {
2636 			int cpu;
2637 
2638 			/* Expire tracks on other CPUs. */
2639 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2640 				if (cpu == mycpuid)
2641 					continue;
2642 				lwkt_send_ipiq(globaldata_find(cpu),
2643 				    ipfw_track_expire_ipifunc, NULL);
2644 			}
2645 		}
2646 	} else {
2647 		trk = ipfw_trkcnt_alloc(ctx);
2648 	}
2649 	if (trk == NULL) {
2650 		struct ipfw_trkcnt *tkey;
2651 
2652 		tkey = &ctx->ipfw_trkcnt_tmpkey;
2653 		key = NULL; /* tkey overlaps key */
2654 
2655 		tkey->tc_key = t->t_key;
2656 		tkey->tc_ruleid = rule->track_ruleid;
2657 
2658 		IPFW_TRKCNT_TOKGET;
2659 		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2660 		    tkey);
2661 		if (trk == NULL) {
2662 			IPFW_TRKCNT_TOKREL;
2663 			if (do_expire) {
2664 				ctx->ipfw_tks_reap++;
2665 				if (ipfw_track_reap(ctx) > 0) {
2666 					if (ipfw_gd.ipfw_trkcnt_cnt <
2667 					    ipfw_track_max) {
2668 						trk = ipfw_trkcnt_alloc(ctx);
2669 						if (trk != NULL)
2670 							goto install;
2671 						ctx->ipfw_tks_cntnomem++;
2672 					} else {
2673 						ctx->ipfw_tks_overflow++;
2674 					}
2675 				} else {
2676 					ctx->ipfw_tks_reapfailed++;
2677 					ctx->ipfw_tks_overflow++;
2678 				}
2679 			} else {
2680 				ctx->ipfw_tks_cntnomem++;
2681 			}
2682 			kfree(t, M_IPFW);
2683 			return (NULL);
2684 		}
2685 		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2686 		    ("invalid trkcnt refs %d", trk->tc_refs));
2687 		atomic_add_int(&trk->tc_refs, 1);
2688 		IPFW_TRKCNT_TOKREL;
2689 	} else {
2690 install:
2691 		trk->tc_key = t->t_key;
2692 		trk->tc_ruleid = rule->track_ruleid;
2693 		trk->tc_refs = 0;
2694 		trk->tc_count = 0;
2695 		trk->tc_expire = 0;
2696 		trk->tc_rulenum = rule->rulenum;
2697 
2698 		IPFW_TRKCNT_TOKGET;
2699 		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2700 		    trk);
2701 		if (ret != NULL) {
2702 			KASSERT(ret->tc_refs > 0 &&
2703 			    ret->tc_refs < netisr_ncpus,
2704 			    ("invalid trkcnt refs %d", ret->tc_refs));
2705 			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2706 			    ("trkcnt spare was installed"));
2707 			ctx->ipfw_trkcnt_spare = trk;
2708 			trk = ret;
2709 		} else {
2710 			ipfw_gd.ipfw_trkcnt_cnt++;
2711 		}
2712 		atomic_add_int(&trk->tc_refs, 1);
2713 		IPFW_TRKCNT_TOKREL;
2714 	}
2715 	t->t_count = &trk->tc_count;
2716 	t->t_trkcnt = trk;
2717 
2718 	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2719 	if (dup != NULL)
2720 		panic("ipfw: track exists");
2721 	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2722 done:
2723 	t->t_expire = time_uptime + dyn_short_lifetime;
2724 	return (t);
2725 }
2726 
2727 /*
2728  * Install state for rule type cmd->o.opcode.
2729  *
2730  * Returns NULL if the state is not installed, either because of an
2731  * error or because a state limit is enforced.
2732  */
2733 static struct ipfw_state *
2734 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2735     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2736 {
2737 	struct ipfw_state *s;
2738 	struct ipfw_track *t;
2739 	int count, diff;
2740 
2741 	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2742 	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2743 		boolean_t overflow = TRUE;
2744 
2745 		ctx->ipfw_sts_reap++;
2746 		if (ipfw_state_reap(ctx, diff) == 0)
2747 			ctx->ipfw_sts_reapfailed++;
2748 		if (ipfw_state_cntsync() < ipfw_state_max)
2749 			overflow = FALSE;
2750 
2751 		if (overflow) {
2752 			time_t globexp, uptime;
2753 			int cpu;
2754 
2755 			/*
2756 			 * Do not expire globally more than once per second;
2757 			 * it is useless.
2758 			 */
2759 			uptime = time_uptime;
2760 			globexp = ipfw_gd.ipfw_state_globexp;
2761 			if (globexp == uptime ||
2762 			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2763 			    globexp, uptime)) {
2764 				ctx->ipfw_sts_overflow++;
2765 				return (NULL);
2766 			}
2767 
2768 			/* Expire states on other CPUs. */
2769 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2770 				if (cpu == mycpuid)
2771 					continue;
2772 				lwkt_send_ipiq(globaldata_find(cpu),
2773 				    ipfw_state_expire_ipifunc, NULL);
2774 			}
2775 			ctx->ipfw_sts_overflow++;
2776 			return (NULL);
2777 		}
2778 	}
2779 
2780 	switch (cmd->o.opcode) {
2781 	case O_KEEP_STATE: /* bidir rule */
2782 	case O_REDIRECT:
2783 		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2784 		    tcp);
2785 		if (s == NULL)
2786 			return (NULL);
2787 		break;
2788 
2789 	case O_LIMIT: /* limit number of sessions */
2790 		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2791 		if (t == NULL)
2792 			return (NULL);
2793 
2794 		if (*t->t_count >= cmd->conn_limit) {
2795 			if (!ipfw_track_state_expire(ctx, t, TRUE))
2796 				return (NULL);
2797 		}
2798 		for (;;) {
2799 			count = *t->t_count;
2800 			if (count >= cmd->conn_limit)
2801 				return (NULL);
2802 			if (atomic_cmpset_int(t->t_count, count, count + 1))
2803 				break;
2804 		}
2805 
2806 		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2807 		if (s == NULL) {
2808 			/* Undo damage. */
2809 			atomic_subtract_int(t->t_count, 1);
2810 			return (NULL);
2811 		}
2812 		break;
2813 
2814 	default:
2815 		panic("unknown state type %u\n", cmd->o.opcode);
2816 	}
2817 
2818 	if (s->st_type == O_REDIRECT) {
2819 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2820 		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2821 
2822 		x->xlat_addr = r->addr.s_addr;
2823 		x->xlat_port = r->port;
2824 		x->xlat_ifp = args->m->m_pkthdr.rcvif;
2825 		x->xlat_dir = MATCH_FORWARD;
2826 		KKASSERT(x->xlat_ifp != NULL);
2827 	}
2828 	return (s);
2829 }
2830 
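/*
 * Match an address against the given lookup table, using a radix
 * tree best-match lookup.
 */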
2831 static int
2832 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2833     const struct in_addr *in)
2834 {
2835 	struct radix_node_head *rnh;
2836 	struct sockaddr_in sin;
2837 	struct ipfw_tblent *te;
2838 
2839 	KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2840 	rnh = ctx->ipfw_tables[tableid];
2841 	if (rnh == NULL)
2842 		return (0); /* no match */
2843 
2844 	memset(&sin, 0, sizeof(sin));
2845 	sin.sin_family = AF_INET;
2846 	sin.sin_len = sizeof(sin);
2847 	sin.sin_addr = *in;
2848 
2849 	te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2850 	if (te == NULL)
2851 		return (0); /* no match */
2852 
2853 	te->te_use++;
2854 	te->te_lastuse = time_second;
2855 	return (1); /* match */
2856 }
2857 
2858 /*
2859  * Transmit a TCP packet, containing either a RST or a keepalive.
2860  * When flags & TH_RST, we are sending a RST packet because a
2861  * "reset" action matched the packet.  Otherwise we are sending a
2862  * keepalive, and flags & TH_SYN determines its direction.
2863  *
2864  * Only {src,dst}_{ip,port} of "id" are used.
2865  */
2866 static void
2867 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2868 {
2869 	struct mbuf *m;
2870 	struct ip *ip;
2871 	struct tcphdr *tcp;
2872 	struct route sro;	/* fake route */
2873 
2874 	MGETHDR(m, M_NOWAIT, MT_HEADER);
2875 	if (m == NULL)
2876 		return;
2877 	m->m_pkthdr.rcvif = NULL;
2878 	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2879 	m->m_data += max_linkhdr;
2880 
2881 	ip = mtod(m, struct ip *);
2882 	bzero(ip, m->m_len);
2883 	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2884 	ip->ip_p = IPPROTO_TCP;
2885 	tcp->th_off = 5;
2886 
2887 	/*
2888 	 * Assume we are sending a RST (or a keepalive in the reverse
2889 	 * direction), so swap the source and destination addresses and ports.
2890 	 */
2891 	ip->ip_src.s_addr = htonl(id->dst_ip);
2892 	ip->ip_dst.s_addr = htonl(id->src_ip);
2893 	tcp->th_sport = htons(id->dst_port);
2894 	tcp->th_dport = htons(id->src_port);
2895 	if (flags & TH_RST) {	/* we are sending a RST */
2896 		if (flags & TH_ACK) {
2897 			tcp->th_seq = htonl(ack);
2898 			tcp->th_ack = htonl(0);
2899 			tcp->th_flags = TH_RST;
2900 		} else {
2901 			if (flags & TH_SYN)
2902 				seq++;
2903 			tcp->th_seq = htonl(0);
2904 			tcp->th_ack = htonl(seq);
2905 			tcp->th_flags = TH_RST | TH_ACK;
2906 		}
2907 	} else {
2908 		/*
2909 		 * We are sending a keepalive. flags & TH_SYN determines
2910 		 * the direction, forward if set, reverse if clear.
2911 		 * NOTE: seq and ack are always assumed to be correct
2912 		 * as set by the caller. This may be confusing...
2913 		 */
2914 		if (flags & TH_SYN) {
2915 			/*
2916 			 * we have to rewrite the correct addresses!
2917 			 */
2918 			ip->ip_dst.s_addr = htonl(id->dst_ip);
2919 			ip->ip_src.s_addr = htonl(id->src_ip);
2920 			tcp->th_dport = htons(id->dst_port);
2921 			tcp->th_sport = htons(id->src_port);
2922 		}
2923 		tcp->th_seq = htonl(seq);
2924 		tcp->th_ack = htonl(ack);
2925 		tcp->th_flags = TH_ACK;
2926 	}
2927 
2928 	/*
2929 	 * Set ip_len to the payload size so we can compute
2930 	 * the TCP checksum on the pseudoheader.
2931 	 * XXX check this, could save a couple of words?
2932 	 */
2933 	ip->ip_len = htons(sizeof(struct tcphdr));
2934 	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2935 
2936 	/*
2937 	 * Now fill in the fields left out earlier.
2938 	 */
2939 	ip->ip_ttl = ip_defttl;
2940 	ip->ip_len = m->m_pkthdr.len;
2941 
2942 	bzero(&sro, sizeof(sro));
2943 	ip_rtaddr(ip->ip_dst, &sro);
2944 
2945 	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2946 	ip_output(m, NULL, &sro, 0, NULL, NULL);
2947 	if (sro.ro_rt)
2948 		RTFREE(sro.ro_rt);
2949 }
2950 
2951 /*
2952  * Send a reject message, consuming the mbuf passed as an argument.
2953  */
2954 static void
2955 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2956 {
2957 	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2958 		/* We need the IP header in host order for icmp_error(). */
2959 		if (args->eh != NULL) {
2960 			struct ip *ip = mtod(args->m, struct ip *);
2961 
2962 			ip->ip_len = ntohs(ip->ip_len);
2963 			ip->ip_off = ntohs(ip->ip_off);
2964 		}
2965 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2966 	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2967 		struct tcphdr *const tcp =
2968 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2969 
2970 		if ((tcp->th_flags & TH_RST) == 0) {
2971 			send_pkt(&args->f_id, ntohl(tcp->th_seq),
2972 				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2973 		}
2974 		m_freem(args->m);
2975 	} else {
2976 		m_freem(args->m);
2977 	}
2978 	args->m = NULL;
2979 }
2980 
2981 /*
2982  * Given an ip_fw *, lookup_next_rule will return a pointer
2983  * to the next rule, which can be either the jump
2984  * target (for skipto instructions) or the next one in the list (in
2985  * all other cases including a missing jump target).
2986  * The result is also written in the "next_rule" field of the rule.
2987  * Backward jumps are not allowed, so start looking from the next
2988  * rule...
2989  *
2990  * This never returns NULL -- in case we do not have an exact match,
2991  * the next rule is returned. When the ruleset is changed,
2992  * pointers are flushed so we are always correct.
2993  */
2994 static struct ip_fw *
2995 lookup_next_rule(struct ip_fw *me)
2996 {
2997 	struct ip_fw *rule = NULL;
2998 	ipfw_insn *cmd;
2999 
3000 	/* look for action, in case it is a skipto */
3001 	cmd = ACTION_PTR(me);
3002 	if (cmd->opcode == O_LOG)
3003 		cmd += F_LEN(cmd);
3004 	if (cmd->opcode == O_SKIPTO) {
3005 		for (rule = me->next; rule; rule = rule->next) {
3006 			if (rule->rulenum >= cmd->arg1)
3007 				break;
3008 		}
3009 	}
3010 	if (rule == NULL)			/* failure or not a skipto */
3011 		rule = me->next;
3012 	me->next_rule = rule;
3013 	return rule;
3014 }
3015 
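/*
 * Match the uid/gid of the local socket owning the connection: look
 * up the pcb (exact match for TCP, wildcard allowed for UDP) and
 * check the socket's credentials.
 */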
3016 static int
3017 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3018 		enum ipfw_opcodes opcode, uid_t uid)
3019 {
3020 	struct in_addr src_ip, dst_ip;
3021 	struct inpcbinfo *pi;
3022 	boolean_t wildcard;
3023 	struct inpcb *pcb;
3024 
3025 	if (fid->proto == IPPROTO_TCP) {
3026 		wildcard = FALSE;
3027 		pi = &tcbinfo[mycpuid];
3028 	} else if (fid->proto == IPPROTO_UDP) {
3029 		wildcard = TRUE;
3030 		pi = &udbinfo[mycpuid];
3031 	} else {
3032 		return 0;
3033 	}
3034 
3035 	/*
3036 	 * Values in 'fid' are in host byte order
3037 	 */
3038 	dst_ip.s_addr = htonl(fid->dst_ip);
3039 	src_ip.s_addr = htonl(fid->src_ip);
3040 	if (oif) {
3041 		pcb = in_pcblookup_hash(pi,
3042 			dst_ip, htons(fid->dst_port),
3043 			src_ip, htons(fid->src_port),
3044 			wildcard, oif);
3045 	} else {
3046 		pcb = in_pcblookup_hash(pi,
3047 			src_ip, htons(fid->src_port),
3048 			dst_ip, htons(fid->dst_port),
3049 			wildcard, NULL);
3050 	}
3051 	if (pcb == NULL || pcb->inp_socket == NULL)
3052 		return 0;
3053 
3054 	if (opcode == O_UID) {
3055 #define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
3056 		return !socheckuid(pcb->inp_socket, uid);
3057 #undef socheckuid
3058 	} else  {
3059 		return groupmember(uid, pcb->inp_socket->so_cred);
3060 	}
3061 }
3062 
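/*
 * Match an address against the first IPv4 address of the named
 * interface, optionally masked by its netmask (IPFW_IFIP_NET).  The
 * address/mask pair is looked up lazily and cached in the
 * instruction, marked by IPFW_IFIP_VALID.
 */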
3063 static int
3064 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3065 {
3066 
3067 	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3068 		struct ifaddr_container *ifac;
3069 		struct ifnet *ifp;
3070 
3071 		ifp = ifunit_netisr(cmd->ifname);
3072 		if (ifp == NULL)
3073 			return (0);
3074 
3075 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3076 			struct ifaddr *ia = ifac->ifa;
3077 
3078 			if (ia->ifa_addr == NULL)
3079 				continue;
3080 			if (ia->ifa_addr->sa_family != AF_INET)
3081 				continue;
3082 
3083 			cmd->mask.s_addr = INADDR_ANY;
3084 			if (cmd->o.arg1 & IPFW_IFIP_NET) {
3085 				cmd->mask = ((struct sockaddr_in *)
3086 				    ia->ifa_netmask)->sin_addr;
3087 			}
3088 			if (cmd->mask.s_addr == INADDR_ANY)
3089 				cmd->mask.s_addr = INADDR_BROADCAST;
3090 
3091 			cmd->addr =
3092 			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3093 			cmd->addr.s_addr &= cmd->mask.s_addr;
3094 
3095 			cmd->o.arg1 |= IPFW_IFIP_VALID;
3096 			break;
3097 		}
3098 		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3099 			return (0);
3100 	}
3101 	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3102 }
3103 
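/*
 * Rewrite the source or destination address (and optionally port)
 * of the packet in place.  If the checksum will be finished by the
 * hardware (CSUM_TCP/CSUM_UDP/CSUM_TSO), only the pseudo-header
 * checksum is recomputed; otherwise the IP and TCP/UDP checksums
 * are adjusted incrementally with pfil_cksum_fixup().
 */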
3104 static void
3105 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3106     struct in_addr *old_addr, uint16_t *old_port)
3107 {
3108 	struct ip *ip = mtod(m, struct ip *);
3109 	struct in_addr *addr;
3110 	uint16_t *port, *csum, dlen = 0;
3111 	uint8_t udp = 0;
3112 	boolean_t pseudo = FALSE;
3113 
3114 	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3115 		addr = &ip->ip_src;
3116 		switch (ip->ip_p) {
3117 		case IPPROTO_TCP:
3118 			port = &L3HDR(struct tcphdr, ip)->th_sport;
3119 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3120 			break;
3121 		case IPPROTO_UDP:
3122 			port = &L3HDR(struct udphdr, ip)->uh_sport;
3123 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3124 			udp = 1;
3125 			break;
3126 		default:
3127 			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3128 		}
3129 	} else {
3130 		addr = &ip->ip_dst;
3131 		switch (ip->ip_p) {
3132 		case IPPROTO_TCP:
3133 			port = &L3HDR(struct tcphdr, ip)->th_dport;
3134 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3135 			break;
3136 		case IPPROTO_UDP:
3137 			port = &L3HDR(struct udphdr, ip)->uh_dport;
3138 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3139 			udp = 1;
3140 			break;
3141 		default:
3142 			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3143 		}
3144 	}
3145 	if (old_addr != NULL)
3146 		*old_addr = *addr;
3147 	if (old_port != NULL) {
3148 		if (x->xlat_port != 0)
3149 			*old_port = *port;
3150 		else
3151 			*old_port = 0;
3152 	}
3153 
3154 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3155 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3156 			dlen = ip->ip_len - (ip->ip_hl << 2);
3157 		pseudo = TRUE;
3158 	}
3159 
3160 	if (!pseudo) {
3161 		const uint16_t *oaddr, *naddr;
3162 
3163 		oaddr = (const uint16_t *)&addr->s_addr;
3164 		naddr = (const uint16_t *)&x->xlat_addr;
3165 
3166 		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3167 		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3168 		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3169 		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3170 	}
3171 	addr->s_addr = x->xlat_addr;
3172 
3173 	if (x->xlat_port != 0) {
3174 		if (!pseudo) {
3175 			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3176 			    udp);
3177 		}
3178 		*port = x->xlat_port;
3179 	}
3180 
3181 	if (pseudo) {
3182 		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3183 		    htons(dlen + ip->ip_p));
3184 	}
3185 }
3186 
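/*
 * Target-cpu leg of a translated packet: record the continue
 * rule/xlat in the per-cpu context, re-inject the packet through
 * ip_input() or ip_output(), then drop the state and rule
 * references taken by ipfw_xlate_redispatch().
 */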
3187 static void
3188 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3189 {
3190 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3191 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3192 	struct mbuf *m = nm->m;
3193 	struct ipfw_xlat *x = nm->arg1;
3194 	struct ip_fw *rule = x->xlat_rule;
3195 
3196 	ASSERT_NETISR_NCPUS(mycpuid);
3197 	KASSERT(rule->cpuid == mycpuid,
3198 	    ("rule does not belong to cpu%d", mycpuid));
3199 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3200 	    ("mbuf does not have ipfw continue rule"));
3201 
3202 	KASSERT(ctx->ipfw_cont_rule == NULL,
3203 	    ("pending ipfw continue rule"));
3204 	KASSERT(ctx->ipfw_cont_xlat == NULL,
3205 	    ("pending ipfw continue xlat"));
3206 	ctx->ipfw_cont_rule = rule;
3207 	ctx->ipfw_cont_xlat = x;
3208 
3209 	if (nm->arg2 == 0)
3210 		ip_input(m);
3211 	else
3212 		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3213 
3214 	/* These may not have been cleared, if ipfw was unloaded/disabled. */
3215 	ctx->ipfw_cont_rule = NULL;
3216 	ctx->ipfw_cont_xlat = NULL;
3217 
3218 	/*
3219 	 * This state is no longer used; decrement its xlat_crefs,
3220 	 * so this state can be deleted.
3221 	 */
3222 	x->xlat_crefs--;
3223 	/*
3224 	 * This rule is no longer used; decrement its cross_refs,
3225 	 * so this rule can be deleted.
3226 	 *
3227 	 * NOTE:
3228 	 * Decrement cross_refs in the last step of this function,
3229 	 * so that the module could be unloaded safely.
3230 	 */
3231 	rule->cross_refs--;
3232 }
3233 
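/*
 * Hand the packet over to the cpu owning the paired xlat state;
 * extra rule and state references pin both while the packet is in
 * flight.
 */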
3234 static void
3235 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3236     uint32_t flags)
3237 {
3238 	struct netmsg_genpkt *nm;
3239 
3240 	KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3241 	    x->xlat_pcpu, cpuid));
3242 
3243 	/*
3244 	 * Bump cross_refs to prevent this rule and its siblings
3245 	 * from being deleted, while this mbuf is inflight.  The
3246 	 * cross_refs of the sibling rule on the target cpu will
3247 	 * be decremented, once this mbuf is going to be filtered
3248 	 * on the target cpu.
3249 	 */
3250 	x->xlat_rule->cross_refs++;
3251 	/*
3252 	 * Bump xlat_crefs to prevent this state and its paired
3253 	 * state from being deleted, while this mbuf is inflight.
3254 	 * The xlat_crefs of the paired state on the target cpu
3255 	 * will be decremented, once this mbuf is going to be
3256 	 * filtered on the target cpu.
3257 	 */
3258 	x->xlat_crefs++;
3259 
3260 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3261 	if (flags & IPFW_XLATE_INSERT)
3262 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3263 	if (flags & IPFW_XLATE_FORWARD)
3264 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3265 
3266 	if ((flags & IPFW_XLATE_OUTPUT) == 0) {
3267 		struct ip *ip = mtod(m, struct ip *);
3268 
3269 		/*
3270 		 * NOTE:
3271 		 * ip_input() expects ip_len/ip_off are in network
3272 		 * byte order.
3273 		 */
3274 		ip->ip_len = htons(ip->ip_len);
3275 		ip->ip_off = htons(ip->ip_off);
3276 	}
3277 
3278 	nm = &m->m_hdr.mh_genmsg;
3279 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3280 	    ipfw_ip_xlate_dispatch);
3281 	nm->m = m;
3282 	nm->arg1 = x->xlat_pair;
3283 	nm->arg2 = 0;
3284 	if (flags & IPFW_XLATE_OUTPUT)
3285 		nm->arg2 = 1;
3286 	netisr_sendmsg(&nm->base, cpuid);
3287 }
3288 
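/*
 * Collect the flow id (addresses, ports, protocol and TCP flags)
 * from the packet into 'local' and args->f_id, pulling up the
 * protocol header as needed.  May replace the mbuf; returns NULL if
 * the pullup fails.
 */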
3289 static struct mbuf *
3290 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3291     struct ip_fw_local *local, struct ip **ip0)
3292 {
3293 	struct ip *ip = mtod(m, struct ip *);
3294 	struct tcphdr *tcp;
3295 	struct udphdr *udp;
3296 
3297 	/*
3298 	 * Collect parameters into local variables for faster matching.
3299 	 */
3300 	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
3301 		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
3302 		goto done;
3303 	}
3304 
3305 	local->proto = args->f_id.proto = ip->ip_p;
3306 	local->src_ip = ip->ip_src;
3307 	local->dst_ip = ip->ip_dst;
3308 	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
3309 		local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3310 		local->ip_len = ntohs(ip->ip_len);
3311 	} else {
3312 		local->offset = ip->ip_off & IP_OFFMASK;
3313 		local->ip_len = ip->ip_len;
3314 	}
3315 
3316 #define PULLUP_TO(len)					\
3317 do {							\
3318 	if (m->m_len < (len)) {				\
3319 		args->m = m = m_pullup(m, (len));	\
3320 		if (m == NULL) {			\
3321 			ip = NULL;			\
3322 			goto done;			\
3323 		}					\
3324 		ip = mtod(m, struct ip *);		\
3325 	}						\
3326 } while (0)
3327 
3328 	if (local->offset == 0) {
3329 		switch (local->proto) {
3330 		case IPPROTO_TCP:
3331 			PULLUP_TO(hlen + sizeof(struct tcphdr));
3332 			local->tcp = tcp = L3HDR(struct tcphdr, ip);
3333 			local->dst_port = tcp->th_dport;
3334 			local->src_port = tcp->th_sport;
3335 			args->f_id.flags = tcp->th_flags;
3336 			break;
3337 
3338 		case IPPROTO_UDP:
3339 			PULLUP_TO(hlen + sizeof(struct udphdr));
3340 			udp = L3HDR(struct udphdr, ip);
3341 			local->dst_port = udp->uh_dport;
3342 			local->src_port = udp->uh_sport;
3343 			break;
3344 
3345 		case IPPROTO_ICMP:
3346 			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
3347 			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3348 			break;
3349 
3350 		default:
3351 			break;
3352 		}
3353 	}
3354 
3355 #undef PULLUP_TO
3356 
3357 	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3358 	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3359 	args->f_id.src_port = local->src_port = ntohs(local->src_port);
3360 	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3361 done:
3362 	*ip0 = ip;
3363 	return (m);
3364 }
3365 
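/*
 * Recompute the mbuf's flow hash after the packet has been
 * rewritten.  ip_hashfn() expects ip_len/ip_off in network byte
 * order, hence the byte swaps around the call.
 */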
3366 static struct mbuf *
3367 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3368     struct ip_fw_local *local, struct ip **ip0)
3369 {
3370 	struct ip *ip = mtod(m, struct ip *);
3371 
3372 	ip->ip_len = htons(ip->ip_len);
3373 	ip->ip_off = htons(ip->ip_off);
3374 
3375 	m->m_flags &= ~M_HASH;
3376 	ip_hashfn(&m, 0);
3377 	args->m = m;
3378 	if (m == NULL) {
3379 		*ip0 = NULL;
3380 		return (NULL);
3381 	}
3382 	KASSERT(m->m_flags & M_HASH, ("no hash"));
3383 
3384 	/* 'm' might be changed by ip_hashfn(). */
3385 	ip = mtod(m, struct ip *);
3386 	ip->ip_len = ntohs(ip->ip_len);
3387 	ip->ip_off = ntohs(ip->ip_off);
3388 
3389 	return (ipfw_setup_local(m, hlen, args, local, ip0));
3390 }
3391 
3392 /*
3393  * The main check routine for the firewall.
3394  *
3395  * All arguments are in args so we can modify them and return them
3396  * back to the caller.
3397  *
3398  * Parameters:
3399  *
3400  *	args->m	(in/out) The packet; we set it to NULL when/if we nuke it.
3401  *		Starts with the IP header.
3402  *	args->eh (in)	MAC header if present, or NULL for a layer-3 packet.
3403  *	args->oif (in)	Outgoing interface, or NULL if the packet is
3404  *		incoming.  The incoming interface is in the mbuf.
3405  *
3406  *	args->rule	Pointer to the last matching rule (in/out)
3407  *	args->f_id	Addresses grabbed from the packet (out)
3408  *
3409  * Return value:
3410  *
3411  *	If the packet was denied/rejected and has been dropped, *m is equal
3412  *	to NULL upon return.
3413  *
3414  *	IP_FW_DENY	the packet must be dropped.
3415  *	IP_FW_PASS	The packet is to be accepted and routed normally.
3416  *	IP_FW_DIVERT	Divert the packet to port (args->cookie)
3417  *	IP_FW_TEE	Tee the packet to port (args->cookie)
3418  *	IP_FW_DUMMYNET	Send the packet to pipe/queue (args->cookie)
3419  *	IP_FW_CONTINUE	Continue processing on another cpu.
3420  */
3421 static int
3422 ipfw_chk(struct ip_fw_args *args)
3423 {
3424 	/*
3425 	 * Local variables hold state during the processing of a packet.
3426 	 *
3427 	 * IMPORTANT NOTE: to speed up the processing of rules, there
3428 	 * are some assumptions about the values of the variables, which
3429 	 * are documented here. Should you change them, please check
3430 	 * the implementation of the various instructions to make sure
3431 	 * that they still work.
3432 	 *
3433 	 * args->eh	The MAC header.  It is non-NULL for a layer-2
3434 	 *	packet and NULL for a layer-3 packet.
3435 	 *
3436 	 * m | args->m	Pointer to the mbuf, as received from the caller.
3437 	 *	It may change if ipfw_chk() does an m_pullup, or if it
3438 	 *	consumes the packet because it calls send_reject().
3439 	 *	XXX This has to change, so that ipfw_chk() never modifies
3440 	 *	or consumes the buffer.
3441 	 * ip	is simply an alias of the value of m, and it is kept
3442 	 *	in sync with it (the packet is supposed to start with
3443 	 *	the IP header).
3444 	 */
3445 	struct mbuf *m = args->m;
3446 	struct ip *ip = mtod(m, struct ip *);
3447 
3448 	/*
3449 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
3450 	 *	inbound path (ether_input, ip_input).
3451 	 *	If non-NULL, ipfw_chk has been called on the outbound path
3452 	 *	(ether_output, ip_output).
3453 	 */
3454 	struct ifnet *oif = args->oif;
3455 
3456 	struct ip_fw *f = NULL;		/* matching rule */
3457 	int retval = IP_FW_PASS;
3458 	struct m_tag *mtag;
3459 	struct divert_info *divinfo;
3460 	struct ipfw_state *s;
3461 
3462 	/*
3463 	 * hlen	The length of the IPv4 header.
3464 	 *	hlen >0 means we have an IPv4 packet.
3465 	 */
3466 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
3467 
3468 	struct ip_fw_local lc;
3469 
3470 	/*
3471 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3472 	 * 	MATCH_NONE when checked and not matched (dyn_f = NULL),
3473 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3474 	 */
3475 	int dyn_dir = MATCH_UNKNOWN;
3476 	struct ip_fw *dyn_f = NULL;
3477 	int cpuid = mycpuid;
3478 	struct ipfw_context *ctx;
3479 
3480 	ASSERT_NETISR_NCPUS(cpuid);
3481 	ctx = ipfw_ctx[cpuid];
3482 
3483 	if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3484 		return IP_FW_PASS;	/* accept */
3485 
3486 	if (args->eh == NULL ||		/* layer 3 packet */
3487 	    (m->m_pkthdr.len >= sizeof(struct ip) &&
3488 	     ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3489 		hlen = ip->ip_hl << 2;
3490 
3491 	memset(&lc, 0, sizeof(lc));
3492 
3493 	m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3494 	if (m == NULL)
3495 		goto pullup_failed;
3496 
3497 	if (args->rule) {
3498 		/*
3499 		 * Packet has already been tagged. Look for the next rule
3500 		 * to restart processing.
3501 		 *
3502 		 * If fw_one_pass != 0 then just accept it.
3503 		 * XXX should not happen here, but should be optimized
3504 		 * out in the caller.
3505 		 */
3506 		if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3507 			return IP_FW_PASS;
3508 		args->flags &= ~IP_FWARG_F_CONT;
3509 
3510 		/* This rule is being/has been flushed */
3511 		if (ipfw_flushing)
3512 			return IP_FW_DENY;
3513 
3514 		KASSERT(args->rule->cpuid == cpuid,
3515 			("rule used on cpu%d", cpuid));
3516 
3517 		/* This rule was deleted */
3518 		if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3519 			return IP_FW_DENY;
3520 
3521 		if (args->xlat != NULL) {
3522 			struct ipfw_xlat *x = args->xlat;
3523 
3524 			/* This xlat is being deleted. */
3525 			if (x->xlat_invalid)
3526 				return IP_FW_DENY;
3527 
3528 			f = args->rule;
3529 
3530 			dyn_f = f;
3531 			dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3532 			    MATCH_FORWARD : MATCH_REVERSE;
3533 
3534 			if (args->flags & IP_FWARG_F_XLATINS) {
3535 				KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3536 				    ("not slave %u state", x->xlat_type));
3537 				s = ipfw_state_link(ctx, &x->xlat_st);
3538 				if (s != NULL) {
3539 					ctx->ipfw_xlate_conflicts++;
3540 					if (IPFW_STATE_ISDEAD(s)) {
3541 						ipfw_state_remove(ctx, s);
3542 						s = ipfw_state_link(ctx,
3543 						    &x->xlat_st);
3544 					}
3545 					if (s != NULL) {
3546 						if (bootverbose) {
3547 							kprintf("ipfw: "
3548 							"slave %u state "
3549 							"conflicts %u state\n",
3550 							x->xlat_type,
3551 							s->st_type);
3552 						}
3553 						ipfw_xlat_invalidate(x);
3554 						return IP_FW_DENY;
3555 					}
3556 					ctx->ipfw_xlate_cresolved++;
3557 				}
3558 			} else {
3559 				ipfw_state_update(&args->f_id, dyn_dir,
3560 				    lc.tcp, &x->xlat_st);
3561 			}
3562 		} else {
3563 			/* TODO: setup dyn_f, dyn_dir */
3564 
3565 			f = args->rule->next_rule;
3566 			if (f == NULL)
3567 				f = lookup_next_rule(args->rule);
3568 		}
3569 	} else {
3570 		/*
3571 		 * Find the starting rule. It can be either the first
3572 		 * one, or the one after divert_rule if asked so.
3573 		 */
3574 		int skipto;
3575 
3576 		KKASSERT((args->flags &
3577 		    (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3578 		KKASSERT(args->xlat == NULL);
3579 
3580 		mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3581 		if (mtag != NULL) {
3582 			divinfo = m_tag_data(mtag);
3583 			skipto = divinfo->skipto;
3584 		} else {
3585 			skipto = 0;
3586 		}
3587 
3588 		f = ctx->ipfw_layer3_chain;
3589 		if (args->eh == NULL && skipto != 0) {
3590 			/* No skipto during rule flushing */
3591 			if (ipfw_flushing)
3592 				return IP_FW_DENY;
3593 
3594 			if (skipto >= IPFW_DEFAULT_RULE)
3595 				return IP_FW_DENY; /* invalid */
3596 
3597 			while (f && f->rulenum <= skipto)
3598 				f = f->next;
3599 			if (f == NULL)	/* drop packet */
3600 				return IP_FW_DENY;
3601 		} else if (ipfw_flushing) {
3602 			/* Rules are being flushed; skip to default rule */
3603 			f = ctx->ipfw_default_rule;
3604 		}
3605 	}
3606 	if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3607 		m_tag_delete(m, mtag);
3608 
3609 	/*
3610 	 * Now scan the rules, and parse microinstructions for each rule.
3611 	 */
3612 	for (; f; f = f->next) {
3613 		int l, cmdlen;
3614 		ipfw_insn *cmd;
3615 		int skip_or; /* skip rest of OR block */
3616 
3617 again:
3618 		if (ctx->ipfw_set_disable & (1 << f->set)) {
3619 			args->xlat = NULL;
3620 			continue;
3621 		}
3622 
3623 		if (args->xlat != NULL) {
3624 			args->xlat = NULL;
3625 			l = f->cmd_len - f->act_ofs;
3626 			cmd = ACTION_PTR(f);
3627 		} else {
3628 			l = f->cmd_len;
3629 			cmd = f->cmd;
3630 		}
3631 
3632 		skip_or = 0;
3633 		for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3634 			int match;
3635 
3636 			/*
3637 			 * check_body is a jump target used when we find a
3638 			 * CHECK_STATE, and need to jump to the body of
3639 			 * the target rule.
3640 			 */
3641 check_body:
3642 			cmdlen = F_LEN(cmd);
3643 			/*
3644 			 * An OR block (insn_1 || .. || insn_n) has the
3645 			 * F_OR bit set in all but the last instruction.
3646 			 * The first match will set "skip_or", and cause
3647 			 * the following instructions to be skipped until
3648 			 * past the one with the F_OR bit clear.
3649 			 */
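			/*
			 * Illustrative example (hypothetical rule): a rule
			 * matching "src-ip 1.2.3.4 or src-ip 5.6.7.8" is
			 * compiled into two O_IP_SRC instructions, the first
			 * with F_OR set; a match on the first sets skip_or,
			 * so the second is skipped and evaluation resumes
			 * after the OR block.
			 */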
3650 			if (skip_or) {		/* skip this instruction */
3651 				if ((cmd->len & F_OR) == 0)
3652 					skip_or = 0;	/* next one is good */
3653 				continue;
3654 			}
3655 			match = 0; /* set to 1 if we succeed */
3656 
3657 			switch (cmd->opcode) {
3658 			/*
3659 			 * The first set of opcodes compares the packet's
3660 			 * fields with some pattern, setting 'match' if a
3661 			 * match is found. At the end of the loop there is
3662 			 * logic to deal with F_NOT and F_OR flags associated
3663 			 * with the opcode.
3664 			 */
3665 			case O_NOP:
3666 				match = 1;
3667 				break;
3668 
3669 			case O_FORWARD_MAC:
3670 				kprintf("ipfw: opcode %d unimplemented\n",
3671 					cmd->opcode);
3672 				break;
3673 
3674 			case O_GID:
3675 			case O_UID:
3676 				/*
3677 				 * We only check offset == 0 && proto != 0,
3678 				 * as this ensures that we have an IPv4
3679 				 * packet with the ports info.
3680 				 */
3681 				if (lc.offset != 0)
3682 					break;
3683 
3684 				match = ipfw_match_uid(&args->f_id, oif,
3685 					cmd->opcode,
3686 					(uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3687 				break;
3688 
3689 			case O_RECV:
3690 				match = iface_match(m->m_pkthdr.rcvif,
3691 				    (ipfw_insn_if *)cmd);
3692 				break;
3693 
3694 			case O_XMIT:
3695 				match = iface_match(oif, (ipfw_insn_if *)cmd);
3696 				break;
3697 
3698 			case O_VIA:
3699 				match = iface_match(oif ? oif :
3700 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3701 				break;
3702 
3703 			case O_MACADDR2:
3704 				if (args->eh != NULL) {	/* have MAC header */
3705 					uint32_t *want = (uint32_t *)
3706 						((ipfw_insn_mac *)cmd)->addr;
3707 					uint32_t *mask = (uint32_t *)
3708 						((ipfw_insn_mac *)cmd)->mask;
3709 					uint32_t *hdr = (uint32_t *)args->eh;
3710 
3711 					match =
3712 					(want[0] == (hdr[0] & mask[0]) &&
3713 					 want[1] == (hdr[1] & mask[1]) &&
3714 					 want[2] == (hdr[2] & mask[2]));
3715 				}
3716 				break;
3717 
3718 			case O_MAC_TYPE:
3719 				if (args->eh != NULL) {
3720 					uint16_t t =
3721 					    ntohs(args->eh->ether_type);
3722 					uint16_t *p =
3723 					    ((ipfw_insn_u16 *)cmd)->ports;
3724 					int i;
3725 
3726 					/* Special vlan handling */
3727 					if (m->m_flags & M_VLANTAG)
3728 						t = ETHERTYPE_VLAN;
3729 
3730 					for (i = cmdlen - 1; !match && i > 0;
3731 					     i--, p += 2) {
3732 						match =
3733 						(t >= p[0] && t <= p[1]);
3734 					}
3735 				}
3736 				break;
3737 
3738 			case O_FRAG:
3739 				match = (hlen > 0 && lc.offset != 0);
3740 				break;
3741 
3742 			case O_IPFRAG:
3743 				if (hlen > 0) {
3744 					uint16_t off;
3745 
3746 					if (args->eh != NULL)
3747 						off = ntohs(ip->ip_off);
3748 					else
3749 						off = ip->ip_off;
3750 					if (off & (IP_MF | IP_OFFMASK))
3751 						match = 1;
3752 				}
3753 				break;
3754 
3755 			case O_IN:	/* "out" is "not in" */
3756 				match = (oif == NULL);
3757 				break;
3758 
3759 			case O_LAYER2:
3760 				match = (args->eh != NULL);
3761 				break;
3762 
3763 			case O_PROTO:
3764 				/*
3765 				 * We do not allow an arg of 0, so checking
3766 				 * "proto" alone suffices.
3767 				 */
3768 				match = (lc.proto == cmd->arg1);
3769 				break;
3770 
3771 			case O_IP_SRC:
3772 				match = (hlen > 0 &&
3773 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3774 				    lc.src_ip.s_addr);
3775 				break;
3776 
3777 			case O_IP_SRC_MASK:
3778 				match = (hlen > 0 &&
3779 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3780 				     (lc.src_ip.s_addr &
3781 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3782 				break;
3783 
3784 			case O_IP_SRC_ME:
3785 				if (hlen > 0) {
3786 					struct ifnet *tif;
3787 
3788 					tif = INADDR_TO_IFP(&lc.src_ip);
3789 					match = (tif != NULL);
3790 				}
3791 				break;
3792 
3793 			case O_IP_SRC_TABLE:
3794 				match = ipfw_table_lookup(ctx, cmd->arg1,
3795 				    &lc.src_ip);
3796 				break;
3797 
3798 			case O_IP_SRC_IFIP:
3799 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3800 				    &lc.src_ip);
3801 				break;
3802 
3803 			case O_IP_DST_SET:
3804 			case O_IP_SRC_SET:
3805 				if (hlen > 0) {
3806 					uint32_t *d = (uint32_t *)(cmd + 1);
3807 					uint32_t addr =
3808 					    cmd->opcode == O_IP_DST_SET ?
3809 						args->f_id.dst_ip :
3810 						args->f_id.src_ip;
3811 
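					/*
					 * Layout, as interpreted here: d[0]
					 * is the base address, d[1..] the
					 * bitmap.  E.g. with arg1 = 64, an
					 * address 37 past the base maps to
					 * bit 5 of d[2] (1 + (37 >> 5) == 2,
					 * 37 & 0x1f == 5).
					 */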
3812 					if (addr < d[0])
3813 						break;
3814 					addr -= d[0]; /* subtract base */
3815 					match =
3816 					(addr < cmd->arg1) &&
3817 					 (d[1 + (addr >> 5)] &
3818 					  (1 << (addr & 0x1f)));
3819 				}
3820 				break;
3821 
3822 			case O_IP_DST:
3823 				match = (hlen > 0 &&
3824 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3825 				    lc.dst_ip.s_addr);
3826 				break;
3827 
3828 			case O_IP_DST_MASK:
3829 				match = (hlen > 0) &&
3830 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3831 				     (lc.dst_ip.s_addr &
3832 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3833 				break;
3834 
3835 			case O_IP_DST_ME:
3836 				if (hlen > 0) {
3837 					struct ifnet *tif;
3838 
3839 					tif = INADDR_TO_IFP(&lc.dst_ip);
3840 					match = (tif != NULL);
3841 				}
3842 				break;
3843 
3844 			case O_IP_DST_TABLE:
3845 				match = ipfw_table_lookup(ctx, cmd->arg1,
3846 				    &lc.dst_ip);
3847 				break;
3848 
3849 			case O_IP_DST_IFIP:
3850 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3851 				    &lc.dst_ip);
3852 				break;
3853 
3854 			case O_IP_SRCPORT:
3855 			case O_IP_DSTPORT:
3856 				/*
3857 				 * offset == 0 && proto != 0 is enough
3858 				 * to guarantee that we have an IPv4
3859 				 * packet with port info.
3860 				 */
3861 				if ((lc.proto == IPPROTO_UDP ||
3862 				     lc.proto == IPPROTO_TCP)
3863 				    && lc.offset == 0) {
3864 					uint16_t x =
3865 					    (cmd->opcode == O_IP_SRCPORT) ?
3866 						lc.src_port : lc.dst_port;
3867 					uint16_t *p =
3868 					    ((ipfw_insn_u16 *)cmd)->ports;
3869 					int i;
3870 
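					/*
					 * Port ranges are stored as
					 * (low, high) pairs; a single port
					 * is expected to be encoded by the
					 * rule compiler with low == high.
					 */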
3871 					for (i = cmdlen - 1; !match && i > 0;
3872 					     i--, p += 2) {
3873 						match =
3874 						(x >= p[0] && x <= p[1]);
3875 					}
3876 				}
3877 				break;
3878 
3879 			case O_ICMPTYPE:
3880 				match = (lc.offset == 0 &&
3881 				    lc.proto == IPPROTO_ICMP &&
3882 				    icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3883 				break;
3884 
3885 			case O_IPOPT:
3886 				match = (hlen > 0 && ipopts_match(ip, cmd));
3887 				break;
3888 
3889 			case O_IPVER:
3890 				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3891 				break;
3892 
3893 			case O_IPTTL:
3894 				match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3895 				break;
3896 
3897 			case O_IPID:
3898 				match = (hlen > 0 &&
3899 				    cmd->arg1 == ntohs(ip->ip_id));
3900 				break;
3901 
3902 			case O_IPLEN:
3903 				match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3904 				break;
3905 
3906 			case O_IPPRECEDENCE:
3907 				match = (hlen > 0 &&
3908 				    (cmd->arg1 == (ip->ip_tos & 0xe0)));
3909 				break;
3910 
3911 			case O_IPTOS:
3912 				match = (hlen > 0 &&
3913 				    flags_match(cmd, ip->ip_tos));
3914 				break;
3915 
3916 			case O_TCPFLAGS:
3917 				match = (lc.proto == IPPROTO_TCP &&
3918 				    lc.offset == 0 &&
3919 				    flags_match(cmd,
3920 					L3HDR(struct tcphdr,ip)->th_flags));
3921 				break;
3922 
3923 			case O_TCPOPTS:
3924 				match = (lc.proto == IPPROTO_TCP &&
3925 				    lc.offset == 0 && tcpopts_match(ip, cmd));
3926 				break;
3927 
3928 			case O_TCPSEQ:
3929 				match = (lc.proto == IPPROTO_TCP &&
3930 				    lc.offset == 0 &&
3931 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3932 					L3HDR(struct tcphdr,ip)->th_seq);
3933 				break;
3934 
3935 			case O_TCPACK:
3936 				match = (lc.proto == IPPROTO_TCP &&
3937 				    lc.offset == 0 &&
3938 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3939 					L3HDR(struct tcphdr,ip)->th_ack);
3940 				break;
3941 
3942 			case O_TCPWIN:
3943 				match = (lc.proto == IPPROTO_TCP &&
3944 				    lc.offset == 0 &&
3945 				    cmd->arg1 ==
3946 					L3HDR(struct tcphdr,ip)->th_win);
3947 				break;
3948 
3949 			case O_ESTAB:
3950 				/* reject packets which have SYN only */
3951 				/* XXX should i also check for TH_ACK ? */
3952 				match = (lc.proto == IPPROTO_TCP &&
3953 				    lc.offset == 0 &&
3954 				    (L3HDR(struct tcphdr,ip)->th_flags &
3955 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3956 				break;
3957 
3958 			case O_LOG:
3959 				if (fw_verbose) {
3960 					ipfw_log(ctx, f, hlen, args->eh, m,
3961 					    oif);
3962 				}
3963 				match = 1;
3964 				break;
3965 
3966 			case O_PROB:
3967 				match = (krandom() <
3968 					((ipfw_insn_u32 *)cmd)->d[0]);
3969 				break;
3970 
3971 			/*
3972 			 * The second set of opcodes represents 'actions',
3973 			 * i.e. the terminal part of a rule once the packet
3974 			 * matches all previous patterns.
3975 			 * Typically there is only one action for each rule,
3976 			 * and the opcode is stored at the end of the rule
3977 			 * (but there are exceptions -- see below).
3978 			 *
3979 			 * In general, here we set retval and terminate the
3980 			 * outer loop (would be a 'break 3' in some language,
3981 			 * but we need to do a 'goto done').
3982 			 *
3983 			 * Exceptions:
3984 			 * O_COUNT and O_SKIPTO actions:
3985 			 *   instead of terminating, we jump to the next rule
3986 			 *   ('goto next_rule', equivalent to a 'break 2'),
3987 			 *   or to the SKIPTO target ('goto again' after
3988 			 *   having set f, cmd and l), respectively.
3989 			 *
3990 			 * O_LIMIT and O_KEEP_STATE, O_REDIRECT: these opcodes
3991 			 *   are not real 'actions', and are stored right
3992 			 *   before the 'action' part of the rule.
3993 			 *   These opcodes try to install an entry in the
3994 			 *   state tables; if successful, we continue with
3995 			 *   the next opcode (match=1; break;), otherwise
3996 			 *   the packet must be dropped ('goto done' after
3997 			 *   setting retval).  If static rules are changed
3998 			 *   during the state installation, the packet will
3999 			 *   be dropped and the rule's stats will not be updated
4000 			 *   ('return IP_FW_DENY').
4001 			 *
4002 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4003 			 *   cause a lookup of the state table, and a jump
4004 			 *   to the 'action' part of the parent rule
4005 			 *   ('goto check_body') if an entry is found, or
4006 			 *   (CHECK_STATE only) a jump to the next rule if
4007 			 *   the entry is not found ('goto next_rule').
4008 			 *   The result of the lookup is cached so that
4009 			 *   further instances of these opcodes become
4010 			 *   effectively NOPs.  If static rules are changed
4011 			 *   during the state lookup, the packet will
4012 			 *   be dropped and rule's stats will not be updated
4013 			 *   ('return IP_FW_DENY').
4014 			 */
4015 			case O_REDIRECT:
4016 				if (f->cross_rules == NULL) {
4017 					/*
4018 					 * This rule was not completely setup;
4019 					 * move on to the next rule.
4020 					 */
4021 					goto next_rule;
4022 				}
4023 				/*
4024 				 * Apply redirect only on input path and
4025 				 * only to non-fragment TCP segments or
4026 				 * UDP datagrams.
4027 				 *
4028 				 * Does _not_ work with layer2 filtering.
4029 				 */
4030 				if (oif != NULL || args->eh != NULL ||
4031 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4032 				    (lc.proto != IPPROTO_TCP &&
4033 				     lc.proto != IPPROTO_UDP))
4034 					break;
4035 				/* FALL THROUGH */
4036 			case O_LIMIT:
4037 			case O_KEEP_STATE:
4038 				if (hlen == 0)
4039 					break;
4040 				s = ipfw_state_install(ctx, f,
4041 				    (ipfw_insn_limit *)cmd, args, lc.tcp);
4042 				if (s == NULL) {
4043 					retval = IP_FW_DENY;
4044 					goto done; /* error/limit violation */
4045 				}
4046 				s->st_pcnt++;
4047 				s->st_bcnt += lc.ip_len;
4048 
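				/*
				 * For O_REDIRECT a slave state, keyed on the
				 * translated address/port, is created as well
				 * on the CPU that will see the reply traffic;
				 * master and slave are cross-linked through
				 * xlat_pair so each direction can locate its
				 * peer.
				 */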
4049 				if (s->st_type == O_REDIRECT) {
4050 					struct in_addr oaddr;
4051 					uint16_t oport;
4052 					struct ipfw_xlat *slave_x, *x;
4053 					struct ipfw_state *dup;
4054 
4055 					x = (struct ipfw_xlat *)s;
4056 					ipfw_xlate(x, m, &oaddr, &oport);
4057 					m = ipfw_rehashm(m, hlen, args, &lc,
4058 					    &ip);
4059 					if (m == NULL) {
4060 						ipfw_state_del(ctx, s);
4061 						goto pullup_failed;
4062 					}
4063 
4064 					cpuid = netisr_hashcpu(
4065 					    m->m_pkthdr.hash);
4066 
4067 					slave_x = (struct ipfw_xlat *)
4068 					    ipfw_state_alloc(ctx, &args->f_id,
4069 					    O_REDIRECT, f->cross_rules[cpuid],
4070 					    lc.tcp);
4071 					if (slave_x == NULL) {
4072 						ipfw_state_del(ctx, s);
4073 						retval = IP_FW_DENY;
4074 						goto done;
4075 					}
4076 					slave_x->xlat_addr = oaddr.s_addr;
4077 					slave_x->xlat_port = oport;
4078 					slave_x->xlat_dir = MATCH_REVERSE;
4079 					slave_x->xlat_flags |=
4080 					    IPFW_STATE_F_XLATSRC |
4081 					    IPFW_STATE_F_XLATSLAVE;
4082 
4083 					slave_x->xlat_pair = x;
4084 					slave_x->xlat_pcpu = mycpuid;
4085 					x->xlat_pair = slave_x;
4086 					x->xlat_pcpu = cpuid;
4087 
4088 					ctx->ipfw_xlated++;
4089 					if (cpuid != mycpuid) {
4090 						ctx->ipfw_xlate_split++;
4091 						ipfw_xlate_redispatch(
4092 						    m, cpuid, x,
4093 						    IPFW_XLATE_INSERT |
4094 						    IPFW_XLATE_FORWARD);
4095 						args->m = NULL;
4096 						return (IP_FW_REDISPATCH);
4097 					}
4098 
4099 					dup = ipfw_state_link(ctx,
4100 					    &slave_x->xlat_st);
4101 					if (dup != NULL) {
4102 						ctx->ipfw_xlate_conflicts++;
4103 						if (IPFW_STATE_ISDEAD(dup)) {
4104 							ipfw_state_remove(ctx,
4105 							    dup);
4106 							dup = ipfw_state_link(
4107 							ctx, &slave_x->xlat_st);
4108 						}
4109 						if (dup != NULL) {
4110 							if (bootverbose) {
4111 							    kprintf("ipfw: "
4112 							    "slave %u state "
4113 							    "conflicts "
4114 							    "%u state\n",
4115 							    x->xlat_type,
4116 							    s->st_type);
4117 							}
4118 							ipfw_state_del(ctx, s);
4119 							return (IP_FW_DENY);
4120 						}
4121 						ctx->ipfw_xlate_cresolved++;
4122 					}
4123 				}
4124 				match = 1;
4125 				break;
4126 
4127 			case O_PROBE_STATE:
4128 			case O_CHECK_STATE:
4129 				/*
4130 				 * States are checked at the first keep-state
4131 				 * or check-state occurrence, with the result
4132 				 * being stored in dyn_dir.  The compiler
4133 				 * introduces a PROBE_STATE instruction for
4134 				 * us when we have a KEEP_STATE/LIMIT/RDR
4135 				 * (because PROBE_STATE needs to be run first).
4136 				 */
4137 				s = NULL;
4138 				if (dyn_dir == MATCH_UNKNOWN) {
4139 					s = ipfw_state_lookup(ctx,
4140 					    &args->f_id, &dyn_dir, lc.tcp);
4141 				}
4142 				if (s == NULL ||
4143 				    (s->st_type == O_REDIRECT &&
4144 				     (args->eh != NULL ||
4145 				      (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4146 				      (lc.proto != IPPROTO_TCP &&
4147 				       lc.proto != IPPROTO_UDP)))) {
4148 					/*
4149 					 * State not found. If CHECK_STATE,
4150 					 * skip to next rule, if PROBE_STATE
4151 					 * just ignore and continue with next
4152 					 * opcode.
4153 					 */
4154 					if (cmd->opcode == O_CHECK_STATE)
4155 						goto next_rule;
4156 					match = 1;
4157 					break;
4158 				}
4159 
4160 				s->st_pcnt++;
4161 				s->st_bcnt += lc.ip_len;
4162 
4163 				if (s->st_type == O_REDIRECT) {
4164 					struct ipfw_xlat *x =
4165 					    (struct ipfw_xlat *)s;
4166 
4167 					if (oif != NULL &&
4168 					    x->xlat_ifp == NULL) {
4169 						KASSERT(x->xlat_flags &
4170 						    IPFW_STATE_F_XLATSLAVE,
4171 						    ("master rdr state "
4172 						     "missing ifp"));
4173 						x->xlat_ifp = oif;
4174 					} else if (
4175 					    (oif != NULL && x->xlat_ifp!=oif) ||
4176 					    (oif == NULL &&
4177 					     x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4178 						retval = IP_FW_DENY;
4179 						goto done;
4180 					}
4181 					if (x->xlat_dir != dyn_dir)
4182 						goto skip_xlate;
4183 
4184 					ipfw_xlate(x, m, NULL, NULL);
4185 					m = ipfw_rehashm(m, hlen, args, &lc,
4186 					    &ip);
4187 					if (m == NULL)
4188 						goto pullup_failed;
4189 
4190 					cpuid = netisr_hashcpu(
4191 					    m->m_pkthdr.hash);
4192 					if (cpuid != mycpuid) {
4193 						uint32_t xlate = 0;
4194 
4195 						if (oif != NULL) {
4196 							xlate |=
4197 							    IPFW_XLATE_OUTPUT;
4198 						}
4199 						if (dyn_dir == MATCH_FORWARD) {
4200 							xlate |=
4201 							    IPFW_XLATE_FORWARD;
4202 						}
4203 						ipfw_xlate_redispatch(m, cpuid,
4204 						    x, xlate);
4205 						args->m = NULL;
4206 						return (IP_FW_REDISPATCH);
4207 					}
4208 
4209 					KKASSERT(x->xlat_pcpu == mycpuid);
4210 					ipfw_state_update(&args->f_id, dyn_dir,
4211 					    lc.tcp, &x->xlat_pair->xlat_st);
4212 				}
4213 skip_xlate:
4214 				/*
4215 				 * Found a rule from a state; jump to the
4216 				 * 'action' part of the rule.
4217 				 */
4218 				f = s->st_rule;
4219 				KKASSERT(f->cpuid == mycpuid);
4220 
4221 				cmd = ACTION_PTR(f);
4222 				l = f->cmd_len - f->act_ofs;
4223 				dyn_f = f;
4224 				goto check_body;
4225 
4226 			case O_ACCEPT:
4227 				retval = IP_FW_PASS;	/* accept */
4228 				goto done;
4229 
4230 			case O_DEFRAG:
4231 				if (f->cross_rules == NULL) {
4232 					/*
4233 					 * This rule was not completely setup;
4234 					 * move on to the next rule.
4235 					 */
4236 					goto next_rule;
4237 				}
4238 
4239 				/*
4240 				 * Don't defrag for l2 packets, output packets
4241 				 * or non-fragments.
4242 				 */
4243 				if (oif != NULL || args->eh != NULL ||
4244 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
4245 					goto next_rule;
4246 
4247 				ctx->ipfw_frags++;
4248 				m = ip_reass(m);
4249 				args->m = m;
4250 				if (m == NULL) {
4251 					retval = IP_FW_PASS;
4252 					goto done;
4253 				}
4254 				ctx->ipfw_defraged++;
4255 				KASSERT((m->m_flags & M_HASH) == 0,
4256 				    ("hash not cleared"));
4257 
4258 				/* Update statistics */
4259 				f->pcnt++;
4260 				f->bcnt += lc.ip_len;
4261 				f->timestamp = time_second;
4262 
4263 				ip = mtod(m, struct ip *);
4264 				hlen = ip->ip_hl << 2;
4265 				ip->ip_len += hlen;
4266 
4267 				ip->ip_len = htons(ip->ip_len);
4268 				ip->ip_off = htons(ip->ip_off);
4269 
4270 				ip_hashfn(&m, 0);
4271 				args->m = m;
4272 				if (m == NULL)
4273 					goto pullup_failed;
4274 
4275 				KASSERT(m->m_flags & M_HASH, ("no hash"));
4276 				cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4277 				if (cpuid != mycpuid) {
4278 					/*
4279 					 * NOTE:
4280 					 * ip_len/ip_off are in network byte
4281 					 * order.
4282 					 */
4283 					ctx->ipfw_defrag_remote++;
4284 					ipfw_defrag_redispatch(m, cpuid, f);
4285 					args->m = NULL;
4286 					return (IP_FW_REDISPATCH);
4287 				}
4288 
4289 				/* 'm' might be changed by ip_hashfn(). */
4290 				ip = mtod(m, struct ip *);
4291 				ip->ip_len = ntohs(ip->ip_len);
4292 				ip->ip_off = ntohs(ip->ip_off);
4293 
4294 				m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4295 				if (m == NULL)
4296 					goto pullup_failed;
4297 
4298 				/* Move on. */
4299 				goto next_rule;
4300 
4301 			case O_PIPE:
4302 			case O_QUEUE:
4303 				args->rule = f; /* report matching rule */
4304 				args->cookie = cmd->arg1;
4305 				retval = IP_FW_DUMMYNET;
4306 				goto done;
4307 
4308 			case O_DIVERT:
4309 			case O_TEE:
4310 				if (args->eh) /* not on layer 2 */
4311 					break;
4312 
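				/*
				 * Record this rule's number in the divert
				 * tag: when the diverted packet re-enters
				 * ipfw, the tag's skipto field restarts the
				 * scan at the first rule past this one.
				 */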
4313 				mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4314 				    sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4315 				if (mtag == NULL) {
4316 					retval = IP_FW_DENY;
4317 					goto done;
4318 				}
4319 				divinfo = m_tag_data(mtag);
4320 
4321 				divinfo->skipto = f->rulenum;
4322 				divinfo->port = cmd->arg1;
4323 				divinfo->tee = (cmd->opcode == O_TEE);
4324 				m_tag_prepend(m, mtag);
4325 
4326 				args->cookie = cmd->arg1;
4327 				retval = (cmd->opcode == O_DIVERT) ?
4328 					 IP_FW_DIVERT : IP_FW_TEE;
4329 				goto done;
4330 
4331 			case O_COUNT:
4332 			case O_SKIPTO:
4333 				f->pcnt++;	/* update stats */
4334 				f->bcnt += lc.ip_len;
4335 				f->timestamp = time_second;
4336 				if (cmd->opcode == O_COUNT)
4337 					goto next_rule;
4338 				/* handle skipto */
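				/*
				 * next_rule caches the skipto target; it is
				 * cleared by ipfw_flush_rule_ptrs() whenever
				 * the rule list changes and rebuilt lazily
				 * here.
				 */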
4339 				if (f->next_rule == NULL)
4340 					lookup_next_rule(f);
4341 				f = f->next_rule;
4342 				goto again;
4343 
4344 			case O_REJECT:
4345 				/*
4346 				 * Drop the packet and send a reject notice
4347 				 * if the packet is not ICMP (or is an ICMP
4348 				 * query), and it is not multicast/broadcast.
4349 				 */
4350 				if (hlen > 0 &&
4351 				    (lc.proto != IPPROTO_ICMP ||
4352 				     is_icmp_query(ip)) &&
4353 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
4354 				    !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4355 					send_reject(args, cmd->arg1,
4356 					    lc.offset, lc.ip_len);
4357 					retval = IP_FW_DENY;
4358 					goto done;
4359 				}
4360 				/* FALLTHROUGH */
4361 			case O_DENY:
4362 				retval = IP_FW_DENY;
4363 				goto done;
4364 
4365 			case O_FORWARD_IP:
4366 				if (args->eh)	/* not valid on layer2 pkts */
4367 					break;
4368 				if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4369 					struct sockaddr_in *sin;
4370 
4371 					mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4372 					    sizeof(*sin), M_INTWAIT | M_NULLOK);
4373 					if (mtag == NULL) {
4374 						retval = IP_FW_DENY;
4375 						goto done;
4376 					}
4377 					sin = m_tag_data(mtag);
4378 
4379 					/* Structure copy */
4380 					*sin = ((ipfw_insn_sa *)cmd)->sa;
4381 
4382 					m_tag_prepend(m, mtag);
4383 					m->m_pkthdr.fw_flags |=
4384 						IPFORWARD_MBUF_TAGGED;
4385 					m->m_pkthdr.fw_flags &=
4386 						~BRIDGE_MBUF_TAGGED;
4387 				}
4388 				retval = IP_FW_PASS;
4389 				goto done;
4390 
4391 			default:
4392 				panic("-- unknown opcode %d", cmd->opcode);
4393 			} /* end of switch() on opcodes */
4394 
4395 			if (cmd->len & F_NOT)
4396 				match = !match;
4397 
4398 			if (match) {
4399 				if (cmd->len & F_OR)
4400 					skip_or = 1;
4401 			} else {
4402 				if (!(cmd->len & F_OR)) /* not an OR block, */
4403 					break;		/* try next rule    */
4404 			}
4405 
4406 		}	/* end of inner for, scan opcodes */
4407 
4408 next_rule:;		/* try next rule		*/
4409 
4410 	}		/* end of outer for, scan rules */
4411 	kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4412 	return IP_FW_DENY;
4413 
4414 done:
4415 	/* Update statistics */
4416 	f->pcnt++;
4417 	f->bcnt += lc.ip_len;
4418 	f->timestamp = time_second;
4419 	return retval;
4420 
4421 pullup_failed:
4422 	if (fw_verbose)
4423 		kprintf("pullup failed\n");
4424 	return IP_FW_DENY;
4425 }
4426 
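/*
 * Prepare an mbuf for dummynet: attach a PACKET_TAG_DUMMYNET tag
 * describing the pipe/queue, direction, flow id and the matching rule
 * (with a reference held), or free the mbuf and return NULL if the
 * tag cannot be allocated.
 */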
4427 static struct mbuf *
4428 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4429 {
4430 	struct m_tag *mtag;
4431 	struct dn_pkt *pkt;
4432 	ipfw_insn *cmd;
4433 	const struct ipfw_flow_id *id;
4434 	struct dn_flow_id *fid;
4435 
4436 	M_ASSERTPKTHDR(m);
4437 
4438 	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4439 	    M_INTWAIT | M_NULLOK);
4440 	if (mtag == NULL) {
4441 		m_freem(m);
4442 		return (NULL);
4443 	}
4444 	m_tag_prepend(m, mtag);
4445 
4446 	pkt = m_tag_data(mtag);
4447 	bzero(pkt, sizeof(*pkt));
4448 
4449 	cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4450 	if (cmd->opcode == O_LOG)
4451 		cmd += F_LEN(cmd);
4452 	KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4453 		("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4454 
4455 	pkt->dn_m = m;
4456 	pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4457 	pkt->ifp = fwa->oif;
4458 	pkt->pipe_nr = pipe_nr;
4459 
4460 	pkt->cpuid = mycpuid;
4461 	pkt->msgport = netisr_curport();
4462 
4463 	id = &fwa->f_id;
4464 	fid = &pkt->id;
4465 	fid->fid_dst_ip = id->dst_ip;
4466 	fid->fid_src_ip = id->src_ip;
4467 	fid->fid_dst_port = id->dst_port;
4468 	fid->fid_src_port = id->src_port;
4469 	fid->fid_proto = id->proto;
4470 	fid->fid_flags = id->flags;
4471 
4472 	ipfw_ref_rule(fwa->rule);
4473 	pkt->dn_priv = fwa->rule;
4474 	pkt->dn_unref_priv = ipfw_unref_rule;
4475 
4476 	if (cmd->opcode == O_PIPE)
4477 		pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4478 
4479 	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4480 	return (m);
4481 }
4482 
4483 /*
4484  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4485  * These will be reconstructed on the fly as packets are matched.
4486  */
4487 static void
4488 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4489 {
4490 	struct ip_fw *rule;
4491 
4492 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4493 		rule->next_rule = NULL;
4494 }
4495 
4496 static void
4497 ipfw_inc_static_count(struct ip_fw *rule)
4498 {
4499 	/* Static rule's counts are updated only on CPU0 */
4500 	KKASSERT(mycpuid == 0);
4501 
4502 	static_count++;
4503 	static_ioc_len += IOC_RULESIZE(rule);
4504 }
4505 
4506 static void
4507 ipfw_dec_static_count(struct ip_fw *rule)
4508 {
4509 	int l = IOC_RULESIZE(rule);
4510 
4511 	/* Static rule's counts are updated only on CPU0 */
4512 	KKASSERT(mycpuid == 0);
4513 
4514 	KASSERT(static_count > 0, ("invalid static count %u", static_count));
4515 	static_count--;
4516 
4517 	KASSERT(static_ioc_len >= l,
4518 		("invalid static len %u", static_ioc_len));
4519 	static_ioc_len -= l;
4520 }
4521 
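/*
 * Chain the per-CPU replicas of a rule together: the replica created
 * on the previous CPU points to the one just created on this CPU via
 * its 'sibling' field, while fwmsg->sibling always tracks the tail.
 */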
4522 static void
4523 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4524 {
4525 	if (fwmsg->sibling != NULL) {
4526 		KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4527 		fwmsg->sibling->sibling = rule;
4528 	}
4529 	fwmsg->sibling = rule;
4530 }
4531 
4532 static struct ip_fw *
4533 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4534 {
4535 	struct ip_fw *rule;
4536 
4537 	rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4538 
4539 	rule->act_ofs = ioc_rule->act_ofs;
4540 	rule->cmd_len = ioc_rule->cmd_len;
4541 	rule->rulenum = ioc_rule->rulenum;
4542 	rule->set = ioc_rule->set;
4543 	rule->usr_flags = ioc_rule->usr_flags;
4544 
4545 	bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4546 
4547 	rule->refcnt = 1;
4548 	rule->cpuid = mycpuid;
4549 	rule->rule_flags = rule_flags;
4550 
4551 	return rule;
4552 }
4553 
4554 static void
4555 ipfw_add_rule_dispatch(netmsg_t nmsg)
4556 {
4557 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4558 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4559 	struct ip_fw *rule;
4560 
4561 	ASSERT_NETISR_NCPUS(mycpuid);
4562 
4563 	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
4564 
4565 	/*
4566 	 * Insert rule into the pre-determined position
4567 	 */
4568 	if (fwmsg->prev_rule != NULL) {
4569 		struct ip_fw *prev, *next;
4570 
4571 		prev = fwmsg->prev_rule;
4572 		KKASSERT(prev->cpuid == mycpuid);
4573 
4574 		next = fwmsg->next_rule;
4575 		KKASSERT(next->cpuid == mycpuid);
4576 
4577 		rule->next = next;
4578 		prev->next = rule;
4579 
4580 		/*
4581 		 * Move to the position on the next CPU
4582 		 * before the msg is forwarded.
4583 		 */
4584 		fwmsg->prev_rule = prev->sibling;
4585 		fwmsg->next_rule = next->sibling;
4586 	} else {
4587 		KKASSERT(fwmsg->next_rule == NULL);
4588 		rule->next = ctx->ipfw_layer3_chain;
4589 		ctx->ipfw_layer3_chain = rule;
4590 	}
4591 
4592 	/* Link rule CPU sibling */
4593 	ipfw_link_sibling(fwmsg, rule);
4594 
4595 	ipfw_flush_rule_ptrs(ctx);
4596 
4597 	if (mycpuid == 0) {
4598 		/* Statistics only need to be updated once */
4599 		ipfw_inc_static_count(rule);
4600 
4601 		/* Return the rule on CPU0 */
4602 		nmsg->lmsg.u.ms_resultp = rule;
4603 	}
4604 
4605 	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
4606 		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
4607 
4608 	if (fwmsg->cross_rules != NULL) {
4609 		/* Save rules for later use. */
4610 		fwmsg->cross_rules[mycpuid] = rule;
4611 	}
4612 
4613 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4614 }
4615 
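/*
 * Runs on each netisr CPU: hand this CPU's replica of a crossref rule
 * its own copy of the cross_rules[] array collected during rule
 * installation, then forward the message to the next CPU.
 */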
4616 static void
4617 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4618 {
4619 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4620 	struct ip_fw *rule = fwmsg->sibling;
4621 	int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4622 
4623 	ASSERT_NETISR_NCPUS(mycpuid);
4624 	KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4625 	    ("not crossref rule"));
4626 
4627 	rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4628 	memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4629 
4630 	fwmsg->sibling = rule->sibling;
4631 	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4632 }
4633 
4634 /*
4635  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
4636  * then possibly create a rule number and add the rule to the list.
4637  * Update the rule_number in the input struct so the caller knows
4638  * it as well.
4639  */
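/*
 * The duplication itself is a netisr message walk:
 * ipfw_add_rule_dispatch() inserts a replica at the pre-computed
 * position on each CPU in turn, following the 'sibling' links of
 * prev/next to locate the corresponding position on the next CPU.
 */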
4640 static void
4641 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4642 {
4643 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4644 	struct netmsg_ipfw fwmsg;
4645 	struct ip_fw *f, *prev, *rule;
4646 
4647 	ASSERT_NETISR0;
4648 
4649 	/*
4650 	 * If rulenum is 0, find highest numbered rule before the
4651 	 * default rule, and add the auto-increment step to it.
4652 	 */
4653 	if (ioc_rule->rulenum == 0) {
4654 		int step = autoinc_step;
4655 
4656 		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
4657 			 step <= IPFW_AUTOINC_STEP_MAX);
4658 
4659 		/*
4660 		 * Locate the highest numbered rule before default
4661 		 */
4662 		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
4663 			if (f->rulenum == IPFW_DEFAULT_RULE)
4664 				break;
4665 			ioc_rule->rulenum = f->rulenum;
4666 		}
4667 		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
4668 			ioc_rule->rulenum += step;
4669 	}
4670 	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
4671 		ioc_rule->rulenum != 0,
4672 		("invalid rule num %d", ioc_rule->rulenum));
4673 
4674 	/*
4675 	 * Now find the right place for the new rule in the sorted list.
4676 	 */
4677 	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
4678 	     prev = f, f = f->next) {
4679 		if (f->rulenum > ioc_rule->rulenum) {
4680 			/* Found the location */
4681 			break;
4682 		}
4683 	}
4684 	KASSERT(f != NULL, ("no default rule?!"));
4685 
4686 	/*
4687 	 * Duplicate the rule onto each CPU.
4688 	 * The rule duplicated on CPU0 will be returned.
4689 	 */
4690 	bzero(&fwmsg, sizeof(fwmsg));
4691 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4692 	    ipfw_add_rule_dispatch);
4693 	fwmsg.ioc_rule = ioc_rule;
4694 	fwmsg.prev_rule = prev;
4695 	fwmsg.next_rule = prev == NULL ? NULL : f;
4696 	fwmsg.rule_flags = rule_flags;
4697 	if (rule_flags & IPFW_RULE_F_CROSSREF) {
4698 		fwmsg.cross_rules = kmalloc(
4699 		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
4700 		    M_WAITOK | M_ZERO);
4701 	}
4702 
4703 	netisr_domsg_global(&fwmsg.base);
4704 	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
4705 
4706 	rule = fwmsg.base.lmsg.u.ms_resultp;
4707 	KKASSERT(rule != NULL && rule->cpuid == mycpuid);
4708 
4709 	if (fwmsg.cross_rules != NULL) {
4710 		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
4711 		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
4712 		fwmsg.sibling = rule;
4713 		netisr_domsg_global(&fwmsg.base);
4714 		KKASSERT(fwmsg.sibling == NULL);
4715 
4716 		kfree(fwmsg.cross_rules, M_TEMP);
4717 
4718 #ifdef KLD_MODULE
4719 		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
4720 #endif
4721 	}
4722 
4723 	DPRINTF("++ installed rule %d, static count now %d\n",
4724 		rule->rulenum, static_count);
4725 }
4726 
4727 /*
4728  * Free storage associated with a static rule (including derived
4729  * states/tracks).
4730  * The caller is in charge of clearing rule pointers to avoid
4731  * dangling pointers.
4732  * @return a pointer to the next entry.
4733  * Arguments are not checked, so they had better be correct.
4734  */
4735 static struct ip_fw *
4736 ipfw_delete_rule(struct ipfw_context *ctx,
4737 		 struct ip_fw *prev, struct ip_fw *rule)
4738 {
4739 	struct ip_fw *n;
4740 
4741 	n = rule->next;
4742 	if (prev == NULL)
4743 		ctx->ipfw_layer3_chain = n;
4744 	else
4745 		prev->next = n;
4746 
4747 	/* Mark the rule as invalid */
4748 	rule->rule_flags |= IPFW_RULE_F_INVALID;
4749 	rule->next_rule = NULL;
4750 	rule->sibling = NULL;
4751 #ifdef foo
4752 	/* Don't reset cpuid here; keep various assertions working */
4753 	rule->cpuid = -1;
4754 #endif
4755 
4756 	/* Statistics only need to be updated once */
4757 	if (mycpuid == 0)
4758 		ipfw_dec_static_count(rule);
4759 
4760 	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4761 		/* Try to free this rule */
4762 		ipfw_free_rule(rule);
4763 	} else {
4764 		/* TODO: check staging area. */
4765 		if (mycpuid == 0) {
4766 			rule->next = ipfw_gd.ipfw_crossref_free;
4767 			ipfw_gd.ipfw_crossref_free = rule;
4768 		}
4769 	}
4770 
4771 	/* Return the next rule */
4772 	return n;
4773 }
4774 
4775 static void
4776 ipfw_flush_dispatch(netmsg_t nmsg)
4777 {
4778 	int kill_default = nmsg->lmsg.u.ms_result;
4779 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4780 	struct ip_fw *rule;
4781 
4782 	ASSERT_NETISR_NCPUS(mycpuid);
4783 
4784 	/*
4785 	 * Flush states.
4786 	 */
4787 	ipfw_state_flush(ctx, NULL);
4788 	KASSERT(ctx->ipfw_state_cnt == 0,
4789 	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
4790 	ctx->ipfw_state_loosecnt = 0;
4791 	ctx->ipfw_state_lastexp = 0;
4792 
4793 	/*
4794 	 * Flush tracks.
4795 	 */
4796 	ipfw_track_flush(ctx, NULL);
4797 	ctx->ipfw_track_lastexp = 0;
4798 	if (ctx->ipfw_trkcnt_spare != NULL) {
4799 		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4800 		ctx->ipfw_trkcnt_spare = NULL;
4801 	}
4802 
4803 	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4804 
4805 	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4806 	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4807 		ipfw_delete_rule(ctx, NULL, rule);
4808 
4809 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4810 }
4811 
4812 /*
4813  * Deletes all rules from a chain (including the default rule
4814  * if the second argument is set).
4815  */
4816 static void
4817 ipfw_flush(int kill_default)
4818 {
4819 	struct netmsg_base nmsg;
4820 #ifdef INVARIANTS
4821 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4822 	int state_cnt;
4823 #endif
4824 
4825 	ASSERT_NETISR0;
4826 
4827 	/*
4828 	 * If 'kill_default' then caller has done the necessary
4829 	 * msgport syncing; unnecessary to do it again.
4830 	 */
4831 	if (!kill_default) {
4832 		/*
4833 		 * Let ipfw_chk() know the rules are going to
4834 		 * be flushed, so it could jump directly to
4835 		 * the default rule.
4836 		 */
4837 		ipfw_flushing = 1;
4838 		/* XXX use priority sync */
4839 		netmsg_service_sync();
4840 	}
4841 
4842 	/*
4843 	 * Press the 'flush' button
4844 	 */
4845 	bzero(&nmsg, sizeof(nmsg));
4846 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4847 	    ipfw_flush_dispatch);
4848 	nmsg.lmsg.u.ms_result = kill_default;
4849 	netisr_domsg_global(&nmsg);
4850 	ipfw_gd.ipfw_state_loosecnt = 0;
4851 	ipfw_gd.ipfw_state_globexp = 0;
4852 	ipfw_gd.ipfw_track_globexp = 0;
4853 
4854 #ifdef INVARIANTS
4855 	state_cnt = ipfw_state_cntcoll();
4856 	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4857 
4858 	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4859 	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4860 
4861 	if (kill_default) {
4862 		KASSERT(static_count == 0,
4863 			("%u static rules remain", static_count));
4864 		KASSERT(static_ioc_len == 0,
4865 			("%u bytes of static rules remain", static_ioc_len));
4866 	} else {
4867 		KASSERT(static_count == 1,
4868 			("%u static rules remain", static_count));
4869 		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4870 			("%u bytes of static rules remain, should be %lu",
4871 			 static_ioc_len,
4872 			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4873 	}
4874 #endif
4875 
4876 	/* Flush is done */
4877 	ipfw_flushing = 0;
4878 }
4879 
4880 static void
4881 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4882 {
4883 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4884 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4885 	struct ip_fw *rule, *prev;
4886 
4887 	ASSERT_NETISR_NCPUS(mycpuid);
4888 
4889 	rule = dmsg->start_rule;
4890 	KKASSERT(rule->cpuid == mycpuid);
4891 	dmsg->start_rule = rule->sibling;
4892 
4893 	prev = dmsg->prev_rule;
4894 	if (prev != NULL) {
4895 		KKASSERT(prev->cpuid == mycpuid);
4896 
4897 		/*
4898 		 * Move to the position on the next CPU
4899 		 * before the msg is forwarded.
4900 		 */
4901 		dmsg->prev_rule = prev->sibling;
4902 	}
4903 
4904 	/*
4905 	 * Flush pointers outside the loop, then delete all matching
4906 	 * rules.  'prev' remains the same throughout the cycle.
4907 	 */
4908 	ipfw_flush_rule_ptrs(ctx);
4909 	while (rule && rule->rulenum == dmsg->rulenum) {
4910 		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4911 			/* Flush states generated by this rule. */
4912 			ipfw_state_flush(ctx, rule);
4913 		}
4914 		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4915 			/* Flush tracks generated by this rule. */
4916 			ipfw_track_flush(ctx, rule);
4917 		}
4918 		rule = ipfw_delete_rule(ctx, prev, rule);
4919 	}
4920 
4921 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4922 }
4923 
4924 static int
4925 ipfw_alt_delete_rule(uint16_t rulenum)
4926 {
4927 	struct ip_fw *prev, *rule;
4928 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4929 	struct netmsg_del dmsg;
4930 
4931 	ASSERT_NETISR0;
4932 
4933 	/*
4934 	 * Locate first rule to delete
4935 	 */
4936 	for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4937 	     rule && rule->rulenum < rulenum;
4938 	     prev = rule, rule = rule->next)
4939 		; /* EMPTY */
4940 	if (rule->rulenum != rulenum)
4941 		return EINVAL;
4942 
4943 	/*
4944 	 * Get rid of the rule duplications on all CPUs
4945 	 */
4946 	bzero(&dmsg, sizeof(dmsg));
4947 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4948 	    ipfw_alt_delete_rule_dispatch);
4949 	dmsg.prev_rule = prev;
4950 	dmsg.start_rule = rule;
4951 	dmsg.rulenum = rulenum;
4952 
4953 	netisr_domsg_global(&dmsg.base);
4954 	KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4955 	return 0;
4956 }
4957 
4958 static void
4959 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4960 {
4961 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4962 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4963 	struct ip_fw *prev, *rule;
4964 #ifdef INVARIANTS
4965 	int del = 0;
4966 #endif
4967 
4968 	ASSERT_NETISR_NCPUS(mycpuid);
4969 
4970 	ipfw_flush_rule_ptrs(ctx);
4971 
4972 	prev = NULL;
4973 	rule = ctx->ipfw_layer3_chain;
4974 	while (rule != NULL) {
4975 		if (rule->set == dmsg->from_set) {
4976 			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4977 				/* Flush states generated by this rule. */
4978 				ipfw_state_flush(ctx, rule);
4979 			}
4980 			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4981 				/* Flush tracks generated by this rule. */
4982 				ipfw_track_flush(ctx, rule);
4983 			}
4984 			rule = ipfw_delete_rule(ctx, prev, rule);
4985 #ifdef INVARIANTS
4986 			del = 1;
4987 #endif
4988 		} else {
4989 			prev = rule;
4990 			rule = rule->next;
4991 		}
4992 	}
4993 	KASSERT(del, ("no match set?!"));
4994 
4995 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4996 }
4997 
4998 static int
4999 ipfw_alt_delete_ruleset(uint8_t set)
5000 {
5001 	struct netmsg_del dmsg;
5002 	int del;
5003 	struct ip_fw *rule;
5004 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5005 
5006 	ASSERT_NETISR0;
5007 
5008 	/*
5009 	 * Check whether the 'set' exists at all.  If it
5010 	 * does not, there are no rules to delete and we
5011 	 * simply return.
5012 	 */
5013 	del = 0;
5014 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5015 		if (rule->set == set)
5016 			del = 1;
5017 	}
5018 	if (!del)
5019 		return 0; /* XXX EINVAL? */
5020 
5021 	/*
5022 	 * Delete this set
5023 	 */
5024 	bzero(&dmsg, sizeof(dmsg));
5025 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5026 	    ipfw_alt_delete_ruleset_dispatch);
5027 	dmsg.from_set = set;
5028 	netisr_domsg_global(&dmsg.base);
5029 
5030 	return 0;
5031 }
5032 
5033 static void
5034 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5035 {
5036 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5037 	struct ip_fw *rule;
5038 
5039 	ASSERT_NETISR_NCPUS(mycpuid);
5040 
5041 	rule = dmsg->start_rule;
5042 	KKASSERT(rule->cpuid == mycpuid);
5043 
5044 	/*
5045 	 * Move to the position on the next CPU
5046 	 * before the msg is forwarded.
5047 	 */
5048 	dmsg->start_rule = rule->sibling;
5049 
5050 	while (rule && rule->rulenum <= dmsg->rulenum) {
5051 		if (rule->rulenum == dmsg->rulenum)
5052 			rule->set = dmsg->to_set;
5053 		rule = rule->next;
5054 	}
5055 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5056 }
5057 
5058 static int
5059 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5060 {
5061 	struct netmsg_del dmsg;
5062 	struct netmsg_base *nmsg;
5063 	struct ip_fw *rule;
5064 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5065 
5066 	ASSERT_NETISR0;
5067 
5068 	/*
5069 	 * Locate first rule to move
5070 	 */
5071 	for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5072 	     rule = rule->next) {
5073 		if (rule->rulenum == rulenum && rule->set != set)
5074 			break;
5075 	}
5076 	if (rule == NULL || rule->rulenum > rulenum)
5077 		return 0; /* XXX error? */
5078 
5079 	bzero(&dmsg, sizeof(dmsg));
5080 	nmsg = &dmsg.base;
5081 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5082 	    ipfw_alt_move_rule_dispatch);
5083 	dmsg.start_rule = rule;
5084 	dmsg.rulenum = rulenum;
5085 	dmsg.to_set = set;
5086 
5087 	netisr_domsg_global(nmsg);
5088 	KKASSERT(dmsg.start_rule == NULL);
5089 	return 0;
5090 }
5091 
5092 static void
5093 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5094 {
5095 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5096 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5097 	struct ip_fw *rule;
5098 
5099 	ASSERT_NETISR_NCPUS(mycpuid);
5100 
5101 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5102 		if (rule->set == dmsg->from_set)
5103 			rule->set = dmsg->to_set;
5104 	}
5105 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5106 }
5107 
5108 static int
5109 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5110 {
5111 	struct netmsg_del dmsg;
5112 	struct netmsg_base *nmsg;
5113 
5114 	ASSERT_NETISR0;
5115 
5116 	bzero(&dmsg, sizeof(dmsg));
5117 	nmsg = &dmsg.base;
5118 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5119 	    ipfw_alt_move_ruleset_dispatch);
5120 	dmsg.from_set = from_set;
5121 	dmsg.to_set = to_set;
5122 
5123 	netisr_domsg_global(nmsg);
5124 	return 0;
5125 }
5126 
5127 static void
5128 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5129 {
5130 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5131 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5132 	struct ip_fw *rule;
5133 
5134 	ASSERT_NETISR_NCPUS(mycpuid);
5135 
5136 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5137 		if (rule->set == dmsg->from_set)
5138 			rule->set = dmsg->to_set;
5139 		else if (rule->set == dmsg->to_set)
5140 			rule->set = dmsg->from_set;
5141 	}
5142 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5143 }
5144 
5145 static int
5146 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5147 {
5148 	struct netmsg_del dmsg;
5149 	struct netmsg_base *nmsg;
5150 
5151 	ASSERT_NETISR0;
5152 
5153 	bzero(&dmsg, sizeof(dmsg));
5154 	nmsg = &dmsg.base;
5155 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5156 	    ipfw_alt_swap_ruleset_dispatch);
5157 	dmsg.from_set = set1;
5158 	dmsg.to_set = set2;
5159 
5160 	netisr_domsg_global(nmsg);
5161 	return 0;
5162 }
5163 
5164 /*
5165  * Remove all rules with given number, and also do set manipulation.
5166  *
5167  * The argument is a uint32_t.  The low 16 bits are the rule or set number,
5168  * the next 8 bits are the new set, the top 8 bits are the command:
5169  *
5170  *	0	delete rules with given number
5171  *	1	delete rules with given set number
5172  *	2	move rules with given number to new set
5173  *	3	move rules with given set number to new set
5174  *	4	swap sets with given numbers
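 *
 * For example (illustrative values): arg = 0x04020005 decodes to
 * cmd 4, new_set 2 and set number 5, i.e. swap rule sets 5 and 2.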
5175  */
5176 static int
5177 ipfw_ctl_alter(uint32_t arg)
5178 {
5179 	uint16_t rulenum;
5180 	uint8_t cmd, new_set;
5181 	int error = 0;
5182 
5183 	ASSERT_NETISR0;
5184 
5185 	rulenum = arg & 0xffff;
5186 	cmd = (arg >> 24) & 0xff;
5187 	new_set = (arg >> 16) & 0xff;
5188 
5189 	if (cmd > 4)
5190 		return EINVAL;
5191 	if (new_set >= IPFW_DEFAULT_SET)
5192 		return EINVAL;
5193 	if (cmd == 0 || cmd == 2) {
5194 		if (rulenum == IPFW_DEFAULT_RULE)
5195 			return EINVAL;
5196 	} else {
5197 		if (rulenum >= IPFW_DEFAULT_SET)
5198 			return EINVAL;
5199 	}
5200 
5201 	switch (cmd) {
5202 	case 0:	/* delete rules with given number */
5203 		error = ipfw_alt_delete_rule(rulenum);
5204 		break;
5205 
5206 	case 1:	/* delete all rules with given set number */
5207 		error = ipfw_alt_delete_ruleset(rulenum);
5208 		break;
5209 
5210 	case 2:	/* move rules with given number to new set */
5211 		error = ipfw_alt_move_rule(rulenum, new_set);
5212 		break;
5213 
5214 	case 3: /* move rules with given set number to new set */
5215 		error = ipfw_alt_move_ruleset(rulenum, new_set);
5216 		break;
5217 
5218 	case 4: /* swap two sets */
5219 		error = ipfw_alt_swap_ruleset(rulenum, new_set);
5220 		break;
5221 	}
5222 	return error;
5223 }
5224 
5225 /*
5226  * Clear counters for a specific rule.
5227  */
5228 static void
5229 clear_counters(struct ip_fw *rule, int log_only)
5230 {
5231 	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5232 
5233 	if (log_only == 0) {
5234 		rule->bcnt = rule->pcnt = 0;
5235 		rule->timestamp = 0;
5236 	}
5237 	if (l->o.opcode == O_LOG)
5238 		l->log_left = l->max_log;
5239 }
5240 
5241 static void
5242 ipfw_zero_entry_dispatch(netmsg_t nmsg)
5243 {
5244 	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
5245 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5246 	struct ip_fw *rule;
5247 
5248 	ASSERT_NETISR_NCPUS(mycpuid);
5249 
5250 	if (zmsg->rulenum == 0) {
5251 		KKASSERT(zmsg->start_rule == NULL);
5252 
5253 		ctx->ipfw_norule_counter = 0;
5254 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5255 			clear_counters(rule, zmsg->log_only);
5256 	} else {
5257 		struct ip_fw *start = zmsg->start_rule;
5258 
5259 		KKASSERT(start->cpuid == mycpuid);
5260 		KKASSERT(start->rulenum == zmsg->rulenum);
5261 
5262 		/*
5263 		 * We can have multiple rules with the same number, so we
5264 		 * need to clear them all.
5265 		 */
5266 		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
5267 		     rule = rule->next)
5268 			clear_counters(rule, zmsg->log_only);
5269 
5270 		/*
5271 		 * Move to the position on the next CPU
5272 		 * before the msg is forwarded.
5273 		 */
5274 		zmsg->start_rule = start->sibling;
5275 	}
5276 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5277 }
5278 
5279 /*
5280  * Reset some or all counters on firewall rules.
5281  * @arg rulenum is 0 to clear all entries, or contains a specific
5282  * rule number.
5283  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5284  */
5285 static int
5286 ipfw_ctl_zero_entry(int rulenum, int log_only)
5287 {
5288 	struct netmsg_zent zmsg;
5289 	struct netmsg_base *nmsg;
5290 	const char *msg;
5291 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5292 
5293 	ASSERT_NETISR0;
5294 
5295 	bzero(&zmsg, sizeof(zmsg));
5296 	nmsg = &zmsg.base;
5297 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5298 	    ipfw_zero_entry_dispatch);
5299 	zmsg.log_only = log_only;
5300 
5301 	if (rulenum == 0) {
5302 		msg = log_only ? "ipfw: All logging counts reset.\n"
5303 			       : "ipfw: Accounting cleared.\n";
5304 	} else {
5305 		struct ip_fw *rule;
5306 
5307 		/*
5308 		 * Locate the first rule with 'rulenum'
5309 		 */
5310 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5311 			if (rule->rulenum == rulenum)
5312 				break;
5313 		}
5314 		if (rule == NULL) /* we did not find any matching rules */
5315 			return (EINVAL);
5316 		zmsg.start_rule = rule;
5317 		zmsg.rulenum = rulenum;
5318 
5319 		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
5320 			       : "ipfw: Entry %d cleared.\n";
5321 	}
5322 	netisr_domsg_global(nmsg);
5323 	KKASSERT(zmsg.start_rule == NULL);
5324 
5325 	if (fw_verbose)
5326 		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
5327 	return (0);
5328 }
5329 
5330 /*
5331  * Check validity of the structure before insert.
5332  * Fortunately rules are simple, so this mostly needs to check rule sizes.
5333  */
5334 static int
5335 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5336 {
5337 	int l, cmdlen = 0;
5338 	int have_action = 0;
5339 	ipfw_insn *cmd;
5340 
5341 	*rule_flags = 0;
5342 
5343 	/* Check for valid size */
5344 	if (size < sizeof(*rule)) {
5345 		kprintf("ipfw: rule too short\n");
5346 		return EINVAL;
5347 	}
5348 	l = IOC_RULESIZE(rule);
5349 	if (l != size) {
5350 		kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5351 		return EINVAL;
5352 	}
5353 
5354 	/* Check rule number */
5355 	if (rule->rulenum == IPFW_DEFAULT_RULE) {
5356 		kprintf("ipfw: invalid rule number\n");
5357 		return EINVAL;
5358 	}
5359 
5360 	/*
5361 	 * Now go for the individual checks. Very simple ones, basically only
5362 	 * instruction sizes.
5363 	 */
5364 	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5365 	     l -= cmdlen, cmd += cmdlen) {
5366 		cmdlen = F_LEN(cmd);
5367 		if (cmdlen > l) {
5368 			kprintf("ipfw: opcode %d size truncated\n",
5369 				cmd->opcode);
5370 			return EINVAL;
5371 		}
5372 
5373 		DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5374 
5375 		if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5376 		    IPFW_ISXLAT(cmd->opcode)) {
5377 			/* This rule will generate states. */
5378 			*rule_flags |= IPFW_RULE_F_GENSTATE;
5379 			if (cmd->opcode == O_LIMIT)
5380 				*rule_flags |= IPFW_RULE_F_GENTRACK;
5381 		}
5382 		if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5383 			*rule_flags |= IPFW_RULE_F_CROSSREF;
5384 		if (cmd->opcode == O_IP_SRC_IFIP ||
5385 		    cmd->opcode == O_IP_DST_IFIP) {
5386 			*rule_flags |= IPFW_RULE_F_DYNIFADDR;
5387 			cmd->arg1 &= IPFW_IFIP_SETTINGS;
5388 		}
5389 
5390 		switch (cmd->opcode) {
5391 		case O_NOP:
5392 		case O_PROBE_STATE:
5393 		case O_KEEP_STATE:
5394 		case O_PROTO:
5395 		case O_IP_SRC_ME:
5396 		case O_IP_DST_ME:
5397 		case O_LAYER2:
5398 		case O_IN:
5399 		case O_FRAG:
5400 		case O_IPFRAG:
5401 		case O_IPOPT:
5402 		case O_IPLEN:
5403 		case O_IPID:
5404 		case O_IPTOS:
5405 		case O_IPPRECEDENCE:
5406 		case O_IPTTL:
5407 		case O_IPVER:
5408 		case O_TCPWIN:
5409 		case O_TCPFLAGS:
5410 		case O_TCPOPTS:
5411 		case O_ESTAB:
5412 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5413 				goto bad_size;
5414 			break;
5415 
5416 		case O_IP_SRC_TABLE:
5417 		case O_IP_DST_TABLE:
5418 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5419 				goto bad_size;
5420 			if (cmd->arg1 >= ipfw_table_max) {
5421 				kprintf("ipfw: invalid table id %u, max %d\n",
5422 				    cmd->arg1, ipfw_table_max);
5423 				return EINVAL;
5424 			}
5425 			break;
5426 
5427 		case O_IP_SRC_IFIP:
5428 		case O_IP_DST_IFIP:
5429 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5430 				goto bad_size;
5431 			break;
5432 
5433 		case O_UID:
5434 		case O_GID:
5435 		case O_IP_SRC:
5436 		case O_IP_DST:
5437 		case O_TCPSEQ:
5438 		case O_TCPACK:
5439 		case O_PROB:
5440 		case O_ICMPTYPE:
5441 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5442 				goto bad_size;
5443 			break;
5444 
5445 		case O_LIMIT:
5446 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5447 				goto bad_size;
5448 			break;
5449 		case O_REDIRECT:
5450 			if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5451 				goto bad_size;
5452 			break;
5453 
5454 		case O_LOG:
5455 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5456 				goto bad_size;
5457 
5458 			((ipfw_insn_log *)cmd)->log_left =
5459 			    ((ipfw_insn_log *)cmd)->max_log;
5460 
5461 			break;
5462 
5463 		case O_IP_SRC_MASK:
5464 		case O_IP_DST_MASK:
5465 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5466 				goto bad_size;
5467 			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5468 				kprintf("ipfw: opcode %d, useless rule\n",
5469 					cmd->opcode);
5470 				return EINVAL;
5471 			}
5472 			break;
5473 
5474 		case O_IP_SRC_SET:
5475 		case O_IP_DST_SET:
5476 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5477 				kprintf("ipfw: invalid set size %d\n",
5478 					cmd->arg1);
5479 				return EINVAL;
5480 			}
5481 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5482 			    (cmd->arg1+31)/32 )
5483 				goto bad_size;
5484 			break;
5485 
5486 		case O_MACADDR2:
5487 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5488 				goto bad_size;
5489 			break;
5490 
5491 		case O_MAC_TYPE:
5492 		case O_IP_SRCPORT:
5493 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5494 			if (cmdlen < 2 || cmdlen > 31)
5495 				goto bad_size;
5496 			break;
5497 
5498 		case O_RECV:
5499 		case O_XMIT:
5500 		case O_VIA:
5501 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5502 				goto bad_size;
5503 			break;
5504 
5505 		case O_PIPE:
5506 		case O_QUEUE:
5507 			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5508 				goto bad_size;
5509 			goto check_action;
5510 
5511 		case O_FORWARD_IP:
5512 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5513 				goto bad_size;
5514 			} else {
5515 				in_addr_t fwd_addr;
5516 
5517 				fwd_addr = ((ipfw_insn_sa *)cmd)->
5518 					   sa.sin_addr.s_addr;
5519 				if (IN_MULTICAST(ntohl(fwd_addr))) {
5520 					kprintf("ipfw: try forwarding to "
5521 						"multicast address\n");
5522 					return EINVAL;
5523 				}
5524 			}
5525 			goto check_action;
5526 
5527 		case O_FORWARD_MAC: /* XXX not implemented yet */
5528 		case O_CHECK_STATE:
5529 		case O_COUNT:
5530 		case O_ACCEPT:
5531 		case O_DENY:
5532 		case O_REJECT:
5533 		case O_SKIPTO:
5534 		case O_DIVERT:
5535 		case O_TEE:
5536 		case O_DEFRAG:
5537 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5538 				goto bad_size;
5539 check_action:
5540 			if (have_action) {
5541 				kprintf("ipfw: opcode %d, multiple actions"
5542 					" not allowed\n",
5543 					cmd->opcode);
5544 				return EINVAL;
5545 			}
5546 			have_action = 1;
5547 			if (l != cmdlen) {
5548 				kprintf("ipfw: opcode %d, action must be"
5549 					" last opcode\n",
5550 					cmd->opcode);
5551 				return EINVAL;
5552 			}
5553 			break;
5554 		default:
5555 			kprintf("ipfw: opcode %d, unknown opcode\n",
5556 				cmd->opcode);
5557 			return EINVAL;
5558 		}
5559 	}
5560 	if (have_action == 0) {
5561 		kprintf("ipfw: missing action\n");
5562 		return EINVAL;
5563 	}
5564 	return 0;
5565 
5566 bad_size:
5567 	kprintf("ipfw: opcode %d size %d wrong\n",
5568 		cmd->opcode, cmdlen);
5569 	return EINVAL;
5570 }
5571 
5572 static int
5573 ipfw_ctl_add_rule(struct sockopt *sopt)
5574 {
5575 	struct ipfw_ioc_rule *ioc_rule;
5576 	size_t size;
5577 	uint32_t rule_flags;
5578 	int error;
5579 
5580 	ASSERT_NETISR0;
5581 
5582 	size = sopt->sopt_valsize;
5583 	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5584 	    size < sizeof(*ioc_rule)) {
5585 		return EINVAL;
5586 	}
5587 	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5588 		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5589 					  IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5590 	}
5591 	ioc_rule = sopt->sopt_val;
5592 
5593 	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5594 	if (error)
5595 		return error;
5596 
5597 	ipfw_add_rule(ioc_rule, rule_flags);
5598 
5599 	if (sopt->sopt_dir == SOPT_GET)
5600 		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5601 	return 0;
5602 }
5603 
5604 static void *
5605 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5606     struct ipfw_ioc_rule *ioc_rule)
5607 {
5608 	const struct ip_fw *sibling;
5609 #ifdef INVARIANTS
5610 	int i;
5611 #endif
5612 
5613 	ASSERT_NETISR0;
5614 	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5615 
5616 	ioc_rule->act_ofs = rule->act_ofs;
5617 	ioc_rule->cmd_len = rule->cmd_len;
5618 	ioc_rule->rulenum = rule->rulenum;
5619 	ioc_rule->set = rule->set;
5620 	ioc_rule->usr_flags = rule->usr_flags;
5621 
5622 	ioc_rule->set_disable = ctx->ipfw_set_disable;
5623 	ioc_rule->static_count = static_count;
5624 	ioc_rule->static_len = static_ioc_len;
5625 
5626 	/*
5627 	 * Visit (read-only) all of the rule's per-CPU duplicates to
5628 	 * collect the necessary statistics.
5629 	 */
5630 #ifdef INVARIANTS
5631 	i = 0;
5632 #endif
5633 	ioc_rule->pcnt = 0;
5634 	ioc_rule->bcnt = 0;
5635 	ioc_rule->timestamp = 0;
5636 	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5637 		ioc_rule->pcnt += sibling->pcnt;
5638 		ioc_rule->bcnt += sibling->bcnt;
5639 		if (sibling->timestamp > ioc_rule->timestamp)
5640 			ioc_rule->timestamp = sibling->timestamp;
5641 #ifdef INVARIANTS
5642 		++i;
5643 #endif
5644 	}
5645 	KASSERT(i == netisr_ncpus,
5646 	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5647 
5648 	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
5649 
5650 	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5651 }
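
/*
 * Worked example for the aggregation above (hypothetical numbers):
 * with netisr_ncpus == 2, a rule whose cpu0 copy accumulated
 * pcnt=10/bcnt=600 and whose cpu1 sibling accumulated pcnt=4/bcnt=240
 * is exported to userland as one rule with pcnt=14, bcnt=840, and the
 * newer of the two timestamps.
 */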
5652 
5653 static boolean_t
5654 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5655 {
5656 	struct ipfw_ioc_flowid *ioc_id;
5657 
5658 	if (trk->tc_expire == 0) {
5659 		/* Not a scanned one. */
5660 		return (FALSE);
5661 	}
5662 
5663 	ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5664 	    0 : trk->tc_expire - time_uptime;
5665 	ioc_state->pcnt = 0;
5666 	ioc_state->bcnt = 0;
5667 
5668 	ioc_state->dyn_type = O_LIMIT_PARENT;
5669 	ioc_state->count = trk->tc_count;
5670 
5671 	ioc_state->rulenum = trk->tc_rulenum;
5672 
5673 	ioc_id = &ioc_state->id;
5674 	ioc_id->type = ETHERTYPE_IP;
5675 	ioc_id->u.ip.proto = trk->tc_proto;
5676 	ioc_id->u.ip.src_ip = trk->tc_saddr;
5677 	ioc_id->u.ip.dst_ip = trk->tc_daddr;
5678 	ioc_id->u.ip.src_port = trk->tc_sport;
5679 	ioc_id->u.ip.dst_port = trk->tc_dport;
5680 
5681 	return (TRUE);
5682 }
5683 
5684 static boolean_t
5685 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5686 {
5687 	struct ipfw_ioc_flowid *ioc_id;
5688 
5689 	if (IPFW_STATE_SCANSKIP(s))
5690 		return (FALSE);
5691 
5692 	ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5693 	    0 : s->st_expire - time_uptime;
5694 	ioc_state->pcnt = s->st_pcnt;
5695 	ioc_state->bcnt = s->st_bcnt;
5696 
5697 	ioc_state->dyn_type = s->st_type;
5698 	ioc_state->count = 0;
5699 
5700 	ioc_state->rulenum = s->st_rule->rulenum;
5701 
5702 	ioc_id = &ioc_state->id;
5703 	ioc_id->type = ETHERTYPE_IP;
5704 	ioc_id->u.ip.proto = s->st_proto;
5705 	ipfw_key_4tuple(&s->st_key,
5706 	    &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5707 	    &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5708 
5709 	if (IPFW_ISXLAT(s->st_type)) {
5710 		const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5711 
5712 		if (x->xlat_port == 0)
5713 			ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5714 		else
5715 			ioc_state->xlat_port = ntohs(x->xlat_port);
5716 		ioc_state->xlat_addr = ntohl(x->xlat_addr);
5717 
5718 		ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5719 		ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5720 	}
5721 
5722 	return (TRUE);
5723 }
5724 
5725 static void
5726 ipfw_state_copy_dispatch(netmsg_t nmsg)
5727 {
5728 	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
5729 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5730 	const struct ipfw_state *s;
5731 	const struct ipfw_track *t;
5732 
5733 	ASSERT_NETISR_NCPUS(mycpuid);
5734 	KASSERT(nm->state_cnt < nm->state_cntmax,
5735 	    ("invalid state count %d, max %d",
5736 	     nm->state_cnt, nm->state_cntmax));
5737 
5738 	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
5739 		if (ipfw_state_copy(s, nm->ioc_state)) {
5740 			nm->ioc_state++;
5741 			nm->state_cnt++;
5742 			if (nm->state_cnt == nm->state_cntmax)
5743 				goto done;
5744 		}
5745 	}
5746 
5747 	/*
5748 	 * Prepare tracks in the global track tree for userland.
5749 	 */
5750 	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
5751 		struct ipfw_trkcnt *trk;
5752 
5753 		if (t->t_count == NULL) /* anchor */
5754 			continue;
5755 		trk = t->t_trkcnt;
5756 
5757 		/*
5758 		 * Only one netisr can run this function at
5759 		 * any time, and only this function accesses
5760 		 * trkcnt's tc_expire, so this is safe w/o
5761 		 * ipfw_gd.ipfw_trkcnt_token.
5762 		 */
5763 		if (trk->tc_expire > t->t_expire)
5764 			continue;
5765 		trk->tc_expire = t->t_expire;
5766 	}
5767 
5768 	/*
5769 	 * Copy tracks in the global track tree to userland in
5770 	 * the last netisr.
5771 	 */
5772 	if (mycpuid == netisr_ncpus - 1) {
5773 		struct ipfw_trkcnt *trk;
5774 
5775 		KASSERT(nm->state_cnt < nm->state_cntmax,
5776 		    ("invalid state count %d, max %d",
5777 		     nm->state_cnt, nm->state_cntmax));
5778 
5779 		IPFW_TRKCNT_TOKGET;
5780 		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5781 			if (ipfw_track_copy(trk, nm->ioc_state)) {
5782 				nm->ioc_state++;
5783 				nm->state_cnt++;
5784 				if (nm->state_cnt == nm->state_cntmax) {
5785 					IPFW_TRKCNT_TOKREL;
5786 					goto done;
5787 				}
5788 			}
5789 		}
5790 		IPFW_TRKCNT_TOKREL;
5791 	}
5792 done:
5793 	if (nm->state_cnt == nm->state_cntmax) {
5794 		/* No more space; done. */
5795 		netisr_replymsg(&nm->base, 0);
5796 	} else {
5797 		netisr_forwardmsg(&nm->base, mycpuid + 1);
5798 	}
5799 }
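
/*
 * The dispatch above runs as a forwarded netisr message chain:
 * netisr0 kicks it off, each CPU appends its own states to the
 * shared buffer and forwards the same message to the next CPU, and
 * the chain replies early once the preallocated buffer fills up.
 * Tracks live in the global tree, so they are copied exactly once,
 * by the last CPU in the chain.
 */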
5800 
5801 static int
5802 ipfw_ctl_get_rules(struct sockopt *sopt)
5803 {
5804 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5805 	struct ip_fw *rule;
5806 	void *bp;
5807 	size_t size;
5808 	int state_cnt;
5809 
5810 	ASSERT_NETISR0;
5811 
5812 	/*
5813 	 * Pass up a copy of the current rules.  Static rules
5814 	 * come first (the last of which has number IPFW_DEFAULT_RULE),
5815 	 * followed by a possibly empty list of states.
5816 	 */
5817 
5818 	size = static_ioc_len;	/* size of static rules */
5819 
5820 	/*
5821 	 * Size of the states.
5822 	 * XXX take tracks as state for userland compat.
5823 	 */
5824 	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5825 	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5826 	size += state_cnt * sizeof(struct ipfw_ioc_state);
5827 
5828 	if (sopt->sopt_valsize < size) {
5829 		/* short length, no need to return incomplete rules */
5830 		/* XXX: if superuser, no need to zero buffer */
5831 		bzero(sopt->sopt_val, sopt->sopt_valsize);
5832 		return 0;
5833 	}
5834 	bp = sopt->sopt_val;
5835 
5836 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5837 		bp = ipfw_copy_rule(ctx, rule, bp);
5838 
5839 	if (state_cnt) {
5840 		struct netmsg_cpstate nm;
5841 #ifdef INVARIANTS
5842 		size_t old_size = size;
5843 #endif
5844 
5845 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5846 		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
5847 		nm.ioc_state = bp;
5848 		nm.state_cntmax = state_cnt;
5849 		nm.state_cnt = 0;
5850 		netisr_domsg_global(&nm.base);
5851 
5852 		/*
5853 		 * The # of states may have shrunk after the snapshot
5854 		 * of the state count was taken.  To give userland the
5855 		 * correct state count, nm.state_cnt is used to
5856 		 * recalculate the actual size.
5857 		 */
5858 		size = static_ioc_len +
5859 		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5860 		KKASSERT(size <= old_size);
5861 	}
5862 
5863 	sopt->sopt_valsize = size;
5864 	return 0;
5865 }
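
/*
 * Usage sketch (illustrative only): the snapshot may grow between a
 * size probe and the actual copy, which is why 25% headroom is added
 * above and why userland conventionally over-allocates and retries,
 * roughly:
 *
 *	socklen_t len = guess;
 *	void *buf = malloc(len);
 *	getsockopt(s, IPPROTO_IP, IP_FW_GET, buf, &len);
 *	(an all-zero buffer indicates the guess was too small)
 */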
5866 
5867 static void
5868 ipfw_set_disable_dispatch(netmsg_t nmsg)
5869 {
5870 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5871 
5872 	ASSERT_NETISR_NCPUS(mycpuid);
5873 
5874 	ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5875 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5876 }
5877 
5878 static void
5879 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5880 {
5881 	struct netmsg_base nmsg;
5882 	uint32_t set_disable;
5883 
5884 	ASSERT_NETISR0;
5885 
5886 	/* IPFW_DEFAULT_SET is always enabled */
5887 	enable |= (1 << IPFW_DEFAULT_SET);
5888 	set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5889 
5890 	bzero(&nmsg, sizeof(nmsg));
5891 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5892 	    ipfw_set_disable_dispatch);
5893 	nmsg.lmsg.u.ms_result32 = set_disable;
5894 
5895 	netisr_domsg_global(&nmsg);
5896 }
5897 
5898 static void
5899 ipfw_table_create_dispatch(netmsg_t nm)
5900 {
5901 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5902 	int tblid = nm->lmsg.u.ms_result;
5903 
5904 	ASSERT_NETISR_NCPUS(mycpuid);
5905 
5906 	if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5907 	    rn_cpumaskhead(mycpuid), 32))
5908 		panic("ipfw: create table%d failed", tblid);
5909 
5910 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5911 }
5912 
5913 static int
5914 ipfw_table_create(struct sockopt *sopt)
5915 {
5916 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5917 	struct ipfw_ioc_table *tbl;
5918 	struct netmsg_base nm;
5919 
5920 	ASSERT_NETISR0;
5921 
5922 	if (sopt->sopt_valsize != sizeof(*tbl))
5923 		return (EINVAL);
5924 
5925 	tbl = sopt->sopt_val;
5926 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5927 		return (EINVAL);
5928 
5929 	if (ctx->ipfw_tables[tbl->tableid] != NULL)
5930 		return (EEXIST);
5931 
5932 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5933 	    ipfw_table_create_dispatch);
5934 	nm.lmsg.u.ms_result = tbl->tableid;
5935 	netisr_domsg_global(&nm);
5936 
5937 	return (0);
5938 }
5939 
5940 static void
5941 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5942 {
5943 	struct radix_node *ret;
5944 
5945 	ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5946 	if (ret != rn)
5947 		panic("deleted other table entry");
5948 	kfree(ret, M_IPFW);
5949 }
5950 
5951 static int
5952 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5953 {
5954 
5955 	ipfw_table_killrn(xrnh, rn);
5956 	return (0);
5957 }
5958 
5959 static void
5960 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5961     int destroy)
5962 {
5963 	struct radix_node_head *rnh;
5964 
5965 	ASSERT_NETISR_NCPUS(mycpuid);
5966 
5967 	rnh = ctx->ipfw_tables[tableid];
5968 	rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5969 	if (destroy) {
5970 		Free(rnh);
5971 		ctx->ipfw_tables[tableid] = NULL;
5972 	}
5973 }
5974 
5975 static void
5976 ipfw_table_flush_dispatch(netmsg_t nmsg)
5977 {
5978 	struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
5979 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5980 
5981 	ASSERT_NETISR_NCPUS(mycpuid);
5982 
5983 	ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
5984 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5985 }
5986 
5987 static void
5988 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
5989 {
5990 	int i;
5991 
5992 	ASSERT_NETISR_NCPUS(mycpuid);
5993 
5994 	for (i = 0; i < ipfw_table_max; ++i) {
5995 		if (ctx->ipfw_tables[i] != NULL)
5996 			ipfw_table_flush_oncpu(ctx, i, destroy);
5997 	}
5998 }
5999 
6000 static void
6001 ipfw_table_flushall_dispatch(netmsg_t nmsg)
6002 {
6003 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6004 
6005 	ASSERT_NETISR_NCPUS(mycpuid);
6006 
6007 	ipfw_table_flushall_oncpu(ctx, 0);
6008 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6009 }
6010 
6011 static int
6012 ipfw_table_flush(struct sockopt *sopt)
6013 {
6014 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6015 	struct ipfw_ioc_table *tbl;
6016 	struct netmsg_tblflush nm;
6017 
6018 	ASSERT_NETISR0;
6019 
6020 	if (sopt->sopt_valsize != sizeof(*tbl))
6021 		return (EINVAL);
6022 
6023 	tbl = sopt->sopt_val;
6024 	if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6025 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6026 		    MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6027 		netisr_domsg_global(&nm.base);
6028 		return (0);
6029 	}
6030 
6031 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6032 		return (EINVAL);
6033 
6034 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6035 		return (ENOENT);
6036 
6037 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6038 	    ipfw_table_flush_dispatch);
6039 	nm.tableid = tbl->tableid;
6040 	nm.destroy = 0;
6041 	if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6042 		nm.destroy = 1;
6043 	netisr_domsg_global(&nm.base);
6044 
6045 	return (0);
6046 }
6047 
6048 static int
6049 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6050 {
6051 	int *cnt = xcnt;
6052 
6053 	(*cnt)++;
6054 	return (0);
6055 }
6056 
6057 static int
6058 ipfw_table_cpent(struct radix_node *rn, void *xcp)
6059 {
6060 	struct ipfw_table_cp *cp = xcp;
6061 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6062 	struct ipfw_ioc_tblent *ioc_te;
6063 #ifdef INVARIANTS
6064 	int cnt;
6065 #endif
6066 
6067 	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
6068 	    cp->te_idx, cp->te_cnt));
6069 	ioc_te = &cp->te[cp->te_idx];
6070 
6071 	if (te->te_nodes->rn_mask != NULL) {
6072 		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
6073 		    *te->te_nodes->rn_mask);
6074 	} else {
6075 		ioc_te->netmask.sin_len = 0;
6076 	}
6077 	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
6078 
6079 	ioc_te->use = te->te_use;
6080 	ioc_te->last_used = te->te_lastuse;
6081 #ifdef INVARIANTS
6082 	cnt = 1;
6083 #endif
6084 
6085 	while ((te = te->te_sibling) != NULL) {
6086 #ifdef INVARIANTS
6087 		++cnt;
6088 #endif
6089 		ioc_te->use += te->te_use;
6090 		if (te->te_lastuse > ioc_te->last_used)
6091 			ioc_te->last_used = te->te_lastuse;
6092 	}
6093 	KASSERT(cnt == netisr_ncpus,
6094 	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
6095 
6096 	cp->te_idx++;
6097 
6098 	return (0);
6099 }
6100 
6101 static int
6102 ipfw_table_get(struct sockopt *sopt)
6103 {
6104 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6105 	struct radix_node_head *rnh;
6106 	struct ipfw_ioc_table *tbl;
6107 	struct ipfw_ioc_tblcont *cont;
6108 	struct ipfw_table_cp cp;
6109 	int cnt = 0, sz;
6110 
6111 	ASSERT_NETISR0;
6112 
6113 	if (sopt->sopt_valsize < sizeof(*tbl))
6114 		return (EINVAL);
6115 
6116 	tbl = sopt->sopt_val;
6117 	if (tbl->tableid < 0) {
6118 		struct ipfw_ioc_tbllist *list;
6119 		int i;
6120 
6121 		/*
6122 		 * List available table ids.
6123 		 */
6124 		for (i = 0; i < ipfw_table_max; ++i) {
6125 			if (ctx->ipfw_tables[i] != NULL)
6126 				++cnt;
6127 		}
6128 
6129 		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
6130 		if (sopt->sopt_valsize < sz) {
6131 			bzero(sopt->sopt_val, sopt->sopt_valsize);
6132 			return (E2BIG);
6133 		}
6134 		list = sopt->sopt_val;
6135 		list->tablecnt = cnt;
6136 
6137 		cnt = 0;
6138 		for (i = 0; i < ipfw_table_max; ++i) {
6139 			if (ctx->ipfw_tables[i] != NULL) {
6140 				KASSERT(cnt < list->tablecnt,
6141 				    ("invalid idx %d, cnt %d",
6142 				     cnt, list->tablecnt));
6143 				list->tables[cnt++] = i;
6144 			}
6145 		}
6146 		sopt->sopt_valsize = sz;
6147 		return (0);
6148 	} else if (tbl->tableid >= ipfw_table_max) {
6149 		return (EINVAL);
6150 	}
6151 
6152 	rnh = ctx->ipfw_tables[tbl->tableid];
6153 	if (rnh == NULL)
6154 		return (ENOENT);
6155 	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
6156 
6157 	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
6158 	if (sopt->sopt_valsize < sz) {
6159 		bzero(sopt->sopt_val, sopt->sopt_valsize);
6160 		return (E2BIG);
6161 	}
6162 	cont = sopt->sopt_val;
6163 	cont->entcnt = cnt;
6164 
6165 	cp.te = cont->ent;
6166 	cp.te_idx = 0;
6167 	cp.te_cnt = cnt;
6168 	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
6169 
6170 	sopt->sopt_valsize = sz;
6171 	return (0);
6172 }
6173 
6174 static void
6175 ipfw_table_add_dispatch(netmsg_t nmsg)
6176 {
6177 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6178 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6179 	struct radix_node_head *rnh;
6180 	struct ipfw_tblent *te;
6181 
6182 	ASSERT_NETISR_NCPUS(mycpuid);
6183 
6184 	rnh = ctx->ipfw_tables[nm->tableid];
6185 
6186 	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6187 	te->te_nodes->rn_key = (char *)&te->te_key;
6188 	memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6189 
6190 	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
6191 	    te->te_nodes) == NULL) {
6192 		if (mycpuid == 0) {
6193 			kfree(te, M_IPFW);
6194 			netisr_replymsg(&nm->base, EEXIST);
6195 			return;
6196 		}
6197 		panic("rnh_addaddr failed");
6198 	}
6199 
6200 	/* Link siblings. */
6201 	if (nm->sibling != NULL)
6202 		nm->sibling->te_sibling = te;
6203 	nm->sibling = te;
6204 
6205 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6206 }
6207 
6208 static void
6209 ipfw_table_del_dispatch(netmsg_t nmsg)
6210 {
6211 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6212 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6213 	struct radix_node_head *rnh;
6214 	struct radix_node *rn;
6215 
6216 	ASSERT_NETISR_NCPUS(mycpuid);
6217 
6218 	rnh = ctx->ipfw_tables[nm->tableid];
6219 	rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6220 	if (rn == NULL) {
6221 		if (mycpuid == 0) {
6222 			netisr_replymsg(&nm->base, ESRCH);
6223 			return;
6224 		}
6225 		panic("rnh_deladdr failed");
6226 	}
6227 	kfree(rn, M_IPFW);
6228 
6229 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6230 }
6231 
6232 static int
6233 ipfw_table_alt(struct sockopt *sopt)
6234 {
6235 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6236 	struct ipfw_ioc_tblcont *tbl;
6237 	struct ipfw_ioc_tblent *te;
6238 	struct sockaddr_in key0;
6239 	struct sockaddr *netmask = NULL, *key;
6240 	struct netmsg_tblent nm;
6241 
6242 	ASSERT_NETISR0;
6243 
6244 	if (sopt->sopt_valsize != sizeof(*tbl))
6245 		return (EINVAL);
6246 	tbl = sopt->sopt_val;
6247 
6248 	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
6249 		return (EINVAL);
6250 	if (tbl->entcnt != 1)
6251 		return (EINVAL);
6252 
6253 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6254 		return (ENOENT);
6255 	te = &tbl->ent[0];
6256 
6257 	if (te->key.sin_family != AF_INET ||
6258 	    te->key.sin_port != 0 ||
6259 	    te->key.sin_len != sizeof(struct sockaddr_in))
6260 		return (EINVAL);
6261 	key = (struct sockaddr *)&te->key;
6262 
6263 	if (te->netmask.sin_len != 0) {
6264 		if (te->netmask.sin_port != 0 ||
6265 		    te->netmask.sin_len > sizeof(struct sockaddr_in))
6266 			return (EINVAL);
6267 		netmask = (struct sockaddr *)&te->netmask;
6268 		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
6269 		key = (struct sockaddr *)&key0;
6270 	}
6271 
6272 	if (sopt->sopt_name == IP_FW_TBL_ADD) {
6273 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6274 		    MSGF_PRIORITY, ipfw_table_add_dispatch);
6275 	} else {
6276 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6277 		    MSGF_PRIORITY, ipfw_table_del_dispatch);
6278 	}
6279 	nm.key = key;
6280 	nm.netmask = netmask;
6281 	nm.tableid = tbl->tableid;
6282 	nm.sibling = NULL;
6283 	return (netisr_domsg_global(&nm.base));
6284 }
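
/*
 * Usage sketch (illustrative only): to add, say, 10.0.0.0/8 to a
 * table, userland fills a single-entry ipfw_ioc_tblcont with an
 * AF_INET key and a non-empty netmask, roughly:
 *
 *	struct ipfw_ioc_tblcont cont;
 *
 *	memset(&cont, 0, sizeof(cont));
 *	cont.tableid = id;
 *	cont.entcnt = 1;
 *	cont.ent[0].key.sin_family = AF_INET;
 *	cont.ent[0].key.sin_len = sizeof(struct sockaddr_in);
 *	cont.ent[0].key.sin_addr.s_addr = inet_addr("10.0.0.0");
 *	cont.ent[0].netmask.sin_len = sizeof(struct sockaddr_in);
 *	cont.ent[0].netmask.sin_addr.s_addr = htonl(0xff000000);
 *	setsockopt(s, IPPROTO_IP, IP_FW_TBL_ADD, &cont, sizeof(cont));
 */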
6285 
6286 static int
6287 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6288 {
6289 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6290 
6291 	te->te_use = 0;
6292 	te->te_lastuse = 0;
6293 	return (0);
6294 }
6295 
6296 static void
6297 ipfw_table_zero_dispatch(netmsg_t nmsg)
6298 {
6299 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6300 	struct radix_node_head *rnh;
6301 
6302 	ASSERT_NETISR_NCPUS(mycpuid);
6303 
6304 	rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6305 	rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6306 
6307 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6308 }
6309 
6310 static void
6311 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6312 {
6313 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6314 	int i;
6315 
6316 	ASSERT_NETISR_NCPUS(mycpuid);
6317 
6318 	for (i = 0; i < ipfw_table_max; ++i) {
6319 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6320 
6321 		if (rnh != NULL)
6322 			rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6323 	}
6324 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6325 }
6326 
6327 static int
6328 ipfw_table_zero(struct sockopt *sopt)
6329 {
6330 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6331 	struct netmsg_base nm;
6332 	struct ipfw_ioc_table *tbl;
6333 
6334 	ASSERT_NETISR0;
6335 
6336 	if (sopt->sopt_valsize != sizeof(*tbl))
6337 		return (EINVAL);
6338 	tbl = sopt->sopt_val;
6339 
6340 	if (tbl->tableid < 0) {
6341 		netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6342 		    ipfw_table_zeroall_dispatch);
6343 		netisr_domsg_global(&nm);
6344 		return (0);
6345 	} else if (tbl->tableid >= ipfw_table_max) {
6346 		return (EINVAL);
6347 	} else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6348 		return (ENOENT);
6349 	}
6350 
6351 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6352 	    ipfw_table_zero_dispatch);
6353 	nm.lmsg.u.ms_result = tbl->tableid;
6354 	netisr_domsg_global(&nm);
6355 
6356 	return (0);
6357 }
6358 
6359 static int
6360 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6361 {
6362 	struct netmsg_tblexp *nm = xnm;
6363 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6364 
6365 	if (te->te_expired) {
6366 		ipfw_table_killrn(nm->rnh, rn);
6367 		nm->expcnt++;
6368 	}
6369 	return (0);
6370 }
6371 
6372 static void
6373 ipfw_table_expire_dispatch(netmsg_t nmsg)
6374 {
6375 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6376 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6377 	struct radix_node_head *rnh;
6378 
6379 	ASSERT_NETISR_NCPUS(mycpuid);
6380 
6381 	rnh = ctx->ipfw_tables[nm->tableid];
6382 	nm->rnh = rnh;
6383 	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6384 
6385 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6386 	    ("not all expired addresses (%d) were deleted (%d)",
6387 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6388 
6389 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6390 }
6391 
6392 static void
6393 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6394 {
6395 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6396 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6397 	int i;
6398 
6399 	ASSERT_NETISR_NCPUS(mycpuid);
6400 
6401 	for (i = 0; i < ipfw_table_max; ++i) {
6402 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6403 
6404 		if (rnh == NULL)
6405 			continue;
6406 		nm->rnh = rnh;
6407 		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6408 	}
6409 
6410 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6411 	    ("not all expired addresses (%d) were deleted (%d)",
6412 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6413 
6414 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6415 }
6416 
6417 static int
6418 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6419 {
6420 	struct netmsg_tblexp *nm = xnm;
6421 	struct ipfw_tblent *te;
6422 	time_t lastuse;
6423 
6424 	te = (struct ipfw_tblent *)rn;
6425 	lastuse = te->te_lastuse;
6426 
6427 	while ((te = te->te_sibling) != NULL) {
6428 		if (te->te_lastuse > lastuse)
6429 			lastuse = te->te_lastuse;
6430 	}
6431 	if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6432 		/* Not expired */
6433 		return (0);
6434 	}
6435 
6436 	te = (struct ipfw_tblent *)rn;
6437 	te->te_expired = 1;
6438 	while ((te = te->te_sibling) != NULL)
6439 		te->te_expired = 1;
6440 	nm->cnt++;
6441 
6442 	return (0);
6443 }
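
/*
 * Table expiration is two-phase: netisr0 first walks the tree(s)
 * with ipfw_table_markexp() above, tagging stale entries (and their
 * per-CPU siblings) and counting them in nm->cnt, and only then runs
 * the kill dispatches across all CPUs, each of which must delete
 * exactly nm->cnt entries.  Hence the running invariant asserted
 * after CPU i: expcnt == cnt * (i + 1); e.g. cnt == 3 on a 4-CPU
 * system ends with expcnt == 12.
 */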
6444 
6445 static int
6446 ipfw_table_expire(struct sockopt *sopt)
6447 {
6448 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6449 	struct netmsg_tblexp nm;
6450 	struct ipfw_ioc_tblexp *tbl;
6451 	struct radix_node_head *rnh;
6452 
6453 	ASSERT_NETISR0;
6454 
6455 	if (sopt->sopt_valsize != sizeof(*tbl))
6456 		return (EINVAL);
6457 	tbl = sopt->sopt_val;
6458 	tbl->expcnt = 0;
6459 
6460 	nm.expcnt = 0;
6461 	nm.cnt = 0;
6462 	nm.expire = tbl->expire;
6463 
6464 	if (tbl->tableid < 0) {
6465 		int i;
6466 
6467 		for (i = 0; i < ipfw_table_max; ++i) {
6468 			rnh = ctx->ipfw_tables[i];
6469 			if (rnh == NULL)
6470 				continue;
6471 			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6472 		}
6473 		if (nm.cnt == 0) {
6474 			/* No addresses can be expired. */
6475 			return (0);
6476 		}
6477 		tbl->expcnt = nm.cnt;
6478 
6479 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6480 		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6481 		nm.tableid = -1;
6482 		netisr_domsg_global(&nm.base);
6483 		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6484 		    ("not all expired addresses (%d) were deleted (%d)",
6485 		     nm.cnt * netisr_ncpus, nm.expcnt));
6486 
6487 		return (0);
6488 	} else if (tbl->tableid >= ipfw_table_max) {
6489 		return (EINVAL);
6490 	}
6491 
6492 	rnh = ctx->ipfw_tables[tbl->tableid];
6493 	if (rnh == NULL)
6494 		return (ENOENT);
6495 	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6496 	if (nm.cnt == 0) {
6497 		/* No addresses can be expired. */
6498 		return (0);
6499 	}
6500 	tbl->expcnt = nm.cnt;
6501 
6502 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6503 	    ipfw_table_expire_dispatch);
6504 	nm.tableid = tbl->tableid;
6505 	netisr_domsg_global(&nm.base);
6506 	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6507 	    ("not all expired addresses (%d) were deleted (%d)",
6508 	     nm.cnt * netisr_ncpus, nm.expcnt));
6509 	return (0);
6510 }
6511 
6512 static void
6513 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6514 {
6515 	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6516 
6517 	KKASSERT((rule->rule_flags &
6518 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6519 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6520 	ipfw_free_rule(rule);
6521 
6522 	netisr_replymsg(&nmsg->base, 0);
6523 }
6524 
6525 static void
6526 ipfw_crossref_reap(void)
6527 {
6528 	struct ip_fw *rule, *prev = NULL;
6529 
6530 	ASSERT_NETISR0;
6531 
6532 	rule = ipfw_gd.ipfw_crossref_free;
6533 	while (rule != NULL) {
6534 		uint64_t inflight = 0;
6535 		int i;
6536 
6537 		for (i = 0; i < netisr_ncpus; ++i)
6538 			inflight += rule->cross_rules[i]->cross_refs;
6539 		if (inflight == 0) {
6540 			struct ip_fw *f = rule;
6541 
6542 			/*
6543 			 * Unlink.
6544 			 */
6545 			rule = rule->next;
6546 			if (prev != NULL)
6547 				prev->next = rule;
6548 			else
6549 				ipfw_gd.ipfw_crossref_free = rule;
6550 
6551 			/*
6552 			 * Free.
6553 			 */
6554 			for (i = 1; i < netisr_ncpus; ++i) {
6555 				struct netmsg_base nm;
6556 
6557 				netmsg_init(&nm, NULL, &curthread->td_msgport,
6558 				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6559 				nm.lmsg.u.ms_resultp = f->cross_rules[i];
6560 				netisr_domsg(&nm, i);
6561 			}
6562 			KKASSERT((f->rule_flags &
6563 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6564 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6565 			ipfw_unref_rule(f);
6566 		} else {
6567 			prev = rule;
6568 			rule = rule->next;
6569 		}
6570 	}
6571 
6572 	if (ipfw_gd.ipfw_crossref_free != NULL) {
6573 		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6574 		    ipfw_crossref_timeo, NULL);
6575 	}
6576 }
6577 
6578 /*
6579  * {set|get}sockopt parser.
6580  */
6581 static int
6582 ipfw_ctl(struct sockopt *sopt)
6583 {
6584 	int error, rulenum;
6585 	uint32_t *masks;
6586 	size_t size;
6587 
6588 	ASSERT_NETISR0;
6589 
6590 	error = 0;
6591 
6592 	switch (sopt->sopt_name) {
6593 	case IP_FW_GET:
6594 		error = ipfw_ctl_get_rules(sopt);
6595 		break;
6596 
6597 	case IP_FW_FLUSH:
6598 		ipfw_flush(0 /* keep default rule */);
6599 		break;
6600 
6601 	case IP_FW_ADD:
6602 		error = ipfw_ctl_add_rule(sopt);
6603 		break;
6604 
6605 	case IP_FW_DEL:
6606 		/*
6607 		 * IP_FW_DEL is used for deleting single rules or sets,
6608 		 * and (ab)used to atomically manipulate sets.
6609 		 * Argument size is used to distinguish between the two:
6610 		 *    sizeof(uint32_t)
6611 		 *	delete single rule or set of rules,
6612 		 *	or reassign rules (or sets) to a different set.
6613 		 *    2 * sizeof(uint32_t)
6614 		 *	atomic disable/enable sets.
6615 		 *	first uint32_t contains sets to be disabled,
6616 		 *	second uint32_t contains sets to be enabled.
6617 		 */
6618 		masks = sopt->sopt_val;
6619 		size = sopt->sopt_valsize;
6620 		if (size == sizeof(*masks)) {
6621 			/*
6622 			 * Delete or reassign static rule
6623 			 */
6624 			error = ipfw_ctl_alter(masks[0]);
6625 		} else if (size == (2 * sizeof(*masks))) {
6626 			/*
6627 			 * Set enable/disable
6628 			 */
6629 			ipfw_ctl_set_disable(masks[0], masks[1]);
6630 		} else {
6631 			error = EINVAL;
6632 		}
6633 		break;
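		/*
		 * Usage sketch (illustrative only): to atomically
		 * disable set 1 and enable set 2 through the two-mask
		 * form described above:
		 *
		 *	uint32_t masks[2];
		 *
		 *	masks[0] = 1 << 1;	(sets to disable)
		 *	masks[1] = 1 << 2;	(sets to enable)
		 *	setsockopt(s, IPPROTO_IP, IP_FW_DEL,
		 *	    masks, sizeof(masks));
		 */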
6634 
6635 	case IP_FW_ZERO:
6636 	case IP_FW_RESETLOG: /* argument is an int, the rule number */
6637 		rulenum = 0;
6638 
6639 		if (sopt->sopt_val != 0) {
6640 			error = soopt_to_kbuf(sopt, &rulenum,
6641 			    sizeof(int), sizeof(int));
6642 			if (error)
6643 				break;
6644 		}
6645 		error = ipfw_ctl_zero_entry(rulenum,
6646 			sopt->sopt_name == IP_FW_RESETLOG);
6647 		break;
6648 
6649 	case IP_FW_TBL_CREATE:
6650 		error = ipfw_table_create(sopt);
6651 		break;
6652 
6653 	case IP_FW_TBL_ADD:
6654 	case IP_FW_TBL_DEL:
6655 		error = ipfw_table_alt(sopt);
6656 		break;
6657 
6658 	case IP_FW_TBL_FLUSH:
6659 	case IP_FW_TBL_DESTROY:
6660 		error = ipfw_table_flush(sopt);
6661 		break;
6662 
6663 	case IP_FW_TBL_GET:
6664 		error = ipfw_table_get(sopt);
6665 		break;
6666 
6667 	case IP_FW_TBL_ZERO:
6668 		error = ipfw_table_zero(sopt);
6669 		break;
6670 
6671 	case IP_FW_TBL_EXPIRE:
6672 		error = ipfw_table_expire(sopt);
6673 		break;
6674 
6675 	default:
6676 		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6677 		error = EINVAL;
6678 	}
6679 
6680 	ipfw_crossref_reap();
6681 	return error;
6682 }
6683 
6684 static void
6685 ipfw_keepalive_done(struct ipfw_context *ctx)
6686 {
6687 
6688 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6689 	    ("keepalive is not in progress"));
6690 	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6691 	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6692 	    ipfw_keepalive, NULL);
6693 }
6694 
6695 static void
6696 ipfw_keepalive_more(struct ipfw_context *ctx)
6697 {
6698 	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6699 
6700 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6701 	    ("keepalive is not in progress"));
6702 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6703 	    ("keepalive more did not finish"));
6704 	netisr_sendmsg_oncpu(nm);
6705 }
6706 
6707 static void
6708 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6709 {
6710 	struct ipfw_state *s;
6711 	int scanned = 0, expired = 0, kept = 0;
6712 
6713 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6714 	    ("keepalive is not in progress"));
6715 
6716 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6717 		uint32_t ack_rev, ack_fwd;
6718 		struct ipfw_flow_id id;
6719 		uint8_t send_dir;
6720 
6721 		if (scanned++ >= ipfw_state_scan_max) {
6722 			ipfw_keepalive_more(ctx);
6723 			return;
6724 		}
6725 
6726 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6727 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6728 
6729 		/*
6730 		 * NOTE:
6731 		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
6732 		 * on slave xlat.
6733 		 */
6734 		if (s->st_type == O_ANCHOR)
6735 			continue;
6736 
6737 		if (IPFW_STATE_ISDEAD(s)) {
6738 			ipfw_state_remove(ctx, s);
6739 			if (++expired >= ipfw_state_expire_max) {
6740 				ipfw_keepalive_more(ctx);
6741 				return;
6742 			}
6743 			continue;
6744 		}
6745 
6746 		/*
6747 		 * Keep alive processing
6748 		 */
6749 
6750 		if (s->st_proto != IPPROTO_TCP)
6751 			continue;
6752 		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6753 			continue;
6754 		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6755 		    s->st_expire))
6756 			continue;	/* too early */
6757 
6758 		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6759 		    &id.dst_ip, &id.dst_port);
6760 		ack_rev = s->st_ack_rev;
6761 		ack_fwd = s->st_ack_fwd;
6762 
6763 #define SEND_FWD	0x1
6764 #define SEND_REV	0x2
6765 
6766 		if (IPFW_ISXLAT(s->st_type)) {
6767 			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6768 
6769 			if (x->xlat_dir == MATCH_FORWARD)
6770 				send_dir = SEND_FWD;
6771 			else
6772 				send_dir = SEND_REV;
6773 		} else {
6774 			send_dir = SEND_FWD | SEND_REV;
6775 		}
6776 
6777 		if (send_dir & SEND_REV)
6778 			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6779 		if (send_dir & SEND_FWD)
6780 			send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6781 
6782 #undef SEND_FWD
6783 #undef SEND_REV
6784 
6785 		if (++kept >= ipfw_keepalive_max) {
6786 			ipfw_keepalive_more(ctx);
6787 			return;
6788 		}
6789 	}
6790 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6791 	ipfw_keepalive_done(ctx);
6792 }
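
/*
 * The loop above uses "anchor" as a movable bookmark in the state
 * list: after each state is examined the anchor is re-inserted
 * behind it, so when one of the per-round budgets (scan, expire or
 * keepalive maxima) is exhausted, the scan can be resumed from the
 * same position by the _more dispatch below instead of restarting
 * from the list head.
 */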
6793 
6794 static void
6795 ipfw_keepalive_more_dispatch(netmsg_t nm)
6796 {
6797 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6798 	struct ipfw_state *anchor;
6799 
6800 	ASSERT_NETISR_NCPUS(mycpuid);
6801 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6802 	    ("keepalive is not in progress"));
6803 
6804 	/* Reply ASAP */
6805 	netisr_replymsg(&nm->base, 0);
6806 
6807 	anchor = &ctx->ipfw_keepalive_anch;
6808 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6809 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6810 		ipfw_keepalive_done(ctx);
6811 		return;
6812 	}
6813 	ipfw_keepalive_loop(ctx, anchor);
6814 }
6815 
6816 /*
6817  * Start one round of keepalive processing on this CPU; the actual
6818  * scan is performed by ipfw_keepalive_loop() in bounded chunks.
6819  */
6820 static void
6821 ipfw_keepalive_dispatch(netmsg_t nm)
6822 {
6823 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6824 	struct ipfw_state *anchor;
6825 
6826 	ASSERT_NETISR_NCPUS(mycpuid);
6827 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6828 	    ("keepalive is in progress"));
6829 	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6830 
6831 	/* Reply ASAP */
6832 	crit_enter();
6833 	netisr_replymsg(&nm->base, 0);
6834 	crit_exit();
6835 
6836 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6837 		ipfw_keepalive_done(ctx);
6838 		return;
6839 	}
6840 
6841 	anchor = &ctx->ipfw_keepalive_anch;
6842 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6843 	ipfw_keepalive_loop(ctx, anchor);
6844 }
6845 
6846 /*
6847  * This procedure is only used to handle keepalives.  It is invoked
6848  * every dyn_keepalive_period seconds.
6849  */
6850 static void
6851 ipfw_keepalive(void *dummy __unused)
6852 {
6853 	struct netmsg_base *msg;
6854 
6855 	KKASSERT(mycpuid < netisr_ncpus);
6856 	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6857 
6858 	crit_enter();
6859 	if (msg->lmsg.ms_flags & MSGF_DONE)
6860 		netisr_sendmsg_oncpu(msg);
6861 	crit_exit();
6862 }
6863 
6864 static void
6865 ipfw_ip_input_dispatch(netmsg_t nmsg)
6866 {
6867 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6868 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6869 	struct mbuf *m = nm->m;
6870 	struct ip_fw *rule = nm->arg1;
6871 
6872 	ASSERT_NETISR_NCPUS(mycpuid);
6873 	KASSERT(rule->cpuid == mycpuid,
6874 	    ("rule does not belong to cpu%d", mycpuid));
6875 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6876 	    ("mbuf does not have ipfw continue rule"));
6877 
6878 	KASSERT(ctx->ipfw_cont_rule == NULL,
6879 	    ("pending ipfw continue rule"));
6880 	ctx->ipfw_cont_rule = rule;
6881 	ip_input(m);
6882 
6883 	/* May not have been cleared if ipfw was unloaded/disabled. */
6884 	ctx->ipfw_cont_rule = NULL;
6885 
6886 	/*
6887 	 * This rule is no longer used; decrement its cross_refs,
6888 	 * so this rule can be deleted.
6889 	 */
6890 	rule->cross_refs--;
6891 }
6892 
6893 static void
6894 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6895 {
6896 	struct netmsg_genpkt *nm;
6897 
6898 	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6899 
6900 	/*
6901 	 * NOTE:
6902 	 * Bump cross_refs to prevent this rule and its siblings
6903 	 * from being deleted while this mbuf is in flight.  The
6904 	 * cross_refs of the sibling rule on the target cpu will
6905 	 * be decremented once this mbuf has been filtered on
6906 	 * the target cpu.
6907 	 */
6908 	rule->cross_refs++;
6909 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6910 
6911 	nm = &m->m_hdr.mh_genmsg;
6912 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6913 	    ipfw_ip_input_dispatch);
6914 	nm->m = m;
6915 	nm->arg1 = rule->cross_rules[cpuid];
6916 	netisr_sendmsg(&nm->base, cpuid);
6917 }
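
/*
 * Lifetime of a redispatched mbuf, as implemented above and in
 * ipfw_ip_input_dispatch(): cross_refs is bumped on the source CPU
 * before the mbuf leaves, the target CPU re-runs the packet through
 * ip_input() with the sibling rule as the continuation point, and
 * only then is cross_refs dropped, which eventually allows
 * ipfw_crossref_reap() to free the whole rule chain once every
 * CPU's count reaches zero.
 */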
6918 
6919 static void
6920 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6921 {
6922 
6923 	args->flags = 0;
6924 	args->rule = NULL;
6925 	args->xlat = NULL;
6926 
6927 	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6928 		struct m_tag *mtag;
6929 
6930 		/* Extract info from dummynet tag */
6931 		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6932 		KKASSERT(mtag != NULL);
6933 		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6934 		KKASSERT(args->rule != NULL);
6935 
6936 		m_tag_delete(m, mtag);
6937 		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6938 	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6939 		struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6940 
6941 		KKASSERT(ctx->ipfw_cont_rule != NULL);
6942 		args->rule = ctx->ipfw_cont_rule;
6943 		ctx->ipfw_cont_rule = NULL;
6944 
6945 		if (ctx->ipfw_cont_xlat != NULL) {
6946 			args->xlat = ctx->ipfw_cont_xlat;
6947 			ctx->ipfw_cont_xlat = NULL;
6948 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6949 				args->flags |= IP_FWARG_F_XLATINS;
6950 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6951 			}
6952 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6953 				args->flags |= IP_FWARG_F_XLATFWD;
6954 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6955 			}
6956 		}
6957 		KKASSERT((m->m_pkthdr.fw_flags &
6958 		    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6959 
6960 		args->flags |= IP_FWARG_F_CONT;
6961 		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6962 	}
6963 
6964 	args->eh = NULL;
6965 	args->oif = oif;
6966 	args->m = m;
6967 }
6968 
6969 static int
6970 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6971 {
6972 	struct ip_fw_args args;
6973 	struct mbuf *m = *m0;
6974 	int tee = 0, error = 0, ret;
6975 
6976 	ipfw_init_args(&args, m, NULL);
6977 
6978 	ret = ipfw_chk(&args);
6979 	m = args.m;
6980 	if (m == NULL) {
6981 		if (ret != IP_FW_REDISPATCH)
6982 			error = EACCES;
6983 		goto back;
6984 	}
6985 
6986 	switch (ret) {
6987 	case IP_FW_PASS:
6988 		break;
6989 
6990 	case IP_FW_DENY:
6991 		m_freem(m);
6992 		m = NULL;
6993 		error = EACCES;
6994 		break;
6995 
6996 	case IP_FW_DUMMYNET:
6997 		/* Send packet to the appropriate pipe */
6998 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
6999 		break;
7000 
7001 	case IP_FW_TEE:
7002 		tee = 1;
7003 		/* FALL THROUGH */
7004 
7005 	case IP_FW_DIVERT:
7006 		/*
7007 		 * Must clear the bridge tag before diverting.
7008 		 */
7009 		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
7010 		if (ip_divert_p != NULL) {
7011 			m = ip_divert_p(m, tee, 1);
7012 		} else {
7013 			m_freem(m);
7014 			m = NULL;
7015 			/* not sure this is the right error msg */
7016 			error = EACCES;
7017 		}
7018 		break;
7019 
7020 	default:
7021 		panic("unknown ipfw return value: %d", ret);
7022 	}
7023 back:
7024 	*m0 = m;
7025 	return error;
7026 }
7027 
7028 static int
7029 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7030 {
7031 	struct ip_fw_args args;
7032 	struct mbuf *m = *m0;
7033 	int tee = 0, error = 0, ret;
7034 
7035 	ipfw_init_args(&args, m, ifp);
7036 
7037 	ret = ipfw_chk(&args);
7038 	m = args.m;
7039 	if (m == NULL) {
7040 		if (ret != IP_FW_REDISPATCH)
7041 			error = EACCES;
7042 		goto back;
7043 	}
7044 
7045 	switch (ret) {
7046 	case IP_FW_PASS:
7047 		break;
7048 
7049 	case IP_FW_DENY:
7050 		m_freem(m);
7051 		m = NULL;
7052 		error = EACCES;
7053 		break;
7054 
7055 	case IP_FW_DUMMYNET:
7056 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7057 		break;
7058 
7059 	case IP_FW_TEE:
7060 		tee = 1;
7061 		/* FALL THROUGH */
7062 
7063 	case IP_FW_DIVERT:
7064 		if (ip_divert_p != NULL) {
7065 			m = ip_divert_p(m, tee, 0);
7066 		} else {
7067 			m_freem(m);
7068 			m = NULL;
7069 			/* not sure this is the right error msg */
7070 			error = EACCES;
7071 		}
7072 		break;
7073 
7074 	default:
7075 		panic("unknown ipfw return value: %d", ret);
7076 	}
7077 back:
7078 	*m0 = m;
7079 	return error;
7080 }
7081 
7082 static void
7083 ipfw_hook(void)
7084 {
7085 	struct pfil_head *pfh;
7086 
7087 	ASSERT_NETISR0;
7088 
7089 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7090 	if (pfh == NULL)
7091 		return;
7092 
7093 	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7094 	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7095 }
7096 
7097 static void
7098 ipfw_dehook(void)
7099 {
7100 	struct pfil_head *pfh;
7101 
7102 	ASSERT_NETISR0;
7103 
7104 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7105 	if (pfh == NULL)
7106 		return;
7107 
7108 	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7109 	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7110 }
7111 
7112 static int
7113 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7114 {
7115 	int dyn_cnt;
7116 
7117 	dyn_cnt = ipfw_state_cntcoll();
7118 	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7119 
7120 	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7121 }
7122 
7123 static int
7124 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7125 {
7126 	int state_cnt;
7127 
7128 	state_cnt = ipfw_state_cntcoll();
7129 	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7130 }
7131 
7132 static int
7133 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7134 {
7135 	int state_max, error;
7136 
7137 	state_max = ipfw_state_max;
7138 	error = sysctl_handle_int(oidp, &state_max, 0, req);
7139 	if (error || req->newptr == NULL)
7140 		return (error);
7141 
7142 	if (state_max < 1)
7143 		return (EINVAL);
7144 
7145 	ipfw_state_max_set(state_max);
7146 	return (0);
7147 }
7148 
7149 static int
7150 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7151 {
7152 	int dyn_max, error;
7153 
7154 	dyn_max = ipfw_state_max + ipfw_track_max;
7155 
7156 	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7157 	if (error || req->newptr == NULL)
7158 		return (error);
7159 
7160 	if (dyn_max < 2)
7161 		return (EINVAL);
7162 
7163 	ipfw_state_max_set(dyn_max / 2);
7164 	ipfw_track_max = dyn_max / 2;
7165 	return (0);
7166 }
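
/*
 * Usage sketch (illustrative only, assuming the conventional
 * net.inet.ip.fw sysctl tree): the legacy dyn_max knob splits its
 * value evenly between states and tracks, so e.g.
 *
 *	sysctl net.inet.ip.fw.dyn_max=16384
 *
 * would set both ipfw_state_max and ipfw_track_max to 8192.
 */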
7167 
7168 static void
7169 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7170 {
7171 	int enable = nmsg->lmsg.u.ms_result;
7172 
7173 	ASSERT_NETISR0;
7174 
7175 	if (fw_enable == enable)
7176 		goto reply;
7177 
7178 	fw_enable = enable;
7179 	if (fw_enable)
7180 		ipfw_hook();
7181 	else
7182 		ipfw_dehook();
7183 reply:
7184 	netisr_replymsg(&nmsg->base, 0);
7185 }
7186 
7187 static int
7188 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7189 {
7190 	struct netmsg_base nmsg;
7191 	int enable, error;
7192 
7193 	enable = fw_enable;
7194 	error = sysctl_handle_int(oidp, &enable, 0, req);
7195 	if (error || req->newptr == NULL)
7196 		return error;
7197 
7198 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7199 	    ipfw_sysctl_enable_dispatch);
7200 	nmsg.lmsg.u.ms_result = enable;
7201 
7202 	return netisr_domsg(&nmsg, 0);
7203 }
7204 
7205 static int
7206 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7207 {
7208 	return sysctl_int_range(oidp, arg1, arg2, req,
7209 	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7210 }
7211 
7212 static int
7213 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7214 {
7215 
7216 	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7217 }
7218 
7219 static int
7220 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7221 {
7222 	u_long stat = 0;
7223 	int cpu, error;
7224 
7225 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7226 		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7227 
7228 	error = sysctl_handle_long(oidp, &stat, 0, req);
7229 	if (error || req->newptr == NULL)
7230 		return (error);
7231 
7232 	/* Zero out this stat. */
7233 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7234 		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7235 	return (0);
7236 }
7237 
7238 static void
7239 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7240 {
7241 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7242 	struct ipfw_context *ctx;
7243 	struct ip_fw *def_rule;
7244 
7245 	ASSERT_NETISR_NCPUS(mycpuid);
7246 
7247 	ctx = kmalloc(__offsetof(struct ipfw_context,
7248 	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7249 
7250 	RB_INIT(&ctx->ipfw_state_tree);
7251 	TAILQ_INIT(&ctx->ipfw_state_list);
7252 
7253 	RB_INIT(&ctx->ipfw_track_tree);
7254 	TAILQ_INIT(&ctx->ipfw_track_list);
7255 
7256 	callout_init_mp(&ctx->ipfw_stateto_ch);
7257 	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7258 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7259 	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7260 	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7261 	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7262 
7263 	callout_init_mp(&ctx->ipfw_trackto_ch);
7264 	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7265 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7266 	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7267 	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7268 
7269 	callout_init_mp(&ctx->ipfw_keepalive_ch);
7270 	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7271 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7272 	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7273 	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7274 	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7275 
7276 	callout_init_mp(&ctx->ipfw_xlatreap_ch);
7277 	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7278 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7279 	TAILQ_INIT(&ctx->ipfw_xlatreap);
7280 
7281 	ipfw_ctx[mycpuid] = ctx;
7282 
7283 	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7284 
7285 	def_rule->act_ofs = 0;
7286 	def_rule->rulenum = IPFW_DEFAULT_RULE;
7287 	def_rule->cmd_len = 1;
7288 	def_rule->set = IPFW_DEFAULT_SET;
7289 
7290 	def_rule->cmd[0].len = 1;
7291 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7292 	def_rule->cmd[0].opcode = O_ACCEPT;
7293 #else
7294 	if (filters_default_to_accept)
7295 		def_rule->cmd[0].opcode = O_ACCEPT;
7296 	else
7297 		def_rule->cmd[0].opcode = O_DENY;
7298 #endif
7299 
7300 	def_rule->refcnt = 1;
7301 	def_rule->cpuid = mycpuid;
7302 
7303 	/* Install the default rule */
7304 	ctx->ipfw_default_rule = def_rule;
7305 	ctx->ipfw_layer3_chain = def_rule;
7306 
7307 	/* Link rule CPU sibling */
7308 	ipfw_link_sibling(fwmsg, def_rule);
7309 
7310 	/* Statistics only need to be updated once */
7311 	if (mycpuid == 0)
7312 		ipfw_inc_static_count(def_rule);
7313 
7314 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7315 }
7316 
7317 static void
7318 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7319 {
7320 
7321 	crit_enter();
7322 	/* Reply ASAP */
7323 	netisr_replymsg(&nmsg->base, 0);
7324 	crit_exit();
7325 	ipfw_crossref_reap();
7326 }
7327 
7328 static void
7329 ipfw_crossref_timeo(void *dummy __unused)
7330 {
7331 	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7332 
7333 	KKASSERT(mycpuid == 0);
7334 
7335 	crit_enter();
7336 	if (msg->lmsg.ms_flags & MSGF_DONE)
7337 		netisr_sendmsg_oncpu(msg);
7338 	crit_exit();
7339 }
7340 
7341 static void
7342 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7343 {
7344 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7345 	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7346 	struct ip_fw *f;
7347 
7348 	ASSERT_NETISR_NCPUS(mycpuid);
7349 
7350 	for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7351 		int l, cmdlen;
7352 		ipfw_insn *cmd;
7353 
7354 		if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7355 			continue;
7356 
7357 		for (l = f->cmd_len, cmd = f->cmd; l > 0;
7358 		     l -= cmdlen, cmd += cmdlen) {
7359 			cmdlen = F_LEN(cmd);
7360 			if (cmd->opcode == O_IP_SRC_IFIP ||
7361 			    cmd->opcode == O_IP_DST_IFIP) {
7362 				if (strncmp(ifp->if_xname,
7363 				    ((ipfw_insn_ifip *)cmd)->ifname,
7364 				    IFNAMSIZ) == 0)
7365 					cmd->arg1 &= ~IPFW_IFIP_VALID;
7366 			}
7367 		}
7368 	}
7369 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7370 }
7371 
7372 static void
7373 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7374     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7375 {
7376 	struct netmsg_base nm;
7377 
7378 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7379 	    ipfw_ifaddr_dispatch);
7380 	nm.lmsg.u.ms_resultp = ifp;
7381 	netisr_domsg_global(&nm);
7382 }
7383 
7384 static void
7385 ipfw_init_dispatch(netmsg_t nmsg)
7386 {
7387 	struct netmsg_ipfw fwmsg;
7388 	int error = 0, cpu;
7389 
7390 	ASSERT_NETISR0;
7391 
7392 	if (IPFW_LOADED) {
7393 		kprintf("IP firewall already loaded\n");
7394 		error = EEXIST;
7395 		goto reply;
7396 	}
7397 
7398 	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
7399 		ipfw_table_max = UINT16_MAX;
7400 
7401 	/* Initialize global track tree. */
7402 	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
7403 	IPFW_TRKCNT_TOKINIT;
7404 
7405 	/* GC for freed crossref rules. */
7406 	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
7407 	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
7408 	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
7409 
7410 	ipfw_state_max_set(ipfw_state_max);
7411 	ipfw_state_headroom = 8 * netisr_ncpus;
7412 
7413 	bzero(&fwmsg, sizeof(fwmsg));
7414 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7415 	    ipfw_ctx_init_dispatch);
7416 	netisr_domsg_global(&fwmsg.base);
7417 
7418 	ip_fw_chk_ptr = ipfw_chk;
7419 	ip_fw_ctl_ptr = ipfw_ctl;
7420 	ip_fw_dn_io_ptr = ipfw_dummynet_io;
7421 
7422 	kprintf("ipfw2 initialized, default to %s, logging ",
7423 		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
7424 		O_ACCEPT ? "accept" : "deny");
7425 
7426 #ifdef IPFIREWALL_VERBOSE
7427 	fw_verbose = 1;
7428 #endif
7429 #ifdef IPFIREWALL_VERBOSE_LIMIT
7430 	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
7431 #endif
7432 	if (fw_verbose == 0) {
7433 		kprintf("disabled\n");
7434 	} else if (verbose_limit == 0) {
7435 		kprintf("unlimited\n");
7436 	} else {
7437 		kprintf("limited to %d packets/entry by default\n",
7438 			verbose_limit);
7439 	}
7440 
7441 	ip_fw_loaded = 1;
7442 	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
7443 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
7444 		    ipfw_state_expire_ipifunc, NULL, cpu);
7445 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
7446 		    ipfw_track_expire_ipifunc, NULL, cpu);
7447 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
7448 		    ipfw_keepalive, NULL, cpu);
7449 	}
7450 
7451 	if (fw_enable)
7452 		ipfw_hook();
7453 
7454 	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
7455 	    NULL, EVENTHANDLER_PRI_ANY);
7456 	if (ipfw_ifaddr_event == NULL)
7457 		kprintf("ipfw: ifaddr_event register failed\n");
7458 
7459 reply:
7460 	netisr_replymsg(&nmsg->base, error);
7461 }
7462 
7463 static int
7464 ipfw_init(void)
7465 {
7466 	struct netmsg_base smsg;
7467 
7468 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7469 	    ipfw_init_dispatch);
7470 	return netisr_domsg(&smsg, 0);
7471 }
7472 
7473 #ifdef KLD_MODULE
7474 
7475 static void
7476 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
7477 {
7478 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7479 
7480 	ASSERT_NETISR_NCPUS(mycpuid);
7481 
7482 	callout_stop_sync(&ctx->ipfw_stateto_ch);
7483 	callout_stop_sync(&ctx->ipfw_trackto_ch);
7484 	callout_stop_sync(&ctx->ipfw_keepalive_ch);
7485 	callout_stop_sync(&ctx->ipfw_xlatreap_ch);
7486 
7487 	crit_enter();
7488 	netisr_dropmsg(&ctx->ipfw_stateexp_more);
7489 	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
7490 	netisr_dropmsg(&ctx->ipfw_trackexp_more);
7491 	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
7492 	netisr_dropmsg(&ctx->ipfw_keepalive_more);
7493 	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
7494 	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
7495 	crit_exit();
7496 
7497 	ipfw_table_flushall_oncpu(ctx, 1);
7498 
7499 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7500 }
7501 
7502 static void
7503 ipfw_fini_dispatch(netmsg_t nmsg)
7504 {
7505 	struct netmsg_base nm;
7506 	int error = 0, cpu;
7507 
7508 	ASSERT_NETISR0;
7509 
7510 	ipfw_crossref_reap();
7511 
7512 	if (ipfw_gd.ipfw_refcnt != 0) {
7513 		error = EBUSY;
7514 		goto reply;
7515 	}
7516 
7517 	ip_fw_loaded = 0;
7518 	ipfw_dehook();
7519 
7520 	/* Synchronize any inflight state/track expire IPIs. */
7521 	lwkt_synchronize_ipiqs("ipfwfini");
7522 
7523 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7524 	    ipfw_ctx_fini_dispatch);
7525 	netisr_domsg_global(&nm);
7526 
7527 	callout_stop_sync(&ipfw_gd.ipfw_crossref_ch);
7528 	crit_enter();
7529 	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
7530 	crit_exit();
7531 
7532 	if (ipfw_ifaddr_event != NULL)
7533 		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
7534 
7535 	ip_fw_chk_ptr = NULL;
7536 	ip_fw_ctl_ptr = NULL;
7537 	ip_fw_dn_io_ptr = NULL;
7538 	ipfw_flush(1 /* kill default rule */);
7539 
7540 	/* Free pre-cpu context */
7541 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7542 		kfree(ipfw_ctx[cpu], M_IPFW);
7543 
7544 	kprintf("IP firewall unloaded\n");
7545 reply:
7546 	netisr_replymsg(&nmsg->base, error);
7547 }
7548 
7549 static void
7550 ipfw_fflush_dispatch(netmsg_t nmsg)
7551 {
7552 
7553 	ipfw_flush(0 /* keep default rule */);
7554 	ipfw_crossref_reap();
7555 	netisr_replymsg(&nmsg->base, 0);
7556 }
7557 
7558 static int
7559 ipfw_fini(void)
7560 {
7561 	struct netmsg_base smsg;
7562 	int i = 0;
7563 
7564 	for (;;) {
7565 		netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7566 		    ipfw_fflush_dispatch);
7567 		netisr_domsg(&smsg, 0);
7568 
7569 		if (ipfw_gd.ipfw_refcnt == 0)
7570 			break;
7571 		kprintf("ipfw: flush pending %d\n", ++i);
7572 		tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7573 	}
7574 
7575 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7576 	    ipfw_fini_dispatch);
7577 	return netisr_domsg(&smsg, 0);
7578 }
7579 
7580 #endif	/* KLD_MODULE */
7581 
7582 static int
7583 ipfw_modevent(module_t mod, int type, void *unused)
7584 {
7585 	int err = 0;
7586 
7587 	switch (type) {
7588 	case MOD_LOAD:
7589 		err = ipfw_init();
7590 		break;
7591 
7592 	case MOD_UNLOAD:
7593 #ifndef KLD_MODULE
7594 		kprintf("ipfw statically compiled, cannot unload\n");
7595 		err = EBUSY;
7596 #else
7597 		err = ipfw_fini();
7598 #endif
7599 		break;
7600 	default:
7601 		break;
7602 	}
7603 	return err;
7604 }
7605 
7606 static moduledata_t ipfwmod = {
7607 	"ipfw",
7608 	ipfw_modevent,
7609 	0
7610 };
7611 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
7612 MODULE_VERSION(ipfw, 1);
7613