xref: /dragonfly/sys/net/ipfw/ip_fw2.c (revision e6d22e9b)
1 /*
2  * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $
26  */
27 
28 /*
29  * Implement IP packet firewall (new version)
30  */
31 
32 #include "opt_ipfw.h"
33 #include "opt_inet.h"
34 #ifndef INET
35 #error IPFIREWALL requires INET.
36 #endif /* INET */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/ucred.h>
49 #include <sys/in_cksum.h>
50 #include <sys/limits.h>
51 #include <sys/lock.h>
52 #include <sys/tree.h>
53 
54 #include <net/if.h>
55 #include <net/route.h>
56 #include <net/pfil.h>
57 #include <net/dummynet/ip_dummynet.h>
58 
59 #include <sys/thread2.h>
60 #include <net/netmsg2.h>
61 
62 #include <netinet/in.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in_var.h>
65 #include <netinet/in_pcb.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_icmp.h>
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_seq.h>
71 #include <netinet/tcp_timer.h>
72 #include <netinet/tcp_var.h>
73 #include <netinet/tcpip.h>
74 #include <netinet/udp.h>
75 #include <netinet/udp_var.h>
76 #include <netinet/ip_divert.h>
77 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
78 
79 #include <net/ipfw/ip_fw2.h>
80 
81 #ifdef IPFIREWALL_DEBUG
82 #define DPRINTF(fmt, ...) \
83 do { \
84 	if (fw_debug > 0) \
85 		kprintf(fmt, __VA_ARGS__); \
86 } while (0)
87 #else
88 #define DPRINTF(fmt, ...)	((void)0)
89 #endif
90 
91 /*
 92  * Description of per-CPU rule duplication:
93  *
94  * Module loading/unloading and all ioctl operations are serialized
95  * by netisr0, so we don't have any ordering or locking problems.
96  *
 97  * The following diagram shows how an operation on the per-CPU rule
 98  * lists is performed [2 CPU case]:
99  *
100  *   CPU0                 CPU1
101  *
102  * netisr0 <------------------------------------+
103  *  domsg                                       |
104  *    :                                         |
105  *    :(delete/add...)                          |
106  *    :                                         |
107  *    :         netmsg                          | netmsg
108  *  forwardmsg---------->netisr1                |
109  *                          :                   |
110  *                          :(delete/add...)    |
111  *                          :                   |
112  *                          :                   |
113  *                        replymsg--------------+
114  *
115  *
116  *
117  * Rule structure [2 CPU case]
118  *
119  *    CPU0               CPU1
120  *
121  * layer3_chain       layer3_chain
122  *     |                  |
123  *     V                  V
124  * +-------+ sibling  +-------+ sibling
125  * | rule1 |--------->| rule1 |--------->NULL
126  * +-------+          +-------+
127  *     |                  |
128  *     |next              |next
129  *     V                  V
130  * +-------+ sibling  +-------+ sibling
131  * | rule2 |--------->| rule2 |--------->NULL
132  * +-------+          +-------+
133  *
134  * ip_fw.sibling:
135  * 1) Ease statistics calculation during IP_FW_GET.  We only need to
136  *    iterate layer3_chain in netisr0; the duplicates of the current
137  *    rule on the other CPUs can safely be accessed read-only through
138  *    ip_fw.sibling.
139  * 2) Accelerate rule insertion and deletion, e.g. rule insertion:
140  *    a) In netisr0 rule3 is determined to be inserted between rule1
141  *       and rule2.  To make this decision we need to iterate the
142  *       layer3_chain in netisr0.  The netmsg, which is used to insert
143  *       the rule, will contain rule1 in netisr0 as prev_rule and rule2
144  *       in netisr0 as next_rule.
145  *    b) After the insertion in netisr0 is done, we will move on to
146  *       netisr1.  But instead of relocating the rule3's position in
147  *       netisr1 by iterating the layer3_chain in netisr1, we set the
148  *       netmsg's prev_rule to rule1->sibling and next_rule to
149  *       rule2->sibling before the netmsg is forwarded to netisr1 from
150  *       netisr0.
151  */
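
#if 0
/*
 * Editor's illustrative sketch of the message flow above; not part of
 * the original source.  `example_rule_dispatch' and `example_rule_change'
 * are hypothetical names, assuming the usual netisr message APIs used
 * elsewhere in this file (netmsg_init/netisr_domsg/netisr_forwardmsg/
 * netisr_replymsg).  Each netisr applies the change to its own rule
 * copy, then forwards the message to the next CPU; the last CPU's
 * reply unblocks the caller in netisr0.
 */
static void
example_rule_dispatch(netmsg_t nmsg)
{
	/* ... delete/add on this CPU's layer3_chain here ... */

	if (mycpuid != netisr_ncpus - 1)
		netisr_forwardmsg(&nmsg->base, mycpuid + 1);
	else
		netisr_replymsg(&nmsg->base, 0);
}

static void
example_rule_change(void)
{
	struct netmsg_base nm;

	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    example_rule_dispatch);
	netisr_domsg(&nm, 0);	/* blocks until the last CPU replies */
}
#endif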
152 
153 /*
154  * Description of states and tracks.
155  *
156  * Both states and tracks are stored in per-cpu RB trees instead of
157  * per-cpu hash tables to avoid the worst case hash degeneration.
158  *
159  * The lifetimes of states and tracks are regulated by dyn_*_lifetime,
160  * measured in seconds and depending on the flags.
161  *
162  * When a packet is received, its address fields are first masked with
163  * the mask defined for the rule, then matched against the entries in
164  * the per-cpu state RB tree.  States are generated by the
165  * 'keep-state' and 'limit' options.
166  *
167  * The max number of states is ipfw_state_max.  When we reach the
168  * maximum number of states we do not create any more.  This is done
169  * to avoid consuming too much memory, and also too much time when
170  * searching on each packet.
171  *
172  * Each state holds a pointer to the parent ipfw rule of the current
173  * CPU so we know what action to perform.  States are removed when the
174  * parent rule is deleted.  XXX we should make them survive.
175  *
176  * There are some limitations with states -- we do not obey the
177  * 'randomized match', and we do not do multiple passes through the
178  * firewall.  XXX check the latter!!!
179  *
180  * States grow independently on each CPU, e.g. 2 CPU case:
181  *
182  *        CPU0                     CPU1
183  * ...................      ...................
184  * :  state RB tree  :      :  state RB tree  :
185  * :                 :      :                 :
186  * : state1   state2 :      :      state3     :
187  * :     |    |      :      :        |        :
188  * :.....|....|......:      :........|........:
189  *       |    |                      |
190  *       |    |                      |st_rule
191  *       |    |                      |
192  *       V    V                      V
193  *     +-------+                 +-------+
194  *     | rule1 |                 | rule1 |
195  *     +-------+                 +-------+
196  *
197  * Tracks are used to enforce limits on the number of sessions.  Tracks
198  * are generated by the 'limit' option.
199  *
200  * The max number of tracks is ipfw_track_max.  When we reach the
201  * maximum number of tracks we do not create any more.  This is done
202  * to avoid consuming too much memory.
203  *
204  * Tracks are organized into two layers: the track counter RB tree is
205  * shared between CPUs, while the track RB tree is per-cpu.  States
206  * generated by the 'limit' option are linked to the track, in addition
207  * to the per-cpu state RB tree, mainly to ease expiration.  e.g. 2 CPU case:
208  *
209  *             ..............................
210  *             :    track counter RB tree   :
211  *             :                            :
212  *             :        +-----------+       :
213  *             :        |  trkcnt1  |       :
214  *             :        |           |       :
215  *             :      +--->counter<----+    :
216  *             :      | |           |  |    :
217  *             :      | +-----------+  |    :
218  *             :......|................|....:
219  *                    |                |
220  *        CPU0        |                |         CPU1
221  * .................  |t_count         |  .................
222  * : track RB tree :  |                |  : track RB tree :
223  * :               :  |                |  :               :
224  * : +-->track1-------+                +--------track2    :
225  * : |     A       :                      :               :
226  * : |     |       :                      :               :
227  * :.|.....|.......:                      :...............:
228  *   |     +----------------+
229  *   | .................... |
230  *   | :   state RB tree  : |st_track
231  *   | :                  : |
232  *   +---state1    state2---+
233  *     :     |       |    :
234  *     :.....|.......|....:
235  *           |       |
236  *           |       |st_rule
237  *           V       V
238  *         +----------+
239  *         |   rule1  |
240  *         +----------+
241  */
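
#if 0
/*
 * Editor's sketch of the two-layer track lookup described above; not
 * part of the original source and simplified (allocation and error
 * handling omitted).  `example_track_lookup' is a hypothetical name,
 * using the IPFW_TRKCNT_TOK* macros and RB trees defined further down.
 * The per-cpu track tree is consulted first; only on a miss is the
 * shared counter tree touched, under the global trkcnt token.
 */
static struct ipfw_track *
example_track_lookup(struct ipfw_context *ctx, const struct ipfw_key *key,
    struct ip_fw *rule)
{
	struct ipfw_track find, *t;

	find.t_key = *key;
	find.t_rule = rule;
	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, &find);
	if (t == NULL) {
		/*
		 * Miss: a new track would be allocated here and bound
		 * to the shared ipfw_trkcnt, creating the counter in
		 * ipfw_gd.ipfw_trkcnt_tree if no other CPU made one.
		 */
		IPFW_TRKCNT_TOKGET;
		/* ... RB_FIND/RB_INSERT on ipfw_gd.ipfw_trkcnt_tree ... */
		IPFW_TRKCNT_TOKREL;
	}
	return (t);
}
#endif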
242 
243 #define IPFW_AUTOINC_STEP_MIN	1
244 #define IPFW_AUTOINC_STEP_MAX	1000
245 #define IPFW_AUTOINC_STEP_DEF	100
246 
247 #define IPFW_TABLE_MAX_DEF	64
248 
249 #define	IPFW_DEFAULT_RULE	65535	/* rulenum for the default rule */
250 #define IPFW_DEFAULT_SET	31	/* set number for the default rule */
251 
252 #define MATCH_REVERSE		0
253 #define MATCH_FORWARD		1
254 #define MATCH_NONE		2
255 #define MATCH_UNKNOWN		3
256 
257 #define TIME_LEQ(a, b)		((a) - (b) <= 0)
258 
259 #define IPFW_STATE_TCPFLAGS	(TH_SYN | TH_FIN | TH_RST)
260 #define IPFW_STATE_TCPSTATES	(IPFW_STATE_TCPFLAGS |	\
261 				 (IPFW_STATE_TCPFLAGS << 8))
262 
263 #define BOTH_SYN		(TH_SYN | (TH_SYN << 8))
264 #define BOTH_FIN		(TH_FIN | (TH_FIN << 8))
265 #define BOTH_RST		(TH_RST | (TH_RST << 8))
266 /* TH_ACK here means FIN was ACKed. */
267 #define BOTH_FINACK		(TH_ACK | (TH_ACK << 8))
268 
269 #define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
270 				 (((s)->st_state & BOTH_RST) ||		\
271 				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))
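
/*
 * Editor's note, illustrating the macros above (not in the original
 * source): st_state accumulates the TCP flags seen in the forward
 * direction in its low byte and those seen in the reverse direction
 * shifted left by 8.  Since the TH_ACK bits are only recorded once a
 * FIN has been ACKed, the test
 *
 *	((s)->st_state & BOTH_FINACK) == BOTH_FINACK
 *
 * holds only after each side's FIN was acknowledged;
 * IPFW_STATE_TCPCLOSED() additionally treats an RST seen in either
 * direction as closing the state.
 */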
272 
273 #define O_ANCHOR		O_NOP
274 
275 #define IPFW_ISXLAT(type)	((type) == O_REDIRECT)
276 #define IPFW_XLAT_INVALID(s)	(IPFW_ISXLAT((s)->st_type) &&	\
277 				 ((struct ipfw_xlat *)(s))->xlat_invalid)
278 
279 #define IPFW_MBUF_XLATINS	FW_MBUF_PRIVATE1
280 #define IPFW_MBUF_XLATFWD	FW_MBUF_PRIVATE2
281 
282 #define IPFW_XLATE_INSERT	0x0001
283 #define IPFW_XLATE_FORWARD	0x0002
284 #define IPFW_XLATE_OUTPUT	0x0004
285 
286 struct netmsg_ipfw {
287 	struct netmsg_base	base;
288 	const struct ipfw_ioc_rule *ioc_rule;
289 	struct ip_fw		*next_rule;
290 	struct ip_fw		*prev_rule;
291 	struct ip_fw		*sibling;
292 	uint32_t		rule_flags;
293 	struct ip_fw		**cross_rules;
294 };
295 
296 struct netmsg_del {
297 	struct netmsg_base	base;
298 	struct ip_fw		*start_rule;
299 	struct ip_fw		*prev_rule;
300 	uint16_t		rulenum;
301 	uint8_t			from_set;
302 	uint8_t			to_set;
303 };
304 
305 struct netmsg_zent {
306 	struct netmsg_base	base;
307 	struct ip_fw		*start_rule;
308 	uint16_t		rulenum;
309 	uint16_t		log_only;
310 };
311 
312 struct netmsg_cpstate {
313 	struct netmsg_base	base;
314 	struct ipfw_ioc_state	*ioc_state;
315 	int			state_cntmax;
316 	int			state_cnt;
317 };
318 
319 struct netmsg_tblent {
320 	struct netmsg_base	base;
321 	struct sockaddr		*key;
322 	struct sockaddr		*netmask;
323 	struct ipfw_tblent	*sibling;
324 	int			tableid;
325 };
326 
327 struct netmsg_tblflush {
328 	struct netmsg_base	base;
329 	int			tableid;
330 	int			destroy;
331 };
332 
333 struct netmsg_tblexp {
334 	struct netmsg_base	base;
335 	time_t			expire;
336 	int			tableid;
337 	int			cnt;
338 	int			expcnt;
339 	struct radix_node_head	*rnh;
340 };
341 
342 struct ipfw_table_cp {
343 	struct ipfw_ioc_tblent	*te;
344 	int			te_idx;
345 	int			te_cnt;
346 };
347 
348 struct ip_fw_local {
349 	/*
350 	 * offset	The offset of a fragment. offset != 0 means that
351 	 *	we have a fragment at this offset of an IPv4 packet.
352 	 *	offset == 0 means that (if this is an IPv4 packet)
353 	 *	this is the first or only fragment.
354 	 */
355 	u_short			offset;
356 
357 	/*
358 	 * Local copies of addresses. They are only valid if we have
359 	 * an IP packet.
360 	 *
361 	 * proto	The protocol. Set to 0 for non-ip packets,
362 	 *	or to the protocol read from the packet otherwise.
363 	 *	proto != 0 means that we have an IPv4 packet.
364 	 *
365 	 * src_port, dst_port	port numbers, in HOST format. Only
366 	 *	valid for TCP and UDP packets.
367 	 *
368 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
369 	 *	Only valid for IPv4 packets.
370 	 */
371 	uint8_t			proto;
372 	uint16_t		src_port;	/* NOTE: host format	*/
373 	uint16_t		dst_port;	/* NOTE: host format	*/
374 	struct in_addr		src_ip;		/* NOTE: network format	*/
375 	struct in_addr		dst_ip;		/* NOTE: network format	*/
376 	uint16_t		ip_len;
377 	struct tcphdr		*tcp;
378 };
379 
380 struct ipfw_addrs {
381 	uint32_t		addr1;	/* host byte order */
382 	uint32_t		addr2;	/* host byte order */
383 };
384 
385 struct ipfw_ports {
386 	uint16_t		port1;	/* host byte order */
387 	uint16_t		port2;	/* host byte order */
388 };
389 
390 struct ipfw_key {
391 	union {
392 		struct ipfw_addrs addrs;
393 		uint64_t	value;
394 	} addr_u;
395 	union {
396 		struct ipfw_ports ports;
397 		uint32_t	value;
398 	} port_u;
399 	uint8_t			proto;
400 	uint8_t			swap;	/* IPFW_KEY_SWAP_ */
401 	uint16_t		rsvd2;
402 };
403 
404 #define IPFW_KEY_SWAP_ADDRS	0x1
405 #define IPFW_KEY_SWAP_PORTS	0x2
406 #define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)
407 
408 struct ipfw_trkcnt {
409 	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
410 	struct ipfw_key		tc_key;
411 	uintptr_t		tc_ruleid;
412 	int			tc_refs;
413 	int			tc_count;
414 	time_t			tc_expire;	/* userland get-only */
415 	uint16_t		tc_rulenum;	/* userland get-only */
416 } __cachealign;
417 
418 #define tc_addrs		tc_key.addr_u.value
419 #define tc_ports		tc_key.port_u.value
420 #define tc_proto		tc_key.proto
421 #define tc_saddr		tc_key.addr_u.addrs.addr1
422 #define tc_daddr		tc_key.addr_u.addrs.addr2
423 #define tc_sport		tc_key.port_u.ports.port1
424 #define tc_dport		tc_key.port_u.ports.port2
425 
426 RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);
427 
428 struct ipfw_state;
429 
430 struct ipfw_track {
431 	RB_ENTRY(ipfw_track)	t_rblink;
432 	struct ipfw_key		t_key;
433 	struct ip_fw		*t_rule;
434 	time_t			t_lastexp;
435 	LIST_HEAD(, ipfw_state)	t_state_list;
436 	time_t			t_expire;
437 	volatile int		*t_count;
438 	struct ipfw_trkcnt	*t_trkcnt;
439 	TAILQ_ENTRY(ipfw_track)	t_link;
440 };
441 
442 #define t_addrs			t_key.addr_u.value
443 #define t_ports			t_key.port_u.value
444 #define t_proto			t_key.proto
445 #define t_saddr			t_key.addr_u.addrs.addr1
446 #define t_daddr			t_key.addr_u.addrs.addr2
447 #define t_sport			t_key.port_u.ports.port1
448 #define t_dport			t_key.port_u.ports.port2
449 
450 RB_HEAD(ipfw_track_tree, ipfw_track);
451 TAILQ_HEAD(ipfw_track_list, ipfw_track);
452 
453 struct ipfw_state {
454 	RB_ENTRY(ipfw_state)	st_rblink;
455 	struct ipfw_key		st_key;
456 
457 	time_t			st_expire;	/* expire time */
458 	struct ip_fw		*st_rule;
459 
460 	uint64_t		st_pcnt;	/* packets */
461 	uint64_t		st_bcnt;	/* bytes */
462 
463 	/*
464 	 * st_state:
465 	 * State of this rule, typically a combination of TCP flags.
466 	 *
467 	 * st_ack_fwd/st_ack_rev:
468 	 * Most recent ACKs in forward and reverse direction.  They
469 	 * are used to generate keepalives.
470 	 */
471 	uint32_t		st_state;
472 	uint32_t		st_ack_fwd;	/* host byte order */
473 	uint32_t		st_seq_fwd;	/* host byte order */
474 	uint32_t		st_ack_rev;	/* host byte order */
475 	uint32_t		st_seq_rev;	/* host byte order */
476 
477 	uint16_t		st_flags;	/* IPFW_STATE_F_ */
478 	uint16_t		st_type;	/* KEEP_STATE/LIMIT/RDR */
479 	struct ipfw_track	*st_track;
480 
481 	LIST_ENTRY(ipfw_state)	st_trklink;
482 	TAILQ_ENTRY(ipfw_state)	st_link;
483 };
484 
485 #define st_addrs		st_key.addr_u.value
486 #define st_ports		st_key.port_u.value
487 #define st_proto		st_key.proto
488 #define st_swap			st_key.swap
489 
490 #define IPFW_STATE_F_ACKFWD	0x0001
491 #define IPFW_STATE_F_SEQFWD	0x0002
492 #define IPFW_STATE_F_ACKREV	0x0004
493 #define IPFW_STATE_F_SEQREV	0x0008
494 #define IPFW_STATE_F_XLATSRC	0x0010
495 #define IPFW_STATE_F_XLATSLAVE	0x0020
496 #define IPFW_STATE_F_LINKED	0x0040
497 
498 #define IPFW_STATE_SCANSKIP(s)	((s)->st_type == O_ANCHOR ||	\
499 				 ((s)->st_flags & IPFW_STATE_F_XLATSLAVE))
500 
501 /* Expired or being deleted. */
502 #define IPFW_STATE_ISDEAD(s)	(TIME_LEQ((s)->st_expire, time_uptime) || \
503 				 IPFW_XLAT_INVALID((s)))
504 
505 TAILQ_HEAD(ipfw_state_list, ipfw_state);
506 RB_HEAD(ipfw_state_tree, ipfw_state);
507 
508 struct ipfw_xlat {
509 	struct ipfw_state	xlat_st;	/* MUST be the first field */
510 	uint32_t		xlat_addr;	/* network byte order */
511 	uint16_t		xlat_port;	/* network byte order */
512 	uint16_t		xlat_dir;	/* MATCH_ */
513 	struct ifnet		*xlat_ifp;	/* matching ifnet */
514 	struct ipfw_xlat	*xlat_pair;	/* paired state */
515 	int			xlat_pcpu;	/* paired cpu */
516 	volatile int		xlat_invalid;	/* invalid, but not dtor yet */
517 	volatile uint64_t	xlat_crefs;	/* cross references */
518 	struct netmsg_base	xlat_freenm;	/* for remote free */
519 };
520 
521 #define xlat_type		xlat_st.st_type
522 #define xlat_flags		xlat_st.st_flags
523 #define xlat_rule		xlat_st.st_rule
524 #define xlat_bcnt		xlat_st.st_bcnt
525 #define xlat_pcnt		xlat_st.st_pcnt
526 
527 struct ipfw_tblent {
528 	struct radix_node	te_nodes[2];
529 	struct sockaddr_in	te_key;
530 	u_long			te_use;
531 	time_t			te_lastuse;
532 	struct ipfw_tblent	*te_sibling;
533 	volatile int		te_expired;
534 };
535 
536 struct ipfw_context {
537 	struct ip_fw		*ipfw_layer3_chain;	/* rules for layer3 */
538 	struct ip_fw		*ipfw_default_rule;	/* default rule */
539 	uint64_t		ipfw_norule_counter;	/* ipfw_log(NULL) stat*/
540 
541 	/*
542 	 * ipfw_set_disable contains one bit per set value (0..31).
543 	 * If the bit is set, all rules with the corresponding set
544 	 * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
545 	 * default rule and CANNOT be disabled.
546 	 */
547 	uint32_t		ipfw_set_disable;
548 
549 	uint8_t			ipfw_flags;	/* IPFW_FLAG_ */
550 
551 	struct ip_fw		*ipfw_cont_rule;
552 	struct ipfw_xlat	*ipfw_cont_xlat;
553 
554 	struct ipfw_state_tree	ipfw_state_tree;
555 	struct ipfw_state_list	ipfw_state_list;
556 	int			ipfw_state_loosecnt;
557 	int			ipfw_state_cnt;
558 
559 	union {
560 		struct ipfw_state state;
561 		struct ipfw_track track;
562 		struct ipfw_trkcnt trkcnt;
563 	} ipfw_tmpkey;
564 
565 	struct ipfw_track_tree	ipfw_track_tree;
566 	struct ipfw_track_list	ipfw_track_list;
567 	struct ipfw_trkcnt	*ipfw_trkcnt_spare;
568 
569 	struct callout		ipfw_stateto_ch;
570 	time_t			ipfw_state_lastexp;
571 	struct netmsg_base	ipfw_stateexp_nm;
572 	struct netmsg_base	ipfw_stateexp_more;
573 	struct ipfw_state	ipfw_stateexp_anch;
574 
575 	struct callout		ipfw_trackto_ch;
576 	time_t			ipfw_track_lastexp;
577 	struct netmsg_base	ipfw_trackexp_nm;
578 	struct netmsg_base	ipfw_trackexp_more;
579 	struct ipfw_track	ipfw_trackexp_anch;
580 
581 	struct callout		ipfw_keepalive_ch;
582 	struct netmsg_base	ipfw_keepalive_nm;
583 	struct netmsg_base	ipfw_keepalive_more;
584 	struct ipfw_state	ipfw_keepalive_anch;
585 
586 	struct callout		ipfw_xlatreap_ch;
587 	struct netmsg_base	ipfw_xlatreap_nm;
588 	struct ipfw_state_list	ipfw_xlatreap;
589 
590 	/*
591 	 * Statistics
592 	 */
593 	u_long			ipfw_sts_reap;
594 	u_long			ipfw_sts_reapfailed;
595 	u_long			ipfw_sts_overflow;
596 	u_long			ipfw_sts_nomem;
597 	u_long			ipfw_sts_tcprecycled;
598 
599 	u_long			ipfw_tks_nomem;
600 	u_long			ipfw_tks_reap;
601 	u_long			ipfw_tks_reapfailed;
602 	u_long			ipfw_tks_overflow;
603 	u_long			ipfw_tks_cntnomem;
604 
605 	u_long			ipfw_frags;
606 	u_long			ipfw_defraged;
607 	u_long			ipfw_defrag_remote;
608 
609 	u_long			ipfw_xlated;
610 	u_long			ipfw_xlate_split;
611 	u_long			ipfw_xlate_conflicts;
612 	u_long			ipfw_xlate_cresolved;
613 
614 	/* Last field */
615 	struct radix_node_head	*ipfw_tables[];
616 };
617 
618 #define IPFW_FLAG_KEEPALIVE	0x01
619 #define IPFW_FLAG_STATEEXP	0x02
620 #define IPFW_FLAG_TRACKEXP	0x04
621 #define IPFW_FLAG_STATEREAP	0x08
622 #define IPFW_FLAG_TRACKREAP	0x10
623 
624 #define ipfw_state_tmpkey	ipfw_tmpkey.state
625 #define ipfw_track_tmpkey	ipfw_tmpkey.track
626 #define ipfw_trkcnt_tmpkey	ipfw_tmpkey.trkcnt
627 
628 struct ipfw_global {
629 	int			ipfw_state_loosecnt;	/* cache aligned */
630 	time_t			ipfw_state_globexp __cachealign;
631 
632 	struct lwkt_token	ipfw_trkcnt_token __cachealign;
633 	struct ipfw_trkcnt_tree	ipfw_trkcnt_tree;
634 	int			ipfw_trkcnt_cnt;
635 	time_t			ipfw_track_globexp;
636 
637 	/* Accessed in netisr0. */
638 	struct ip_fw		*ipfw_crossref_free __cachealign;
639 	struct callout		ipfw_crossref_ch;
640 	struct netmsg_base	ipfw_crossref_nm;
641 
642 #ifdef KLD_MODULE
643 	/*
644 	 * The module cannot be unloaded if there are references to
645 	 * certain rules of ipfw(4), e.g. from dummynet(4).
646 	 */
647 	int			ipfw_refcnt __cachealign;
648 #endif
649 } __cachealign;
650 
651 static struct ipfw_context	*ipfw_ctx[MAXCPU];
652 
653 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
654 
655 /*
656  * The following two global variables are accessed and updated only
657  * in netisr0.
658  */
659 static uint32_t static_count;	/* # of static rules */
660 static uint32_t static_ioc_len;	/* bytes of static rules */
661 
662 /*
663  * If 1, then ipfw static rules are being flushed,
664  * ipfw_chk() will skip to the default rule.
665  */
666 static int ipfw_flushing;
667 
668 static int fw_verbose;
669 static int verbose_limit;
670 
671 static int fw_debug;
672 static int autoinc_step = IPFW_AUTOINC_STEP_DEF;
673 
674 static int	ipfw_table_max = IPFW_TABLE_MAX_DEF;
675 
676 static int	ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS);
677 static int	ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS);
678 
679 TUNABLE_INT("net.inet.ip.fw.table_max", &ipfw_table_max);
680 
681 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
682 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0,
683     "Firewall statistics");
684 
685 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
686     &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw");
687 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW,
688     &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I",
689     "Rule number autincrement step");
690 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW,
691     &fw_one_pass, 0,
692     "Only do a single pass through ipfw when using dummynet(4)");
693 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
694     &fw_debug, 0, "Enable printing of debug ip_fw statements");
695 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW,
696     &fw_verbose, 0, "Log matches to ipfw rules");
697 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
698     &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
699 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, table_max, CTLFLAG_RD,
700     &ipfw_table_max, 0, "Max # of tables");
701 
702 static int	ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS);
703 static int	ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS);
704 static int	ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS);
705 static int	ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS);
706 static int	ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS);
707 static int	ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS);
708 
709 /*
710  * Timeouts for various events in handling states.
711  *
712  * NOTE:
713  * 1 == 0~1 second.
714  * 2 == 1~2 second(s).
715  *
716  * We use 2 seconds for FIN lifetime, so that the states will not be
717  * reaped prematurely.
718  */
719 static uint32_t dyn_ack_lifetime = 300;
720 static uint32_t dyn_syn_lifetime = 20;
721 static uint32_t dyn_finwait_lifetime = 20;
722 static uint32_t dyn_fin_lifetime = 2;
723 static uint32_t dyn_rst_lifetime = 2;
724 static uint32_t dyn_udp_lifetime = 10;
725 static uint32_t dyn_short_lifetime = 5;	/* used by tracks too */
726 
727 /*
728  * Keepalives are sent if dyn_keepalive is set. They are sent every
729  * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
730  * seconds of lifetime of a rule.
731  */
732 static uint32_t dyn_keepalive_interval = 20;
733 static uint32_t dyn_keepalive_period = 5;
734 static uint32_t dyn_keepalive = 1;	/* do send keepalives */
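
#if 0
/*
 * Editor's sketch (not in the original source) of the keepalive
 * window implied by the tunables above; `s' and `last_sent' are
 * hypothetical.  Keepalives for a state expiring at st_expire are
 * only generated during the last dyn_keepalive_interval seconds of
 * its lifetime, at most once every dyn_keepalive_period seconds.
 */
	if (dyn_keepalive &&
	    TIME_LEQ(s->st_expire - dyn_keepalive_interval, time_uptime) &&
	    TIME_LEQ(last_sent + dyn_keepalive_period, time_uptime)) {
		/* send keepalive segments in both directions */
		last_sent = time_uptime;
	}
#endif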
735 
736 static struct ipfw_global	ipfw_gd;
737 static int	ipfw_state_loosecnt_updthr;
738 static int	ipfw_state_max = 4096;	/* max # of states */
739 static int	ipfw_track_max = 4096;	/* max # of tracks */
740 
741 static int	ipfw_state_headroom;	/* setup at module load time */
742 static int	ipfw_state_reap_min = 8;
743 static int	ipfw_state_expire_max = 32;
744 static int	ipfw_state_scan_max = 256;
745 static int	ipfw_keepalive_max = 8;
746 static int	ipfw_track_reap_max = 4;
747 static int	ipfw_track_expire_max = 16;
748 static int	ipfw_track_scan_max = 128;
749 
750 static eventhandler_tag ipfw_ifaddr_event;
751 
752 /* Compat */
753 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
754     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I",
755     "Number of states and tracks");
756 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
757     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I",
758     "Max number of states and tracks");
759 
760 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt,
761     CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I",
762     "Number of states");
763 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max,
764     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I",
765     "Max number of states");
766 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW,
767     &ipfw_state_headroom, 0, "headroom for state reap");
768 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD,
769     &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks");
770 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW,
771     &ipfw_track_max, 0, "Max number of tracks");
772 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
773     &static_count, 0, "Number of static rules");
774 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
775     &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
776 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
777     &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
778 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
779     &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
780 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW,
781     &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait");
782 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
783     &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
784 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
785     &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
786 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
787     &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
788 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
789     &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
790 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max,
791     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt,
792     "I", "# of states to scan for each expire iteration");
793 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max,
794     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt,
795     "I", "# of states to expire for each expire iteration");
796 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max,
797     CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt,
798     "I", "# of states to expire for each expire iteration");
799 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min,
800     CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt,
801     "I", "# of states to reap for state shortage");
802 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max,
803     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt,
804     "I", "# of tracks to scan for each expire iteration");
805 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_expire_max,
806     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt,
807     "I", "# of tracks to expire for each expire iteration");
808 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max,
809     CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt,
810     "I", "# of tracks to reap for track shortage");
811 
812 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap,
813     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
814     __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat,
815     "LU", "# of state reaps due to states shortage");
816 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed,
817     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
818     __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat,
819     "LU", "# of state reap failure");
820 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow,
821     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
822     __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat,
823     "LU", "# of state overflow");
824 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem,
825     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
826     __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat,
827     "LU", "# of state allocation failure");
828 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled,
829     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
830     __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat,
831     "LU", "# of state deleted due to fast TCP port recycling");
832 
833 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem,
834     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
835     __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat,
836     "LU", "# of track allocation failure");
837 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap,
838     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
839     __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat,
840     "LU", "# of track reap due to tracks shortage");
841 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed,
842     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
843     __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat,
844     "LU", "# of track reap failure");
845 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow,
846     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
847     __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat,
848     "LU", "# of track overflow");
849 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem,
850     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
851     __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat,
852     "LU", "# of track counter allocation failure");
853 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, frags,
854     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
855     __offsetof(struct ipfw_context, ipfw_frags), ipfw_sysctl_stat,
856     "LU", "# of IP fragements defraged");
857 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defraged,
858     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
859     __offsetof(struct ipfw_context, ipfw_defraged), ipfw_sysctl_stat,
860     "LU", "# of IP packets after defrag");
861 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, defrag_remote,
862     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
863     __offsetof(struct ipfw_context, ipfw_defrag_remote), ipfw_sysctl_stat,
864     "LU", "# of IP packets after defrag dispatched to remote cpus");
865 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlated,
866     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
867     __offsetof(struct ipfw_context, ipfw_xlated), ipfw_sysctl_stat,
868     "LU", "# address/port translations");
869 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_split,
870     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
871     __offsetof(struct ipfw_context, ipfw_xlate_split), ipfw_sysctl_stat,
872     "LU", "# address/port translations split between different cpus");
873 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_conflicts,
874     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
875     __offsetof(struct ipfw_context, ipfw_xlate_conflicts), ipfw_sysctl_stat,
876     "LU", "# address/port translations conflicts on remote cpu");
877 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, xlate_cresolved,
878     CTLTYPE_ULONG | CTLFLAG_RW, NULL,
879     __offsetof(struct ipfw_context, ipfw_xlate_cresolved), ipfw_sysctl_stat,
880     "LU", "# address/port translations conflicts resolved on remote cpu");
881 
882 static int		ipfw_state_cmp(struct ipfw_state *,
883 			    struct ipfw_state *);
884 static int		ipfw_trkcnt_cmp(struct ipfw_trkcnt *,
885 			    struct ipfw_trkcnt *);
886 static int		ipfw_track_cmp(struct ipfw_track *,
887 			    struct ipfw_track *);
888 
889 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
890 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp);
891 
892 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
893 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp);
894 
895 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
896 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp);
897 
898 static int		ipfw_chk(struct ip_fw_args *);
899 static void		ipfw_track_expire_ipifunc(void *);
900 static void		ipfw_state_expire_ipifunc(void *);
901 static void		ipfw_keepalive(void *);
902 static int		ipfw_state_expire_start(struct ipfw_context *,
903 			    int, int);
904 static void		ipfw_crossref_timeo(void *);
905 static void		ipfw_state_remove(struct ipfw_context *,
906 			    struct ipfw_state *);
907 static void		ipfw_xlat_reap_timeo(void *);
908 static void		ipfw_defrag_redispatch(struct mbuf *, int,
909 			    struct ip_fw *);
910 
911 #define IPFW_TRKCNT_TOKGET	lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token)
912 #define IPFW_TRKCNT_TOKREL	lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token)
913 #define IPFW_TRKCNT_TOKINIT	\
914 	lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt");
915 
916 static void
917 sa_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
918     const struct sockaddr *netmask)
919 {
920 	const u_char *cp1 = (const u_char *)src;
921 	u_char *cp2 = (u_char *)dst;
922 	const u_char *cp3 = (const u_char *)netmask;
923 	u_char *cplim = cp2 + *cp3;
924 	u_char *cplim2 = cp2 + *cp1;
925 
926 	*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
927 	cp3 += 2;
928 	if (cplim > cplim2)
929 		cplim = cplim2;
930 	while (cp2 < cplim)
931 		*cp2++ = *cp1++ & *cp3++;
932 	if (cp2 < cplim2)
933 		bzero(cp2, cplim2 - cp2);
934 }
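
#if 0
/*
 * Editor's usage example for sa_maskedcopy() (not in the original
 * source): mask a host address down to its network so it can serve
 * as a radix-tree key.  All sin_len/sin_family fields are assumed
 * to be initialized on all three sockaddrs.
 */
	struct sockaddr_in host, mask, net;

	host.sin_addr.s_addr = htonl(0x0a010203);	/* 10.1.2.3 */
	mask.sin_addr.s_addr = htonl(0xffffff00);	/* 255.255.255.0 */
	sa_maskedcopy((struct sockaddr *)&host, (struct sockaddr *)&net,
	    (struct sockaddr *)&mask);
	/* net.sin_addr now holds 10.1.2.0; bytes beyond the netmask's
	 * sa_len are zeroed. */
#endif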
935 
936 static __inline uint16_t
937 pfil_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
938 {
939 	uint32_t l;
940 
941 	if (udp && !cksum)
942 		return (0x0000);
943 	l = cksum + old - new;
944 	l = (l >> 16) + (l & 65535);
945 	l = l & 65535;
946 	if (udp && !l)
947 		return (0xFFFF);
948 	return (l);
949 }
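
/*
 * Editor's note (not in the original source): this is the standard
 * incremental checksum update (in the spirit of RFC 1624); rewriting
 * one 16-bit word of a packet from `old' to `new' adjusts the one's
 * complement sum by their difference, e.g.
 *
 *	ip->ip_sum = pfil_cksum_fixup(ip->ip_sum, old16, new16, 0);
 *
 * The two `udp' special cases preserve UDP's "no checksum" encoding:
 * an incoming 0x0000 stays 0x0000, and a result that folds to zero
 * is emitted as 0xFFFF.
 */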
950 
951 static __inline void
952 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport,
953     in_addr_t daddr, uint16_t dport, uint8_t proto)
954 {
955 
956 	key->proto = proto;
957 	key->swap = 0;
958 
959 	if (saddr < daddr) {
960 		key->addr_u.addrs.addr1 = daddr;
961 		key->addr_u.addrs.addr2 = saddr;
962 		key->swap |= IPFW_KEY_SWAP_ADDRS;
963 	} else {
964 		key->addr_u.addrs.addr1 = saddr;
965 		key->addr_u.addrs.addr2 = daddr;
966 	}
967 
968 	if (sport < dport) {
969 		key->port_u.ports.port1 = dport;
970 		key->port_u.ports.port2 = sport;
971 		key->swap |= IPFW_KEY_SWAP_PORTS;
972 	} else {
973 		key->port_u.ports.port1 = sport;
974 		key->port_u.ports.port2 = dport;
975 	}
976 
977 	if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS))
978 		key->swap |= IPFW_KEY_SWAP_PORTS;
979 	if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS))
980 		key->swap |= IPFW_KEY_SWAP_ADDRS;
981 }
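
#if 0
/*
 * Editor's example (not in the original source; `saddr' etc. are
 * hypothetical): both directions of a flow build byte-identical
 * addr_u/port_u/proto fields; only the IPFW_KEY_SWAP_* bits differ,
 * and ipfw_state_cmp() below treats swap values differing in both
 * bits (IPFW_KEY_SWAP_ALL) as equal, so one state matches the flow
 * regardless of direction.
 */
	struct ipfw_key k1, k2;

	ipfw_key_build(&k1, saddr, sport, daddr, dport, IPPROTO_TCP);
	ipfw_key_build(&k2, daddr, dport, saddr, sport, IPPROTO_TCP);
	/* k1.addr_u.value == k2.addr_u.value,
	 * k1.port_u.value == k2.port_u.value,
	 * (k1.swap ^ k2.swap) is 0 or IPFW_KEY_SWAP_ALL. */
#endif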
982 
983 static __inline void
984 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport,
985     in_addr_t *daddr, uint16_t *dport)
986 {
987 
988 	if (key->swap & IPFW_KEY_SWAP_ADDRS) {
989 		*saddr = key->addr_u.addrs.addr2;
990 		*daddr = key->addr_u.addrs.addr1;
991 	} else {
992 		*saddr = key->addr_u.addrs.addr1;
993 		*daddr = key->addr_u.addrs.addr2;
994 	}
995 
996 	if (key->swap & IPFW_KEY_SWAP_PORTS) {
997 		*sport = key->port_u.ports.port2;
998 		*dport = key->port_u.ports.port1;
999 	} else {
1000 		*sport = key->port_u.ports.port1;
1001 		*dport = key->port_u.ports.port2;
1002 	}
1003 }
1004 
1005 static int
1006 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2)
1007 {
1008 
1009 	if (s1->st_proto > s2->st_proto)
1010 		return (1);
1011 	if (s1->st_proto < s2->st_proto)
1012 		return (-1);
1013 
1014 	if (s1->st_addrs > s2->st_addrs)
1015 		return (1);
1016 	if (s1->st_addrs < s2->st_addrs)
1017 		return (-1);
1018 
1019 	if (s1->st_ports > s2->st_ports)
1020 		return (1);
1021 	if (s1->st_ports < s2->st_ports)
1022 		return (-1);
1023 
1024 	if (s1->st_swap == s2->st_swap ||
1025 	    (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL)
1026 		return (0);
1027 
1028 	if (s1->st_swap > s2->st_swap)
1029 		return (1);
1030 	else
1031 		return (-1);
1032 }
1033 
1034 static int
1035 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2)
1036 {
1037 
1038 	if (t1->tc_proto > t2->tc_proto)
1039 		return (1);
1040 	if (t1->tc_proto < t2->tc_proto)
1041 		return (-1);
1042 
1043 	if (t1->tc_addrs > t2->tc_addrs)
1044 		return (1);
1045 	if (t1->tc_addrs < t2->tc_addrs)
1046 		return (-1);
1047 
1048 	if (t1->tc_ports > t2->tc_ports)
1049 		return (1);
1050 	if (t1->tc_ports < t2->tc_ports)
1051 		return (-1);
1052 
1053 	if (t1->tc_ruleid > t2->tc_ruleid)
1054 		return (1);
1055 	if (t1->tc_ruleid < t2->tc_ruleid)
1056 		return (-1);
1057 
1058 	return (0);
1059 }
1060 
1061 static int
1062 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2)
1063 {
1064 
1065 	if (t1->t_proto > t2->t_proto)
1066 		return (1);
1067 	if (t1->t_proto < t2->t_proto)
1068 		return (-1);
1069 
1070 	if (t1->t_addrs > t2->t_addrs)
1071 		return (1);
1072 	if (t1->t_addrs < t2->t_addrs)
1073 		return (-1);
1074 
1075 	if (t1->t_ports > t2->t_ports)
1076 		return (1);
1077 	if (t1->t_ports < t2->t_ports)
1078 		return (-1);
1079 
1080 	if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule)
1081 		return (1);
1082 	if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule)
1083 		return (-1);
1084 
1085 	return (0);
1086 }
1087 
1088 static __inline struct ipfw_state *
1089 ipfw_state_link(struct ipfw_context *ctx, struct ipfw_state *s)
1090 {
1091 	struct ipfw_state *dup;
1092 
1093 	KASSERT((s->st_flags & IPFW_STATE_F_LINKED) == 0,
1094 	    ("state %p was linked", s));
1095 	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1096 	if (dup == NULL) {
1097 		TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);
1098 		s->st_flags |= IPFW_STATE_F_LINKED;
1099 	}
1100 	return (dup);
1101 }
1102 
1103 static __inline void
1104 ipfw_state_unlink(struct ipfw_context *ctx, struct ipfw_state *s)
1105 {
1106 
1107 	KASSERT(s->st_flags & IPFW_STATE_F_LINKED,
1108 	    ("state %p was not linked", s));
1109 	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
1110 	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
1111 	s->st_flags &= ~IPFW_STATE_F_LINKED;
1112 }
1113 
1114 static void
1115 ipfw_state_max_set(int state_max)
1116 {
1117 
1118 	ipfw_state_max = state_max;
1119 	/* Allow 5% state over-allocation. */
1120 	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
1121 }
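
/*
 * Editor's note (not in the original source): with the default
 * ipfw_state_max of 4096 on, say, 4 netisr cpus, each cpu re-syncs
 * the global loose counter after (4096 / 20) / 4 = 51 local state
 * creations, bounding the over-allocation to roughly 5%.
 */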
1122 
1123 static __inline int
1124 ipfw_state_cntcoll(void)
1125 {
1126 	int cpu, state_cnt = 0;
1127 
1128 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
1129 		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
1130 	return (state_cnt);
1131 }
1132 
1133 static __inline int
1134 ipfw_state_cntsync(void)
1135 {
1136 	int state_cnt;
1137 
1138 	state_cnt = ipfw_state_cntcoll();
1139 	ipfw_gd.ipfw_state_loosecnt = state_cnt;
1140 	return (state_cnt);
1141 }
1142 
1143 static __inline int
1144 ipfw_free_rule(struct ip_fw *rule)
1145 {
1146 	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
1147 	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
1148 	rule->refcnt--;
1149 	if (rule->refcnt == 0) {
1150 		if (rule->cross_rules != NULL)
1151 			kfree(rule->cross_rules, M_IPFW);
1152 		kfree(rule, M_IPFW);
1153 		return 1;
1154 	}
1155 	return 0;
1156 }
1157 
1158 static void
1159 ipfw_unref_rule(void *priv)
1160 {
1161 	ipfw_free_rule(priv);
1162 #ifdef KLD_MODULE
1163 	KASSERT(ipfw_gd.ipfw_refcnt > 0,
1164 	    ("invalid ipfw_refcnt %d", ipfw_gd.ipfw_refcnt));
1165 	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
1166 #endif
1167 }
1168 
1169 static __inline void
1170 ipfw_ref_rule(struct ip_fw *rule)
1171 {
1172 	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
1173 #ifdef KLD_MODULE
1174 	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
1175 #endif
1176 	rule->refcnt++;
1177 }
1178 
1179 /*
1180  * This macro maps an ip pointer into a layer3 header pointer of type T
1181  */
1182 #define	L3HDR(T, ip) ((T *)((uint32_t *)(ip) + (ip)->ip_hl))
1183 
1184 static __inline int
1185 icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
1186 {
1187 	int type = L3HDR(struct icmp, ip)->icmp_type;
1188 	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1189 	int idx = type / 32;
1190 
1191 	if (idx >= idx_max)
1192 		return (0);
1193 	return (cmd->d[idx] & (1 << (type % 32)));
1194 }
1195 
1196 static __inline int
1197 icmpcode_match(struct ip *ip, ipfw_insn_u32 *cmd)
1198 {
1199 	int code = L3HDR(struct icmp, ip)->icmp_code;
1200 	int idx_max = F_LEN(&cmd->o) - F_INSN_SIZE(ipfw_insn);
1201 	int idx = code / 32;
1202 
1203 	if (idx >= idx_max)
1204 		return (0);
1205 	return (cmd->d[idx] & (1 << (code % 32)));
1206 }
1207 
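/*
 * Editor's example (not in the original source): the icmptypes
 * bitmap packs one bit per ICMP type into cmd->d[].  For
 * "icmptypes 0,8" userland would set bits 0 and 8 of d[0]:
 *
 *	cmd->d[0] = (1 << ICMP_ECHOREPLY) | (1 << ICMP_ECHO);
 *
 * icmptype_match() then indexes d[type / 32] and tests bit
 * (type % 32), rejecting types beyond the instruction's length.
 */
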
1208 #define TT	((1 << ICMP_ECHO) | \
1209 		 (1 << ICMP_ROUTERSOLICIT) | \
1210 		 (1 << ICMP_TSTAMP) | \
1211 		 (1 << ICMP_IREQ) | \
1212 		 (1 << ICMP_MASKREQ))
1213 
1214 static int
1215 is_icmp_query(struct ip *ip)
1216 {
1217 	int type = L3HDR(struct icmp, ip)->icmp_type;
1218 
1219 	return (type < 32 && (TT & (1 << type)));
1220 }
1221 
1222 #undef TT
1223 
1224 /*
1225  * The following checks use two arrays of 8 or 16 bits to store the
1226  * bits that we want set or clear, respectively. They are in the
1227  * low and high half of cmd->arg1 or cmd->d[0].
1228  *
1229  * We scan options and store the bits we find set. We succeed if
1230  *
1231  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
1232  *
1233  * The code is sometimes optimized not to store additional variables.
1234  */
1235 static int
1236 flags_match(ipfw_insn *cmd, uint8_t bits)
1237 {
1238 	u_char want_clear;
1239 	bits = ~bits;
1240 
1241 	if (((cmd->arg1 & 0xff) & bits) != 0)
1242 		return 0; /* some bits we want set were clear */
1243 
1244 	want_clear = (cmd->arg1 >> 8) & 0xff;
1245 	if ((want_clear & bits) != want_clear)
1246 		return 0; /* some bits we want clear were set */
1247 	return 1;
1248 }
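
/*
 * Editor's example (not in the original source): for a match like
 * "tcpflags syn,!ack", the low byte of cmd->arg1 carries the bits
 * that must be set and the high byte the bits that must be clear:
 *
 *	cmd->arg1 = TH_SYN | (TH_ACK << 8);
 *
 * flags_match(cmd, th_flags) then succeeds only for segments with
 * SYN set and ACK clear.
 */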
1249 
1250 static int
1251 ipopts_match(struct ip *ip, ipfw_insn *cmd)
1252 {
1253 	int optlen, bits = 0;
1254 	u_char *cp = (u_char *)(ip + 1);
1255 	int x = (ip->ip_hl << 2) - sizeof(struct ip);
1256 
1257 	for (; x > 0; x -= optlen, cp += optlen) {
1258 		int opt = cp[IPOPT_OPTVAL];
1259 
1260 		if (opt == IPOPT_EOL)
1261 			break;
1262 
1263 		if (opt == IPOPT_NOP) {
1264 			optlen = 1;
1265 		} else {
1266 			optlen = cp[IPOPT_OLEN];
1267 			if (optlen <= 0 || optlen > x)
1268 				return 0; /* invalid or truncated */
1269 		}
1270 
1271 		switch (opt) {
1272 		case IPOPT_LSRR:
1273 			bits |= IP_FW_IPOPT_LSRR;
1274 			break;
1275 
1276 		case IPOPT_SSRR:
1277 			bits |= IP_FW_IPOPT_SSRR;
1278 			break;
1279 
1280 		case IPOPT_RR:
1281 			bits |= IP_FW_IPOPT_RR;
1282 			break;
1283 
1284 		case IPOPT_TS:
1285 			bits |= IP_FW_IPOPT_TS;
1286 			break;
1287 
1288 		default:
1289 			break;
1290 		}
1291 	}
1292 	return (flags_match(cmd, bits));
1293 }
1294 
1295 static int
1296 tcpopts_match(struct ip *ip, ipfw_insn *cmd)
1297 {
1298 	int optlen, bits = 0;
1299 	struct tcphdr *tcp = L3HDR(struct tcphdr, ip);
1300 	u_char *cp = (u_char *)(tcp + 1);
1301 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
1302 
1303 	for (; x > 0; x -= optlen, cp += optlen) {
1304 		int opt = cp[0];
1305 
1306 		if (opt == TCPOPT_EOL)
1307 			break;
1308 
1309 		if (opt == TCPOPT_NOP) {
1310 			optlen = 1;
1311 		} else {
1312 			optlen = cp[1];
1313 			if (optlen <= 0)
1314 				break;
1315 		}
1316 
1317 		switch (opt) {
1318 		case TCPOPT_MAXSEG:
1319 			bits |= IP_FW_TCPOPT_MSS;
1320 			break;
1321 
1322 		case TCPOPT_WINDOW:
1323 			bits |= IP_FW_TCPOPT_WINDOW;
1324 			break;
1325 
1326 		case TCPOPT_SACK_PERMITTED:
1327 		case TCPOPT_SACK:
1328 			bits |= IP_FW_TCPOPT_SACK;
1329 			break;
1330 
1331 		case TCPOPT_TIMESTAMP:
1332 			bits |= IP_FW_TCPOPT_TS;
1333 			break;
1334 
1335 		case TCPOPT_CC:
1336 		case TCPOPT_CCNEW:
1337 		case TCPOPT_CCECHO:
1338 			bits |= IP_FW_TCPOPT_CC;
1339 			break;
1340 
1341 		default:
1342 			break;
1343 		}
1344 	}
1345 	return (flags_match(cmd, bits));
1346 }
1347 
1348 static int
1349 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
1350 {
1351 	if (ifp == NULL)	/* no iface with this packet, match fails */
1352 		return 0;
1353 
1354 	/* Check by name or by IP address */
1355 	if (cmd->name[0] != '\0') { /* match by name */
1356 		/* Check name */
1357 		if (cmd->p.glob) {
1358 			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
1359 				return(1);
1360 		} else {
1361 			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
1362 				return(1);
1363 		}
1364 	} else {
1365 		struct ifaddr_container *ifac;
1366 
1367 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1368 			struct ifaddr *ia = ifac->ifa;
1369 
1370 			if (ia->ifa_addr == NULL)
1371 				continue;
1372 			if (ia->ifa_addr->sa_family != AF_INET)
1373 				continue;
1374 			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
1375 			    (ia->ifa_addr))->sin_addr.s_addr)
1376 				return(1);	/* match */
1377 		}
1378 	}
1379 	return(0);	/* no match, fail ... */
1380 }
1381 
1382 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
1383 
1384 /*
1385  * We enter here when we have a rule with O_LOG.
1386  * XXX this function alone takes about 2Kbytes of code!
1387  */
1388 static void
1389 ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
1390     struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
1391 {
1392 	char *action;
1393 	int limit_reached = 0;
1394 	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];
1395 
1396 	fragment[0] = '\0';
1397 	proto[0] = '\0';
1398 
1399 	if (f == NULL) {	/* bogus pkt */
1400 		if (verbose_limit != 0 &&
1401 		    ctx->ipfw_norule_counter >= verbose_limit)
1402 			return;
1403 		ctx->ipfw_norule_counter++;
1404 		if (ctx->ipfw_norule_counter == verbose_limit)
1405 			limit_reached = verbose_limit;
1406 		action = "Refuse";
1407 	} else {	/* O_LOG is the first action, find the real one */
1408 		ipfw_insn *cmd = ACTION_PTR(f);
1409 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
1410 
1411 		if (l->max_log != 0 && l->log_left == 0)
1412 			return;
1413 		l->log_left--;
1414 		if (l->log_left == 0)
1415 			limit_reached = l->max_log;
1416 		cmd += F_LEN(cmd);	/* point to first action */
1417 		if (cmd->opcode == O_PROB)
1418 			cmd += F_LEN(cmd);
1419 
1420 		action = action2;
1421 		switch (cmd->opcode) {
1422 		case O_DENY:
1423 			action = "Deny";
1424 			break;
1425 
1426 		case O_REJECT:
1427 			if (cmd->arg1 == ICMP_REJECT_RST) {
1428 				action = "Reset";
1429 			} else if (cmd->arg1 == ICMP_UNREACH_HOST) {
1430 				action = "Reject";
1431 			} else {
1432 				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
1433 					  cmd->arg1);
1434 			}
1435 			break;
1436 
1437 		case O_ACCEPT:
1438 			action = "Accept";
1439 			break;
1440 
1441 		case O_COUNT:
1442 			action = "Count";
1443 			break;
1444 
1445 		case O_DIVERT:
1446 			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
1447 			break;
1448 
1449 		case O_TEE:
1450 			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
1451 			break;
1452 
1453 		case O_SKIPTO:
1454 			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
1455 			break;
1456 
1457 		case O_PIPE:
1458 			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
1459 			break;
1460 
1461 		case O_QUEUE:
1462 			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
1463 			break;
1464 
1465 		case O_FORWARD_IP:
1466 			{
1467 				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
1468 				int len;
1469 
1470 				len = ksnprintf(SNPARGS(action2, 0),
1471 				    "Forward to %s",
1472 				    kinet_ntoa(sa->sa.sin_addr, abuf));
1473 				if (sa->sa.sin_port) {
1474 					ksnprintf(SNPARGS(action2, len), ":%d",
1475 						  sa->sa.sin_port);
1476 				}
1477 			}
1478 			break;
1479 
1480 		default:
1481 			action = "UNKNOWN";
1482 			break;
1483 		}
1484 	}
1485 
1486 	if (hlen == 0) {	/* non-ip */
1487 		ksnprintf(SNPARGS(proto, 0), "MAC");
1488 	} else {
1489 		struct ip *ip = mtod(m, struct ip *);
1490 		/* these three are all aliases to the same thing */
1491 		struct icmp *const icmp = L3HDR(struct icmp, ip);
1492 		struct tcphdr *const tcp = (struct tcphdr *)icmp;
1493 		struct udphdr *const udp = (struct udphdr *)icmp;
1494 
1495 		int ip_off, offset, ip_len;
1496 		int len;
1497 
1498 		if (eh != NULL) { /* layer 2 packets are as on the wire */
1499 			ip_off = ntohs(ip->ip_off);
1500 			ip_len = ntohs(ip->ip_len);
1501 		} else {
1502 			ip_off = ip->ip_off;
1503 			ip_len = ip->ip_len;
1504 		}
1505 		offset = ip_off & IP_OFFMASK;
1506 		switch (ip->ip_p) {
1507 		case IPPROTO_TCP:
1508 			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
1509 					kinet_ntoa(ip->ip_src, abuf));
1510 			if (offset == 0) {
1511 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1512 					  ntohs(tcp->th_sport),
1513 					  kinet_ntoa(ip->ip_dst, abuf),
1514 					  ntohs(tcp->th_dport));
1515 			} else {
1516 				ksnprintf(SNPARGS(proto, len), " %s",
1517 					  kinet_ntoa(ip->ip_dst, abuf));
1518 			}
1519 			break;
1520 
1521 		case IPPROTO_UDP:
1522 			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
1523 					kinet_ntoa(ip->ip_src, abuf));
1524 			if (offset == 0) {
1525 				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
1526 					  ntohs(udp->uh_sport),
1527 					  kinet_ntoa(ip->ip_dst, abuf),
1528 					  ntohs(udp->uh_dport));
1529 			} else {
1530 				ksnprintf(SNPARGS(proto, len), " %s",
1531 					  kinet_ntoa(ip->ip_dst, abuf));
1532 			}
1533 			break;
1534 
1535 		case IPPROTO_ICMP:
1536 			if (offset == 0) {
1537 				len = ksnprintf(SNPARGS(proto, 0),
1538 						"ICMP:%u.%u ",
1539 						icmp->icmp_type,
1540 						icmp->icmp_code);
1541 			} else {
1542 				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
1543 			}
1544 			len += ksnprintf(SNPARGS(proto, len), "%s",
1545 					 kinet_ntoa(ip->ip_src, abuf));
1546 			ksnprintf(SNPARGS(proto, len), " %s",
1547 				  kinet_ntoa(ip->ip_dst, abuf));
1548 			break;
1549 
1550 		default:
1551 			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
1552 					kinet_ntoa(ip->ip_src, abuf));
1553 			ksnprintf(SNPARGS(proto, len), " %s",
1554 				  kinet_ntoa(ip->ip_dst, abuf));
1555 			break;
1556 		}
1557 
1558 		if (ip_off & (IP_MF | IP_OFFMASK)) {
1559 			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
1560 				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
1561 				  offset << 3, (ip_off & IP_MF) ? "+" : "");
1562 		}
1563 	}
1564 
1565 	if (oif || m->m_pkthdr.rcvif) {
1566 		log(LOG_SECURITY | LOG_INFO,
1567 		    "ipfw: %d %s %s %s via %s%s\n",
1568 		    f ? f->rulenum : -1,
1569 		    action, proto, oif ? "out" : "in",
1570 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
1571 		    fragment);
1572 	} else {
1573 		log(LOG_SECURITY | LOG_INFO,
1574 		    "ipfw: %d %s %s [no if info]%s\n",
1575 		    f ? f->rulenum : -1,
1576 		    action, proto, fragment);
1577 	}
1578 
1579 	if (limit_reached) {
1580 		log(LOG_SECURITY | LOG_NOTICE,
1581 		    "ipfw: limit %d reached on entry %d\n",
1582 		    limit_reached, f ? f->rulenum : -1);
1583 	}
1584 }
1585 
1586 #undef SNPARGS
1587 
1588 static void
1589 ipfw_xlat_reap(struct ipfw_xlat *x, struct ipfw_xlat *slave_x)
1590 {
1591 	struct ip_fw *rule = slave_x->xlat_rule;
1592 
1593 	KKASSERT(rule->cpuid == mycpuid);
1594 
1595 	/* No more cross references; free this pair now. */
1596 	kfree(x, M_IPFW);
1597 	kfree(slave_x, M_IPFW);
1598 
1599 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1600 	rule->cross_refs--;
1601 }
1602 
1603 static void
1604 ipfw_xlat_reap_dispatch(netmsg_t nm)
1605 {
1606 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1607 	struct ipfw_state *s, *ns;
1608 
1609 	ASSERT_NETISR_NCPUS(mycpuid);
1610 
1611 	crit_enter();
1612 	/* Reply ASAP. */
1613 	netisr_replymsg(&ctx->ipfw_xlatreap_nm, 0);
1614 	crit_exit();
1615 
1616 	/* TODO: limit scanning depth */
1617 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_xlatreap, st_link, ns) {
1618 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
1619 		struct ipfw_xlat *slave_x = x->xlat_pair;
1620 		uint64_t crefs;
1621 
1622 		crefs = slave_x->xlat_crefs + x->xlat_crefs;
1623 		if (crefs == 0) {
1624 			TAILQ_REMOVE(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1625 			ipfw_xlat_reap(x, slave_x);
1626 		}
1627 	}
1628 	if (!TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1629 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1630 		    &ctx->ipfw_xlatreap_nm);
1631 	}
1632 }
1633 
1634 static void
1635 ipfw_xlat_reap_timeo(void *xnm)
1636 {
1637 	struct netmsg_base *nm = xnm;
1638 
1639 	KKASSERT(mycpuid < netisr_ncpus);
1640 
1641 	crit_enter();
1642 	if (nm->lmsg.ms_flags & MSGF_DONE)
1643 		netisr_sendmsg_oncpu(nm);
1644 	crit_exit();
1645 }
1646 
1647 static void
1648 ipfw_xlat_free_dispatch(netmsg_t nmsg)
1649 {
1650 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1651 	struct ipfw_xlat *x = nmsg->lmsg.u.ms_resultp;
1652 	struct ipfw_xlat *slave_x = x->xlat_pair;
1653 	uint64_t crefs;
1654 
1655 	ASSERT_NETISR_NCPUS(mycpuid);
1656 
1657 	KKASSERT(slave_x != NULL);
1658 	KKASSERT(slave_x->xlat_invalid && x->xlat_invalid);
1659 
1660 	KASSERT((x->xlat_flags & IPFW_STATE_F_LINKED) == 0,
1661 	    ("master xlat is still linked"));
1662 	if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1663 		ipfw_state_unlink(ctx, &slave_x->xlat_st);
1664 
1665 	/* See the comment in ipfw_ip_xlate_dispatch(). */
1666 	slave_x->xlat_crefs--;
1667 
1668 	crefs = slave_x->xlat_crefs + x->xlat_crefs;
1669 	if (crefs == 0) {
1670 		ipfw_xlat_reap(x, slave_x);
1671 		return;
1672 	}
1673 
1674 	if (TAILQ_EMPTY(&ctx->ipfw_xlatreap)) {
1675 		callout_reset(&ctx->ipfw_xlatreap_ch, 2, ipfw_xlat_reap_timeo,
1676 		    &ctx->ipfw_xlatreap_nm);
1677 	}
1678 
1679 	/*
1680 	 * This pair is still referenced; defer its destruction.
1681 	 * YYY reuse st_link.
1682 	 */
1683 	TAILQ_INSERT_TAIL(&ctx->ipfw_xlatreap, &x->xlat_st, st_link);
1684 }
1685 
1686 static __inline void
1687 ipfw_xlat_invalidate(struct ipfw_xlat *x)
1688 {
1689 
1690 	x->xlat_invalid = 1;
1691 	x->xlat_pair->xlat_invalid = 1;
1692 }
1693 
1694 static void
1695 ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
1696 {
1697 	struct ipfw_xlat *x, *slave_x;
1698 	struct netmsg_base *nm;
1699 
1700 	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT ||
1701 	    IPFW_ISXLAT(s->st_type), ("invalid state type %u", s->st_type));
1702 	KASSERT((s->st_flags & IPFW_STATE_F_XLATSLAVE) == 0,
1703 	    ("delete slave xlat"));
1704 
1705 	KASSERT(ctx->ipfw_state_cnt > 0,
1706 	    ("invalid state count %d", ctx->ipfw_state_cnt));
1707 	ctx->ipfw_state_cnt--;
1708 	if (ctx->ipfw_state_loosecnt > 0)
1709 		ctx->ipfw_state_loosecnt--;
1710 
1711 	/*
1712 	 * Unhook this state.
1713 	 */
1714 	if (s->st_track != NULL) {
1715 		struct ipfw_track *t = s->st_track;
1716 
1717 		KASSERT(!LIST_EMPTY(&t->t_state_list),
1718 		    ("track state list is empty"));
1719 		LIST_REMOVE(s, st_trklink);
1720 
1721 		KASSERT(*t->t_count > 0,
1722 		    ("invalid track count %d", *t->t_count));
1723 		atomic_subtract_int(t->t_count, 1);
1724 	}
1725 	ipfw_state_unlink(ctx, s);
1726 
1727 	/*
1728 	 * Free this state.  Xlat requires special processing,
1729 	 * since xlats are paired states and the pair could live on
1730 	 * different cpus.
1731 	 */
1732 
1733 	if (!IPFW_ISXLAT(s->st_type)) {
1734 		/* Not xlat; free now. */
1735 		kfree(s, M_IPFW);
1736 		/* Done! */
1737 		return;
1738 	}
1739 	x = (struct ipfw_xlat *)s;
1740 
1741 	if (x->xlat_pair == NULL) {
1742 		/* Not setup yet; free now. */
1743 		kfree(x, M_IPFW);
1744 		/* Done! */
1745 		return;
1746 	}
1747 	slave_x = x->xlat_pair;
1748 	KKASSERT(slave_x->xlat_flags & IPFW_STATE_F_XLATSLAVE);
1749 
1750 	if (x->xlat_pcpu == mycpuid) {
1751 		/*
1752 		 * Paired states are on the same cpu; delete this
1753 		 * pair now.
1754 		 */
1755 		KKASSERT(x->xlat_crefs == 0);
1756 		KKASSERT(slave_x->xlat_crefs == 0);
1757 		if (slave_x->xlat_flags & IPFW_STATE_F_LINKED)
1758 			ipfw_state_unlink(ctx, &slave_x->xlat_st);
1759 		kfree(x, M_IPFW);
1760 		kfree(slave_x, M_IPFW);
1761 		return;
1762 	}
1763 
1764 	/*
1765 	 * Free the paired states on the cpu owning the slave xlat.
1766 	 */
1767 
1768 	/*
1769 	 * Mark the state pair invalid; completely deleting them
1770 	 * may take some time.
1771 	 */
1772 	ipfw_xlat_invalidate(x);
1773 
1774 	nm = &x->xlat_freenm;
1775 	netmsg_init(nm, NULL, &netisr_apanic_rport, MSGF_PRIORITY,
1776 	    ipfw_xlat_free_dispatch);
1777 	nm->lmsg.u.ms_resultp = x;
1778 
1779 	/* See the comment in ipfw_xlate_redispatch(). */
1780 	x->xlat_rule->cross_refs++;
1781 	x->xlat_crefs++;
1782 
1783 	netisr_sendmsg(nm, x->xlat_pcpu);
1784 }
1785 
1786 static void
1787 ipfw_state_remove(struct ipfw_context *ctx, struct ipfw_state *s)
1788 {
1789 
1790 	if (s->st_flags & IPFW_STATE_F_XLATSLAVE) {
1791 		KKASSERT(IPFW_ISXLAT(s->st_type));
1792 		ipfw_xlat_invalidate((struct ipfw_xlat *)s);
1793 		ipfw_state_unlink(ctx, s);
1794 		return;
1795 	}
1796 	ipfw_state_del(ctx, s);
1797 }
1798 
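/*
 * Aggressively reclaim states when we run short: kick off (or ride
 * along with) the expiry walk with the scan limit ignored, deleting
 * dead and closed-TCP states.  Returns the number of states expired.
 */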
1799 static int
1800 ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
1801 {
1802 	struct ipfw_state *s, *anchor;
1803 	int expired;
1804 
1805 	if (reap_max < ipfw_state_reap_min)
1806 		reap_max = ipfw_state_reap_min;
1807 
1808 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
1809 		/*
1810 		 * Kick start state expiring.  Ignore the scan limit;
1811 		 * we are short of states.
1812 		 */
1813 		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
1814 		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
1815 		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
1816 		return (expired);
1817 	}
1818 
1819 	/*
1820 	 * States are being expired.
1821 	 */
1822 
1823 	if (ctx->ipfw_state_cnt == 0)
1824 		return (0);
1825 
1826 	expired = 0;
1827 	anchor = &ctx->ipfw_stateexp_anch;
1828 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1829 		/*
1830 		 * Ignore scan limit; we are short of states.
1831 		 */
1832 
1833 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1834 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1835 
1836 		if (IPFW_STATE_SCANSKIP(s))
1837 			continue;
1838 
1839 		if (IPFW_STATE_ISDEAD(s) || IPFW_STATE_TCPCLOSED(s)) {
1840 			ipfw_state_del(ctx, s);
1841 			if (++expired >= reap_max)
1842 				break;
1843 			if ((expired & 0xff) == 0 &&
1844 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1845 			    ipfw_state_max)
1846 				break;
1847 		}
1848 	}
1849 	/*
1850 	 * NOTE:
1851 	 * Leave the anchor on the list, even if the end of the list has
1852 	 * been reached.  ipfw_state_expire_more_dispatch() will handle
1853 	 * the removal.
1854 	 */
1855 	return (expired);
1856 }
1857 
1858 static void
1859 ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
1860 {
1861 	struct ipfw_state *s, *sn;
1862 
1863 	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
1864 		if (IPFW_STATE_SCANSKIP(s))
1865 			continue;
1866 		if (rule != NULL && s->st_rule != rule)
1867 			continue;
1868 		ipfw_state_del(ctx, s);
1869 	}
1870 }
1871 
1872 static void
1873 ipfw_state_expire_done(struct ipfw_context *ctx)
1874 {
1875 
1876 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1877 	    ("stateexp is not in progress"));
1878 	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
1879 	callout_reset(&ctx->ipfw_stateto_ch, hz,
1880 	    ipfw_state_expire_ipifunc, NULL);
1881 }
1882 
1883 static void
1884 ipfw_state_expire_more(struct ipfw_context *ctx)
1885 {
1886 	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;
1887 
1888 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1889 	    ("stateexp is not in progress"));
1890 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
1891 	    ("stateexp more did not finish"));
1892 	netisr_sendmsg_oncpu(nm);
1893 }
1894 
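/*
 * Walk the state list using 'anchor', a dummy list node that is
 * re-inserted after each scanned state.  Because the anchor marks the
 * current position, the walk can be suspended once scan_max/expire_max
 * is hit and resumed later by ipfw_state_expire_more_dispatch()
 * without holding a pointer to a state that might be freed meanwhile.
 */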
1895 static int
1896 ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
1897     int scan_max, int expire_max)
1898 {
1899 	struct ipfw_state *s;
1900 	int scanned = 0, expired = 0;
1901 
1902 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1903 	    ("stateexp is not in progress"));
1904 
1905 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
1906 		if (scanned++ >= scan_max) {
1907 			ipfw_state_expire_more(ctx);
1908 			return (expired);
1909 		}
1910 
1911 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1912 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
1913 
1914 		if (IPFW_STATE_SCANSKIP(s))
1915 			continue;
1916 
1917 		if (IPFW_STATE_ISDEAD(s) ||
1918 		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1919 		     IPFW_STATE_TCPCLOSED(s))) {
1920 			ipfw_state_del(ctx, s);
1921 			if (++expired >= expire_max) {
1922 				ipfw_state_expire_more(ctx);
1923 				return (expired);
1924 			}
1925 			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
1926 			    (expired & 0xff) == 0 &&
1927 			    ipfw_state_cntcoll() + ipfw_state_headroom <=
1928 			    ipfw_state_max) {
1929 				ipfw_state_expire_more(ctx);
1930 				return (expired);
1931 			}
1932 		}
1933 	}
1934 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1935 	ipfw_state_expire_done(ctx);
1936 	return (expired);
1937 }
1938 
1939 static void
1940 ipfw_state_expire_more_dispatch(netmsg_t nm)
1941 {
1942 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1943 	struct ipfw_state *anchor;
1944 
1945 	ASSERT_NETISR_NCPUS(mycpuid);
1946 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
1947 	    ("stateexp is not in progress"));
1948 
1949 	/* Reply ASAP */
1950 	netisr_replymsg(&nm->base, 0);
1951 
1952 	anchor = &ctx->ipfw_stateexp_anch;
1953 	if (ctx->ipfw_state_cnt == 0) {
1954 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
1955 		ipfw_state_expire_done(ctx);
1956 		return;
1957 	}
1958 	ipfw_state_expire_loop(ctx, anchor,
1959 	    ipfw_state_scan_max, ipfw_state_expire_max);
1960 }
1961 
1962 static int
1963 ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
1964 {
1965 	struct ipfw_state *anchor;
1966 
1967 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
1968 	    ("stateexp is in progress"));
1969 	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;
1970 
1971 	if (ctx->ipfw_state_cnt == 0) {
1972 		ipfw_state_expire_done(ctx);
1973 		return (0);
1974 	}
1975 
1976 	/*
1977 	 * Do not expire more than once per second; it is useless.
1978 	 */
1979 	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
1980 	    ctx->ipfw_state_lastexp == time_uptime) {
1981 		ipfw_state_expire_done(ctx);
1982 		return (0);
1983 	}
1984 	ctx->ipfw_state_lastexp = time_uptime;
1985 
1986 	anchor = &ctx->ipfw_stateexp_anch;
1987 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
1988 	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
1989 }
1990 
1991 static void
1992 ipfw_state_expire_dispatch(netmsg_t nm)
1993 {
1994 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
1995 
1996 	ASSERT_NETISR_NCPUS(mycpuid);
1997 
1998 	/* Reply ASAP */
1999 	crit_enter();
2000 	netisr_replymsg(&nm->base, 0);
2001 	crit_exit();
2002 
2003 	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
2004 		/* Running; done. */
2005 		return;
2006 	}
2007 	ipfw_state_expire_start(ctx,
2008 	    ipfw_state_scan_max, ipfw_state_expire_max);
2009 }
2010 
2011 static void
2012 ipfw_state_expire_ipifunc(void *dummy __unused)
2013 {
2014 	struct netmsg_base *msg;
2015 
2016 	KKASSERT(mycpuid < netisr_ncpus);
2017 	msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;
2018 
2019 	crit_enter();
2020 	if (msg->lmsg.ms_flags & MSGF_DONE)
2021 		netisr_sendmsg_oncpu(msg);
2022 	crit_exit();
2023 }
2024 
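/*
 * Track the highest sequence and ACK numbers seen in each direction.
 * Returns FALSE for out-of-order segments, so stale segments do not
 * advance the recorded windows or refresh the state's expiry; RST
 * segments are always accepted.
 */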
2025 static boolean_t
2026 ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
2027 {
2028 	uint32_t seq = ntohl(tcp->th_seq);
2029 	uint32_t ack = ntohl(tcp->th_ack);
2030 
2031 	if (tcp->th_flags & TH_RST)
2032 		return (TRUE);
2033 
2034 	if (dir == MATCH_FORWARD) {
2035 		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
2036 			s->st_flags |= IPFW_STATE_F_SEQFWD;
2037 			s->st_seq_fwd = seq;
2038 		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
2039 			s->st_seq_fwd = seq;
2040 		} else {
2041 			/* Out-of-sequence; done. */
2042 			return (FALSE);
2043 		}
2044 		if (tcp->th_flags & TH_ACK) {
2045 			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
2046 				s->st_flags |= IPFW_STATE_F_ACKFWD;
2047 				s->st_ack_fwd = ack;
2048 			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
2049 				s->st_ack_fwd = ack;
2050 			} else {
2051 				/* Out-of-sequence; done. */
2052 				return (FALSE);
2053 			}
2054 
2055 			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
2056 			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
2057 				s->st_state |= (TH_ACK << 8);
2058 		}
2059 	} else {
2060 		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
2061 			s->st_flags |= IPFW_STATE_F_SEQREV;
2062 			s->st_seq_rev = seq;
2063 		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
2064 			s->st_seq_rev = seq;
2065 		} else {
2066 			/* Out-of-sequence; done. */
2067 			return (FALSE);
2068 		}
2069 		if (tcp->th_flags & TH_ACK) {
2070 			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
2071 				s->st_flags |= IPFW_STATE_F_ACKREV;
2072 				s->st_ack_rev = ack;
2073 			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
2074 				s->st_ack_rev = ack;
2075 			} else {
2076 				/* Out-of-sequence; done. */
2077 				return (FALSE);
2078 			}
2079 
2080 			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
2081 			    s->st_ack_rev == s->st_seq_fwd + 1)
2082 				s->st_state |= TH_ACK;
2083 		}
2084 	}
2085 	return (TRUE);
2086 }
2087 
2088 static void
2089 ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
2090     const struct tcphdr *tcp, struct ipfw_state *s)
2091 {
2092 
2093 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
2094 		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;
2095 
2096 		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
2097 			return;
2098 
2099 		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
2100 		switch (s->st_state & IPFW_STATE_TCPSTATES) {
2101 		case TH_SYN:				/* opening */
2102 			s->st_expire = time_uptime + dyn_syn_lifetime;
2103 			break;
2104 
2105 		case BOTH_SYN:			/* move to established */
2106 		case BOTH_SYN | TH_FIN:		/* one side tries to close */
2107 		case BOTH_SYN | (TH_FIN << 8):
2108 			s->st_expire = time_uptime + dyn_ack_lifetime;
2109 			break;
2110 
2111 		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
2112 			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
2113 				/* And both FINs were ACKed. */
2114 				s->st_expire = time_uptime + dyn_fin_lifetime;
2115 			} else {
2116 				s->st_expire = time_uptime +
2117 				    dyn_finwait_lifetime;
2118 			}
2119 			break;
2120 
2121 		default:
2122 #if 0
2123 			/*
2124 			 * reset or some invalid combination, but can also
2125 			 * occur if we use keep-state the wrong way.
2126 			 */
2127 			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
2128 				kprintf("invalid state: 0x%x\n", s->st_state);
2129 #endif
2130 			s->st_expire = time_uptime + dyn_rst_lifetime;
2131 			break;
2132 		}
2133 	} else if (pkt->proto == IPPROTO_UDP) {
2134 		s->st_expire = time_uptime + dyn_udp_lifetime;
2135 	} else {
2136 		/* other protocols */
2137 		s->st_expire = time_uptime + dyn_short_lifetime;
2138 	}
2139 }
2140 
2141 /*
2142  * Lookup a state.
2143  */
2144 static struct ipfw_state *
2145 ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
2146     int *match_direction, const struct tcphdr *tcp)
2147 {
2148 	struct ipfw_state *key, *s;
2149 	int dir = MATCH_NONE;
2150 
2151 	key = &ctx->ipfw_state_tmpkey;
2152 	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
2153 	    pkt->dst_ip, pkt->dst_port, pkt->proto);
2154 	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
2155 	if (s == NULL)
2156 		goto done; /* not found. */
2157 	if (IPFW_STATE_ISDEAD(s)) {
2158 		ipfw_state_remove(ctx, s);
2159 		s = NULL;
2160 		goto done;
2161 	}
2162 	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
2163 		/* TCP port recycling is too fast. */
2164 		ctx->ipfw_sts_tcprecycled++;
2165 		ipfw_state_remove(ctx, s);
2166 		s = NULL;
2167 		goto done;
2168 	}
2169 
2170 	if (s->st_swap == key->st_swap) {
2171 		dir = MATCH_FORWARD;
2172 	} else {
2173 		KASSERT((s->st_swap & key->st_swap) == 0,
2174 		    ("found mismatch state"));
2175 		dir = MATCH_REVERSE;
2176 	}
2177 
2178 	/* Update this state. */
2179 	ipfw_state_update(pkt, dir, tcp, s);
2180 
2181 	if (s->st_track != NULL) {
2182 		/* This track has been used. */
2183 		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
2184 	}
2185 done:
2186 	if (match_direction)
2187 		*match_direction = dir;
2188 	return (s);
2189 }
2190 
2191 static struct ipfw_state *
2192 ipfw_state_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2193     uint16_t type, struct ip_fw *rule, const struct tcphdr *tcp)
2194 {
2195 	struct ipfw_state *s;
2196 	size_t sz;
2197 
2198 	KASSERT(type == O_KEEP_STATE || type == O_LIMIT || IPFW_ISXLAT(type),
2199 	    ("invalid state type %u", type));
2200 
2201 	sz = sizeof(struct ipfw_state);
2202 	if (IPFW_ISXLAT(type))
2203 		sz = sizeof(struct ipfw_xlat);
2204 
2205 	s = kmalloc(sz, M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
2206 	if (s == NULL) {
2207 		ctx->ipfw_sts_nomem++;
2208 		return (NULL);
2209 	}
2210 
2211 	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
2212 	    id->dst_ip, id->dst_port, id->proto);
2213 
2214 	s->st_rule = rule;
2215 	s->st_type = type;
2216 	if (IPFW_ISXLAT(type)) {
2217 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2218 
2219 		x->xlat_dir = MATCH_NONE;
2220 		x->xlat_pcpu = -1;
2221 	}
2222 
2223 	/*
2224 	 * Update this state:
2225 	 * Set st_expire and st_state.
2226 	 */
2227 	ipfw_state_update(id, MATCH_FORWARD, tcp, s);
2228 
2229 	return (s);
2230 }
2231 
2232 static struct ipfw_state *
2233 ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2234     uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
2235     const struct tcphdr *tcp)
2236 {
2237 	struct ipfw_state *s, *dup;
2238 
2239 	s = ipfw_state_alloc(ctx, id, type, rule, tcp);
2240 	if (s == NULL)
2241 		return (NULL);
2242 
2243 	ctx->ipfw_state_cnt++;
2244 	ctx->ipfw_state_loosecnt++;
2245 	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
2246 		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
2247 		ctx->ipfw_state_loosecnt = 0;
2248 	}
2249 
2250 	dup = ipfw_state_link(ctx, s);
2251 	if (dup != NULL)
2252 		panic("ipfw: %u state exists %p", type, dup);
2253 
2254 	if (t != NULL) {
2255 		/* Keep the track referenced. */
2256 		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
2257 		s->st_track = t;
2258 	}
2259 	return (s);
2260 }
2261 
2262 static boolean_t
2263 ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
2264 {
2265 	struct ipfw_trkcnt *trk;
2266 	boolean_t trk_freed = FALSE;
2267 
2268 	KASSERT(t->t_count != NULL, ("track anchor"));
2269 	KASSERT(LIST_EMPTY(&t->t_state_list),
2270 	    ("invalid track is still referenced"));
2271 
2272 	trk = t->t_trkcnt;
2273 	KASSERT(trk != NULL, ("track has no trkcnt"));
2274 
2275 	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2276 	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
2277 	kfree(t, M_IPFW);
2278 
2279 	/*
2280 	 * fdrop() style reference counting.
2281 	 * See kern/kern_descrip.c fdrop().
2282 	 */
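	/*
	 * Only the dropper of the last reference (tc_refs 1 -> 0) takes
	 * the global token and detaches the trkcnt from the global tree;
	 * everyone else just decrements.  Failed cmpsets retry the loop.
	 */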
2283 	for (;;) {
2284 		int refs = trk->tc_refs;
2285 
2286 		cpu_ccfence();
2287 		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
2288 		if (refs == 1) {
2289 			IPFW_TRKCNT_TOKGET;
2290 			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
2291 				KASSERT(trk->tc_count == 0,
2292 				    ("%d states reference this trkcnt",
2293 				     trk->tc_count));
2294 				RB_REMOVE(ipfw_trkcnt_tree,
2295 				    &ipfw_gd.ipfw_trkcnt_tree, trk);
2296 
2297 				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
2298 				    ("invalid trkcnt cnt %d",
2299 				     ipfw_gd.ipfw_trkcnt_cnt));
2300 				ipfw_gd.ipfw_trkcnt_cnt--;
2301 				IPFW_TRKCNT_TOKREL;
2302 
2303 				if (ctx->ipfw_trkcnt_spare == NULL)
2304 					ctx->ipfw_trkcnt_spare = trk;
2305 				else
2306 					kfree(trk, M_IPFW);
2307 				trk_freed = TRUE;
2308 				break; /* done! */
2309 			}
2310 			IPFW_TRKCNT_TOKREL;
2311 			/* retry */
2312 		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
2313 			break; /* done! */
2314 		}
2315 		/* retry */
2316 	}
2317 	return (trk_freed);
2318 }
2319 
2320 static void
2321 ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
2322 {
2323 	struct ipfw_track *t, *tn;
2324 
2325 	TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
2326 		if (t->t_count == NULL) /* anchor */
2327 			continue;
2328 		if (rule != NULL && t->t_rule != rule)
2329 			continue;
2330 		ipfw_track_free(ctx, t);
2331 	}
2332 }
2333 
2334 static boolean_t
2335 ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
2336     boolean_t reap)
2337 {
2338 	struct ipfw_state *s, *sn;
2339 	boolean_t ret = FALSE;
2340 
2341 	KASSERT(t->t_count != NULL, ("track anchor"));
2342 
2343 	if (LIST_EMPTY(&t->t_state_list))
2344 		return (FALSE);
2345 
2346 	/*
2347 	 * Do not expire more than once per second; it is useless.
2348 	 */
2349 	if (t->t_lastexp == time_uptime)
2350 		return (FALSE);
2351 	t->t_lastexp = time_uptime;
2352 
2353 	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
2354 		if (IPFW_STATE_ISDEAD(s) || (reap && IPFW_STATE_TCPCLOSED(s))) {
2355 			KASSERT(s->st_track == t,
2356 			    ("state track %p does not match %p",
2357 			     s->st_track, t));
2358 			ipfw_state_del(ctx, s);
2359 			ret = TRUE;
2360 		}
2361 	}
2362 	return (ret);
2363 }
2364 
2365 static __inline struct ipfw_trkcnt *
2366 ipfw_trkcnt_alloc(struct ipfw_context *ctx)
2367 {
2368 	struct ipfw_trkcnt *trk;
2369 
2370 	if (ctx->ipfw_trkcnt_spare != NULL) {
2371 		trk = ctx->ipfw_trkcnt_spare;
2372 		ctx->ipfw_trkcnt_spare = NULL;
2373 	} else {
2374 		trk = kmalloc(sizeof(*trk), M_IPFW,
2375 			      M_INTWAIT | M_NULLOK | M_CACHEALIGN);
2376 	}
2377 	return (trk);
2378 }
2379 
2380 static void
2381 ipfw_track_expire_done(struct ipfw_context *ctx)
2382 {
2383 
2384 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2385 	    ("trackexp is not in progress"));
2386 	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
2387 	callout_reset(&ctx->ipfw_trackto_ch, hz,
2388 	    ipfw_track_expire_ipifunc, NULL);
2389 }
2390 
2391 static void
2392 ipfw_track_expire_more(struct ipfw_context *ctx)
2393 {
2394 	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;
2395 
2396 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2397 	    ("trackexp is not in progress"));
2398 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
2399 	    ("trackexp more did not finish"));
2400 	netisr_sendmsg_oncpu(nm);
2401 }
2402 
2403 static int
2404 ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
2405     int scan_max, int expire_max)
2406 {
2407 	struct ipfw_track *t;
2408 	int scanned = 0, expired = 0;
2409 	boolean_t reap = FALSE;
2410 
2411 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2412 	    ("trackexp is not in progress"));
2413 
2414 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
2415 		reap = TRUE;
2416 
2417 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2418 		if (scanned++ >= scan_max) {
2419 			ipfw_track_expire_more(ctx);
2420 			return (expired);
2421 		}
2422 
2423 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2424 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2425 
2426 		if (t->t_count == NULL) /* anchor */
2427 			continue;
2428 
2429 		ipfw_track_state_expire(ctx, t, reap);
2430 		if (!LIST_EMPTY(&t->t_state_list)) {
2431 			/* There are states referencing this track. */
2432 			continue;
2433 		}
2434 
2435 		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
2436 			/* Expired. */
2437 			if (ipfw_track_free(ctx, t)) {
2438 				if (++expired >= expire_max) {
2439 					ipfw_track_expire_more(ctx);
2440 					return (expired);
2441 				}
2442 			}
2443 		}
2444 	}
2445 	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2446 	ipfw_track_expire_done(ctx);
2447 	return (expired);
2448 }
2449 
2450 static int
2451 ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
2452 {
2453 	struct ipfw_track *anchor;
2454 
2455 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
2456 	    ("trackexp is in progress"));
2457 	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;
2458 
2459 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2460 		ipfw_track_expire_done(ctx);
2461 		return (0);
2462 	}
2463 
2464 	/*
2465 	 * Do not expire more than once per second; it is useless.
2466 	 */
2467 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
2468 	    ctx->ipfw_track_lastexp == time_uptime) {
2469 		ipfw_track_expire_done(ctx);
2470 		return (0);
2471 	}
2472 	ctx->ipfw_track_lastexp = time_uptime;
2473 
2474 	anchor = &ctx->ipfw_trackexp_anch;
2475 	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
2476 	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
2477 }
2478 
2479 static void
2480 ipfw_track_expire_more_dispatch(netmsg_t nm)
2481 {
2482 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2483 	struct ipfw_track *anchor;
2484 
2485 	ASSERT_NETISR_NCPUS(mycpuid);
2486 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
2487 	    ("trackexp is not in progress"));
2488 
2489 	/* Reply ASAP */
2490 	netisr_replymsg(&nm->base, 0);
2491 
2492 	anchor = &ctx->ipfw_trackexp_anch;
2493 	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
2494 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2495 		ipfw_track_expire_done(ctx);
2496 		return;
2497 	}
2498 	ipfw_track_expire_loop(ctx, anchor,
2499 	    ipfw_track_scan_max, ipfw_track_expire_max);
2500 }
2501 
2502 static void
2503 ipfw_track_expire_dispatch(netmsg_t nm)
2504 {
2505 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
2506 
2507 	ASSERT_NETISR_NCPUS(mycpuid);
2508 
2509 	/* Reply ASAP */
2510 	crit_enter();
2511 	netisr_replymsg(&nm->base, 0);
2512 	crit_exit();
2513 
2514 	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
2515 		/* Running; done. */
2516 		return;
2517 	}
2518 	ipfw_track_expire_start(ctx,
2519 	    ipfw_track_scan_max, ipfw_track_expire_max);
2520 }
2521 
2522 static void
2523 ipfw_track_expire_ipifunc(void *dummy __unused)
2524 {
2525 	struct netmsg_base *msg;
2526 
2527 	KKASSERT(mycpuid < netisr_ncpus);
2528 	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;
2529 
2530 	crit_enter();
2531 	if (msg->lmsg.ms_flags & MSGF_DONE)
2532 		netisr_sendmsg_oncpu(msg);
2533 	crit_exit();
2534 }
2535 
2536 static int
2537 ipfw_track_reap(struct ipfw_context *ctx)
2538 {
2539 	struct ipfw_track *t, *anchor;
2540 	int expired;
2541 
2542 	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
2543 		/*
2544 		 * Kick start track expiring.  Ignore the scan limit;
2545 		 * we are short of tracks.
2546 		 */
2547 		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
2548 		expired = ipfw_track_expire_start(ctx, INT_MAX,
2549 		    ipfw_track_reap_max);
2550 		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
2551 		return (expired);
2552 	}
2553 
2554 	/*
2555 	 * Tracks are being expired.
2556 	 */
2557 
2558 	if (RB_EMPTY(&ctx->ipfw_track_tree))
2559 		return (0);
2560 
2561 	expired = 0;
2562 	anchor = &ctx->ipfw_trackexp_anch;
2563 	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
2564 		/*
2565 		 * Ignore scan limit; we are short of tracks.
2566 		 */
2567 
2568 		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
2569 		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);
2570 
2571 		if (t->t_count == NULL) /* anchor */
2572 			continue;
2573 
2574 		ipfw_track_state_expire(ctx, t, TRUE);
2575 		if (!LIST_EMPTY(&t->t_state_list)) {
2576 			/* There are states referencing this track. */
2577 			continue;
2578 		}
2579 
2580 		if (ipfw_track_free(ctx, t)) {
2581 			if (++expired >= ipfw_track_reap_max) {
2582 				ipfw_track_expire_more(ctx);
2583 				break;
2584 			}
2585 		}
2586 	}
2587 	/*
2588 	 * NOTE:
2589 	 * Leave the anchor on the list, even if the end of the list has
2590 	 * been reached.  ipfw_track_expire_more_dispatch() will handle
2591 	 * the removal.
2592 	 */
2593 	return (expired);
2594 }
2595 
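/*
 * Look up or create the per-cpu track for this flow and limit mask.
 * The associated trkcnt carries the shared connection count: it is
 * found or inserted in the global tree under the trkcnt token, so all
 * cpus limiting the same flow reference a single counter.
 */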
2596 static struct ipfw_track *
2597 ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
2598     uint16_t limit_mask, struct ip_fw *rule)
2599 {
2600 	struct ipfw_track *key, *t, *dup;
2601 	struct ipfw_trkcnt *trk, *ret;
2602 	boolean_t do_expire = FALSE;
2603 
2604 	KASSERT(rule->track_ruleid != 0,
2605 	    ("rule %u has no track ruleid", rule->rulenum));
2606 
2607 	key = &ctx->ipfw_track_tmpkey;
2608 	key->t_proto = id->proto;
2609 	key->t_addrs = 0;
2610 	key->t_ports = 0;
2611 	key->t_rule = rule;
2612 	if (limit_mask & DYN_SRC_ADDR)
2613 		key->t_saddr = id->src_ip;
2614 	if (limit_mask & DYN_DST_ADDR)
2615 		key->t_daddr = id->dst_ip;
2616 	if (limit_mask & DYN_SRC_PORT)
2617 		key->t_sport = id->src_port;
2618 	if (limit_mask & DYN_DST_PORT)
2619 		key->t_dport = id->dst_port;
2620 
2621 	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
2622 	if (t != NULL)
2623 		goto done;
2624 
2625 	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
2626 	if (t == NULL) {
2627 		ctx->ipfw_tks_nomem++;
2628 		return (NULL);
2629 	}
2630 
2631 	t->t_key = key->t_key;
2632 	t->t_rule = rule;
2633 	t->t_lastexp = 0;
2634 	LIST_INIT(&t->t_state_list);
2635 
2636 	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
2637 		time_t globexp, uptime;
2638 
2639 		trk = NULL;
2640 		do_expire = TRUE;
2641 
2642 		/*
2643 		 * Do not expire globally more than once per second;
2644 		 * it is useless.
2645 		 */
2646 		uptime = time_uptime;
2647 		globexp = ipfw_gd.ipfw_track_globexp;
2648 		if (globexp != uptime &&
2649 		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
2650 		    globexp, uptime)) {
2651 			int cpu;
2652 
2653 			/* Expire tracks on other CPUs. */
2654 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2655 				if (cpu == mycpuid)
2656 					continue;
2657 				lwkt_send_ipiq(globaldata_find(cpu),
2658 				    ipfw_track_expire_ipifunc, NULL);
2659 			}
2660 		}
2661 	} else {
2662 		trk = ipfw_trkcnt_alloc(ctx);
2663 	}
2664 	if (trk == NULL) {
2665 		struct ipfw_trkcnt *tkey;
2666 
2667 		tkey = &ctx->ipfw_trkcnt_tmpkey;
2668 		key = NULL; /* tkey overlaps key */
2669 
2670 		tkey->tc_key = t->t_key;
2671 		tkey->tc_ruleid = rule->track_ruleid;
2672 
2673 		IPFW_TRKCNT_TOKGET;
2674 		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2675 		    tkey);
2676 		if (trk == NULL) {
2677 			IPFW_TRKCNT_TOKREL;
2678 			if (do_expire) {
2679 				ctx->ipfw_tks_reap++;
2680 				if (ipfw_track_reap(ctx) > 0) {
2681 					if (ipfw_gd.ipfw_trkcnt_cnt <
2682 					    ipfw_track_max) {
2683 						trk = ipfw_trkcnt_alloc(ctx);
2684 						if (trk != NULL)
2685 							goto install;
2686 						ctx->ipfw_tks_cntnomem++;
2687 					} else {
2688 						ctx->ipfw_tks_overflow++;
2689 					}
2690 				} else {
2691 					ctx->ipfw_tks_reapfailed++;
2692 					ctx->ipfw_tks_overflow++;
2693 				}
2694 			} else {
2695 				ctx->ipfw_tks_cntnomem++;
2696 			}
2697 			kfree(t, M_IPFW);
2698 			return (NULL);
2699 		}
2700 		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
2701 		    ("invalid trkcnt refs %d", trk->tc_refs));
2702 		atomic_add_int(&trk->tc_refs, 1);
2703 		IPFW_TRKCNT_TOKREL;
2704 	} else {
2705 install:
2706 		trk->tc_key = t->t_key;
2707 		trk->tc_ruleid = rule->track_ruleid;
2708 		trk->tc_refs = 0;
2709 		trk->tc_count = 0;
2710 		trk->tc_expire = 0;
2711 		trk->tc_rulenum = rule->rulenum;
2712 
2713 		IPFW_TRKCNT_TOKGET;
2714 		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
2715 		    trk);
2716 		if (ret != NULL) {
2717 			KASSERT(ret->tc_refs > 0 &&
2718 			    ret->tc_refs < netisr_ncpus,
2719 			    ("invalid trkcnt refs %d", ret->tc_refs));
2720 			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
2721 			    ("trkcnt spare was installed"));
2722 			ctx->ipfw_trkcnt_spare = trk;
2723 			trk = ret;
2724 		} else {
2725 			ipfw_gd.ipfw_trkcnt_cnt++;
2726 		}
2727 		atomic_add_int(&trk->tc_refs, 1);
2728 		IPFW_TRKCNT_TOKREL;
2729 	}
2730 	t->t_count = &trk->tc_count;
2731 	t->t_trkcnt = trk;
2732 
2733 	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
2734 	if (dup != NULL)
2735 		panic("ipfw: track exists");
2736 	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
2737 done:
2738 	t->t_expire = time_uptime + dyn_short_lifetime;
2739 	return (t);
2740 }
2741 
2742 /*
2743  * Install state for rule type cmd->o.opcode
2744  *
2745  * Returns NULL if the state is not installed because of errors or
2746  * because state limits are enforced.
2747  */
2748 static struct ipfw_state *
2749 ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
2750     ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
2751 {
2752 	struct ipfw_state *s;
2753 	struct ipfw_track *t;
2754 	int count, diff;
2755 
2756 	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
2757 	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
2758 		boolean_t overflow = TRUE;
2759 
2760 		ctx->ipfw_sts_reap++;
2761 		if (ipfw_state_reap(ctx, diff) == 0)
2762 			ctx->ipfw_sts_reapfailed++;
2763 		if (ipfw_state_cntsync() < ipfw_state_max)
2764 			overflow = FALSE;
2765 
2766 		if (overflow) {
2767 			time_t globexp, uptime;
2768 			int cpu;
2769 
2770 			/*
2771 			 * Do not expire globally more than once per second;
2772 			 * it is useless.
2773 			 */
2774 			uptime = time_uptime;
2775 			globexp = ipfw_gd.ipfw_state_globexp;
2776 			if (globexp == uptime ||
2777 			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
2778 			    globexp, uptime)) {
2779 				ctx->ipfw_sts_overflow++;
2780 				return (NULL);
2781 			}
2782 
2783 			/* Expire states on other CPUs. */
2784 			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
2785 				if (cpu == mycpuid)
2786 					continue;
2787 				lwkt_send_ipiq(globaldata_find(cpu),
2788 				    ipfw_state_expire_ipifunc, NULL);
2789 			}
2790 			ctx->ipfw_sts_overflow++;
2791 			return (NULL);
2792 		}
2793 	}
2794 
2795 	switch (cmd->o.opcode) {
2796 	case O_KEEP_STATE: /* bidir rule */
2797 	case O_REDIRECT:
2798 		s = ipfw_state_add(ctx, &args->f_id, cmd->o.opcode, rule, NULL,
2799 		    tcp);
2800 		if (s == NULL)
2801 			return (NULL);
2802 		break;
2803 
2804 	case O_LIMIT: /* limit number of sessions */
2805 		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
2806 		if (t == NULL)
2807 			return (NULL);
2808 
2809 		if (*t->t_count >= cmd->conn_limit) {
2810 			if (!ipfw_track_state_expire(ctx, t, TRUE))
2811 				return (NULL);
2812 		}
2813 		for (;;) {
2814 			count = *t->t_count;
2815 			if (count >= cmd->conn_limit)
2816 				return (NULL);
2817 			if (atomic_cmpset_int(t->t_count, count, count + 1))
2818 				break;
2819 		}
2820 
2821 		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
2822 		if (s == NULL) {
2823 			/* Undo damage. */
2824 			atomic_subtract_int(t->t_count, 1);
2825 			return (NULL);
2826 		}
2827 		break;
2828 
2829 	default:
2830 		panic("unknown state type %u\n", cmd->o.opcode);
2831 	}
2832 
2833 	if (s->st_type == O_REDIRECT) {
2834 		struct ipfw_xlat *x = (struct ipfw_xlat *)s;
2835 		ipfw_insn_rdr *r = (ipfw_insn_rdr *)cmd;
2836 
2837 		x->xlat_addr = r->addr.s_addr;
2838 		x->xlat_port = r->port;
2839 		x->xlat_ifp = args->m->m_pkthdr.rcvif;
2840 		x->xlat_dir = MATCH_FORWARD;
2841 		KKASSERT(x->xlat_ifp != NULL);
2842 	}
2843 	return (s);
2844 }
2845 
2846 static int
2847 ipfw_table_lookup(struct ipfw_context *ctx, uint16_t tableid,
2848     const struct in_addr *in)
2849 {
2850 	struct radix_node_head *rnh;
2851 	struct sockaddr_in sin;
2852 	struct ipfw_tblent *te;
2853 
2854 	KASSERT(tableid < ipfw_table_max, ("invalid tableid %u", tableid));
2855 	rnh = ctx->ipfw_tables[tableid];
2856 	if (rnh == NULL)
2857 		return (0); /* no match */
2858 
2859 	memset(&sin, 0, sizeof(sin));
2860 	sin.sin_family = AF_INET;
2861 	sin.sin_len = sizeof(sin);
2862 	sin.sin_addr = *in;
2863 
2864 	te = (struct ipfw_tblent *)rnh->rnh_matchaddr((char *)&sin, rnh);
2865 	if (te == NULL)
2866 		return (0); /* no match */
2867 
2868 	te->te_use++;
2869 	te->te_lastuse = time_second;
2870 	return (1); /* match */
2871 }
2872 
2873 /*
2874  * Transmit a TCP packet, containing either a RST or a keepalive.
2875  * When flags & TH_RST, we are sending a RST packet because a "reset"
2876  * action matched the packet.  Otherwise we are sending a keepalive,
2877  * and flags & TH_SYN selects its direction: forward if set, reverse if clear.
2878  *
2879  * Only {src,dst}_{ip,port} of "id" are used.
2880  */
2881 static void
2882 send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
2883 {
2884 	struct mbuf *m;
2885 	struct ip *ip;
2886 	struct tcphdr *tcp;
2887 	struct route sro;	/* fake route */
2888 
2889 	MGETHDR(m, M_NOWAIT, MT_HEADER);
2890 	if (m == NULL)
2891 		return;
2892 	m->m_pkthdr.rcvif = NULL;
2893 	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
2894 	m->m_data += max_linkhdr;
2895 
2896 	ip = mtod(m, struct ip *);
2897 	bzero(ip, m->m_len);
2898 	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
2899 	ip->ip_p = IPPROTO_TCP;
2900 	tcp->th_off = 5;
2901 
2902 	/*
2903 	 * Assume we are sending a RST (or a keepalive in the reverse
2904 	 * direction), swap src and destination addresses and ports.
2905 	 * direction), so swap the source and destination addresses and ports.
2906 	ip->ip_src.s_addr = htonl(id->dst_ip);
2907 	ip->ip_dst.s_addr = htonl(id->src_ip);
2908 	tcp->th_sport = htons(id->dst_port);
2909 	tcp->th_dport = htons(id->src_port);
2910 	if (flags & TH_RST) {	/* we are sending a RST */
2911 		if (flags & TH_ACK) {
2912 			tcp->th_seq = htonl(ack);
2913 			tcp->th_ack = htonl(0);
2914 			tcp->th_flags = TH_RST;
2915 		} else {
2916 			if (flags & TH_SYN)
2917 				seq++;
2918 			tcp->th_seq = htonl(0);
2919 			tcp->th_ack = htonl(seq);
2920 			tcp->th_flags = TH_RST | TH_ACK;
2921 		}
2922 	} else {
2923 		/*
2924 		 * We are sending a keepalive. flags & TH_SYN determines
2925 		 * the direction, forward if set, reverse if clear.
2926 		 * NOTE: seq and ack are always assumed to be correct
2927 		 * as set by the caller. This may be confusing...
2928 		 */
2929 		if (flags & TH_SYN) {
2930 			/*
2931 			 * we have to restore the original (unswapped) addresses!
2932 			 */
2933 			ip->ip_dst.s_addr = htonl(id->dst_ip);
2934 			ip->ip_src.s_addr = htonl(id->src_ip);
2935 			tcp->th_dport = htons(id->dst_port);
2936 			tcp->th_sport = htons(id->src_port);
2937 		}
2938 		tcp->th_seq = htonl(seq);
2939 		tcp->th_ack = htonl(ack);
2940 		tcp->th_flags = TH_ACK;
2941 	}
2942 
2943 	/*
2944 	 * set ip_len to the payload size so we can compute
2945 	 * the tcp checksum on the pseudoheader
2946 	 * XXX check this, could save a couple of words ?
2947 	 */
2948 	ip->ip_len = htons(sizeof(struct tcphdr));
2949 	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
2950 
2951 	/*
2952 	 * now fill fields left out earlier
2953 	 */
2954 	ip->ip_ttl = ip_defttl;
2955 	ip->ip_len = m->m_pkthdr.len;
2956 
2957 	bzero(&sro, sizeof(sro));
2958 	ip_rtaddr(ip->ip_dst, &sro);
2959 
2960 	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
2961 	ip_output(m, NULL, &sro, 0, NULL, NULL);
2962 	if (sro.ro_rt)
2963 		RTFREE(sro.ro_rt);
2964 }
2965 
2966 /*
2967  * Send a reject message, consuming the mbuf passed as an argument.
2968  */
2969 static void
2970 send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
2971 {
2972 	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
2973 		/* We need the IP header in host order for icmp_error(). */
2974 		if (args->eh != NULL) {
2975 			struct ip *ip = mtod(args->m, struct ip *);
2976 
2977 			ip->ip_len = ntohs(ip->ip_len);
2978 			ip->ip_off = ntohs(ip->ip_off);
2979 		}
2980 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
2981 	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
2982 		struct tcphdr *const tcp =
2983 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
2984 
2985 		if ((tcp->th_flags & TH_RST) == 0) {
2986 			send_pkt(&args->f_id, ntohl(tcp->th_seq),
2987 				 ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
2988 		}
2989 		m_freem(args->m);
2990 	} else {
2991 		m_freem(args->m);
2992 	}
2993 	args->m = NULL;
2994 }
2995 
2996 /*
2997  * Given an ip_fw *, lookup_next_rule will return a pointer
2998  * to the next rule, which can be either the jump
2999  * target (for skipto instructions) or the next one in the list (in
3000  * all other cases including a missing jump target).
3001  * The result is also written in the "next_rule" field of the rule.
3002  * Backward jumps are not allowed, so start looking from the next
3003  * rule...
3004  *
3005  * This never returns NULL -- in case we do not have an exact match,
3006  * the next rule is returned. When the ruleset is changed,
3007  * pointers are flushed so we are always correct.
3008  */
3009 static struct ip_fw *
3010 lookup_next_rule(struct ip_fw *me)
3011 {
3012 	struct ip_fw *rule = NULL;
3013 	ipfw_insn *cmd;
3014 
3015 	/* look for action, in case it is a skipto */
3016 	cmd = ACTION_PTR(me);
3017 	if (cmd->opcode == O_LOG)
3018 		cmd += F_LEN(cmd);
3019 	if (cmd->opcode == O_SKIPTO) {
3020 		for (rule = me->next; rule; rule = rule->next) {
3021 			if (rule->rulenum >= cmd->arg1)
3022 				break;
3023 		}
3024 	}
3025 	if (rule == NULL)			/* failure or not a skipto */
3026 		rule = me->next;
3027 	me->next_rule = rule;
3028 	return rule;
3029 }
3030 
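/*
 * Match the packet against the owner of the corresponding socket:
 * an exact pcb lookup for TCP, a wildcard-capable one for UDP.  The
 * lookup direction depends on whether the packet is outbound (oif
 * set) or inbound.
 */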
3031 static int
3032 ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
3033 		enum ipfw_opcodes opcode, uid_t uid)
3034 {
3035 	struct in_addr src_ip, dst_ip;
3036 	struct inpcbinfo *pi;
3037 	boolean_t wildcard;
3038 	struct inpcb *pcb;
3039 
3040 	if (fid->proto == IPPROTO_TCP) {
3041 		wildcard = FALSE;
3042 		pi = &tcbinfo[mycpuid];
3043 	} else if (fid->proto == IPPROTO_UDP) {
3044 		wildcard = TRUE;
3045 		pi = &udbinfo[mycpuid];
3046 	} else {
3047 		return 0;
3048 	}
3049 
3050 	/*
3051 	 * Values in 'fid' are in host byte order
3052 	 */
3053 	dst_ip.s_addr = htonl(fid->dst_ip);
3054 	src_ip.s_addr = htonl(fid->src_ip);
3055 	if (oif) {
3056 		pcb = in_pcblookup_hash(pi,
3057 			dst_ip, htons(fid->dst_port),
3058 			src_ip, htons(fid->src_port),
3059 			wildcard, oif);
3060 	} else {
3061 		pcb = in_pcblookup_hash(pi,
3062 			src_ip, htons(fid->src_port),
3063 			dst_ip, htons(fid->dst_port),
3064 			wildcard, NULL);
3065 	}
3066 	if (pcb == NULL || pcb->inp_socket == NULL)
3067 		return 0;
3068 
3069 	if (opcode == O_UID) {
3070 #define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
3071 		return !socheckuid(pcb->inp_socket, uid);
3072 #undef socheckuid
3073 	} else  {
3074 		return groupmember(uid, pcb->inp_socket->so_cred);
3075 	}
3076 }
3077 
3078 static int
3079 ipfw_match_ifip(ipfw_insn_ifip *cmd, const struct in_addr *ip)
3080 {
3081 
3082 	if (__predict_false((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)) {
3083 		struct ifaddr_container *ifac;
3084 		struct ifnet *ifp;
3085 
3086 		ifp = ifunit_netisr(cmd->ifname);
3087 		if (ifp == NULL)
3088 			return (0);
3089 
3090 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
3091 			struct ifaddr *ia = ifac->ifa;
3092 
3093 			if (ia->ifa_addr == NULL)
3094 				continue;
3095 			if (ia->ifa_addr->sa_family != AF_INET)
3096 				continue;
3097 
3098 			cmd->mask.s_addr = INADDR_ANY;
3099 			if (cmd->o.arg1 & IPFW_IFIP_NET) {
3100 				cmd->mask = ((struct sockaddr_in *)
3101 				    ia->ifa_netmask)->sin_addr;
3102 			}
3103 			if (cmd->mask.s_addr == INADDR_ANY)
3104 				cmd->mask.s_addr = INADDR_BROADCAST;
3105 
3106 			cmd->addr =
3107 			    ((struct sockaddr_in *)ia->ifa_addr)->sin_addr;
3108 			cmd->addr.s_addr &= cmd->mask.s_addr;
3109 
3110 			cmd->o.arg1 |= IPFW_IFIP_VALID;
3111 			break;
3112 		}
3113 		if ((cmd->o.arg1 & IPFW_IFIP_VALID) == 0)
3114 			return (0);
3115 	}
3116 	return ((ip->s_addr & cmd->mask.s_addr) == cmd->addr.s_addr);
3117 }
3118 
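/*
 * Rewrite the source or destination address/port in place.  When no
 * checksum offload is pending, the IP and TCP/UDP checksums are
 * incrementally adjusted with pfil_cksum_fixup(); with CSUM_TCP/
 * CSUM_UDP/CSUM_TSO set, only the pseudo-header sum is recomputed and
 * the hardware finishes the rest.
 */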
3119 static void
3120 ipfw_xlate(const struct ipfw_xlat *x, struct mbuf *m,
3121     struct in_addr *old_addr, uint16_t *old_port)
3122 {
3123 	struct ip *ip = mtod(m, struct ip *);
3124 	struct in_addr *addr;
3125 	uint16_t *port, *csum, dlen = 0;
3126 	uint8_t udp = 0;
3127 	boolean_t pseudo = FALSE;
3128 
3129 	if (x->xlat_flags & IPFW_STATE_F_XLATSRC) {
3130 		addr = &ip->ip_src;
3131 		switch (ip->ip_p) {
3132 		case IPPROTO_TCP:
3133 			port = &L3HDR(struct tcphdr, ip)->th_sport;
3134 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3135 			break;
3136 		case IPPROTO_UDP:
3137 			port = &L3HDR(struct udphdr, ip)->uh_sport;
3138 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3139 			udp = 1;
3140 			break;
3141 		default:
3142 			panic("ipfw: unsupported src xlate proto %u", ip->ip_p);
3143 		}
3144 	} else {
3145 		addr = &ip->ip_dst;
3146 		switch (ip->ip_p) {
3147 		case IPPROTO_TCP:
3148 			port = &L3HDR(struct tcphdr, ip)->th_dport;
3149 			csum = &L3HDR(struct tcphdr, ip)->th_sum;
3150 			break;
3151 		case IPPROTO_UDP:
3152 			port = &L3HDR(struct udphdr, ip)->uh_dport;
3153 			csum = &L3HDR(struct udphdr, ip)->uh_sum;
3154 			udp = 1;
3155 			break;
3156 		default:
3157 			panic("ipfw: unsupported dst xlate proto %u", ip->ip_p);
3158 		}
3159 	}
3160 	if (old_addr != NULL)
3161 		*old_addr = *addr;
3162 	if (old_port != NULL) {
3163 		if (x->xlat_port != 0)
3164 			*old_port = *port;
3165 		else
3166 			*old_port = 0;
3167 	}
3168 
3169 	if (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TSO)) {
3170 		if ((m->m_pkthdr.csum_flags & CSUM_TSO) == 0)
3171 			dlen = ip->ip_len - (ip->ip_hl << 2);
3172 		pseudo = TRUE;
3173 	}
3174 
3175 	if (!pseudo) {
3176 		const uint16_t *oaddr, *naddr;
3177 
3178 		oaddr = (const uint16_t *)&addr->s_addr;
3179 		naddr = (const uint16_t *)&x->xlat_addr;
3180 
3181 		ip->ip_sum = pfil_cksum_fixup(pfil_cksum_fixup(ip->ip_sum,
3182 		    oaddr[0], naddr[0], 0), oaddr[1], naddr[1], 0);
3183 		*csum = pfil_cksum_fixup(pfil_cksum_fixup(*csum,
3184 		    oaddr[0], naddr[0], udp), oaddr[1], naddr[1], udp);
3185 	}
3186 	addr->s_addr = x->xlat_addr;
3187 
3188 	if (x->xlat_port != 0) {
3189 		if (!pseudo) {
3190 			*csum = pfil_cksum_fixup(*csum, *port, x->xlat_port,
3191 			    udp);
3192 		}
3193 		*port = x->xlat_port;
3194 	}
3195 
3196 	if (pseudo) {
3197 		*csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
3198 		    htons(dlen + ip->ip_p));
3199 	}
3200 }
3201 
3202 static void
3203 ipfw_ip_xlate_dispatch(netmsg_t nmsg)
3204 {
3205 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
3206 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
3207 	struct mbuf *m = nm->m;
3208 	struct ipfw_xlat *x = nm->arg1;
3209 	struct ip_fw *rule = x->xlat_rule;
3210 
3211 	ASSERT_NETISR_NCPUS(mycpuid);
3212 	KASSERT(rule->cpuid == mycpuid,
3213 	    ("rule does not belong to cpu%d", mycpuid));
3214 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
3215 	    ("mbuf does not have ipfw continue rule"));
3216 
3217 	KASSERT(ctx->ipfw_cont_rule == NULL,
3218 	    ("pending ipfw continue rule"));
3219 	KASSERT(ctx->ipfw_cont_xlat == NULL,
3220 	    ("pending ipfw continue xlat"));
3221 	ctx->ipfw_cont_rule = rule;
3222 	ctx->ipfw_cont_xlat = x;
3223 
3224 	if (nm->arg2 == 0)
3225 		ip_input(m);
3226 	else
3227 		ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
3228 
3229 	/* May not have been cleared, if ipfw was unloaded/disabled. */
3230 	ctx->ipfw_cont_rule = NULL;
3231 	ctx->ipfw_cont_xlat = NULL;
3232 
3233 	/*
3234 	 * This state is no longer used; decrement its xlat_crefs,
3235 	 * so this state can be deleted.
3236 	 */
3237 	x->xlat_crefs--;
3238 	/*
3239 	 * This rule is no longer used; decrement its cross_refs,
3240 	 * so this rule can be deleted.
3241 	 *
3242 	 * NOTE:
3243 	 * Decrement cross_refs in the last step of this function,
3244 	 * so that the module could be unloaded safely.
3245 	 */
3246 	rule->cross_refs--;
3247 }
3248 
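/*
 * Hand the translated mbuf over to the cpu owning the paired xlat,
 * taking extra rule and state references so neither can be deleted
 * while the packet is in flight.
 */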
3249 static void
3250 ipfw_xlate_redispatch(struct mbuf *m, int cpuid, struct ipfw_xlat *x,
3251     uint32_t flags)
3252 {
3253 	struct netmsg_genpkt *nm;
3254 
3255 	KASSERT(x->xlat_pcpu == cpuid, ("xlat paired cpu%d, target cpu%d",
3256 	    x->xlat_pcpu, cpuid));
3257 
3258 	/*
3259 	 * Bump cross_refs to prevent this rule and its siblings
3260 	 * from being deleted, while this mbuf is inflight.  The
3261 	 * cross_refs of the sibling rule on the target cpu will
3262 	 * be decremented, once this mbuf is going to be filtered
3263 	 * on the target cpu.
3264 	 */
3265 	x->xlat_rule->cross_refs++;
3266 	/*
3267 	 * Bump xlat_crefs to prevent this state and its paired
3268 	 * state from being deleted, while this mbuf is inflight.
3269 	 * The xlat_crefs of the paired state on the target cpu
3270 	 * will be decremented, once this mbuf is going to be
3271 	 * filtered on the target cpu.
3272 	 */
3273 	x->xlat_crefs++;
3274 
3275 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
3276 	if (flags & IPFW_XLATE_INSERT)
3277 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATINS;
3278 	if (flags & IPFW_XLATE_FORWARD)
3279 		m->m_pkthdr.fw_flags |= IPFW_MBUF_XLATFWD;
3280 
3281 	if ((flags & IPFW_XLATE_OUTPUT) == 0) {
3282 		struct ip *ip = mtod(m, struct ip *);
3283 
3284 		/*
3285 		 * NOTE:
3286 		 * ip_input() expects ip_len/ip_off are in network
3287 		 * byte order.
3288 		 */
3289 		ip->ip_len = htons(ip->ip_len);
3290 		ip->ip_off = htons(ip->ip_off);
3291 	}
3292 
3293 	nm = &m->m_hdr.mh_genmsg;
3294 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
3295 	    ipfw_ip_xlate_dispatch);
3296 	nm->m = m;
3297 	nm->arg1 = x->xlat_pair;
3298 	nm->arg2 = 0;
3299 	if (flags & IPFW_XLATE_OUTPUT)
3300 		nm->arg2 = 1;
3301 	netisr_sendmsg(&nm->base, cpuid);
3302 }
3303 
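/*
 * Pull the L3/L4 headers into the first mbuf and cache the protocol,
 * addresses and ports (host byte order) in 'local' and args->f_id for
 * faster rule matching.  May replace the mbuf; returns NULL if the
 * pullup fails.
 */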
3304 static struct mbuf *
3305 ipfw_setup_local(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3306     struct ip_fw_local *local, struct ip **ip0)
3307 {
3308 	struct ip *ip = mtod(m, struct ip *);
3309 	struct tcphdr *tcp;
3310 	struct udphdr *udp;
3311 
3312 	/*
3313 	 * Collect parameters into local variables for faster matching.
3314 	 */
3315 	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
3316 		local->proto = args->f_id.proto = 0;	/* mark f_id invalid */
3317 		goto done;
3318 	}
3319 
3320 	local->proto = args->f_id.proto = ip->ip_p;
3321 	local->src_ip = ip->ip_src;
3322 	local->dst_ip = ip->ip_dst;
3323 	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
3324 		local->offset = ntohs(ip->ip_off) & IP_OFFMASK;
3325 		local->ip_len = ntohs(ip->ip_len);
3326 	} else {
3327 		local->offset = ip->ip_off & IP_OFFMASK;
3328 		local->ip_len = ip->ip_len;
3329 	}
3330 
3331 #define PULLUP_TO(len)					\
3332 do {							\
3333 	if (m->m_len < (len)) {				\
3334 		args->m = m = m_pullup(m, (len));	\
3335 		if (m == NULL) {			\
3336 			ip = NULL;			\
3337 			goto done;			\
3338 		}					\
3339 		ip = mtod(m, struct ip *);		\
3340 	}						\
3341 } while (0)
3342 
3343 	if (local->offset == 0) {
3344 		switch (local->proto) {
3345 		case IPPROTO_TCP:
3346 			PULLUP_TO(hlen + sizeof(struct tcphdr));
3347 			local->tcp = tcp = L3HDR(struct tcphdr, ip);
3348 			local->dst_port = tcp->th_dport;
3349 			local->src_port = tcp->th_sport;
3350 			args->f_id.flags = tcp->th_flags;
3351 			break;
3352 
3353 		case IPPROTO_UDP:
3354 			PULLUP_TO(hlen + sizeof(struct udphdr));
3355 			udp = L3HDR(struct udphdr, ip);
3356 			local->dst_port = udp->uh_dport;
3357 			local->src_port = udp->uh_sport;
3358 			break;
3359 
3360 		case IPPROTO_ICMP:
3361 			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
3362 			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
3363 			break;
3364 
3365 		default:
3366 			break;
3367 		}
3368 	}
3369 
3370 #undef PULLUP_TO
3371 
3372 	args->f_id.src_ip = ntohl(local->src_ip.s_addr);
3373 	args->f_id.dst_ip = ntohl(local->dst_ip.s_addr);
3374 	args->f_id.src_port = local->src_port = ntohs(local->src_port);
3375 	args->f_id.dst_port = local->dst_port = ntohs(local->dst_port);
3376 done:
3377 	*ip0 = ip;
3378 	return (m);
3379 }
3380 
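/*
 * Recompute the packet hash after translation.  ip_hashfn() expects
 * ip_len/ip_off in network byte order, hence the temporary swap and
 * restore around the call.
 */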
3381 static struct mbuf *
3382 ipfw_rehashm(struct mbuf *m, const int hlen, struct ip_fw_args *args,
3383     struct ip_fw_local *local, struct ip **ip0)
3384 {
3385 	struct ip *ip = mtod(m, struct ip *);
3386 
3387 	ip->ip_len = htons(ip->ip_len);
3388 	ip->ip_off = htons(ip->ip_off);
3389 
3390 	m->m_flags &= ~M_HASH;
3391 	ip_hashfn(&m, 0);
3392 	args->m = m;
3393 	if (m == NULL) {
3394 		*ip0 = NULL;
3395 		return (NULL);
3396 	}
3397 	KASSERT(m->m_flags & M_HASH, ("no hash"));
3398 
3399 	/* 'm' might be changed by ip_hashfn(). */
3400 	ip = mtod(m, struct ip *);
3401 	ip->ip_len = ntohs(ip->ip_len);
3402 	ip->ip_off = ntohs(ip->ip_off);
3403 
3404 	return (ipfw_setup_local(m, hlen, args, local, ip0));
3405 }
3406 
3407 /*
3408  * The main check routine for the firewall.
3409  *
3410  * All arguments are in args so we can modify them and return them
3411  * back to the caller.
3412  *
3413  * Parameters:
3414  *
3415  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
3416  *		Starts with the IP header.
3417  *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
3418  *	args->oif	Outgoing interface, or NULL if packet is incoming.
3419  *		The incoming interface is in the mbuf. (in)
3420  *
3421  *	args->rule	Pointer to the last matching rule (in/out)
3422  *	args->f_id	Addresses grabbed from the packet (out)
3423  *
3424  * Return value:
3425  *
3426  *	If the packet was denied/rejected and has been dropped, *m is equal
3427  *	to NULL upon return.
3428  *
3429  *	IP_FW_DENY	the packet must be dropped.
3430  *	IP_FW_PASS	The packet is to be accepted and routed normally.
3431  *	IP_FW_DIVERT	Divert the packet to port (args->cookie)
3432  *	IP_FW_TEE	Tee the packet to port (args->cookie)
3433  *	IP_FW_DUMMYNET	Send the packet to pipe/queue (args->cookie)
3434  *	IP_FW_CONTINUE	Continue processing on another cpu.
3435  */
3436 static int
3437 ipfw_chk(struct ip_fw_args *args)
3438 {
3439 	/*
3440 	 * Local variables hold state during the processing of a packet.
3441 	 *
3442 	 * IMPORTANT NOTE: to speed up the processing of rules, there
3443 	 * are some assumptions about the values of the variables, which
3444 	 * are documented here. Should you change them, please check
3445 	 * the implementation of the various instructions to make sure
3446 	 * that they still work.
3447 	 *
3448 	 * args->eh	The MAC header.  It is non-NULL for a layer-2
3449 	 *	packet; it is NULL for a layer-3 packet.
3450 	 *
3451 	 * m | args->m	Pointer to the mbuf, as received from the caller.
3452 	 *	It may change if ipfw_chk() does an m_pullup, or if it
3453 	 *	consumes the packet because it calls send_reject().
3454 	 *	XXX This has to change, so that ipfw_chk() never modifies
3455 	 *	or consumes the buffer.
3456 	 * ip	is simply an alias of the value of m, and it is kept
3457 	 *	in sync with it (the packet is supposed to start with
3458 	 *	the ip header).
3459 	 */
3460 	struct mbuf *m = args->m;
3461 	struct ip *ip = mtod(m, struct ip *);
3462 
3463 	/*
3464 	 * oif | args->oif	If NULL, ipfw_chk has been called on the
3465 	 *	inbound path (ether_input, ip_input).
3466 	 *	If non-NULL, ipfw_chk has been called on the outbound path
3467 	 *	(ether_output, ip_output).
3468 	 */
3469 	struct ifnet *oif = args->oif;
3470 
3471 	struct ip_fw *f = NULL;		/* matching rule */
3472 	int retval = IP_FW_PASS;
3473 	struct m_tag *mtag;
3474 	struct divert_info *divinfo;
3475 	struct ipfw_state *s;
3476 
3477 	/*
3478 	 * hlen	The length of the IPv4 header.
3479 	 *	hlen >0 means we have an IPv4 packet.
3480 	 */
3481 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
3482 
3483 	struct ip_fw_local lc;
3484 
3485 	/*
3486 	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
3487 	 * 	MATCH_NONE when checked and not matched (dyn_f = NULL),
3488 	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
3489 	 */
3490 	int dyn_dir = MATCH_UNKNOWN;
3491 	struct ip_fw *dyn_f = NULL;
3492 	int cpuid = mycpuid;
3493 	struct ipfw_context *ctx;
3494 
3495 	ASSERT_NETISR_NCPUS(cpuid);
3496 	ctx = ipfw_ctx[cpuid];
3497 
3498 	if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
3499 		return IP_FW_PASS;	/* accept */
3500 
3501 	if (args->eh == NULL ||		/* layer 3 packet */
3502 	    (m->m_pkthdr.len >= sizeof(struct ip) &&
3503 	     ntohs(args->eh->ether_type) == ETHERTYPE_IP))
3504 		hlen = ip->ip_hl << 2;
3505 
3506 	memset(&lc, 0, sizeof(lc));
3507 
3508 	m = ipfw_setup_local(m, hlen, args, &lc, &ip);
3509 	if (m == NULL)
3510 		goto pullup_failed;
3511 
3512 	if (args->rule) {
3513 		/*
3514 		 * Packet has already been tagged. Look for the next rule
3515 		 * to restart processing.
3516 		 *
3517 		 * If fw_one_pass != 0 then just accept it.
3518 		 * XXX should not happen here, but optimized out in
3519 		 * the caller.
3520 		 */
3521 		if (fw_one_pass && (args->flags & IP_FWARG_F_CONT) == 0)
3522 			return IP_FW_PASS;
3523 		args->flags &= ~IP_FWARG_F_CONT;
3524 
3525 		/* This rule is being/has been flushed */
3526 		if (ipfw_flushing)
3527 			return IP_FW_DENY;
3528 
3529 		KASSERT(args->rule->cpuid == cpuid,
3530 			("rule used on cpu%d", cpuid));
3531 
3532 		/* This rule was deleted */
3533 		if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
3534 			return IP_FW_DENY;
3535 
3536 		if (args->xlat != NULL) {
3537 			struct ipfw_xlat *x = args->xlat;
3538 
3539 			/* This xlat is being deleted. */
3540 			if (x->xlat_invalid)
3541 				return IP_FW_DENY;
3542 
3543 			f = args->rule;
3544 
3545 			dyn_f = f;
3546 			dyn_dir = (args->flags & IP_FWARG_F_XLATFWD) ?
3547 			    MATCH_FORWARD : MATCH_REVERSE;
3548 
3549 			if (args->flags & IP_FWARG_F_XLATINS) {
3550 				KASSERT(x->xlat_flags & IPFW_STATE_F_XLATSLAVE,
3551 				    ("not slave %u state", x->xlat_type));
3552 				s = ipfw_state_link(ctx, &x->xlat_st);
3553 				if (s != NULL) {
3554 					ctx->ipfw_xlate_conflicts++;
3555 					if (IPFW_STATE_ISDEAD(s)) {
3556 						ipfw_state_remove(ctx, s);
3557 						s = ipfw_state_link(ctx,
3558 						    &x->xlat_st);
3559 					}
3560 					if (s != NULL) {
3561 						if (bootverbose) {
3562 							kprintf("ipfw: "
3563 							"slave %u state "
3564 							"conflicts %u state\n",
3565 							x->xlat_type,
3566 							s->st_type);
3567 						}
3568 						ipfw_xlat_invalidate(x);
3569 						return IP_FW_DENY;
3570 					}
3571 					ctx->ipfw_xlate_cresolved++;
3572 				}
3573 			} else {
3574 				ipfw_state_update(&args->f_id, dyn_dir,
3575 				    lc.tcp, &x->xlat_st);
3576 			}
3577 		} else {
3578 			/* TODO: setup dyn_f, dyn_dir */
3579 
3580 			f = args->rule->next_rule;
3581 			if (f == NULL)
3582 				f = lookup_next_rule(args->rule);
3583 		}
3584 	} else {
3585 		/*
3586 		 * Find the starting rule. It can be either the first
3587 		 * one, or the one after divert_rule if asked so.
3588 		 */
3589 		int skipto;
3590 
3591 		KKASSERT((args->flags &
3592 		    (IP_FWARG_F_XLATINS | IP_FWARG_F_CONT)) == 0);
3593 		KKASSERT(args->xlat == NULL);
3594 
3595 		mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
3596 		if (mtag != NULL) {
3597 			divinfo = m_tag_data(mtag);
3598 			skipto = divinfo->skipto;
3599 		} else {
3600 			skipto = 0;
3601 		}
3602 
3603 		f = ctx->ipfw_layer3_chain;
3604 		if (args->eh == NULL && skipto != 0) {
3605 			/* No skipto during rule flushing */
3606 			if (ipfw_flushing)
3607 				return IP_FW_DENY;
3608 
3609 			if (skipto >= IPFW_DEFAULT_RULE)
3610 				return IP_FW_DENY; /* invalid */
3611 
3612 			while (f && f->rulenum <= skipto)
3613 				f = f->next;
3614 			if (f == NULL)	/* drop packet */
3615 				return IP_FW_DENY;
3616 		} else if (ipfw_flushing) {
3617 			/* Rules are being flushed; skip to default rule */
3618 			f = ctx->ipfw_default_rule;
3619 		}
3620 	}
3621 	if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
3622 		m_tag_delete(m, mtag);
3623 
3624 	/*
3625 	 * Now scan the rules, and parse microinstructions for each rule.
3626 	 */
3627 	for (; f; f = f->next) {
3628 		int l, cmdlen;
3629 		ipfw_insn *cmd;
3630 		int skip_or; /* skip rest of OR block */
3631 
3632 again:
3633 		if (ctx->ipfw_set_disable & (1 << f->set)) {
3634 			args->xlat = NULL;
3635 			continue;
3636 		}
3637 
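		/*
		 * Resuming from an xlat continuation: skip the match
		 * opcodes and start directly at this rule's action.
		 */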
3638 		if (args->xlat != NULL) {
3639 			args->xlat = NULL;
3640 			l = f->cmd_len - f->act_ofs;
3641 			cmd = ACTION_PTR(f);
3642 		} else {
3643 			l = f->cmd_len;
3644 			cmd = f->cmd;
3645 		}
3646 
3647 		skip_or = 0;
3648 		for (; l > 0; l -= cmdlen, cmd += cmdlen) {
3649 			int match;
3650 
3651 			/*
3652 			 * check_body is a jump target used when we find a
3653 			 * CHECK_STATE, and need to jump to the body of
3654 			 * the target rule.
3655 			 */
3656 check_body:
3657 			cmdlen = F_LEN(cmd);
3658 			/*
3659 			 * An OR block (insn_1 || .. || insn_n) has the
3660 			 * F_OR bit set in all but the last instruction.
3661 			 * The first match will set "skip_or", and cause
3662 			 * the following instructions to be skipped until
3663 			 * past the one with the F_OR bit clear.
3664 			 */
3665 			if (skip_or) {		/* skip this instruction */
3666 				if ((cmd->len & F_OR) == 0)
3667 					skip_or = 0;	/* next one is good */
3668 				continue;
3669 			}
3670 			match = 0; /* set to 1 if we succeed */
3671 
3672 			switch (cmd->opcode) {
3673 			/*
3674 			 * The first set of opcodes compares the packet's
3675 			 * fields with some pattern, setting 'match' if a
3676 			 * match is found. At the end of the loop there is
3677 			 * logic to deal with F_NOT and F_OR flags associated
3678 			 * with the opcode.
3679 			 */
3680 			case O_NOP:
3681 				match = 1;
3682 				break;
3683 
3684 			case O_FORWARD_MAC:
3685 				kprintf("ipfw: opcode %d unimplemented\n",
3686 					cmd->opcode);
3687 				break;
3688 
3689 			case O_GID:
3690 			case O_UID:
3691 				/*
3692 				 * We only check offset == 0 && proto != 0,
3693 				 * as this ensures that we have an IPv4
3694 				 * packet with the port info.
3695 				 */
3696 				if (lc.offset != 0)
3697 					break;
3698 
3699 				match = ipfw_match_uid(&args->f_id, oif,
3700 					cmd->opcode,
3701 					(uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
3702 				break;
3703 
3704 			case O_RECV:
3705 				match = iface_match(m->m_pkthdr.rcvif,
3706 				    (ipfw_insn_if *)cmd);
3707 				break;
3708 
3709 			case O_XMIT:
3710 				match = iface_match(oif, (ipfw_insn_if *)cmd);
3711 				break;
3712 
3713 			case O_VIA:
3714 				match = iface_match(oif ? oif :
3715 				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
3716 				break;
3717 
3718 			case O_MACADDR2:
3719 				if (args->eh != NULL) {	/* have MAC header */
3720 					uint32_t *want = (uint32_t *)
3721 						((ipfw_insn_mac *)cmd)->addr;
3722 					uint32_t *mask = (uint32_t *)
3723 						((ipfw_insn_mac *)cmd)->mask;
3724 					uint32_t *hdr = (uint32_t *)args->eh;
3725 
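					/*
					 * The dst and src MACs (12 bytes
					 * total) are compared as three
					 * 32-bit words under the mask.
					 */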
3726 					match =
3727 					(want[0] == (hdr[0] & mask[0]) &&
3728 					 want[1] == (hdr[1] & mask[1]) &&
3729 					 want[2] == (hdr[2] & mask[2]));
3730 				}
3731 				break;
3732 
3733 			case O_MAC_TYPE:
3734 				if (args->eh != NULL) {
3735 					uint16_t t =
3736 					    ntohs(args->eh->ether_type);
3737 					uint16_t *p =
3738 					    ((ipfw_insn_u16 *)cmd)->ports;
3739 					int i;
3740 
3741 					/* Special vlan handling */
3742 					if (m->m_flags & M_VLANTAG)
3743 						t = ETHERTYPE_VLAN;
3744 
3745 					for (i = cmdlen - 1; !match && i > 0;
3746 					     i--, p += 2) {
3747 						match =
3748 						(t >= p[0] && t <= p[1]);
3749 					}
3750 				}
3751 				break;
3752 
3753 			case O_FRAG:
3754 				match = (hlen > 0 && lc.offset != 0);
3755 				break;
3756 
3757 			case O_IPFRAG:
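				/*
				 * NOTE: on layer2 paths (args->eh != NULL)
				 * ip_off is still in network byte order.
				 */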
3758 				if (hlen > 0) {
3759 					uint16_t off;
3760 
3761 					if (args->eh != NULL)
3762 						off = ntohs(ip->ip_off);
3763 					else
3764 						off = ip->ip_off;
3765 					if (off & (IP_MF | IP_OFFMASK))
3766 						match = 1;
3767 				}
3768 				break;
3769 
3770 			case O_IN:	/* "out" is "not in" */
3771 				match = (oif == NULL);
3772 				break;
3773 
3774 			case O_LAYER2:
3775 				match = (args->eh != NULL);
3776 				break;
3777 
3778 			case O_PROTO:
3779 				/*
3780 				 * We do not allow an arg of 0, so checking
3781 				 * "proto" alone suffices.
3782 				 */
3783 				match = (lc.proto == cmd->arg1);
3784 				break;
3785 
3786 			case O_IP_SRC:
3787 				match = (hlen > 0 &&
3788 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3789 				    lc.src_ip.s_addr);
3790 				break;
3791 
3792 			case O_IP_SRC_MASK:
3793 				match = (hlen > 0 &&
3794 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3795 				     (lc.src_ip.s_addr &
3796 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3797 				break;
3798 
3799 			case O_IP_SRC_ME:
3800 				if (hlen > 0) {
3801 					struct ifnet *tif;
3802 
3803 					tif = INADDR_TO_IFP(&lc.src_ip);
3804 					match = (tif != NULL);
3805 				}
3806 				break;
3807 
3808 			case O_IP_SRC_TABLE:
3809 				match = ipfw_table_lookup(ctx, cmd->arg1,
3810 				    &lc.src_ip);
3811 				break;
3812 
3813 			case O_IP_SRC_IFIP:
3814 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3815 				    &lc.src_ip);
3816 				break;
3817 
3818 			case O_IP_DST_SET:
3819 			case O_IP_SRC_SET:
3820 				if (hlen > 0) {
3821 					uint32_t *d = (uint32_t *)(cmd + 1);
3822 					uint32_t addr =
3823 					    cmd->opcode == O_IP_DST_SET ?
3824 						args->f_id.dst_ip :
3825 						args->f_id.src_ip;
3826 
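					/*
					 * d[0] is the base address; the
					 * following words form a bitmap of
					 * arg1 addresses starting at d[0].
					 */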
3827 					if (addr < d[0])
3828 						break;
3829 					addr -= d[0]; /* subtract base */
3830 					match =
3831 					(addr < cmd->arg1) &&
3832 					 (d[1 + (addr >> 5)] &
3833 					  (1 << (addr & 0x1f)));
3834 				}
3835 				break;
3836 
3837 			case O_IP_DST:
3838 				match = (hlen > 0 &&
3839 				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
3840 				    lc.dst_ip.s_addr);
3841 				break;
3842 
3843 			case O_IP_DST_MASK:
3844 				match = (hlen > 0) &&
3845 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
3846 				     (lc.dst_ip.s_addr &
3847 				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
3848 				break;
3849 
3850 			case O_IP_DST_ME:
3851 				if (hlen > 0) {
3852 					struct ifnet *tif;
3853 
3854 					tif = INADDR_TO_IFP(&lc.dst_ip);
3855 					match = (tif != NULL);
3856 				}
3857 				break;
3858 
3859 			case O_IP_DST_TABLE:
3860 				match = ipfw_table_lookup(ctx, cmd->arg1,
3861 				    &lc.dst_ip);
3862 				break;
3863 
3864 			case O_IP_DST_IFIP:
3865 				match = ipfw_match_ifip((ipfw_insn_ifip *)cmd,
3866 				    &lc.dst_ip);
3867 				break;
3868 
3869 			case O_IP_SRCPORT:
3870 			case O_IP_DSTPORT:
3871 				/*
3872 				 * offset == 0 && proto != 0 is enough
3873 				 * to guarantee that we have an IPv4
3874 				 * packet with port info.
3875 				 */
3876 				if ((lc.proto == IPPROTO_UDP ||
3877 				     lc.proto == IPPROTO_TCP) &&
3878 				    lc.offset == 0) {
3879 					uint16_t x =
3880 					    (cmd->opcode == O_IP_SRCPORT) ?
3881 						lc.src_port : lc.dst_port;
3882 					uint16_t *p =
3883 					    ((ipfw_insn_u16 *)cmd)->ports;
3884 					int i;
3885 
3886 					for (i = cmdlen - 1; !match && i > 0;
3887 					     i--, p += 2) {
3888 						match =
3889 						(x >= p[0] && x <= p[1]);
3890 					}
3891 				}
3892 				break;
3893 
3894 			case O_ICMPCODE:
3895 				match = (lc.offset == 0 &&
3896 				    lc.proto == IPPROTO_ICMP &&
3897 				    icmpcode_match(ip, (ipfw_insn_u32 *)cmd));
3898 				break;
3899 
3900 			case O_ICMPTYPE:
3901 				match = (lc.offset == 0 &&
3902 				    lc.proto == IPPROTO_ICMP &&
3903 				    icmptype_match(ip, (ipfw_insn_u32 *)cmd));
3904 				break;
3905 
3906 			case O_IPOPT:
3907 				match = (hlen > 0 && ipopts_match(ip, cmd));
3908 				break;
3909 
3910 			case O_IPVER:
3911 				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
3912 				break;
3913 
3914 			case O_IPTTL:
3915 				match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
3916 				break;
3917 
3918 			case O_IPID:
3919 				match = (hlen > 0 &&
3920 				    cmd->arg1 == ntohs(ip->ip_id));
3921 				break;
3922 
3923 			case O_IPLEN:
3924 				match = (hlen > 0 && cmd->arg1 == lc.ip_len);
3925 				break;
3926 
3927 			case O_IPPRECEDENCE:
3928 				match = (hlen > 0 &&
3929 				    (cmd->arg1 == (ip->ip_tos & 0xe0)));
3930 				break;
3931 
3932 			case O_IPTOS:
3933 				match = (hlen > 0 &&
3934 				    flags_match(cmd, ip->ip_tos));
3935 				break;
3936 
3937 			case O_TCPFLAGS:
3938 				match = (lc.proto == IPPROTO_TCP &&
3939 				    lc.offset == 0 &&
3940 				    flags_match(cmd,
3941 					L3HDR(struct tcphdr,ip)->th_flags));
3942 				break;
3943 
3944 			case O_TCPOPTS:
3945 				match = (lc.proto == IPPROTO_TCP &&
3946 				    lc.offset == 0 && tcpopts_match(ip, cmd));
3947 				break;
3948 
3949 			case O_TCPSEQ:
3950 				match = (lc.proto == IPPROTO_TCP &&
3951 				    lc.offset == 0 &&
3952 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3953 					L3HDR(struct tcphdr,ip)->th_seq);
3954 				break;
3955 
3956 			case O_TCPACK:
3957 				match = (lc.proto == IPPROTO_TCP &&
3958 				    lc.offset == 0 &&
3959 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
3960 					L3HDR(struct tcphdr,ip)->th_ack);
3961 				break;
3962 
3963 			case O_TCPWIN:
3964 				match = (lc.proto == IPPROTO_TCP &&
3965 				    lc.offset == 0 &&
3966 				    cmd->arg1 ==
3967 					L3HDR(struct tcphdr,ip)->th_win);
3968 				break;
3969 
3970 			case O_ESTAB:
3971 				/* reject packets which have SYN only */
3972 				/* XXX should I also check for TH_ACK? */
3973 				match = (lc.proto == IPPROTO_TCP &&
3974 				    lc.offset == 0 &&
3975 				    (L3HDR(struct tcphdr,ip)->th_flags &
3976 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
3977 				break;
3978 
3979 			case O_LOG:
3980 				if (fw_verbose) {
3981 					ipfw_log(ctx, f, hlen, args->eh, m,
3982 					    oif);
3983 				}
3984 				match = 1;
3985 				break;
3986 
3987 			case O_PROB:
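				/*
				 * d[0] holds the match probability as a
				 * threshold in krandom()'s output range.
				 */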
3988 				match = (krandom() <
3989 					((ipfw_insn_u32 *)cmd)->d[0]);
3990 				break;
3991 
3992 			/*
3993 			 * The second set of opcodes represents 'actions',
3994 			 * i.e. the terminal part of a rule once the packet
3995 			 * matches all previous patterns.
3996 			 * Typically there is only one action for each rule,
3997 			 * and the opcode is stored at the end of the rule
3998 			 * (but there are exceptions -- see below).
3999 			 *
4000 			 * In general, here we set retval and terminate the
4001 			 * outer loop (would be a 'break 3' in some language,
4002 			 * but we need to do a 'goto done').
4003 			 *
4004 			 * Exceptions:
4005 			 * O_COUNT and O_SKIPTO actions:
4006 			 *   instead of terminating, we jump to the next rule
4007 			 *   ('goto next_rule', equivalent to a 'break 2'),
4008 			 *   or to the SKIPTO target ('goto again' after
4009 			 *   having set f, cmd and l), respectively.
4010 			 *
4011 			 * O_LIMIT, O_KEEP_STATE and O_REDIRECT: these opcodes
4012 			 *   are not real 'actions', and are stored right
4013 			 *   before the 'action' part of the rule.
4014 			 *   These opcodes try to install an entry in the
4015 			 *   state tables; if successful, we continue with
4016 			 *   the next opcode (match=1; break;), otherwise
4017 			 *   the packet must be dropped ('goto done' after
4018 			 *   setting retval).  If static rules are changed
4019 			 *   during the state installation, the packet will
4020 			 *   be dropped and the rule's stats will not be updated
4021 			 *   ('return IP_FW_DENY').
4022 			 *
4023 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
4024 			 *   cause a lookup of the state table, and a jump
4025 			 *   to the 'action' part of the parent rule
4026 			 *   ('goto check_body') if an entry is found, or
4027 			 *   (CHECK_STATE only) a jump to the next rule if
4028 			 *   the entry is not found ('goto next_rule').
4029 			 *   The result of the lookup is cached so that
4030 			 *   further instances of these opcodes become
4031 			 *   effectively NOPs.  If static rules are changed
4032 			 *   during the state lookup, the packet will
4033 			 *   be dropped and the rule's stats will not be updated
4034 			 *   ('return IP_FW_DENY').
4035 			 */
4036 			case O_REDIRECT:
4037 				if (f->cross_rules == NULL) {
4038 					/*
4039 					 * This rule was not completely set up;
4040 					 * move on to the next rule.
4041 					 */
4042 					goto next_rule;
4043 				}
4044 				/*
4045 				 * Apply redirect only on the input path and
4046 				 * only to non-fragment TCP segments or
4047 				 * UDP datagrams.
4048 				 *
4049 				 * Does _not_ work with layer2 filtering.
4050 				 */
4051 				if (oif != NULL || args->eh != NULL ||
4052 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4053 				    (lc.proto != IPPROTO_TCP &&
4054 				     lc.proto != IPPROTO_UDP))
4055 					break;
4056 				/* FALL THROUGH */
4057 			case O_LIMIT:
4058 			case O_KEEP_STATE:
4059 				if (hlen == 0)
4060 					break;
4061 				s = ipfw_state_install(ctx, f,
4062 				    (ipfw_insn_limit *)cmd, args, lc.tcp);
4063 				if (s == NULL) {
4064 					retval = IP_FW_DENY;
4065 					goto done; /* error/limit violation */
4066 				}
4067 				s->st_pcnt++;
4068 				s->st_bcnt += lc.ip_len;
4069 
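				/*
				 * A redirect (NAT) state was just installed:
				 * create the paired slave state keyed on the
				 * translated flow, which may hash to another
				 * CPU.
				 */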
4070 				if (s->st_type == O_REDIRECT) {
4071 					struct in_addr oaddr;
4072 					uint16_t oport;
4073 					struct ipfw_xlat *slave_x, *x;
4074 					struct ipfw_state *dup;
4075 
4076 					x = (struct ipfw_xlat *)s;
4077 					ipfw_xlate(x, m, &oaddr, &oport);
4078 					m = ipfw_rehashm(m, hlen, args, &lc,
4079 					    &ip);
4080 					if (m == NULL) {
4081 						ipfw_state_del(ctx, s);
4082 						goto pullup_failed;
4083 					}
4084 
4085 					cpuid = netisr_hashcpu(
4086 					    m->m_pkthdr.hash);
4087 
4088 					slave_x = (struct ipfw_xlat *)
4089 					    ipfw_state_alloc(ctx, &args->f_id,
4090 					    O_REDIRECT, f->cross_rules[cpuid],
4091 					    lc.tcp);
4092 					if (slave_x == NULL) {
4093 						ipfw_state_del(ctx, s);
4094 						retval = IP_FW_DENY;
4095 						goto done;
4096 					}
4097 					slave_x->xlat_addr = oaddr.s_addr;
4098 					slave_x->xlat_port = oport;
4099 					slave_x->xlat_dir = MATCH_REVERSE;
4100 					slave_x->xlat_flags |=
4101 					    IPFW_STATE_F_XLATSRC |
4102 					    IPFW_STATE_F_XLATSLAVE;
4103 
4104 					slave_x->xlat_pair = x;
4105 					slave_x->xlat_pcpu = mycpuid;
4106 					x->xlat_pair = slave_x;
4107 					x->xlat_pcpu = cpuid;
4108 
4109 					ctx->ipfw_xlated++;
4110 					if (cpuid != mycpuid) {
4111 						ctx->ipfw_xlate_split++;
4112 						ipfw_xlate_redispatch(
4113 						    m, cpuid, x,
4114 						    IPFW_XLATE_INSERT |
4115 						    IPFW_XLATE_FORWARD);
4116 						args->m = NULL;
4117 						return (IP_FW_REDISPATCH);
4118 					}
4119 
4120 					dup = ipfw_state_link(ctx,
4121 					    &slave_x->xlat_st);
4122 					if (dup != NULL) {
4123 						ctx->ipfw_xlate_conflicts++;
4124 						if (IPFW_STATE_ISDEAD(dup)) {
4125 							ipfw_state_remove(ctx,
4126 							    dup);
4127 							dup = ipfw_state_link(
4128 							ctx, &slave_x->xlat_st);
4129 						}
4130 						if (dup != NULL) {
4131 							if (bootverbose) {
4132 							    kprintf("ipfw: "
4133 							    "slave %u state "
4134 							    "conflicts "
4135 							    "%u state\n",
4136 							    x->xlat_type,
4137 							    s->st_type);
4138 							}
4139 							ipfw_state_del(ctx, s);
4140 							return (IP_FW_DENY);
4141 						}
4142 						ctx->ipfw_xlate_cresolved++;
4143 					}
4144 				}
4145 				match = 1;
4146 				break;
4147 
4148 			case O_PROBE_STATE:
4149 			case O_CHECK_STATE:
4150 				/*
4151 				 * States are checked at the first keep-state
4152 				 * or check-state occurrence, with the result
4153 				 * being stored in dyn_dir.  The compiler
4154 				 * introduces a PROBE_STATE instruction for
4155 				 * us when we have a KEEP_STATE/LIMIT/RDR
4156 				 * (because PROBE_STATE needs to be run first).
4157 				 */
4158 				s = NULL;
4159 				if (dyn_dir == MATCH_UNKNOWN) {
4160 					s = ipfw_state_lookup(ctx,
4161 					    &args->f_id, &dyn_dir, lc.tcp);
4162 				}
4163 				if (s == NULL ||
4164 				    (s->st_type == O_REDIRECT &&
4165 				     (args->eh != NULL ||
4166 				      (ip->ip_off & (IP_MF | IP_OFFMASK)) ||
4167 				      (lc.proto != IPPROTO_TCP &&
4168 				       lc.proto != IPPROTO_UDP)))) {
4169 					/*
4170 					 * State not found.  If CHECK_STATE,
4171 					 * skip to the next rule; if
4172 					 * PROBE_STATE, just ignore it and
4173 					 * continue with the next opcode.
4174 					 */
4175 					if (cmd->opcode == O_CHECK_STATE)
4176 						goto next_rule;
4177 					match = 1;
4178 					break;
4179 				}
4180 
4181 				s->st_pcnt++;
4182 				s->st_bcnt += lc.ip_len;
4183 
4184 				if (s->st_type == O_REDIRECT) {
4185 					struct ipfw_xlat *x =
4186 					    (struct ipfw_xlat *)s;
4187 
4188 					if (oif != NULL &&
4189 					    x->xlat_ifp == NULL) {
4190 						KASSERT(x->xlat_flags &
4191 						    IPFW_STATE_F_XLATSLAVE,
4192 						    ("master rdr state "
4193 						     "missing ifp"));
4194 						x->xlat_ifp = oif;
4195 					} else if (
4196 					    (oif != NULL && x->xlat_ifp!=oif) ||
4197 					    (oif == NULL &&
4198 					     x->xlat_ifp!=m->m_pkthdr.rcvif)) {
4199 						retval = IP_FW_DENY;
4200 						goto done;
4201 					}
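					/*
					 * Translate only when the packet
					 * direction matches the direction
					 * recorded in this xlat.
					 */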
4202 					if (x->xlat_dir != dyn_dir)
4203 						goto skip_xlate;
4204 
4205 					ipfw_xlate(x, m, NULL, NULL);
4206 					m = ipfw_rehashm(m, hlen, args, &lc,
4207 					    &ip);
4208 					if (m == NULL)
4209 						goto pullup_failed;
4210 
4211 					cpuid = netisr_hashcpu(
4212 					    m->m_pkthdr.hash);
4213 					if (cpuid != mycpuid) {
4214 						uint32_t xlate = 0;
4215 
4216 						if (oif != NULL) {
4217 							xlate |=
4218 							    IPFW_XLATE_OUTPUT;
4219 						}
4220 						if (dyn_dir == MATCH_FORWARD) {
4221 							xlate |=
4222 							    IPFW_XLATE_FORWARD;
4223 						}
4224 						ipfw_xlate_redispatch(m, cpuid,
4225 						    x, xlate);
4226 						args->m = NULL;
4227 						return (IP_FW_REDISPATCH);
4228 					}
4229 
4230 					KKASSERT(x->xlat_pcpu == mycpuid);
4231 					ipfw_state_update(&args->f_id, dyn_dir,
4232 					    lc.tcp, &x->xlat_pair->xlat_st);
4233 				}
4234 skip_xlate:
4235 				/*
4236 				 * Found a rule from a state; jump to the
4237 				 * 'action' part of the rule.
4238 				 */
4239 				f = s->st_rule;
4240 				KKASSERT(f->cpuid == mycpuid);
4241 
4242 				cmd = ACTION_PTR(f);
4243 				l = f->cmd_len - f->act_ofs;
4244 				dyn_f = f;
4245 				goto check_body;
4246 
4247 			case O_ACCEPT:
4248 				retval = IP_FW_PASS;	/* accept */
4249 				goto done;
4250 
4251 			case O_DEFRAG:
4252 				if (f->cross_rules == NULL) {
4253 					/*
4254 					 * This rule was not completely set up;
4255 					 * move on to the next rule.
4256 					 */
4257 					goto next_rule;
4258 				}
4259 
4260 				/*
4261 				 * Don't defrag for l2 packets, output packets
4262 				 * or non-fragments.
4263 				 */
4264 				if (oif != NULL || args->eh != NULL ||
4265 				    (ip->ip_off & (IP_MF | IP_OFFMASK)) == 0)
4266 					goto next_rule;
4267 
4268 				ctx->ipfw_frags++;
4269 				m = ip_reass(m);
4270 				args->m = m;
4271 				if (m == NULL) {
4272 					retval = IP_FW_PASS;
4273 					goto done;
4274 				}
4275 				ctx->ipfw_defraged++;
4276 				KASSERT((m->m_flags & M_HASH) == 0,
4277 				    ("hash not cleared"));
4278 
4279 				/* Update statistics */
4280 				f->pcnt++;
4281 				f->bcnt += lc.ip_len;
4282 				f->timestamp = time_second;
4283 
4284 				ip = mtod(m, struct ip *);
4285 				hlen = ip->ip_hl << 2;
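				/*
				 * ip_len of the reassembled packet excludes
				 * the IP header here; add it back, then put
				 * ip_len/ip_off into network byte order for
				 * ip_hashfn().
				 */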
4286 				ip->ip_len += hlen;
4287 
4288 				ip->ip_len = htons(ip->ip_len);
4289 				ip->ip_off = htons(ip->ip_off);
4290 
4291 				ip_hashfn(&m, 0);
4292 				args->m = m;
4293 				if (m == NULL)
4294 					goto pullup_failed;
4295 
4296 				KASSERT(m->m_flags & M_HASH, ("no hash"));
4297 				cpuid = netisr_hashcpu(m->m_pkthdr.hash);
4298 				if (cpuid != mycpuid) {
4299 					/*
4300 					 * NOTE:
4301 					 * ip_len/ip_off are in network byte
4302 					 * order.
4303 					 */
4304 					ctx->ipfw_defrag_remote++;
4305 					ipfw_defrag_redispatch(m, cpuid, f);
4306 					args->m = NULL;
4307 					return (IP_FW_REDISPATCH);
4308 				}
4309 
4310 				/* 'm' might be changed by ip_hashfn(). */
4311 				ip = mtod(m, struct ip *);
4312 				ip->ip_len = ntohs(ip->ip_len);
4313 				ip->ip_off = ntohs(ip->ip_off);
4314 
4315 				m = ipfw_setup_local(m, hlen, args, &lc, &ip);
4316 				if (m == NULL)
4317 					goto pullup_failed;
4318 
4319 				/* Move on. */
4320 				goto next_rule;
4321 
4322 			case O_PIPE:
4323 			case O_QUEUE:
4324 				args->rule = f; /* report matching rule */
4325 				args->cookie = cmd->arg1;
4326 				retval = IP_FW_DUMMYNET;
4327 				goto done;
4328 
4329 			case O_DIVERT:
4330 			case O_TEE:
4331 				if (args->eh) /* not on layer 2 */
4332 					break;
4333 
4334 				mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
4335 				    sizeof(*divinfo), M_INTWAIT | M_NULLOK);
4336 				if (mtag == NULL) {
4337 					retval = IP_FW_DENY;
4338 					goto done;
4339 				}
4340 				divinfo = m_tag_data(mtag);
4341 
4342 				divinfo->skipto = f->rulenum;
4343 				divinfo->port = cmd->arg1;
4344 				divinfo->tee = (cmd->opcode == O_TEE);
4345 				m_tag_prepend(m, mtag);
4346 
4347 				args->cookie = cmd->arg1;
4348 				retval = (cmd->opcode == O_DIVERT) ?
4349 					 IP_FW_DIVERT : IP_FW_TEE;
4350 				goto done;
4351 
4352 			case O_COUNT:
4353 			case O_SKIPTO:
4354 				f->pcnt++;	/* update stats */
4355 				f->bcnt += lc.ip_len;
4356 				f->timestamp = time_second;
4357 				if (cmd->opcode == O_COUNT)
4358 					goto next_rule;
4359 				/* handle skipto */
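				/*
				 * next_rule caches the skipto target; it is
				 * cleared by ipfw_flush_rule_ptrs() whenever
				 * the rule list changes.
				 */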
4360 				if (f->next_rule == NULL)
4361 					lookup_next_rule(f);
4362 				f = f->next_rule;
4363 				goto again;
4364 
4365 			case O_REJECT:
4366 				/*
4367 				 * Drop the packet and send a reject notice
4368 				 * if the packet is not ICMP (or is an ICMP
4369 				 * query), and it is not multicast/broadcast.
4370 				 */
4371 				if (hlen > 0 &&
4372 				    (lc.proto != IPPROTO_ICMP ||
4373 				     is_icmp_query(ip)) &&
4374 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
4375 				    !IN_MULTICAST(ntohl(lc.dst_ip.s_addr))) {
4376 					send_reject(args, cmd->arg1,
4377 					    lc.offset, lc.ip_len);
4378 					retval = IP_FW_DENY;
4379 					goto done;
4380 				}
4381 				/* FALLTHROUGH */
4382 			case O_DENY:
4383 				retval = IP_FW_DENY;
4384 				goto done;
4385 
4386 			case O_FORWARD_IP:
4387 				if (args->eh)	/* not valid on layer2 pkts */
4388 					break;
4389 				if (!dyn_f || dyn_dir == MATCH_FORWARD) {
4390 					struct sockaddr_in *sin;
4391 
4392 					mtag = m_tag_get(PACKET_TAG_IPFORWARD,
4393 					    sizeof(*sin), M_INTWAIT | M_NULLOK);
4394 					if (mtag == NULL) {
4395 						retval = IP_FW_DENY;
4396 						goto done;
4397 					}
4398 					sin = m_tag_data(mtag);
4399 
4400 					/* Structure copy */
4401 					*sin = ((ipfw_insn_sa *)cmd)->sa;
4402 
4403 					m_tag_prepend(m, mtag);
4404 					m->m_pkthdr.fw_flags |=
4405 						IPFORWARD_MBUF_TAGGED;
4406 					m->m_pkthdr.fw_flags &=
4407 						~BRIDGE_MBUF_TAGGED;
4408 				}
4409 				retval = IP_FW_PASS;
4410 				goto done;
4411 
4412 			default:
4413 				panic("-- unknown opcode %d", cmd->opcode);
4414 			} /* end of switch() on opcodes */
4415 
4416 			if (cmd->len & F_NOT)
4417 				match = !match;
4418 
4419 			if (match) {
4420 				if (cmd->len & F_OR)
4421 					skip_or = 1;
4422 			} else {
4423 				if (!(cmd->len & F_OR)) /* not an OR block, */
4424 					break;		/* try next rule    */
4425 			}
4426 
4427 		}	/* end of inner for, scan opcodes */
4428 
4429 next_rule:;		/* try next rule		*/
4430 
4431 	}		/* end of outer for, scan rules */
4432 	kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
4433 	return IP_FW_DENY;
4434 
4435 done:
4436 	/* Update statistics */
4437 	f->pcnt++;
4438 	f->bcnt += lc.ip_len;
4439 	f->timestamp = time_second;
4440 	return retval;
4441 
4442 pullup_failed:
4443 	if (fw_verbose)
4444 		kprintf("pullup failed\n");
4445 	return IP_FW_DENY;
4446 }
4447 
4448 static struct mbuf *
4449 ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
4450 {
4451 	struct m_tag *mtag;
4452 	struct dn_pkt *pkt;
4453 	ipfw_insn *cmd;
4454 	const struct ipfw_flow_id *id;
4455 	struct dn_flow_id *fid;
4456 
4457 	M_ASSERTPKTHDR(m);
4458 
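	/*
	 * Attach a dummynet tag carrying the pipe/queue number, the flow
	 * id and the matching rule, so dummynet can resume processing.
	 */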
4459 	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt),
4460 	    M_INTWAIT | M_NULLOK);
4461 	if (mtag == NULL) {
4462 		m_freem(m);
4463 		return (NULL);
4464 	}
4465 	m_tag_prepend(m, mtag);
4466 
4467 	pkt = m_tag_data(mtag);
4468 	bzero(pkt, sizeof(*pkt));
4469 
4470 	cmd = fwa->rule->cmd + fwa->rule->act_ofs;
4471 	if (cmd->opcode == O_LOG)
4472 		cmd += F_LEN(cmd);
4473 	KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
4474 		("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));
4475 
4476 	pkt->dn_m = m;
4477 	pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
4478 	pkt->ifp = fwa->oif;
4479 	pkt->pipe_nr = pipe_nr;
4480 
4481 	pkt->cpuid = mycpuid;
4482 	pkt->msgport = netisr_curport();
4483 
4484 	id = &fwa->f_id;
4485 	fid = &pkt->id;
4486 	fid->fid_dst_ip = id->dst_ip;
4487 	fid->fid_src_ip = id->src_ip;
4488 	fid->fid_dst_port = id->dst_port;
4489 	fid->fid_src_port = id->src_port;
4490 	fid->fid_proto = id->proto;
4491 	fid->fid_flags = id->flags;
4492 
4493 	ipfw_ref_rule(fwa->rule);
4494 	pkt->dn_priv = fwa->rule;
4495 	pkt->dn_unref_priv = ipfw_unref_rule;
4496 
4497 	if (cmd->opcode == O_PIPE)
4498 		pkt->dn_flags |= DN_FLAGS_IS_PIPE;
4499 
4500 	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
4501 	return (m);
4502 }
4503 
4504 /*
4505  * When a rule is added/deleted, clear the next_rule pointers in all rules.
4506  * These will be reconstructed on the fly as packets are matched.
4507  */
4508 static void
4509 ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
4510 {
4511 	struct ip_fw *rule;
4512 
4513 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
4514 		rule->next_rule = NULL;
4515 }
4516 
4517 static void
4518 ipfw_inc_static_count(struct ip_fw *rule)
4519 {
4520 	/* Static rule's counts are updated only on CPU0 */
4521 	KKASSERT(mycpuid == 0);
4522 
4523 	static_count++;
4524 	static_ioc_len += IOC_RULESIZE(rule);
4525 }
4526 
4527 static void
4528 ipfw_dec_static_count(struct ip_fw *rule)
4529 {
4530 	int l = IOC_RULESIZE(rule);
4531 
4532 	/* Static rule's counts are updated only on CPU0 */
4533 	KKASSERT(mycpuid == 0);
4534 
4535 	KASSERT(static_count > 0, ("invalid static count %u", static_count));
4536 	static_count--;
4537 
4538 	KASSERT(static_ioc_len >= l,
4539 		("invalid static len %u", static_ioc_len));
4540 	static_ioc_len -= l;
4541 }
4542 
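/*
 * Link the per-CPU duplicates of a rule into a 'sibling' chain, in
 * CPU order, as the netmsg visits each CPU in turn.
 */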
4543 static void
4544 ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
4545 {
4546 	if (fwmsg->sibling != NULL) {
4547 		KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
4548 		fwmsg->sibling->sibling = rule;
4549 	}
4550 	fwmsg->sibling = rule;
4551 }
4552 
4553 static struct ip_fw *
4554 ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4555 {
4556 	struct ip_fw *rule;
4557 
4558 	rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);
4559 
4560 	rule->act_ofs = ioc_rule->act_ofs;
4561 	rule->cmd_len = ioc_rule->cmd_len;
4562 	rule->rulenum = ioc_rule->rulenum;
4563 	rule->set = ioc_rule->set;
4564 	rule->usr_flags = ioc_rule->usr_flags;
4565 
4566 	bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);
4567 
4568 	rule->refcnt = 1;
4569 	rule->cpuid = mycpuid;
4570 	rule->rule_flags = rule_flags;
4571 
4572 	return rule;
4573 }
4574 
4575 static void
4576 ipfw_add_rule_dispatch(netmsg_t nmsg)
4577 {
4578 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4579 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4580 	struct ip_fw *rule;
4581 
4582 	ASSERT_NETISR_NCPUS(mycpuid);
4583 
4584 	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);
4585 
4586 	/*
4587 	 * Insert the rule into the pre-determined position.
4588 	 */
4589 	if (fwmsg->prev_rule != NULL) {
4590 		struct ip_fw *prev, *next;
4591 
4592 		prev = fwmsg->prev_rule;
4593 		KKASSERT(prev->cpuid == mycpuid);
4594 
4595 		next = fwmsg->next_rule;
4596 		KKASSERT(next->cpuid == mycpuid);
4597 
4598 		rule->next = next;
4599 		prev->next = rule;
4600 
4601 		/*
4602 		 * Move to the position on the next CPU
4603 		 * before the msg is forwarded.
4604 		 */
4605 		fwmsg->prev_rule = prev->sibling;
4606 		fwmsg->next_rule = next->sibling;
4607 	} else {
4608 		KKASSERT(fwmsg->next_rule == NULL);
4609 		rule->next = ctx->ipfw_layer3_chain;
4610 		ctx->ipfw_layer3_chain = rule;
4611 	}
4612 
4613 	/* Link rule CPU sibling */
4614 	ipfw_link_sibling(fwmsg, rule);
4615 
4616 	ipfw_flush_rule_ptrs(ctx);
4617 
4618 	if (mycpuid == 0) {
4619 		/* Statistics only need to be updated once */
4620 		ipfw_inc_static_count(rule);
4621 
4622 		/* Return the rule on CPU0 */
4623 		nmsg->lmsg.u.ms_resultp = rule;
4624 	}
4625 
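	/*
	 * lmsg.u.ms_resultp was set to the CPU0 rule above and is
	 * preserved as the message is forwarded, so the CPU0 rule
	 * address serves as the rule's global track id on every CPU.
	 */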
4626 	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
4627 		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;
4628 
4629 	if (fwmsg->cross_rules != NULL) {
4630 		/* Save rules for later use. */
4631 		fwmsg->cross_rules[mycpuid] = rule;
4632 	}
4633 
4634 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4635 }
4636 
4637 static void
4638 ipfw_crossref_rule_dispatch(netmsg_t nmsg)
4639 {
4640 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
4641 	struct ip_fw *rule = fwmsg->sibling;
4642 	int sz = sizeof(struct ip_fw *) * netisr_ncpus;
4643 
4644 	ASSERT_NETISR_NCPUS(mycpuid);
4645 	KASSERT(rule->rule_flags & IPFW_RULE_F_CROSSREF,
4646 	    ("not crossref rule"));
4647 
4648 	rule->cross_rules = kmalloc(sz, M_IPFW, M_WAITOK);
4649 	memcpy(rule->cross_rules, fwmsg->cross_rules, sz);
4650 
4651 	fwmsg->sibling = rule->sibling;
4652 	netisr_forwardmsg(&fwmsg->base, mycpuid + 1);
4653 }
4654 
4655 /*
4656  * Add a new rule to the list.  Copy the rule into a malloc'ed area,
4657  * then possibly create a rule number and add the rule to the list.
4658  * Update the rule_number in the input struct so the caller knows
4659  * it as well.
4660  */
4661 static void
4662 ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
4663 {
4664 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4665 	struct netmsg_ipfw fwmsg;
4666 	struct ip_fw *f, *prev, *rule;
4667 
4668 	ASSERT_NETISR0;
4669 
4670 	/*
4671 	 * If rulenum is 0, find the highest numbered rule before the
4672 	 * default rule, and add the auto-increment step to it.
4673 	 */
4674 	if (ioc_rule->rulenum == 0) {
4675 		int step = autoinc_step;
4676 
4677 		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
4678 			 step <= IPFW_AUTOINC_STEP_MAX);
4679 
4680 		/*
4681 		 * Locate the highest numbered rule before default
4682 		 */
4683 		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
4684 			if (f->rulenum == IPFW_DEFAULT_RULE)
4685 				break;
4686 			ioc_rule->rulenum = f->rulenum;
4687 		}
4688 		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
4689 			ioc_rule->rulenum += step;
4690 	}
4691 	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
4692 		ioc_rule->rulenum != 0,
4693 		("invalid rule num %d", ioc_rule->rulenum));
4694 
4695 	/*
4696 	 * Now find the right place for the new rule in the sorted list.
4697 	 */
4698 	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
4699 	     prev = f, f = f->next) {
4700 		if (f->rulenum > ioc_rule->rulenum) {
4701 			/* Found the location */
4702 			break;
4703 		}
4704 	}
4705 	KASSERT(f != NULL, ("no default rule?!"));
4706 
4707 	/*
4708 	 * Duplicate the rule onto each CPU.
4709 	 * The rule duplicated on CPU0 will be returned.
4710 	 */
4711 	bzero(&fwmsg, sizeof(fwmsg));
4712 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4713 	    ipfw_add_rule_dispatch);
4714 	fwmsg.ioc_rule = ioc_rule;
4715 	fwmsg.prev_rule = prev;
4716 	fwmsg.next_rule = prev == NULL ? NULL : f;
4717 	fwmsg.rule_flags = rule_flags;
4718 	if (rule_flags & IPFW_RULE_F_CROSSREF) {
4719 		fwmsg.cross_rules = kmalloc(
4720 		    sizeof(struct ip_fw *) * netisr_ncpus, M_TEMP,
4721 		    M_WAITOK | M_ZERO);
4722 	}
4723 
4724 	netisr_domsg_global(&fwmsg.base);
4725 	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);
4726 
4727 	rule = fwmsg.base.lmsg.u.ms_resultp;
4728 	KKASSERT(rule != NULL && rule->cpuid == mycpuid);
4729 
4730 	if (fwmsg.cross_rules != NULL) {
4731 		netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport,
4732 		    MSGF_PRIORITY, ipfw_crossref_rule_dispatch);
4733 		fwmsg.sibling = rule;
4734 		netisr_domsg_global(&fwmsg.base);
4735 		KKASSERT(fwmsg.sibling == NULL);
4736 
4737 		kfree(fwmsg.cross_rules, M_TEMP);
4738 
4739 #ifdef KLD_MODULE
4740 		atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
4741 #endif
4742 	}
4743 
4744 	DPRINTF("++ installed rule %d, static count now %d\n",
4745 		rule->rulenum, static_count);
4746 }
4747 
4748 /*
4749  * Free storage associated with a static rule (including derived
4750  * states/tracks).
4751  * The caller is in charge of clearing rule pointers to avoid
4752  * dangling pointers.
4753  * @return a pointer to the next entry.
4754  * Arguments are not checked, so they better be correct.
4755  */
4756 static struct ip_fw *
4757 ipfw_delete_rule(struct ipfw_context *ctx,
4758 		 struct ip_fw *prev, struct ip_fw *rule)
4759 {
4760 	struct ip_fw *n;
4761 
4762 	n = rule->next;
4763 	if (prev == NULL)
4764 		ctx->ipfw_layer3_chain = n;
4765 	else
4766 		prev->next = n;
4767 
4768 	/* Mark the rule as invalid */
4769 	rule->rule_flags |= IPFW_RULE_F_INVALID;
4770 	rule->next_rule = NULL;
4771 	rule->sibling = NULL;
4772 #ifdef foo
4773 	/* Don't reset cpuid here; keep various assertions working */
4774 	rule->cpuid = -1;
4775 #endif
4776 
4777 	/* Statistics only need to be updated once */
4778 	if (mycpuid == 0)
4779 		ipfw_dec_static_count(rule);
4780 
4781 	if ((rule->rule_flags & IPFW_RULE_F_CROSSREF) == 0) {
4782 		/* Try to free this rule */
4783 		ipfw_free_rule(rule);
4784 	} else {
4785 		/* TODO: check staging area. */
4786 		if (mycpuid == 0) {
4787 			rule->next = ipfw_gd.ipfw_crossref_free;
4788 			ipfw_gd.ipfw_crossref_free = rule;
4789 		}
4790 	}
4791 
4792 	/* Return the next rule */
4793 	return n;
4794 }
4795 
4796 static void
4797 ipfw_flush_dispatch(netmsg_t nmsg)
4798 {
4799 	int kill_default = nmsg->lmsg.u.ms_result;
4800 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4801 	struct ip_fw *rule;
4802 
4803 	ASSERT_NETISR_NCPUS(mycpuid);
4804 
4805 	/*
4806 	 * Flush states.
4807 	 */
4808 	ipfw_state_flush(ctx, NULL);
4809 	KASSERT(ctx->ipfw_state_cnt == 0,
4810 	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
4811 	ctx->ipfw_state_loosecnt = 0;
4812 	ctx->ipfw_state_lastexp = 0;
4813 
4814 	/*
4815 	 * Flush tracks.
4816 	 */
4817 	ipfw_track_flush(ctx, NULL);
4818 	ctx->ipfw_track_lastexp = 0;
4819 	if (ctx->ipfw_trkcnt_spare != NULL) {
4820 		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
4821 		ctx->ipfw_trkcnt_spare = NULL;
4822 	}
4823 
4824 	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */
4825 
4826 	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
4827 	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
4828 		ipfw_delete_rule(ctx, NULL, rule);
4829 
4830 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4831 }
4832 
4833 /*
4834  * Deletes all rules from a chain (including the default rule
4835  * if the second argument is set).
4836  */
4837 static void
4838 ipfw_flush(int kill_default)
4839 {
4840 	struct netmsg_base nmsg;
4841 #ifdef INVARIANTS
4842 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4843 	int state_cnt;
4844 #endif
4845 
4846 	ASSERT_NETISR0;
4847 
4848 	/*
4849 	 * If 'kill_default' is set, the caller has done the necessary
4850 	 * msgport syncing; no need to do it again.
4851 	 */
4852 	if (!kill_default) {
4853 		/*
4854 		 * Let ipfw_chk() know the rules are going to
4855 		 * be flushed, so it could jump directly to
4856 		 * the default rule.
4857 		 */
4858 		ipfw_flushing = 1;
4859 		/* XXX use priority sync */
4860 		netmsg_service_sync();
4861 	}
4862 
4863 	/*
4864 	 * Press the 'flush' button
4865 	 */
4866 	bzero(&nmsg, sizeof(nmsg));
4867 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4868 	    ipfw_flush_dispatch);
4869 	nmsg.lmsg.u.ms_result = kill_default;
4870 	netisr_domsg_global(&nmsg);
4871 	ipfw_gd.ipfw_state_loosecnt = 0;
4872 	ipfw_gd.ipfw_state_globexp = 0;
4873 	ipfw_gd.ipfw_track_globexp = 0;
4874 
4875 #ifdef INVARIANTS
4876 	state_cnt = ipfw_state_cntcoll();
4877 	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));
4878 
4879 	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
4880 	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));
4881 
4882 	if (kill_default) {
4883 		KASSERT(static_count == 0,
4884 			("%u static rules remain", static_count));
4885 		KASSERT(static_ioc_len == 0,
4886 			("%u bytes of static rules remain", static_ioc_len));
4887 	} else {
4888 		KASSERT(static_count == 1,
4889 			("%u static rules remain", static_count));
4890 		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
4891 			("%u bytes of static rules remain, should be %lu",
4892 			 static_ioc_len,
4893 			 (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
4894 	}
4895 #endif
4896 
4897 	/* Flush is done */
4898 	ipfw_flushing = 0;
4899 }
4900 
4901 static void
4902 ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
4903 {
4904 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4905 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4906 	struct ip_fw *rule, *prev;
4907 
4908 	ASSERT_NETISR_NCPUS(mycpuid);
4909 
4910 	rule = dmsg->start_rule;
4911 	KKASSERT(rule->cpuid == mycpuid);
4912 	dmsg->start_rule = rule->sibling;
4913 
4914 	prev = dmsg->prev_rule;
4915 	if (prev != NULL) {
4916 		KKASSERT(prev->cpuid == mycpuid);
4917 
4918 		/*
4919 		 * Move to the position on the next CPU
4920 		 * before the msg is forwarded.
4921 		 */
4922 		dmsg->prev_rule = prev->sibling;
4923 	}
4924 
4925 	/*
4926 	 * Flush pointers outside the loop, then delete all matching
4927 	 * rules.  'prev' remains the same throughout the cycle.
4928 	 */
4929 	ipfw_flush_rule_ptrs(ctx);
4930 	while (rule && rule->rulenum == dmsg->rulenum) {
4931 		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4932 			/* Flush states generated by this rule. */
4933 			ipfw_state_flush(ctx, rule);
4934 		}
4935 		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
4936 			/* Flush tracks generated by this rule. */
4937 			ipfw_track_flush(ctx, rule);
4938 		}
4939 		rule = ipfw_delete_rule(ctx, prev, rule);
4940 	}
4941 
4942 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
4943 }
4944 
4945 static int
4946 ipfw_alt_delete_rule(uint16_t rulenum)
4947 {
4948 	struct ip_fw *prev, *rule;
4949 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4950 	struct netmsg_del dmsg;
4951 
4952 	ASSERT_NETISR0;
4953 
4954 	/*
4955 	 * Locate first rule to delete
4956 	 */
4957 	for (prev = NULL, rule = ctx->ipfw_layer3_chain;
4958 	     rule && rule->rulenum < rulenum;
4959 	     prev = rule, rule = rule->next)
4960 		; /* EMPTY */
4961 	if (rule->rulenum != rulenum)
4962 		return EINVAL;
4963 
4964 	/*
4965 	 * Get rid of the rule duplications on all CPUs
4966 	 */
4967 	bzero(&dmsg, sizeof(dmsg));
4968 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
4969 	    ipfw_alt_delete_rule_dispatch);
4970 	dmsg.prev_rule = prev;
4971 	dmsg.start_rule = rule;
4972 	dmsg.rulenum = rulenum;
4973 
4974 	netisr_domsg_global(&dmsg.base);
4975 	KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
4976 	return 0;
4977 }
4978 
4979 static void
4980 ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
4981 {
4982 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
4983 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
4984 	struct ip_fw *prev, *rule;
4985 #ifdef INVARIANTS
4986 	int del = 0;
4987 #endif
4988 
4989 	ASSERT_NETISR_NCPUS(mycpuid);
4990 
4991 	ipfw_flush_rule_ptrs(ctx);
4992 
4993 	prev = NULL;
4994 	rule = ctx->ipfw_layer3_chain;
4995 	while (rule != NULL) {
4996 		if (rule->set == dmsg->from_set) {
4997 			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
4998 				/* Flush states generated by this rule. */
4999 				ipfw_state_flush(ctx, rule);
5000 			}
5001 			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
5002 				/* Flush tracks generated by this rule. */
5003 				ipfw_track_flush(ctx, rule);
5004 			}
5005 			rule = ipfw_delete_rule(ctx, prev, rule);
5006 #ifdef INVARIANTS
5007 			del = 1;
5008 #endif
5009 		} else {
5010 			prev = rule;
5011 			rule = rule->next;
5012 		}
5013 	}
5014 	KASSERT(del, ("no match set?!"));
5015 
5016 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5017 }
5018 
5019 static int
5020 ipfw_alt_delete_ruleset(uint8_t set)
5021 {
5022 	struct netmsg_del dmsg;
5023 	int del;
5024 	struct ip_fw *rule;
5025 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5026 
5027 	ASSERT_NETISR0;
5028 
5029 	/*
5030 	 * Check whether the 'set' exists, i.e. whether any
5031 	 * rules belong to it.  If not, there is nothing to
5032 	 * delete.
5033 	 */
5034 	del = 0;
5035 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5036 		if (rule->set == set)
5037 			del = 1;
5038 	}
5039 	if (!del)
5040 		return 0; /* XXX EINVAL? */
5041 
5042 	/*
5043 	 * Delete this set
5044 	 */
5045 	bzero(&dmsg, sizeof(dmsg));
5046 	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5047 	    ipfw_alt_delete_ruleset_dispatch);
5048 	dmsg.from_set = set;
5049 	netisr_domsg_global(&dmsg.base);
5050 
5051 	return 0;
5052 }
5053 
5054 static void
5055 ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
5056 {
5057 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5058 	struct ip_fw *rule;
5059 
5060 	ASSERT_NETISR_NCPUS(mycpuid);
5061 
5062 	rule = dmsg->start_rule;
5063 	KKASSERT(rule->cpuid == mycpuid);
5064 
5065 	/*
5066 	 * Move to the position on the next CPU
5067 	 * before the msg is forwarded.
5068 	 */
5069 	dmsg->start_rule = rule->sibling;
5070 
5071 	while (rule && rule->rulenum <= dmsg->rulenum) {
5072 		if (rule->rulenum == dmsg->rulenum)
5073 			rule->set = dmsg->to_set;
5074 		rule = rule->next;
5075 	}
5076 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5077 }
5078 
5079 static int
5080 ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
5081 {
5082 	struct netmsg_del dmsg;
5083 	struct netmsg_base *nmsg;
5084 	struct ip_fw *rule;
5085 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5086 
5087 	ASSERT_NETISR0;
5088 
5089 	/*
5090 	 * Locate first rule to move
5091 	 */
5092 	for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
5093 	     rule = rule->next) {
5094 		if (rule->rulenum == rulenum && rule->set != set)
5095 			break;
5096 	}
5097 	if (rule == NULL || rule->rulenum > rulenum)
5098 		return 0; /* XXX error? */
5099 
5100 	bzero(&dmsg, sizeof(dmsg));
5101 	nmsg = &dmsg.base;
5102 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5103 	    ipfw_alt_move_rule_dispatch);
5104 	dmsg.start_rule = rule;
5105 	dmsg.rulenum = rulenum;
5106 	dmsg.to_set = set;
5107 
5108 	netisr_domsg_global(nmsg);
5109 	KKASSERT(dmsg.start_rule == NULL);
5110 	return 0;
5111 }
5112 
5113 static void
5114 ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
5115 {
5116 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5117 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5118 	struct ip_fw *rule;
5119 
5120 	ASSERT_NETISR_NCPUS(mycpuid);
5121 
5122 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5123 		if (rule->set == dmsg->from_set)
5124 			rule->set = dmsg->to_set;
5125 	}
5126 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5127 }
5128 
5129 static int
5130 ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
5131 {
5132 	struct netmsg_del dmsg;
5133 	struct netmsg_base *nmsg;
5134 
5135 	ASSERT_NETISR0;
5136 
5137 	bzero(&dmsg, sizeof(dmsg));
5138 	nmsg = &dmsg.base;
5139 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5140 	    ipfw_alt_move_ruleset_dispatch);
5141 	dmsg.from_set = from_set;
5142 	dmsg.to_set = to_set;
5143 
5144 	netisr_domsg_global(nmsg);
5145 	return 0;
5146 }
5147 
5148 static void
5149 ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
5150 {
5151 	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
5152 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5153 	struct ip_fw *rule;
5154 
5155 	ASSERT_NETISR_NCPUS(mycpuid);
5156 
5157 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5158 		if (rule->set == dmsg->from_set)
5159 			rule->set = dmsg->to_set;
5160 		else if (rule->set == dmsg->to_set)
5161 			rule->set = dmsg->from_set;
5162 	}
5163 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5164 }
5165 
5166 static int
5167 ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
5168 {
5169 	struct netmsg_del dmsg;
5170 	struct netmsg_base *nmsg;
5171 
5172 	ASSERT_NETISR0;
5173 
5174 	bzero(&dmsg, sizeof(dmsg));
5175 	nmsg = &dmsg.base;
5176 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5177 	    ipfw_alt_swap_ruleset_dispatch);
5178 	dmsg.from_set = set1;
5179 	dmsg.to_set = set2;
5180 
5181 	netisr_domsg_global(nmsg);
5182 	return 0;
5183 }
5184 
5185 /*
5186  * Remove all rules with given number, and also do set manipulation.
5187  *
5188  * The argument is a uint32_t.  The low 16 bits are the rule or set number,
5189  * the next 8 bits are the new set, the top 8 bits are the command:
5190  *
5191  *	0	delete rules with given number
5192  *	1	delete rules with given set number
5193  *	2	move rules with given number to new set
5194  *	3	move rules with given set number to new set
5195  *	4	swap sets with given numbers
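 *
 * E.g. arg = (4 << 24) | (2 << 16) | 1 swaps sets 1 and 2.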
5196  */
5197 static int
5198 ipfw_ctl_alter(uint32_t arg)
5199 {
5200 	uint16_t rulenum;
5201 	uint8_t cmd, new_set;
5202 	int error = 0;
5203 
5204 	ASSERT_NETISR0;
5205 
5206 	rulenum = arg & 0xffff;
5207 	cmd = (arg >> 24) & 0xff;
5208 	new_set = (arg >> 16) & 0xff;
5209 
5210 	if (cmd > 4)
5211 		return EINVAL;
5212 	if (new_set >= IPFW_DEFAULT_SET)
5213 		return EINVAL;
5214 	if (cmd == 0 || cmd == 2) {
5215 		if (rulenum == IPFW_DEFAULT_RULE)
5216 			return EINVAL;
5217 	} else {
5218 		if (rulenum >= IPFW_DEFAULT_SET)
5219 			return EINVAL;
5220 	}
5221 
5222 	switch (cmd) {
5223 	case 0:	/* delete rules with given number */
5224 		error = ipfw_alt_delete_rule(rulenum);
5225 		break;
5226 
5227 	case 1:	/* delete all rules with given set number */
5228 		error = ipfw_alt_delete_ruleset(rulenum);
5229 		break;
5230 
5231 	case 2:	/* move rules with given number to new set */
5232 		error = ipfw_alt_move_rule(rulenum, new_set);
5233 		break;
5234 
5235 	case 3: /* move rules with given set number to new set */
5236 		error = ipfw_alt_move_ruleset(rulenum, new_set);
5237 		break;
5238 
5239 	case 4: /* swap two sets */
5240 		error = ipfw_alt_swap_ruleset(rulenum, new_set);
5241 		break;
5242 	}
5243 	return error;
5244 }
5245 
5246 /*
5247  * Clear counters for a specific rule.
5248  */
5249 static void
5250 clear_counters(struct ip_fw *rule, int log_only)
5251 {
5252 	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
5253 
5254 	if (log_only == 0) {
5255 		rule->bcnt = rule->pcnt = 0;
5256 		rule->timestamp = 0;
5257 	}
5258 	if (l->o.opcode == O_LOG)
5259 		l->log_left = l->max_log;
5260 }
5261 
5262 static void
5263 ipfw_zero_entry_dispatch(netmsg_t nmsg)
5264 {
5265 	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
5266 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5267 	struct ip_fw *rule;
5268 
5269 	ASSERT_NETISR_NCPUS(mycpuid);
5270 
5271 	if (zmsg->rulenum == 0) {
5272 		KKASSERT(zmsg->start_rule == NULL);
5273 
5274 		ctx->ipfw_norule_counter = 0;
5275 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5276 			clear_counters(rule, zmsg->log_only);
5277 	} else {
5278 		struct ip_fw *start = zmsg->start_rule;
5279 
5280 		KKASSERT(start->cpuid == mycpuid);
5281 		KKASSERT(start->rulenum == zmsg->rulenum);
5282 
5283 		/*
5284 		 * We can have multiple rules with the same number, so we
5285 		 * need to clear them all.
5286 		 */
5287 		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
5288 		     rule = rule->next)
5289 			clear_counters(rule, zmsg->log_only);
5290 
5291 		/*
5292 		 * Move to the position on the next CPU
5293 		 * before the msg is forwarded.
5294 		 */
5295 		zmsg->start_rule = start->sibling;
5296 	}
5297 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5298 }
5299 
5300 /*
5301  * Reset some or all counters on firewall rules.
5302  * @arg rulenum is 0 to clear all entries, or a specific
5303  * rule number.
5304  * @arg log_only is 1 if we only want to reset logs, zero otherwise.
5305  */
5306 static int
5307 ipfw_ctl_zero_entry(int rulenum, int log_only)
5308 {
5309 	struct netmsg_zent zmsg;
5310 	struct netmsg_base *nmsg;
5311 	const char *msg;
5312 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5313 
5314 	ASSERT_NETISR0;
5315 
5316 	bzero(&zmsg, sizeof(zmsg));
5317 	nmsg = &zmsg.base;
5318 	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5319 	    ipfw_zero_entry_dispatch);
5320 	zmsg.log_only = log_only;
5321 
5322 	if (rulenum == 0) {
5323 		msg = log_only ? "ipfw: All logging counts reset.\n"
5324 			       : "ipfw: Accounting cleared.\n";
5325 	} else {
5326 		struct ip_fw *rule;
5327 
5328 		/*
5329 		 * Locate the first rule with 'rulenum'
5330 		 */
5331 		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
5332 			if (rule->rulenum == rulenum)
5333 				break;
5334 		}
5335 		if (rule == NULL) /* we did not find any matching rules */
5336 			return (EINVAL);
5337 		zmsg.start_rule = rule;
5338 		zmsg.rulenum = rulenum;
5339 
5340 		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
5341 			       : "ipfw: Entry %d cleared.\n";
5342 	}
5343 	netisr_domsg_global(nmsg);
5344 	KKASSERT(zmsg.start_rule == NULL);
5345 
5346 	if (fw_verbose)
5347 		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
5348 	return (0);
5349 }
5350 
5351 /*
5352  * Check validity of the structure before insert.
5353  * Fortunately rules are simple, so this mostly needs to check rule sizes.
5354  */
5355 static int
5356 ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
5357 {
5358 	int l, cmdlen = 0;
5359 	int have_action = 0;
5360 	ipfw_insn *cmd;
5361 
5362 	*rule_flags = 0;
5363 
5364 	/* Check for valid size */
5365 	if (size < sizeof(*rule)) {
5366 		kprintf("ipfw: rule too short\n");
5367 		return EINVAL;
5368 	}
5369 	l = IOC_RULESIZE(rule);
5370 	if (l != size) {
5371 		kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
5372 		return EINVAL;
5373 	}
5374 
5375 	/* Check rule number */
5376 	if (rule->rulenum == IPFW_DEFAULT_RULE) {
5377 		kprintf("ipfw: invalid rule number\n");
5378 		return EINVAL;
5379 	}
5380 
5381 	/*
5382 	 * Now go for the individual checks. Very simple ones, basically only
5383 	 * instruction sizes.
5384 	 */
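	/* NB: cmd_len and F_LEN() count 32-bit words, not bytes. */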
5385 	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
5386 	     l -= cmdlen, cmd += cmdlen) {
5387 		cmdlen = F_LEN(cmd);
5388 		if (cmdlen > l) {
5389 			kprintf("ipfw: opcode %d size truncated\n",
5390 				cmd->opcode);
5391 			return EINVAL;
5392 		}
5393 
5394 		DPRINTF("ipfw: opcode %d\n", cmd->opcode);
5395 
5396 		if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT ||
5397 		    IPFW_ISXLAT(cmd->opcode)) {
5398 			/* This rule will generate states. */
5399 			*rule_flags |= IPFW_RULE_F_GENSTATE;
5400 			if (cmd->opcode == O_LIMIT)
5401 				*rule_flags |= IPFW_RULE_F_GENTRACK;
5402 		}
5403 		if (cmd->opcode == O_DEFRAG || IPFW_ISXLAT(cmd->opcode))
5404 			*rule_flags |= IPFW_RULE_F_CROSSREF;
5405 		if (cmd->opcode == O_IP_SRC_IFIP ||
5406 		    cmd->opcode == O_IP_DST_IFIP) {
5407 			*rule_flags |= IPFW_RULE_F_DYNIFADDR;
5408 			cmd->arg1 &= IPFW_IFIP_SETTINGS;
5409 		}
5410 
5411 		switch (cmd->opcode) {
5412 		case O_NOP:
5413 		case O_PROBE_STATE:
5414 		case O_KEEP_STATE:
5415 		case O_PROTO:
5416 		case O_IP_SRC_ME:
5417 		case O_IP_DST_ME:
5418 		case O_LAYER2:
5419 		case O_IN:
5420 		case O_FRAG:
5421 		case O_IPFRAG:
5422 		case O_IPOPT:
5423 		case O_IPLEN:
5424 		case O_IPID:
5425 		case O_IPTOS:
5426 		case O_IPPRECEDENCE:
5427 		case O_IPTTL:
5428 		case O_IPVER:
5429 		case O_TCPWIN:
5430 		case O_TCPFLAGS:
5431 		case O_TCPOPTS:
5432 		case O_ESTAB:
5433 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5434 				goto bad_size;
5435 			break;
5436 
5437 		case O_IP_SRC_TABLE:
5438 		case O_IP_DST_TABLE:
5439 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5440 				goto bad_size;
5441 			if (cmd->arg1 >= ipfw_table_max) {
5442 				kprintf("ipfw: invalid table id %u, max %d\n",
5443 				    cmd->arg1, ipfw_table_max);
5444 				return EINVAL;
5445 			}
5446 			break;
5447 
5448 		case O_IP_SRC_IFIP:
5449 		case O_IP_DST_IFIP:
5450 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ifip))
5451 				goto bad_size;
5452 			break;
5453 
5454 		case O_ICMPCODE:
5455 		case O_ICMPTYPE:
5456 			if (cmdlen < F_INSN_SIZE(ipfw_insn_u32))
5457 				goto bad_size;
5458 			break;
5459 
5460 		case O_UID:
5461 		case O_GID:
5462 		case O_IP_SRC:
5463 		case O_IP_DST:
5464 		case O_TCPSEQ:
5465 		case O_TCPACK:
5466 		case O_PROB:
5467 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
5468 				goto bad_size;
5469 			break;
5470 
5471 		case O_LIMIT:
5472 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
5473 				goto bad_size;
5474 			break;
5475 		case O_REDIRECT:
5476 			if (cmdlen != F_INSN_SIZE(ipfw_insn_rdr))
5477 				goto bad_size;
5478 			break;
5479 
5480 		case O_LOG:
5481 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
5482 				goto bad_size;
5483 
5484 			((ipfw_insn_log *)cmd)->log_left =
5485 			    ((ipfw_insn_log *)cmd)->max_log;
5486 
5487 			break;
5488 
5489 		case O_IP_SRC_MASK:
5490 		case O_IP_DST_MASK:
5491 			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
5492 				goto bad_size;
5493 			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
5494 				kprintf("ipfw: opcode %d, useless rule\n",
5495 					cmd->opcode);
5496 				return EINVAL;
5497 			}
5498 			break;
5499 
5500 		case O_IP_SRC_SET:
5501 		case O_IP_DST_SET:
5502 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
5503 				kprintf("ipfw: invalid set size %d\n",
5504 					cmd->arg1);
5505 				return EINVAL;
5506 			}
5507 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
5508 			    (cmd->arg1 + 31) / 32)
5509 				goto bad_size;
5510 			break;
5511 
5512 		case O_MACADDR2:
5513 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
5514 				goto bad_size;
5515 			break;
5516 
5517 		case O_MAC_TYPE:
5518 		case O_IP_SRCPORT:
5519 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
5520 			if (cmdlen < 2 || cmdlen > 31)
5521 				goto bad_size;
5522 			break;
5523 
5524 		case O_RECV:
5525 		case O_XMIT:
5526 		case O_VIA:
5527 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
5528 				goto bad_size;
5529 			break;
5530 
5531 		case O_PIPE:
5532 		case O_QUEUE:
5533 			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
5534 				goto bad_size;
5535 			goto check_action;
5536 
5537 		case O_FORWARD_IP:
5538 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
5539 				goto bad_size;
5540 			} else {
5541 				in_addr_t fwd_addr;
5542 
5543 				fwd_addr = ((ipfw_insn_sa *)cmd)->
5544 					   sa.sin_addr.s_addr;
5545 				if (IN_MULTICAST(ntohl(fwd_addr))) {
5546 					kprintf("ipfw: cannot forward to "
5547 						"multicast address\n");
5548 					return EINVAL;
5549 				}
5550 			}
5551 			goto check_action;
5552 
5553 		case O_FORWARD_MAC: /* XXX not implemented yet */
5554 		case O_CHECK_STATE:
5555 		case O_COUNT:
5556 		case O_ACCEPT:
5557 		case O_DENY:
5558 		case O_REJECT:
5559 		case O_SKIPTO:
5560 		case O_DIVERT:
5561 		case O_TEE:
5562 		case O_DEFRAG:
5563 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
5564 				goto bad_size;
5565 check_action:
5566 			if (have_action) {
5567 				kprintf("ipfw: opcode %d, multiple actions"
5568 					" not allowed\n",
5569 					cmd->opcode);
5570 				return EINVAL;
5571 			}
5572 			have_action = 1;
5573 			if (l != cmdlen) {
5574 				kprintf("ipfw: opcode %d, action must be"
5575 					" last opcode\n",
5576 					cmd->opcode);
5577 				return EINVAL;
5578 			}
5579 			break;
5580 		default:
5581 			kprintf("ipfw: opcode %d, unknown opcode\n",
5582 				cmd->opcode);
5583 			return EINVAL;
5584 		}
5585 	}
5586 	if (have_action == 0) {
5587 		kprintf("ipfw: missing action\n");
5588 		return EINVAL;
5589 	}
5590 	return 0;
5591 
5592 bad_size:
5593 	kprintf("ipfw: opcode %d size %d wrong\n",
5594 		cmd->opcode, cmdlen);
5595 	return EINVAL;
5596 }
5597 
5598 static int
5599 ipfw_ctl_add_rule(struct sockopt *sopt)
5600 {
5601 	struct ipfw_ioc_rule *ioc_rule;
5602 	size_t size;
5603 	uint32_t rule_flags;
5604 	int error;
5605 
5606 	ASSERT_NETISR0;
5607 
5608 	size = sopt->sopt_valsize;
5609 	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
5610 	    size < sizeof(*ioc_rule)) {
5611 		return EINVAL;
5612 	}
5613 	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
5614 		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
5615 					  IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
5616 	}
5617 	ioc_rule = sopt->sopt_val;
5618 
5619 	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
5620 	if (error)
5621 		return error;
5622 
5623 	ipfw_add_rule(ioc_rule, rule_flags);
5624 
5625 	if (sopt->sopt_dir == SOPT_GET)
5626 		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
5627 	return 0;
5628 }
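
/*
 * Illustrative userland sketch for the handler above (not part of this
 * file; the raw-socket setup is an assumption borrowed from how ipfw(8)
 * style utilities talk to the kernel).  A minimal one-instruction
 * "accept all" rule is submitted via getsockopt(2) so that, on the
 * SOPT_GET path above, the kernel can hand the completed rule back
 * through the same buffer:
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	struct ipfw_ioc_rule r;
 *	socklen_t len = sizeof(r);
 *
 *	memset(&r, 0, sizeof(r));
 *	r.rulenum = 1000;
 *	r.set = 0;
 *	r.act_ofs = 0;
 *	r.cmd_len = 1;
 *	r.cmd[0].opcode = O_ACCEPT;
 *	r.cmd[0].len = 1;
 *	getsockopt(s, IPPROTO_IP, IP_FW_ADD, &r, &len);
 */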
5629 
5630 static void *
5631 ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
5632     struct ipfw_ioc_rule *ioc_rule)
5633 {
5634 	const struct ip_fw *sibling;
5635 #ifdef INVARIANTS
5636 	int i;
5637 #endif
5638 
5639 	ASSERT_NETISR0;
5640 	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));
5641 
5642 	ioc_rule->act_ofs = rule->act_ofs;
5643 	ioc_rule->cmd_len = rule->cmd_len;
5644 	ioc_rule->rulenum = rule->rulenum;
5645 	ioc_rule->set = rule->set;
5646 	ioc_rule->usr_flags = rule->usr_flags;
5647 
5648 	ioc_rule->set_disable = ctx->ipfw_set_disable;
5649 	ioc_rule->static_count = static_count;
5650 	ioc_rule->static_len = static_ioc_len;
5651 
5652 	/*
5653 	 * Visit (read-only) all of the rule's per-cpu duplications to
5654 	 * collect the necessary statistics.
5655 	 */
5656 #ifdef INVARIANTS
5657 	i = 0;
5658 #endif
5659 	ioc_rule->pcnt = 0;
5660 	ioc_rule->bcnt = 0;
5661 	ioc_rule->timestamp = 0;
5662 	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
5663 		ioc_rule->pcnt += sibling->pcnt;
5664 		ioc_rule->bcnt += sibling->bcnt;
5665 		if (sibling->timestamp > ioc_rule->timestamp)
5666 			ioc_rule->timestamp = sibling->timestamp;
5667 #ifdef INVARIANTS
5668 		++i;
5669 #endif
5670 	}
5671 	KASSERT(i == netisr_ncpus,
5672 	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));
5673 
5674 	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);
5675 
5676 	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
5677 }
5678 
5679 static boolean_t
5680 ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
5681 {
5682 	struct ipfw_ioc_flowid *ioc_id;
5683 
5684 	if (trk->tc_expire == 0) {
5685 		/* Not a scanned one. */
5686 		return (FALSE);
5687 	}
5688 
5689 	ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
5690 	    0 : trk->tc_expire - time_uptime;
5691 	ioc_state->pcnt = 0;
5692 	ioc_state->bcnt = 0;
5693 
5694 	ioc_state->dyn_type = O_LIMIT_PARENT;
5695 	ioc_state->count = trk->tc_count;
5696 
5697 	ioc_state->rulenum = trk->tc_rulenum;
5698 
5699 	ioc_id = &ioc_state->id;
5700 	ioc_id->type = ETHERTYPE_IP;
5701 	ioc_id->u.ip.proto = trk->tc_proto;
5702 	ioc_id->u.ip.src_ip = trk->tc_saddr;
5703 	ioc_id->u.ip.dst_ip = trk->tc_daddr;
5704 	ioc_id->u.ip.src_port = trk->tc_sport;
5705 	ioc_id->u.ip.dst_port = trk->tc_dport;
5706 
5707 	return (TRUE);
5708 }
5709 
5710 static boolean_t
5711 ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
5712 {
5713 	struct ipfw_ioc_flowid *ioc_id;
5714 
5715 	if (IPFW_STATE_SCANSKIP(s))
5716 		return (FALSE);
5717 
5718 	ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
5719 	    0 : s->st_expire - time_uptime;
5720 	ioc_state->pcnt = s->st_pcnt;
5721 	ioc_state->bcnt = s->st_bcnt;
5722 
5723 	ioc_state->dyn_type = s->st_type;
5724 	ioc_state->count = 0;
5725 
5726 	ioc_state->rulenum = s->st_rule->rulenum;
5727 
5728 	ioc_id = &ioc_state->id;
5729 	ioc_id->type = ETHERTYPE_IP;
5730 	ioc_id->u.ip.proto = s->st_proto;
5731 	ipfw_key_4tuple(&s->st_key,
5732 	    &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
5733 	    &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);
5734 
5735 	if (IPFW_ISXLAT(s->st_type)) {
5736 		const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
5737 
5738 		if (x->xlat_port == 0)
5739 			ioc_state->xlat_port = ioc_id->u.ip.dst_port;
5740 		else
5741 			ioc_state->xlat_port = ntohs(x->xlat_port);
5742 		ioc_state->xlat_addr = ntohl(x->xlat_addr);
5743 
5744 		ioc_state->pcnt += x->xlat_pair->xlat_pcnt;
5745 		ioc_state->bcnt += x->xlat_pair->xlat_bcnt;
5746 	}
5747 
5748 	return (TRUE);
5749 }
5750 
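/*
 * Collect states into the userland buffer.  The message starts on cpu0
 * and is forwarded through every netisr cpu: each cpu appends its own
 * states, and the last cpu additionally copies the global track tree.
 */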
5751 static void
5752 ipfw_state_copy_dispatch(netmsg_t nmsg)
5753 {
5754 	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
5755 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5756 	const struct ipfw_state *s;
5757 	const struct ipfw_track *t;
5758 
5759 	ASSERT_NETISR_NCPUS(mycpuid);
5760 	KASSERT(nm->state_cnt < nm->state_cntmax,
5761 	    ("invalid state count %d, max %d",
5762 	     nm->state_cnt, nm->state_cntmax));
5763 
5764 	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
5765 		if (ipfw_state_copy(s, nm->ioc_state)) {
5766 			nm->ioc_state++;
5767 			nm->state_cnt++;
5768 			if (nm->state_cnt == nm->state_cntmax)
5769 				goto done;
5770 		}
5771 	}
5772 
5773 	/*
5774 	 * Prepare tracks in the global track tree for userland.
5775 	 */
5776 	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
5777 		struct ipfw_trkcnt *trk;
5778 
5779 		if (t->t_count == NULL) /* anchor */
5780 			continue;
5781 		trk = t->t_trkcnt;
5782 
5783 		/*
5784 		 * Only one netisr can run this function at
5785 		 * any time, and only this function accesses
5786 		 * trkcnt's tc_expire, so this is safe w/o
5787 		 * ipfw_gd.ipfw_trkcnt_token.
5788 		 */
5789 		if (trk->tc_expire > t->t_expire)
5790 			continue;
5791 		trk->tc_expire = t->t_expire;
5792 	}
5793 
5794 	/*
5795 	 * Copy tracks in the global track tree to userland in
5796 	 * the last netisr.
5797 	 */
5798 	if (mycpuid == netisr_ncpus - 1) {
5799 		struct ipfw_trkcnt *trk;
5800 
5801 		KASSERT(nm->state_cnt < nm->state_cntmax,
5802 		    ("invalid state count %d, max %d",
5803 		     nm->state_cnt, nm->state_cntmax));
5804 
5805 		IPFW_TRKCNT_TOKGET;
5806 		RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) {
5807 			if (ipfw_track_copy(trk, nm->ioc_state)) {
5808 				nm->ioc_state++;
5809 				nm->state_cnt++;
5810 				if (nm->state_cnt == nm->state_cntmax) {
5811 					IPFW_TRKCNT_TOKREL;
5812 					goto done;
5813 				}
5814 			}
5815 		}
5816 		IPFW_TRKCNT_TOKREL;
5817 	}
5818 done:
5819 	if (nm->state_cnt == nm->state_cntmax) {
5820 		/* No more space; done. */
5821 		netisr_replymsg(&nm->base, 0);
5822 	} else {
5823 		netisr_forwardmsg(&nm->base, mycpuid + 1);
5824 	}
5825 }
5826 
5827 static int
5828 ipfw_ctl_get_rules(struct sockopt *sopt)
5829 {
5830 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5831 	struct ip_fw *rule;
5832 	void *bp;
5833 	size_t size;
5834 	int state_cnt;
5835 
5836 	ASSERT_NETISR0;
5837 
5838 	/*
5839 	 * Pass up a copy of the current rules.  Static rules
5840 	 * come first (the last of which has number IPFW_DEFAULT_RULE),
5841 	 * followed by a possibly empty list of states.
5842 	 */
5843 
5844 	size = static_ioc_len;	/* size of static rules */
5845 
5846 	/*
5847 	 * Size of the states.
5848 	 * XXX take tracks as state for userland compat.
5849 	 */
5850 	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
5851 	state_cnt = (state_cnt * 5) / 4; /* leave 25% headroom */
5852 	size += state_cnt * sizeof(struct ipfw_ioc_state);
5853 
5854 	if (sopt->sopt_valsize < size) {
5855 		/* short length, no need to return incomplete rules */
5856 		/* XXX: if superuser, no need to zero buffer */
5857 		bzero(sopt->sopt_val, sopt->sopt_valsize);
5858 		return 0;
5859 	}
5860 	bp = sopt->sopt_val;
5861 
5862 	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
5863 		bp = ipfw_copy_rule(ctx, rule, bp);
5864 
5865 	if (state_cnt) {
5866 		struct netmsg_cpstate nm;
5867 #ifdef INVARIANTS
5868 		size_t old_size = size;
5869 #endif
5870 
5871 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
5872 		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
5873 		nm.ioc_state = bp;
5874 		nm.state_cntmax = state_cnt;
5875 		nm.state_cnt = 0;
5876 		netisr_domsg_global(&nm.base);
5877 
5878 		/*
5879 		 * The number of states may have shrunk after the snapshot
5880 		 * of the state count was taken.  To give the user a correct
5881 		 * state count, nm.state_cnt is used to recalculate
5882 		 * the actual size.
5883 		 */
5884 		size = static_ioc_len +
5885 		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
5886 		KKASSERT(size <= old_size);
5887 	}
5888 
5889 	sopt->sopt_valsize = size;
5890 	return 0;
5891 }
5892 
5893 static void
5894 ipfw_set_disable_dispatch(netmsg_t nmsg)
5895 {
5896 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5897 
5898 	ASSERT_NETISR_NCPUS(mycpuid);
5899 
5900 	ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
5901 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
5902 }
5903 
5904 static void
5905 ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
5906 {
5907 	struct netmsg_base nmsg;
5908 	uint32_t set_disable;
5909 
5910 	ASSERT_NETISR0;
5911 
5912 	/* IPFW_DEFAULT_SET is always enabled */
5913 	enable |= (1 << IPFW_DEFAULT_SET);
5914 	set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;
5915 
5916 	bzero(&nmsg, sizeof(nmsg));
5917 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5918 	    ipfw_set_disable_dispatch);
5919 	nmsg.lmsg.u.ms_result32 = set_disable;
5920 
5921 	netisr_domsg_global(&nmsg);
5922 }
5923 
5924 static void
5925 ipfw_table_create_dispatch(netmsg_t nm)
5926 {
5927 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5928 	int tblid = nm->lmsg.u.ms_result;
5929 
5930 	ASSERT_NETISR_NCPUS(mycpuid);
5931 
5932 	if (!rn_inithead((void **)&ctx->ipfw_tables[tblid],
5933 	    rn_cpumaskhead(mycpuid), 32))
5934 		panic("ipfw: create table%d failed", tblid);
5935 
5936 	netisr_forwardmsg(&nm->base, mycpuid + 1);
5937 }
5938 
5939 static int
5940 ipfw_table_create(struct sockopt *sopt)
5941 {
5942 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
5943 	struct ipfw_ioc_table *tbl;
5944 	struct netmsg_base nm;
5945 
5946 	ASSERT_NETISR0;
5947 
5948 	if (sopt->sopt_valsize != sizeof(*tbl))
5949 		return (EINVAL);
5950 
5951 	tbl = sopt->sopt_val;
5952 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
5953 		return (EINVAL);
5954 
5955 	if (ctx->ipfw_tables[tbl->tableid] != NULL)
5956 		return (EEXIST);
5957 
5958 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
5959 	    ipfw_table_create_dispatch);
5960 	nm.lmsg.u.ms_result = tbl->tableid;
5961 	netisr_domsg_global(&nm);
5962 
5963 	return (0);
5964 }
5965 
5966 static void
5967 ipfw_table_killrn(struct radix_node_head *rnh, struct radix_node *rn)
5968 {
5969 	struct radix_node *ret;
5970 
5971 	ret = rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
5972 	if (ret != rn)
5973 		panic("deleted other table entry");
5974 	kfree(ret, M_IPFW);
5975 }
5976 
5977 static int
5978 ipfw_table_killent(struct radix_node *rn, void *xrnh)
5979 {
5980 
5981 	ipfw_table_killrn(xrnh, rn);
5982 	return (0);
5983 }
5984 
5985 static void
5986 ipfw_table_flush_oncpu(struct ipfw_context *ctx, int tableid,
5987     int destroy)
5988 {
5989 	struct radix_node_head *rnh;
5990 
5991 	ASSERT_NETISR_NCPUS(mycpuid);
5992 
5993 	rnh = ctx->ipfw_tables[tableid];
5994 	rnh->rnh_walktree(rnh, ipfw_table_killent, rnh);
5995 	if (destroy) {
5996 		Free(rnh);
5997 		ctx->ipfw_tables[tableid] = NULL;
5998 	}
5999 }
6000 
6001 static void
6002 ipfw_table_flush_dispatch(netmsg_t nmsg)
6003 {
6004 	struct netmsg_tblflush *nm = (struct netmsg_tblflush *)nmsg;
6005 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6006 
6007 	ASSERT_NETISR_NCPUS(mycpuid);
6008 
6009 	ipfw_table_flush_oncpu(ctx, nm->tableid, nm->destroy);
6010 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6011 }
6012 
6013 static void
6014 ipfw_table_flushall_oncpu(struct ipfw_context *ctx, int destroy)
6015 {
6016 	int i;
6017 
6018 	ASSERT_NETISR_NCPUS(mycpuid);
6019 
6020 	for (i = 0; i < ipfw_table_max; ++i) {
6021 		if (ctx->ipfw_tables[i] != NULL)
6022 			ipfw_table_flush_oncpu(ctx, i, destroy);
6023 	}
6024 }
6025 
6026 static void
6027 ipfw_table_flushall_dispatch(netmsg_t nmsg)
6028 {
6029 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6030 
6031 	ASSERT_NETISR_NCPUS(mycpuid);
6032 
6033 	ipfw_table_flushall_oncpu(ctx, 0);
6034 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6035 }
6036 
6037 static int
6038 ipfw_table_flush(struct sockopt *sopt)
6039 {
6040 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6041 	struct ipfw_ioc_table *tbl;
6042 	struct netmsg_tblflush nm;
6043 
6044 	ASSERT_NETISR0;
6045 
6046 	if (sopt->sopt_valsize != sizeof(*tbl))
6047 		return (EINVAL);
6048 
6049 	tbl = sopt->sopt_val;
6050 	if (sopt->sopt_name == IP_FW_TBL_FLUSH && tbl->tableid < 0) {
6051 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6052 		    MSGF_PRIORITY, ipfw_table_flushall_dispatch);
6053 		netisr_domsg_global(&nm.base);
6054 		return (0);
6055 	}
6056 
6057 	if (tbl->tableid < 0 || tbl->tableid >= ipfw_table_max)
6058 		return (EINVAL);
6059 
6060 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6061 		return (ENOENT);
6062 
6063 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6064 	    ipfw_table_flush_dispatch);
6065 	nm.tableid = tbl->tableid;
6066 	nm.destroy = 0;
6067 	if (sopt->sopt_name == IP_FW_TBL_DESTROY)
6068 		nm.destroy = 1;
6069 	netisr_domsg_global(&nm.base);
6070 
6071 	return (0);
6072 }
6073 
6074 static int
6075 ipfw_table_cntent(struct radix_node *rn __unused, void *xcnt)
6076 {
6077 	int *cnt = xcnt;
6078 
6079 	(*cnt)++;
6080 	return (0);
6081 }
6082 
6083 static int
6084 ipfw_table_cpent(struct radix_node *rn, void *xcp)
6085 {
6086 	struct ipfw_table_cp *cp = xcp;
6087 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6088 	struct ipfw_ioc_tblent *ioc_te;
6089 #ifdef INVARIANTS
6090 	int cnt;
6091 #endif
6092 
6093 	KASSERT(cp->te_idx < cp->te_cnt, ("invalid table cp idx %d, cnt %d",
6094 	    cp->te_idx, cp->te_cnt));
6095 	ioc_te = &cp->te[cp->te_idx];
6096 
6097 	if (te->te_nodes->rn_mask != NULL) {
6098 		memcpy(&ioc_te->netmask, te->te_nodes->rn_mask,
6099 		    *te->te_nodes->rn_mask);
6100 	} else {
6101 		ioc_te->netmask.sin_len = 0;
6102 	}
6103 	memcpy(&ioc_te->key, &te->te_key, sizeof(ioc_te->key));
6104 
6105 	ioc_te->use = te->te_use;
6106 	ioc_te->last_used = te->te_lastuse;
6107 #ifdef INVARIANTS
6108 	cnt = 1;
6109 #endif
6110 
6111 	while ((te = te->te_sibling) != NULL) {
6112 #ifdef INVARIANTS
6113 		++cnt;
6114 #endif
6115 		ioc_te->use += te->te_use;
6116 		if (te->te_lastuse > ioc_te->last_used)
6117 			ioc_te->last_used = te->te_lastuse;
6118 	}
6119 	KASSERT(cnt == netisr_ncpus,
6120 	    ("invalid # of tblent %d, should be %d", cnt, netisr_ncpus));
6121 
6122 	cp->te_idx++;
6123 
6124 	return (0);
6125 }
6126 
6127 static int
6128 ipfw_table_get(struct sockopt *sopt)
6129 {
6130 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6131 	struct radix_node_head *rnh;
6132 	struct ipfw_ioc_table *tbl;
6133 	struct ipfw_ioc_tblcont *cont;
6134 	struct ipfw_table_cp cp;
6135 	int cnt = 0, sz;
6136 
6137 	ASSERT_NETISR0;
6138 
6139 	if (sopt->sopt_valsize < sizeof(*tbl))
6140 		return (EINVAL);
6141 
6142 	tbl = sopt->sopt_val;
6143 	if (tbl->tableid < 0) {
6144 		struct ipfw_ioc_tbllist *list;
6145 		int i;
6146 
6147 		/*
6148 		 * List available table ids.
6149 		 */
6150 		for (i = 0; i < ipfw_table_max; ++i) {
6151 			if (ctx->ipfw_tables[i] != NULL)
6152 				++cnt;
6153 		}
6154 
6155 		sz = __offsetof(struct ipfw_ioc_tbllist, tables[cnt]);
6156 		if (sopt->sopt_valsize < sz) {
6157 			bzero(sopt->sopt_val, sopt->sopt_valsize);
6158 			return (E2BIG);
6159 		}
6160 		list = sopt->sopt_val;
6161 		list->tablecnt = cnt;
6162 
6163 		cnt = 0;
6164 		for (i = 0; i < ipfw_table_max; ++i) {
6165 			if (ctx->ipfw_tables[i] != NULL) {
6166 				KASSERT(cnt < list->tablecnt,
6167 				    ("invalid idx %d, cnt %d",
6168 				     cnt, list->tablecnt));
6169 				list->tables[cnt++] = i;
6170 			}
6171 		}
6172 		sopt->sopt_valsize = sz;
6173 		return (0);
6174 	} else if (tbl->tableid >= ipfw_table_max) {
6175 		return (EINVAL);
6176 	}
6177 
6178 	rnh = ctx->ipfw_tables[tbl->tableid];
6179 	if (rnh == NULL)
6180 		return (ENOENT);
6181 	rnh->rnh_walktree(rnh, ipfw_table_cntent, &cnt);
6182 
6183 	sz = __offsetof(struct ipfw_ioc_tblcont, ent[cnt]);
6184 	if (sopt->sopt_valsize < sz) {
6185 		bzero(sopt->sopt_val, sopt->sopt_valsize);
6186 		return (E2BIG);
6187 	}
6188 	cont = sopt->sopt_val;
6189 	cont->entcnt = cnt;
6190 
6191 	cp.te = cont->ent;
6192 	cp.te_idx = 0;
6193 	cp.te_cnt = cnt;
6194 	rnh->rnh_walktree(rnh, ipfw_table_cpent, &cp);
6195 
6196 	sopt->sopt_valsize = sz;
6197 	return (0);
6198 }
6199 
6200 static void
6201 ipfw_table_add_dispatch(netmsg_t nmsg)
6202 {
6203 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6204 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6205 	struct radix_node_head *rnh;
6206 	struct ipfw_tblent *te;
6207 
6208 	ASSERT_NETISR_NCPUS(mycpuid);
6209 
6210 	rnh = ctx->ipfw_tables[nm->tableid];
6211 
6212 	te = kmalloc(sizeof(*te), M_IPFW, M_WAITOK | M_ZERO);
6213 	te->te_nodes->rn_key = (char *)&te->te_key;
6214 	memcpy(&te->te_key, nm->key, sizeof(te->te_key));
6215 
6216 	if (rnh->rnh_addaddr((char *)&te->te_key, (char *)nm->netmask, rnh,
6217 	    te->te_nodes) == NULL) {
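		/*
		 * Duplicated entry.  Only cpu0, which runs first, can
		 * see this; replicas on the later cpus must succeed,
		 * since the per-cpu tables are kept identical.
		 */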
6218 		if (mycpuid == 0) {
6219 			kfree(te, M_IPFW);
6220 			netisr_replymsg(&nm->base, EEXIST);
6221 			return;
6222 		}
6223 		panic("rnh_addaddr failed");
6224 	}
6225 
6226 	/* Link siblings. */
6227 	if (nm->sibling != NULL)
6228 		nm->sibling->te_sibling = te;
6229 	nm->sibling = te;
6230 
6231 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6232 }
6233 
6234 static void
6235 ipfw_table_del_dispatch(netmsg_t nmsg)
6236 {
6237 	struct netmsg_tblent *nm = (struct netmsg_tblent *)nmsg;
6238 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6239 	struct radix_node_head *rnh;
6240 	struct radix_node *rn;
6241 
6242 	ASSERT_NETISR_NCPUS(mycpuid);
6243 
6244 	rnh = ctx->ipfw_tables[nm->tableid];
6245 	rn = rnh->rnh_deladdr((char *)nm->key, (char *)nm->netmask, rnh);
6246 	if (rn == NULL) {
6247 		if (mycpuid == 0) {
6248 			netisr_replymsg(&nm->base, ESRCH);
6249 			return;
6250 		}
6251 		panic("rnh_deladdr failed");
6252 	}
6253 	kfree(rn, M_IPFW);
6254 
6255 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6256 }
6257 
6258 static int
6259 ipfw_table_alt(struct sockopt *sopt)
6260 {
6261 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6262 	struct ipfw_ioc_tblcont *tbl;
6263 	struct ipfw_ioc_tblent *te;
6264 	struct sockaddr_in key0;
6265 	struct sockaddr *netmask = NULL, *key;
6266 	struct netmsg_tblent nm;
6267 
6268 	ASSERT_NETISR0;
6269 
6270 	if (sopt->sopt_valsize != sizeof(*tbl))
6271 		return (EINVAL);
6272 	tbl = sopt->sopt_val;
6273 
6274 	if (tbl->tableid < 0  || tbl->tableid >= ipfw_table_max)
6275 		return (EINVAL);
6276 	if (tbl->entcnt != 1)
6277 		return (EINVAL);
6278 
6279 	if (ctx->ipfw_tables[tbl->tableid] == NULL)
6280 		return (ENOENT);
6281 	te = &tbl->ent[0];
6282 
6283 	if (te->key.sin_family != AF_INET ||
6284 	    te->key.sin_port != 0 ||
6285 	    te->key.sin_len != sizeof(struct sockaddr_in))
6286 		return (EINVAL);
6287 	key = (struct sockaddr *)&te->key;
6288 
6289 	if (te->netmask.sin_len != 0) {
6290 		if (te->netmask.sin_port != 0 ||
6291 		    te->netmask.sin_len > sizeof(struct sockaddr_in))
6292 			return (EINVAL);
6293 		netmask = (struct sockaddr *)&te->netmask;
6294 		sa_maskedcopy(key, (struct sockaddr *)&key0, netmask);
6295 		key = (struct sockaddr *)&key0;
6296 	}
6297 
6298 	if (sopt->sopt_name == IP_FW_TBL_ADD) {
6299 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6300 		    MSGF_PRIORITY, ipfw_table_add_dispatch);
6301 	} else {
6302 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6303 		    MSGF_PRIORITY, ipfw_table_del_dispatch);
6304 	}
6305 	nm.key = key;
6306 	nm.netmask = netmask;
6307 	nm.tableid = tbl->tableid;
6308 	nm.sibling = NULL;
6309 	return (netisr_domsg_global(&nm.base));
6310 }
6311 
6312 static int
6313 ipfw_table_zeroent(struct radix_node *rn, void *arg __unused)
6314 {
6315 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6316 
6317 	te->te_use = 0;
6318 	te->te_lastuse = 0;
6319 	return (0);
6320 }
6321 
6322 static void
6323 ipfw_table_zero_dispatch(netmsg_t nmsg)
6324 {
6325 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6326 	struct radix_node_head *rnh;
6327 
6328 	ASSERT_NETISR_NCPUS(mycpuid);
6329 
6330 	rnh = ctx->ipfw_tables[nmsg->lmsg.u.ms_result];
6331 	rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6332 
6333 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6334 }
6335 
6336 static void
6337 ipfw_table_zeroall_dispatch(netmsg_t nmsg)
6338 {
6339 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6340 	int i;
6341 
6342 	ASSERT_NETISR_NCPUS(mycpuid);
6343 
6344 	for (i = 0; i < ipfw_table_max; ++i) {
6345 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6346 
6347 		if (rnh != NULL)
6348 			rnh->rnh_walktree(rnh, ipfw_table_zeroent, NULL);
6349 	}
6350 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
6351 }
6352 
6353 static int
6354 ipfw_table_zero(struct sockopt *sopt)
6355 {
6356 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6357 	struct netmsg_base nm;
6358 	struct ipfw_ioc_table *tbl;
6359 
6360 	ASSERT_NETISR0;
6361 
6362 	if (sopt->sopt_valsize != sizeof(*tbl))
6363 		return (EINVAL);
6364 	tbl = sopt->sopt_val;
6365 
6366 	if (tbl->tableid < 0) {
6367 		netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6368 		    ipfw_table_zeroall_dispatch);
6369 		netisr_domsg_global(&nm);
6370 		return (0);
6371 	} else if (tbl->tableid >= ipfw_table_max) {
6372 		return (EINVAL);
6373 	} else if (ctx->ipfw_tables[tbl->tableid] == NULL) {
6374 		return (ENOENT);
6375 	}
6376 
6377 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6378 	    ipfw_table_zero_dispatch);
6379 	nm.lmsg.u.ms_result = tbl->tableid;
6380 	netisr_domsg_global(&nm);
6381 
6382 	return (0);
6383 }
6384 
6385 static int
6386 ipfw_table_killexp(struct radix_node *rn, void *xnm)
6387 {
6388 	struct netmsg_tblexp *nm = xnm;
6389 	struct ipfw_tblent *te = (struct ipfw_tblent *)rn;
6390 
6391 	if (te->te_expired) {
6392 		ipfw_table_killrn(nm->rnh, rn);
6393 		nm->expcnt++;
6394 	}
6395 	return (0);
6396 }
6397 
6398 static void
6399 ipfw_table_expire_dispatch(netmsg_t nmsg)
6400 {
6401 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6402 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6403 	struct radix_node_head *rnh;
6404 
6405 	ASSERT_NETISR_NCPUS(mycpuid);
6406 
6407 	rnh = ctx->ipfw_tables[nm->tableid];
6408 	nm->rnh = rnh;
6409 	rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6410 
6411 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6412 	    ("not all expired addresses (%d) were deleted (%d)",
6413 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6414 
6415 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6416 }
6417 
6418 static void
6419 ipfw_table_expireall_dispatch(netmsg_t nmsg)
6420 {
6421 	struct netmsg_tblexp *nm = (struct netmsg_tblexp *)nmsg;
6422 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6423 	int i;
6424 
6425 	ASSERT_NETISR_NCPUS(mycpuid);
6426 
6427 	for (i = 0; i < ipfw_table_max; ++i) {
6428 		struct radix_node_head *rnh = ctx->ipfw_tables[i];
6429 
6430 		if (rnh == NULL)
6431 			continue;
6432 		nm->rnh = rnh;
6433 		rnh->rnh_walktree(rnh, ipfw_table_killexp, nm);
6434 	}
6435 
6436 	KASSERT(nm->expcnt == nm->cnt * (mycpuid + 1),
6437 	    ("not all expired addresses (%d) were deleted (%d)",
6438 	     nm->cnt * (mycpuid + 1), nm->expcnt));
6439 
6440 	netisr_forwardmsg(&nm->base, mycpuid + 1);
6441 }
6442 
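/*
 * Phase one of table expiry: run on netisr0, compare the freshest
 * te_lastuse among an entry's per-cpu siblings against the expiry
 * threshold, and mark the whole sibling chain expired.  A later
 * dispatch then deletes the marked entries on every cpu.
 */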
6443 static int
6444 ipfw_table_markexp(struct radix_node *rn, void *xnm)
6445 {
6446 	struct netmsg_tblexp *nm = xnm;
6447 	struct ipfw_tblent *te;
6448 	time_t lastuse;
6449 
6450 	te = (struct ipfw_tblent *)rn;
6451 	lastuse = te->te_lastuse;
6452 
6453 	while ((te = te->te_sibling) != NULL) {
6454 		if (te->te_lastuse > lastuse)
6455 			lastuse = te->te_lastuse;
6456 	}
6457 	if (!TIME_LEQ(lastuse + nm->expire, time_second)) {
6458 		/* Not expired */
6459 		return (0);
6460 	}
6461 
6462 	te = (struct ipfw_tblent *)rn;
6463 	te->te_expired = 1;
6464 	while ((te = te->te_sibling) != NULL)
6465 		te->te_expired = 1;
6466 	nm->cnt++;
6467 
6468 	return (0);
6469 }
6470 
6471 static int
6472 ipfw_table_expire(struct sockopt *sopt)
6473 {
6474 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6475 	struct netmsg_tblexp nm;
6476 	struct ipfw_ioc_tblexp *tbl;
6477 	struct radix_node_head *rnh;
6478 
6479 	ASSERT_NETISR0;
6480 
6481 	if (sopt->sopt_valsize != sizeof(*tbl))
6482 		return (EINVAL);
6483 	tbl = sopt->sopt_val;
6484 	tbl->expcnt = 0;
6485 
6486 	nm.expcnt = 0;
6487 	nm.cnt = 0;
6488 	nm.expire = tbl->expire;
6489 
6490 	if (tbl->tableid < 0) {
6491 		int i;
6492 
6493 		for (i = 0; i < ipfw_table_max; ++i) {
6494 			rnh = ctx->ipfw_tables[i];
6495 			if (rnh == NULL)
6496 				continue;
6497 			rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6498 		}
6499 		if (nm.cnt == 0) {
6500 			/* No addresses can be expired. */
6501 			return (0);
6502 		}
6503 		tbl->expcnt = nm.cnt;
6504 
6505 		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
6506 		    MSGF_PRIORITY, ipfw_table_expireall_dispatch);
6507 		nm.tableid = -1;
6508 		netisr_domsg_global(&nm.base);
6509 		KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6510 		    ("not all expired addresses (%d) were deleted (%d)",
6511 		     nm.cnt * netisr_ncpus, nm.expcnt));
6512 
6513 		return (0);
6514 	} else if (tbl->tableid >= ipfw_table_max) {
6515 		return (EINVAL);
6516 	}
6517 
6518 	rnh = ctx->ipfw_tables[tbl->tableid];
6519 	if (rnh == NULL)
6520 		return (ENOENT);
6521 	rnh->rnh_walktree(rnh, ipfw_table_markexp, &nm);
6522 	if (nm.cnt == 0) {
6523 		/* No addresses can be expired. */
6524 		return (0);
6525 	}
6526 	tbl->expcnt = nm.cnt;
6527 
6528 	netmsg_init(&nm.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
6529 	    ipfw_table_expire_dispatch);
6530 	nm.tableid = tbl->tableid;
6531 	netisr_domsg_global(&nm.base);
6532 	KASSERT(nm.expcnt == nm.cnt * netisr_ncpus,
6533 	    ("not all expired addresses (%d) were deleted (%d)",
6534 	     nm.cnt * netisr_ncpus, nm.expcnt));
6535 	return (0);
6536 }
6537 
6538 static void
6539 ipfw_crossref_free_dispatch(netmsg_t nmsg)
6540 {
6541 	struct ip_fw *rule = nmsg->lmsg.u.ms_resultp;
6542 
6543 	KKASSERT((rule->rule_flags &
6544 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6545 	    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6546 	ipfw_free_rule(rule);
6547 
6548 	netisr_replymsg(&nmsg->base, 0);
6549 }
6550 
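/*
 * Walk the crossref free list; a rule may be reclaimed only once no
 * forwarded mbuf holds a cross_refs reference on any of its per-cpu
 * siblings.  Rules still in flight stay queued and are retried from
 * the crossref callout.
 */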
6551 static void
6552 ipfw_crossref_reap(void)
6553 {
6554 	struct ip_fw *rule, *prev = NULL;
6555 
6556 	ASSERT_NETISR0;
6557 
6558 	rule = ipfw_gd.ipfw_crossref_free;
6559 	while (rule != NULL) {
6560 		uint64_t inflight = 0;
6561 		int i;
6562 
6563 		for (i = 0; i < netisr_ncpus; ++i)
6564 			inflight += rule->cross_rules[i]->cross_refs;
6565 		if (inflight == 0) {
6566 			struct ip_fw *f = rule;
6567 
6568 			/*
6569 			 * Unlink.
6570 			 */
6571 			rule = rule->next;
6572 			if (prev != NULL)
6573 				prev->next = rule;
6574 			else
6575 				ipfw_gd.ipfw_crossref_free = rule;
6576 
6577 			/*
6578 			 * Free.
6579 			 */
6580 			for (i = 1; i < netisr_ncpus; ++i) {
6581 				struct netmsg_base nm;
6582 
6583 				netmsg_init(&nm, NULL, &curthread->td_msgport,
6584 				    MSGF_PRIORITY, ipfw_crossref_free_dispatch);
6585 				nm.lmsg.u.ms_resultp = f->cross_rules[i];
6586 				netisr_domsg(&nm, i);
6587 			}
6588 			KKASSERT((f->rule_flags &
6589 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID)) ==
6590 			    (IPFW_RULE_F_CROSSREF | IPFW_RULE_F_INVALID));
6591 			ipfw_unref_rule(f);
6592 		} else {
6593 			prev = rule;
6594 			rule = rule->next;
6595 		}
6596 	}
6597 
6598 	if (ipfw_gd.ipfw_crossref_free != NULL) {
6599 		callout_reset(&ipfw_gd.ipfw_crossref_ch, hz,
6600 		    ipfw_crossref_timeo, NULL);
6601 	}
6602 }
6603 
6604 /*
6605  * {set|get}sockopt parser.
6606  */
6607 static int
6608 ipfw_ctl(struct sockopt *sopt)
6609 {
6610 	int error, rulenum;
6611 	uint32_t *masks;
6612 	size_t size;
6613 
6614 	ASSERT_NETISR0;
6615 
6616 	error = 0;
6617 
6618 	switch (sopt->sopt_name) {
6619 	case IP_FW_GET:
6620 		error = ipfw_ctl_get_rules(sopt);
6621 		break;
6622 
6623 	case IP_FW_FLUSH:
6624 		ipfw_flush(0 /* keep default rule */);
6625 		break;
6626 
6627 	case IP_FW_ADD:
6628 		error = ipfw_ctl_add_rule(sopt);
6629 		break;
6630 
6631 	case IP_FW_DEL:
6632 		/*
6633 		 * IP_FW_DEL is used for deleting single rules or sets,
6634 		 * and (ab)used to atomically manipulate sets.
6635 		 * Argument size is used to distinguish between the two:
6636 		 *    sizeof(uint32_t)
6637 		 *	delete single rule or set of rules,
6638 		 *	or reassign rules (or sets) to a different set.
6639 		 *    2 * sizeof(uint32_t)
6640 		 *	atomic disable/enable sets.
6641 		 *	first uint32_t contains sets to be disabled,
6642 		 *	second uint32_t contains sets to be enabled.
6643 		 */
6644 		masks = sopt->sopt_val;
6645 		size = sopt->sopt_valsize;
6646 		if (size == sizeof(*masks)) {
6647 			/*
6648 			 * Delete or reassign static rule
6649 			 */
6650 			error = ipfw_ctl_alter(masks[0]);
6651 		} else if (size == (2 * sizeof(*masks))) {
6652 			/*
6653 			 * Set enable/disable
6654 			 */
6655 			ipfw_ctl_set_disable(masks[0], masks[1]);
6656 		} else {
6657 			error = EINVAL;
6658 		}
6659 		break;
6660 
6661 	case IP_FW_ZERO:
6662 	case IP_FW_RESETLOG: /* argument is an int, the rule number */
6663 		rulenum = 0;
6664 
6665 		if (sopt->sopt_val != 0) {
6666 		    error = soopt_to_kbuf(sopt, &rulenum,
6667 			    sizeof(int), sizeof(int));
6668 		    if (error)
6669 			break;
6670 		}
6671 		error = ipfw_ctl_zero_entry(rulenum,
6672 			sopt->sopt_name == IP_FW_RESETLOG);
6673 		break;
6674 
6675 	case IP_FW_TBL_CREATE:
6676 		error = ipfw_table_create(sopt);
6677 		break;
6678 
6679 	case IP_FW_TBL_ADD:
6680 	case IP_FW_TBL_DEL:
6681 		error = ipfw_table_alt(sopt);
6682 		break;
6683 
6684 	case IP_FW_TBL_FLUSH:
6685 	case IP_FW_TBL_DESTROY:
6686 		error = ipfw_table_flush(sopt);
6687 		break;
6688 
6689 	case IP_FW_TBL_GET:
6690 		error = ipfw_table_get(sopt);
6691 		break;
6692 
6693 	case IP_FW_TBL_ZERO:
6694 		error = ipfw_table_zero(sopt);
6695 		break;
6696 
6697 	case IP_FW_TBL_EXPIRE:
6698 		error = ipfw_table_expire(sopt);
6699 		break;
6700 
6701 	default:
6702 		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
6703 		error = EINVAL;
6704 	}
6705 
6706 	ipfw_crossref_reap();
6707 	return error;
6708 }
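
/*
 * Illustrative userland sketch (not part of this file): the handlers
 * above are reached through {set,get}sockopt(2) on a raw IP socket,
 * e.g. to fetch the rule and state list.  The fixed buffer size below
 * is an assumption; see the headroom calculation in
 * ipfw_ctl_get_rules() above.
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	static char buf[256 * 1024];
 *	socklen_t len = sizeof(buf);
 *
 *	if (getsockopt(s, IPPROTO_IP, IP_FW_GET, buf, &len) == 0)
 *		printf("got %u bytes of rules/states\n", (u_int)len);
 */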
6709 
6710 static void
6711 ipfw_keepalive_done(struct ipfw_context *ctx)
6712 {
6713 
6714 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6715 	    ("keepalive is not in progress"));
6716 	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
6717 	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
6718 	    ipfw_keepalive, NULL);
6719 }
6720 
6721 static void
6722 ipfw_keepalive_more(struct ipfw_context *ctx)
6723 {
6724 	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;
6725 
6726 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6727 	    ("keepalive is not in progress"));
6728 	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
6729 	    ("keepalive more did not finish"));
6730 	netisr_sendmsg_oncpu(nm);
6731 }
6732 
6733 static void
6734 ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
6735 {
6736 	struct ipfw_state *s;
6737 	int scanned = 0, expired = 0, kept = 0;
6738 
6739 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6740 	    ("keepalive is not in progress"));
6741 
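	/*
	 * The caller has linked 'anchor' into the state list; it marks
	 * the scan position.  Each iteration moves the anchor past the
	 * state just examined, so the loop can yield to the netisr via
	 * ipfw_keepalive_more() and later resume where it left off.
	 */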
6742 	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
6743 		uint32_t ack_rev, ack_fwd;
6744 		struct ipfw_flow_id id;
6745 		uint8_t send_dir;
6746 
6747 		if (scanned++ >= ipfw_state_scan_max) {
6748 			ipfw_keepalive_more(ctx);
6749 			return;
6750 		}
6751 
6752 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6753 		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);
6754 
6755 		/*
6756 		 * NOTE:
6757 		 * Don't use IPFW_STATE_SCANSKIP; need to perform keepalive
6758 		 * on slave xlat.
6759 		 */
6760 		if (s->st_type == O_ANCHOR)
6761 			continue;
6762 
6763 		if (IPFW_STATE_ISDEAD(s)) {
6764 			ipfw_state_remove(ctx, s);
6765 			if (++expired >= ipfw_state_expire_max) {
6766 				ipfw_keepalive_more(ctx);
6767 				return;
6768 			}
6769 			continue;
6770 		}
6771 
6772 		/*
6773 		 * Keepalive processing.
6774 		 */
6775 
6776 		if (s->st_proto != IPPROTO_TCP)
6777 			continue;
6778 		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
6779 			continue;
6780 		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
6781 		    s->st_expire))
6782 			continue;	/* too early */
6783 
6784 		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
6785 		    &id.dst_ip, &id.dst_port);
6786 		ack_rev = s->st_ack_rev;
6787 		ack_fwd = s->st_ack_fwd;
6788 
6789 #define SEND_FWD	0x1
6790 #define SEND_REV	0x2
6791 
6792 		if (IPFW_ISXLAT(s->st_type)) {
6793 			const struct ipfw_xlat *x = (const struct ipfw_xlat *)s;
6794 
6795 			if (x->xlat_dir == MATCH_FORWARD)
6796 				send_dir = SEND_FWD;
6797 			else
6798 				send_dir = SEND_REV;
6799 		} else {
6800 			send_dir = SEND_FWD | SEND_REV;
6801 		}
6802 
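		/*
		 * Send forged keepalive segments: sequence number one
		 * below the last seen ACK, which should elicit an
		 * immediate ACK from a live endpoint and thereby
		 * refresh this state.
		 */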
6803 		if (send_dir & SEND_REV)
6804 			send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
6805 		if (send_dir & SEND_FWD)
6806 			send_pkt(&id, ack_fwd - 1, ack_rev, 0);
6807 
6808 #undef SEND_FWD
6809 #undef SEND_REV
6810 
6811 		if (++kept >= ipfw_keepalive_max) {
6812 			ipfw_keepalive_more(ctx);
6813 			return;
6814 		}
6815 	}
6816 	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6817 	ipfw_keepalive_done(ctx);
6818 }
6819 
6820 static void
6821 ipfw_keepalive_more_dispatch(netmsg_t nm)
6822 {
6823 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6824 	struct ipfw_state *anchor;
6825 
6826 	ASSERT_NETISR_NCPUS(mycpuid);
6827 	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
6828 	    ("keepalive is not in progress"));
6829 
6830 	/* Reply ASAP */
6831 	netisr_replymsg(&nm->base, 0);
6832 
6833 	anchor = &ctx->ipfw_keepalive_anch;
6834 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6835 		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
6836 		ipfw_keepalive_done(ctx);
6837 		return;
6838 	}
6839 	ipfw_keepalive_loop(ctx, anchor);
6840 }
6841 
6842 /*
6843  * This procedure is only used to handle keepalives.  It is dispatched
6844  * in the netisr every dyn_keepalive_period seconds.
6845  */
6846 static void
6847 ipfw_keepalive_dispatch(netmsg_t nm)
6848 {
6849 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6850 	struct ipfw_state *anchor;
6851 
6852 	ASSERT_NETISR_NCPUS(mycpuid);
6853 	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
6854 	    ("keepalive is in progress"));
6855 	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;
6856 
6857 	/* Reply ASAP */
6858 	crit_enter();
6859 	netisr_replymsg(&nm->base, 0);
6860 	crit_exit();
6861 
6862 	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
6863 		ipfw_keepalive_done(ctx);
6864 		return;
6865 	}
6866 
6867 	anchor = &ctx->ipfw_keepalive_anch;
6868 	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
6869 	ipfw_keepalive_loop(ctx, anchor);
6870 }
6871 
6872 /*
6873  * Callout handler that kicks off keepalive processing; it fires
6874  * every dyn_keepalive_period seconds on each netisr cpu.
6875  */
6876 static void
6877 ipfw_keepalive(void *dummy __unused)
6878 {
6879 	struct netmsg_base *msg;
6880 
6881 	KKASSERT(mycpuid < netisr_ncpus);
6882 	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;
6883 
6884 	crit_enter();
6885 	if (msg->lmsg.ms_flags & MSGF_DONE)
6886 		netisr_sendmsg_oncpu(msg);
6887 	crit_exit();
6888 }
6889 
6890 static void
6891 ipfw_ip_input_dispatch(netmsg_t nmsg)
6892 {
6893 	struct netmsg_genpkt *nm = (struct netmsg_genpkt *)nmsg;
6894 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6895 	struct mbuf *m = nm->m;
6896 	struct ip_fw *rule = nm->arg1;
6897 
6898 	ASSERT_NETISR_NCPUS(mycpuid);
6899 	KASSERT(rule->cpuid == mycpuid,
6900 	    ("rule does not belong to cpu%d", mycpuid));
6901 	KASSERT(m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE,
6902 	    ("mbuf does not have ipfw continue rule"));
6903 
6904 	KASSERT(ctx->ipfw_cont_rule == NULL,
6905 	    ("pending ipfw continue rule"));
6906 	ctx->ipfw_cont_rule = rule;
6907 	ip_input(m);
6908 
6909 	/* May not have been cleared, if ipfw was unloaded/disabled. */
6910 	ctx->ipfw_cont_rule = NULL;
6911 
6912 	/*
6913 	 * This rule is no longer used; decrement its cross_refs
6914 	 * so that the rule can be deleted.
6915 	 */
6916 	rule->cross_refs--;
6917 }
6918 
6919 static void
6920 ipfw_defrag_redispatch(struct mbuf *m, int cpuid, struct ip_fw *rule)
6921 {
6922 	struct netmsg_genpkt *nm;
6923 
6924 	KASSERT(cpuid != mycpuid, ("continue on the same cpu%d", cpuid));
6925 
6926 	/*
6927 	 * NOTE:
6928 	 * Bump cross_refs to prevent this rule and its siblings
6929 	 * from being deleted while this mbuf is inflight.  The
6930 	 * cross_refs of the sibling rule on the target cpu is
6931 	 * decremented once this mbuf has been filtered on the
6932 	 * target cpu.
6933 	 */
6934 	rule->cross_refs++;
6935 	m->m_pkthdr.fw_flags |= IPFW_MBUF_CONTINUE;
6936 
6937 	nm = &m->m_hdr.mh_genmsg;
6938 	netmsg_init(&nm->base, NULL, &netisr_apanic_rport, 0,
6939 	    ipfw_ip_input_dispatch);
6940 	nm->m = m;
6941 	nm->arg1 = rule->cross_rules[cpuid];
6942 	netisr_sendmsg(&nm->base, cpuid);
6943 }
6944 
6945 static void
6946 ipfw_init_args(struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif)
6947 {
6948 
6949 	args->flags = 0;
6950 	args->rule = NULL;
6951 	args->xlat = NULL;
6952 
6953 	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
6954 		struct m_tag *mtag;
6955 
6956 		/* Extract info from dummynet tag */
6957 		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
6958 		KKASSERT(mtag != NULL);
6959 		args->rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
6960 		KKASSERT(args->rule != NULL);
6961 
6962 		m_tag_delete(m, mtag);
6963 		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
6964 	} else if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
6965 		struct ipfw_context *ctx = ipfw_ctx[mycpuid];
6966 
6967 		KKASSERT(ctx->ipfw_cont_rule != NULL);
6968 		args->rule = ctx->ipfw_cont_rule;
6969 		ctx->ipfw_cont_rule = NULL;
6970 
6971 		if (ctx->ipfw_cont_xlat != NULL) {
6972 			args->xlat = ctx->ipfw_cont_xlat;
6973 			ctx->ipfw_cont_xlat = NULL;
6974 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATINS) {
6975 				args->flags |= IP_FWARG_F_XLATINS;
6976 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATINS;
6977 			}
6978 			if (m->m_pkthdr.fw_flags & IPFW_MBUF_XLATFWD) {
6979 				args->flags |= IP_FWARG_F_XLATFWD;
6980 				m->m_pkthdr.fw_flags &= ~IPFW_MBUF_XLATFWD;
6981 			}
6982 		}
6983 		KKASSERT((m->m_pkthdr.fw_flags &
6984 		    (IPFW_MBUF_XLATINS | IPFW_MBUF_XLATFWD)) == 0);
6985 
6986 		args->flags |= IP_FWARG_F_CONT;
6987 		m->m_pkthdr.fw_flags &= ~IPFW_MBUF_CONTINUE;
6988 	}
6989 
6990 	args->eh = NULL;
6991 	args->oif = oif;
6992 	args->m = m;
6993 }
6994 
6995 static int
6996 ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
6997 {
6998 	struct ip_fw_args args;
6999 	struct mbuf *m = *m0;
7000 	int tee = 0, error = 0, ret;
7001 
7002 	ipfw_init_args(&args, m, NULL);
7003 
7004 	ret = ipfw_chk(&args);
7005 	m = args.m;
7006 	if (m == NULL) {
7007 		if (ret != IP_FW_REDISPATCH)
7008 			error = EACCES;
7009 		goto back;
7010 	}
7011 
7012 	switch (ret) {
7013 	case IP_FW_PASS:
7014 		break;
7015 
7016 	case IP_FW_DENY:
7017 		m_freem(m);
7018 		m = NULL;
7019 		error = EACCES;
7020 		break;
7021 
7022 	case IP_FW_DUMMYNET:
7023 		/* Send packet to the appropriate pipe */
7024 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
7025 		break;
7026 
7027 	case IP_FW_TEE:
7028 		tee = 1;
7029 		/* FALL THROUGH */
7030 
7031 	case IP_FW_DIVERT:
7032 		/*
7033 		 * Must clear the bridge tag before diverting.
7034 		 */
7035 		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
7036 		if (ip_divert_p != NULL) {
7037 			m = ip_divert_p(m, tee, 1);
7038 		} else {
7039 			m_freem(m);
7040 			m = NULL;
7041 			/* not sure this is the right error msg */
7042 			error = EACCES;
7043 		}
7044 		break;
7045 
7046 	default:
7047 		panic("unknown ipfw return value: %d", ret);
7048 	}
7049 back:
7050 	*m0 = m;
7051 	return error;
7052 }
7053 
7054 static int
7055 ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
7056 {
7057 	struct ip_fw_args args;
7058 	struct mbuf *m = *m0;
7059 	int tee = 0, error = 0, ret;
7060 
7061 	ipfw_init_args(&args, m, ifp);
7062 
7063 	ret = ipfw_chk(&args);
7064 	m = args.m;
7065 	if (m == NULL) {
7066 		if (ret != IP_FW_REDISPATCH)
7067 			error = EACCES;
7068 		goto back;
7069 	}
7070 
7071 	switch (ret) {
7072 	case IP_FW_PASS:
7073 		break;
7074 
7075 	case IP_FW_DENY:
7076 		m_freem(m);
7077 		m = NULL;
7078 		error = EACCES;
7079 		break;
7080 
7081 	case IP_FW_DUMMYNET:
7082 		m = ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
7083 		break;
7084 
7085 	case IP_FW_TEE:
7086 		tee = 1;
7087 		/* FALL THROUGH */
7088 
7089 	case IP_FW_DIVERT:
7090 		if (ip_divert_p != NULL) {
7091 			m = ip_divert_p(m, tee, 0);
7092 		} else {
7093 			m_freem(m);
7094 			m = NULL;
7095 			/* not sure this is the right error msg */
7096 			error = EACCES;
7097 		}
7098 		break;
7099 
7100 	default:
7101 		panic("unknown ipfw return value: %d", ret);
7102 	}
7103 back:
7104 	*m0 = m;
7105 	return error;
7106 }
7107 
7108 static void
7109 ipfw_hook(void)
7110 {
7111 	struct pfil_head *pfh;
7112 
7113 	ASSERT_NETISR0;
7114 
7115 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7116 	if (pfh == NULL)
7117 		return;
7118 
7119 	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7120 	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7121 }
7122 
7123 static void
7124 ipfw_dehook(void)
7125 {
7126 	struct pfil_head *pfh;
7127 
7128 	ASSERT_NETISR0;
7129 
7130 	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
7131 	if (pfh == NULL)
7132 		return;
7133 
7134 	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
7135 	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
7136 }
7137 
7138 static int
7139 ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
7140 {
7141 	int dyn_cnt;
7142 
7143 	dyn_cnt = ipfw_state_cntcoll();
7144 	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;
7145 
7146 	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
7147 }
7148 
7149 static int
7150 ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
7151 {
7152 	int state_cnt;
7153 
7154 	state_cnt = ipfw_state_cntcoll();
7155 	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
7156 }
7157 
7158 static int
7159 ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
7160 {
7161 	int state_max, error;
7162 
7163 	state_max = ipfw_state_max;
7164 	error = sysctl_handle_int(oidp, &state_max, 0, req);
7165 	if (error || req->newptr == NULL)
7166 		return (error);
7167 
7168 	if (state_max < 1)
7169 		return (EINVAL);
7170 
7171 	ipfw_state_max_set(state_max);
7172 	return (0);
7173 }
7174 
7175 static int
7176 ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
7177 {
7178 	int dyn_max, error;
7179 
7180 	dyn_max = ipfw_state_max + ipfw_track_max;
7181 
7182 	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
7183 	if (error || req->newptr == NULL)
7184 		return (error);
7185 
7186 	if (dyn_max < 2)
7187 		return (EINVAL);
7188 
7189 	ipfw_state_max_set(dyn_max / 2);
7190 	ipfw_track_max = dyn_max / 2;
7191 	return (0);
7192 }
7193 
7194 static void
7195 ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
7196 {
7197 	int enable = nmsg->lmsg.u.ms_result;
7198 
7199 	ASSERT_NETISR0;
7200 
7201 	if (fw_enable == enable)
7202 		goto reply;
7203 
7204 	fw_enable = enable;
7205 	if (fw_enable)
7206 		ipfw_hook();
7207 	else
7208 		ipfw_dehook();
7209 reply:
7210 	netisr_replymsg(&nmsg->base, 0);
7211 }
7212 
7213 static int
7214 ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
7215 {
7216 	struct netmsg_base nmsg;
7217 	int enable, error;
7218 
7219 	enable = fw_enable;
7220 	error = sysctl_handle_int(oidp, &enable, 0, req);
7221 	if (error || req->newptr == NULL)
7222 		return error;
7223 
7224 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7225 	    ipfw_sysctl_enable_dispatch);
7226 	nmsg.lmsg.u.ms_result = enable;
7227 
7228 	return netisr_domsg(&nmsg, 0);
7229 }
7230 
7231 static int
7232 ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
7233 {
7234 	return sysctl_int_range(oidp, arg1, arg2, req,
7235 	       IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
7236 }
7237 
7238 static int
7239 ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
7240 {
7241 
7242 	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
7243 }
7244 
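/*
 * Sum one per-cpu u_long statistic, located at byte offset 'arg2'
 * inside each cpu's ipfw_context.  Writing any value to the sysctl
 * resets the counter on all cpus.
 */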
7245 static int
7246 ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
7247 {
7248 	u_long stat = 0;
7249 	int cpu, error;
7250 
7251 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7252 		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));
7253 
7254 	error = sysctl_handle_long(oidp, &stat, 0, req);
7255 	if (error || req->newptr == NULL)
7256 		return (error);
7257 
7258 	/* Zero out this stat. */
7259 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7260 		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
7261 	return (0);
7262 }
7263 
7264 static void
7265 ipfw_ctx_init_dispatch(netmsg_t nmsg)
7266 {
7267 	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
7268 	struct ipfw_context *ctx;
7269 	struct ip_fw *def_rule;
7270 
7271 	ASSERT_NETISR_NCPUS(mycpuid);
7272 
7273 	ctx = kmalloc(__offsetof(struct ipfw_context,
7274 	    ipfw_tables[ipfw_table_max]), M_IPFW, M_WAITOK | M_ZERO);
7275 
7276 	RB_INIT(&ctx->ipfw_state_tree);
7277 	TAILQ_INIT(&ctx->ipfw_state_list);
7278 
7279 	RB_INIT(&ctx->ipfw_track_tree);
7280 	TAILQ_INIT(&ctx->ipfw_track_list);
7281 
7282 	callout_init_mp(&ctx->ipfw_stateto_ch);
7283 	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
7284 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
7285 	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
7286 	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
7287 	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);
7288 
7289 	callout_init_mp(&ctx->ipfw_trackto_ch);
7290 	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
7291 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
7292 	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
7293 	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);
7294 
7295 	callout_init_mp(&ctx->ipfw_keepalive_ch);
7296 	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
7297 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
7298 	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
7299 	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
7300 	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);
7301 
7302 	callout_init_mp(&ctx->ipfw_xlatreap_ch);
7303 	netmsg_init(&ctx->ipfw_xlatreap_nm, NULL, &netisr_adone_rport,
7304 	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_xlat_reap_dispatch);
7305 	TAILQ_INIT(&ctx->ipfw_xlatreap);
7306 
7307 	ipfw_ctx[mycpuid] = ctx;
7308 
7309 	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);
7310 
7311 	def_rule->act_ofs = 0;
7312 	def_rule->rulenum = IPFW_DEFAULT_RULE;
7313 	def_rule->cmd_len = 1;
7314 	def_rule->set = IPFW_DEFAULT_SET;
7315 
7316 	def_rule->cmd[0].len = 1;
7317 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
7318 	def_rule->cmd[0].opcode = O_ACCEPT;
7319 #else
7320 	if (filters_default_to_accept)
7321 		def_rule->cmd[0].opcode = O_ACCEPT;
7322 	else
7323 		def_rule->cmd[0].opcode = O_DENY;
7324 #endif
7325 
7326 	def_rule->refcnt = 1;
7327 	def_rule->cpuid = mycpuid;
7328 
7329 	/* Install the default rule */
7330 	ctx->ipfw_default_rule = def_rule;
7331 	ctx->ipfw_layer3_chain = def_rule;
7332 
7333 	/* Link rule CPU sibling */
7334 	ipfw_link_sibling(fwmsg, def_rule);
7335 
7336 	/* Statistics only need to be updated once */
7337 	if (mycpuid == 0)
7338 		ipfw_inc_static_count(def_rule);
7339 
7340 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7341 }
7342 
7343 static void
7344 ipfw_crossref_reap_dispatch(netmsg_t nmsg)
7345 {
7346 
7347 	crit_enter();
7348 	/* Reply ASAP */
7349 	netisr_replymsg(&nmsg->base, 0);
7350 	crit_exit();
7351 	ipfw_crossref_reap();
7352 }
7353 
7354 static void
7355 ipfw_crossref_timeo(void *dummy __unused)
7356 {
7357 	struct netmsg_base *msg = &ipfw_gd.ipfw_crossref_nm;
7358 
7359 	KKASSERT(mycpuid == 0);
7360 
7361 	crit_enter();
7362 	if (msg->lmsg.ms_flags & MSGF_DONE)
7363 		netisr_sendmsg_oncpu(msg);
7364 	crit_exit();
7365 }
7366 
7367 static void
7368 ipfw_ifaddr_dispatch(netmsg_t nmsg)
7369 {
7370 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7371 	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
7372 	struct ip_fw *f;
7373 
7374 	ASSERT_NETISR_NCPUS(mycpuid);
7375 
7376 	for (f = ctx->ipfw_layer3_chain; f != NULL; f = f->next) {
7377 		int l, cmdlen;
7378 		ipfw_insn *cmd;
7379 
7380 		if ((f->rule_flags & IPFW_RULE_F_DYNIFADDR) == 0)
7381 			continue;
7382 
7383 		for (l = f->cmd_len, cmd = f->cmd; l > 0;
7384 		     l -= cmdlen, cmd += cmdlen) {
7385 			cmdlen = F_LEN(cmd);
7386 			if (cmd->opcode == O_IP_SRC_IFIP ||
7387 			    cmd->opcode == O_IP_DST_IFIP) {
7388 				if (strncmp(ifp->if_xname,
7389 				    ((ipfw_insn_ifip *)cmd)->ifname,
7390 				    IFNAMSIZ) == 0)
7391 					cmd->arg1 &= ~IPFW_IFIP_VALID;
7392 			}
7393 		}
7394 	}
7395 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7396 }
7397 
7398 static void
7399 ipfw_ifaddr(void *arg __unused, struct ifnet *ifp,
7400     enum ifaddr_event event __unused, struct ifaddr *ifa __unused)
7401 {
7402 	struct netmsg_base nm;
7403 
7404 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7405 	    ipfw_ifaddr_dispatch);
7406 	nm.lmsg.u.ms_resultp = ifp;
7407 	netisr_domsg_global(&nm);
7408 }
7409 
7410 static void
7411 ipfw_init_dispatch(netmsg_t nmsg)
7412 {
7413 	struct netmsg_ipfw fwmsg;
7414 	int error = 0, cpu;
7415 
7416 	ASSERT_NETISR0;
7417 
7418 	if (IPFW_LOADED) {
7419 		kprintf("IP firewall already loaded\n");
7420 		error = EEXIST;
7421 		goto reply;
7422 	}
7423 
7424 	if (ipfw_table_max > UINT16_MAX || ipfw_table_max <= 0)
7425 		ipfw_table_max = UINT16_MAX;
7426 
7427 	/* Initialize global track tree. */
7428 	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
7429 	IPFW_TRKCNT_TOKINIT;
7430 
7431 	/* GC for freed crossref rules. */
7432 	callout_init_mp(&ipfw_gd.ipfw_crossref_ch);
7433 	netmsg_init(&ipfw_gd.ipfw_crossref_nm, NULL, &netisr_adone_rport,
7434 	    MSGF_PRIORITY | MSGF_DROPABLE, ipfw_crossref_reap_dispatch);
7435 
7436 	ipfw_state_max_set(ipfw_state_max);
7437 	ipfw_state_headroom = 8 * netisr_ncpus;
7438 
7439 	bzero(&fwmsg, sizeof(fwmsg));
7440 	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7441 	    ipfw_ctx_init_dispatch);
7442 	netisr_domsg_global(&fwmsg.base);
7443 
7444 	ip_fw_chk_ptr = ipfw_chk;
7445 	ip_fw_ctl_ptr = ipfw_ctl;
7446 	ip_fw_dn_io_ptr = ipfw_dummynet_io;
7447 
7448 	kprintf("ipfw2 initialized, default to %s, logging ",
7449 		ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
7450 		O_ACCEPT ? "accept" : "deny");
7451 
7452 #ifdef IPFIREWALL_VERBOSE
7453 	fw_verbose = 1;
7454 #endif
7455 #ifdef IPFIREWALL_VERBOSE_LIMIT
7456 	verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
7457 #endif
7458 	if (fw_verbose == 0) {
7459 		kprintf("disabled\n");
7460 	} else if (verbose_limit == 0) {
7461 		kprintf("unlimited\n");
7462 	} else {
7463 		kprintf("limited to %d packets/entry by default\n",
7464 			verbose_limit);
7465 	}
7466 
7467 	ip_fw_loaded = 1;
7468 	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
7469 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz,
7470 		    ipfw_state_expire_ipifunc, NULL, cpu);
7471 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz,
7472 		    ipfw_track_expire_ipifunc, NULL, cpu);
7473 		callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz,
7474 		    ipfw_keepalive, NULL, cpu);
7475 	}
7476 
7477 	if (fw_enable)
7478 		ipfw_hook();
7479 
7480 	ipfw_ifaddr_event = EVENTHANDLER_REGISTER(ifaddr_event, ipfw_ifaddr,
7481 	    NULL, EVENTHANDLER_PRI_ANY);
7482 	if (ipfw_ifaddr_event == NULL)
7483 		kprintf("ipfw: ifaddr_event register failed\n");
7484 
7485 reply:
7486 	netisr_replymsg(&nmsg->base, error);
7487 }
7488 
7489 static int
7490 ipfw_init(void)
7491 {
7492 	struct netmsg_base smsg;
7493 
7494 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7495 	    ipfw_init_dispatch);
7496 	return netisr_domsg(&smsg, 0);
7497 }
7498 
7499 #ifdef KLD_MODULE
7500 
7501 static void
7502 ipfw_ctx_fini_dispatch(netmsg_t nmsg)
7503 {
7504 	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
7505 
7506 	ASSERT_NETISR_NCPUS(mycpuid);
7507 
7508 	callout_cancel(&ctx->ipfw_stateto_ch);
7509 	callout_cancel(&ctx->ipfw_trackto_ch);
7510 	callout_cancel(&ctx->ipfw_keepalive_ch);
7511 	callout_cancel(&ctx->ipfw_xlatreap_ch);
7512 
7513 	crit_enter();
7514 	netisr_dropmsg(&ctx->ipfw_stateexp_more);
7515 	netisr_dropmsg(&ctx->ipfw_stateexp_nm);
7516 	netisr_dropmsg(&ctx->ipfw_trackexp_more);
7517 	netisr_dropmsg(&ctx->ipfw_trackexp_nm);
7518 	netisr_dropmsg(&ctx->ipfw_keepalive_more);
7519 	netisr_dropmsg(&ctx->ipfw_keepalive_nm);
7520 	netisr_dropmsg(&ctx->ipfw_xlatreap_nm);
7521 	crit_exit();
7522 
7523 	ipfw_table_flushall_oncpu(ctx, 1);
7524 
7525 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
7526 }
7527 
7528 static void
7529 ipfw_fini_dispatch(netmsg_t nmsg)
7530 {
7531 	struct netmsg_base nm;
7532 	int error = 0, cpu;
7533 
7534 	ASSERT_NETISR0;
7535 
7536 	ipfw_crossref_reap();
7537 
7538 	if (ipfw_gd.ipfw_refcnt != 0) {
7539 		error = EBUSY;
7540 		goto reply;
7541 	}
7542 
7543 	ip_fw_loaded = 0;
7544 	ipfw_dehook();
7545 
7546 	/* Synchronize any inflight state/track expire IPIs. */
7547 	lwkt_synchronize_ipiqs("ipfwfini");
7548 
7549 	netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7550 	    ipfw_ctx_fini_dispatch);
7551 	netisr_domsg_global(&nm);
7552 
7553 	callout_cancel(&ipfw_gd.ipfw_crossref_ch);
7554 	crit_enter();
7555 	netisr_dropmsg(&ipfw_gd.ipfw_crossref_nm);
7556 	crit_exit();
7557 
7558 	if (ipfw_ifaddr_event != NULL)
7559 		EVENTHANDLER_DEREGISTER(ifaddr_event, ipfw_ifaddr_event);
7560 
7561 	ip_fw_chk_ptr = NULL;
7562 	ip_fw_ctl_ptr = NULL;
7563 	ip_fw_dn_io_ptr = NULL;
7564 	ipfw_flush(1 /* kill default rule */);
7565 
7566 	/* Free the per-cpu contexts */
7567 	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
7568 		kfree(ipfw_ctx[cpu], M_IPFW);
7569 
7570 	kprintf("IP firewall unloaded\n");
7571 reply:
7572 	netisr_replymsg(&nmsg->base, error);
7573 }
7574 
7575 static void
7576 ipfw_fflush_dispatch(netmsg_t nmsg)
7577 {
7578 
7579 	ipfw_flush(0 /* keep default rule */);
7580 	ipfw_crossref_reap();
7581 	netisr_replymsg(&nmsg->base, 0);
7582 }
7583 
7584 static int
7585 ipfw_fini(void)
7586 {
7587 	struct netmsg_base smsg;
7588 	int i = 0;
7589 
7590 	for (;;) {
7591 		netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7592 		    ipfw_fflush_dispatch);
7593 		netisr_domsg(&smsg, 0);
7594 
7595 		if (ipfw_gd.ipfw_refcnt == 0)
7596 			break;
7597 		kprintf("ipfw: flush pending %d\n", ++i);
7598 		tsleep(&smsg, 0, "ipfwff", (3 * hz) / 2);
7599 	}
7600 
7601 	netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
7602 	    ipfw_fini_dispatch);
7603 	return netisr_domsg(&smsg, 0);
7604 }
7605 
7606 #endif	/* KLD_MODULE */
7607 
7608 static int
7609 ipfw_modevent(module_t mod, int type, void *unused)
7610 {
7611 	int err = 0;
7612 
7613 	switch (type) {
7614 	case MOD_LOAD:
7615 		err = ipfw_init();
7616 		break;
7617 
7618 	case MOD_UNLOAD:
7619 #ifndef KLD_MODULE
7620 		kprintf("ipfw statically compiled, cannot unload\n");
7621 		err = EBUSY;
7622 #else
7623 		err = ipfw_fini();
7624 #endif
7625 		break;
7626 	default:
7627 		break;
7628 	}
7629 	return err;
7630 }
7631 
7632 static moduledata_t ipfwmod = {
7633 	"ipfw",
7634 	ipfw_modevent,
7635 	0
7636 };
7637 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY);
7638 MODULE_VERSION(ipfw, 1);
7639