1 /* 2 * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 * $FreeBSD: src/sys/netinet/ip_fw2.c,v 1.6.2.12 2003/04/08 10:42:32 maxim Exp $ 26 */ 27 28 /* 29 * Implement IP packet firewall (new version) 30 */ 31 32 #include "opt_ipfw.h" 33 #include "opt_inet.h" 34 #ifndef INET 35 #error IPFIREWALL requires INET. 
36 #endif /* INET */ 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/malloc.h> 41 #include <sys/mbuf.h> 42 #include <sys/kernel.h> 43 #include <sys/proc.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <sys/syslog.h> 48 #include <sys/ucred.h> 49 #include <sys/in_cksum.h> 50 #include <sys/limits.h> 51 #include <sys/lock.h> 52 #include <sys/tree.h> 53 54 #include <net/if.h> 55 #include <net/route.h> 56 #include <net/pfil.h> 57 #include <net/dummynet/ip_dummynet.h> 58 59 #include <sys/thread2.h> 60 #include <sys/mplock2.h> 61 #include <net/netmsg2.h> 62 63 #include <netinet/in.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/in_var.h> 66 #include <netinet/in_pcb.h> 67 #include <netinet/ip.h> 68 #include <netinet/ip_var.h> 69 #include <netinet/ip_icmp.h> 70 #include <netinet/tcp.h> 71 #include <netinet/tcp_seq.h> 72 #include <netinet/tcp_timer.h> 73 #include <netinet/tcp_var.h> 74 #include <netinet/tcpip.h> 75 #include <netinet/udp.h> 76 #include <netinet/udp_var.h> 77 #include <netinet/ip_divert.h> 78 #include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */ 79 80 #include <net/ipfw/ip_fw2.h> 81 82 #ifdef IPFIREWALL_DEBUG 83 #define DPRINTF(fmt, ...) \ 84 do { \ 85 if (fw_debug > 0) \ 86 kprintf(fmt, __VA_ARGS__); \ 87 } while (0) 88 #else 89 #define DPRINTF(fmt, ...) ((void)0) 90 #endif 91 92 /* 93 * Description about per-CPU rule duplication: 94 * 95 * Module loading/unloading and all ioctl operations are serialized 96 * by netisr0, so we don't have any ordering or locking problems. 97 * 98 * Following graph shows how operation on per-CPU rule list is 99 * performed [2 CPU case]: 100 * 101 * CPU0 CPU1 102 * 103 * netisr0 <------------------------------------+ 104 * domsg | 105 * : | 106 * :(delete/add...) | 107 * : | 108 * : netmsg | netmsg 109 * forwardmsg---------->netisr1 | 110 * : | 111 * :(delete/add...) 
| 112 * : | 113 * : | 114 * replymsg--------------+ 115 * 116 * 117 * 118 * Rule structure [2 CPU case] 119 * 120 * CPU0 CPU1 121 * 122 * layer3_chain layer3_chain 123 * | | 124 * V V 125 * +-------+ sibling +-------+ sibling 126 * | rule1 |--------->| rule1 |--------->NULL 127 * +-------+ +-------+ 128 * | | 129 * |next |next 130 * V V 131 * +-------+ sibling +-------+ sibling 132 * | rule2 |--------->| rule2 |--------->NULL 133 * +-------+ +-------+ 134 * 135 * ip_fw.sibling: 136 * 1) Ease statistics calculation during IP_FW_GET. We only need to 137 * iterate layer3_chain in netisr0; the current rule's duplication 138 * to the other CPUs could safely be read-only accessed through 139 * ip_fw.sibling. 140 * 2) Accelerate rule insertion and deletion, e.g. rule insertion: 141 * a) In netisr0 rule3 is determined to be inserted between rule1 142 * and rule2. To make this decision we need to iterate the 143 * layer3_chain in netisr0. The netmsg, which is used to insert 144 * the rule, will contain rule1 in netisr0 as prev_rule and rule2 145 * in netisr0 as next_rule. 146 * b) After the insertion in netisr0 is done, we will move on to 147 * netisr1. But instead of relocating the rule3's position in 148 * netisr1 by iterating the layer3_chain in netisr1, we set the 149 * netmsg's prev_rule to rule1->sibling and next_rule to 150 * rule2->sibling before the netmsg is forwarded to netisr1 from 151 * netisr0. 152 */ 153 154 /* 155 * Description of states and tracks. 156 * 157 * Both states and tracks are stored in per-cpu RB trees instead of 158 * per-cpu hash tables to avoid the worst case hash degeneration. 159 * 160 * The lifetimes of states and tracks are regulated by dyn_*_lifetime, 161 * measured in seconds and depending on the flags. 162 * 163 * When a packet is received, its address fields are first masked with 164 * the mask defined for the rule, then matched against the entries in 165 * the per-cpu state RB tree. 
States are generated by 'keep-state' 166 * and 'limit' options. 167 * 168 * The max number of states is ipfw_state_max. When we reach the 169 * maximum number of states we do not create anymore. This is done to 170 * avoid consuming too much memory, but also too much time when 171 * searching on each packet. 172 * 173 * Each state holds a pointer to the parent ipfw rule of the current 174 * CPU so we know what action to perform. States are removed when the 175 * parent rule is deleted. XXX we should make them survive. 176 * 177 * There are some limitations with states -- we do not obey the 178 * 'randomized match', and we do not do multiple passes through the 179 * firewall. XXX check the latter!!! 180 * 181 * States grow independently on each CPU, e.g. 2 CPU case: 182 * 183 * CPU0 CPU1 184 * ................... ................... 185 * : state RB tree : : state RB tree : 186 * : : : : 187 * : state1 state2 : : state3 : 188 * : | | : : | : 189 * :.....|....|......: :........|........: 190 * | | | 191 * | | |st_rule 192 * | | | 193 * V V V 194 * +-------+ +-------+ 195 * | rule1 | | rule1 | 196 * +-------+ +-------+ 197 * 198 * Tracks are used to enforce limits on the number of sessions. Tracks 199 * are generated by 'limit' option. 200 * 201 * The max number of tracks is ipfw_track_max. When we reach the 202 * maximum number of tracks we do not create anymore. This is done to 203 * avoid consuming too much memory. 204 * 205 * Tracks are organized into two layers, track counter RB tree is 206 * shared between CPUs, track RB tree is per-cpu. States generated by 207 * 'limit' option are linked to the track in addition to the per-cpu 208 * state RB tree; mainly to ease expiration. e.g. 2 CPU case: 209 * 210 * .............................. 
211 * : track counter RB tree : 212 * : : 213 * : +-----------+ : 214 * : | trkcnt1 | : 215 * : | | : 216 * : +--->counter<----+ : 217 * : | | | | : 218 * : | +-----------+ | : 219 * :......|................|....: 220 * | | 221 * CPU0 | | CPU1 222 * ................. |t_count | ................. 223 * : track RB tree : | | : track RB tree : 224 * : : | | : : 225 * : +-->track1-------+ +--------track2 : 226 * : | A : : : 227 * : | | : : : 228 * :.|.....|.......: :...............: 229 * | +----------------+ 230 * | .................... | 231 * | : state RB tree : |st_track 232 * | : : | 233 * +---state1 state2---+ 234 * : | | : 235 * :.....|.......|....: 236 * | | 237 * | |st_rule 238 * V V 239 * +----------+ 240 * | rule1 | 241 * +----------+ 242 */ 243 244 #define IPFW_AUTOINC_STEP_MIN 1 245 #define IPFW_AUTOINC_STEP_MAX 1000 246 #define IPFW_AUTOINC_STEP_DEF 100 247 248 #define IPFW_DEFAULT_RULE 65535 /* rulenum for the default rule */ 249 #define IPFW_DEFAULT_SET 31 /* set number for the default rule */ 250 251 #define MATCH_REVERSE 0 252 #define MATCH_FORWARD 1 253 #define MATCH_NONE 2 254 #define MATCH_UNKNOWN 3 255 256 #define IPFW_STATE_TCPFLAGS (TH_SYN | TH_FIN | TH_RST) 257 #define IPFW_STATE_TCPSTATES (IPFW_STATE_TCPFLAGS | \ 258 (IPFW_STATE_TCPFLAGS << 8)) 259 260 #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) 261 #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) 262 #define BOTH_RST (TH_RST | (TH_RST << 8)) 263 /* TH_ACK here means FIN was ACKed. 
 */
#define BOTH_FINACK	(TH_ACK | (TH_ACK << 8))

/*
 * A TCP state is considered closed when either side has sent RST, or
 * when the FINs of both directions have been ACKed.
 */
#define IPFW_STATE_TCPCLOSED(s)	((s)->st_proto == IPPROTO_TCP &&	\
				 (((s)->st_state & BOTH_RST) ||		\
				  ((s)->st_state & BOTH_FINACK) == BOTH_FINACK))

/* Anchor rules reuse the O_NOP opcode. */
#define O_ANCHOR	O_NOP

/*
 * Netmsg carrying a rule addition; prev_rule/next_rule locate the
 * insertion point and sibling links the per-CPU rule duplications
 * (see the per-CPU rule duplication description above).
 */
struct netmsg_ipfw {
	struct netmsg_base	base;
	const struct ipfw_ioc_rule *ioc_rule;
	struct ip_fw		*next_rule;
	struct ip_fw		*prev_rule;
	struct ip_fw		*sibling;
	uint32_t		rule_flags;
};

/* Netmsg parameters for rule/set deletion and move operations. */
struct netmsg_del {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	struct ip_fw		*prev_rule;
	uint16_t		rulenum;
	uint8_t			from_set;
	uint8_t			to_set;
};

/* Netmsg parameters for zeroing rule counters (log_only: log counters only). */
struct netmsg_zent {
	struct netmsg_base	base;
	struct ip_fw		*start_rule;
	uint16_t		rulenum;
	uint16_t		log_only;
};

/* Netmsg used when copying states out to a userland buffer. */
struct netmsg_cpstate {
	struct netmsg_base	base;
	struct ipfw_ioc_state	*ioc_state;
	int			state_cntmax;
	int			state_cnt;
};

/* Address pair of a key; also accessible as a single 64-bit value. */
struct ipfw_addrs {
	uint32_t	addr1;
	uint32_t	addr2;
};

/* Port pair of a key; also accessible as a single 32-bit value. */
struct ipfw_ports {
	uint16_t	port1;
	uint16_t	port2;
};

/*
 * Canonicalized 4-tuple key for states and tracks: addresses and
 * ports are stored in a direction-independent order; the swap flags
 * (IPFW_KEY_SWAP_*) record what was swapped so that the original
 * direction can be recovered (see ipfw_key_build()/ipfw_key_4tuple()).
 */
struct ipfw_key {
	union {
		struct ipfw_addrs addrs;
		uint64_t	value;
	} addr_u;
	union {
		struct ipfw_ports ports;
		uint32_t	value;
	} port_u;
	uint8_t		proto;
	uint8_t		swap;	/* IPFW_KEY_SWAP_ */
	uint16_t	rsvd2;
};

#define IPFW_KEY_SWAP_ADDRS	0x1
#define IPFW_KEY_SWAP_PORTS	0x2
#define IPFW_KEY_SWAP_ALL	(IPFW_KEY_SWAP_ADDRS | IPFW_KEY_SWAP_PORTS)

/*
 * Track counter; shared between all CPUs through the global track
 * counter RB tree (see the track description above).
 */
struct ipfw_trkcnt {
	RB_ENTRY(ipfw_trkcnt)	tc_rblink;
	struct ipfw_key		tc_key;
	uintptr_t		tc_ruleid;
	int			tc_refs;
	int			tc_count;
	time_t			tc_expire;	/* userland get-only */
	uint16_t		tc_rulenum;	/* userland get-only */
} __cachealign;

/* Shorthand accessors into tc_key. */
#define tc_addrs	tc_key.addr_u.value
#define tc_ports	tc_key.port_u.value
#define tc_proto	tc_key.proto
#define tc_saddr	tc_key.addr_u.addrs.addr1
#define tc_daddr	tc_key.addr_u.addrs.addr2
#define tc_sport	tc_key.port_u.ports.port1
#define tc_dport	tc_key.port_u.ports.port2

RB_HEAD(ipfw_trkcnt_tree, ipfw_trkcnt);

struct ipfw_state;

/*
 * Per-CPU track; generated by the 'limit' option.  States created on
 * behalf of this track are linked on t_state_list, and t_count points
 * into the CPU-shared counter (struct ipfw_trkcnt).
 */
struct ipfw_track {
	RB_ENTRY(ipfw_track)	t_rblink;
	struct ipfw_key		t_key;
	struct ip_fw		*t_rule;
	time_t			t_lastexp;
	LIST_HEAD(, ipfw_state) t_state_list;
	time_t			t_expire;
	volatile int		*t_count;
	struct ipfw_trkcnt	*t_trkcnt;
	TAILQ_ENTRY(ipfw_track)	t_link;
};

/* Shorthand accessors into t_key. */
#define t_addrs		t_key.addr_u.value
#define t_ports		t_key.port_u.value
#define t_proto		t_key.proto
#define t_saddr		t_key.addr_u.addrs.addr1
#define t_daddr		t_key.addr_u.addrs.addr2
#define t_sport		t_key.port_u.ports.port1
#define t_dport		t_key.port_u.ports.port2

RB_HEAD(ipfw_track_tree, ipfw_track);
TAILQ_HEAD(ipfw_track_list, ipfw_track);

/*
 * Per-CPU state; generated by the 'keep-state' and 'limit' options.
 */
struct ipfw_state {
	RB_ENTRY(ipfw_state)	st_rblink;
	struct ipfw_key		st_key;

	time_t			st_expire;	/* expire time */
	struct ip_fw		*st_rule;

	uint64_t		st_pcnt;	/* packets */
	uint64_t		st_bcnt;	/* bytes */

	/*
	 * st_state:
	 * State of this rule, typically a combination of TCP flags.
	 *
	 * st_ack_fwd/st_ack_rev:
	 * Most recent ACKs in forward and reverse direction.  They
	 * are used to generate keepalives.
 */
	uint32_t	st_state;
	uint32_t	st_ack_fwd;
	uint32_t	st_seq_fwd;
	uint32_t	st_ack_rev;
	uint32_t	st_seq_rev;

	uint16_t	st_flags;	/* IPFW_STATE_F_ */
	uint16_t	st_type;	/* O_KEEP_STATE/O_LIMIT */
	struct ipfw_track *st_track;

	LIST_ENTRY(ipfw_state) st_trklink;
	TAILQ_ENTRY(ipfw_state) st_link;
};

/* Shorthand accessors into st_key. */
#define st_addrs	st_key.addr_u.value
#define st_ports	st_key.port_u.value
#define st_proto	st_key.proto
#define st_swap		st_key.swap

/* st_flags bits; see the st_flags field above. */
#define IPFW_STATE_F_ACKFWD	0x0001
#define IPFW_STATE_F_SEQFWD	0x0002
#define IPFW_STATE_F_ACKREV	0x0004
#define IPFW_STATE_F_SEQREV	0x0008

TAILQ_HEAD(ipfw_state_list, ipfw_state);
RB_HEAD(ipfw_state_tree, ipfw_state);

/*
 * Per-CPU firewall context; one instance per netisr CPU
 * (see ipfw_ctx[]).
 */
struct ipfw_context {
	struct ip_fw	*ipfw_layer3_chain;	/* rules for layer3 */
	struct ip_fw	*ipfw_default_rule;	/* default rule */
	uint64_t	ipfw_norule_counter;	/* ipfw_log(NULL) stat*/

	/*
	 * ipfw_set_disable contains one bit per set value (0..31).
	 * If the bit is set, all rules with the corresponding set
	 * are disabled.  Set IPFW_DEFAULT_SET is reserved for the
	 * default rule and CANNOT be disabled.
432 */ 433 uint32_t ipfw_set_disable; 434 435 uint8_t ipfw_flags; /* IPFW_FLAG_ */ 436 437 struct ipfw_state_tree ipfw_state_tree; 438 struct ipfw_state_list ipfw_state_list; 439 int ipfw_state_loosecnt; 440 int ipfw_state_cnt; 441 442 union { 443 struct ipfw_state state; 444 struct ipfw_track track; 445 struct ipfw_trkcnt trkcnt; 446 } ipfw_tmpkey; 447 448 struct ipfw_track_tree ipfw_track_tree; 449 struct ipfw_track_list ipfw_track_list; 450 struct ipfw_trkcnt *ipfw_trkcnt_spare; 451 452 struct callout ipfw_stateto_ch; 453 time_t ipfw_state_lastexp; 454 struct netmsg_base ipfw_stateexp_nm; 455 struct netmsg_base ipfw_stateexp_more; 456 struct ipfw_state ipfw_stateexp_anch; 457 458 struct callout ipfw_trackto_ch; 459 time_t ipfw_track_lastexp; 460 struct netmsg_base ipfw_trackexp_nm; 461 struct netmsg_base ipfw_trackexp_more; 462 struct ipfw_track ipfw_trackexp_anch; 463 464 struct callout ipfw_keepalive_ch; 465 struct netmsg_base ipfw_keepalive_nm; 466 struct netmsg_base ipfw_keepalive_more; 467 struct ipfw_state ipfw_keepalive_anch; 468 469 /* 470 * Statistics 471 */ 472 u_long ipfw_sts_reap; 473 u_long ipfw_sts_reapfailed; 474 u_long ipfw_sts_overflow; 475 u_long ipfw_sts_nomem; 476 u_long ipfw_sts_tcprecycled; 477 478 u_long ipfw_tks_nomem; 479 u_long ipfw_tks_reap; 480 u_long ipfw_tks_reapfailed; 481 u_long ipfw_tks_overflow; 482 u_long ipfw_tks_cntnomem; 483 }; 484 485 #define IPFW_FLAG_KEEPALIVE 0x01 486 #define IPFW_FLAG_STATEEXP 0x02 487 #define IPFW_FLAG_TRACKEXP 0x04 488 #define IPFW_FLAG_STATEREAP 0x08 489 #define IPFW_FLAG_TRACKREAP 0x10 490 491 #define ipfw_state_tmpkey ipfw_tmpkey.state 492 #define ipfw_track_tmpkey ipfw_tmpkey.track 493 #define ipfw_trkcnt_tmpkey ipfw_tmpkey.trkcnt 494 495 struct ipfw_global { 496 int ipfw_state_loosecnt; /* cache aligned */ 497 time_t ipfw_state_globexp __cachealign; 498 499 struct lwkt_token ipfw_trkcnt_token __cachealign; 500 struct ipfw_trkcnt_tree ipfw_trkcnt_tree; 501 int ipfw_trkcnt_cnt; 502 time_t 
ipfw_track_globexp; 503 504 #ifdef KLD_MODULE 505 /* 506 * Module can not be unloaded, if there are references to 507 * certains rules of ipfw(4), e.g. dummynet(4) 508 */ 509 int ipfw_refcnt __cachealign; 510 #endif 511 } __cachealign; 512 513 static struct ipfw_context *ipfw_ctx[MAXCPU]; 514 515 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); 516 517 /* 518 * Following two global variables are accessed and updated only 519 * in netisr0. 520 */ 521 static uint32_t static_count; /* # of static rules */ 522 static uint32_t static_ioc_len; /* bytes of static rules */ 523 524 /* 525 * If 1, then ipfw static rules are being flushed, 526 * ipfw_chk() will skip to the default rule. 527 */ 528 static int ipfw_flushing; 529 530 static int fw_verbose; 531 static int verbose_limit; 532 533 static int fw_debug; 534 static int autoinc_step = IPFW_AUTOINC_STEP_DEF; 535 536 static int ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS); 537 static int ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS); 538 539 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); 540 SYSCTL_NODE(_net_inet_ip_fw, OID_AUTO, stats, CTLFLAG_RW, 0, 541 "Firewall statistics"); 542 543 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, 544 &fw_enable, 0, ipfw_sysctl_enable, "I", "Enable ipfw"); 545 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLTYPE_INT | CTLFLAG_RW, 546 &autoinc_step, 0, ipfw_sysctl_autoinc_step, "I", 547 "Rule number autincrement step"); 548 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW, 549 &fw_one_pass, 0, 550 "Only do a single pass through ipfw when using dummynet(4)"); 551 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, 552 &fw_debug, 0, "Enable printing of debug ip_fw statements"); 553 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW, 554 &fw_verbose, 0, "Log matches to ipfw rules"); 555 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, 556 &verbose_limit, 0, "Set upper limit of matches of ipfw 
rules logged"); 557 558 static int ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS); 559 static int ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS); 560 static int ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS); 561 static int ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS); 562 static int ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS); 563 static int ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS); 564 565 /* 566 * Timeouts for various events in handing states. 567 * 568 * NOTE: 569 * 1 == 0~1 second. 570 * 2 == 1~2 second(s). 571 * 572 * We use 2 seconds for FIN lifetime, so that the states will not be 573 * ripped prematurely. 574 */ 575 static uint32_t dyn_ack_lifetime = 300; 576 static uint32_t dyn_syn_lifetime = 20; 577 static uint32_t dyn_finwait_lifetime = 20; 578 static uint32_t dyn_fin_lifetime = 2; 579 static uint32_t dyn_rst_lifetime = 2; 580 static uint32_t dyn_udp_lifetime = 10; 581 static uint32_t dyn_short_lifetime = 5; /* used by tracks too */ 582 583 /* 584 * Keepalives are sent if dyn_keepalive is set. They are sent every 585 * dyn_keepalive_period seconds, in the last dyn_keepalive_interval 586 * seconds of lifetime of a rule. 
587 */ 588 static uint32_t dyn_keepalive_interval = 20; 589 static uint32_t dyn_keepalive_period = 5; 590 static uint32_t dyn_keepalive = 1; /* do send keepalives */ 591 592 static struct ipfw_global ipfw_gd; 593 static int ipfw_state_loosecnt_updthr; 594 static int ipfw_state_max = 4096; /* max # of states */ 595 static int ipfw_track_max = 4096; /* max # of tracks */ 596 597 static int ipfw_state_headroom; /* setup at module load time */ 598 static int ipfw_state_reap_min = 8; 599 static int ipfw_state_expire_max = 32; 600 static int ipfw_state_scan_max = 256; 601 static int ipfw_keepalive_max = 8; 602 static int ipfw_track_reap_max = 4; 603 static int ipfw_track_expire_max = 16; 604 static int ipfw_track_scan_max = 128; 605 606 /* Compat */ 607 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, 608 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_dyncnt, "I", 609 "Number of states and tracks"); 610 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, 611 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_dynmax, "I", 612 "Max number of states and tracks"); 613 614 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_cnt, 615 CTLTYPE_INT | CTLFLAG_RD, NULL, 0, ipfw_sysctl_statecnt, "I", 616 "Number of states"); 617 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_max, 618 CTLTYPE_INT | CTLFLAG_RW, NULL, 0, ipfw_sysctl_statemax, "I", 619 "Max number of states"); 620 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, state_headroom, CTLFLAG_RW, 621 &ipfw_state_headroom, 0, "headroom for state reap"); 622 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_cnt, CTLFLAG_RD, 623 &ipfw_gd.ipfw_trkcnt_cnt, 0, "Number of tracks"); 624 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, track_max, CTLFLAG_RW, 625 &ipfw_track_max, 0, "Max number of tracks"); 626 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, 627 &static_count, 0, "Number of static rules"); 628 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, 629 &dyn_ack_lifetime, 0, "Lifetime of dyn. 
rules for acks"); 630 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, 631 &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); 632 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, 633 &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); 634 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_finwait_lifetime, CTLFLAG_RW, 635 &dyn_finwait_lifetime, 0, "Lifetime of dyn. rules for fin wait"); 636 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, 637 &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); 638 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, 639 &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP"); 640 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, 641 &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); 642 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, 643 &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); 644 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_scan_max, 645 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_scan_max, 0, ipfw_sysctl_scancnt, 646 "I", "# of states to scan for each expire iteration"); 647 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_expire_max, 648 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_expire_max, 0, ipfw_sysctl_scancnt, 649 "I", "# of states to expire for each expire iteration"); 650 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, keepalive_max, 651 CTLTYPE_INT | CTLFLAG_RW, &ipfw_keepalive_max, 0, ipfw_sysctl_scancnt, 652 "I", "# of states to expire for each expire iteration"); 653 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, state_reap_min, 654 CTLTYPE_INT | CTLFLAG_RW, &ipfw_state_reap_min, 0, ipfw_sysctl_scancnt, 655 "I", "# of states to reap for state shortage"); 656 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_scan_max, 657 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_scan_max, 0, ipfw_sysctl_scancnt, 658 "I", "# of tracks to scan for each expire iteration"); 659 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, 
track_expire_max, 660 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_expire_max, 0, ipfw_sysctl_scancnt, 661 "I", "# of tracks to expire for each expire iteration"); 662 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, track_reap_max, 663 CTLTYPE_INT | CTLFLAG_RW, &ipfw_track_reap_max, 0, ipfw_sysctl_scancnt, 664 "I", "# of tracks to reap for track shortage"); 665 666 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reap, 667 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 668 __offsetof(struct ipfw_context, ipfw_sts_reap), ipfw_sysctl_stat, 669 "LU", "# of state reaps due to states shortage"); 670 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_reapfailed, 671 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 672 __offsetof(struct ipfw_context, ipfw_sts_reapfailed), ipfw_sysctl_stat, 673 "LU", "# of state reap failure"); 674 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_overflow, 675 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 676 __offsetof(struct ipfw_context, ipfw_sts_overflow), ipfw_sysctl_stat, 677 "LU", "# of state overflow"); 678 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_nomem, 679 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 680 __offsetof(struct ipfw_context, ipfw_sts_nomem), ipfw_sysctl_stat, 681 "LU", "# of state allocation failure"); 682 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, state_tcprecycled, 683 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 684 __offsetof(struct ipfw_context, ipfw_sts_tcprecycled), ipfw_sysctl_stat, 685 "LU", "# of state deleted due to fast TCP port recycling"); 686 687 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_nomem, 688 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 689 __offsetof(struct ipfw_context, ipfw_tks_nomem), ipfw_sysctl_stat, 690 "LU", "# of track allocation failure"); 691 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reap, 692 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 693 __offsetof(struct ipfw_context, ipfw_tks_reap), ipfw_sysctl_stat, 694 "LU", "# of track reap due to tracks shortage"); 695 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_reapfailed, 696 CTLTYPE_ULONG | 
CTLFLAG_RW, NULL, 697 __offsetof(struct ipfw_context, ipfw_tks_reapfailed), ipfw_sysctl_stat, 698 "LU", "# of track reap failure"); 699 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_overflow, 700 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 701 __offsetof(struct ipfw_context, ipfw_tks_overflow), ipfw_sysctl_stat, 702 "LU", "# of track overflow"); 703 SYSCTL_PROC(_net_inet_ip_fw_stats, OID_AUTO, track_cntnomem, 704 CTLTYPE_ULONG | CTLFLAG_RW, NULL, 705 __offsetof(struct ipfw_context, ipfw_tks_cntnomem), ipfw_sysctl_stat, 706 "LU", "# of track counter allocation failure"); 707 708 static int ipfw_state_cmp(struct ipfw_state *, 709 struct ipfw_state *); 710 static int ipfw_trkcnt_cmp(struct ipfw_trkcnt *, 711 struct ipfw_trkcnt *); 712 static int ipfw_track_cmp(struct ipfw_track *, 713 struct ipfw_track *); 714 715 RB_PROTOTYPE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp); 716 RB_GENERATE(ipfw_state_tree, ipfw_state, st_rblink, ipfw_state_cmp); 717 718 RB_PROTOTYPE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp); 719 RB_GENERATE(ipfw_trkcnt_tree, ipfw_trkcnt, tc_rblink, ipfw_trkcnt_cmp); 720 721 RB_PROTOTYPE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp); 722 RB_GENERATE(ipfw_track_tree, ipfw_track, t_rblink, ipfw_track_cmp); 723 724 static ip_fw_chk_t ipfw_chk; 725 static void ipfw_track_expire_ipifunc(void *); 726 static void ipfw_state_expire_ipifunc(void *); 727 static void ipfw_keepalive(void *); 728 static int ipfw_state_expire_start(struct ipfw_context *, 729 int, int); 730 731 #define IPFW_TRKCNT_TOKGET lwkt_gettoken(&ipfw_gd.ipfw_trkcnt_token) 732 #define IPFW_TRKCNT_TOKREL lwkt_reltoken(&ipfw_gd.ipfw_trkcnt_token) 733 #define IPFW_TRKCNT_TOKINIT \ 734 lwkt_token_init(&ipfw_gd.ipfw_trkcnt_token, "ipfw_trkcnt"); 735 736 static __inline void 737 ipfw_key_build(struct ipfw_key *key, in_addr_t saddr, uint16_t sport, 738 in_addr_t daddr, uint16_t dport, uint8_t proto) 739 { 740 741 key->proto = proto; 742 key->swap = 0; 743 744 if (saddr < 
daddr) { 745 key->addr_u.addrs.addr1 = daddr; 746 key->addr_u.addrs.addr2 = saddr; 747 key->swap |= IPFW_KEY_SWAP_ADDRS; 748 } else { 749 key->addr_u.addrs.addr1 = saddr; 750 key->addr_u.addrs.addr2 = daddr; 751 } 752 753 if (sport < dport) { 754 key->port_u.ports.port1 = dport; 755 key->port_u.ports.port2 = sport; 756 key->swap |= IPFW_KEY_SWAP_PORTS; 757 } else { 758 key->port_u.ports.port1 = sport; 759 key->port_u.ports.port2 = dport; 760 } 761 762 if (sport == dport && (key->swap & IPFW_KEY_SWAP_ADDRS)) 763 key->swap |= IPFW_KEY_SWAP_PORTS; 764 if (saddr == daddr && (key->swap & IPFW_KEY_SWAP_PORTS)) 765 key->swap |= IPFW_KEY_SWAP_ADDRS; 766 } 767 768 static __inline void 769 ipfw_key_4tuple(const struct ipfw_key *key, in_addr_t *saddr, uint16_t *sport, 770 in_addr_t *daddr, uint16_t *dport) 771 { 772 773 if (key->swap & IPFW_KEY_SWAP_ADDRS) { 774 *saddr = key->addr_u.addrs.addr2; 775 *daddr = key->addr_u.addrs.addr1; 776 } else { 777 *saddr = key->addr_u.addrs.addr1; 778 *daddr = key->addr_u.addrs.addr2; 779 } 780 781 if (key->swap & IPFW_KEY_SWAP_PORTS) { 782 *sport = key->port_u.ports.port2; 783 *dport = key->port_u.ports.port1; 784 } else { 785 *sport = key->port_u.ports.port1; 786 *dport = key->port_u.ports.port2; 787 } 788 } 789 790 static int 791 ipfw_state_cmp(struct ipfw_state *s1, struct ipfw_state *s2) 792 { 793 794 if (s1->st_proto > s2->st_proto) 795 return (1); 796 if (s1->st_proto < s2->st_proto) 797 return (-1); 798 799 if (s1->st_addrs > s2->st_addrs) 800 return (1); 801 if (s1->st_addrs < s2->st_addrs) 802 return (-1); 803 804 if (s1->st_ports > s2->st_ports) 805 return (1); 806 if (s1->st_ports < s2->st_ports) 807 return (-1); 808 809 if (s1->st_swap == s2->st_swap || 810 (s1->st_swap ^ s2->st_swap) == IPFW_KEY_SWAP_ALL) 811 return (0); 812 813 if (s1->st_swap > s2->st_swap) 814 return (1); 815 else 816 return (-1); 817 } 818 819 static int 820 ipfw_trkcnt_cmp(struct ipfw_trkcnt *t1, struct ipfw_trkcnt *t2) 821 { 822 823 if (t1->tc_proto > 
t2->tc_proto) 824 return (1); 825 if (t1->tc_proto < t2->tc_proto) 826 return (-1); 827 828 if (t1->tc_addrs > t2->tc_addrs) 829 return (1); 830 if (t1->tc_addrs < t2->tc_addrs) 831 return (-1); 832 833 if (t1->tc_ports > t2->tc_ports) 834 return (1); 835 if (t1->tc_ports < t2->tc_ports) 836 return (-1); 837 838 if (t1->tc_ruleid > t2->tc_ruleid) 839 return (1); 840 if (t1->tc_ruleid < t2->tc_ruleid) 841 return (-1); 842 843 return (0); 844 } 845 846 static int 847 ipfw_track_cmp(struct ipfw_track *t1, struct ipfw_track *t2) 848 { 849 850 if (t1->t_proto > t2->t_proto) 851 return (1); 852 if (t1->t_proto < t2->t_proto) 853 return (-1); 854 855 if (t1->t_addrs > t2->t_addrs) 856 return (1); 857 if (t1->t_addrs < t2->t_addrs) 858 return (-1); 859 860 if (t1->t_ports > t2->t_ports) 861 return (1); 862 if (t1->t_ports < t2->t_ports) 863 return (-1); 864 865 if ((uintptr_t)t1->t_rule > (uintptr_t)t2->t_rule) 866 return (1); 867 if ((uintptr_t)t1->t_rule < (uintptr_t)t2->t_rule) 868 return (-1); 869 870 return (0); 871 } 872 873 static void 874 ipfw_state_max_set(int state_max) 875 { 876 877 ipfw_state_max = state_max; 878 /* Allow 5% states over-allocation. 
 */
	/*
	 * NOTE(review): tail of a function whose beginning lies before this
	 * chunk; it derives the per-netisr-CPU loose-count update threshold
	 * (1/20th of the state limit, split across CPUs).
	 */
	ipfw_state_loosecnt_updthr = (state_max / 20) / netisr_ncpus;
}

/*
 * Sum the number of installed states across all netisr CPUs.
 */
static __inline int
ipfw_state_cntcoll(void)
{
	int cpu, state_cnt = 0;

	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		state_cnt += ipfw_ctx[cpu]->ipfw_state_cnt;
	return (state_cnt);
}

/*
 * Collect the total state count and publish it as the global
 * "loose" (approximate) state count.  Returns the collected total.
 */
static __inline int
ipfw_state_cntsync(void)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	ipfw_gd.ipfw_state_loosecnt = state_cnt;
	return (state_cnt);
}

/*
 * Drop one reference on the rule; free it when the last reference
 * goes away.  Must be called on the rule's owner CPU.
 * Returns 1 if the rule was freed, 0 otherwise.
 *
 * NOTE(review): the KASSERT message prints mycpuid, not rule->cpuid,
 * so on failure it cannot show which CPU actually owns the rule.
 */
static __inline int
ipfw_free_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule freed on cpu%d", mycpuid));
	KASSERT(rule->refcnt > 0, ("invalid refcnt %u", rule->refcnt));
	rule->refcnt--;
	if (rule->refcnt == 0) {
		kfree(rule, M_IPFW);
		return 1;
	}
	return 0;
}

/*
 * Rule unreference callback (void * flavor of ipfw_free_rule); also
 * drops the module reference count when built as a KLD module.
 */
static void
ipfw_unref_rule(void *priv)
{
	ipfw_free_rule(priv);
#ifdef KLD_MODULE
	atomic_subtract_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
}

/*
 * Add one reference to the rule (and to the module, if KLD).
 * Must be called on the rule's owner CPU.
 */
static __inline void
ipfw_ref_rule(struct ip_fw *rule)
{
	KASSERT(rule->cpuid == mycpuid, ("rule used on cpu%d", mycpuid));
#ifdef KLD_MODULE
	atomic_add_int(&ipfw_gd.ipfw_refcnt, 1);
#endif
	rule->refcnt++;
}

/*
 * This macro maps an ip pointer into a layer3 header pointer of type T
 */
#define L3HDR(T, ip)	((T *)((uint32_t *)(ip) + (ip)->ip_hl))

/*
 * Match the packet's ICMP type against the type bitmap in cmd->d[0].
 */
static __inline int
icmptype_match(struct ip *ip, ipfw_insn_u32 *cmd)
{
	int type = L3HDR(struct icmp,ip)->icmp_type;

	return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1 << type)));
}

/* Bitmap of the ICMP types that are queries. */
#define TT	((1 << ICMP_ECHO) | \
		 (1 << ICMP_ROUTERSOLICIT) | \
		 (1 << ICMP_TSTAMP) | \
		 (1 << ICMP_IREQ) | \
		 (1 << ICMP_MASKREQ))

static int
is_icmp_query(struct ip *ip)
{
	int type = L3HDR(struct icmp, ip)->icmp_type;

	return (type <= ICMP_MAXTYPE && (TT & (1 << type)));
}

#undef TT

/*
 * The following checks use two arrays of 8 or 16 bits to store the
 * bits that we want set or clear, respectively. They are in the
 * low and high half of cmd->arg1 or cmd->d[0].
 *
 * We scan options and store the bits we find set. We succeed if
 *
 *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
 *
 * The code is sometimes optimized not to store additional variables.
 */
static int
flags_match(ipfw_insn *cmd, uint8_t bits)
{
	u_char want_clear;
	bits = ~bits;	/* now a set bit means "this flag is clear" */

	if (((cmd->arg1 & 0xff) & bits) != 0)
		return 0; /* some bits we want set were clear */

	want_clear = (cmd->arg1 >> 8) & 0xff;
	if ((want_clear & bits) != want_clear)
		return 0; /* some bits we want clear were set */
	return 1;
}

/*
 * Collect the IP options present in the packet into a bitmap and
 * match them against the want-set/want-clear halves of cmd->arg1.
 * Returns 0 on malformed/truncated options.
 */
static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	u_char *cp = (u_char *)(ip + 1);
	int x = (ip->ip_hl << 2) - sizeof(struct ip);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[IPOPT_OPTVAL];

		if (opt == IPOPT_EOL)
			break;

		if (opt == IPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[IPOPT_OLEN];
			if (optlen <= 0 || optlen > x)
				return 0; /* invalid or truncated */
		}

		switch (opt) {
		case IPOPT_LSRR:
			bits |= IP_FW_IPOPT_LSRR;
			break;

		case IPOPT_SSRR:
			bits |= IP_FW_IPOPT_SSRR;
			break;

		case IPOPT_RR:
			bits |= IP_FW_IPOPT_RR;
			break;

		case IPOPT_TS:
			bits |= IP_FW_IPOPT_TS;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}

/*
 * Collect the TCP options present in the segment into a bitmap and
 * match them against cmd (same scheme as ipopts_match).
 *
 * NOTE(review): unlike ipopts_match, optlen is only checked for
 * being positive, not bounded by the remaining length x; the loop
 * termination relies on x going non-positive.  Verify callers
 * guarantee the TCP header/options are fully pulled up.
 */
static int
tcpopts_match(struct ip *ip, ipfw_insn *cmd)
{
	int optlen, bits = 0;
	struct tcphdr *tcp = L3HDR(struct tcphdr,ip);
	u_char *cp = (u_char *)(tcp + 1);
	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);

	for (; x > 0; x -= optlen, cp += optlen) {
		int opt = cp[0];

		if (opt == TCPOPT_EOL)
			break;

		if (opt == TCPOPT_NOP) {
			optlen = 1;
		} else {
			optlen = cp[1];
			if (optlen <= 0)
				break;
		}

		switch (opt) {
		case TCPOPT_MAXSEG:
			bits |= IP_FW_TCPOPT_MSS;
			break;

		case TCPOPT_WINDOW:
			bits |= IP_FW_TCPOPT_WINDOW;
			break;

		case TCPOPT_SACK_PERMITTED:
		case TCPOPT_SACK:
			bits |= IP_FW_TCPOPT_SACK;
			break;

		case TCPOPT_TIMESTAMP:
			bits |= IP_FW_TCPOPT_TS;
			break;

		case TCPOPT_CC:
		case TCPOPT_CCNEW:
		case TCPOPT_CCECHO:
			bits |= IP_FW_TCPOPT_CC;
			break;

		default:
			break;
		}
	}
	return (flags_match(cmd, bits));
}

/*
 * Match an interface instruction: by name (exact or glob) when
 * cmd->name is set, otherwise by any AF_INET address configured on
 * the interface.  Returns 1 on match, 0 otherwise.
 */
static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
	if (ifp == NULL)	/* no iface with this packet, match fails */
		return 0;

	/* Check by name or by IP address */
	if (cmd->name[0] != '\0') { /* match by name */
		/* Check name */
		if (cmd->p.glob) {
			if (kfnmatch(cmd->name, ifp->if_xname, 0) == 0)
				return(1);
		} else {
			if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
				return(1);
		}
	} else {
		struct ifaddr_container *ifac;

		/* Only the current CPU's address list is scanned. */
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ia = ifac->ifa;

			if (ia->ifa_addr == NULL)
				continue;
			if (ia->ifa_addr->sa_family != AF_INET)
				continue;
			if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
			    (ia->ifa_addr))->sin_addr.s_addr)
				return(1);	/* match */
		}
	}
	return(0);	/* no match, fail ... */
}

/*
 * Helper for incremental ksnprintf() into a fixed buffer: yields the
 * (pointer, remaining-space) pair at offset "len", clamping to 0 when
 * the buffer is already full.
 */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0

/*
 * We enter here when we have a rule with O_LOG.
 * XXX this function alone takes about 2Kbytes of code!
 */
static void
ipfw_log(struct ipfw_context *ctx, struct ip_fw *f, u_int hlen,
	 struct ether_header *eh, struct mbuf *m, struct ifnet *oif)
{
	char *action;
	int limit_reached = 0;	/* non-zero: the log limit just tripped */
	char action2[40], proto[48], fragment[28], abuf[INET_ADDRSTRLEN];

	fragment[0] = '\0';
	proto[0] = '\0';

	if (f == NULL) {	/* bogus pkt */
		if (verbose_limit != 0 &&
		    ctx->ipfw_norule_counter >= verbose_limit)
			return;
		ctx->ipfw_norule_counter++;
		if (ctx->ipfw_norule_counter == verbose_limit)
			limit_reached = verbose_limit;
		action = "Refuse";
	} else {	/* O_LOG is the first action, find the real one */
		ipfw_insn *cmd = ACTION_PTR(f);
		ipfw_insn_log *l = (ipfw_insn_log *)cmd;

		if (l->max_log != 0 && l->log_left == 0)
			return;
		l->log_left--;
		if (l->log_left == 0)
			limit_reached = l->max_log;
		cmd += F_LEN(cmd);	/* point to first action */
		if (cmd->opcode == O_PROB)
			cmd += F_LEN(cmd);

		action = action2;
		switch (cmd->opcode) {
		case O_DENY:
			action = "Deny";
			break;

		case O_REJECT:
			if (cmd->arg1 == ICMP_REJECT_RST) {
				action = "Reset";
			} else if (cmd->arg1 == ICMP_UNREACH_HOST) {
				action = "Reject";
			} else {
				ksnprintf(SNPARGS(action2, 0), "Unreach %d",
					  cmd->arg1);
			}
			break;

		case O_ACCEPT:
			action = "Accept";
			break;

		case O_COUNT:
			action = "Count";
			break;

		case O_DIVERT:
			ksnprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1);
			break;

		case O_TEE:
			ksnprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1);
			break;

		case O_SKIPTO:
			ksnprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1);
			break;

		case O_PIPE:
			ksnprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1);
			break;

		case O_QUEUE:
			ksnprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1);
			break;

		case O_FORWARD_IP:
			{
				ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
				int len;

				len = ksnprintf(SNPARGS(action2, 0),
						"Forward to %s",
						kinet_ntoa(sa->sa.sin_addr, abuf));
				if (sa->sa.sin_port) {
					ksnprintf(SNPARGS(action2, len), ":%d",
						  sa->sa.sin_port);
				}
			}
			break;

		default:
			action = "UNKNOWN";
			break;
		}
	}

	if (hlen == 0) {	/* non-ip */
		ksnprintf(SNPARGS(proto, 0), "MAC");
	} else {
		struct ip *ip = mtod(m, struct ip *);
		/* these three are all aliases to the same thing */
		struct icmp *const icmp = L3HDR(struct icmp, ip);
		struct tcphdr *const tcp = (struct tcphdr *)icmp;
		struct udphdr *const udp = (struct udphdr *)icmp;

		int ip_off, offset, ip_len;
		int len;

		if (eh != NULL) { /* layer 2 packets are as on the wire */
			ip_off = ntohs(ip->ip_off);
			ip_len = ntohs(ip->ip_len);
		} else {
			/* already in host order at layer 3 here */
			ip_off = ip->ip_off;
			ip_len = ip->ip_len;
		}
		offset = ip_off & IP_OFFMASK;
		switch (ip->ip_p) {
		case IPPROTO_TCP:
			len = ksnprintf(SNPARGS(proto, 0), "TCP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(tcp->th_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(tcp->th_dport));
			} else {
				/* non-first fragment: ports unavailable */
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_UDP:
			len = ksnprintf(SNPARGS(proto, 0), "UDP %s",
					kinet_ntoa(ip->ip_src, abuf));
			if (offset == 0) {
				ksnprintf(SNPARGS(proto, len), ":%d %s:%d",
					  ntohs(udp->uh_sport),
					  kinet_ntoa(ip->ip_dst, abuf),
					  ntohs(udp->uh_dport));
			} else {
				ksnprintf(SNPARGS(proto, len), " %s",
					  kinet_ntoa(ip->ip_dst, abuf));
			}
			break;

		case IPPROTO_ICMP:
			if (offset == 0) {
				len = ksnprintf(SNPARGS(proto, 0),
						"ICMP:%u.%u ",
						icmp->icmp_type,
						icmp->icmp_code);
			} else {
				len = ksnprintf(SNPARGS(proto, 0), "ICMP ");
			}
			len += ksnprintf(SNPARGS(proto, len), "%s",
					 kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;

		default:
			len = ksnprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p,
					kinet_ntoa(ip->ip_src, abuf));
			ksnprintf(SNPARGS(proto, len), " %s",
				  kinet_ntoa(ip->ip_dst, abuf));
			break;
		}

		if (ip_off & (IP_MF | IP_OFFMASK)) {
			ksnprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)",
				  ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
				  offset << 3, (ip_off & IP_MF) ? "+" : "");
		}
	}

	if (oif || m->m_pkthdr.rcvif) {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s %s via %s%s\n",
		    f ? f->rulenum : -1,
		    action, proto, oif ? "out" : "in",
		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
		    fragment);
	} else {
		log(LOG_SECURITY | LOG_INFO,
		    "ipfw: %d %s %s [no if info]%s\n",
		    f ? f->rulenum : -1,
		    action, proto, fragment);
	}

	if (limit_reached) {
		log(LOG_SECURITY | LOG_NOTICE,
		    "ipfw: limit %d reached on entry %d\n",
		    limit_reached, f ? f->rulenum : -1);
	}
}

#undef SNPARGS

/* Wrap-tolerant time comparison: true if a is at or before b. */
#define TIME_LEQ(a, b)	((a) - (b) <= 0)

/*
 * Delete state "s" from this CPU's context: unhook it from its track
 * (if any), from the state list and RB tree, free it and adjust the
 * per-CPU state and loose counters.
 */
static void
ipfw_state_del(struct ipfw_context *ctx, struct ipfw_state *s)
{

	KASSERT(s->st_type == O_KEEP_STATE || s->st_type == O_LIMIT,
	    ("invalid state type %u", s->st_type));
	KASSERT(ctx->ipfw_state_cnt > 0,
	    ("invalid state count %d", ctx->ipfw_state_cnt));

	if (s->st_track != NULL) {
		struct ipfw_track *t = s->st_track;

		KASSERT(!LIST_EMPTY(&t->t_state_list),
		    ("track state list is empty"));
		LIST_REMOVE(s, st_trklink);

		KASSERT(*t->t_count > 0,
		    ("invalid track count %d", *t->t_count));
		/* t_count is shared across CPUs, hence the atomic. */
		atomic_subtract_int(t->t_count, 1);
	}

	TAILQ_REMOVE(&ctx->ipfw_state_list, s, st_link);
	RB_REMOVE(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	kfree(s, M_IPFW);

	ctx->ipfw_state_cnt--;
	if (ctx->ipfw_state_loosecnt > 0)
		ctx->ipfw_state_loosecnt--;
}

/*
 * Aggressively reap states because we are short of them: expire both
 * timed-out states and closed TCP states, ignoring the scan limit.
 * If no expiration cycle is running, start one; otherwise piggyback
 * on the running cycle by walking from its anchor.
 * Returns the number of states reaped.
 */
static int
ipfw_state_reap(struct ipfw_context *ctx, int reap_max)
{
	struct ipfw_state *s, *anchor;
	int expired;

	if (reap_max < ipfw_state_reap_min)
		reap_max = ipfw_state_reap_min;

	if ((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0) {
		/*
		 * Kick start state expiring.  Ignore scan limit,
		 * we are short of states.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_STATEREAP;
		expired = ipfw_state_expire_start(ctx, INT_MAX, reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_STATEREAP;
		return (expired);
	}

	/*
	 * States are being expired.
	 */

	if (ctx->ipfw_state_cnt == 0)
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_stateexp_anch;
	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of states.
		 */

		/* Advance the anchor past "s". */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;

		if (IPFW_STATE_TCPCLOSED(s) ||
		    TIME_LEQ(s->st_expire, time_uptime)) {
			ipfw_state_del(ctx, s);
			if (++expired >= reap_max)
				break;
			/* Stop early once we are back under the limit. */
			if ((expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max)
				break;
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_state_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}

/*
 * Delete all states, or only those installed by "rule" when rule is
 * not NULL.  Anchors are skipped.
 */
static void
ipfw_state_flush(struct ipfw_context *ctx, const struct ip_fw *rule)
{
	struct ipfw_state *s, *sn;

	TAILQ_FOREACH_MUTABLE(s, &ctx->ipfw_state_list, st_link, sn) {
		if (s->st_type == O_ANCHOR)
			continue;
		if (rule != NULL && s->st_rule != rule)
			continue;
		ipfw_state_del(ctx, s);
	}
}

/*
 * Finish an expiration cycle: clear the in-progress flag and
 * reschedule the next cycle one second from now.
 */
static void
ipfw_state_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_STATEEXP;
	callout_reset(&ctx->ipfw_stateto_ch, hz,
	    ipfw_state_expire_ipifunc, NULL);
}

/*
 * Ask this CPU's netisr to continue the current expiration cycle
 * in a later message (bounded work per dispatch).
 */
static void
ipfw_state_expire_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_stateexp_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("stateexp more did not finish"));
	netisr_sendmsg_oncpu(nm);
}

/*
 * Walk the state list from "anchor", expiring timed-out states (and
 * closed TCP states when reaping), moving the anchor forward as we
 * go.  Stops and schedules continuation when scan_max or expire_max
 * is hit; otherwise removes the anchor and finishes the cycle.
 * Returns the number of states expired in this pass.
 */
static int
ipfw_state_expire_loop(struct ipfw_context *ctx, struct ipfw_state *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("stateexp is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		if (scanned++ >= scan_max) {
			ipfw_state_expire_more(ctx);
			return (expired);
		}

		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;

		if (TIME_LEQ(s->st_expire, time_uptime) ||
		    ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
		     IPFW_STATE_TCPCLOSED(s))) {
			ipfw_state_del(ctx, s);
			if (++expired >= expire_max) {
				ipfw_state_expire_more(ctx);
				return (expired);
			}
			if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) &&
			    (expired & 0xff) == 0 &&
			    ipfw_state_cntcoll() + ipfw_state_headroom <=
			    ipfw_state_max) {
				/* Reaping made enough room; yield. */
				ipfw_state_expire_more(ctx);
				return (expired);
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_state_expire_done(ctx);
	return (expired);
}

/*
 * Netmsg handler continuing an in-progress expiration cycle on this
 * CPU from where the anchor was left.
 */
static void
ipfw_state_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_STATEEXP,
	    ("statexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_stateexp_anch;
	if (ctx->ipfw_state_cnt == 0) {
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		ipfw_state_expire_done(ctx);
		return;
	}
	ipfw_state_expire_loop(ctx, anchor,
	    ipfw_state_scan_max, ipfw_state_expire_max);
}

/*
 * Begin an expiration cycle: insert the anchor at the head of the
 * state list and run the first pass.  Rate-limited to once per
 * second unless we are reaping.  Returns the number expired.
 */
static int
ipfw_state_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_state *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_STATEEXP) == 0,
	    ("stateexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_STATEEXP;

	if (ctx->ipfw_state_cnt == 0) {
		ipfw_state_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_STATEREAP) == 0 &&
	    ctx->ipfw_state_lastexp == time_uptime) {
		ipfw_state_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_state_lastexp = time_uptime;

	anchor = &ctx->ipfw_stateexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	return (ipfw_state_expire_loop(ctx, anchor, scan_max, expire_max));
}

/*
 * Netmsg handler starting a periodic expiration cycle on this CPU,
 * unless one is already running.
 */
static void
ipfw_state_expire_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (ctx->ipfw_flags & IPFW_FLAG_STATEEXP) {
		/* Running; done. */
		return;
	}
	ipfw_state_expire_start(ctx,
	    ipfw_state_scan_max, ipfw_state_expire_max);
}

/*
 * Callout/IPI trampoline: (re)send this CPU's state-expire netmsg
 * if the previous one has completed.
 */
static void
ipfw_state_expire_ipifunc(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_stateexp_nm;

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}

/*
 * Track the per-direction TCP SEQ/ACK windows of state "s" and mark
 * ACKed FINs in st_state.  Returns FALSE for out-of-sequence
 * segments (callers then skip the state update), TRUE otherwise.
 * RST segments are always accepted.
 */
static boolean_t
ipfw_state_update_tcp(struct ipfw_state *s, int dir, const struct tcphdr *tcp)
{
	uint32_t seq = ntohl(tcp->th_seq);
	uint32_t ack = ntohl(tcp->th_ack);

	if (tcp->th_flags & TH_RST)
		return (TRUE);

	if (dir == MATCH_FORWARD) {
		if ((s->st_flags & IPFW_STATE_F_SEQFWD) == 0) {
			s->st_flags |= IPFW_STATE_F_SEQFWD;
			s->st_seq_fwd = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_fwd)) {
			s->st_seq_fwd = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKFWD) == 0) {
				s->st_flags |= IPFW_STATE_F_ACKFWD;
				s->st_ack_fwd = ack;
			} else if (SEQ_GEQ(ack, s->st_ack_fwd)) {
				s->st_ack_fwd = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/* Forward ACK for the reverse side's FIN? */
			if ((s->st_state & ((TH_FIN | TH_ACK) << 8)) ==
			    (TH_FIN << 8) && s->st_ack_fwd == s->st_seq_rev + 1)
				s->st_state |= (TH_ACK << 8);
		}
	} else {
		if ((s->st_flags & IPFW_STATE_F_SEQREV) == 0) {
			s->st_flags |= IPFW_STATE_F_SEQREV;
			s->st_seq_rev = seq;
		} else if (SEQ_GEQ(seq, s->st_seq_rev)) {
			s->st_seq_rev = seq;
		} else {
			/* Out-of-sequence; done. */
			return (FALSE);
		}
		if (tcp->th_flags & TH_ACK) {
			if ((s->st_flags & IPFW_STATE_F_ACKREV) == 0) {
				s->st_flags |= IPFW_STATE_F_ACKREV;
				s->st_ack_rev = ack;
			} else if (SEQ_GEQ(ack, s->st_ack_rev)) {
				s->st_ack_rev = ack;
			} else {
				/* Out-of-sequence; done. */
				return (FALSE);
			}

			/* Reverse ACK for the forward side's FIN? */
			if ((s->st_state & (TH_FIN | TH_ACK)) == TH_FIN &&
			    s->st_ack_rev == s->st_seq_fwd + 1)
				s->st_state |= TH_ACK;
		}
	}
	return (TRUE);
}

/*
 * Refresh a state's expiration time based on the packet: TCP states
 * get a lifetime according to the observed connection phase, UDP and
 * other protocols get fixed lifetimes.
 */
static void
ipfw_state_update(const struct ipfw_flow_id *pkt, int dir,
    const struct tcphdr *tcp, struct ipfw_state *s)
{

	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
		u_char flags = pkt->flags & IPFW_STATE_TCPFLAGS;

		if (tcp != NULL && !ipfw_state_update_tcp(s, dir, tcp))
			return;

		/* Reverse direction flags live in the high byte. */
		s->st_state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
		switch (s->st_state & IPFW_STATE_TCPSTATES) {
		case TH_SYN:				/* opening */
			s->st_expire = time_uptime + dyn_syn_lifetime;
			break;

		case BOTH_SYN:			/* move to established */
		case BOTH_SYN | TH_FIN:		/* one side tries to close */
		case BOTH_SYN | (TH_FIN << 8):
			s->st_expire = time_uptime + dyn_ack_lifetime;
			break;

		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
			if ((s->st_state & BOTH_FINACK) == BOTH_FINACK) {
				/* And both FINs were ACKed. */
				s->st_expire = time_uptime + dyn_fin_lifetime;
			} else {
				s->st_expire = time_uptime +
				    dyn_finwait_lifetime;
			}
			break;

		default:
#if 0
			/*
			 * reset or some invalid combination, but can also
			 * occur if we use keep-state the wrong way.
			 */
			if ((s->st_state & ((TH_RST << 8) | TH_RST)) == 0)
				kprintf("invalid state: 0x%x\n", s->st_state);
#endif
			s->st_expire = time_uptime + dyn_rst_lifetime;
			break;
		}
	} else if (pkt->proto == IPPROTO_UDP) {
		s->st_expire = time_uptime + dyn_udp_lifetime;
	} else {
		/* other protocols */
		s->st_expire = time_uptime + dyn_short_lifetime;
	}
}

/*
 * Lookup a state.
 *
 * On a hit the state is refreshed via ipfw_state_update() and its
 * track (if any) gets its expiry pushed out.  Expired states and
 * recycled TCP sessions (new SYN on a closed state) are deleted and
 * treated as a miss.  *match_direction is set to MATCH_FORWARD,
 * MATCH_REVERSE or MATCH_NONE when the pointer is non-NULL.
 */
static struct ipfw_state *
ipfw_state_lookup(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
    int *match_direction, const struct tcphdr *tcp)
{
	struct ipfw_state *key, *s;
	int dir = MATCH_NONE;

	key = &ctx->ipfw_state_tmpkey;
	ipfw_key_build(&key->st_key, pkt->src_ip, pkt->src_port,
	    pkt->dst_ip, pkt->dst_port, pkt->proto);
	s = RB_FIND(ipfw_state_tree, &ctx->ipfw_state_tree, key);
	if (s == NULL)
		goto done; /* not found. */
	if (TIME_LEQ(s->st_expire, time_uptime)) {
		/* Expired. */
		ipfw_state_del(ctx, s);
		s = NULL;
		goto done;
	}
	if ((pkt->flags & TH_SYN) && IPFW_STATE_TCPCLOSED(s)) {
		/* TCP ports recycling is too fast. */
		ctx->ipfw_sts_tcprecycled++;
		ipfw_state_del(ctx, s);
		s = NULL;
		goto done;
	}

	if (s->st_swap == key->st_swap) {
		dir = MATCH_FORWARD;
	} else {
		KASSERT((s->st_swap & key->st_swap) == 0,
		    ("found mismatch state"));
		dir = MATCH_REVERSE;
	}

	/* Update this state. */
	ipfw_state_update(pkt, dir, tcp, s);

	if (s->st_track != NULL) {
		/* This track has been used. */
		s->st_track->t_expire = time_uptime + dyn_short_lifetime;
	}
done:
	if (match_direction)
		*match_direction = dir;
	return (s);
}

/*
 * Lookup a state and, on a hit, account the packet ("len" bytes) to
 * it and return its installing rule; NULL on a miss.
 */
static __inline struct ip_fw *
ipfw_state_lookup_rule(struct ipfw_context *ctx, const struct ipfw_flow_id *pkt,
    int *match_direction, const struct tcphdr *tcp, uint16_t len)
{
	struct ipfw_state *s;

	s = ipfw_state_lookup(ctx, pkt, match_direction, tcp);
	if (s == NULL)
		return (NULL);

	KASSERT(s->st_rule->cpuid == mycpuid,
	    ("rule %p (cpu%d) does not belong to the current cpu%d",
	     s->st_rule, s->st_rule->cpuid, mycpuid));

	s->st_pcnt++;
	s->st_bcnt += len;

	return (s->st_rule);
}

/*
 * Allocate and install a new state of the given type (O_KEEP_STATE
 * or O_LIMIT) for flow "id", installed by "rule" and optionally
 * referencing track "t".  Returns NULL on allocation failure.
 */
static struct ipfw_state *
ipfw_state_add(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t type, struct ip_fw *rule, struct ipfw_track *t,
    const struct tcphdr *tcp)
{
	struct ipfw_state *s, *dup;

	KASSERT(type == O_KEEP_STATE || type == O_LIMIT,
	    ("invalid state type %u", type));

	s = kmalloc(sizeof(*s), M_IPFW, M_INTWAIT | M_NULLOK | M_ZERO);
	if (s == NULL) {
		ctx->ipfw_sts_nomem++;
		return (NULL);
	}

	ipfw_key_build(&s->st_key, id->src_ip, id->src_port,
	    id->dst_ip, id->dst_port, id->proto);

	s->st_rule = rule;
	s->st_type = type;

	ctx->ipfw_state_cnt++;
	ctx->ipfw_state_loosecnt++;
	/* Flush the per-CPU delta into the global loose count. */
	if (ctx->ipfw_state_loosecnt >= ipfw_state_loosecnt_updthr) {
		ipfw_gd.ipfw_state_loosecnt += ctx->ipfw_state_loosecnt;
		ctx->ipfw_state_loosecnt = 0;
	}

	dup = RB_INSERT(ipfw_state_tree, &ctx->ipfw_state_tree, s);
	if (dup != NULL)
		panic("ipfw: state exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_state_list, s, st_link);

	/*
	 * Update this state:
	 * Set st_expire and st_state.
	 */
	ipfw_state_update(id, MATCH_FORWARD, tcp, s);

	if (t != NULL) {
		/* Keep the track referenced. */
		LIST_INSERT_HEAD(&t->t_state_list, s, st_trklink);
		s->st_track = t;
	}
	return (s);
}

/*
 * Free a track (it must no longer be referenced by any state) and
 * drop its reference on the shared track counter, freeing the
 * counter too when this was the last reference.  Returns TRUE iff
 * the counter was also freed.
 */
static boolean_t
ipfw_track_free(struct ipfw_context *ctx, struct ipfw_track *t)
{
	struct ipfw_trkcnt *trk;
	boolean_t trk_freed = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));
	KASSERT(LIST_EMPTY(&t->t_state_list),
	    ("invalid track is still referenced"));

	trk = t->t_trkcnt;
	KASSERT(trk != NULL, ("track has no trkcnt"));

	RB_REMOVE(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	TAILQ_REMOVE(&ctx->ipfw_track_list, t, t_link);
	kfree(t, M_IPFW);

	/*
	 * fdrop() style reference counting.
	 * See kern/kern_descrip.c fdrop().
	 */
	for (;;) {
		int refs = trk->tc_refs;

		cpu_ccfence();
		KASSERT(refs > 0, ("invalid trkcnt refs %d", refs));
		if (refs == 1) {
			/* Last ref: take the token before the 1->0 CAS. */
			IPFW_TRKCNT_TOKGET;
			if (atomic_cmpset_int(&trk->tc_refs, refs, 0)) {
				KASSERT(trk->tc_count == 0,
				    ("%d states reference this trkcnt",
				     trk->tc_count));
				RB_REMOVE(ipfw_trkcnt_tree,
				    &ipfw_gd.ipfw_trkcnt_tree, trk);

				KASSERT(ipfw_gd.ipfw_trkcnt_cnt > 0,
				    ("invalid trkcnt cnt %d",
				     ipfw_gd.ipfw_trkcnt_cnt));
				ipfw_gd.ipfw_trkcnt_cnt--;
				IPFW_TRKCNT_TOKREL;

				/* Cache one counter per CPU as a spare. */
				if (ctx->ipfw_trkcnt_spare == NULL)
					ctx->ipfw_trkcnt_spare = trk;
				else
					kfree(trk, M_IPFW);
				trk_freed = TRUE;
				break; /* done! */
			}
			IPFW_TRKCNT_TOKREL;
			/* retry */
		} else if (atomic_cmpset_int(&trk->tc_refs, refs, refs - 1)) {
			break; /* done! */
		}
		/* retry */
	}
	return (trk_freed);
}

/*
 * Free all tracks, or only those installed by "rule" when rule is
 * not NULL.  Anchors (t_count == NULL) are skipped.
 */
static void
ipfw_track_flush(struct ipfw_context *ctx, struct ip_fw *rule)
{
	struct ipfw_track *t, *tn;

	TAILQ_FOREACH_MUTABLE(t, &ctx->ipfw_track_list, t_link, tn) {
		if (t->t_count == NULL) /* anchor */
			continue;
		if (rule != NULL && t->t_rule != rule)
			continue;
		ipfw_track_free(ctx, t);
	}
}

/*
 * Expire the states referencing track "t" (timed-out ones; also
 * closed TCP states when "reap" is set).  Rate-limited to once per
 * second.  Returns TRUE iff at least one state was deleted.
 */
static boolean_t
ipfw_track_state_expire(struct ipfw_context *ctx, struct ipfw_track *t,
    boolean_t reap)
{
	struct ipfw_state *s, *sn;
	boolean_t ret = FALSE;

	KASSERT(t->t_count != NULL, ("track anchor"));

	if (LIST_EMPTY(&t->t_state_list))
		return (FALSE);

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if (t->t_lastexp == time_uptime)
		return (FALSE);
	t->t_lastexp = time_uptime;

	LIST_FOREACH_MUTABLE(s, &t->t_state_list, st_trklink, sn) {
		if (TIME_LEQ(s->st_expire, time_uptime) ||
		    (reap && IPFW_STATE_TCPCLOSED(s))) {
			KASSERT(s->st_track == t,
			    ("state track %p does not match %p",
			     s->st_track, t));
			ipfw_state_del(ctx, s);
			ret = TRUE;
		}
	}
	return (ret);
}

/*
 * Get a track counter: reuse this CPU's cached spare if available,
 * else allocate a fresh one (may return NULL under memory pressure).
 */
static __inline struct ipfw_trkcnt *
ipfw_trkcnt_alloc(struct ipfw_context *ctx)
{
	struct ipfw_trkcnt *trk;

	if (ctx->ipfw_trkcnt_spare != NULL) {
		trk = ctx->ipfw_trkcnt_spare;
		ctx->ipfw_trkcnt_spare = NULL;
	} else {
		trk = kmalloc_cachealign(sizeof(*trk), M_IPFW,
		    M_INTWAIT | M_NULLOK);
	}
	return (trk);
}

/*
 * Finish a track expiration cycle and reschedule the next one in
 * one second (mirrors ipfw_state_expire_done()).
 */
static void
ipfw_track_expire_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_TRACKEXP;
	callout_reset(&ctx->ipfw_trackto_ch, hz,
	    ipfw_track_expire_ipifunc, NULL);
}

/*
 * Schedule continuation of the in-progress track expiration cycle.
 */
static void
ipfw_track_expire_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_trackexp_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("trackexp more did not finish"));
	netisr_sendmsg_oncpu(nm);
}

/*
 * Walk the track list from "anchor", first expiring each track's
 * states, then freeing tracks that are unreferenced and timed out
 * (or any unreferenced track when reaping).  Bounded by scan_max
 * and expire_max; schedules continuation when a bound is hit.
 * Returns the number of tracks whose counter was freed.
 */
static int
ipfw_track_expire_loop(struct ipfw_context *ctx, struct ipfw_track *anchor,
    int scan_max, int expire_max)
{
	struct ipfw_track *t;
	int scanned = 0, expired = 0;
	boolean_t reap = FALSE;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKREAP)
		reap = TRUE;

	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		if (scanned++ >= scan_max) {
			ipfw_track_expire_more(ctx);
			return (expired);
		}

		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		ipfw_track_state_expire(ctx, t, reap);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (TIME_LEQ(t->t_expire, time_uptime) || reap) {
			/* Expired. */
			if (ipfw_track_free(ctx, t)) {
				if (++expired >= expire_max) {
					ipfw_track_expire_more(ctx);
					return (expired);
				}
			}
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
	ipfw_track_expire_done(ctx);
	return (expired);
}

/*
 * Begin a track expiration cycle (mirrors ipfw_state_expire_start):
 * insert the anchor at the head of the track list and run the first
 * pass.  Rate-limited to once per second unless reaping.
 */
static int
ipfw_track_expire_start(struct ipfw_context *ctx, int scan_max, int expire_max)
{
	struct ipfw_track *anchor;

	KASSERT((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0,
	    ("trackexp is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_TRACKEXP;

	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		ipfw_track_expire_done(ctx);
		return (0);
	}

	/*
	 * Do not expire more than once per second, it is useless.
	 */
	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKREAP) == 0 &&
	    ctx->ipfw_track_lastexp == time_uptime) {
		ipfw_track_expire_done(ctx);
		return (0);
	}
	ctx->ipfw_track_lastexp = time_uptime;

	anchor = &ctx->ipfw_trackexp_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_track_list, anchor, t_link);
	return (ipfw_track_expire_loop(ctx, anchor, scan_max, expire_max));
}

/*
 * Netmsg handler continuing an in-progress track expiration cycle.
 */
static void
ipfw_track_expire_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_track *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_TRACKEXP,
	    ("trackexp is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_trackexp_anch;
	if (RB_EMPTY(&ctx->ipfw_track_tree)) {
		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		ipfw_track_expire_done(ctx);
		return;
	}
	ipfw_track_expire_loop(ctx, anchor,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}

/*
 * Netmsg handler starting a periodic track expiration cycle on this
 * CPU, unless one is already running.
 */
static void
ipfw_track_expire_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) {
		/* Running; done. */
		return;
	}
	ipfw_track_expire_start(ctx,
	    ipfw_track_scan_max, ipfw_track_expire_max);
}

/*
 * Callout/IPI trampoline: (re)send this CPU's track-expire netmsg
 * if the previous one has completed.
 */
static void
ipfw_track_expire_ipifunc(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_trackexp_nm;

	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}

/*
 * Aggressively reap tracks because we are short of them: mirrors
 * ipfw_state_reap(), ignoring the scan limit and also piggybacking
 * on a running expiration cycle's anchor.  Returns the number of
 * tracks freed.
 */
static int
ipfw_track_reap(struct ipfw_context *ctx)
{
	struct ipfw_track *t, *anchor;
	int expired;

	if ((ctx->ipfw_flags & IPFW_FLAG_TRACKEXP) == 0) {
		/*
		 * Kick start track expiring.  Ignore scan limit,
		 * we are short of tracks.
		 */
		ctx->ipfw_flags |= IPFW_FLAG_TRACKREAP;
		expired = ipfw_track_expire_start(ctx, INT_MAX,
		    ipfw_track_reap_max);
		ctx->ipfw_flags &= ~IPFW_FLAG_TRACKREAP;
		return (expired);
	}

	/*
	 * Tracks are being expired.
	 */

	if (RB_EMPTY(&ctx->ipfw_track_tree))
		return (0);

	expired = 0;
	anchor = &ctx->ipfw_trackexp_anch;
	while ((t = TAILQ_NEXT(anchor, t_link)) != NULL) {
		/*
		 * Ignore scan limit; we are short of tracks.
		 */

		TAILQ_REMOVE(&ctx->ipfw_track_list, anchor, t_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_track_list, t, anchor, t_link);

		if (t->t_count == NULL) /* anchor */
			continue;

		ipfw_track_state_expire(ctx, t, TRUE);
		if (!LIST_EMPTY(&t->t_state_list)) {
			/* There are states referencing this track. */
			continue;
		}

		if (ipfw_track_free(ctx, t)) {
			if (++expired >= ipfw_track_reap_max) {
				ipfw_track_expire_more(ctx);
				break;
			}
		}
	}
	/*
	 * NOTE:
	 * Leave the anchor on the list, even if the end of the list has
	 * been reached.  ipfw_track_expire_more_dispatch() will handle
	 * the removal.
	 */
	return (expired);
}

/*
 * Find or create the track for flow "id" under "rule", keyed by the
 * address/port fields selected in limit_mask.  A new track shares a
 * global, reference-counted counter (ipfw_trkcnt) with the matching
 * tracks on other CPUs; the counter is found or inserted in the
 * token-protected global RB tree.  When the global counter limit is
 * hit, reaping is attempted here and triggered on the other CPUs via
 * IPIs.  Returns NULL on allocation failure or overflow; otherwise
 * the track with its expiry refreshed.
 */
static struct ipfw_track *
ipfw_track_alloc(struct ipfw_context *ctx, const struct ipfw_flow_id *id,
    uint16_t limit_mask, struct ip_fw *rule)
{
	struct ipfw_track *key, *t, *dup;
	struct ipfw_trkcnt *trk, *ret;
	boolean_t do_expire = FALSE;

	KASSERT(rule->track_ruleid != 0,
	    ("rule %u has no track ruleid", rule->rulenum));

	key = &ctx->ipfw_track_tmpkey;
	key->t_proto = id->proto;
	key->t_addrs = 0;
	key->t_ports = 0;
	key->t_rule = rule;
	if (limit_mask & DYN_SRC_ADDR)
		key->t_saddr = id->src_ip;
	if (limit_mask & DYN_DST_ADDR)
		key->t_daddr = id->dst_ip;
	if (limit_mask & DYN_SRC_PORT)
		key->t_sport = id->src_port;
	if (limit_mask & DYN_DST_PORT)
		key->t_dport = id->dst_port;

	t = RB_FIND(ipfw_track_tree, &ctx->ipfw_track_tree, key);
	if (t != NULL)
		goto done;

	t = kmalloc(sizeof(*t), M_IPFW, M_INTWAIT | M_NULLOK);
	if (t == NULL) {
		ctx->ipfw_tks_nomem++;
		return (NULL);
	}

	t->t_key = key->t_key;
	t->t_rule = rule;
	t->t_lastexp = 0;
	LIST_INIT(&t->t_state_list);

	if (ipfw_gd.ipfw_trkcnt_cnt >= ipfw_track_max) {
		time_t globexp, uptime;

		trk = NULL;
		do_expire = TRUE;

		/*
		 * Do not expire globally more than once per second,
		 * it is useless.
		 */
		uptime = time_uptime;
		globexp = ipfw_gd.ipfw_track_globexp;
		if (globexp != uptime &&
		    atomic_cmpset_long(&ipfw_gd.ipfw_track_globexp,
		    globexp, uptime)) {
			int cpu;

			/* Expire tracks on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_track_expire_ipifunc, NULL);
			}
		}
	} else {
		trk = ipfw_trkcnt_alloc(ctx);
	}
	if (trk == NULL) {
		struct ipfw_trkcnt *tkey;

		tkey = &ctx->ipfw_trkcnt_tmpkey;
		key = NULL; /* tkey overlaps key */

		tkey->tc_key = t->t_key;
		tkey->tc_ruleid = rule->track_ruleid;

		IPFW_TRKCNT_TOKGET;
		trk = RB_FIND(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    tkey);
		if (trk == NULL) {
			IPFW_TRKCNT_TOKREL;
			if (do_expire) {
				ctx->ipfw_tks_reap++;
				if (ipfw_track_reap(ctx) > 0) {
					if (ipfw_gd.ipfw_trkcnt_cnt <
					    ipfw_track_max) {
						trk = ipfw_trkcnt_alloc(ctx);
						if (trk != NULL)
							goto install;
						ctx->ipfw_tks_cntnomem++;
					} else {
						ctx->ipfw_tks_overflow++;
					}
				} else {
					ctx->ipfw_tks_reapfailed++;
					ctx->ipfw_tks_overflow++;
				}
			} else {
				ctx->ipfw_tks_cntnomem++;
			}
			kfree(t, M_IPFW);
			return (NULL);
		}
		KASSERT(trk->tc_refs > 0 && trk->tc_refs < netisr_ncpus,
		    ("invalid trkcnt refs %d", trk->tc_refs));
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	} else {
install:
		trk->tc_key = t->t_key;
		trk->tc_ruleid = rule->track_ruleid;
		trk->tc_refs = 0;
		trk->tc_count = 0;
		trk->tc_expire = 0;
		trk->tc_rulenum = rule->rulenum;

		IPFW_TRKCNT_TOKGET;
		ret = RB_INSERT(ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree,
		    trk);
		if (ret != NULL) {
			/* Lost the race; reuse the existing counter. */
			KASSERT(ret->tc_refs > 0 &&
			    ret->tc_refs < netisr_ncpus,
			    ("invalid trkcnt refs %d", ret->tc_refs));
			KASSERT(ctx->ipfw_trkcnt_spare == NULL,
			    ("trkcnt spare was installed"));
			ctx->ipfw_trkcnt_spare = trk;
			trk = ret;
		} else {
			ipfw_gd.ipfw_trkcnt_cnt++;
		}
		atomic_add_int(&trk->tc_refs, 1);
		IPFW_TRKCNT_TOKREL;
	}

	t->t_count = &trk->tc_count;
	t->t_trkcnt = trk;

	dup = RB_INSERT(ipfw_track_tree, &ctx->ipfw_track_tree, t);
	if (dup != NULL)
		panic("ipfw: track exists");
	TAILQ_INSERT_TAIL(&ctx->ipfw_track_list, t, t_link);
done:
	t->t_expire = time_uptime + dyn_short_lifetime;
	return (t);
}

/*
 * Install state for rule type cmd->o.opcode
 *
 * Returns 1 (failure) if state is not installed because of errors or because
 * states limitations are enforced.
 */
static int
ipfw_state_install(struct ipfw_context *ctx, struct ip_fw *rule,
    ipfw_insn_limit *cmd, struct ip_fw_args *args, const struct tcphdr *tcp)
{
	struct ipfw_state *s;
	struct ipfw_track *t;
	int count, diff;

	/* Over the limit?  Try to reap on this CPU, then the others. */
	if (ipfw_gd.ipfw_state_loosecnt >= ipfw_state_max &&
	    (diff = (ipfw_state_cntsync() - ipfw_state_max)) >= 0) {
		boolean_t overflow = TRUE;

		ctx->ipfw_sts_reap++;
		if (ipfw_state_reap(ctx, diff) == 0)
			ctx->ipfw_sts_reapfailed++;
		if (ipfw_state_cntsync() < ipfw_state_max)
			overflow = FALSE;

		if (overflow) {
			time_t globexp, uptime;
			int cpu;

			/*
			 * Do not expire globally more than once per second,
			 * it is useless.
			 */
			uptime = time_uptime;
			globexp = ipfw_gd.ipfw_state_globexp;
			if (globexp == uptime ||
			    !atomic_cmpset_long(&ipfw_gd.ipfw_state_globexp,
			    globexp, uptime)) {
				ctx->ipfw_sts_overflow++;
				return (1);
			}

			/* Expire states on other CPUs. */
			for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
				if (cpu == mycpuid)
					continue;
				lwkt_send_ipiq(globaldata_find(cpu),
				    ipfw_state_expire_ipifunc, NULL);
			}
			ctx->ipfw_sts_overflow++;
			return (1);
		}
	}

	switch (cmd->o.opcode) {
	case O_KEEP_STATE: /* bidir rule */
		s = ipfw_state_add(ctx, &args->f_id, O_KEEP_STATE, rule, NULL,
		    tcp);
		if (s == NULL)
			return (1);
		break;

	case O_LIMIT: /* limit number of sessions */
		t = ipfw_track_alloc(ctx, &args->f_id, cmd->limit_mask, rule);
		if (t == NULL)
			return (1);

		if (*t->t_count >= cmd->conn_limit) {
			if (!ipfw_track_state_expire(ctx, t, TRUE))
				return (1);
		}
		/* CAS loop: reserve a slot in the shared counter. */
		for (;;) {
			count = *t->t_count;
			if (count >= cmd->conn_limit)
				return (1);
			if (atomic_cmpset_int(t->t_count, count, count + 1))
				break;
		}

		s = ipfw_state_add(ctx, &args->f_id, O_LIMIT, rule, t, tcp);
		if (s == NULL) {
			/* Undo damage. */
			atomic_subtract_int(t->t_count, 1);
			return (1);
		}
		break;

	default:
		panic("unknown state type %u\n", cmd->o.opcode);
	}
	return (0);
}

/*
 * Transmit a TCP packet, containing either a RST or a keepalive.
 * When flags & TH_RST, we are sending a RST packet, because of a
 * "reset" action matched the packet.
 * Otherwise we are sending a keepalive, and flags & TH_
 *
 * Only {src,dst}_{ip,port} of "id" are used.
 */
static void
send_pkt(const struct ipfw_flow_id *id, uint32_t seq, uint32_t ack, int flags)
{
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *tcp;
	struct route sro;	/* fake route */

	MGETHDR(m, M_NOWAIT, MT_HEADER);
	if (m == NULL)
		return;		/* best effort: silently drop on no mbuf */
	m->m_pkthdr.rcvif = NULL;
	m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
	m->m_data += max_linkhdr;

	ip = mtod(m, struct ip *);
	bzero(ip, m->m_len);
	tcp = (struct tcphdr *)(ip + 1); /* no IP options */
	ip->ip_p = IPPROTO_TCP;
	tcp->th_off = 5;

	/*
	 * Assume we are sending a RST (or a keepalive in the reverse
	 * direction), swap src and destination addresses and ports.
	 */
	ip->ip_src.s_addr = htonl(id->dst_ip);
	ip->ip_dst.s_addr = htonl(id->src_ip);
	tcp->th_sport = htons(id->dst_port);
	tcp->th_dport = htons(id->src_port);
	if (flags & TH_RST) {	/* we are sending a RST */
		if (flags & TH_ACK) {
			/* Offender sent an ACK: reset using its ack. */
			tcp->th_seq = htonl(ack);
			tcp->th_ack = htonl(0);
			tcp->th_flags = TH_RST;
		} else {
			/* A SYN consumes one sequence number. */
			if (flags & TH_SYN)
				seq++;
			tcp->th_seq = htonl(0);
			tcp->th_ack = htonl(seq);
			tcp->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * We are sending a keepalive. flags & TH_SYN determines
		 * the direction, forward if set, reverse if clear.
		 * NOTE: seq and ack are always assumed to be correct
		 * as set by the caller. This may be confusing...
		 */
		if (flags & TH_SYN) {
			/*
			 * we have to rewrite the correct addresses!
			 */
			ip->ip_dst.s_addr = htonl(id->dst_ip);
			ip->ip_src.s_addr = htonl(id->src_ip);
			tcp->th_dport = htons(id->dst_port);
			tcp->th_sport = htons(id->src_port);
		}
		tcp->th_seq = htonl(seq);
		tcp->th_ack = htonl(ack);
		tcp->th_flags = TH_ACK;
	}

	/*
	 * set ip_len to the payload size so we can compute
	 * the tcp checksum on the pseudoheader
	 * XXX check this, could save a couple of words ?
	 */
	ip->ip_len = htons(sizeof(struct tcphdr));
	tcp->th_sum = in_cksum(m, m->m_pkthdr.len);

	/*
	 * now fill fields left out earlier
	 */
	ip->ip_ttl = ip_defttl;
	ip->ip_len = m->m_pkthdr.len;	/* host order here, per ip_output */

	bzero(&sro, sizeof(sro));
	ip_rtaddr(ip->ip_dst, &sro);

	/* Mark as ours so ipfw_chk() passes it without re-inspection. */
	m->m_pkthdr.fw_flags |= IPFW_MBUF_GENERATED;
	ip_output(m, NULL, &sro, 0, NULL, NULL);
	if (sro.ro_rt)
		RTFREE(sro.ro_rt);
}

/*
 * Send a reject message, consuming the mbuf passed as an argument.
 */
static void
send_reject(struct ip_fw_args *args, int code, int offset, int ip_len)
{
	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
		/* We need the IP header in host order for icmp_error(). */
		if (args->eh != NULL) {
			/* Layer-2 path kept the header in wire order. */
			struct ip *ip = mtod(args->m, struct ip *);

			ip->ip_len = ntohs(ip->ip_len);
			ip->ip_off = ntohs(ip->ip_off);
		}
		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
	} else if (offset == 0 && args->f_id.proto == IPPROTO_TCP) {
		/* Only reset the first fragment of a TCP segment. */
		struct tcphdr *const tcp =
		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));

		/* Never answer a RST with a RST. */
		if ((tcp->th_flags & TH_RST) == 0) {
			send_pkt(&args->f_id, ntohl(tcp->th_seq),
			    ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
		}
		m_freem(args->m);
	} else {
		m_freem(args->m);
	}
	args->m = NULL;
}

/*
 * Given an ip_fw *, lookup_next_rule will return a pointer
 * to the next rule, which can be either the jump
 * target (for skipto instructions) or the next one in the list (in
 * all other cases including a missing jump target).
 * The result is also written in the "next_rule" field of the rule.
 * Backward jumps are not allowed, so start looking from the next
 * rule...
 *
 * This never returns NULL -- in case we do not have an exact match,
 * the next rule is returned. When the ruleset is changed,
 * pointers are flushed so we are always correct.
 */
static struct ip_fw *
lookup_next_rule(struct ip_fw *me)
{
	struct ip_fw *rule = NULL;
	ipfw_insn *cmd;

	/* look for action, in case it is a skipto */
	cmd = ACTION_PTR(me);
	if (cmd->opcode == O_LOG)	/* O_LOG precedes the real action */
		cmd += F_LEN(cmd);
	if (cmd->opcode == O_SKIPTO) {
		/* Forward scan only: backward jumps are not allowed. */
		for (rule = me->next; rule; rule = rule->next) {
			if (rule->rulenum >= cmd->arg1)
				break;
		}
	}
	if (rule == NULL)	/* failure or not a skipto */
		rule = me->next;
	me->next_rule = rule;	/* cache for subsequent packets */
	return rule;
}

/*
 * Match the O_UID/O_GID opcode by locating the PCB owning flow "fid"
 * on this CPU and testing its socket credential:  for O_UID, does the
 * socket belong to user "uid"; for O_GID, is "uid" (used as a gid) one
 * of the credential's groups.  Flows that are neither TCP nor UDP
 * never match.
 */
static int
ipfw_match_uid(const struct ipfw_flow_id *fid, struct ifnet *oif,
    enum ipfw_opcodes opcode, uid_t uid)
{
	struct in_addr src_ip, dst_ip;
	struct inpcbinfo *pi;
	boolean_t wildcard;
	struct inpcb *pcb;

	if (fid->proto == IPPROTO_TCP) {
		wildcard = FALSE;	/* TCP: exact 4-tuple lookup */
		pi = &tcbinfo[mycpuid];
	} else if (fid->proto == IPPROTO_UDP) {
		wildcard = TRUE;	/* UDP: allow wildcard-bound PCBs */
		pi = &udbinfo[mycpuid];
	} else {
		return 0;
	}

	/*
	 * Values in 'fid' are in host byte order
	 */
	dst_ip.s_addr = htonl(fid->dst_ip);
	src_ip.s_addr = htonl(fid->src_ip);
	if (oif) {
		/* Outbound: local end is the source of the flow. */
		pcb = in_pcblookup_hash(pi,
		    dst_ip, htons(fid->dst_port),
		    src_ip, htons(fid->src_port),
		    wildcard, oif);
	} else {
		/* Inbound: local end is the destination of the flow. */
		pcb = in_pcblookup_hash(pi,
		    src_ip, htons(fid->src_port),
		    dst_ip, htons(fid->dst_port),
		    wildcard, NULL);
	}
	if (pcb == NULL || pcb->inp_socket == NULL)
		return 0;

	if (opcode == O_UID) {
#define socheckuid(a,b)	((a)->so_cred->cr_uid != (b))
		return !socheckuid(pcb->inp_socket, uid);
#undef socheckuid
	} else {
		return groupmember(uid, pcb->inp_socket->so_cred);
	}
}

/*
 * The main check routine for the firewall.
 *
 * All arguments are in args so we can modify them and return them
 * back to the caller.
 *
 * Parameters:
 *
 *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
 *		Starts with the IP header.
 *	args->eh (in)	Mac header if present, or NULL for layer3 packet.
 *	args->oif	Outgoing interface, or NULL if packet is incoming.
 *		The incoming interface is in the mbuf. (in)
 *
 *	args->rule	Pointer to the last matching rule (in/out)
 *	args->f_id	Addresses grabbed from the packet (out)
 *
 * Return value:
 *
 *	If the packet was denied/rejected and has been dropped, *m is equal
 *	to NULL upon return.
 *
 *	IP_FW_DENY	the packet must be dropped.
 *	IP_FW_PASS	The packet is to be accepted and routed normally.
 *	IP_FW_DIVERT	Divert the packet to port (args->cookie)
 *	IP_FW_TEE	Tee the packet to port (args->cookie)
 *	IP_FW_DUMMYNET	Send the packet to pipe/queue (args->cookie)
 */
static int
ipfw_chk(struct ip_fw_args *args)
{
	/*
	 * Local variables hold state during the processing of a packet.
	 *
	 * IMPORTANT NOTE: to speed up the processing of rules, there
	 * are some assumption on the values of the variables, which
	 * are documented here. Should you change them, please check
	 * the implementation of the various instructions to make sure
	 * that they still work.
	 *
	 * args->eh	The MAC header. It is non-null for a layer2
	 *	packet, it is NULL for a layer-3 packet.
	 *
	 * m | args->m	Pointer to the mbuf, as received from the caller.
	 *	It may change if ipfw_chk() does an m_pullup, or if it
	 *	consumes the packet because it calls send_reject().
	 *	XXX This has to change, so that ipfw_chk() never modifies
	 *	or consumes the buffer.
	 *	ip is simply an alias of the value of m, and it is kept
	 *	in sync with it (the packet is supposed to start with
	 *	the ip header).
	 */
	struct mbuf *m = args->m;
	struct ip *ip = mtod(m, struct ip *);

	/*
	 * oif | args->oif	If NULL, ipfw_chk has been called on the
	 *	inbound path (ether_input, ip_input).
	 *	If non-NULL, ipfw_chk has been called on the outbound path
	 *	(ether_output, ip_output).
	 */
	struct ifnet *oif = args->oif;

	struct ip_fw *f = NULL;		/* matching rule */
	int retval = IP_FW_PASS;
	struct m_tag *mtag;
	struct divert_info *divinfo;

	/*
	 * hlen	The length of the IPv4 header.
	 *	hlen >0 means we have an IPv4 packet.
	 */
	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */

	/*
	 * offset	The offset of a fragment. offset != 0 means that
	 *	we have a fragment at this offset of an IPv4 packet.
	 *	offset == 0 means that (if this is an IPv4 packet)
	 *	this is the first or only fragment.
	 */
	u_short offset = 0;

	/*
	 * Local copies of addresses. They are only valid if we have
	 * an IP packet.
	 *
	 * proto	The protocol. Set to 0 for non-ip packets,
	 *	or to the protocol read from the packet otherwise.
	 *	proto != 0 means that we have an IPv4 packet.
	 *
	 * src_port, dst_port	port numbers, in HOST format. Only
	 *	valid for TCP and UDP packets.
	 *
	 * src_ip, dst_ip	ip addresses, in NETWORK format.
	 *	Only valid for IPv4 packets.
	 */
	uint8_t proto;
	uint16_t src_port = 0, dst_port = 0;	/* NOTE: host format */
	struct in_addr src_ip, dst_ip;		/* NOTE: network format */
	uint16_t ip_len = 0;

	/*
	 * dyn_dir = MATCH_UNKNOWN when rules unchecked,
	 *	MATCH_NONE when checked and not matched (dyn_f = NULL),
	 *	MATCH_FORWARD or MATCH_REVERSE otherwise (dyn_f != NULL)
	 */
	int dyn_dir = MATCH_UNKNOWN;
	struct ip_fw *dyn_f = NULL;
	int cpuid = mycpuid;
	struct ipfw_context *ctx;

	ASSERT_NETISR_NCPUS(cpuid);
	ctx = ipfw_ctx[cpuid];

	/* Packets we generated ourselves (RST/keepalive) always pass. */
	if (m->m_pkthdr.fw_flags & IPFW_MBUF_GENERATED)
		return IP_FW_PASS;	/* accept */

	if (args->eh == NULL ||		/* layer 3 packet */
	    (m->m_pkthdr.len >= sizeof(struct ip) &&
	     ntohs(args->eh->ether_type) == ETHERTYPE_IP))
		hlen = ip->ip_hl << 2;

	/*
	 * Collect parameters into local variables for faster matching.
	 */
	if (hlen == 0) {	/* do not grab addresses for non-ip pkts */
		proto = args->f_id.proto = 0;	/* mark f_id invalid */
		goto after_ip_checks;
	}

	proto = args->f_id.proto = ip->ip_p;
	src_ip = ip->ip_src;
	dst_ip = ip->ip_dst;
	if (args->eh != NULL) { /* layer 2 packets are as on the wire */
		offset = ntohs(ip->ip_off) & IP_OFFMASK;
		ip_len = ntohs(ip->ip_len);
	} else {
		/* layer-3 path: ip_input already converted to host order */
		offset = ip->ip_off & IP_OFFMASK;
		ip_len = ip->ip_len;
	}

#define PULLUP_TO(len)				\
do {						\
	if (m->m_len < (len)) {			\
		args->m = m = m_pullup(m, (len));\
		if (m == NULL)			\
			goto pullup_failed;	\
		ip = mtod(m, struct ip *);	\
	}					\
} while (0)

	if (offset == 0) {
		/* First/only fragment: transport header is accessible. */
		switch (proto) {
		case IPPROTO_TCP:
		{
			struct tcphdr *tcp;

			PULLUP_TO(hlen + sizeof(struct tcphdr));
			tcp = L3HDR(struct tcphdr, ip);
			dst_port = tcp->th_dport;
			src_port = tcp->th_sport;
			args->f_id.flags = tcp->th_flags;
		}
		break;

		case IPPROTO_UDP:
		{
			struct udphdr *udp;

			PULLUP_TO(hlen + sizeof(struct udphdr));
			udp = L3HDR(struct udphdr, ip);
			dst_port = udp->uh_dport;
			src_port = udp->uh_sport;
		}
		break;

		case IPPROTO_ICMP:
			PULLUP_TO(hlen + 4);	/* type, code and checksum. */
			args->f_id.flags = L3HDR(struct icmp, ip)->icmp_type;
			break;

		default:
			break;
		}
	}

#undef PULLUP_TO

	args->f_id.src_ip = ntohl(src_ip.s_addr);
	args->f_id.dst_ip = ntohl(dst_ip.s_addr);
	args->f_id.src_port = src_port = ntohs(src_port);
	args->f_id.dst_port = dst_port = ntohs(dst_port);

after_ip_checks:
	if (args->rule) {
		/*
		 * Packet has already been tagged. Look for the next rule
		 * to restart processing.
		 *
		 * If fw_one_pass != 0 then just accept it.
		 * XXX should not happen here, but optimized out in
		 * the caller.
		 */
		if (fw_one_pass)
			return IP_FW_PASS;

		/* This rule is being/has been flushed */
		if (ipfw_flushing)
			return IP_FW_DENY;

		KASSERT(args->rule->cpuid == cpuid,
		    ("rule used on cpu%d", cpuid));

		/* This rule was deleted */
		if (args->rule->rule_flags & IPFW_RULE_F_INVALID)
			return IP_FW_DENY;

		f = args->rule->next_rule;
		if (f == NULL)
			f = lookup_next_rule(args->rule);
	} else {
		/*
		 * Find the starting rule. It can be either the first
		 * one, or the one after divert_rule if asked so.
		 */
		int skipto;

		mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
		if (mtag != NULL) {
			divinfo = m_tag_data(mtag);
			skipto = divinfo->skipto;
		} else {
			skipto = 0;
		}

		f = ctx->ipfw_layer3_chain;
		if (args->eh == NULL && skipto != 0) {
			/* No skipto during rule flushing */
			if (ipfw_flushing)
				return IP_FW_DENY;

			if (skipto >= IPFW_DEFAULT_RULE)
				return IP_FW_DENY;	/* invalid */

			while (f && f->rulenum <= skipto)
				f = f->next;
			if (f == NULL)	/* drop packet */
				return IP_FW_DENY;
		} else if (ipfw_flushing) {
			/* Rules are being flushed; skip to default rule */
			f = ctx->ipfw_default_rule;
		}
	}
	/* The divert tag, if any, has served its purpose; drop it. */
	if ((mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL)) != NULL)
		m_tag_delete(m, mtag);

	/*
	 * Now scan the rules, and parse microinstructions for each rule.
	 */
	for (; f; f = f->next) {
		int l, cmdlen;
		ipfw_insn *cmd;
		int skip_or; /* skip rest of OR block */

again:
		if (ctx->ipfw_set_disable & (1 << f->set))
			continue;

		skip_or = 0;
		for (l = f->cmd_len, cmd = f->cmd; l > 0;
		     l -= cmdlen, cmd += cmdlen) {
			int match;

			/*
			 * check_body is a jump target used when we find a
			 * CHECK_STATE, and need to jump to the body of
			 * the target rule.
			 */

check_body:
			cmdlen = F_LEN(cmd);
			/*
			 * An OR block (insn_1 || .. || insn_n) has the
			 * F_OR bit set in all but the last instruction.
			 * The first match will set "skip_or", and cause
			 * the following instructions to be skipped until
			 * past the one with the F_OR bit clear.
			 */
			if (skip_or) {		/* skip this instruction */
				if ((cmd->len & F_OR) == 0)
					skip_or = 0;	/* next one is good */
				continue;
			}
			match = 0; /* set to 1 if we succeed */

			switch (cmd->opcode) {
			/*
			 * The first set of opcodes compares the packet's
			 * fields with some pattern, setting 'match' if a
			 * match is found. At the end of the loop there is
			 * logic to deal with F_NOT and F_OR flags associated
			 * with the opcode.
			 */
			case O_NOP:
				match = 1;
				break;

			case O_FORWARD_MAC:
				kprintf("ipfw: opcode %d unimplemented\n",
					cmd->opcode);
				break;

			case O_GID:
			case O_UID:
				/*
				 * We only check offset == 0 && proto != 0,
				 * as this ensures that we have an IPv4
				 * packet with the ports info.
				 */
				if (offset!=0)
					break;

				match = ipfw_match_uid(&args->f_id, oif,
					cmd->opcode,
					(uid_t)((ipfw_insn_u32 *)cmd)->d[0]);
				break;

			case O_RECV:
				match = iface_match(m->m_pkthdr.rcvif,
				    (ipfw_insn_if *)cmd);
				break;

			case O_XMIT:
				match = iface_match(oif, (ipfw_insn_if *)cmd);
				break;

			case O_VIA:
				/* Match whichever interface is in use. */
				match = iface_match(oif ? oif :
				    m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
				break;

			case O_MACADDR2:
				if (args->eh != NULL) {	/* have MAC header */
					uint32_t *want = (uint32_t *)
					    ((ipfw_insn_mac *)cmd)->addr;
					uint32_t *mask = (uint32_t *)
					    ((ipfw_insn_mac *)cmd)->mask;
					uint32_t *hdr = (uint32_t *)args->eh;

					/* dst+src MACs as 3 32-bit words */
					match =
					    (want[0] == (hdr[0] & mask[0]) &&
					     want[1] == (hdr[1] & mask[1]) &&
					     want[2] == (hdr[2] & mask[2]));
				}
				break;

			case O_MAC_TYPE:
				if (args->eh != NULL) {
					uint16_t t =
					    ntohs(args->eh->ether_type);
					uint16_t *p =
					    ((ipfw_insn_u16 *)cmd)->ports;
					int i;

					/* Special vlan handling */
					if (m->m_flags & M_VLANTAG)
						t = ETHERTYPE_VLAN;

					/* Scan the list of type ranges. */
					for (i = cmdlen - 1; !match && i > 0;
					     i--, p += 2) {
						match =
						    (t >= p[0] && t <= p[1]);
					}
				}
				break;

			case O_FRAG:
				match = (hlen > 0 && offset != 0);
				break;

			case O_IN:	/* "out" is "not in" */
				match = (oif == NULL);
				break;

			case O_LAYER2:
				match = (args->eh != NULL);
				break;

			case O_PROTO:
				/*
				 * We do not allow an arg of 0 so the
				 * check of "proto" only suffices.
				 */
				match = (proto == cmd->arg1);
				break;

			case O_IP_SRC:
				match = (hlen > 0 &&
				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
				    src_ip.s_addr);
				break;

			case O_IP_SRC_MASK:
				match = (hlen > 0 &&
				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
				    (src_ip.s_addr &
				     ((ipfw_insn_ip *)cmd)->mask.s_addr));
				break;

			case O_IP_SRC_ME:
				if (hlen > 0) {
					struct ifnet *tif;

					/* "me": any local address */
					tif = INADDR_TO_IFP(&src_ip);
					match = (tif != NULL);
				}
				break;

			case O_IP_DST_SET:
			case O_IP_SRC_SET:
				if (hlen > 0) {
					/* Bitmap of addresses after base */
					uint32_t *d = (uint32_t *)(cmd + 1);
					uint32_t addr =
					    cmd->opcode == O_IP_DST_SET ?
					    args->f_id.dst_ip :
					    args->f_id.src_ip;

					if (addr < d[0])
						break;
					addr -= d[0]; /* subtract base */
					match =
					    (addr < cmd->arg1) &&
					    (d[1 + (addr >> 5)] &
					     (1 << (addr & 0x1f)));
				}
				break;

			case O_IP_DST:
				match = (hlen > 0 &&
				    ((ipfw_insn_ip *)cmd)->addr.s_addr ==
				    dst_ip.s_addr);
				break;

			case O_IP_DST_MASK:
				match = (hlen > 0) &&
				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
				     (dst_ip.s_addr &
				      ((ipfw_insn_ip *)cmd)->mask.s_addr));
				break;

			case O_IP_DST_ME:
				if (hlen > 0) {
					struct ifnet *tif;

					tif = INADDR_TO_IFP(&dst_ip);
					match = (tif != NULL);
				}
				break;

			case O_IP_SRCPORT:
			case O_IP_DSTPORT:
				/*
				 * offset == 0 && proto != 0 is enough
				 * to guarantee that we have an IPv4
				 * packet with port info.
				 */
				if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
				    && offset == 0) {
					uint16_t x =
					    (cmd->opcode == O_IP_SRCPORT) ?
						src_port : dst_port ;
					uint16_t *p =
					    ((ipfw_insn_u16 *)cmd)->ports;
					int i;

					/* Scan the list of port ranges. */
					for (i = cmdlen - 1; !match && i > 0;
					     i--, p += 2) {
						match =
						    (x >= p[0] && x <= p[1]);
					}
				}
				break;

			case O_ICMPTYPE:
				match = (offset == 0 && proto==IPPROTO_ICMP &&
				    icmptype_match(ip, (ipfw_insn_u32 *)cmd));
				break;

			case O_IPOPT:
				match = (hlen > 0 && ipopts_match(ip, cmd));
				break;

			case O_IPVER:
				match = (hlen > 0 && cmd->arg1 == ip->ip_v);
				break;

			case O_IPTTL:
				match = (hlen > 0 && cmd->arg1 == ip->ip_ttl);
				break;

			case O_IPID:
				match = (hlen > 0 &&
				    cmd->arg1 == ntohs(ip->ip_id));
				break;

			case O_IPLEN:
				match = (hlen > 0 && cmd->arg1 == ip_len);
				break;

			case O_IPPRECEDENCE:
				/* Precedence is the top 3 bits of TOS. */
				match = (hlen > 0 &&
				    (cmd->arg1 == (ip->ip_tos & 0xe0)));
				break;

			case O_IPTOS:
				match = (hlen > 0 &&
				    flags_match(cmd, ip->ip_tos));
				break;

			case O_TCPFLAGS:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    flags_match(cmd,
					L3HDR(struct tcphdr,ip)->th_flags));
				break;

			case O_TCPOPTS:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    tcpopts_match(ip, cmd));
				break;

			case O_TCPSEQ:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    ((ipfw_insn_u32 *)cmd)->d[0] ==
					L3HDR(struct tcphdr,ip)->th_seq);
				break;

			case O_TCPACK:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    ((ipfw_insn_u32 *)cmd)->d[0] ==
					L3HDR(struct tcphdr,ip)->th_ack);
				break;

			case O_TCPWIN:
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    cmd->arg1 ==
					L3HDR(struct tcphdr,ip)->th_win);
				break;

			case O_ESTAB:
				/* reject packets which have SYN only */
				/* XXX should i also check for TH_ACK ? */
				match = (proto == IPPROTO_TCP && offset == 0 &&
				    (L3HDR(struct tcphdr,ip)->th_flags &
				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
				break;

			case O_LOG:
				if (fw_verbose) {
					ipfw_log(ctx, f, hlen, args->eh, m,
					    oif);
				}
				match = 1;
				break;

			case O_PROB:
				match = (krandom() <
				    ((ipfw_insn_u32 *)cmd)->d[0]);
				break;

			/*
			 * The second set of opcodes represents 'actions',
			 * i.e. the terminal part of a rule once the packet
			 * matches all previous patterns.
			 * Typically there is only one action for each rule,
			 * and the opcode is stored at the end of the rule
			 * (but there are exceptions -- see below).
			 *
			 * In general, here we set retval and terminate the
			 * outer loop (would be a 'break 3' in some language,
			 * but we need to do a 'goto done').
			 *
			 * Exceptions:
			 * O_COUNT and O_SKIPTO actions:
			 *   instead of terminating, we jump to the next rule
			 *   ('goto next_rule', equivalent to a 'break 2'),
			 *   or to the SKIPTO target ('goto again' after
			 *   having set f, cmd and l), respectively.
			 *
			 * O_LIMIT and O_KEEP_STATE: these opcodes are
			 *   not real 'actions', and are stored right
			 *   before the 'action' part of the rule.
			 *   These opcodes try to install an entry in the
			 *   state tables; if successful, we continue with
			 *   the next opcode (match=1; break;), otherwise
			 *   the packet must be dropped ('goto done' after
			 *   setting retval).  If static rules are changed
			 *   during the state installation, the packet will
			 *   be dropped and rule's stats will not beupdated
			 *   ('return IP_FW_DENY').
			 *
			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
			 *   cause a lookup of the state table, and a jump
			 *   to the 'action' part of the parent rule
			 *   ('goto check_body') if an entry is found, or
			 *   (CHECK_STATE only) a jump to the next rule if
			 *   the entry is not found ('goto next_rule').
			 *   The result of the lookup is cached to make
			 *   further instances of these opcodes are
			 *   effectively NOPs.  If static rules are changed
			 *   during the state looking up, the packet will
			 *   be dropped and rule's stats will not be updated
			 *   ('return IP_FW_DENY').
			 */
			case O_LIMIT:
			case O_KEEP_STATE:
				if (ipfw_state_install(ctx, f,
				    (ipfw_insn_limit *)cmd, args,
				    (offset == 0 && proto == IPPROTO_TCP) ?
				    L3HDR(struct tcphdr, ip) : NULL)) {
					retval = IP_FW_DENY;
					goto done; /* error/limit violation */
				}
				match = 1;
				break;

			case O_PROBE_STATE:
			case O_CHECK_STATE:
				/*
				 * States are checked at the first keep-state
				 * check-state occurrence, with the result
				 * being stored in dyn_dir. The compiler
				 * introduces a PROBE_STATE instruction for
				 * us when we have a KEEP_STATE/LIMIT (because
				 * PROBE_STATE needs to be run first).
				 */
				if (dyn_dir == MATCH_UNKNOWN) {
					dyn_f = ipfw_state_lookup_rule(ctx,
					    &args->f_id, &dyn_dir,
					    (offset == 0 &&
					     proto == IPPROTO_TCP) ?
					    L3HDR(struct tcphdr, ip) : NULL,
					    ip_len);
					if (dyn_f != NULL) {
						/*
						 * Found a rule from a state;
						 * jump to the 'action' part
						 * of the rule.
						 */
						f = dyn_f;
						cmd = ACTION_PTR(f);
						l = f->cmd_len - f->act_ofs;
						goto check_body;
					}
				}
				/*
				 * State not found. If CHECK_STATE, skip to
				 * next rule, if PROBE_STATE just ignore and
				 * continue with next opcode.
				 */
				if (cmd->opcode == O_CHECK_STATE)
					goto next_rule;
				match = 1;
				break;

			case O_ACCEPT:
				retval = IP_FW_PASS;	/* accept */
				goto done;

			case O_PIPE:
			case O_QUEUE:
				args->rule = f; /* report matching rule */
				args->cookie = cmd->arg1;
				retval = IP_FW_DUMMYNET;
				goto done;

			case O_DIVERT:
			case O_TEE:
				if (args->eh) /* not on layer 2 */
					break;

				mtag = m_tag_get(PACKET_TAG_IPFW_DIVERT,
				    sizeof(*divinfo), M_NOWAIT);
				if (mtag == NULL) {
					retval = IP_FW_DENY;
					goto done;
				}
				divinfo = m_tag_data(mtag);

				/* Remember where to resume after re-entry. */
				divinfo->skipto = f->rulenum;
				divinfo->port = cmd->arg1;
				divinfo->tee = (cmd->opcode == O_TEE);
				m_tag_prepend(m, mtag);

				args->cookie = cmd->arg1;
				retval = (cmd->opcode == O_DIVERT) ?
					 IP_FW_DIVERT : IP_FW_TEE;
				goto done;

			case O_COUNT:
			case O_SKIPTO:
				f->pcnt++;	/* update stats */
				f->bcnt += ip_len;
				f->timestamp = time_second;
				if (cmd->opcode == O_COUNT)
					goto next_rule;
				/* handle skipto */
				if (f->next_rule == NULL)
					lookup_next_rule(f);
				f = f->next_rule;
				goto again;

			case O_REJECT:
				/*
				 * Drop the packet and send a reject notice
				 * if the packet is not ICMP (or is an ICMP
				 * query), and it is not multicast/broadcast.
				 */
				if (hlen > 0 &&
				    (proto != IPPROTO_ICMP ||
				     is_icmp_query(ip)) &&
				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
					/*
					 * Update statistics before the
					 * possible blocking 'send_reject'
					 */
					f->pcnt++;
					f->bcnt += ip_len;
					f->timestamp = time_second;

					send_reject(args, cmd->arg1,
					    offset,ip_len);
					m = args->m;

					/*
					 * Return directly here, rule stats
					 * have been updated above.
					 */
					return IP_FW_DENY;
				}
				/* FALLTHROUGH */
			case O_DENY:
				retval = IP_FW_DENY;
				goto done;

			case O_FORWARD_IP:
				if (args->eh) /* not valid on layer2 pkts */
					break;
				if (!dyn_f || dyn_dir == MATCH_FORWARD) {
					struct sockaddr_in *sin;

					mtag = m_tag_get(PACKET_TAG_IPFORWARD,
					    sizeof(*sin), M_NOWAIT);
					if (mtag == NULL) {
						retval = IP_FW_DENY;
						goto done;
					}
					sin = m_tag_data(mtag);

					/* Structure copy */
					*sin = ((ipfw_insn_sa *)cmd)->sa;

					m_tag_prepend(m, mtag);
					m->m_pkthdr.fw_flags |=
						IPFORWARD_MBUF_TAGGED;
					m->m_pkthdr.fw_flags &=
						~BRIDGE_MBUF_TAGGED;
				}
				retval = IP_FW_PASS;
				goto done;

			default:
				panic("-- unknown opcode %d", cmd->opcode);
			} /* end of switch() on opcodes */

			if (cmd->len & F_NOT)
				match = !match;

			if (match) {
				if (cmd->len & F_OR)
					skip_or = 1;
			} else {
				if (!(cmd->len & F_OR)) /* not an OR block, */
					break;		/* try next rule    */
			}

		} /* end of inner for, scan opcodes */

next_rule:;	/* try next rule */

	} /* end of outer for, scan rules */
	kprintf("+++ ipfw: ouch!, skip past end of rules, denying packet\n");
	return IP_FW_DENY;

done:
	/* Update statistics */
	f->pcnt++;
	f->bcnt += ip_len;
	f->timestamp = time_second;
	return retval;

pullup_failed:
	if (fw_verbose)
		kprintf("pullup failed\n");
	return IP_FW_DENY;
}

/*
 * Hand a packet over to dummynet pipe/queue "pipe_nr": attach a
 * PACKET_TAG_DUMMYNET tag carrying the flow id, direction and the
 * matching rule (reference-counted; released via ipfw_unref_rule when
 * the tag is processed).  On tag allocation failure the packet is
 * simply freed.
 */
static void
ipfw_dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
{
	struct m_tag *mtag;
	struct dn_pkt *pkt;
	ipfw_insn *cmd;
	const struct ipfw_flow_id *id;
	struct dn_flow_id *fid;

	M_ASSERTPKTHDR(m);

	mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*pkt), M_NOWAIT);
	if (mtag == NULL) {
		m_freem(m);
		return;
	}
	m_tag_prepend(m, mtag);

	pkt = m_tag_data(mtag);
	bzero(pkt, sizeof(*pkt));

	/* Locate the rule's action, skipping a leading O_LOG if present. */
	cmd = fwa->rule->cmd + fwa->rule->act_ofs;
	if (cmd->opcode == O_LOG)
		cmd += F_LEN(cmd);
	KASSERT(cmd->opcode == O_PIPE || cmd->opcode == O_QUEUE,
	    ("Rule is not PIPE or QUEUE, opcode %d", cmd->opcode));

	pkt->dn_m = m;
	pkt->dn_flags = (dir & DN_FLAGS_DIR_MASK);
	pkt->ifp = fwa->oif;
	pkt->pipe_nr = pipe_nr;

	/* Remember origin so dummynet can re-inject on the right CPU. */
	pkt->cpuid = mycpuid;
	pkt->msgport = netisr_curport();

	id = &fwa->f_id;
	fid = &pkt->id;
	fid->fid_dst_ip = id->dst_ip;
	fid->fid_src_ip = id->src_ip;
	fid->fid_dst_port = id->dst_port;
	fid->fid_src_port = id->src_port;
	fid->fid_proto = id->proto;
	fid->fid_flags = id->flags;

	ipfw_ref_rule(fwa->rule);
	pkt->dn_priv = fwa->rule;
	pkt->dn_unref_priv = ipfw_unref_rule;

	if (cmd->opcode == O_PIPE)
		pkt->dn_flags |= DN_FLAGS_IS_PIPE;

	m->m_pkthdr.fw_flags |= DUMMYNET_MBUF_TAGGED;
}

/*
 * When a rule is added/deleted, clear the next_rule pointers in all rules.
 * These will be reconstructed on the fly as packets are matched.
 */
static void
ipfw_flush_rule_ptrs(struct ipfw_context *ctx)
{
	struct ip_fw *rule;

	/* Invalidate the cached skipto/next-match shortcuts. */
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
		rule->next_rule = NULL;
}

/*
 * Account one more static rule in the global counters.
 * Static rule statistics are kept only on CPU0.
 */
static __inline void
ipfw_inc_static_count(struct ip_fw *rule)
{
	/* Static rule's counts are updated only on CPU0 */
	KKASSERT(mycpuid == 0);

	static_count++;
	static_ioc_len += IOC_RULESIZE(rule);
}

/*
 * Undo ipfw_inc_static_count() for one rule; CPU0 only.
 */
static __inline void
ipfw_dec_static_count(struct ip_fw *rule)
{
	int l = IOC_RULESIZE(rule);

	/* Static rule's counts are updated only on CPU0 */
	KKASSERT(mycpuid == 0);

	KASSERT(static_count > 0, ("invalid static count %u", static_count));
	static_count--;

	KASSERT(static_ioc_len >= l,
	    ("invalid static len %u", static_ioc_len));
	static_ioc_len -= l;
}

/*
 * Link this CPU's copy of a rule to the previous CPU's copy through
 * the 'sibling' pointer, and remember it so the next CPU can link to
 * it in turn.  fwmsg->sibling carries the copy made on mycpuid - 1.
 */
static void
ipfw_link_sibling(struct netmsg_ipfw *fwmsg, struct ip_fw *rule)
{
	if (fwmsg->sibling != NULL) {
		KKASSERT(mycpuid > 0 && fwmsg->sibling->cpuid == mycpuid - 1);
		fwmsg->sibling->sibling = rule;
	}
	fwmsg->sibling = rule;
}

/*
 * Allocate and populate this CPU's private copy of a rule from the
 * userland ioctl image.  The copy starts with refcnt 1 and is stamped
 * with the owning CPU.
 */
static struct ip_fw *
ipfw_create_rule(const struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
	struct ip_fw *rule;

	rule = kmalloc(RULESIZE(ioc_rule), M_IPFW, M_WAITOK | M_ZERO);

	rule->act_ofs = ioc_rule->act_ofs;
	rule->cmd_len = ioc_rule->cmd_len;
	rule->rulenum = ioc_rule->rulenum;
	rule->set = ioc_rule->set;
	rule->usr_flags = ioc_rule->usr_flags;

	/* cmd_len counts 32-bit words, hence the * 4 */
	bcopy(ioc_rule->cmd, rule->cmd, rule->cmd_len * 4 /* XXX */);

	rule->refcnt = 1;
	rule->cpuid = mycpuid;
	rule->rule_flags = rule_flags;

	return rule;
}

/*
 * Per-CPU handler for rule addition; runs on every netisr in turn
 * (CPU0 first), inserting a private copy of the rule into this CPU's
 * chain at the position pre-computed by ipfw_add_rule().
 */
static void
ipfw_add_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = ipfw_create_rule(fwmsg->ioc_rule, fwmsg->rule_flags);

	/*
	 * Insert rule into the pre-determined position
	 */
	if (fwmsg->prev_rule != NULL) {
		struct ip_fw *prev, *next;

		prev = fwmsg->prev_rule;
		KKASSERT(prev->cpuid == mycpuid);

		next = fwmsg->next_rule;
		KKASSERT(next->cpuid == mycpuid);

		rule->next = next;
		prev->next = rule;

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		fwmsg->prev_rule = prev->sibling;
		fwmsg->next_rule = next->sibling;
	} else {
		/* Insert at the head of the chain. */
		KKASSERT(fwmsg->next_rule == NULL);
		rule->next = ctx->ipfw_layer3_chain;
		ctx->ipfw_layer3_chain = rule;
	}

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, rule);

	/* New rule invalidates cached next_rule shortcuts. */
	ipfw_flush_rule_ptrs(ctx);

	if (mycpuid == 0) {
		/* Statistics only need to be updated once */
		ipfw_inc_static_count(rule);

		/* Return the rule on CPU0 */
		nmsg->lmsg.u.ms_resultp = rule;
	}

	/*
	 * ms_resultp was set by CPU0 above, so every CPU stamps its copy
	 * with the address of the CPU0 copy as the shared track id.
	 */
	if (rule->rule_flags & IPFW_RULE_F_GENTRACK)
		rule->track_ruleid = (uintptr_t)nmsg->lmsg.u.ms_resultp;

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Add a new rule to the list. Copy the rule into a malloc'ed area,
 * then possibly create a rule number and add the rule to the list.
 * Update the rule_number in the input struct so the caller knows
 * it as well.
 */
static void
ipfw_add_rule(struct ipfw_ioc_rule *ioc_rule, uint32_t rule_flags)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_ipfw fwmsg;
	struct netmsg_base *nmsg;
	struct ip_fw *f, *prev, *rule;

	ASSERT_NETISR0;

	/*
	 * If rulenum is 0, find highest numbered rule before the
	 * default rule, and add rule number incremental step.
	 */
	if (ioc_rule->rulenum == 0) {
		int step = autoinc_step;

		KKASSERT(step >= IPFW_AUTOINC_STEP_MIN &&
			 step <= IPFW_AUTOINC_STEP_MAX);

		/*
		 * Locate the highest numbered rule before default
		 */
		for (f = ctx->ipfw_layer3_chain; f; f = f->next) {
			if (f->rulenum == IPFW_DEFAULT_RULE)
				break;
			ioc_rule->rulenum = f->rulenum;
		}
		/* Only bump if the result stays below the default rule. */
		if (ioc_rule->rulenum < IPFW_DEFAULT_RULE - step)
			ioc_rule->rulenum += step;
	}
	KASSERT(ioc_rule->rulenum != IPFW_DEFAULT_RULE &&
		ioc_rule->rulenum != 0,
		("invalid rule num %d", ioc_rule->rulenum));

	/*
	 * Now find the right place for the new rule in the sorted list.
	 */
	for (prev = NULL, f = ctx->ipfw_layer3_chain; f;
	     prev = f, f = f->next) {
		if (f->rulenum > ioc_rule->rulenum) {
			/* Found the location */
			break;
		}
	}
	/* The default rule (largest number) must terminate the scan. */
	KASSERT(f != NULL, ("no default rule?!"));

	/*
	 * Duplicate the rule onto each CPU.
	 * The rule duplicated on CPU0 will be returned.
	 */
	bzero(&fwmsg, sizeof(fwmsg));
	nmsg = &fwmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_add_rule_dispatch);
	fwmsg.ioc_rule = ioc_rule;
	fwmsg.prev_rule = prev;
	fwmsg.next_rule = prev == NULL ? NULL : f;
	fwmsg.rule_flags = rule_flags;

	netisr_domsg_global(nmsg);
	/* The dispatch walked the siblings off the end on the last CPU. */
	KKASSERT(fwmsg.prev_rule == NULL && fwmsg.next_rule == NULL);

	rule = nmsg->lmsg.u.ms_resultp;
	KKASSERT(rule != NULL && rule->cpuid == mycpuid);

	DPRINTF("++ installed rule %d, static count now %d\n",
		rule->rulenum, static_count);
}

/*
 * Free storage associated with a static rule (including derived
 * states/tracks).
 * The caller is in charge of clearing rule pointers to avoid
 * dangling pointers.
 * @return a pointer to the next entry.
 * Arguments are not checked, so they better be correct.
 */
static struct ip_fw *
ipfw_delete_rule(struct ipfw_context *ctx,
		 struct ip_fw *prev, struct ip_fw *rule)
{
	struct ip_fw *n;

	/* Unlink from this CPU's chain. */
	n = rule->next;
	if (prev == NULL)
		ctx->ipfw_layer3_chain = n;
	else
		prev->next = n;

	/* Mark the rule as invalid */
	rule->rule_flags |= IPFW_RULE_F_INVALID;
	rule->next_rule = NULL;
	rule->sibling = NULL;
#ifdef foo
	/* Don't reset cpuid here; keep various assertion working */
	rule->cpuid = -1;
#endif

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_dec_static_count(rule);

	/* Try to free this rule; outstanding refs may delay the free. */
	ipfw_free_rule(rule);

	/* Return the next rule */
	return n;
}

/*
 * Per-CPU handler for a full flush: drop all states and tracks, then
 * delete every rule in this CPU's chain (the default rule survives
 * unless 'kill_default' was passed in ms_result).
 */
static void
ipfw_flush_dispatch(netmsg_t nmsg)
{
	int kill_default = nmsg->lmsg.u.ms_result;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	/*
	 * Flush states.
	 */
	ipfw_state_flush(ctx, NULL);
	KASSERT(ctx->ipfw_state_cnt == 0,
	    ("%d pcpu states remain", ctx->ipfw_state_cnt));
	ctx->ipfw_state_loosecnt = 0;
	ctx->ipfw_state_lastexp = 0;

	/*
	 * Flush tracks.
	 */
	ipfw_track_flush(ctx, NULL);
	ctx->ipfw_track_lastexp = 0;
	if (ctx->ipfw_trkcnt_spare != NULL) {
		kfree(ctx->ipfw_trkcnt_spare, M_IPFW);
		ctx->ipfw_trkcnt_spare = NULL;
	}

	ipfw_flush_rule_ptrs(ctx); /* more efficient to do outside the loop */

	while ((rule = ctx->ipfw_layer3_chain) != NULL &&
	       (kill_default || rule->rulenum != IPFW_DEFAULT_RULE))
		ipfw_delete_rule(ctx, NULL, rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Deletes all rules from a chain (including the default rule
 * if the second argument is set).
 */
static void
ipfw_flush(int kill_default)
{
	struct netmsg_base nmsg;
#ifdef INVARIANTS
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	int state_cnt;
#endif

	ASSERT_NETISR0;

	/*
	 * If 'kill_default' then caller has done the necessary
	 * msgport syncing; unnecessary to do it again.
	 */
	if (!kill_default) {
		/*
		 * Let ipfw_chk() know the rules are going to
		 * be flushed, so it could jump directly to
		 * the default rule.
		 */
		ipfw_flushing = 1;
		/* XXX use priority sync */
		netmsg_service_sync();
	}

	/*
	 * Press the 'flush' button
	 */
	bzero(&nmsg, sizeof(nmsg));
	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_flush_dispatch);
	nmsg.lmsg.u.ms_result = kill_default;
	netisr_domsg_global(&nmsg);
	/* All per-CPU state is gone; reset the global bookkeeping too. */
	ipfw_gd.ipfw_state_loosecnt = 0;
	ipfw_gd.ipfw_state_globexp = 0;
	ipfw_gd.ipfw_track_globexp = 0;

#ifdef INVARIANTS
	state_cnt = ipfw_state_cntcoll();
	KASSERT(state_cnt == 0, ("%d states remain", state_cnt));

	KASSERT(ipfw_gd.ipfw_trkcnt_cnt == 0,
	    ("%d trkcnts remain", ipfw_gd.ipfw_trkcnt_cnt));

	if (kill_default) {
		KASSERT(static_count == 0,
		    ("%u static rules remain", static_count));
		KASSERT(static_ioc_len == 0,
		    ("%u bytes of static rules remain", static_ioc_len));
	} else {
		/* Only the default rule should be left. */
		KASSERT(static_count == 1,
		    ("%u static rules remain", static_count));
		KASSERT(static_ioc_len == IOC_RULESIZE(ctx->ipfw_default_rule),
		    ("%u bytes of static rules remain, should be %lu",
		     static_ioc_len,
		     (u_long)IOC_RULESIZE(ctx->ipfw_default_rule)));
	}
#endif

	/* Flush is done */
	ipfw_flushing = 0;
}

/*
 * Per-CPU handler for deleting all rules with a given number.
 * start_rule/prev_rule point at this CPU's copies and are advanced
 * to the next CPU's copies (via 'sibling') before forwarding.
 */
static void
ipfw_alt_delete_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule, *prev;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = dmsg->start_rule;
	KKASSERT(rule->cpuid == mycpuid);
	dmsg->start_rule = rule->sibling;

	prev = dmsg->prev_rule;
	if (prev != NULL) {
		KKASSERT(prev->cpuid == mycpuid);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		dmsg->prev_rule = prev->sibling;
	}

	/*
	 * flush pointers outside the loop, then delete all matching
	 * rules. 'prev' remains the same throughout the cycle.
	 */
	ipfw_flush_rule_ptrs(ctx);
	while (rule && rule->rulenum == dmsg->rulenum) {
		if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
			/* Flush states generated by this rule. */
			ipfw_state_flush(ctx, rule);
		}
		if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
			/* Flush tracks generated by this rule. */
			ipfw_track_flush(ctx, rule);
		}
		rule = ipfw_delete_rule(ctx, prev, rule);
	}

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Delete all rules numbered 'rulenum' on every CPU.
 * Returns EINVAL if no rule carries that number.
 */
static int
ipfw_alt_delete_rule(uint16_t rulenum)
{
	struct ip_fw *prev, *rule;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct netmsg_del dmsg;

	ASSERT_NETISR0;

	/*
	 * Locate first rule to delete
	 */
	for (prev = NULL, rule = ctx->ipfw_layer3_chain;
	     rule && rule->rulenum < rulenum;
	     prev = rule, rule = rule->next)
		; /* EMPTY */
	/*
	 * NOTE(review): 'rule' is dereferenced unchecked; this relies on
	 * the always-present default rule terminating the scan -- confirm.
	 */
	if (rule->rulenum != rulenum)
		return EINVAL;

	/*
	 * Get rid of the rule duplications on all CPUs
	 */
	bzero(&dmsg, sizeof(dmsg));
	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_alt_delete_rule_dispatch);
	dmsg.prev_rule = prev;
	dmsg.start_rule = rule;
	dmsg.rulenum = rulenum;

	netisr_domsg_global(&dmsg.base);
	KKASSERT(dmsg.prev_rule == NULL && dmsg.start_rule == NULL);
	return 0;
}

static void
ipfw_alt_delete_ruleset_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *prev, *rule;
#ifdef INVARIANTS
	int del = 0;
#endif

	ASSERT_NETISR_NCPUS(mycpuid);

	ipfw_flush_rule_ptrs(ctx);

	/* Walk the whole chain, deleting every rule in 'from_set'. */
	prev = NULL;
	rule = ctx->ipfw_layer3_chain;
	while (rule != NULL) {
		if (rule->set == dmsg->from_set) {
			if (rule->rule_flags & IPFW_RULE_F_GENSTATE) {
				/* Flush states generated by this rule. */
				ipfw_state_flush(ctx, rule);
			}
			if (rule->rule_flags & IPFW_RULE_F_GENTRACK) {
				/* Flush tracks generated by this rule. */
				ipfw_track_flush(ctx, rule);
			}
			rule = ipfw_delete_rule(ctx, prev, rule);
#ifdef INVARIANTS
			del = 1;
#endif
		} else {
			prev = rule;
			rule = rule->next;
		}
	}
	/* Caller verified the set exists, so something must be deleted. */
	KASSERT(del, ("no match set?!"));

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Delete every rule belonging to 'set' on all CPUs.
 * Silently succeeds if the set is empty.
 */
static int
ipfw_alt_delete_ruleset(uint8_t set)
{
	struct netmsg_del dmsg;
	int del;
	struct ip_fw *rule;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR0;

	/*
	 * Check whether the 'set' exists.  If it exists,
	 * then check whether any rules within the set will
	 * try to create states.
	 */
	del = 0;
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
		if (rule->set == set)
			del = 1;
	}
	if (!del)
		return 0; /* XXX EINVAL?
 */

	/*
	 * Delete this set
	 */
	bzero(&dmsg, sizeof(dmsg));
	netmsg_init(&dmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_alt_delete_ruleset_dispatch);
	dmsg.from_set = set;
	netisr_domsg_global(&dmsg.base);

	return 0;
}

/*
 * Per-CPU handler: reassign all rules numbered 'rulenum' to 'to_set'.
 */
static void
ipfw_alt_move_rule_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	rule = dmsg->start_rule;
	KKASSERT(rule->cpuid == mycpuid);

	/*
	 * Move to the position on the next CPU
	 * before the msg is forwarded.
	 */
	dmsg->start_rule = rule->sibling;

	while (rule && rule->rulenum <= dmsg->rulenum) {
		if (rule->rulenum == dmsg->rulenum)
			rule->set = dmsg->to_set;
		rule = rule->next;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Move all rules numbered 'rulenum' into 'set' on every CPU.
 * A no-op (returns 0) if no such rule exists or it is already there.
 */
static int
ipfw_alt_move_rule(uint16_t rulenum, uint8_t set)
{
	struct netmsg_del dmsg;
	struct netmsg_base *nmsg;
	struct ip_fw *rule;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR0;

	/*
	 * Locate first rule to move
	 */
	for (rule = ctx->ipfw_layer3_chain; rule && rule->rulenum <= rulenum;
	     rule = rule->next) {
		if (rule->rulenum == rulenum && rule->set != set)
			break;
	}
	if (rule == NULL || rule->rulenum > rulenum)
		return 0; /* XXX error?
 */

	bzero(&dmsg, sizeof(dmsg));
	nmsg = &dmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_alt_move_rule_dispatch);
	dmsg.start_rule = rule;
	dmsg.rulenum = rulenum;
	dmsg.to_set = set;

	netisr_domsg_global(nmsg);
	/* The dispatch walked the siblings off the end on the last CPU. */
	KKASSERT(dmsg.start_rule == NULL);
	return 0;
}

/*
 * Per-CPU handler: reassign every rule in 'from_set' to 'to_set'.
 */
static void
ipfw_alt_move_ruleset_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
		if (rule->set == dmsg->from_set)
			rule->set = dmsg->to_set;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Move all rules of set 'from_set' into set 'to_set' on every CPU.
 */
static int
ipfw_alt_move_ruleset(uint8_t from_set, uint8_t to_set)
{
	struct netmsg_del dmsg;
	struct netmsg_base *nmsg;

	ASSERT_NETISR0;

	bzero(&dmsg, sizeof(dmsg));
	nmsg = &dmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_alt_move_ruleset_dispatch);
	dmsg.from_set = from_set;
	dmsg.to_set = to_set;

	netisr_domsg_global(nmsg);
	return 0;
}

/*
 * Per-CPU handler: exchange the members of the two sets.
 */
static void
ipfw_alt_swap_ruleset_dispatch(netmsg_t nmsg)
{
	struct netmsg_del *dmsg = (struct netmsg_del *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
		if (rule->set == dmsg->from_set)
			rule->set = dmsg->to_set;
		else if (rule->set == dmsg->to_set)
			rule->set = dmsg->from_set;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Swap the rule memberships of 'set1' and 'set2' on every CPU.
 */
static int
ipfw_alt_swap_ruleset(uint8_t set1, uint8_t set2)
{
	struct netmsg_del dmsg;
	struct netmsg_base *nmsg;

	ASSERT_NETISR0;

	bzero(&dmsg, sizeof(dmsg));
	nmsg = &dmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_alt_swap_ruleset_dispatch);
	dmsg.from_set = set1;
	dmsg.to_set = set2;

	netisr_domsg_global(nmsg);
	return 0;
}

/*
 * Remove all rules with given number, and also do set manipulation.
 *
 * The argument is an uint32_t. The low 16 bit are the rule or set number,
 * the next 8 bits are the new set, the top 8 bits are the command:
 *
 *	0	delete rules with given number
 *	1	delete rules with given set number
 *	2	move rules with given number to new set
 *	3	move rules with given set number to new set
 *	4	swap sets with given numbers
 */
static int
ipfw_ctl_alter(uint32_t arg)
{
	uint16_t rulenum;
	uint8_t cmd, new_set;
	int error = 0;

	ASSERT_NETISR0;

	/* Unpack the encoded argument (see layout above). */
	rulenum = arg & 0xffff;
	cmd = (arg >> 24) & 0xff;
	new_set = (arg >> 16) & 0xff;

	if (cmd > 4)
		return EINVAL;
	if (new_set >= IPFW_DEFAULT_SET)
		return EINVAL;
	if (cmd == 0 || cmd == 2) {
		/* rulenum is a rule number here; default is untouchable. */
		if (rulenum == IPFW_DEFAULT_RULE)
			return EINVAL;
	} else {
		/* rulenum is a set number for commands 1, 3 and 4. */
		if (rulenum >= IPFW_DEFAULT_SET)
			return EINVAL;
	}

	switch (cmd) {
	case 0:	/* delete rules with given number */
		error = ipfw_alt_delete_rule(rulenum);
		break;

	case 1:	/* delete all rules with given set number */
		error = ipfw_alt_delete_ruleset(rulenum);
		break;

	case 2:	/* move rules with given number to new set */
		error = ipfw_alt_move_rule(rulenum, new_set);
		break;

	case 3:	/* move rules with given set number to new set */
		error = ipfw_alt_move_ruleset(rulenum, new_set);
		break;

	case 4:	/* swap two sets */
		error = ipfw_alt_swap_ruleset(rulenum, new_set);
		break;
	}
	return error;
}

/*
 * Clear counters for a specific rule.
 */
static void
clear_counters(struct ip_fw *rule, int log_only)
{
	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);

	if (log_only == 0) {
		/* Reset packet/byte accounting and the match timestamp. */
		rule->bcnt = rule->pcnt = 0;
		rule->timestamp = 0;
	}
	/* Re-arm the logging budget if the action starts with O_LOG. */
	if (l->o.opcode == O_LOG)
		l->log_left = l->max_log;
}

/*
 * Per-CPU handler for counter reset.  rulenum == 0 clears every rule
 * on this CPU; otherwise clears all rules with that number, advancing
 * start_rule to the next CPU's copy before forwarding.
 */
static void
ipfw_zero_entry_dispatch(netmsg_t nmsg)
{
	struct netmsg_zent *zmsg = (struct netmsg_zent *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ip_fw *rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	if (zmsg->rulenum == 0) {
		KKASSERT(zmsg->start_rule == NULL);

		ctx->ipfw_norule_counter = 0;
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
			clear_counters(rule, zmsg->log_only);
	} else {
		struct ip_fw *start = zmsg->start_rule;

		KKASSERT(start->cpuid == mycpuid);
		KKASSERT(start->rulenum == zmsg->rulenum);

		/*
		 * We can have multiple rules with the same number, so we
		 * need to clear them all.
		 */
		for (rule = start; rule && rule->rulenum == zmsg->rulenum;
		     rule = rule->next)
			clear_counters(rule, zmsg->log_only);

		/*
		 * Move to the position on the next CPU
		 * before the msg is forwarded.
		 */
		zmsg->start_rule = start->sibling;
	}
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Reset some or all counters on firewall rules.
 * @arg frwl is null to clear all entries, or contains a specific
 * rule number.
 * @arg log_only is 1 if we only want to reset logs, zero otherwise.
 */
static int
ipfw_ctl_zero_entry(int rulenum, int log_only)
{
	struct netmsg_zent zmsg;
	struct netmsg_base *nmsg;
	const char *msg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR0;

	bzero(&zmsg, sizeof(zmsg));
	nmsg = &zmsg.base;
	netmsg_init(nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_zero_entry_dispatch);
	zmsg.log_only = log_only;

	if (rulenum == 0) {
		/* Clear everything; no start rule to track. */
		msg = log_only ? "ipfw: All logging counts reset.\n"
			       : "ipfw: Accounting cleared.\n";
	} else {
		struct ip_fw *rule;

		/*
		 * Locate the first rule with 'rulenum'
		 */
		for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next) {
			if (rule->rulenum == rulenum)
				break;
		}
		if (rule == NULL) /* we did not find any matching rules */
			return (EINVAL);
		zmsg.start_rule = rule;
		zmsg.rulenum = rulenum;

		msg = log_only ? "ipfw: Entry %d logging count reset.\n"
			       : "ipfw: Entry %d cleared.\n";
	}
	netisr_domsg_global(nmsg);
	KKASSERT(zmsg.start_rule == NULL);

	/* The rulenum == 0 messages simply ignore the extra argument. */
	if (fw_verbose)
		log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
	return (0);
}

/*
 * Check validity of the structure before insert.
 * Fortunately rules are simple, so this mostly need to check rule sizes.
 */
static int
ipfw_check_ioc_rule(struct ipfw_ioc_rule *rule, int size, uint32_t *rule_flags)
{
	int l, cmdlen = 0;
	int have_action = 0;
	ipfw_insn *cmd;

	*rule_flags = 0;

	/* Check for valid size */
	if (size < sizeof(*rule)) {
		kprintf("ipfw: rule too short\n");
		return EINVAL;
	}
	l = IOC_RULESIZE(rule);
	if (l != size) {
		kprintf("ipfw: size mismatch (have %d want %d)\n", size, l);
		return EINVAL;
	}

	/* Check rule number */
	if (rule->rulenum == IPFW_DEFAULT_RULE) {
		kprintf("ipfw: invalid rule number\n");
		return EINVAL;
	}

	/*
	 * Now go for the individual checks. Very simple ones, basically only
	 * instruction sizes.
	 */
	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
	     l -= cmdlen, cmd += cmdlen) {
		cmdlen = F_LEN(cmd);
		if (cmdlen > l) {
			kprintf("ipfw: opcode %d size truncated\n",
				cmd->opcode);
			return EINVAL;
		}

		DPRINTF("ipfw: opcode %d\n", cmd->opcode);

		if (cmd->opcode == O_KEEP_STATE || cmd->opcode == O_LIMIT) {
			/* This rule will generate states. */
			*rule_flags |= IPFW_RULE_F_GENSTATE;
			if (cmd->opcode == O_LIMIT)
				*rule_flags |= IPFW_RULE_F_GENTRACK;
		}

		switch (cmd->opcode) {
		case O_NOP:
		case O_PROBE_STATE:
		case O_KEEP_STATE:
		case O_PROTO:
		case O_IP_SRC_ME:
		case O_IP_DST_ME:
		case O_LAYER2:
		case O_IN:
		case O_FRAG:
		case O_IPOPT:
		case O_IPLEN:
		case O_IPID:
		case O_IPTOS:
		case O_IPPRECEDENCE:
		case O_IPTTL:
		case O_IPVER:
		case O_TCPWIN:
		case O_TCPFLAGS:
		case O_TCPOPTS:
		case O_ESTAB:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
			break;

		case O_UID:
		case O_GID:
		case O_IP_SRC:
		case O_IP_DST:
		case O_TCPSEQ:
		case O_TCPACK:
		case O_PROB:
		case O_ICMPTYPE:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
				goto bad_size;
			break;

		case O_LIMIT:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
				goto bad_size;
			break;

		case O_LOG:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
				goto bad_size;

			/* Start with a full logging budget. */
			((ipfw_insn_log *)cmd)->log_left =
			    ((ipfw_insn_log *)cmd)->max_log;

			break;

		case O_IP_SRC_MASK:
		case O_IP_DST_MASK:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_ip))
				goto bad_size;
			/* A zero mask matches everything: reject it. */
			if (((ipfw_insn_ip *)cmd)->mask.s_addr == 0) {
				kprintf("ipfw: opcode %d, useless rule\n",
					cmd->opcode);
				return EINVAL;
			}
			break;

		case O_IP_SRC_SET:
		case O_IP_DST_SET:
			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
				kprintf("ipfw: invalid set size %d\n",
					cmd->arg1);
				return EINVAL;
			}
			/* Bitmap of arg1 addresses follows the insn. */
			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
			    (cmd->arg1+31)/32 )
				goto bad_size;
			break;

		case O_MACADDR2:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
				goto bad_size;
			break;

		case O_MAC_TYPE:
		case O_IP_SRCPORT:
		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
			if (cmdlen < 2 || cmdlen > 31)
				goto bad_size;
			break;

		case O_RECV:
		case O_XMIT:
		case O_VIA:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
				goto bad_size;
			break;

		case O_PIPE:
		case O_QUEUE:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
				goto bad_size;
			goto check_action;

		case O_FORWARD_IP:
			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) {
				goto bad_size;
			} else {
				in_addr_t fwd_addr;

				fwd_addr = ((ipfw_insn_sa *)cmd)->
					   sa.sin_addr.s_addr;
				if (IN_MULTICAST(ntohl(fwd_addr))) {
					kprintf("ipfw: try forwarding to "
						"multicast address\n");
					return EINVAL;
				}
			}
			goto check_action;

		case O_FORWARD_MAC: /* XXX not implemented yet */
		case O_CHECK_STATE:
		case O_COUNT:
		case O_ACCEPT:
		case O_DENY:
		case O_REJECT:
		case O_SKIPTO:
		case O_DIVERT:
		case O_TEE:
			if (cmdlen != F_INSN_SIZE(ipfw_insn))
				goto bad_size;
check_action:
			/* Common checks for all action opcodes. */
			if (have_action) {
				kprintf("ipfw: opcode %d, multiple actions"
					" not allowed\n",
					cmd->opcode);
				return EINVAL;
			}
			have_action = 1;
			if (l != cmdlen) {
				kprintf("ipfw: opcode %d, action must be"
					" last opcode\n",
					cmd->opcode);
				return EINVAL;
			}
			break;
		default:
			kprintf("ipfw: opcode %d, unknown opcode\n",
				cmd->opcode);
			return EINVAL;
		}
	}
	if (have_action == 0) {
		kprintf("ipfw: missing action\n");
		return EINVAL;
	}
	return 0;

bad_size:
	kprintf("ipfw: opcode %d size %d wrong\n",
		cmd->opcode, cmdlen);
	return EINVAL;
}

/*
 * IP_FW_ADD sockopt handler: validate the userland rule image and
 * install it on all CPUs via ipfw_add_rule().
 */
static int
ipfw_ctl_add_rule(struct sockopt *sopt)
{
	struct ipfw_ioc_rule *ioc_rule;
	size_t size;
	uint32_t rule_flags;
	int error;

	ASSERT_NETISR0;

	size = sopt->sopt_valsize;
	if (size > (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX) ||
	    size < sizeof(*ioc_rule)) {
		return EINVAL;
	}
	/*
	 * Grow the sockopt buffer to the maximum rule size; presumably so
	 * downstream code may rely on a full-sized buffer -- TODO confirm.
	 * M_WAITOK means krealloc() cannot fail here.
	 */
	if (size != (sizeof(uint32_t) * IPFW_RULE_SIZE_MAX)) {
		sopt->sopt_val = krealloc(sopt->sopt_val, sizeof(uint32_t) *
		    IPFW_RULE_SIZE_MAX, M_TEMP, M_WAITOK);
	}
	ioc_rule = sopt->sopt_val;

	error = ipfw_check_ioc_rule(ioc_rule, size, &rule_flags);
	if (error)
		return error;

	ipfw_add_rule(ioc_rule, rule_flags);

	/* Report the actual installed size (rulenum may have been set). */
	if (sopt->sopt_dir == SOPT_GET)
		sopt->sopt_valsize = IOC_RULESIZE(ioc_rule);
	return 0;
}

/*
 * Serialize one static rule (the CPU0 copy) into the userland ioctl
 * format, summing packet/byte counters across all per-CPU siblings.
 * Returns a pointer just past the emitted record.
 */
static void *
ipfw_copy_rule(const struct ipfw_context *ctx, const struct ip_fw *rule,
	       struct ipfw_ioc_rule *ioc_rule)
{
	const struct ip_fw *sibling;
#ifdef INVARIANTS
	int i;
#endif

	ASSERT_NETISR0;
	KASSERT(rule->cpuid == 0, ("rule does not belong to cpu0"));

	ioc_rule->act_ofs = rule->act_ofs;
	ioc_rule->cmd_len = rule->cmd_len;
	ioc_rule->rulenum = rule->rulenum;
	ioc_rule->set = rule->set;
	ioc_rule->usr_flags = rule->usr_flags;

	ioc_rule->set_disable = ctx->ipfw_set_disable;
	ioc_rule->static_count = static_count;
	ioc_rule->static_len = static_ioc_len;

	/*
	 * Visit (read-only) all of the rule's duplications to get
	 * the necessary statistics
	 */
#ifdef INVARIANTS
	i = 0;
#endif
	ioc_rule->pcnt = 0;
	ioc_rule->bcnt = 0;
	ioc_rule->timestamp = 0;
	for (sibling = rule; sibling != NULL; sibling = sibling->sibling) {
		ioc_rule->pcnt += sibling->pcnt;
		ioc_rule->bcnt += sibling->bcnt;
		/* Most recent match across all CPUs. */
		if (sibling->timestamp > ioc_rule->timestamp)
			ioc_rule->timestamp = sibling->timestamp;
#ifdef INVARIANTS
		++i;
#endif
	}
	KASSERT(i == netisr_ncpus,
	    ("static rule is not duplicated on netisr_ncpus %d", netisr_ncpus));

	/* cmd_len counts 32-bit words, hence the * 4 */
	bcopy(rule->cmd, ioc_rule->cmd, ioc_rule->cmd_len * 4 /* XXX */);

	return ((uint8_t *)ioc_rule + IOC_RULESIZE(ioc_rule));
}

/*
 * Serialize one track (limit parent) into the userland ioc_state
 * format.  Tracks whose tc_expire was never stamped by the per-CPU
 * scan are skipped.  Returns TRUE if a record was emitted.
 */
static boolean_t
ipfw_track_copy(const struct ipfw_trkcnt *trk, struct ipfw_ioc_state *ioc_state)
{
	struct ipfw_ioc_flowid *ioc_id;

	if (trk->tc_expire == 0) {
		/* Not a scanned one. */
		return (FALSE);
	}

	/* Remaining lifetime, clamped at 0 for already-expired tracks. */
	ioc_state->expire = TIME_LEQ(trk->tc_expire, time_uptime) ?
	    0 : trk->tc_expire - time_uptime;
	ioc_state->pcnt = 0;
	ioc_state->bcnt = 0;

	ioc_state->dyn_type = O_LIMIT_PARENT;
	ioc_state->count = trk->tc_count;

	ioc_state->rulenum = trk->tc_rulenum;

	ioc_id = &ioc_state->id;
	ioc_id->type = ETHERTYPE_IP;
	ioc_id->u.ip.proto = trk->tc_proto;
	ioc_id->u.ip.src_ip = trk->tc_saddr;
	ioc_id->u.ip.dst_ip = trk->tc_daddr;
	ioc_id->u.ip.src_port = trk->tc_sport;
	ioc_id->u.ip.dst_port = trk->tc_dport;

	return (TRUE);
}

/*
 * Serialize one dynamic state into the userland ioc_state format.
 * Anchor placeholders are skipped.  Returns TRUE if a record was
 * emitted.
 */
static boolean_t
ipfw_state_copy(const struct ipfw_state *s, struct ipfw_ioc_state *ioc_state)
{
	struct ipfw_ioc_flowid *ioc_id;

	if (s->st_type == O_ANCHOR)
		return (FALSE);

	/* Remaining lifetime, clamped at 0 for already-expired states. */
	ioc_state->expire = TIME_LEQ(s->st_expire, time_uptime) ?
	    0 : s->st_expire - time_uptime;
	ioc_state->pcnt = s->st_pcnt;
	ioc_state->bcnt = s->st_bcnt;

	ioc_state->dyn_type = s->st_type;
	ioc_state->count = 0;

	ioc_state->rulenum = s->st_rule->rulenum;

	ioc_id = &ioc_state->id;
	ioc_id->type = ETHERTYPE_IP;
	ioc_id->u.ip.proto = s->st_proto;
	ipfw_key_4tuple(&s->st_key,
	    &ioc_id->u.ip.src_ip, &ioc_id->u.ip.src_port,
	    &ioc_id->u.ip.dst_ip, &ioc_id->u.ip.dst_port);

	return (TRUE);
}

/*
 * Per-CPU handler that copies this CPU's dynamic states into the
 * userland buffer, stamps track expiry from per-CPU track lists, and
 * on the last CPU also emits the global track tree.  Stops early when
 * the buffer (state_cntmax records) is full.
 */
static void
ipfw_state_copy_dispatch(netmsg_t nmsg)
{
	struct netmsg_cpstate *nm = (struct netmsg_cpstate *)nmsg;
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	const struct ipfw_state *s;
	const struct ipfw_track *t;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(nm->state_cnt < nm->state_cntmax,
	    ("invalid state count %d, max %d",
	     nm->state_cnt, nm->state_cntmax));

	TAILQ_FOREACH(s, &ctx->ipfw_state_list, st_link) {
		if (ipfw_state_copy(s, nm->ioc_state)) {
			nm->ioc_state++;
			nm->state_cnt++;
			if (nm->state_cnt == nm->state_cntmax)
				goto done;
		}
	}

	/*
	 * Prepare tracks in the global track tree for userland.
	 */
	TAILQ_FOREACH(t, &ctx->ipfw_track_list, t_link) {
		struct ipfw_trkcnt *trk;

		if (t->t_count == NULL) /* anchor */
			continue;
		trk = t->t_trkcnt;

		/*
		 * Only one netisr can run this function at
		 * any time, and only this function accesses
		 * trkcnt's tc_expire, so this is safe w/o
		 * ipfw_gd.ipfw_trkcnt_token.
		 */
		if (trk->tc_expire > t->t_expire)
			continue;
		trk->tc_expire = t->t_expire;
	}

	/*
	 * Copy tracks in the global track tree to userland in
	 * the last netisr.
4656 */ 4657 if (mycpuid == netisr_ncpus - 1) { 4658 struct ipfw_trkcnt *trk; 4659 4660 KASSERT(nm->state_cnt < nm->state_cntmax, 4661 ("invalid state count %d, max %d", 4662 nm->state_cnt, nm->state_cntmax)); 4663 4664 IPFW_TRKCNT_TOKGET; 4665 RB_FOREACH(trk, ipfw_trkcnt_tree, &ipfw_gd.ipfw_trkcnt_tree) { 4666 if (ipfw_track_copy(trk, nm->ioc_state)) { 4667 nm->ioc_state++; 4668 nm->state_cnt++; 4669 if (nm->state_cnt == nm->state_cntmax) { 4670 IPFW_TRKCNT_TOKREL; 4671 goto done; 4672 } 4673 } 4674 } 4675 IPFW_TRKCNT_TOKREL; 4676 } 4677 done: 4678 if (nm->state_cnt == nm->state_cntmax) { 4679 /* No more space; done. */ 4680 netisr_replymsg(&nm->base, 0); 4681 } else { 4682 netisr_forwardmsg(&nm->base, mycpuid + 1); 4683 } 4684 } 4685 4686 static int 4687 ipfw_ctl_get_rules(struct sockopt *sopt) 4688 { 4689 struct ipfw_context *ctx = ipfw_ctx[mycpuid]; 4690 struct ip_fw *rule; 4691 void *bp; 4692 size_t size; 4693 int state_cnt; 4694 4695 ASSERT_NETISR0; 4696 4697 /* 4698 * pass up a copy of the current rules. Static rules 4699 * come first (the last of which has number IPFW_DEFAULT_RULE), 4700 * followed by a possibly empty list of states. 4701 */ 4702 4703 size = static_ioc_len; /* size of static rules */ 4704 4705 /* 4706 * Size of the states. 4707 * XXX take tracks as state for userland compat. 
	 */
	state_cnt = ipfw_state_cntcoll() + ipfw_gd.ipfw_trkcnt_cnt;
	state_cnt = (state_cnt * 5) / 4;	/* leave 25% headroom */
	size += state_cnt * sizeof(struct ipfw_ioc_state);

	if (sopt->sopt_valsize < size) {
		/* short length, no need to return incomplete rules */
		/* XXX: if superuser, no need to zero buffer */
		bzero(sopt->sopt_val, sopt->sopt_valsize);
		return 0;
	}
	bp = sopt->sopt_val;

	/* Static rules come first. */
	for (rule = ctx->ipfw_layer3_chain; rule; rule = rule->next)
		bp = ipfw_copy_rule(ctx, rule, bp);

	if (state_cnt) {
		struct netmsg_cpstate nm;
#ifdef INVARIANTS
		size_t old_size = size;
#endif

		/*
		 * Visit each netisr in turn; every CPU appends its own
		 * states (and the last one the global tracks) to the
		 * buffer via ipfw_state_copy_dispatch().
		 */
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
		    MSGF_PRIORITY, ipfw_state_copy_dispatch);
		nm.ioc_state = bp;
		nm.state_cntmax = state_cnt;
		nm.state_cnt = 0;
		netisr_domsg_global(&nm.base);

		/*
		 * The # of states may be shrinked after the snapshot
		 * of the state count was taken. To give user a correct
		 * state count, nm->state_cnt is used to recalculate
		 * the actual size.
		 */
		size = static_ioc_len +
		    (nm.state_cnt * sizeof(struct ipfw_ioc_state));
		KKASSERT(size <= old_size);
	}

	sopt->sopt_valsize = size;
	return 0;
}

/*
 * Per-netisr handler: install the new set-disable mask on this CPU's
 * context, then forward the message to the next CPU.
 */
static void
ipfw_set_disable_dispatch(netmsg_t nmsg)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];

	ASSERT_NETISR_NCPUS(mycpuid);

	ctx->ipfw_set_disable = nmsg->lmsg.u.ms_result32;
	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Compute the new set-disable mask from the requested disable/enable
 * masks and replicate it to all netisr CPUs.
 */
static void
ipfw_ctl_set_disable(uint32_t disable, uint32_t enable)
{
	struct netmsg_base nmsg;
	uint32_t set_disable;

	ASSERT_NETISR0;

	/* IPFW_DEFAULT_SET is always enabled */
	enable |= (1 << IPFW_DEFAULT_SET);
	set_disable = (ipfw_ctx[mycpuid]->ipfw_set_disable | disable) & ~enable;

	bzero(&nmsg, sizeof(nmsg));
	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_set_disable_dispatch);
	nmsg.lmsg.u.ms_result32 = set_disable;

	netisr_domsg_global(&nmsg);
}

/*
 * {set|get}sockopt parser.
 */
static int
ipfw_ctl(struct sockopt *sopt)
{
	int error, rulenum;
	uint32_t *masks;
	size_t size;

	ASSERT_NETISR0;

	error = 0;

	switch (sopt->sopt_name) {
	case IP_FW_GET:
		error = ipfw_ctl_get_rules(sopt);
		break;

	case IP_FW_FLUSH:
		ipfw_flush(0 /* keep default rule */);
		break;

	case IP_FW_ADD:
		error = ipfw_ctl_add_rule(sopt);
		break;

	case IP_FW_DEL:
		/*
		 * IP_FW_DEL is used for deleting single rules or sets,
		 * and (ab)used to atomically manipulate sets.
		 * Argument size is used to distinguish between the two:
		 *    sizeof(uint32_t)
		 *	delete single rule or set of rules,
		 *	or reassign rules (or sets) to a different set.
		 *    2 * sizeof(uint32_t)
		 *	atomic disable/enable sets.
		 *	first uint32_t contains sets to be disabled,
		 *	second uint32_t contains sets to be enabled.
		 */
		masks = sopt->sopt_val;
		size = sopt->sopt_valsize;
		if (size == sizeof(*masks)) {
			/*
			 * Delete or reassign static rule
			 */
			error = ipfw_ctl_alter(masks[0]);
		} else if (size == (2 * sizeof(*masks))) {
			/*
			 * Set enable/disable
			 */
			ipfw_ctl_set_disable(masks[0], masks[1]);
		} else {
			error = EINVAL;
		}
		break;

	case IP_FW_ZERO:
	case IP_FW_RESETLOG: /* argument is an int, the rule number */
		rulenum = 0;

		if (sopt->sopt_val != 0) {
			error = soopt_to_kbuf(sopt, &rulenum,
			    sizeof(int), sizeof(int));
			if (error)
				break;
		}
		error = ipfw_ctl_zero_entry(rulenum,
		    sopt->sopt_name == IP_FW_RESETLOG);
		break;

	default:
		kprintf("ipfw_ctl invalid option %d\n", sopt->sopt_name);
		error = EINVAL;
	}
	return error;
}

/*
 * The keepalive walk on this CPU is complete: clear the in-progress
 * flag and re-arm the keepalive callout.
 */
static void
ipfw_keepalive_done(struct ipfw_context *ctx)
{

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	ctx->ipfw_flags &= ~IPFW_FLAG_KEEPALIVE;
	callout_reset(&ctx->ipfw_keepalive_ch, dyn_keepalive_period * hz,
	    ipfw_keepalive, NULL);
}

/*
 * A per-dispatch work bound was hit; send the continuation netmsg to
 * this CPU so the walk resumes later from the anchor left in the list.
 */
static void
ipfw_keepalive_more(struct ipfw_context *ctx)
{
	struct netmsg_base *nm = &ctx->ipfw_keepalive_more;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));
	KASSERT(nm->lmsg.ms_flags & MSGF_DONE,
	    ("keepalive more did not finish"));
	netisr_sendmsg_oncpu(nm);
}

/*
 * Walk the state list starting at 'anchor': delete expired states and
 * send keepalive probes for TCP states (in BOTH_SYN state) that are
 * within dyn_keepalive_interval of expiring.  The walk is bounded per
 * dispatch by ipfw_state_scan_max / ipfw_state_expire_max /
 * ipfw_keepalive_max; when a bound is hit the anchor stays in the
 * list and ipfw_keepalive_more() schedules a continuation.
 */
static void
ipfw_keepalive_loop(struct ipfw_context *ctx, struct ipfw_state *anchor)
{
	struct ipfw_state *s;
	int scanned = 0, expired = 0, kept = 0;

	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	while ((s = TAILQ_NEXT(anchor, st_link)) != NULL) {
		uint32_t ack_rev, ack_fwd;
		struct ipfw_flow_id id;

		if (scanned++ >= ipfw_state_scan_max) {
			ipfw_keepalive_more(ctx);
			return;
		}

		/* Advance the anchor past the state being examined. */
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		TAILQ_INSERT_AFTER(&ctx->ipfw_state_list, s, anchor, st_link);

		if (s->st_type == O_ANCHOR)
			continue;

		if (TIME_LEQ(s->st_expire, time_uptime)) {
			/* State expired. */
			ipfw_state_del(ctx, s);
			if (++expired >= ipfw_state_expire_max) {
				ipfw_keepalive_more(ctx);
				return;
			}
			continue;
		}

		/*
		 * Keep alive processing
		 */

		if (s->st_proto != IPPROTO_TCP)
			continue;
		if ((s->st_state & IPFW_STATE_TCPSTATES) != BOTH_SYN)
			continue;
		if (TIME_LEQ(time_uptime + dyn_keepalive_interval,
		    s->st_expire))
			continue;	/* too early */

		ipfw_key_4tuple(&s->st_key, &id.src_ip, &id.src_port,
		    &id.dst_ip, &id.dst_port);
		ack_rev = s->st_ack_rev;
		ack_fwd = s->st_ack_fwd;

		/* Probe both endpoints of the tracked connection. */
		send_pkt(&id, ack_rev - 1, ack_fwd, TH_SYN);
		send_pkt(&id, ack_fwd - 1, ack_rev, 0);

		if (++kept >= ipfw_keepalive_max) {
			ipfw_keepalive_more(ctx);
			return;
		}
	}
	TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_done(ctx);
}

/*
 * Continuation handler: resume the keepalive walk from the anchor
 * left in this CPU's state list by a previous bounded dispatch.
 */
static void
ipfw_keepalive_more_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT(ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE,
	    ("keepalive is not in progress"));

	/* Reply ASAP */
	netisr_replymsg(&nm->base, 0);

	anchor = &ctx->ipfw_keepalive_anch;
	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		TAILQ_REMOVE(&ctx->ipfw_state_list, anchor, st_link);
		ipfw_keepalive_done(ctx);
		return;
	}
	ipfw_keepalive_loop(ctx, anchor);
}

/*
 * This procedure is only used to handle
 * keepalives. It is invoked
 * every dyn_keepalive_period
 * [per-netisr dispatch handler sent by the ipfw_keepalive() callout]
 */
static void
ipfw_keepalive_dispatch(netmsg_t nm)
{
	struct ipfw_context *ctx = ipfw_ctx[mycpuid];
	struct ipfw_state *anchor;

	ASSERT_NETISR_NCPUS(mycpuid);
	KASSERT((ctx->ipfw_flags & IPFW_FLAG_KEEPALIVE) == 0,
	    ("keepalive is in progress"));
	ctx->ipfw_flags |= IPFW_FLAG_KEEPALIVE;

	/* Reply ASAP */
	crit_enter();
	netisr_replymsg(&nm->base, 0);
	crit_exit();

	if (!dyn_keepalive || ctx->ipfw_state_cnt == 0) {
		/* Nothing to scan; just re-arm the callout. */
		ipfw_keepalive_done(ctx);
		return;
	}

	/* Start the bounded walk from the head of the state list. */
	anchor = &ctx->ipfw_keepalive_anch;
	TAILQ_INSERT_HEAD(&ctx->ipfw_state_list, anchor, st_link);
	ipfw_keepalive_loop(ctx, anchor);
}

/*
 * This procedure is only used to handle keepalives. It is invoked
 * every dyn_keepalive_period
 */
static void
ipfw_keepalive(void *dummy __unused)
{
	struct netmsg_base *msg;

	KKASSERT(mycpuid < netisr_ncpus);
	msg = &ipfw_ctx[mycpuid]->ipfw_keepalive_nm;

	/* Only send the netmsg if the previous one has been consumed. */
	crit_enter();
	if (msg->lmsg.ms_flags & MSGF_DONE)
		netisr_sendmsg_oncpu(msg);
	crit_exit();
}

/*
 * pfil(9) input hook: run the packet through ipfw_chk() and act on
 * the verdict (pass / deny / dummynet / tee / divert).
 */
static int
ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	struct m_tag *mtag;
	int tee = 0, error = 0, ret;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args.rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else {
		args.rule = NULL;
	}

	args.eh = NULL;
	args.oif = NULL;		/* input path: no output interface */
	args.m = m;
	ret = ipfw_chk(&args);
	m = args.m;

	if (m == NULL) {
		/* ipfw_chk() consumed the mbuf. */
		error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		/* Send packet to the appropriate pipe */
		ipfw_dummynet_io(m, args.cookie, DN_TO_IP_IN, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case IP_FW_DIVERT:
		/*
		 * Must clear bridge tag when changing
		 */
		m->m_pkthdr.fw_flags &= ~BRIDGE_MBUF_TAGGED;
		if (ip_divert_p != NULL) {
			m = ip_divert_p(m, tee, 1);
		} else {
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}

/*
 * pfil(9) output hook: like ipfw_check_in(), but the outgoing
 * interface is passed to ipfw_chk() via args.oif.
 */
static int
ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir)
{
	struct ip_fw_args args;
	struct mbuf *m = *m0;
	struct m_tag *mtag;
	int tee = 0, error = 0, ret;

	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
		/* Extract info from dummynet tag */
		mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
		KKASSERT(mtag != NULL);
		args.rule = ((struct dn_pkt *)m_tag_data(mtag))->dn_priv;
		KKASSERT(args.rule != NULL);

		m_tag_delete(m, mtag);
		m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
	} else {
		args.rule = NULL;
	}

	args.eh = NULL;
	args.m = m;
	args.oif = ifp;
	ret = ipfw_chk(&args);
	m = args.m;

	if (m == NULL) {
		/* ipfw_chk() consumed the mbuf. */
		error = EACCES;
		goto back;
	}

	switch (ret) {
	case IP_FW_PASS:
		break;

	case IP_FW_DENY:
		m_freem(m);
		m = NULL;
		error = EACCES;
		break;

	case IP_FW_DUMMYNET:
		ipfw_dummynet_io(m, args.cookie, DN_TO_IP_OUT, &args);
		break;

	case IP_FW_TEE:
		tee = 1;
		/* FALL THROUGH */

	case
	IP_FW_DIVERT:
		if (ip_divert_p != NULL) {
			m = ip_divert_p(m, tee, 0);
		} else {
			m_freem(m);
			m = NULL;
			/* not sure this is the right error msg */
			error = EACCES;
		}
		break;

	default:
		panic("unknown ipfw return value: %d", ret);
	}
back:
	*m0 = m;
	return error;
}

/*
 * Attach the input/output pfil(9) hooks for AF_INET.
 */
static void
ipfw_hook(void)
{
	struct pfil_head *pfh;

	ASSERT_NETISR0;

	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
	if (pfh == NULL)
		return;

	pfil_add_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
	pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
}

/*
 * Detach the input/output pfil(9) hooks for AF_INET.
 */
static void
ipfw_dehook(void)
{
	struct pfil_head *pfh;

	ASSERT_NETISR0;

	pfh = pfil_head_get(PFIL_TYPE_AF, AF_INET);
	if (pfh == NULL)
		return;

	pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN, pfh);
	pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT, pfh);
}

/*
 * Sysctl handler: report the total dynamic entry count
 * (states collected from all CPUs plus global tracks).
 */
static int
ipfw_sysctl_dyncnt(SYSCTL_HANDLER_ARGS)
{
	int dyn_cnt;

	dyn_cnt = ipfw_state_cntcoll();
	dyn_cnt += ipfw_gd.ipfw_trkcnt_cnt;

	return (sysctl_handle_int(oidp, &dyn_cnt, 0, req));
}

/*
 * Sysctl handler: report the state count collected from all CPUs.
 */
static int
ipfw_sysctl_statecnt(SYSCTL_HANDLER_ARGS)
{
	int state_cnt;

	state_cnt = ipfw_state_cntcoll();
	return (sysctl_handle_int(oidp, &state_cnt, 0, req));
}

/*
 * Sysctl handler: read or set the maximum number of states
 * (must be >= 1).
 */
static int
ipfw_sysctl_statemax(SYSCTL_HANDLER_ARGS)
{
	int state_max, error;

	state_max = ipfw_state_max;
	error = sysctl_handle_int(oidp, &state_max, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	if (state_max < 1)
		return (EINVAL);

	ipfw_state_max_set(state_max);
	return (0);
}

/*
 * Sysctl handler: read or set the combined state+track limit; a new
 * value is split evenly between the state and track maximums.
 */
static int
ipfw_sysctl_dynmax(SYSCTL_HANDLER_ARGS)
{
	int dyn_max, error;

	dyn_max = ipfw_state_max + ipfw_track_max;

	error = sysctl_handle_int(oidp, &dyn_max, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	if (dyn_max < 2)
		return (EINVAL);

	ipfw_state_max_set(dyn_max / 2);
	ipfw_track_max = dyn_max / 2;
	return (0);
}

/*
 * Netisr0 handler: apply a new fw_enable value by hooking or
 * dehooking pfil.
 */
static void
ipfw_sysctl_enable_dispatch(netmsg_t nmsg)
{
	int enable = nmsg->lmsg.u.ms_result;

	ASSERT_NETISR0;

	if (fw_enable == enable)
		goto reply;

	fw_enable = enable;
	if (fw_enable)
		ipfw_hook();
	else
		ipfw_dehook();
reply:
	netisr_replymsg(&nmsg->base, 0);
}

/*
 * Sysctl handler: enable/disable the firewall; the actual
 * (de)hooking is performed in netisr0.
 */
static int
ipfw_sysctl_enable(SYSCTL_HANDLER_ARGS)
{
	struct netmsg_base nmsg;
	int enable, error;

	enable = fw_enable;
	error = sysctl_handle_int(oidp, &enable, 0, req);
	if (error || req->newptr == NULL)
		return error;

	netmsg_init(&nmsg, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_sysctl_enable_dispatch);
	nmsg.lmsg.u.ms_result = enable;

	return netisr_domsg(&nmsg, 0);
}

/*
 * Sysctl handler: bound the rule-number auto-increment step to
 * [IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX].
 */
static int
ipfw_sysctl_autoinc_step(SYSCTL_HANDLER_ARGS)
{
	return sysctl_int_range(oidp, arg1, arg2, req,
	    IPFW_AUTOINC_STEP_MIN, IPFW_AUTOINC_STEP_MAX);
}

/*
 * Sysctl handler: generic positive (>= 1) scan-count knob.
 */
static int
ipfw_sysctl_scancnt(SYSCTL_HANDLER_ARGS)
{

	return sysctl_int_range(oidp, arg1, arg2, req, 1, INT_MAX);
}

/*
 * Sysctl handler: sum a per-CPU statistic located at byte offset arg2
 * within each CPU's ipfw_context; a write zeroes the statistic.
 */
static int
ipfw_sysctl_stat(SYSCTL_HANDLER_ARGS)
{
	u_long stat = 0;
	int cpu, error;

	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		stat += *((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2));

	error = sysctl_handle_long(oidp, &stat, 0, req);
	if (error || req->newptr == NULL)
		return (error);

	/* Zero out this stat.
	 */
	for (cpu = 0; cpu < netisr_ncpus; ++cpu)
		*((u_long *)((uint8_t *)ipfw_ctx[cpu] + arg2)) = 0;
	return (0);
}

/*
 * Per-netisr handler: allocate and initialize this CPU's ipfw context
 * and install the default rule; the message is forwarded CPU by CPU.
 */
static void
ipfw_ctx_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw *fwmsg = (struct netmsg_ipfw *)nmsg;
	struct ipfw_context *ctx;
	struct ip_fw *def_rule;

	ASSERT_NETISR_NCPUS(mycpuid);

	ctx = kmalloc(sizeof(*ctx), M_IPFW, M_WAITOK | M_ZERO);

	/* Per-CPU state and track containers. */
	RB_INIT(&ctx->ipfw_state_tree);
	TAILQ_INIT(&ctx->ipfw_state_list);

	RB_INIT(&ctx->ipfw_track_tree);
	TAILQ_INIT(&ctx->ipfw_track_list);

	/* State expiration callout, netmsgs and walk anchor. */
	callout_init_mp(&ctx->ipfw_stateto_ch);
	netmsg_init(&ctx->ipfw_stateexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_state_expire_dispatch);
	ctx->ipfw_stateexp_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_stateexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_state_expire_more_dispatch);

	/* Track expiration callout and netmsgs. */
	callout_init_mp(&ctx->ipfw_trackto_ch);
	netmsg_init(&ctx->ipfw_trackexp_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_track_expire_dispatch);
	netmsg_init(&ctx->ipfw_trackexp_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_track_expire_more_dispatch);

	/* Keepalive callout, netmsgs and walk anchor. */
	callout_init_mp(&ctx->ipfw_keepalive_ch);
	netmsg_init(&ctx->ipfw_keepalive_nm, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE | MSGF_PRIORITY, ipfw_keepalive_dispatch);
	ctx->ipfw_keepalive_anch.st_type = O_ANCHOR;
	netmsg_init(&ctx->ipfw_keepalive_more, NULL, &netisr_adone_rport,
	    MSGF_DROPABLE, ipfw_keepalive_more_dispatch);

	ipfw_ctx[mycpuid] = ctx;

	/* Build the default rule. */
	def_rule = kmalloc(sizeof(*def_rule), M_IPFW, M_WAITOK | M_ZERO);

	def_rule->act_ofs = 0;
	def_rule->rulenum = IPFW_DEFAULT_RULE;
	def_rule->cmd_len = 1;
	def_rule->set = IPFW_DEFAULT_SET;

	def_rule->cmd[0].len = 1;
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	def_rule->cmd[0].opcode = O_ACCEPT;
#else
	if (filters_default_to_accept)
		def_rule->cmd[0].opcode = O_ACCEPT;
	else
		def_rule->cmd[0].opcode = O_DENY;
#endif

	def_rule->refcnt = 1;
	def_rule->cpuid = mycpuid;

	/* Install the default rule */
	ctx->ipfw_default_rule = def_rule;
	ctx->ipfw_layer3_chain = def_rule;

	/* Link rule CPU sibling */
	ipfw_link_sibling(fwmsg, def_rule);

	/* Statistics only need to be updated once */
	if (mycpuid == 0)
		ipfw_inc_static_count(def_rule);

	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
}

/*
 * Netisr0 handler for module load: set up the global track tree, the
 * per-CPU contexts and the function pointers, then arm the periodic
 * callouts and hook pfil if the firewall is enabled.
 */
static void
ipfw_init_dispatch(netmsg_t nmsg)
{
	struct netmsg_ipfw fwmsg;
	int error = 0, cpu;

	ASSERT_NETISR0;

	if (IPFW_LOADED) {
		kprintf("IP firewall already loaded\n");
		error = EEXIST;
		goto reply;
	}

	/* Initialize global track tree. */
	RB_INIT(&ipfw_gd.ipfw_trkcnt_tree);
	IPFW_TRKCNT_TOKINIT;

	ipfw_state_max_set(ipfw_state_max);
	ipfw_state_headroom = 8 * netisr_ncpus;

	/* Create the per-CPU contexts on all netisr CPUs. */
	bzero(&fwmsg, sizeof(fwmsg));
	netmsg_init(&fwmsg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
	    ipfw_ctx_init_dispatch);
	netisr_domsg_global(&fwmsg.base);

	ip_fw_chk_ptr = ipfw_chk;
	ip_fw_ctl_ptr = ipfw_ctl;
	ip_fw_dn_io_ptr = ipfw_dummynet_io;

	kprintf("ipfw2 initialized, default to %s, logging ",
	    ipfw_ctx[mycpuid]->ipfw_default_rule->cmd[0].opcode ==
	    O_ACCEPT ?
"accept" : "deny"); 5422 5423 #ifdef IPFIREWALL_VERBOSE 5424 fw_verbose = 1; 5425 #endif 5426 #ifdef IPFIREWALL_VERBOSE_LIMIT 5427 verbose_limit = IPFIREWALL_VERBOSE_LIMIT; 5428 #endif 5429 if (fw_verbose == 0) { 5430 kprintf("disabled\n"); 5431 } else if (verbose_limit == 0) { 5432 kprintf("unlimited\n"); 5433 } else { 5434 kprintf("limited to %d packets/entry by default\n", 5435 verbose_limit); 5436 } 5437 5438 ip_fw_loaded = 1; 5439 for (cpu = 0; cpu < netisr_ncpus; ++cpu) { 5440 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_stateto_ch, hz, 5441 ipfw_state_expire_ipifunc, NULL, cpu); 5442 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_trackto_ch, hz, 5443 ipfw_track_expire_ipifunc, NULL, cpu); 5444 callout_reset_bycpu(&ipfw_ctx[cpu]->ipfw_keepalive_ch, hz, 5445 ipfw_keepalive, NULL, cpu); 5446 } 5447 5448 if (fw_enable) 5449 ipfw_hook(); 5450 reply: 5451 netisr_replymsg(&nmsg->base, error); 5452 } 5453 5454 static int 5455 ipfw_init(void) 5456 { 5457 struct netmsg_base smsg; 5458 5459 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY, 5460 ipfw_init_dispatch); 5461 return netisr_domsg(&smsg, 0); 5462 } 5463 5464 #ifdef KLD_MODULE 5465 5466 static void 5467 ipfw_ctx_fini_dispatch(netmsg_t nmsg) 5468 { 5469 struct ipfw_context *ctx = ipfw_ctx[mycpuid]; 5470 5471 ASSERT_NETISR_NCPUS(mycpuid); 5472 5473 callout_stop_sync(&ctx->ipfw_stateto_ch); 5474 callout_stop_sync(&ctx->ipfw_trackto_ch); 5475 callout_stop_sync(&ctx->ipfw_keepalive_ch); 5476 5477 crit_enter(); 5478 netisr_dropmsg(&ctx->ipfw_stateexp_more); 5479 netisr_dropmsg(&ctx->ipfw_stateexp_nm); 5480 netisr_dropmsg(&ctx->ipfw_trackexp_more); 5481 netisr_dropmsg(&ctx->ipfw_trackexp_nm); 5482 netisr_dropmsg(&ctx->ipfw_keepalive_more); 5483 netisr_dropmsg(&ctx->ipfw_keepalive_nm); 5484 crit_exit(); 5485 5486 netisr_forwardmsg(&nmsg->base, mycpuid + 1); 5487 } 5488 5489 static void 5490 ipfw_fini_dispatch(netmsg_t nmsg) 5491 { 5492 struct netmsg_base nm; 5493 int error = 0, cpu; 5494 5495 ASSERT_NETISR0; 5496 
5497 if (ipfw_gd.ipfw_refcnt != 0) { 5498 error = EBUSY; 5499 goto reply; 5500 } 5501 5502 ip_fw_loaded = 0; 5503 ipfw_dehook(); 5504 5505 /* Synchronize any inflight state/track expire IPIs. */ 5506 lwkt_synchronize_ipiqs("ipfwfini"); 5507 5508 netmsg_init(&nm, NULL, &curthread->td_msgport, MSGF_PRIORITY, 5509 ipfw_ctx_fini_dispatch); 5510 netisr_domsg_global(&nm); 5511 5512 ip_fw_chk_ptr = NULL; 5513 ip_fw_ctl_ptr = NULL; 5514 ip_fw_dn_io_ptr = NULL; 5515 ipfw_flush(1 /* kill default rule */); 5516 5517 /* Free pre-cpu context */ 5518 for (cpu = 0; cpu < netisr_ncpus; ++cpu) 5519 kfree(ipfw_ctx[cpu], M_IPFW); 5520 5521 kprintf("IP firewall unloaded\n"); 5522 reply: 5523 netisr_replymsg(&nmsg->base, error); 5524 } 5525 5526 static int 5527 ipfw_fini(void) 5528 { 5529 struct netmsg_base smsg; 5530 5531 netmsg_init(&smsg, NULL, &curthread->td_msgport, MSGF_PRIORITY, 5532 ipfw_fini_dispatch); 5533 return netisr_domsg(&smsg, 0); 5534 } 5535 5536 #endif /* KLD_MODULE */ 5537 5538 static int 5539 ipfw_modevent(module_t mod, int type, void *unused) 5540 { 5541 int err = 0; 5542 5543 switch (type) { 5544 case MOD_LOAD: 5545 err = ipfw_init(); 5546 break; 5547 5548 case MOD_UNLOAD: 5549 #ifndef KLD_MODULE 5550 kprintf("ipfw statically compiled, cannot unload\n"); 5551 err = EBUSY; 5552 #else 5553 err = ipfw_fini(); 5554 #endif 5555 break; 5556 default: 5557 break; 5558 } 5559 return err; 5560 } 5561 5562 static moduledata_t ipfwmod = { 5563 "ipfw", 5564 ipfw_modevent, 5565 0 5566 }; 5567 DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PROTO_END, SI_ORDER_ANY); 5568 MODULE_VERSION(ipfw, 1); 5569