1 /* $NetBSD: npf_state.c,v 1.3 2011/01/18 20:33:46 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2010 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This material is based upon work partially supported by The 8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * NPF state engine to track connections. 34 */ 35 36 #include <sys/cdefs.h> 37 __KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.3 2011/01/18 20:33:46 rmind Exp $"); 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 42 #include <sys/mutex.h> 43 #include <netinet/in.h> 44 #include <netinet/tcp.h> 45 #include <netinet/tcp_seq.h> 46 #include <netinet/tcp_fsm.h> 47 48 #include "npf_impl.h" 49 50 /* TCP session expiration table. */ 51 static const u_int tcp_expire_table[ ] __read_mostly = { 52 /* Initial synchronisation. Timeout: 30 sec and 1 minute. */ 53 [TCPS_SYN_SENT] = 30, 54 [TCPS_SYN_RECEIVED] = 60, 55 /* Established (synchronised). Timeout: 24 hours. */ 56 [TCPS_ESTABLISHED] = 60 * 60 * 24, 57 [TCPS_FIN_WAIT_1] = 60 * 60 * 24, 58 [TCPS_FIN_WAIT_2] = 60 * 60 * 24, 59 /* UNUSED [TCPS_CLOSE_WAIT] = 60 * 60 * 24, */ 60 /* Closure. Timeout: 4 minutes (2 * MSL). */ 61 [TCPS_CLOSING] = 60 * 4, 62 [TCPS_LAST_ACK] = 60 * 4, 63 [TCPS_TIME_WAIT] = 60 * 4, 64 /* Fully closed. Timeout immediately. */ 65 [TCPS_CLOSED] = 0 66 }; 67 68 /* Session expiration table. */ 69 static const u_int expire_table[ ] __read_mostly = { 70 [IPPROTO_UDP] = 60, /* 1 min */ 71 [IPPROTO_ICMP] = 30 /* 30 sec */ 72 }; 73 74 #define MAXACKWINDOW 66000 75 76 static bool 77 npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, 78 const bool forw) 79 { 80 const struct tcphdr * const th = &npc->npc_l4.tcp; 81 const int tcpfl = th->th_flags; 82 npf_tcpstate_t *fstate, *tstate; 83 int tcpdlen, wscale, ackskew; 84 tcp_seq seq, ack, end; 85 uint32_t win; 86 87 KASSERT(npf_iscached(npc, NPC_TCP)); 88 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win); 89 end = seq + tcpdlen; 90 if (tcpfl & TH_SYN) { 91 end++; 92 } 93 if (tcpfl & TH_FIN) { 94 end++; 95 } 96 97 /* 98 * Perform SEQ/ACK numbers check against boundaries. Reference: 99 * 100 * Rooij G., "Real stateful TCP packet filtering in IP Filter", 101 * 10th USENIX Security Symposium invited talk, Aug. 2001. 102 */ 103 104 fstate = &nst->nst_tcpst[forw ? 0 : 1]; 105 tstate = &nst->nst_tcpst[forw ? 1 : 0]; 106 win = win ? (win << fstate->nst_wscale) : 1; 107 108 if (tcpfl == TH_SYN) { 109 /* 110 * First SYN or re-transmission of SYN. Initialize all 111 * values. State of other side will get set with a SYN-ACK 112 * reply (see below). 113 */ 114 fstate->nst_seqend = end; 115 fstate->nst_ackend = end; 116 fstate->nst_maxwin = win; 117 tstate->nst_ackend = 0; 118 tstate->nst_ackend = 0; 119 tstate->nst_maxwin = 0; 120 /* 121 * Handle TCP Window Scaling (RFC 1323). Both sides may 122 * send this option in their SYN packets. 123 */ 124 if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) { 125 fstate->nst_wscale = wscale; 126 } else { 127 fstate->nst_wscale = 0; 128 } 129 tstate->nst_wscale = 0; 130 /* Done. */ 131 return true; 132 } 133 if (fstate->nst_seqend == 0) { 134 /* 135 * Should be a SYN-ACK reply to SYN. If SYN is not set, 136 * then we are in the middle connection and lost tracking. 137 */ 138 fstate->nst_seqend = end; 139 fstate->nst_ackend = end + 1; 140 fstate->nst_maxwin = 1; 141 142 /* Handle TCP Window Scaling (must be ignored if no SYN). */ 143 if (tcpfl & TH_SYN) { 144 fstate->nst_wscale = 145 npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ? 146 wscale : 0; 147 } 148 } 149 if ((tcpfl & TH_ACK) == 0) { 150 /* Pretend that an ACK was sent. */ 151 ack = tstate->nst_seqend; 152 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) { 153 /* Workaround for some TCP stacks. */ 154 ack = tstate->nst_seqend; 155 } 156 if (seq == end) { 157 /* If packet contains no data - assume it is valid. */ 158 end = fstate->nst_seqend; 159 seq = end; 160 } 161 162 /* 163 * Determine whether the data is within previously noted window, 164 * that is, upper boundary for valid data (I). 165 */ 166 if (!SEQ_GEQ(fstate->nst_ackend, end)) { 167 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1); 168 return false; 169 } 170 /* Lower boundary (II), which is no more than one window back. */ 171 if (!SEQ_GEQ(seq, fstate->nst_seqend - tstate->nst_maxwin)) { 172 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2); 173 return false; 174 } 175 /* 176 * Boundaries for valid acknowledgments (III, IV) - on predicted 177 * window up or down, since packets may be fragmented. 178 */ 179 ackskew = tstate->nst_seqend - ack; 180 if (ackskew < -MAXACKWINDOW || ackskew > MAXACKWINDOW) { 181 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3); 182 return false; 183 } 184 185 /* 186 * Packet is passed now. 187 * 188 * Negative ackskew might be due to fragmented packets. Since the 189 * total length of the packet is unknown - bump the boundary. 190 */ 191 if (ackskew < 0) { 192 tstate->nst_seqend = end; 193 } 194 /* Keep track of the maximum window seen. */ 195 if (fstate->nst_maxwin < win) { 196 fstate->nst_maxwin = win; 197 } 198 if (SEQ_GT(end, fstate->nst_seqend)) { 199 fstate->nst_seqend = end; 200 } 201 /* Note the window for upper boundary. */ 202 if (SEQ_GEQ(ack + win, tstate->nst_ackend)) { 203 tstate->nst_ackend = ack + win; 204 } 205 return true; 206 } 207 208 static inline bool 209 npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, 210 const bool forw) 211 { 212 const struct tcphdr * const th = &npc->npc_l4.tcp; 213 const int tcpfl = th->th_flags, state = nst->nst_state; 214 #if 0 215 /* Determine whether TCP packet really belongs to this connection. */ 216 if (!npf_tcp_inwindow(npc, nbuf, nst, forw)) { 217 return false; 218 } 219 #endif 220 /* 221 * Handle 3-way handshake (SYN -> SYN,ACK -> ACK), connection 222 * reset (RST), half-open connections, connection closure, etc. 223 */ 224 if (__predict_false(tcpfl & TH_RST)) { 225 nst->nst_state = TCPS_CLOSED; 226 return true; 227 } 228 switch (state) { 229 case TCPS_ESTABLISHED: 230 case TCPS_FIN_WAIT_2: 231 /* Common case - connection is established. */ 232 if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK) { 233 return true; 234 } 235 /* Otherwise, can only be a FIN. */ 236 if ((tcpfl & TH_FIN) == 0) { 237 break; 238 } 239 /* XXX see below TCPS_CLOSE_WAIT */ 240 if (state != TCPS_FIN_WAIT_2) { 241 /* First FIN: closure of one end. */ 242 nst->nst_state = TCPS_FIN_WAIT_1; 243 } else { 244 /* Second FIN: connection closure, wait for ACK. */ 245 nst->nst_state = TCPS_LAST_ACK; 246 } 247 return true; 248 case TCPS_SYN_SENT: 249 /* After SYN expecting SYN-ACK. */ 250 if (tcpfl == (TH_SYN | TH_ACK) && !forw) { 251 /* Received backwards SYN-ACK. */ 252 nst->nst_state = TCPS_SYN_RECEIVED; 253 return true; 254 } 255 if (tcpfl == TH_SYN && forw) { 256 /* Re-transmission of SYN. */ 257 return true; 258 } 259 break; 260 case TCPS_SYN_RECEIVED: 261 /* SYN-ACK was seen, expecting ACK. */ 262 if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK) { 263 /* ACK - establish connection. */ 264 nst->nst_state = TCPS_ESTABLISHED; 265 return true; 266 } 267 if (tcpfl == (TH_SYN | TH_ACK)) { 268 /* Re-transmission of SYN-ACK. */ 269 return true; 270 } 271 break; 272 case TCPS_CLOSE_WAIT: 273 /* UNUSED */ 274 case TCPS_FIN_WAIT_1: 275 /* 276 * XXX: FIN re-transmission is not handled, use TCPS_CLOSE_WAIT. 277 */ 278 /* 279 * First FIN was seen, expecting ACK. However, we may receive 280 * a simultaneous FIN or exchange of FINs with FIN-ACK. 281 */ 282 if ((tcpfl & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) { 283 /* Exchange of FINs with ACK. Wait for last ACK. */ 284 nst->nst_state = TCPS_LAST_ACK; 285 return true; 286 } else if (tcpfl & TH_ACK) { 287 /* ACK of first FIN. */ 288 nst->nst_state = TCPS_FIN_WAIT_2; 289 return true; 290 } else if (tcpfl & TH_FIN) { 291 /* Simultaneous FIN. Need to wait for ACKs. */ 292 nst->nst_state = TCPS_CLOSING; 293 return true; 294 } 295 break; 296 case TCPS_CLOSING: 297 case TCPS_LAST_ACK: 298 case TCPS_TIME_WAIT: 299 /* Expecting only ACK. */ 300 if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) != TH_ACK) { 301 return false; 302 } 303 switch (state) { 304 case TCPS_CLOSING: 305 /* One ACK noted, wait for last one. */ 306 nst->nst_state = TCPS_LAST_ACK; 307 break; 308 case TCPS_LAST_ACK: 309 /* Last ACK received, quiet wait now. */ 310 nst->nst_state = TCPS_TIME_WAIT; 311 break; 312 } 313 return true; 314 case TCPS_CLOSED: 315 /* XXX: Drop or pass? */ 316 break; 317 default: 318 npf_state_dump(nst); 319 KASSERT(false); 320 } 321 return false; 322 } 323 324 bool 325 npf_state_init(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst) 326 { 327 const int proto = npf_cache_ipproto(npc); 328 329 KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4)); 330 331 mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET); 332 333 if (proto == IPPROTO_TCP) { 334 const struct tcphdr *th = &npc->npc_l4.tcp; 335 336 /* TCP case: must be SYN. */ 337 KASSERT(npf_iscached(npc, NPC_TCP)); 338 if (th->th_flags != TH_SYN) { 339 npf_stats_inc(NPF_STAT_INVALID_STATE); 340 return false; 341 } 342 /* Initial values for TCP window and sequence tracking. */ 343 if (!npf_tcp_inwindow(npc, nbuf, nst, true)) { 344 npf_stats_inc(NPF_STAT_INVALID_STATE); 345 return false; 346 } 347 } 348 349 /* 350 * Initial state: SYN sent, waiting for response from the other side. 351 * Note: for UDP or ICMP, reuse SYN-sent flag to note response. 352 */ 353 nst->nst_state = TCPS_SYN_SENT; 354 return true; 355 } 356 357 void 358 npf_state_destroy(npf_state_t *nst) 359 { 360 361 mutex_destroy(&nst->nst_lock); 362 } 363 364 bool 365 npf_state_inspect(const npf_cache_t *npc, nbuf_t *nbuf, 366 npf_state_t *nst, const bool forw) 367 { 368 const int proto = npf_cache_ipproto(npc); 369 bool ret; 370 371 mutex_enter(&nst->nst_lock); 372 switch (proto) { 373 case IPPROTO_TCP: 374 /* Handle TCP. */ 375 ret = npf_state_tcp(npc, nbuf, nst, forw); 376 break; 377 default: 378 /* 379 * Handle UDP or ICMP response for opening session. 380 */ 381 if (nst->nst_state == TCPS_SYN_SENT && !forw) { 382 nst->nst_state= TCPS_ESTABLISHED; 383 } 384 ret = true; 385 } 386 mutex_exit(&nst->nst_lock); 387 if (__predict_false(!ret)) { 388 npf_stats_inc(NPF_STAT_INVALID_STATE); 389 } 390 return ret; 391 } 392 393 /* 394 * npf_state_etime: return session expiration time according to the state. 395 */ 396 int 397 npf_state_etime(const npf_state_t *nst, const int proto) 398 { 399 const int state = nst->nst_state; 400 401 if (__predict_true(proto == IPPROTO_TCP)) { 402 return tcp_expire_table[state]; 403 } 404 return expire_table[proto]; 405 } 406 407 #if defined(DDB) || defined(_NPF_TESTING) 408 409 void 410 npf_state_dump(npf_state_t *nst) 411 { 412 npf_tcpstate_t *fst = &nst->nst_tcpst[0], *tst = &nst->nst_tcpst[1]; 413 414 printf("\tstate (%p) %d:\n\t\t" 415 "F { seqend %u ackend %u mwin %u wscale %u }\n\t\t" 416 "T { seqend %u ackend %u mwin %u wscale %u }\n", 417 nst, nst->nst_state, 418 fst->nst_seqend, fst->nst_ackend, fst->nst_maxwin, fst->nst_wscale, 419 tst->nst_seqend, tst->nst_ackend, tst->nst_maxwin, tst->nst_wscale 420 ); 421 } 422 423 #endif 424