1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 36 * 37 * License terms: all terms for the DragonFly license above plus the following: 38 * 39 * 4. All advertising materials mentioning features or use of this software 40 * must display the following acknowledgement: 41 * 42 * This product includes software developed by Jeffrey M. Hsu 43 * for the DragonFly Project. 44 * 45 * This requirement may be waived with permission from Jeffrey Hsu. 46 * This requirement will sunset and may be removed on July 8 2005, 47 * after which the standard DragonFly license (as shown above) will 48 * apply. 49 */ 50 51 /* 52 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 53 * The Regents of the University of California. All rights reserved. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 3. All advertising materials mentioning features or use of this software 64 * must display the following acknowledgement: 65 * This product includes software developed by the University of 66 * California, Berkeley and its contributors. 67 * 4. Neither the name of the University nor the names of its contributors 68 * may be used to endorse or promote products derived from this software 69 * without specific prior written permission. 70 * 71 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 * 83 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 84 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.14 2003/02/03 02:33:41 hsu Exp $ 85 * $DragonFly: src/sys/netinet/tcp_timer.c,v 1.13 2004/12/21 02:54:15 hsu Exp $ 86 */ 87 88 #include "opt_compat.h" 89 #include "opt_inet6.h" 90 #include "opt_tcpdebug.h" 91 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/kernel.h> 95 #include <sys/mbuf.h> 96 #include <sys/sysctl.h> 97 #include <sys/socket.h> 98 #include <sys/socketvar.h> 99 #include <sys/protosw.h> 100 #include <sys/thread.h> 101 #include <sys/globaldata.h> 102 103 #include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 104 105 #include <net/route.h> 106 107 #include <netinet/in.h> 108 #include <netinet/in_systm.h> 109 #include <netinet/in_pcb.h> 110 #ifdef INET6 111 #include <netinet6/in6_pcb.h> 112 #endif 113 #include <netinet/ip_var.h> 114 #include <netinet/tcp.h> 115 #include <netinet/tcp_fsm.h> 116 #include <netinet/tcp_seq.h> 117 #include <netinet/tcp_timer.h> 118 #include <netinet/tcp_var.h> 119 #include <netinet/tcpip.h> 120 #ifdef TCPDEBUG 121 #include <netinet/tcp_debug.h> 122 #endif 123 124 static int 125 sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) 126 { 127 int error, s, tt; 128 129 tt = *(int *)oidp->oid_arg1; 130 s = (int)((int64_t)tt * 1000 / hz); 131 132 error = sysctl_handle_int(oidp, &s, 0, req); 133 if (error || !req->newptr) 134 return (error); 135 136 tt = (int)((int64_t)s * hz / 1000); 137 if (tt < 1) 138 return (EINVAL); 139 140 *(int *)oidp->oid_arg1 = tt; 141 return (0); 142 } 143 144 int tcp_keepinit; 145 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 146 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 147 148 int tcp_keepidle; 149 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 150 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 151 152 int tcp_keepintvl; 153 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 154 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 155 156 int tcp_delacktime; 157 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 158 CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 159 "Time before a delayed ACK is sent"); 160 161 int tcp_msl; 162 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 163 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 164 165 int tcp_rexmit_min; 166 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 167 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); 168 169 int tcp_rexmit_slop; 170 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 171 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); 172 173 static int always_keepalive = 0; 174 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 175 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 176 177 static int tcp_keepcnt = TCPTV_KEEPCNT; 178 /* max idle probes */ 179 int tcp_maxpersistidle; 180 /* max idle time in persist */ 181 int tcp_maxidle; 182 183 /* 184 * Tcp protocol timeout routine called every 500 ms. 185 * Updates timestamps used for TCP 186 * causes finite state machine actions if timers expire. 187 */ 188 void 189 tcp_slowtimo(void) 190 { 191 int s; 192 193 s = splnet(); 194 195 tcp_maxidle = tcp_keepcnt * tcp_keepintvl; 196 197 splx(s); 198 } 199 200 /* 201 * Cancel all timers for TCP tp. 202 */ 203 void 204 tcp_canceltimers(struct tcpcb *tp) 205 { 206 callout_stop(tp->tt_2msl); 207 callout_stop(tp->tt_persist); 208 callout_stop(tp->tt_keep); 209 callout_stop(tp->tt_rexmt); 210 } 211 212 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 213 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 214 215 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 216 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; 217 218 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ 219 220 /* 221 * TCP timer processing. 222 */ 223 void 224 tcp_timer_delack(void *xtp) 225 { 226 struct tcpcb *tp = xtp; 227 int s; 228 229 s = splnet(); 230 if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { 231 splx(s); 232 return; 233 } 234 callout_deactivate(tp->tt_delack); 235 236 tp->t_flags |= TF_ACKNOW; 237 tcpstat.tcps_delack++; 238 tcp_output(tp); 239 splx(s); 240 } 241 242 void 243 tcp_timer_2msl(void *xtp) 244 { 245 struct tcpcb *tp = xtp; 246 int s; 247 #ifdef TCPDEBUG 248 int ostate; 249 250 ostate = tp->t_state; 251 #endif 252 s = splnet(); 253 if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { 254 splx(s); 255 return; 256 } 257 callout_deactivate(tp->tt_2msl); 258 /* 259 * 2 MSL timeout in shutdown went off. If we're closed but 260 * still waiting for peer to close and connection has been idle 261 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 262 * control block. Otherwise, check again in a bit. 263 */ 264 if (tp->t_state != TCPS_TIME_WAIT && 265 (ticks - tp->t_rcvtime) <= tcp_maxidle) 266 callout_reset(tp->tt_2msl, tcp_keepintvl, 267 tcp_timer_2msl, tp); 268 else 269 tp = tcp_close(tp); 270 271 #ifdef TCPDEBUG 272 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 273 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 274 #endif 275 splx(s); 276 } 277 278 void 279 tcp_timer_keep(void *xtp) 280 { 281 struct tcpcb *tp = xtp; 282 struct tcptemp *t_template; 283 int s; 284 #ifdef TCPDEBUG 285 int ostate; 286 287 ostate = tp->t_state; 288 #endif 289 s = splnet(); 290 if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { 291 splx(s); 292 return; 293 } 294 callout_deactivate(tp->tt_keep); 295 /* 296 * Keep-alive timer went off; send something 297 * or drop connection if idle for too long. 298 */ 299 tcpstat.tcps_keeptimeo++; 300 if (tp->t_state < TCPS_ESTABLISHED) 301 goto dropit; 302 if ((always_keepalive || 303 tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && 304 tp->t_state <= TCPS_CLOSING) { 305 if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) 306 goto dropit; 307 /* 308 * Send a packet designed to force a response 309 * if the peer is up and reachable: 310 * either an ACK if the connection is still alive, 311 * or an RST if the peer has closed the connection 312 * due to timeout or reboot. 313 * Using sequence number tp->snd_una-1 314 * causes the transmitted zero-length segment 315 * to lie outside the receive window; 316 * by the protocol spec, this requires the 317 * correspondent TCP to respond. 318 */ 319 tcpstat.tcps_keepprobe++; 320 t_template = tcp_maketemplate(tp); 321 if (t_template) { 322 tcp_respond(tp, t_template->tt_ipgen, 323 &t_template->tt_t, (struct mbuf *)NULL, 324 tp->rcv_nxt, tp->snd_una - 1, 0); 325 tcp_freetemplate(t_template); 326 } 327 callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); 328 } else 329 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 330 331 #ifdef TCPDEBUG 332 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 333 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 334 #endif 335 splx(s); 336 return; 337 338 dropit: 339 tcpstat.tcps_keepdrops++; 340 tp = tcp_drop(tp, ETIMEDOUT); 341 342 #ifdef TCPDEBUG 343 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 344 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 345 #endif 346 splx(s); 347 } 348 349 void 350 tcp_timer_persist(void *xtp) 351 { 352 struct tcpcb *tp = xtp; 353 int s; 354 #ifdef TCPDEBUG 355 int ostate; 356 357 ostate = tp->t_state; 358 #endif 359 s = splnet(); 360 if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ 361 splx(s); 362 return; 363 } 364 callout_deactivate(tp->tt_persist); 365 /* 366 * Persistance timer into zero window. 367 * Force a byte to be output, if possible. 368 */ 369 tcpstat.tcps_persisttimeo++; 370 /* 371 * Hack: if the peer is dead/unreachable, we do not 372 * time out if the window is closed. After a full 373 * backoff, drop the connection if the idle time 374 * (no responses to probes) reaches the maximum 375 * backoff that we would use if retransmitting. 376 */ 377 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 378 ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || 379 (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 380 tcpstat.tcps_persistdrop++; 381 tp = tcp_drop(tp, ETIMEDOUT); 382 goto out; 383 } 384 tcp_setpersist(tp); 385 tp->t_flags |= TF_FORCE; 386 tcp_output(tp); 387 tp->t_flags &= ~TF_FORCE; 388 389 out: 390 #ifdef TCPDEBUG 391 if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 392 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 393 #endif 394 splx(s); 395 } 396 397 void 398 tcp_save_congestion_state(struct tcpcb *tp) 399 { 400 tp->snd_cwnd_prev = tp->snd_cwnd; 401 tp->snd_ssthresh_prev = tp->snd_ssthresh; 402 tp->snd_recover_prev = tp->snd_recover; 403 if (IN_FASTRECOVERY(tp)) 404 tp->t_flags |= TF_WASFRECOVERY; 405 else 406 tp->t_flags &= ~TF_WASFRECOVERY; 407 if (tp->t_flags & TF_RCVD_TSTMP) { 408 tp->t_rexmtTS = ticks; 409 tp->t_flags |= TF_FIRSTACCACK; 410 } 411 #ifdef later 412 tcp_sack_save_scoreboard(&tp->scb); 413 #endif 414 } 415 416 void 417 tcp_revert_congestion_state(struct tcpcb *tp) 418 { 419 tp->snd_cwnd = tp->snd_cwnd_prev; 420 tp->snd_ssthresh = tp->snd_ssthresh_prev; 421 tp->snd_recover = tp->snd_recover_prev; 422 if (tp->t_flags & TF_WASFRECOVERY) 423 ENTER_FASTRECOVERY(tp); 424 if (tp->t_flags & TF_FASTREXMT) { 425 ++tcpstat.tcps_sndfastrexmitbad; 426 if (tp->t_flags & TF_EARLYREXMT) 427 ++tcpstat.tcps_sndearlyrexmitbad; 428 } else 429 ++tcpstat.tcps_sndrtobad; 430 tp->t_badrxtwin = 0; 431 tp->t_rxtshift = 0; 432 tp->snd_nxt = tp->snd_max; 433 #ifdef later 434 tcp_sack_revert_scoreboard(&tp->scb, tp->snd_una); 435 #endif 436 } 437 438 void 439 tcp_timer_rexmt(void *xtp) 440 { 441 struct tcpcb *tp = xtp; 442 int s; 443 int rexmt; 444 #ifdef TCPDEBUG 445 int ostate; 446 447 ostate = tp->t_state; 448 #endif 449 s = splnet(); 450 if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { 451 splx(s); 452 return; 453 } 454 callout_deactivate(tp->tt_rexmt); 455 /* 456 * Retransmission timer went off. Message has not 457 * been acked within retransmit interval. Back off 458 * to a longer retransmit interval and retransmit one segment. 459 */ 460 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 461 tp->t_rxtshift = TCP_MAXRXTSHIFT; 462 tcpstat.tcps_timeoutdrop++; 463 tp = tcp_drop(tp, tp->t_softerror ? 464 tp->t_softerror : ETIMEDOUT); 465 goto out; 466 } 467 if (tp->t_rxtshift == 1) { 468 /* 469 * first retransmit; record ssthresh and cwnd so they can 470 * be recovered if this turns out to be a "bad" retransmit. 471 * A retransmit is considered "bad" if an ACK for this 472 * segment is received within RTT/2 interval; the assumption 473 * here is that the ACK was already in flight. See 474 * "On Estimating End-to-End Network Path Properties" by 475 * Allman and Paxson for more details. 476 */ 477 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 478 tcp_save_congestion_state(tp); 479 tp->t_flags &= ~(TF_FASTREXMT | TF_EARLYREXMT); 480 } 481 /* Throw away SACK blocks on a RTO, as specified by RFC2018. */ 482 tcp_sack_cleanup(&tp->scb); 483 tcpstat.tcps_rexmttimeo++; 484 if (tp->t_state == TCPS_SYN_SENT) 485 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 486 else 487 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 488 TCPT_RANGESET(tp->t_rxtcur, rexmt, 489 tp->t_rttmin, TCPTV_REXMTMAX); 490 /* 491 * Disable rfc1323 and rfc1644 if we havn't got any response to 492 * our third SYN to work-around some broken terminal servers 493 * (most of which have hopefully been retired) that have bad VJ 494 * header compression code which trashes TCP segments containing 495 * unknown-to-them TCP options. 496 */ 497 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) 498 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); 499 /* 500 * If losing, let the lower level know and try for 501 * a better route. Also, if we backed off this far, 502 * our srtt estimate is probably bogus. Clobber it 503 * so we'll take the next rtt measurement as our srtt; 504 * move the current srtt into rttvar to keep the current 505 * retransmit times until then. 506 */ 507 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 508 #ifdef INET6 509 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 510 in6_losing(tp->t_inpcb); 511 else 512 #endif 513 in_losing(tp->t_inpcb); 514 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 515 tp->t_srtt = 0; 516 } 517 tp->snd_nxt = tp->snd_una; 518 tp->rexmt_high = tp->snd_una; 519 tp->snd_recover = tp->snd_max; 520 /* 521 * Force a segment to be sent. 522 */ 523 tp->t_flags |= TF_ACKNOW; 524 /* 525 * If timing a segment in this window, stop the timer. 526 */ 527 tp->t_rtttime = 0; 528 /* 529 * Close the congestion window down to one segment 530 * (we'll open it by one segment for each ack we get). 531 * Since we probably have a window's worth of unacked 532 * data accumulated, this "slow start" keeps us from 533 * dumping all that data as back-to-back packets (which 534 * might overwhelm an intermediate gateway). 535 * 536 * There are two phases to the opening: Initially we 537 * open by one mss on each ack. This makes the window 538 * size increase exponentially with time. If the 539 * window is larger than the path can handle, this 540 * exponential growth results in dropped packet(s) 541 * almost immediately. To get more time between 542 * drops but still "push" the network to take advantage 543 * of improving conditions, we switch from exponential 544 * to linear window opening at some threshhold size. 545 * For a threshhold, we use half the current window 546 * size, truncated to a multiple of the mss. 547 * 548 * (the minimum cwnd that will give us exponential 549 * growth is 2 mss. We don't allow the threshhold 550 * to go below this.) 551 */ 552 { 553 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 554 if (win < 2) 555 win = 2; 556 tp->snd_cwnd = tp->t_maxseg; 557 tp->snd_ssthresh = win * tp->t_maxseg; 558 tp->t_dupacks = 0; 559 } 560 EXIT_FASTRECOVERY(tp); 561 tcp_output(tp); 562 563 out: 564 #ifdef TCPDEBUG 565 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 566 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 567 #endif 568 splx(s); 569 } 570