1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 36 * 37 * License terms: all terms for the DragonFly license above plus the following: 38 * 39 * 4. All advertising materials mentioning features or use of this software 40 * must display the following acknowledgement: 41 * 42 * This product includes software developed by Jeffrey M. Hsu 43 * for the DragonFly Project. 44 * 45 * This requirement may be waived with permission from Jeffrey Hsu. 46 * This requirement will sunset and may be removed on July 8 2005, 47 * after which the standard DragonFly license (as shown above) will 48 * apply. 49 */ 50 51 /* 52 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 53 * The Regents of the University of California. All rights reserved. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 3. All advertising materials mentioning features or use of this software 64 * must display the following acknowledgement: 65 * This product includes software developed by the University of 66 * California, Berkeley and its contributors. 67 * 4. Neither the name of the University nor the names of its contributors 68 * may be used to endorse or promote products derived from this software 69 * without specific prior written permission. 70 * 71 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 * 83 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 84 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.14 2003/02/03 02:33:41 hsu Exp $ 85 * $DragonFly: src/sys/netinet/tcp_timer.c,v 1.11 2004/07/08 22:07:35 hsu Exp $ 86 */ 87 88 #include "opt_compat.h" 89 #include "opt_inet6.h" 90 #include "opt_tcpdebug.h" 91 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/kernel.h> 95 #include <sys/mbuf.h> 96 #include <sys/sysctl.h> 97 #include <sys/socket.h> 98 #include <sys/socketvar.h> 99 #include <sys/protosw.h> 100 #include <sys/thread.h> 101 #include <sys/globaldata.h> 102 103 #include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ 104 105 #include <net/route.h> 106 107 #include <netinet/in.h> 108 #include <netinet/in_systm.h> 109 #include <netinet/in_pcb.h> 110 #ifdef INET6 111 #include <netinet6/in6_pcb.h> 112 #endif 113 #include <netinet/ip_var.h> 114 #include <netinet/tcp.h> 115 #include <netinet/tcp_fsm.h> 116 #include <netinet/tcp_seq.h> 117 #include <netinet/tcp_timer.h> 118 #include <netinet/tcp_var.h> 119 #include <netinet/tcpip.h> 120 #ifdef TCPDEBUG 121 #include <netinet/tcp_debug.h> 122 #endif 123 124 static int 125 sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) 126 { 127 int error, s, tt; 128 129 tt = *(int *)oidp->oid_arg1; 130 s = (int)((int64_t)tt * 1000 / hz); 131 132 error = sysctl_handle_int(oidp, &s, 0, req); 133 if (error || !req->newptr) 134 return (error); 135 136 tt = (int)((int64_t)s * hz / 1000); 137 if (tt < 1) 138 return (EINVAL); 139 140 *(int *)oidp->oid_arg1 = tt; 141 return (0); 142 } 143 144 int tcp_keepinit; 145 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 146 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); 147 148 int tcp_keepidle; 149 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 150 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); 151 152 int tcp_keepintvl; 153 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 154 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); 155 156 int tcp_delacktime; 157 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 158 CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 159 "Time before a delayed ACK is sent"); 160 161 int tcp_msl; 162 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 163 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 164 165 int tcp_rexmit_min; 166 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 167 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); 168 169 int tcp_rexmit_slop; 170 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 171 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); 172 173 static int always_keepalive = 0; 174 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 175 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 176 177 static int tcp_keepcnt = TCPTV_KEEPCNT; 178 /* max idle probes */ 179 int tcp_maxpersistidle; 180 /* max idle time in persist */ 181 int tcp_maxidle; 182 183 /* 184 * Tcp protocol timeout routine called every 500 ms. 185 * Updates timestamps used for TCP 186 * causes finite state machine actions if timers expire. 187 */ 188 void 189 tcp_slowtimo(void) 190 { 191 int s; 192 193 s = splnet(); 194 195 tcp_maxidle = tcp_keepcnt * tcp_keepintvl; 196 197 splx(s); 198 } 199 200 /* 201 * Cancel all timers for TCP tp. 202 */ 203 void 204 tcp_canceltimers(struct tcpcb *tp) 205 { 206 callout_stop(tp->tt_2msl); 207 callout_stop(tp->tt_persist); 208 callout_stop(tp->tt_keep); 209 callout_stop(tp->tt_rexmt); 210 } 211 212 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 213 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 214 215 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 216 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; 217 218 static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ 219 220 /* 221 * TCP timer processing. 222 */ 223 void 224 tcp_timer_delack(void *xtp) 225 { 226 struct tcpcb *tp = xtp; 227 int s; 228 229 s = splnet(); 230 if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { 231 splx(s); 232 return; 233 } 234 callout_deactivate(tp->tt_delack); 235 236 tp->t_flags |= TF_ACKNOW; 237 tcpstat.tcps_delack++; 238 (void) tcp_output(tp); 239 splx(s); 240 } 241 242 void 243 tcp_timer_2msl(void *xtp) 244 { 245 struct tcpcb *tp = xtp; 246 int s; 247 #ifdef TCPDEBUG 248 int ostate; 249 250 ostate = tp->t_state; 251 #endif 252 s = splnet(); 253 if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { 254 splx(s); 255 return; 256 } 257 callout_deactivate(tp->tt_2msl); 258 /* 259 * 2 MSL timeout in shutdown went off. If we're closed but 260 * still waiting for peer to close and connection has been idle 261 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 262 * control block. Otherwise, check again in a bit. 263 */ 264 if (tp->t_state != TCPS_TIME_WAIT && 265 (ticks - tp->t_rcvtime) <= tcp_maxidle) 266 callout_reset(tp->tt_2msl, tcp_keepintvl, 267 tcp_timer_2msl, tp); 268 else 269 tp = tcp_close(tp); 270 271 #ifdef TCPDEBUG 272 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 273 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 274 PRU_SLOWTIMO); 275 #endif 276 splx(s); 277 } 278 279 void 280 tcp_timer_keep(void *xtp) 281 { 282 struct tcpcb *tp = xtp; 283 struct tcptemp *t_template; 284 int s; 285 #ifdef TCPDEBUG 286 int ostate; 287 288 ostate = tp->t_state; 289 #endif 290 s = splnet(); 291 if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { 292 splx(s); 293 return; 294 } 295 callout_deactivate(tp->tt_keep); 296 /* 297 * Keep-alive timer went off; send something 298 * or drop connection if idle for too long. 299 */ 300 tcpstat.tcps_keeptimeo++; 301 if (tp->t_state < TCPS_ESTABLISHED) 302 goto dropit; 303 if ((always_keepalive || 304 tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && 305 tp->t_state <= TCPS_CLOSING) { 306 if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) 307 goto dropit; 308 /* 309 * Send a packet designed to force a response 310 * if the peer is up and reachable: 311 * either an ACK if the connection is still alive, 312 * or an RST if the peer has closed the connection 313 * due to timeout or reboot. 314 * Using sequence number tp->snd_una-1 315 * causes the transmitted zero-length segment 316 * to lie outside the receive window; 317 * by the protocol spec, this requires the 318 * correspondent TCP to respond. 319 */ 320 tcpstat.tcps_keepprobe++; 321 t_template = tcp_maketemplate(tp); 322 if (t_template) { 323 tcp_respond(tp, t_template->tt_ipgen, 324 &t_template->tt_t, (struct mbuf *)NULL, 325 tp->rcv_nxt, tp->snd_una - 1, 0); 326 tcp_freetemplate(t_template); 327 } 328 callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); 329 } else 330 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); 331 332 #ifdef TCPDEBUG 333 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 334 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 335 PRU_SLOWTIMO); 336 #endif 337 splx(s); 338 return; 339 340 dropit: 341 tcpstat.tcps_keepdrops++; 342 tp = tcp_drop(tp, ETIMEDOUT); 343 344 #ifdef TCPDEBUG 345 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 346 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 347 PRU_SLOWTIMO); 348 #endif 349 splx(s); 350 } 351 352 void 353 tcp_timer_persist(void *xtp) 354 { 355 struct tcpcb *tp = xtp; 356 int s; 357 #ifdef TCPDEBUG 358 int ostate; 359 360 ostate = tp->t_state; 361 #endif 362 s = splnet(); 363 if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ 364 splx(s); 365 return; 366 } 367 callout_deactivate(tp->tt_persist); 368 /* 369 * Persistance timer into zero window. 370 * Force a byte to be output, if possible. 371 */ 372 tcpstat.tcps_persisttimeo++; 373 /* 374 * Hack: if the peer is dead/unreachable, we do not 375 * time out if the window is closed. After a full 376 * backoff, drop the connection if the idle time 377 * (no responses to probes) reaches the maximum 378 * backoff that we would use if retransmitting. 379 */ 380 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 381 ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || 382 (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 383 tcpstat.tcps_persistdrop++; 384 tp = tcp_drop(tp, ETIMEDOUT); 385 goto out; 386 } 387 tcp_setpersist(tp); 388 tp->t_flags |= TF_FORCE; 389 (void) tcp_output(tp); 390 tp->t_flags &= ~TF_FORCE; 391 392 out: 393 #ifdef TCPDEBUG 394 if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 395 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 396 PRU_SLOWTIMO); 397 #endif 398 splx(s); 399 } 400 401 void 402 tcp_save_congestion_state(struct tcpcb *tp) 403 { 404 tp->snd_cwnd_prev = tp->snd_cwnd; 405 tp->snd_ssthresh_prev = tp->snd_ssthresh; 406 tp->snd_recover_prev = tp->snd_recover; 407 if (IN_FASTRECOVERY(tp)) 408 tp->t_flags |= TF_WASFRECOVERY; 409 else 410 tp->t_flags &= ~TF_WASFRECOVERY; 411 if (tp->t_flags & TF_RCVD_TSTMP) { 412 tp->t_rexmtTS = ticks; 413 tp->t_flags |= TF_FIRSTACCACK; 414 } 415 } 416 417 void 418 tcp_revert_congestion_state(struct tcpcb *tp) 419 { 420 tp->snd_cwnd = tp->snd_cwnd_prev; 421 tp->snd_ssthresh = tp->snd_ssthresh_prev; 422 tp->snd_recover = tp->snd_recover_prev; 423 if (tp->t_flags & TF_WASFRECOVERY) 424 ENTER_FASTRECOVERY(tp); 425 if (tp->t_flags & TF_FASTREXMT) { 426 ++tcpstat.tcps_sndfastrexmitbad; 427 if (tp->t_flags & TF_EARLYREXMT) 428 ++tcpstat.tcps_sndearlyrexmitbad; 429 } else 430 ++tcpstat.tcps_sndrtobad; 431 tp->t_badrxtwin = 0; 432 tp->t_rxtshift = 0; 433 tp->snd_nxt = tp->snd_max; 434 } 435 436 void 437 tcp_timer_rexmt(void *xtp) 438 { 439 struct tcpcb *tp = xtp; 440 int s; 441 int rexmt; 442 #ifdef TCPDEBUG 443 int ostate; 444 445 ostate = tp->t_state; 446 #endif 447 s = splnet(); 448 if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { 449 splx(s); 450 return; 451 } 452 callout_deactivate(tp->tt_rexmt); 453 /* 454 * Retransmission timer went off. Message has not 455 * been acked within retransmit interval. Back off 456 * to a longer retransmit interval and retransmit one segment. 457 */ 458 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 459 tp->t_rxtshift = TCP_MAXRXTSHIFT; 460 tcpstat.tcps_timeoutdrop++; 461 tp = tcp_drop(tp, tp->t_softerror ? 462 tp->t_softerror : ETIMEDOUT); 463 goto out; 464 } 465 if (tp->t_rxtshift == 1) { 466 /* 467 * first retransmit; record ssthresh and cwnd so they can 468 * be recovered if this turns out to be a "bad" retransmit. 469 * A retransmit is considered "bad" if an ACK for this 470 * segment is received within RTT/2 interval; the assumption 471 * here is that the ACK was already in flight. See 472 * "On Estimating End-to-End Network Path Properties" by 473 * Allman and Paxson for more details. 474 */ 475 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 476 tcp_save_congestion_state(tp); 477 tp->t_flags &= ~(TF_FASTREXMT | TF_EARLYREXMT); 478 } 479 tcpstat.tcps_rexmttimeo++; 480 if (tp->t_state == TCPS_SYN_SENT) 481 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; 482 else 483 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 484 TCPT_RANGESET(tp->t_rxtcur, rexmt, 485 tp->t_rttmin, TCPTV_REXMTMAX); 486 /* 487 * Disable rfc1323 and rfc1644 if we havn't got any response to 488 * our third SYN to work-around some broken terminal servers 489 * (most of which have hopefully been retired) that have bad VJ 490 * header compression code which trashes TCP segments containing 491 * unknown-to-them TCP options. 492 */ 493 if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) 494 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); 495 /* 496 * If losing, let the lower level know and try for 497 * a better route. Also, if we backed off this far, 498 * our srtt estimate is probably bogus. Clobber it 499 * so we'll take the next rtt measurement as our srtt; 500 * move the current srtt into rttvar to keep the current 501 * retransmit times until then. 502 */ 503 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 504 #ifdef INET6 505 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 506 in6_losing(tp->t_inpcb); 507 else 508 #endif 509 in_losing(tp->t_inpcb); 510 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 511 tp->t_srtt = 0; 512 } 513 tp->snd_nxt = tp->snd_una; 514 tp->snd_recover = tp->snd_max; 515 /* 516 * Force a segment to be sent. 517 */ 518 tp->t_flags |= TF_ACKNOW; 519 /* 520 * If timing a segment in this window, stop the timer. 521 */ 522 tp->t_rtttime = 0; 523 /* 524 * Close the congestion window down to one segment 525 * (we'll open it by one segment for each ack we get). 526 * Since we probably have a window's worth of unacked 527 * data accumulated, this "slow start" keeps us from 528 * dumping all that data as back-to-back packets (which 529 * might overwhelm an intermediate gateway). 530 * 531 * There are two phases to the opening: Initially we 532 * open by one mss on each ack. This makes the window 533 * size increase exponentially with time. If the 534 * window is larger than the path can handle, this 535 * exponential growth results in dropped packet(s) 536 * almost immediately. To get more time between 537 * drops but still "push" the network to take advantage 538 * of improving conditions, we switch from exponential 539 * to linear window opening at some threshhold size. 540 * For a threshhold, we use half the current window 541 * size, truncated to a multiple of the mss. 542 * 543 * (the minimum cwnd that will give us exponential 544 * growth is 2 mss. We don't allow the threshhold 545 * to go below this.) 546 */ 547 { 548 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; 549 if (win < 2) 550 win = 2; 551 tp->snd_cwnd = tp->t_maxseg; 552 tp->snd_ssthresh = win * tp->t_maxseg; 553 tp->t_dupacks = 0; 554 } 555 EXIT_FASTRECOVERY(tp); 556 (void) tcp_output(tp); 557 558 out: 559 #ifdef TCPDEBUG 560 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 561 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 562 PRU_SLOWTIMO); 563 #endif 564 splx(s); 565 } 566