1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/in.h> 59 #include <netinet/in_kdtrace.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_rss.h> 62 #include <netinet/in_systm.h> 63 #ifdef INET6 64 #include <netinet6/in6_pcb.h> 65 #endif 66 #include <netinet/ip_var.h> 67 #include <netinet/tcp.h> 68 #include <netinet/tcp_fsm.h> 69 #include <netinet/tcp_timer.h> 70 #include <netinet/tcp_var.h> 71 #include <netinet/cc/cc.h> 72 #ifdef INET6 73 #include <netinet6/tcp6_var.h> 74 #endif 75 #include <netinet/tcpip.h> 76 #ifdef TCPDEBUG 77 #include <netinet/tcp_debug.h> 78 #endif 79 80 int tcp_persmin; 81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, 82 &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); 83 84 int tcp_persmax; 85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, 86 &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); 87 88 int tcp_keepinit; 89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 90 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 91 92 int tcp_keepidle; 93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 94 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 95 96 int tcp_keepintvl; 97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 99 100 int tcp_delacktime; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 103 "Time before a delayed ACK is sent"); 104 105 int tcp_msl; 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 107 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 108 109 int tcp_rexmit_min; 110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 111 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 112 "Minimum Retransmission Timeout"); 113 114 int tcp_rexmit_slop; 115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 116 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 117 "Retransmission Timer Slop"); 118 119 static int always_keepalive = 1; 120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 121 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 122 123 int tcp_fast_finwait2_recycle = 0; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 125 &tcp_fast_finwait2_recycle, 0, 126 "Recycle closed FIN_WAIT_2 connections faster"); 127 128 int tcp_finwait2_timeout; 129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 130 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 131 132 int tcp_keepcnt = TCPTV_KEEPCNT; 133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 134 "Number of keepalive probes to send"); 135 136 /* max idle probes */ 137 int tcp_maxpersistidle; 138 139 static int tcp_rexmit_drop_options = 0; 140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 141 &tcp_rexmit_drop_options, 0, 142 "Drop TCP options from 3rd and later retransmitted SYN"); 143 144 VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 145 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 146 CTLFLAG_RW|CTLFLAG_VNET, 147 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 148 "Path MTU Discovery Black Hole Detection Enabled"); 149 150 #ifdef INET 151 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 152 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 153 CTLFLAG_RW|CTLFLAG_VNET, 154 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 155 "Path MTU Discovery Black Hole Detection lowered MSS"); 156 #endif 157 158 #ifdef INET6 159 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 161 CTLFLAG_RW|CTLFLAG_VNET, 162 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 163 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 164 #endif 165 166 #ifdef RSS 167 static int per_cpu_timers = 1; 168 #else 169 static int per_cpu_timers = 0; 170 #endif 171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 172 &per_cpu_timers , 0, "run tcp timers on all cpus"); 173 174 #if 0 175 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 176 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 177 #endif 178 179 /* 180 * Map the given inp to a CPU id. 181 * 182 * This queries RSS if it's compiled in, else it defaults to the current 183 * CPU ID. 184 */ 185 static inline int 186 inp_to_cpuid(struct inpcb *inp) 187 { 188 u_int cpuid; 189 190 #ifdef RSS 191 if (per_cpu_timers) { 192 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 193 if (cpuid == NETISR_CPUID_NONE) 194 return (curcpu); /* XXX */ 195 else 196 return (cpuid); 197 } 198 #else 199 /* Legacy, pre-RSS behaviour */ 200 if (per_cpu_timers) { 201 /* 202 * We don't have a flowid -> cpuid mapping, so cheat and 203 * just map unknown cpuids to curcpu. Not the best, but 204 * apparently better than defaulting to swi 0. 205 */ 206 cpuid = inp->inp_flowid % (mp_maxid + 1); 207 if (! CPU_ABSENT(cpuid)) 208 return (cpuid); 209 return (curcpu); 210 } 211 #endif 212 /* Default for RSS and non-RSS - cpuid 0 */ 213 else { 214 return (0); 215 } 216 } 217 218 /* 219 * Tcp protocol timeout routine called every 500 ms. 220 * Updates timestamps used for TCP 221 * causes finite state machine actions if timers expire. 222 */ 223 void 224 tcp_slowtimo(void) 225 { 226 VNET_ITERATOR_DECL(vnet_iter); 227 228 VNET_LIST_RLOCK_NOSLEEP(); 229 VNET_FOREACH(vnet_iter) { 230 CURVNET_SET(vnet_iter); 231 (void) tcp_tw_2msl_scan(0); 232 CURVNET_RESTORE(); 233 } 234 VNET_LIST_RUNLOCK_NOSLEEP(); 235 } 236 237 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 238 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 239 240 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 241 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 242 243 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 244 245 /* 246 * TCP timer processing. 247 */ 248 249 void 250 tcp_timer_delack(void *xtp) 251 { 252 struct tcpcb *tp = xtp; 253 struct inpcb *inp; 254 CURVNET_SET(tp->t_vnet); 255 256 inp = tp->t_inpcb; 257 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 258 INP_WLOCK(inp); 259 if (callout_pending(&tp->t_timers->tt_delack) || 260 !callout_active(&tp->t_timers->tt_delack)) { 261 INP_WUNLOCK(inp); 262 CURVNET_RESTORE(); 263 return; 264 } 265 callout_deactivate(&tp->t_timers->tt_delack); 266 if ((inp->inp_flags & INP_DROPPED) != 0) { 267 INP_WUNLOCK(inp); 268 CURVNET_RESTORE(); 269 return; 270 } 271 tp->t_flags |= TF_ACKNOW; 272 TCPSTAT_INC(tcps_delack); 273 (void) tp->t_fb->tfb_tcp_output(tp); 274 INP_WUNLOCK(inp); 275 CURVNET_RESTORE(); 276 } 277 278 /* 279 * When a timer wants to remove a TCB it must 280 * hold the INP_INFO_RLOCK(). The timer function 281 * should only have grabbed the INP_WLOCK() when 282 * it entered. To safely switch to holding both the 283 * INP_INFO_RLOCK() and the INP_WLOCK() we must first 284 * grab a reference on the inp, which will hold the inp 285 * so that it can't be removed. We then unlock the INP_WLOCK(), 286 * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() 287 * we proceed again to get the INP_WLOCK() (this preserves proper 288 * lock order). After acquiring the INP_WLOCK we must check if someone 289 * else deleted the pcb i.e. the inp_flags check. 290 * If so we return 1 otherwise we return 0. 291 * 292 * No matter what the tcp_inpinfo_lock_add() function 293 * returns the caller must afterwards call tcp_inpinfo_lock_del() 294 * to drop the locks and reference properly. 295 */ 296 297 int 298 tcp_inpinfo_lock_add(struct inpcb *inp) 299 { 300 in_pcbref(inp); 301 INP_WUNLOCK(inp); 302 INP_INFO_RLOCK(&V_tcbinfo); 303 INP_WLOCK(inp); 304 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 305 return(1); 306 } 307 return(0); 308 309 } 310 311 void 312 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) 313 { 314 INP_INFO_RUNLOCK(&V_tcbinfo); 315 if (inp && (tp == NULL)) { 316 /* 317 * If tcp_close/drop() gets called and tp 318 * returns NULL, then the function dropped 319 * the inp lock, we hold a reference keeping 320 * this around, so we must re-aquire the 321 * INP_WLOCK() in order to proceed with 322 * our dropping the inp reference. 323 */ 324 INP_WLOCK(inp); 325 } 326 if (inp && in_pcbrele_wlocked(inp) == 0) 327 INP_WUNLOCK(inp); 328 } 329 330 void 331 tcp_timer_2msl(void *xtp) 332 { 333 struct tcpcb *tp = xtp; 334 struct inpcb *inp; 335 CURVNET_SET(tp->t_vnet); 336 #ifdef TCPDEBUG 337 int ostate; 338 339 ostate = tp->t_state; 340 #endif 341 inp = tp->t_inpcb; 342 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 343 INP_WLOCK(inp); 344 tcp_free_sackholes(tp); 345 if (callout_pending(&tp->t_timers->tt_2msl) || 346 !callout_active(&tp->t_timers->tt_2msl)) { 347 INP_WUNLOCK(tp->t_inpcb); 348 CURVNET_RESTORE(); 349 return; 350 } 351 callout_deactivate(&tp->t_timers->tt_2msl); 352 if ((inp->inp_flags & INP_DROPPED) != 0) { 353 INP_WUNLOCK(inp); 354 CURVNET_RESTORE(); 355 return; 356 } 357 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 358 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 359 /* 360 * 2 MSL timeout in shutdown went off. If we're closed but 361 * still waiting for peer to close and connection has been idle 362 * too long delete connection control block. Otherwise, check 363 * again in a bit. 364 * 365 * If in TIME_WAIT state just ignore as this timeout is handled in 366 * tcp_tw_2msl_scan(). 367 * 368 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 369 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 370 * Ignore fact that there were recent incoming segments. 371 */ 372 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 373 INP_WUNLOCK(inp); 374 CURVNET_RESTORE(); 375 return; 376 } 377 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 378 tp->t_inpcb && tp->t_inpcb->inp_socket && 379 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 380 TCPSTAT_INC(tcps_finwait2_drops); 381 if (tcp_inpinfo_lock_add(inp)) { 382 tcp_inpinfo_lock_del(inp, tp); 383 goto out; 384 } 385 tp = tcp_close(tp); 386 tcp_inpinfo_lock_del(inp, tp); 387 goto out; 388 } else { 389 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 390 callout_reset(&tp->t_timers->tt_2msl, 391 TP_KEEPINTVL(tp), tcp_timer_2msl, tp); 392 } else { 393 if (tcp_inpinfo_lock_add(inp)) { 394 tcp_inpinfo_lock_del(inp, tp); 395 goto out; 396 } 397 tp = tcp_close(tp); 398 tcp_inpinfo_lock_del(inp, tp); 399 goto out; 400 } 401 } 402 403 #ifdef TCPDEBUG 404 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 405 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 406 PRU_SLOWTIMO); 407 #endif 408 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 409 410 if (tp != NULL) 411 INP_WUNLOCK(inp); 412 out: 413 CURVNET_RESTORE(); 414 } 415 416 void 417 tcp_timer_keep(void *xtp) 418 { 419 struct tcpcb *tp = xtp; 420 struct tcptemp *t_template; 421 struct inpcb *inp; 422 CURVNET_SET(tp->t_vnet); 423 #ifdef TCPDEBUG 424 int ostate; 425 426 ostate = tp->t_state; 427 #endif 428 inp = tp->t_inpcb; 429 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 430 INP_WLOCK(inp); 431 if (callout_pending(&tp->t_timers->tt_keep) || 432 !callout_active(&tp->t_timers->tt_keep)) { 433 INP_WUNLOCK(inp); 434 CURVNET_RESTORE(); 435 return; 436 } 437 callout_deactivate(&tp->t_timers->tt_keep); 438 if ((inp->inp_flags & INP_DROPPED) != 0) { 439 INP_WUNLOCK(inp); 440 CURVNET_RESTORE(); 441 return; 442 } 443 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 444 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 445 446 /* 447 * Because we don't regularly reset the keepalive callout in 448 * the ESTABLISHED state, it may be that we don't actually need 449 * to send a keepalive yet. If that occurs, schedule another 450 * call for the next time the keepalive timer might expire. 451 */ 452 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 453 u_int idletime; 454 455 idletime = ticks - tp->t_rcvtime; 456 if (idletime < TP_KEEPIDLE(tp)) { 457 callout_reset(&tp->t_timers->tt_keep, 458 TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); 459 INP_WUNLOCK(inp); 460 CURVNET_RESTORE(); 461 return; 462 } 463 } 464 465 /* 466 * Keep-alive timer went off; send something 467 * or drop connection if idle for too long. 468 */ 469 TCPSTAT_INC(tcps_keeptimeo); 470 if (tp->t_state < TCPS_ESTABLISHED) 471 goto dropit; 472 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 473 tp->t_state <= TCPS_CLOSING) { 474 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 475 goto dropit; 476 /* 477 * Send a packet designed to force a response 478 * if the peer is up and reachable: 479 * either an ACK if the connection is still alive, 480 * or an RST if the peer has closed the connection 481 * due to timeout or reboot. 482 * Using sequence number tp->snd_una-1 483 * causes the transmitted zero-length segment 484 * to lie outside the receive window; 485 * by the protocol spec, this requires the 486 * correspondent TCP to respond. 487 */ 488 TCPSTAT_INC(tcps_keepprobe); 489 t_template = tcpip_maketemplate(inp); 490 if (t_template) { 491 tcp_respond(tp, t_template->tt_ipgen, 492 &t_template->tt_t, (struct mbuf *)NULL, 493 tp->rcv_nxt, tp->snd_una - 1, 0); 494 free(t_template, M_TEMP); 495 } 496 callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 497 tcp_timer_keep, tp); 498 } else 499 callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 500 tcp_timer_keep, tp); 501 502 #ifdef TCPDEBUG 503 if (inp->inp_socket->so_options & SO_DEBUG) 504 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 505 PRU_SLOWTIMO); 506 #endif 507 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 508 INP_WUNLOCK(inp); 509 CURVNET_RESTORE(); 510 return; 511 512 dropit: 513 TCPSTAT_INC(tcps_keepdrops); 514 515 if (tcp_inpinfo_lock_add(inp)) { 516 tcp_inpinfo_lock_del(inp, tp); 517 goto out; 518 } 519 tp = tcp_drop(tp, ETIMEDOUT); 520 521 #ifdef TCPDEBUG 522 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 523 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 524 PRU_SLOWTIMO); 525 #endif 526 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 527 tcp_inpinfo_lock_del(inp, tp); 528 out: 529 CURVNET_RESTORE(); 530 } 531 532 void 533 tcp_timer_persist(void *xtp) 534 { 535 struct tcpcb *tp = xtp; 536 struct inpcb *inp; 537 CURVNET_SET(tp->t_vnet); 538 #ifdef TCPDEBUG 539 int ostate; 540 541 ostate = tp->t_state; 542 #endif 543 inp = tp->t_inpcb; 544 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 545 INP_WLOCK(inp); 546 if (callout_pending(&tp->t_timers->tt_persist) || 547 !callout_active(&tp->t_timers->tt_persist)) { 548 INP_WUNLOCK(inp); 549 CURVNET_RESTORE(); 550 return; 551 } 552 callout_deactivate(&tp->t_timers->tt_persist); 553 if ((inp->inp_flags & INP_DROPPED) != 0) { 554 INP_WUNLOCK(inp); 555 CURVNET_RESTORE(); 556 return; 557 } 558 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 559 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 560 /* 561 * Persistence timer into zero window. 562 * Force a byte to be output, if possible. 563 */ 564 TCPSTAT_INC(tcps_persisttimeo); 565 /* 566 * Hack: if the peer is dead/unreachable, we do not 567 * time out if the window is closed. After a full 568 * backoff, drop the connection if the idle time 569 * (no responses to probes) reaches the maximum 570 * backoff that we would use if retransmitting. 571 */ 572 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 573 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 574 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 575 TCPSTAT_INC(tcps_persistdrop); 576 if (tcp_inpinfo_lock_add(inp)) { 577 tcp_inpinfo_lock_del(inp, tp); 578 goto out; 579 } 580 tp = tcp_drop(tp, ETIMEDOUT); 581 tcp_inpinfo_lock_del(inp, tp); 582 goto out; 583 } 584 /* 585 * If the user has closed the socket then drop a persisting 586 * connection after a much reduced timeout. 587 */ 588 if (tp->t_state > TCPS_CLOSE_WAIT && 589 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 590 TCPSTAT_INC(tcps_persistdrop); 591 if (tcp_inpinfo_lock_add(inp)) { 592 tcp_inpinfo_lock_del(inp, tp); 593 goto out; 594 } 595 tp = tcp_drop(tp, ETIMEDOUT); 596 tcp_inpinfo_lock_del(inp, tp); 597 goto out; 598 } 599 tcp_setpersist(tp); 600 tp->t_flags |= TF_FORCEDATA; 601 (void) tp->t_fb->tfb_tcp_output(tp); 602 tp->t_flags &= ~TF_FORCEDATA; 603 604 #ifdef TCPDEBUG 605 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 606 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 607 #endif 608 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 609 INP_WUNLOCK(inp); 610 out: 611 CURVNET_RESTORE(); 612 } 613 614 void 615 tcp_timer_rexmt(void * xtp) 616 { 617 struct tcpcb *tp = xtp; 618 CURVNET_SET(tp->t_vnet); 619 int rexmt; 620 struct inpcb *inp; 621 #ifdef TCPDEBUG 622 int ostate; 623 624 ostate = tp->t_state; 625 #endif 626 inp = tp->t_inpcb; 627 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 628 INP_WLOCK(inp); 629 if (callout_pending(&tp->t_timers->tt_rexmt) || 630 !callout_active(&tp->t_timers->tt_rexmt)) { 631 INP_WUNLOCK(inp); 632 CURVNET_RESTORE(); 633 return; 634 } 635 callout_deactivate(&tp->t_timers->tt_rexmt); 636 if ((inp->inp_flags & INP_DROPPED) != 0) { 637 INP_WUNLOCK(inp); 638 CURVNET_RESTORE(); 639 return; 640 } 641 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 642 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 643 tcp_free_sackholes(tp); 644 if (tp->t_fb->tfb_tcp_rexmit_tmr) { 645 /* The stack has a timer action too. */ 646 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); 647 } 648 /* 649 * Retransmission timer went off. Message has not 650 * been acked within retransmit interval. Back off 651 * to a longer retransmit interval and retransmit one segment. 652 */ 653 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 654 tp->t_rxtshift = TCP_MAXRXTSHIFT; 655 TCPSTAT_INC(tcps_timeoutdrop); 656 if (tcp_inpinfo_lock_add(inp)) { 657 tcp_inpinfo_lock_del(inp, tp); 658 goto out; 659 } 660 tp = tcp_drop(tp, tp->t_softerror ? 661 tp->t_softerror : ETIMEDOUT); 662 tcp_inpinfo_lock_del(inp, tp); 663 goto out; 664 } 665 if (tp->t_state == TCPS_SYN_SENT) { 666 /* 667 * If the SYN was retransmitted, indicate CWND to be 668 * limited to 1 segment in cc_conn_init(). 669 */ 670 tp->snd_cwnd = 1; 671 } else if (tp->t_rxtshift == 1) { 672 /* 673 * first retransmit; record ssthresh and cwnd so they can 674 * be recovered if this turns out to be a "bad" retransmit. 675 * A retransmit is considered "bad" if an ACK for this 676 * segment is received within RTT/2 interval; the assumption 677 * here is that the ACK was already in flight. See 678 * "On Estimating End-to-End Network Path Properties" by 679 * Allman and Paxson for more details. 680 */ 681 tp->snd_cwnd_prev = tp->snd_cwnd; 682 tp->snd_ssthresh_prev = tp->snd_ssthresh; 683 tp->snd_recover_prev = tp->snd_recover; 684 if (IN_FASTRECOVERY(tp->t_flags)) 685 tp->t_flags |= TF_WASFRECOVERY; 686 else 687 tp->t_flags &= ~TF_WASFRECOVERY; 688 if (IN_CONGRECOVERY(tp->t_flags)) 689 tp->t_flags |= TF_WASCRECOVERY; 690 else 691 tp->t_flags &= ~TF_WASCRECOVERY; 692 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 693 tp->t_flags |= TF_PREVVALID; 694 } else 695 tp->t_flags &= ~TF_PREVVALID; 696 TCPSTAT_INC(tcps_rexmttimeo); 697 if ((tp->t_state == TCPS_SYN_SENT) || 698 (tp->t_state == TCPS_SYN_RECEIVED)) 699 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 700 else 701 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 702 TCPT_RANGESET(tp->t_rxtcur, rexmt, 703 tp->t_rttmin, TCPTV_REXMTMAX); 704 705 /* 706 * We enter the path for PLMTUD if connection is established or, if 707 * connection is FIN_WAIT_1 status, reason for the last is that if 708 * amount of data we send is very small, we could send it in couple of 709 * packets and process straight to FIN. In that case we won't catch 710 * ESTABLISHED state. 711 */ 712 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 713 || (tp->t_state == TCPS_FIN_WAIT_1))) { 714 #ifdef INET6 715 int isipv6; 716 #endif 717 718 /* 719 * Idea here is that at each stage of mtu probe (usually, 1448 720 * -> 1188 -> 524) should be given 2 chances to recover before 721 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 722 * take care of that. 723 */ 724 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 725 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 726 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 727 tp->t_rxtshift % 2 == 0)) { 728 /* 729 * Enter Path MTU Black-hole Detection mechanism: 730 * - Disable Path MTU Discovery (IP "DF" bit). 731 * - Reduce MTU to lower value than what we 732 * negotiated with peer. 733 */ 734 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 735 /* Record that we may have found a black hole. */ 736 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 737 /* Keep track of previous MSS. */ 738 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 739 } 740 741 /* 742 * Reduce the MSS to blackhole value or to the default 743 * in an attempt to retransmit. 744 */ 745 #ifdef INET6 746 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 747 if (isipv6 && 748 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 749 /* Use the sysctl tuneable blackhole MSS. */ 750 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 751 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 752 } else if (isipv6) { 753 /* Use the default MSS. */ 754 tp->t_maxseg = V_tcp_v6mssdflt; 755 /* 756 * Disable Path MTU Discovery when we switch to 757 * minmss. 758 */ 759 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 760 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 761 } 762 #endif 763 #if defined(INET6) && defined(INET) 764 else 765 #endif 766 #ifdef INET 767 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 768 /* Use the sysctl tuneable blackhole MSS. */ 769 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 770 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 771 } else { 772 /* Use the default MSS. */ 773 tp->t_maxseg = V_tcp_mssdflt; 774 /* 775 * Disable Path MTU Discovery when we switch to 776 * minmss. 777 */ 778 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 779 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 780 } 781 #endif 782 /* 783 * Reset the slow-start flight size 784 * as it may depend on the new MSS. 785 */ 786 if (CC_ALGO(tp)->conn_init != NULL) 787 CC_ALGO(tp)->conn_init(tp->ccv); 788 } else { 789 /* 790 * If further retransmissions are still unsuccessful 791 * with a lowered MTU, maybe this isn't a blackhole and 792 * we restore the previous MSS and blackhole detection 793 * flags. 794 * The limit '6' is determined by giving each probe 795 * stage (1448, 1188, 524) 2 chances to recover. 796 */ 797 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 798 (tp->t_rxtshift >= 6)) { 799 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 800 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 801 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 802 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 803 /* 804 * Reset the slow-start flight size as it 805 * may depend on the new MSS. 806 */ 807 if (CC_ALGO(tp)->conn_init != NULL) 808 CC_ALGO(tp)->conn_init(tp->ccv); 809 } 810 } 811 } 812 813 /* 814 * Disable RFC1323 and SACK if we haven't got any response to 815 * our third SYN to work-around some broken terminal servers 816 * (most of which have hopefully been retired) that have bad VJ 817 * header compression code which trashes TCP segments containing 818 * unknown-to-them TCP options. 819 */ 820 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 821 (tp->t_rxtshift == 3)) 822 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 823 /* 824 * If we backed off this far, notify the L3 protocol that we're having 825 * connection problems. 826 */ 827 if (tp->t_rxtshift > TCP_RTT_INVALIDATE) { 828 #ifdef INET6 829 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 830 in6_losing(tp->t_inpcb); 831 else 832 #endif 833 in_losing(tp->t_inpcb); 834 } 835 tp->snd_nxt = tp->snd_una; 836 tp->snd_recover = tp->snd_max; 837 /* 838 * Force a segment to be sent. 839 */ 840 tp->t_flags |= TF_ACKNOW; 841 /* 842 * If timing a segment in this window, stop the timer. 843 */ 844 tp->t_rtttime = 0; 845 846 cc_cong_signal(tp, NULL, CC_RTO); 847 848 (void) tp->t_fb->tfb_tcp_output(tp); 849 850 #ifdef TCPDEBUG 851 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 852 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 853 PRU_SLOWTIMO); 854 #endif 855 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 856 INP_WUNLOCK(inp); 857 out: 858 CURVNET_RESTORE(); 859 } 860 861 void 862 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 863 { 864 struct callout *t_callout; 865 timeout_t *f_callout; 866 struct inpcb *inp = tp->t_inpcb; 867 int cpu = inp_to_cpuid(inp); 868 869 #ifdef TCP_OFFLOAD 870 if (tp->t_flags & TF_TOE) 871 return; 872 #endif 873 874 if (tp->t_timers->tt_flags & TT_STOPPED) 875 return; 876 877 switch (timer_type) { 878 case TT_DELACK: 879 t_callout = &tp->t_timers->tt_delack; 880 f_callout = tcp_timer_delack; 881 break; 882 case TT_REXMT: 883 t_callout = &tp->t_timers->tt_rexmt; 884 f_callout = tcp_timer_rexmt; 885 break; 886 case TT_PERSIST: 887 t_callout = &tp->t_timers->tt_persist; 888 f_callout = tcp_timer_persist; 889 break; 890 case TT_KEEP: 891 t_callout = &tp->t_timers->tt_keep; 892 f_callout = tcp_timer_keep; 893 break; 894 case TT_2MSL: 895 t_callout = &tp->t_timers->tt_2msl; 896 f_callout = tcp_timer_2msl; 897 break; 898 default: 899 if (tp->t_fb->tfb_tcp_timer_activate) { 900 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); 901 return; 902 } 903 panic("tp %p bad timer_type %#x", tp, timer_type); 904 } 905 if (delta == 0) { 906 callout_stop(t_callout); 907 } else { 908 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 909 } 910 } 911 912 int 913 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 914 { 915 struct callout *t_callout; 916 917 switch (timer_type) { 918 case TT_DELACK: 919 t_callout = &tp->t_timers->tt_delack; 920 break; 921 case TT_REXMT: 922 t_callout = &tp->t_timers->tt_rexmt; 923 break; 924 case TT_PERSIST: 925 t_callout = &tp->t_timers->tt_persist; 926 break; 927 case TT_KEEP: 928 t_callout = &tp->t_timers->tt_keep; 929 break; 930 case TT_2MSL: 931 t_callout = &tp->t_timers->tt_2msl; 932 break; 933 default: 934 if (tp->t_fb->tfb_tcp_timer_active) { 935 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 936 } 937 panic("tp %p bad timer_type %#x", tp, timer_type); 938 } 939 return callout_active(t_callout); 940 } 941 942 void 943 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 944 { 945 struct callout *t_callout; 946 947 tp->t_timers->tt_flags |= TT_STOPPED; 948 switch (timer_type) { 949 case TT_DELACK: 950 t_callout = &tp->t_timers->tt_delack; 951 break; 952 case TT_REXMT: 953 t_callout = &tp->t_timers->tt_rexmt; 954 break; 955 case TT_PERSIST: 956 t_callout = &tp->t_timers->tt_persist; 957 break; 958 case TT_KEEP: 959 t_callout = &tp->t_timers->tt_keep; 960 break; 961 case TT_2MSL: 962 t_callout = &tp->t_timers->tt_2msl; 963 break; 964 default: 965 if (tp->t_fb->tfb_tcp_timer_stop) { 966 /* 967 * XXXrrs we need to look at this with the 968 * stop case below (flags). 969 */ 970 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); 971 return; 972 } 973 panic("tp %p bad timer_type %#x", tp, timer_type); 974 } 975 976 if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { 977 /* 978 * Can't stop the callout, defer tcpcb actual deletion 979 * to the last one. We do this using the async drain 980 * function and incrementing the count in 981 */ 982 tp->t_timers->tt_draincnt++; 983 } 984 } 985