1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 67 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.20 2003/01/29 22:45:36 hsu Exp $ 68 */ 69 70 #include "opt_inet.h" 71 #include "opt_inet6.h" 72 #include "opt_ipsec.h" 73 #include "opt_tcpdebug.h" 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/kernel.h> 78 #include <sys/sysctl.h> 79 #include <sys/mbuf.h> 80 #include <sys/domain.h> 81 #include <sys/protosw.h> 82 #include <sys/socket.h> 83 #include <sys/socketvar.h> 84 #include <sys/in_cksum.h> 85 #include <sys/thread.h> 86 #include <sys/globaldata.h> 87 88 #include <net/route.h> 89 90 #include <netinet/in.h> 91 #include <netinet/in_systm.h> 92 #include <netinet/ip.h> 93 #include <netinet/in_pcb.h> 94 #include <netinet/ip_var.h> 95 #include <netinet6/in6_pcb.h> 96 #include <netinet/ip6.h> 97 #include <netinet6/ip6_var.h> 98 #include <netinet/tcp.h> 99 #define TCPOUTFLAGS 100 #include <netinet/tcp_fsm.h> 101 #include <netinet/tcp_seq.h> 102 #include <netinet/tcp_timer.h> 103 #include <netinet/tcp_timer2.h> 104 #include <netinet/tcp_var.h> 105 #include <netinet/tcpip.h> 106 #ifdef TCPDEBUG 107 #include <netinet/tcp_debug.h> 108 #endif 109 110 #ifdef IPSEC 111 #include <netinet6/ipsec.h> 112 #endif /*IPSEC*/ 113 114 #ifdef FAST_IPSEC 115 #include <netproto/ipsec/ipsec.h> 116 #define IPSEC 117 #endif /*FAST_IPSEC*/ 118 119 #ifdef notyet 120 extern struct mbuf *m_copypack(); 121 #endif 122 123 int path_mtu_discovery = 0; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, 125 &path_mtu_discovery, 1, "Enable Path MTU Discovery"); 126 127 static int avoid_pure_win_update = 1; 128 SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW, 129 &avoid_pure_win_update, 1, "Avoid pure window updates when possible"); 130 131 int tcp_do_autosndbuf = 1; 132 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, 133 &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); 134 135 int tcp_autosndbuf_inc = 8*1024; 136 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, 137 &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); 138 139 int tcp_autosndbuf_max = 2*1024*1024; 140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, 141 &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); 142 143 static int tcp_idle_cwv = 1; 144 SYSCTL_INT(_net_inet_tcp, OID_AUTO, idle_cwv, CTLFLAG_RW, 145 &tcp_idle_cwv, 0, 146 "Congestion window validation after idle period (part of RFC2861)"); 147 148 static int tcp_idle_restart = 1; 149 SYSCTL_INT(_net_inet_tcp, OID_AUTO, idle_restart, CTLFLAG_RW, 150 &tcp_idle_restart, 0, "Reset congestion window after idle period"); 151 152 static void tcp_idle_cwnd_validate(struct tcpcb *); 153 154 /* 155 * Tcp output routine: figure out what should be sent and send it. 156 */ 157 int 158 tcp_output(struct tcpcb *tp) 159 { 160 struct inpcb * const inp = tp->t_inpcb; 161 struct socket *so = inp->inp_socket; 162 long len, recvwin, sendwin; 163 int nsacked = 0; 164 int off, flags, error = 0; 165 #ifdef TCP_SIGNATURE 166 int sigoff = 0; 167 #endif 168 struct mbuf *m; 169 struct ip *ip; 170 struct ipovly *ipov; 171 struct tcphdr *th; 172 u_char opt[TCP_MAXOLEN]; 173 unsigned int ipoptlen, optlen, hdrlen; 174 int idle, idle_cwv = 0; 175 boolean_t sendalot; 176 struct ip6_hdr *ip6; 177 #ifdef INET6 178 const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 179 #else 180 const boolean_t isipv6 = FALSE; 181 #endif 182 183 KKASSERT(so->so_port == &curthread->td_msgport); 184 185 /* 186 * Determine length of data that should be transmitted, 187 * and flags that will be used. 188 * If there is some data or critical controls (SYN, RST) 189 * to send, then transmit; otherwise, investigate further. 190 */ 191 192 /* 193 * If we have been idle for a while, the send congestion window 194 * could be no longer representative of the current state of the 195 * link; need to validate congestion window. However, we should 196 * not perform congestion window validation here, since we could 197 * be asked to send pure ACK. 198 */ 199 if (tp->snd_max == tp->snd_una && 200 (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart) 201 idle_cwv = 1; 202 203 /* 204 * Calculate whether the transmit stream was previously idle 205 * and adjust TF_LASTIDLE for the next time. 206 */ 207 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 208 if (idle && (tp->t_flags & TF_MORETOCOME)) 209 tp->t_flags |= TF_LASTIDLE; 210 else 211 tp->t_flags &= ~TF_LASTIDLE; 212 213 if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && 214 !IN_FASTRECOVERY(tp)) 215 nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); 216 217 again: 218 m = NULL; 219 ip = NULL; 220 ipov = NULL; 221 th = NULL; 222 ip6 = NULL; 223 224 /* Make use of SACK information when slow-starting after a RTO. */ 225 if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && 226 !IN_FASTRECOVERY(tp)) { 227 tcp_seq old_snd_nxt = tp->snd_nxt; 228 229 tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); 230 nsacked += tp->snd_nxt - old_snd_nxt; 231 } 232 233 sendalot = FALSE; 234 off = tp->snd_nxt - tp->snd_una; 235 sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); 236 sendwin = min(sendwin, tp->snd_bwnd); 237 238 flags = tcp_outflags[tp->t_state]; 239 /* 240 * Get standard flags, and add SYN or FIN if requested by 'hidden' 241 * state flags. 242 */ 243 if (tp->t_flags & TF_NEEDFIN) 244 flags |= TH_FIN; 245 if (tp->t_flags & TF_NEEDSYN) 246 flags |= TH_SYN; 247 248 /* 249 * If in persist timeout with window of 0, send 1 byte. 250 * Otherwise, if window is small but nonzero 251 * and timer expired, we will send what we can 252 * and go to transmit state. 253 */ 254 if (tp->t_flags & TF_FORCE) { 255 if (sendwin == 0) { 256 /* 257 * If we still have some data to send, then 258 * clear the FIN bit. Usually this would 259 * happen below when it realizes that we 260 * aren't sending all the data. However, 261 * if we have exactly 1 byte of unsent data, 262 * then it won't clear the FIN bit below, 263 * and if we are in persist state, we wind 264 * up sending the packet without recording 265 * that we sent the FIN bit. 266 * 267 * We can't just blindly clear the FIN bit, 268 * because if we don't have any more data 269 * to send then the probe will be the FIN 270 * itself. 271 */ 272 if (off < so->so_snd.ssb_cc) 273 flags &= ~TH_FIN; 274 sendwin = 1; 275 } else { 276 tcp_callout_stop(tp, tp->tt_persist); 277 tp->t_rxtshift = 0; 278 } 279 } 280 281 /* 282 * If snd_nxt == snd_max and we have transmitted a FIN, the 283 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in 284 * a negative length. This can also occur when TCP opens up 285 * its congestion window while receiving additional duplicate 286 * acks after fast-retransmit because TCP will reset snd_nxt 287 * to snd_max after the fast-retransmit. 288 * 289 * A negative length can also occur when we are in the 290 * TCPS_SYN_RECEIVED state due to a simultanious connect where 291 * our SYN has not been acked yet. 292 * 293 * In the normal retransmit-FIN-only case, however, snd_nxt will 294 * be set to snd_una, the offset will be 0, and the length may 295 * wind up 0. 296 */ 297 len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; 298 299 /* 300 * Lop off SYN bit if it has already been sent. However, if this 301 * is SYN-SENT state and if segment contains data, suppress sending 302 * segment (sending the segment would be an option if we still 303 * did TAO and the remote host supported it). 304 */ 305 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 306 flags &= ~TH_SYN; 307 off--, len++; 308 if (len > 0 && tp->t_state == TCPS_SYN_SENT) 309 return 0; 310 } 311 312 /* 313 * Be careful not to send data and/or FIN on SYN segments. 314 * This measure is needed to prevent interoperability problems 315 * with not fully conformant TCP implementations. 316 */ 317 if (flags & TH_SYN) { 318 len = 0; 319 flags &= ~TH_FIN; 320 } 321 322 if (len < 0) { 323 /* 324 * A negative len can occur if our FIN has been sent but not 325 * acked, or if we are in a simultanious connect in the 326 * TCPS_SYN_RECEIVED state with our SYN sent but not yet 327 * acked. 328 * 329 * If our window has contracted to 0 in the FIN case 330 * (which can only occur if we have NOT been called to 331 * retransmit as per code a few paragraphs up) then we 332 * want to shift the retransmit timer over to the 333 * persist timer. 334 * 335 * However, if we are in the TCPS_SYN_RECEIVED state 336 * (the SYN case) we will be in a simultanious connect and 337 * the window may be zero degeneratively. In this case we 338 * do not want to shift to the persist timer after the SYN 339 * or the SYN+ACK transmission. 340 */ 341 len = 0; 342 if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) { 343 tcp_callout_stop(tp, tp->tt_rexmt); 344 tp->t_rxtshift = 0; 345 tp->snd_nxt = tp->snd_una; 346 if (!tcp_callout_active(tp, tp->tt_persist)) 347 tcp_setpersist(tp); 348 } 349 } 350 351 KASSERT(len >= 0, ("%s: len < 0", __func__)); 352 /* 353 * Automatic sizing of send socket buffer. Often the send buffer 354 * size is not optimally adjusted to the actual network conditions 355 * at hand (delay bandwidth product). Setting the buffer size too 356 * small limits throughput on links with high bandwidth and high 357 * delay (eg. trans-continental/oceanic links). Setting the 358 * buffer size too big consumes too much real kernel memory, 359 * especially with many connections on busy servers. 360 * 361 * The criteria to step up the send buffer one notch are: 362 * 1. receive window of remote host is larger than send buffer 363 * (with a fudge factor of 5/4th); 364 * 2. send buffer is filled to 7/8th with data (so we actually 365 * have data to make use of it); 366 * 3. send buffer fill has not hit maximal automatic size; 367 * 4. our send window (slow start and cogestion controlled) is 368 * larger than sent but unacknowledged data in send buffer. 369 * 370 * The remote host receive window scaling factor may limit the 371 * growing of the send buffer before it reaches its allowed 372 * maximum. 373 * 374 * It scales directly with slow start or congestion window 375 * and does at most one step per received ACK. This fast 376 * scaling has the drawback of growing the send buffer beyond 377 * what is strictly necessary to make full use of a given 378 * delay*bandwith product. However testing has shown this not 379 * to be much of an problem. At worst we are trading wasting 380 * of available bandwith (the non-use of it) for wasting some 381 * socket buffer memory. 382 * 383 * TODO: Shrink send buffer during idle periods together 384 * with congestion window. Requires another timer. Has to 385 * wait for upcoming tcp timer rewrite. 386 */ 387 if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) { 388 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat && 389 so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) && 390 so->so_snd.ssb_cc < tcp_autosndbuf_max && 391 sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { 392 u_long newsize; 393 394 newsize = ulmin(so->so_snd.ssb_hiwat + 395 tcp_autosndbuf_inc, 396 tcp_autosndbuf_max); 397 if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) 398 atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); 399 if (newsize >= (TCP_MAXWIN << tp->snd_scale)) 400 atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); 401 } 402 } 403 404 /* 405 * Truncate to the maximum segment length and ensure that FIN is 406 * removed if the length no longer contains the last data byte. 407 */ 408 if (len > tp->t_maxseg) { 409 len = tp->t_maxseg; 410 sendalot = TRUE; 411 } 412 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) 413 flags &= ~TH_FIN; 414 415 recvwin = ssb_space(&so->so_rcv); 416 417 /* 418 * Sender silly window avoidance. We transmit under the following 419 * conditions when len is non-zero: 420 * 421 * - We have a full segment 422 * - This is the last buffer in a write()/send() and we are 423 * either idle or running NODELAY 424 * - we've timed out (e.g. persist timer) 425 * - we have more then 1/2 the maximum send window's worth of 426 * data (receiver may be limiting the window size) 427 * - we need to retransmit 428 */ 429 if (len) { 430 if (len == tp->t_maxseg) 431 goto send; 432 /* 433 * NOTE! on localhost connections an 'ack' from the remote 434 * end may occur synchronously with the output and cause 435 * us to flush a buffer queued with moretocome. XXX 436 * 437 * note: the len + off check is almost certainly unnecessary. 438 */ 439 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 440 (idle || (tp->t_flags & TF_NODELAY)) && 441 len + off >= so->so_snd.ssb_cc && 442 !(tp->t_flags & TF_NOPUSH)) { 443 goto send; 444 } 445 if (tp->t_flags & TF_FORCE) /* typ. timeout case */ 446 goto send; 447 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 448 goto send; 449 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ 450 goto send; 451 } 452 453 /* 454 * Compare available window to amount of window 455 * known to peer (as advertised window less 456 * next expected input). If the difference is at least two 457 * max size segments, or at least 50% of the maximum possible 458 * window, then want to send a window update to peer. 459 */ 460 if (recvwin > 0) { 461 /* 462 * "adv" is the amount we can increase the window, 463 * taking into account that we are limited by 464 * TCP_MAXWIN << tp->rcv_scale. 465 */ 466 long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - 467 (tp->rcv_adv - tp->rcv_nxt); 468 long hiwat; 469 470 /* 471 * This ack case typically occurs when the user has drained 472 * the TCP socket buffer sufficiently to warrent an ack 473 * containing a 'pure window update'... that is, an ack that 474 * ONLY updates the tcp window. 475 * 476 * It is unclear why we would need to do a pure window update 477 * past 2 segments if we are going to do one at 1/2 the high 478 * water mark anyway, especially since under normal conditions 479 * the user program will drain the socket buffer quickly. 480 * The 2-segment pure window update will often add a large 481 * number of extra, unnecessary acks to the stream. 482 * 483 * avoid_pure_win_update now defaults to 1. 484 */ 485 if (avoid_pure_win_update == 0 || 486 (tp->t_flags & TF_RXRESIZED)) { 487 if (adv >= (long) (2 * tp->t_maxseg)) { 488 goto send; 489 } 490 } 491 hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); 492 if (hiwat > (long)so->so_rcv.ssb_hiwat) 493 hiwat = (long)so->so_rcv.ssb_hiwat; 494 if (adv >= hiwat / 2) 495 goto send; 496 } 497 498 /* 499 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 500 * is also a catch-all for the retransmit timer timeout case. 501 */ 502 if (tp->t_flags & TF_ACKNOW) 503 goto send; 504 if ((flags & TH_RST) || 505 ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) 506 goto send; 507 if (SEQ_GT(tp->snd_up, tp->snd_una)) 508 goto send; 509 /* 510 * If our state indicates that FIN should be sent 511 * and we have not yet done so, then we need to send. 512 */ 513 if ((flags & TH_FIN) && 514 (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) 515 goto send; 516 517 /* 518 * TCP window updates are not reliable, rather a polling protocol 519 * using ``persist'' packets is used to insure receipt of window 520 * updates. The three ``states'' for the output side are: 521 * idle not doing retransmits or persists 522 * persisting to move a small or zero window 523 * (re)transmitting and thereby not persisting 524 * 525 * tcp_callout_active(tp, tp->tt_persist) 526 * is true when we are in persist state. 527 * The TF_FORCE flag in tp->t_flags 528 * is set when we are called to send a persist packet. 529 * tcp_callout_active(tp, tp->tt_rexmt) 530 * is set when we are retransmitting 531 * The output side is idle when both timers are zero. 532 * 533 * If send window is too small, there is data to transmit, and no 534 * retransmit or persist is pending, then go to persist state. 535 * 536 * If nothing happens soon, send when timer expires: 537 * if window is nonzero, transmit what we can, otherwise force out 538 * a byte. 539 * 540 * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED 541 * with data pending. This situation can occur during a 542 * simultanious connect. 543 */ 544 if (so->so_snd.ssb_cc > 0 && 545 tp->t_state != TCPS_SYN_RECEIVED && 546 !tcp_callout_active(tp, tp->tt_rexmt) && 547 !tcp_callout_active(tp, tp->tt_persist)) { 548 tp->t_rxtshift = 0; 549 tcp_setpersist(tp); 550 } 551 552 /* 553 * No reason to send a segment, just return. 554 */ 555 return (0); 556 557 send: 558 /* 559 * Before ESTABLISHED, force sending of initial options 560 * unless TCP set not to do any options. 561 * NOTE: we assume that the IP/TCP header plus TCP options 562 * always fit in a single mbuf, leaving room for a maximum 563 * link header, i.e. 564 * max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES 565 */ 566 optlen = 0; 567 if (isipv6) 568 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 569 else 570 hdrlen = sizeof(struct tcpiphdr); 571 if (flags & TH_SYN) { 572 tp->snd_nxt = tp->iss; 573 if (!(tp->t_flags & TF_NOOPT)) { 574 u_short mss; 575 576 opt[0] = TCPOPT_MAXSEG; 577 opt[1] = TCPOLEN_MAXSEG; 578 mss = htons((u_short) tcp_mssopt(tp)); 579 memcpy(opt + 2, &mss, sizeof mss); 580 optlen = TCPOLEN_MAXSEG; 581 582 if ((tp->t_flags & TF_REQ_SCALE) && 583 (!(flags & TH_ACK) || 584 (tp->t_flags & TF_RCVD_SCALE))) { 585 *((u_int32_t *)(opt + optlen)) = htonl( 586 TCPOPT_NOP << 24 | 587 TCPOPT_WINDOW << 16 | 588 TCPOLEN_WINDOW << 8 | 589 tp->request_r_scale); 590 optlen += 4; 591 } 592 593 if ((tcp_do_sack && !(flags & TH_ACK)) || 594 tp->t_flags & TF_SACK_PERMITTED) { 595 uint32_t *lp = (uint32_t *)(opt + optlen); 596 597 *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); 598 optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; 599 } 600 } 601 } 602 603 /* 604 * Send a timestamp and echo-reply if this is a SYN and our side 605 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 606 * and our peer have sent timestamps in our SYN's. 607 */ 608 if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && 609 !(flags & TH_RST) && 610 (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { 611 u_int32_t *lp = (u_int32_t *)(opt + optlen); 612 613 /* Form timestamp option as shown in appendix A of RFC 1323. */ 614 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 615 *lp++ = htonl(ticks); 616 *lp = htonl(tp->ts_recent); 617 optlen += TCPOLEN_TSTAMP_APPA; 618 } 619 620 /* Set receive buffer autosizing timestamp. */ 621 if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) 622 tp->rfbuf_ts = ticks; 623 624 /* 625 * If this is a SACK connection and we have a block to report, 626 * fill in the SACK blocks in the TCP options. 627 */ 628 if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == 629 TF_SACK_PERMITTED && 630 (!TAILQ_EMPTY(&tp->t_segq) || 631 tp->reportblk.rblk_start != tp->reportblk.rblk_end)) 632 tcp_sack_fill_report(tp, opt, &optlen); 633 634 #ifdef TCP_SIGNATURE 635 if (tp->t_flags & TF_SIGNATURE) { 636 int i; 637 u_char *bp; 638 /* 639 * Initialize TCP-MD5 option (RFC2385) 640 */ 641 bp = (u_char *)opt + optlen; 642 *bp++ = TCPOPT_SIGNATURE; 643 *bp++ = TCPOLEN_SIGNATURE; 644 sigoff = optlen + 2; 645 for (i = 0; i < TCP_SIGLEN; i++) 646 *bp++ = 0; 647 optlen += TCPOLEN_SIGNATURE; 648 /* 649 * Terminate options list and maintain 32-bit alignment. 650 */ 651 *bp++ = TCPOPT_NOP; 652 *bp++ = TCPOPT_EOL; 653 optlen += 2; 654 } 655 #endif /* TCP_SIGNATURE */ 656 KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); 657 hdrlen += optlen; 658 659 if (isipv6) { 660 ipoptlen = ip6_optlen(inp); 661 } else { 662 if (inp->inp_options) { 663 ipoptlen = inp->inp_options->m_len - 664 offsetof(struct ipoption, ipopt_list); 665 } else { 666 ipoptlen = 0; 667 } 668 } 669 #ifdef IPSEC 670 ipoptlen += ipsec_hdrsiz_tcp(tp); 671 #endif 672 673 /* 674 * Adjust data length if insertion of options will bump the packet 675 * length beyond the t_maxopd length. Clear FIN to prevent premature 676 * closure since there is still more data to send after this (now 677 * truncated) packet. 678 * 679 * If just the options do not fit we are in a no-win situation and 680 * we treat it as an unreachable host. 681 */ 682 if (len + optlen + ipoptlen > tp->t_maxopd) { 683 if (tp->t_maxopd <= optlen + ipoptlen) { 684 static time_t last_optlen_report; 685 686 if (last_optlen_report != time_second) { 687 last_optlen_report = time_second; 688 kprintf("tcpcb %p: MSS (%d) too small to hold options!\n", tp, tp->t_maxopd); 689 } 690 error = EHOSTUNREACH; 691 goto out; 692 } else { 693 flags &= ~TH_FIN; 694 len = tp->t_maxopd - optlen - ipoptlen; 695 sendalot = TRUE; 696 } 697 } 698 699 #ifdef INET6 700 KASSERT(max_linkhdr + hdrlen <= MCLBYTES, ("tcphdr too big")); 701 #else 702 KASSERT(max_linkhdr + hdrlen <= MHLEN, ("tcphdr too big")); 703 #endif 704 705 /* 706 * Grab a header mbuf, attaching a copy of data to 707 * be transmitted, and initialize the header from 708 * the template for sends on this connection. 709 */ 710 if (len) { 711 if ((tp->t_flags & TF_FORCE) && len == 1) 712 tcpstat.tcps_sndprobe++; 713 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 714 if (tp->snd_nxt == tp->snd_una) 715 tp->snd_max_rexmt = tp->snd_max; 716 if (nsacked) { 717 tcpstat.tcps_sndsackrtopack++; 718 tcpstat.tcps_sndsackrtobyte += len; 719 } 720 tcpstat.tcps_sndrexmitpack++; 721 tcpstat.tcps_sndrexmitbyte += len; 722 } else { 723 tcpstat.tcps_sndpack++; 724 tcpstat.tcps_sndbyte += len; 725 } 726 if (idle_cwv) { 727 idle_cwv = 0; 728 tcp_idle_cwnd_validate(tp); 729 } 730 /* Update last send time after CWV */ 731 tp->snd_last = ticks; 732 #ifdef notyet 733 if ((m = m_copypack(so->so_snd.ssb_mb, off, (int)len, 734 max_linkhdr + hdrlen)) == NULL) { 735 error = ENOBUFS; 736 goto after_th; 737 } 738 /* 739 * m_copypack left space for our hdr; use it. 740 */ 741 m->m_len += hdrlen; 742 m->m_data -= hdrlen; 743 #else 744 #ifndef INET6 745 m = m_gethdr(MB_DONTWAIT, MT_HEADER); 746 #else 747 m = m_getl(hdrlen + max_linkhdr, MB_DONTWAIT, MT_HEADER, 748 M_PKTHDR, NULL); 749 #endif 750 if (m == NULL) { 751 error = ENOBUFS; 752 goto after_th; 753 } 754 m->m_data += max_linkhdr; 755 m->m_len = hdrlen; 756 if (len <= MHLEN - hdrlen - max_linkhdr) { 757 m_copydata(so->so_snd.ssb_mb, off, (int) len, 758 mtod(m, caddr_t) + hdrlen); 759 m->m_len += len; 760 } else { 761 m->m_next = m_copy(so->so_snd.ssb_mb, off, (int) len); 762 if (m->m_next == NULL) { 763 m_free(m); 764 m = NULL; 765 error = ENOBUFS; 766 goto after_th; 767 } 768 } 769 #endif 770 /* 771 * If we're sending everything we've got, set PUSH. 772 * (This will keep happy those implementations which only 773 * give data to the user when a buffer fills or 774 * a PUSH comes in.) 775 */ 776 if (off + len == so->so_snd.ssb_cc) 777 flags |= TH_PUSH; 778 } else { 779 if (tp->t_flags & TF_ACKNOW) 780 tcpstat.tcps_sndacks++; 781 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 782 tcpstat.tcps_sndctrl++; 783 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 784 tcpstat.tcps_sndurg++; 785 else 786 tcpstat.tcps_sndwinup++; 787 788 MGETHDR(m, MB_DONTWAIT, MT_HEADER); 789 if (m == NULL) { 790 error = ENOBUFS; 791 goto after_th; 792 } 793 if (isipv6 && 794 (hdrlen + max_linkhdr > MHLEN) && hdrlen <= MHLEN) 795 MH_ALIGN(m, hdrlen); 796 else 797 m->m_data += max_linkhdr; 798 m->m_len = hdrlen; 799 } 800 m->m_pkthdr.rcvif = NULL; 801 if (isipv6) { 802 ip6 = mtod(m, struct ip6_hdr *); 803 th = (struct tcphdr *)(ip6 + 1); 804 tcp_fillheaders(tp, ip6, th); 805 } else { 806 ip = mtod(m, struct ip *); 807 ipov = (struct ipovly *)ip; 808 th = (struct tcphdr *)(ip + 1); 809 /* this picks up the pseudo header (w/o the length) */ 810 tcp_fillheaders(tp, ip, th); 811 } 812 after_th: 813 /* 814 * Fill in fields, remembering maximum advertised 815 * window for use in delaying messages about window sizes. 816 * If resending a FIN, be sure not to use a new sequence number. 817 */ 818 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 819 tp->snd_nxt == tp->snd_max) 820 tp->snd_nxt--; 821 822 if (th != NULL) { 823 /* 824 * If we are doing retransmissions, then snd_nxt will 825 * not reflect the first unsent octet. For ACK only 826 * packets, we do not want the sequence number of the 827 * retransmitted packet, we want the sequence number 828 * of the next unsent octet. So, if there is no data 829 * (and no SYN or FIN), use snd_max instead of snd_nxt 830 * when filling in ti_seq. But if we are in persist 831 * state, snd_max might reflect one byte beyond the 832 * right edge of the window, so use snd_nxt in that 833 * case, since we know we aren't doing a retransmission. 834 * (retransmit and persist are mutually exclusive...) 835 */ 836 if (len || (flags & (TH_SYN|TH_FIN)) || 837 tcp_callout_active(tp, tp->tt_persist)) 838 th->th_seq = htonl(tp->snd_nxt); 839 else 840 th->th_seq = htonl(tp->snd_max); 841 th->th_ack = htonl(tp->rcv_nxt); 842 if (optlen) { 843 bcopy(opt, th + 1, optlen); 844 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 845 } 846 th->th_flags = flags; 847 } 848 849 /* 850 * Calculate receive window. Don't shrink window, but avoid 851 * silly window syndrome by sending a 0 window if the actual 852 * window is less then one segment. 853 */ 854 if (recvwin < (long)(so->so_rcv.ssb_hiwat / 4) && 855 recvwin < (long)tp->t_maxseg) 856 recvwin = 0; 857 if (recvwin < (tcp_seq_diff_t)(tp->rcv_adv - tp->rcv_nxt)) 858 recvwin = (tcp_seq_diff_t)(tp->rcv_adv - tp->rcv_nxt); 859 if (recvwin > (long)TCP_MAXWIN << tp->rcv_scale) 860 recvwin = (long)TCP_MAXWIN << tp->rcv_scale; 861 862 /* 863 * Adjust the RXWIN0SENT flag - indicate that we have advertised 864 * a 0 window. This may cause the remote transmitter to stall. This 865 * flag tells soreceive() to disable delayed acknowledgements when 866 * draining the buffer. This can occur if the receiver is attempting 867 * to read more data then can be buffered prior to transmitting on 868 * the connection. 869 */ 870 if (recvwin == 0) 871 tp->t_flags |= TF_RXWIN0SENT; 872 else 873 tp->t_flags &= ~TF_RXWIN0SENT; 874 875 if (th != NULL) 876 th->th_win = htons((u_short) (recvwin>>tp->rcv_scale)); 877 878 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 879 if (th != NULL) { 880 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 881 th->th_flags |= TH_URG; 882 } 883 } else { 884 /* 885 * If no urgent pointer to send, then we pull 886 * the urgent pointer to the left edge of the send window 887 * so that it doesn't drift into the send window on sequence 888 * number wraparound. 889 */ 890 tp->snd_up = tp->snd_una; /* drag it along */ 891 } 892 893 if (th != NULL) { 894 #ifdef TCP_SIGNATURE 895 if (tp->t_flags & TF_SIGNATURE) { 896 tcpsignature_compute(m, len, optlen, 897 (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); 898 } 899 #endif /* TCP_SIGNATURE */ 900 901 /* 902 * Put TCP length in extended header, and then 903 * checksum extended header and data. 904 */ 905 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 906 if (isipv6) { 907 /* 908 * ip6_plen is not need to be filled now, and will be 909 * filled in ip6_output(). 910 */ 911 th->th_sum = in6_cksum(m, IPPROTO_TCP, 912 sizeof(struct ip6_hdr), 913 sizeof(struct tcphdr) + optlen + len); 914 } else { 915 m->m_pkthdr.csum_flags = CSUM_TCP; 916 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 917 if (len + optlen) { 918 th->th_sum = in_addword(th->th_sum, 919 htons((u_short)(optlen + len))); 920 } 921 922 /* 923 * IP version must be set here for ipv4/ipv6 checking 924 * later 925 */ 926 KASSERT(ip->ip_v == IPVERSION, 927 ("%s: IP version incorrect: %d", 928 __func__, ip->ip_v)); 929 } 930 } 931 932 /* 933 * In transmit state, time the transmission and arrange for 934 * the retransmit. In persist state, just set snd_max. 935 */ 936 if (!(tp->t_flags & TF_FORCE) || 937 !tcp_callout_active(tp, tp->tt_persist)) { 938 tcp_seq startseq = tp->snd_nxt; 939 940 /* 941 * Advance snd_nxt over sequence space of this segment. 942 */ 943 if (flags & (TH_SYN | TH_FIN)) { 944 if (flags & TH_SYN) 945 tp->snd_nxt++; 946 if (flags & TH_FIN) { 947 tp->snd_nxt++; 948 tp->t_flags |= TF_SENTFIN; 949 } 950 } 951 tp->snd_nxt += len; 952 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 953 tp->snd_max = tp->snd_nxt; 954 /* 955 * Time this transmission if not a retransmission and 956 * not currently timing anything. 957 */ 958 if (tp->t_rtttime == 0) { 959 tp->t_rtttime = ticks; 960 tp->t_rtseq = startseq; 961 tcpstat.tcps_segstimed++; 962 } 963 } 964 965 /* 966 * Set retransmit timer if not currently set, 967 * and not doing a pure ack or a keep-alive probe. 968 * Initial value for retransmit timer is smoothed 969 * round-trip time + 2 * round-trip time variance. 970 * Initialize shift counter which is used for backoff 971 * of retransmit time. 972 */ 973 if (!tcp_callout_active(tp, tp->tt_rexmt) && 974 tp->snd_nxt != tp->snd_una) { 975 if (tcp_callout_active(tp, tp->tt_persist)) { 976 tcp_callout_stop(tp, tp->tt_persist); 977 tp->t_rxtshift = 0; 978 } 979 tcp_callout_reset(tp, tp->tt_rexmt, tp->t_rxtcur, 980 tcp_timer_rexmt); 981 } 982 } else { 983 /* 984 * Persist case, update snd_max but since we are in 985 * persist mode (no window) we do not update snd_nxt. 986 */ 987 int xlen = len; 988 if (flags & TH_SYN) 989 panic("tcp_output: persist timer to send SYN"); 990 if (flags & TH_FIN) { 991 ++xlen; 992 tp->t_flags |= TF_SENTFIN; 993 } 994 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) 995 tp->snd_max = tp->snd_nxt + xlen; 996 } 997 998 if (th != NULL) { 999 #ifdef TCPDEBUG 1000 /* Trace. */ 1001 if (so->so_options & SO_DEBUG) { 1002 tcp_trace(TA_OUTPUT, tp->t_state, tp, 1003 mtod(m, void *), th, 0); 1004 } 1005 #endif 1006 1007 /* 1008 * Fill in IP length and desired time to live and 1009 * send to IP level. There should be a better way 1010 * to handle ttl and tos; we could keep them in 1011 * the template, but need a way to checksum without them. 1012 */ 1013 /* 1014 * m->m_pkthdr.len should have been set before cksum 1015 * calcuration, because in6_cksum() need it. 1016 */ 1017 if (isipv6) { 1018 /* 1019 * we separately set hoplimit for every segment, 1020 * since the user might want to change the value 1021 * via setsockopt. Also, desired default hop 1022 * limit might be changed via Neighbor Discovery. 1023 */ 1024 ip6->ip6_hlim = in6_selecthlim(inp, 1025 (inp->in6p_route.ro_rt ? 1026 inp->in6p_route.ro_rt->rt_ifp : NULL)); 1027 1028 /* TODO: IPv6 IP6TOS_ECT bit on */ 1029 error = ip6_output(m, inp->in6p_outputopts, 1030 &inp->in6p_route, (so->so_options & SO_DONTROUTE), 1031 NULL, NULL, inp); 1032 } else { 1033 struct rtentry *rt; 1034 ip->ip_len = m->m_pkthdr.len; 1035 #ifdef INET6 1036 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1037 ip->ip_ttl = in6_selecthlim(inp, 1038 (inp->in6p_route.ro_rt ? 1039 inp->in6p_route.ro_rt->rt_ifp : NULL)); 1040 else 1041 #endif 1042 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */ 1043 1044 ip->ip_tos = inp->inp_ip_tos; /* XXX */ 1045 /* 1046 * See if we should do MTU discovery. 1047 * We do it only if the following are true: 1048 * 1) we have a valid route to the destination 1049 * 2) the MTU is not locked (if it is, 1050 * then discovery has been disabled) 1051 */ 1052 if (path_mtu_discovery && 1053 (rt = inp->inp_route.ro_rt) && 1054 (rt->rt_flags & RTF_UP) && 1055 !(rt->rt_rmx.rmx_locks & RTV_MTU)) 1056 ip->ip_off |= IP_DF; 1057 1058 error = ip_output(m, inp->inp_options, &inp->inp_route, 1059 (so->so_options & SO_DONTROUTE) | 1060 IP_DEBUGROUTE, NULL, inp); 1061 } 1062 } else { 1063 KASSERT(error != 0, ("no error, but th not set")); 1064 } 1065 if (error) { 1066 1067 /* 1068 * We know that the packet was lost, so back out the 1069 * sequence number advance, if any. 1070 */ 1071 if (!(tp->t_flags & TF_FORCE) || 1072 !tcp_callout_active(tp, tp->tt_persist)) { 1073 /* 1074 * No need to check for TH_FIN here because 1075 * the TF_SENTFIN flag handles that case. 1076 */ 1077 if (!(flags & TH_SYN)) 1078 tp->snd_nxt -= len; 1079 } 1080 1081 out: 1082 if (error == ENOBUFS) { 1083 /* 1084 * If we can't send, make sure there is something 1085 * to get us going again later. 1086 * 1087 * The persist timer isn't necessarily allowed in all 1088 * states, use the rexmt timer. 1089 */ 1090 if (!tcp_callout_active(tp, tp->tt_rexmt) && 1091 !tcp_callout_active(tp, tp->tt_persist)) { 1092 tcp_callout_reset(tp, tp->tt_rexmt, 1093 tp->t_rxtcur, 1094 tcp_timer_rexmt); 1095 #if 0 1096 tp->t_rxtshift = 0; 1097 tcp_setpersist(tp); 1098 #endif 1099 } 1100 tcp_quench(inp, 0); 1101 return (0); 1102 } 1103 if (error == EMSGSIZE) { 1104 /* 1105 * ip_output() will have already fixed the route 1106 * for us. tcp_mtudisc() will, as its last action, 1107 * initiate retransmission, so it is important to 1108 * not do so here. 1109 */ 1110 tcp_mtudisc(inp, 0); 1111 return 0; 1112 } 1113 if ((error == EHOSTUNREACH || error == ENETDOWN) && 1114 TCPS_HAVERCVDSYN(tp->t_state)) { 1115 tp->t_softerror = error; 1116 return (0); 1117 } 1118 return (error); 1119 } 1120 tcpstat.tcps_sndtotal++; 1121 1122 /* 1123 * Data sent (as far as we can tell). 1124 * 1125 * If this advertises a larger window than any other segment, 1126 * then remember the size of the advertised window. 1127 * 1128 * Any pending ACK has now been sent. 1129 */ 1130 if (recvwin > 0 && SEQ_GT(tp->rcv_nxt + recvwin, tp->rcv_adv)) { 1131 tp->rcv_adv = tp->rcv_nxt + recvwin; 1132 tp->t_flags &= ~TF_RXRESIZED; 1133 } 1134 tp->last_ack_sent = tp->rcv_nxt; 1135 tp->t_flags &= ~TF_ACKNOW; 1136 if (tcp_delack_enabled) 1137 tcp_callout_stop(tp, tp->tt_delack); 1138 if (sendalot) 1139 goto again; 1140 return (0); 1141 } 1142 1143 void 1144 tcp_setpersist(struct tcpcb *tp) 1145 { 1146 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 1147 int tt; 1148 1149 if (tp->t_state == TCPS_SYN_SENT || 1150 tp->t_state == TCPS_SYN_RECEIVED) { 1151 panic("tcp_setpersist: not established yet, current %s", 1152 tp->t_state == TCPS_SYN_SENT ? 1153 "SYN_SENT" : "SYN_RECEIVED"); 1154 } 1155 1156 if (tcp_callout_active(tp, tp->tt_rexmt)) 1157 panic("tcp_setpersist: retransmit pending"); 1158 /* 1159 * Start/restart persistance timer. 1160 */ 1161 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, 1162 TCPTV_PERSMAX); 1163 tcp_callout_reset(tp, tp->tt_persist, tt, tcp_timer_persist); 1164 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 1165 tp->t_rxtshift++; 1166 } 1167 1168 static void 1169 tcp_idle_cwnd_validate(struct tcpcb *tp) 1170 { 1171 u_long initial_cwnd = tcp_initial_window(tp); 1172 u_long min_cwnd; 1173 1174 tcpstat.tcps_sndidle++; 1175 1176 /* According to RFC5681: RW=min(IW,cwnd) */ 1177 min_cwnd = min(tp->snd_cwnd, initial_cwnd); 1178 1179 if (tcp_idle_cwv) { 1180 u_long idle_time, decay_cwnd; 1181 1182 /* 1183 * RFC2861, but only after idle period. 1184 */ 1185 1186 /* 1187 * Before the congestion window is reduced, ssthresh 1188 * is set to the maximum of its current value and 3/4 1189 * cwnd. If the sender then has more data to send 1190 * than the decayed cwnd allows, the TCP will slow- 1191 * start (perform exponential increase) at least 1192 * half-way back up to the old value of cwnd. 1193 */ 1194 tp->snd_ssthresh = max(tp->snd_ssthresh, 1195 (3 * tp->snd_cwnd) / 4); 1196 1197 /* 1198 * Decay the congestion window by half for every RTT 1199 * that the flow remains inactive. 1200 * 1201 * The difference between our implementation and 1202 * RFC2861 is that we don't allow cwnd to go below 1203 * the value allowed by RFC5681 (min_cwnd). 1204 */ 1205 idle_time = ticks - tp->snd_last; 1206 decay_cwnd = tp->snd_cwnd; 1207 while (idle_time >= tp->t_rxtcur && 1208 decay_cwnd > min_cwnd) { 1209 decay_cwnd >>= 1; 1210 idle_time -= tp->t_rxtcur; 1211 } 1212 tp->snd_cwnd = max(decay_cwnd, min_cwnd); 1213 } else { 1214 /* 1215 * Slow-start from scratch to re-determine the send 1216 * congestion window. 1217 */ 1218 tp->snd_cwnd = min_cwnd; 1219 } 1220 1221 /* Restart ABC counting during congestion avoidance */ 1222 tp->snd_wacked = 0; 1223 } 1224