xref: /dragonfly/sys/netinet/tcp_output.c (revision 0600465e)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
36  *	The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
63  * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.20 2003/01/29 22:45:36 hsu Exp $
64  */
65 
66 #include "opt_inet.h"
67 #include "opt_inet6.h"
68 #include "opt_tcpdebug.h"
69 
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/sysctl.h>
74 #include <sys/mbuf.h>
75 #include <sys/domain.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <sys/in_cksum.h>
80 #include <sys/thread.h>
81 #include <sys/globaldata.h>
82 
83 #include <net/if.h>
84 #include <net/if_var.h>
85 #include <net/route.h>
86 #include <net/netmsg2.h>
87 #include <net/netisr2.h>
88 
89 #include <netinet/in.h>
90 #include <netinet/in_systm.h>
91 #include <netinet/ip.h>
92 #include <netinet/in_pcb.h>
93 #include <netinet/ip_var.h>
94 #include <netinet6/in6_pcb.h>
95 #include <netinet/ip6.h>
96 #include <netinet6/ip6_var.h>
97 #include <netinet/tcp.h>
98 #define	TCPOUTFLAGS
99 #include <netinet/tcp_fsm.h>
100 #include <netinet/tcp_seq.h>
101 #include <netinet/tcp_timer.h>
102 #include <netinet/tcp_timer2.h>
103 #include <netinet/tcp_var.h>
104 #include <netinet/tcpip.h>
105 #ifdef TCPDEBUG
106 #include <netinet/tcp_debug.h>
107 #endif
108 
109 #ifdef notyet
110 extern struct mbuf *m_copypack();
111 #endif
112 
113 int path_mtu_discovery = 1;
114 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
115 	&path_mtu_discovery, 1, "Enable Path MTU Discovery");
116 
117 static int avoid_pure_win_update = 1;
118 SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
119 	&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
120 
121 /*
122  * 0 - disabled; 1 - enabled for increasing and decreasing the buffer size
123  * 2 - enabled only for increasing the buffer size
124  */
125 int tcp_do_autosndbuf = 1;
126 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
127     &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
128 
129 int tcp_autosndbuf_inc = 8*1024;
130 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
131     &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
132 
133 int tcp_autosndbuf_min = 32768;
134 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_min, CTLFLAG_RW,
135     &tcp_autosndbuf_min, 0, "Min size of automatic send buffer");
136 
137 int tcp_autosndbuf_max = 2*1024*1024;
138 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
139     &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
140 
141 int tcp_prio_synack = 1;
142 SYSCTL_INT(_net_inet_tcp, OID_AUTO, prio_synack, CTLFLAG_RW,
143     &tcp_prio_synack, 0, "Prioritize SYN, SYN|ACK and pure ACK");
144 
145 static int tcp_idle_cwv = 1;
146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, idle_cwv, CTLFLAG_RW,
147     &tcp_idle_cwv, 0,
148     "Congestion window validation after idle period (part of RFC2861)");
149 
150 static int tcp_idle_restart = 1;
151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, idle_restart, CTLFLAG_RW,
152     &tcp_idle_restart, 0, "Reset congestion window after idle period");
153 
154 static int tcp_do_tso = 1;
155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
156     &tcp_do_tso, 0, "Enable TCP Segmentation Offload (TSO)");
157 
158 static int tcp_fairsend = 4;
159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fairsend, CTLFLAG_RW,
160     &tcp_fairsend, 0,
161     "Amount of segments sent before yield to other senders or receivers");
162 
163 static void	tcp_idle_cwnd_validate(struct tcpcb *);
164 
165 static int	tcp_tso_getsize(struct tcpcb *tp, u_int *segsz, u_int *hlen);
166 static void	tcp_output_sched(struct tcpcb *tp);
167 
168 /*
169  * Tcp output routine: figure out what should be sent and send it.
170  */
171 int
172 tcp_output(struct tcpcb *tp)
173 {
174 	struct inpcb * const inp = tp->t_inpcb;
175 	struct socket *so = inp->inp_socket;
176 	long len, recvwin, sendwin;
177 	int nsacked = 0;
178 	int off, flags, error = 0;
179 #ifdef TCP_SIGNATURE
180 	int sigoff = 0;
181 #endif
182 	struct mbuf *m;
183 	struct ip *ip;
184 	struct tcphdr *th;
185 	u_char opt[TCP_MAXOLEN];
186 	unsigned int ipoptlen, optlen, hdrlen;
187 	int idle;
188 	boolean_t sendalot;
189 	struct ip6_hdr *ip6;
190 #ifdef INET6
191 	const boolean_t isipv6 = INP_ISIPV6(inp);
192 #else
193 	const boolean_t isipv6 = FALSE;
194 #endif
195 	boolean_t can_tso = FALSE, use_tso;
196 	boolean_t report_sack, idle_cwv = FALSE;
197 	u_int segsz, tso_hlen, tso_lenmax = 0;
198 	int segcnt = 0;
199 	boolean_t need_sched = FALSE;
200 
201 	KKASSERT(so->so_port == &curthread->td_msgport);
202 
203 	/*
204 	 * Determine length of data that should be transmitted,
205 	 * and flags that will be used.
206 	 * If there is some data or critical controls (SYN, RST)
207 	 * to send, then transmit; otherwise, investigate further.
208 	 */
209 
210 	/*
211 	 * If we have been idle for a while, the send congestion window
212 	 * could be no longer representative of the current state of the
213 	 * link; need to validate congestion window.  However, we should
214 	 * not perform congestion window validation here, since we could
215 	 * be asked to send pure ACK.
216 	 */
217 	if (tp->snd_max == tp->snd_una &&
218 	    (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart)
219 		idle_cwv = TRUE;
220 
221 	/*
222 	 * Calculate whether the transmit stream was previously idle
223 	 * and adjust TF_LASTIDLE for the next time.
224 	 */
225 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
226 	if (idle && (tp->t_flags & TF_MORETOCOME))
227 		tp->t_flags |= TF_LASTIDLE;
228 	else
229 		tp->t_flags &= ~TF_LASTIDLE;
230 
231 	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
232 	    !IN_FASTRECOVERY(tp))
233 		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);
234 
235 	/*
236 	 * Find out whether TSO could be used or not
237 	 *
238 	 * For TSO capable devices, the following assumptions apply to
239 	 * the processing of TCP flags:
240 	 * - If FIN is set on the large TCP segment, the device must set
241 	 *   FIN on the last segment that it creates from the large TCP
242 	 *   segment.
243 	 * - If PUSH is set on the large TCP segment, the device must set
244 	 *   PUSH on the last segment that it creates from the large TCP
245 	 *   segment.
246 	 */
247 	if (tcp_do_tso
248 #ifdef TCP_SIGNATURE
249 	    && (tp->t_flags & TF_SIGNATURE) == 0
250 #endif
251 	) {
252 		if (!isipv6) {
253 			struct rtentry *rt = inp->inp_route.ro_rt;
254 
255 			if (rt != NULL && (rt->rt_flags & RTF_UP) &&
256 			    (rt->rt_ifp->if_hwassist & CSUM_TSO)) {
257 				can_tso = TRUE;
258 				tso_lenmax = rt->rt_ifp->if_tsolen;
259 			}
260 		}
261 	}
262 
263 again:
264 	m = NULL;
265 	ip = NULL;
266 	th = NULL;
267 	ip6 = NULL;
268 
269 	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
270 		TF_SACK_PERMITTED &&
271 	    (!TAILQ_EMPTY(&tp->t_segq) ||
272 	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
273 		report_sack = TRUE;
274 	else
275 		report_sack = FALSE;
276 
277 	/* Make use of SACK information when slow-starting after a RTO. */
278 	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
279 	    !IN_FASTRECOVERY(tp)) {
280 		tcp_seq old_snd_nxt = tp->snd_nxt;
281 
282 		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
283 		nsacked += tp->snd_nxt - old_snd_nxt;
284 	}
285 
286 	sendalot = FALSE;
287 	off = tp->snd_nxt - tp->snd_una;
288 	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
289 	sendwin = min(sendwin, tp->snd_bwnd);
290 
291 	flags = tcp_outflags[tp->t_state];
292 	/*
293 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
294 	 * state flags.
295 	 */
296 	if (tp->t_flags & TF_NEEDFIN)
297 		flags |= TH_FIN;
298 	if (tp->t_flags & TF_NEEDSYN)
299 		flags |= TH_SYN;
300 
301 	/*
302 	 * If in persist timeout with window of 0, send 1 byte.
303 	 * Otherwise, if window is small but nonzero
304 	 * and timer expired, we will send what we can
305 	 * and go to transmit state.
306 	 */
307 	if (tp->t_flags & TF_FORCE) {
308 		if (sendwin == 0) {
309 			/*
310 			 * If we still have some data to send, then
311 			 * clear the FIN bit.  Usually this would
312 			 * happen below when it realizes that we
313 			 * aren't sending all the data.  However,
314 			 * if we have exactly 1 byte of unsent data,
315 			 * then it won't clear the FIN bit below,
316 			 * and if we are in persist state, we wind
317 			 * up sending the packet without recording
318 			 * that we sent the FIN bit.
319 			 *
320 			 * We can't just blindly clear the FIN bit,
321 			 * because if we don't have any more data
322 			 * to send then the probe will be the FIN
323 			 * itself.
324 			 */
325 			if (off < so->so_snd.ssb_cc)
326 				flags &= ~TH_FIN;
327 			sendwin = 1;
328 		} else {
329 			tcp_callout_stop(tp, tp->tt_persist);
330 			tp->t_rxtshift = 0;
331 		}
332 	}
333 
334 	/*
335 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
336 	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
337 	 * a negative length.  This can also occur when TCP opens up
338 	 * its congestion window while receiving additional duplicate
339 	 * acks after fast-retransmit because TCP will reset snd_nxt
340 	 * to snd_max after the fast-retransmit.
341 	 *
342 	 * A negative length can also occur when we are in the
343 	 * TCPS_SYN_RECEIVED state due to a simultaneous connect where
344 	 * our SYN has not been acked yet.
345 	 *
346 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
347 	 * be set to snd_una, the offset will be 0, and the length may
348 	 * wind up 0.
349 	 */
350 	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;
351 
352 	/*
353 	 * Lop off SYN bit if it has already been sent.  However, if this
354 	 * is SYN-SENT state and if segment contains data, suppress sending
355 	 * segment (sending the segment would be an option if we still
356 	 * did TAO and the remote host supported it).
357 	 */
358 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
359 		flags &= ~TH_SYN;
360 		off--, len++;
361 		if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
362 			tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
363 			return 0;
364 		}
365 	}
366 
367 	/*
368 	 * Be careful not to send data and/or FIN on SYN segments.
369 	 * This measure is needed to prevent interoperability problems
370 	 * with not fully conformant TCP implementations.
371 	 */
372 	if (flags & TH_SYN) {
373 		len = 0;
374 		flags &= ~TH_FIN;
375 	}
376 
377 	if (len < 0) {
378 		/*
379 		 * A negative len can occur if our FIN has been sent but not
380 		 * acked, or if we are in a simultaneous connect in the
381 		 * TCPS_SYN_RECEIVED state with our SYN sent but not yet
382 		 * acked.
383 		 *
384 		 * If our window has contracted to 0 in the FIN case
385 		 * (which can only occur if we have NOT been called to
386 		 * retransmit as per code a few paragraphs up) then we
387 		 * want to shift the retransmit timer over to the
388 		 * persist timer.
389 		 *
390 		 * However, if we are in the TCPS_SYN_RECEIVED state
391 		 * (the SYN case) we will be in a simultaneous connect and
392 		 * the window may be zero degeneratively.  In this case we
393 		 * do not want to shift to the persist timer after the SYN
394 		 * or the SYN+ACK transmission.
395 		 */
396 		len = 0;
397 		if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) {
398 			tcp_callout_stop(tp, tp->tt_rexmt);
399 			tp->t_rxtshift = 0;
400 			tp->snd_nxt = tp->snd_una;
401 			if (!tcp_callout_active(tp, tp->tt_persist))
402 				tcp_setpersist(tp);
403 		}
404 	}
405 
406 	KASSERT(len >= 0, ("%s: len < 0", __func__));
407 	/*
408 	 * Automatic sizing of send socket buffer.  Often the send buffer
409 	 * size is not optimally adjusted to the actual network conditions
410 	 * at hand (delay bandwidth product).  Setting the buffer size too
411 	 * small limits throughput on links with high bandwidth and high
412 	 * delay (eg. trans-continental/oceanic links).  Setting the
413 	 * buffer size too big consumes too much real kernel memory,
414 	 * especially with many connections on busy servers.
415 	 *
416 	 * The criteria to step up the send buffer one notch are:
417 	 *  1. receive window of remote host is larger than send buffer
418 	 *     (with a fudge factor of 5/4th);
419 	 *  2. hiwat has not significantly exceeded bwnd (inflight)
420 	 *     (bwnd is a maximal value if inflight is disabled).
421 	 *  3. send buffer is filled to 7/8th with data (so we actually
422 	 *     have data to make use of it);
423 	 *  4. hiwat has not hit maximal automatic size;
424 	 *  5. our send window (slow start and congestion controlled) is
425 	 *     larger than sent but unacknowledged data in send buffer.
426 	 *
427 	 * The remote host receive window scaling factor may limit the
428 	 * growing of the send buffer before it reaches its allowed
429 	 * maximum.
430 	 *
431 	 * It scales directly with slow start or congestion window
432 	 * and does at most one step per received ACK.  This fast
433 	 * scaling has the drawback of growing the send buffer beyond
434 	 * what is strictly necessary to make full use of a given
435 	 * delay*bandwidth product.  However testing has shown this not
436 	 * to be much of a problem.  At worst we are trading wasting
437 	 * of available bandwidth (the non-use of it) for wasting some
438 	 * socket buffer memory.
439 	 *
440 	 * The criterion for shrinking the buffer is based solely on
441 	 * the inflight code (snd_bwnd).  If inflight is disabled,
442 	 * the buffer will not be shrunk.  Note that snd_bwnd already
443 	 * has a fudge factor.  Our test adds a little hysteresis.
444 	 */
445 	if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) {
446 		const int asbinc = tcp_autosndbuf_inc;
447 		const int hiwat = so->so_snd.ssb_hiwat;
448 		const int lowat = so->so_snd.ssb_lowat;
449 		u_long newsize;
450 
451 		if ((tp->snd_wnd / 4 * 5) >= hiwat &&
452 		    so->so_snd.ssb_cc >= (hiwat / 8 * 7) &&
453 		    hiwat < tp->snd_bwnd + hiwat / 10 &&
454 		    hiwat + asbinc < tcp_autosndbuf_max &&
455 		    hiwat < (TCP_MAXWIN << tp->snd_scale) &&
456 		    sendwin >= (so->so_snd.ssb_cc -
457 				(tp->snd_nxt - tp->snd_una))) {
458 			newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max);
459 			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
460 				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
461 #if 0
462 			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
463 				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
464 #endif
465 		} else if ((long)tp->snd_bwnd <
466 			   (long)(hiwat * 3 / 4 - lowat - asbinc) &&
467 			   hiwat > tp->t_maxseg * 2 + asbinc &&
468 			   hiwat + asbinc >= tcp_autosndbuf_min &&
469 			   tcp_do_autosndbuf == 1) {
470 			newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2);
471 			ssb_reserve(&so->so_snd, newsize, so, NULL);
472 		}
473 	}
474 
475 	/*
476 	 * Don't use TSO, if:
477 	 * - Congestion window needs validation
478 	 * - There are SACK blocks to report
479 	 * - RST or SYN flag is set
480 	 * - URG will be set
481 	 *
482 	 * XXX
483 	 * Checking for SYN|RST looks like overkill, but better safe than sorry
484 	 */
485 	use_tso = can_tso;
486 	if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN)))
487 		use_tso = FALSE;
488 	if (use_tso) {
489 		tcp_seq ugr_nxt = tp->snd_nxt;
490 
491 		if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
492 		    tp->snd_nxt == tp->snd_max)
493 			--ugr_nxt;
494 
495 		if (SEQ_GT(tp->snd_up, ugr_nxt))
496 			use_tso = FALSE;
497 	}
498 
499 	if (use_tso) {
500 		/*
501 		 * Find out segment size and header length for TSO
502 		 */
503 		error = tcp_tso_getsize(tp, &segsz, &tso_hlen);
504 		if (error)
505 			use_tso = FALSE;
506 	}
507 	if (!use_tso) {
508 		segsz = tp->t_maxseg;
509 		tso_hlen = 0; /* not used */
510 	}
511 
512 	/*
513 	 * Truncate to the maximum segment length if not TSO, and ensure that
514 	 * FIN is removed if the length no longer contains the last data byte.
515 	 */
516 	if (len > segsz) {
517 		if (!use_tso) {
518 			len = segsz;
519 			++segcnt;
520 		} else {
521 			int nsegs;
522 
523 			if (__predict_false(tso_lenmax < segsz))
524 				tso_lenmax = segsz << 1;
525 
526 			/*
527 			 * Truncate TSO transfers to (IP_MAXPACKET - iphlen -
528 			 * thoff), and make sure that we send equal size
529 			 * transfers down the stack (rather than big-small-
530 			 * big-small-...).
531 			 */
532 			len = min(len, tso_lenmax);
533 			nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz;
534 			KKASSERT(nsegs > 0);
535 
536 			len = nsegs * segsz;
537 
538 			if (len <= segsz) {
539 				use_tso = FALSE;
540 				++segcnt;
541 			} else {
542 				segcnt += nsegs;
543 			}
544 		}
545 		sendalot = TRUE;
546 	} else {
547 		use_tso = FALSE;
548 		if (len > 0)
549 			++segcnt;
550 	}
551 	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
552 		flags &= ~TH_FIN;
553 
554 	recvwin = ssb_space(&so->so_rcv);
555 
556 	/*
557 	 * Sender silly window avoidance.   We transmit under the following
558 	 * conditions when len is non-zero:
559 	 *
560 	 *	- We have a full segment
561 	 *	- This is the last buffer in a write()/send() and we are
562 	 *	  either idle or running NODELAY
563 	 *	- we've timed out (e.g. persist timer)
564 	 *	- we have more than 1/2 the maximum send window's worth of
565 	 *	  data (receiver may be limiting the window size)
566 	 *	- we need to retransmit
567 	 */
568 	if (len) {
569 		if (len >= segsz)
570 			goto send;
571 		/*
572 		 * NOTE! on localhost connections an 'ack' from the remote
573 		 * end may occur synchronously with the output and cause
574 		 * us to flush a buffer queued with moretocome.  XXX
575 		 *
576 		 * note: the len + off check is almost certainly unnecessary.
577 		 */
578 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
579 		    (idle || (tp->t_flags & TF_NODELAY)) &&
580 		    len + off >= so->so_snd.ssb_cc &&
581 		    !(tp->t_flags & TF_NOPUSH)) {
582 			goto send;
583 		}
584 		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
585 			goto send;
586 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
587 			goto send;
588 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
589 			goto send;
590 		if (tp->t_flags & TF_XMITNOW)
591 			goto send;
592 	}
593 
594 	/*
595 	 * Compare available window to amount of window
596 	 * known to peer (as advertised window less
597 	 * next expected input).  If the difference is at least two
598 	 * max size segments, or at least 50% of the maximum possible
599 	 * window, then want to send a window update to peer.
600 	 */
601 	if (recvwin > 0) {
602 		/*
603 		 * "adv" is the amount we can increase the window,
604 		 * taking into account that we are limited by
605 		 * TCP_MAXWIN << tp->rcv_scale.
606 		 */
607 		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
608 			(tp->rcv_adv - tp->rcv_nxt);
609 		long hiwat;
610 
611 		/*
612 		 * This ack case typically occurs when the user has drained
613 		 * the TCP socket buffer sufficiently to warrant an ack
614 		 * containing a 'pure window update'... that is, an ack that
615 		 * ONLY updates the tcp window.
616 		 *
617 		 * It is unclear why we would need to do a pure window update
618 		 * past 2 segments if we are going to do one at 1/2 the high
619 		 * water mark anyway, especially since under normal conditions
620 		 * the user program will drain the socket buffer quickly.
621 		 * The 2-segment pure window update will often add a large
622 		 * number of extra, unnecessary acks to the stream.
623 		 *
624 		 * avoid_pure_win_update now defaults to 1.
625 		 */
626 		if (avoid_pure_win_update == 0 ||
627 		    (tp->t_flags & TF_RXRESIZED)) {
628 			if (adv >= (long) (2 * segsz)) {
629 				goto send;
630 			}
631 		}
632 		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
633 		if (hiwat > (long)so->so_rcv.ssb_hiwat)
634 			hiwat = (long)so->so_rcv.ssb_hiwat;
635 		if (adv >= hiwat / 2)
636 			goto send;
637 	}
638 
639 	/*
640 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
641 	 * is also a catch-all for the retransmit timer timeout case.
642 	 */
643 	if (tp->t_flags & TF_ACKNOW)
644 		goto send;
645 	if ((flags & TH_RST) ||
646 	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
647 		goto send;
648 	if (SEQ_GT(tp->snd_up, tp->snd_una))
649 		goto send;
650 	/*
651 	 * If our state indicates that FIN should be sent
652 	 * and we have not yet done so, then we need to send.
653 	 */
654 	if ((flags & TH_FIN) &&
655 	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
656 		goto send;
657 
658 	/*
659 	 * TCP window updates are not reliable, rather a polling protocol
660 	 * using ``persist'' packets is used to ensure receipt of window
661 	 * updates.  The three ``states'' for the output side are:
662 	 *	idle			not doing retransmits or persists
663 	 *	persisting		to move a small or zero window
664 	 *	(re)transmitting	and thereby not persisting
665 	 *
666 	 * tcp_callout_active(tp, tp->tt_persist)
667 	 *	is true when we are in persist state.
668 	 * The TF_FORCE flag in tp->t_flags
669 	 *	is set when we are called to send a persist packet.
670 	 * tcp_callout_active(tp, tp->tt_rexmt)
671 	 *	is set when we are retransmitting
672 	 * The output side is idle when both timers are zero.
673 	 *
674 	 * If send window is too small, there is data to transmit, and no
675 	 * retransmit or persist is pending, then go to persist state.
676 	 *
677 	 * If nothing happens soon, send when timer expires:
678 	 * if window is nonzero, transmit what we can, otherwise force out
679 	 * a byte.
680 	 *
681 	 * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED
682 	 * with data pending.  This situation can occur during a
683 	 * simultaneous connect.
684 	 */
685 	if (so->so_snd.ssb_cc > 0 &&
686 	    tp->t_state != TCPS_SYN_RECEIVED &&
687 	    !tcp_callout_active(tp, tp->tt_rexmt) &&
688 	    !tcp_callout_active(tp, tp->tt_persist)) {
689 		tp->t_rxtshift = 0;
690 		tcp_setpersist(tp);
691 	}
692 
693 	/*
694 	 * No reason to send a segment, just return.
695 	 */
696 	tp->t_flags &= ~TF_XMITNOW;
697 	return (0);
698 
699 send:
700 	if (need_sched && len > 0) {
701 		tcp_output_sched(tp);
702 		return 0;
703 	}
704 
705 	/*
706 	 * Before ESTABLISHED, force sending of initial options
707 	 * unless TCP set not to do any options.
708 	 * NOTE: we assume that the IP/TCP header plus TCP options
709 	 * always fit in a single mbuf, leaving room for a maximum
710 	 * link header, i.e.
711 	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
712 	 */
713 	optlen = 0;
714 	if (isipv6)
715 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
716 	else
717 		hdrlen = sizeof(struct tcpiphdr);
718 	if (flags & TH_SYN) {
719 		tp->snd_nxt = tp->iss;
720 		if (!(tp->t_flags & TF_NOOPT)) {
721 			u_short mss;
722 
723 			opt[0] = TCPOPT_MAXSEG;
724 			opt[1] = TCPOLEN_MAXSEG;
725 			mss = htons((u_short) tcp_mssopt(tp));
726 			memcpy(opt + 2, &mss, sizeof mss);
727 			optlen = TCPOLEN_MAXSEG;
728 
729 			if ((tp->t_flags & TF_REQ_SCALE) &&
730 			    (!(flags & TH_ACK) ||
731 			     (tp->t_flags & TF_RCVD_SCALE))) {
732 				*((u_int32_t *)(opt + optlen)) = htonl(
733 					TCPOPT_NOP << 24 |
734 					TCPOPT_WINDOW << 16 |
735 					TCPOLEN_WINDOW << 8 |
736 					tp->request_r_scale);
737 				optlen += 4;
738 			}
739 
740 			if ((tcp_do_sack && !(flags & TH_ACK)) ||
741 			    tp->t_flags & TF_SACK_PERMITTED) {
742 				uint32_t *lp = (uint32_t *)(opt + optlen);
743 
744 				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
745 				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
746 			}
747 		}
748 	}
749 
750 	/*
751 	 * Send a timestamp and echo-reply if this is a SYN and our side
752 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
753 	 * and our peer have sent timestamps in our SYN's.
754 	 */
755 	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
756 	    !(flags & TH_RST) &&
757 	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
758 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
759 
760 		/* Form timestamp option as shown in appendix A of RFC 1323. */
761 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
762 		*lp++ = htonl(ticks);
763 		*lp   = htonl(tp->ts_recent);
764 		optlen += TCPOLEN_TSTAMP_APPA;
765 	}
766 
767 	/* Set receive buffer autosizing timestamp. */
768 	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
769 		tp->rfbuf_ts = ticks;
770 
771 	/*
772 	 * If this is a SACK connection and we have a block to report,
773 	 * fill in the SACK blocks in the TCP options.
774 	 */
775 	if (report_sack)
776 		tcp_sack_fill_report(tp, opt, &optlen);
777 
778 #ifdef TCP_SIGNATURE
779 	if (tp->t_flags & TF_SIGNATURE) {
780 		int i;
781 		u_char *bp;
782 		/*
783 		 * Initialize TCP-MD5 option (RFC2385)
784 		 */
785 		bp = (u_char *)opt + optlen;
786 		*bp++ = TCPOPT_SIGNATURE;
787 		*bp++ = TCPOLEN_SIGNATURE;
788 		sigoff = optlen + 2;
789 		for (i = 0; i < TCP_SIGLEN; i++)
790 			*bp++ = 0;
791 		optlen += TCPOLEN_SIGNATURE;
792 		/*
793 		 * Terminate options list and maintain 32-bit alignment.
794 		 */
795 		*bp++ = TCPOPT_NOP;
796 		*bp++ = TCPOPT_EOL;
797 		optlen += 2;
798 	}
799 #endif /* TCP_SIGNATURE */
800 	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
801 	hdrlen += optlen;
802 
803 	if (isipv6) {
804 		ipoptlen = ip6_optlen(inp);
805 	} else {
806 		if (inp->inp_options) {
807 			ipoptlen = inp->inp_options->m_len -
808 			    offsetof(struct ipoption, ipopt_list);
809 		} else {
810 			ipoptlen = 0;
811 		}
812 	}
813 
814 	if (use_tso) {
815 		/* TSO segment length must be multiple of segment size */
816 		KASSERT(len >= (2 * segsz) && (len % segsz == 0),
817 		    ("invalid TSO len %ld, segsz %u", len, segsz));
818 	} else {
819 		KASSERT(len <= segsz,
820 		    ("invalid len %ld, segsz %u", len, segsz));
821 
822 		/*
823 		 * Adjust data length if insertion of options will bump
824 		 * the packet length beyond the t_maxopd length.  Clear
825 		 * FIN to prevent premature closure since there is still
826 		 * more data to send after this (now truncated) packet.
827 		 *
828 		 * If just the options do not fit we are in a no-win
829 		 * situation and we treat it as an unreachable host.
830 		 */
831 		if (len + optlen + ipoptlen > tp->t_maxopd) {
832 			if (tp->t_maxopd <= optlen + ipoptlen) {
833 				static time_t last_optlen_report;
834 
835 				if (last_optlen_report != time_uptime) {
836 					last_optlen_report = time_uptime;
837 					kprintf("tcpcb %p: MSS (%d) too "
838 					    "small to hold options!\n",
839 					    tp, tp->t_maxopd);
840 				}
841 				error = EHOSTUNREACH;
842 				goto out;
843 			} else {
844 				flags &= ~TH_FIN;
845 				len = tp->t_maxopd - optlen - ipoptlen;
846 				sendalot = TRUE;
847 			}
848 		}
849 	}
850 
851 #ifdef INET6
852 	KASSERT(max_linkhdr + hdrlen <= MCLBYTES, ("tcphdr too big"));
853 #else
854 	KASSERT(max_linkhdr + hdrlen <= MHLEN, ("tcphdr too big"));
855 #endif
856 
857 	/*
858 	 * Grab a header mbuf, attaching a copy of data to
859 	 * be transmitted, and initialize the header from
860 	 * the template for sends on this connection.
861 	 */
862 	if (len) {
863 		if ((tp->t_flags & TF_FORCE) && len == 1)
864 			tcpstat.tcps_sndprobe++;
865 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
866 			if (tp->snd_nxt == tp->snd_una)
867 				tp->snd_max_rexmt = tp->snd_max;
868 			if (nsacked) {
869 				tcpstat.tcps_sndsackrtopack++;
870 				tcpstat.tcps_sndsackrtobyte += len;
871 			}
872 			tcpstat.tcps_sndrexmitpack++;
873 			tcpstat.tcps_sndrexmitbyte += len;
874 		} else {
875 			tcpstat.tcps_sndpack++;
876 			tcpstat.tcps_sndbyte += len;
877 		}
878 		if (idle_cwv) {
879 			idle_cwv = FALSE;
880 			tcp_idle_cwnd_validate(tp);
881 		}
882 		/* Update last send time after CWV */
883 		tp->snd_last = ticks;
884 #ifdef notyet
885 		if ((m = m_copypack(so->so_snd.ssb_mb, off, (int)len,
886 		    max_linkhdr + hdrlen)) == NULL) {
887 			error = ENOBUFS;
888 			goto after_th;
889 		}
890 		/*
891 		 * m_copypack left space for our hdr; use it.
892 		 */
893 		m->m_len += hdrlen;
894 		m->m_data -= hdrlen;
895 #else
896 #ifndef INET6
897 		m = m_gethdr(M_NOWAIT, MT_HEADER);
898 #else
899 		m = m_getl(hdrlen + max_linkhdr, M_NOWAIT, MT_HEADER,
900 			   M_PKTHDR, NULL);
901 #endif
902 		if (m == NULL) {
903 			error = ENOBUFS;
904 			goto after_th;
905 		}
906 		m->m_data += max_linkhdr;
907 		m->m_len = hdrlen;
908 		if (len <= MHLEN - hdrlen - max_linkhdr) {
909 			m_copydata(so->so_snd.ssb_mb, off, (int) len,
910 			    mtod(m, caddr_t) + hdrlen);
911 			m->m_len += len;
912 		} else {
913 			m->m_next = m_copy(so->so_snd.ssb_mb, off, (int) len);
914 			if (m->m_next == NULL) {
915 				m_free(m);
916 				m = NULL;
917 				error = ENOBUFS;
918 				goto after_th;
919 			}
920 		}
921 #endif
922 		/*
923 		 * If we're sending everything we've got, set PUSH.
924 		 * (This will keep happy those implementations which only
925 		 * give data to the user when a buffer fills or
926 		 * a PUSH comes in.)
927 		 */
928 		if (off + len == so->so_snd.ssb_cc)
929 			flags |= TH_PUSH;
930 	} else {
931 		if (tp->t_flags & TF_ACKNOW)
932 			tcpstat.tcps_sndacks++;
933 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
934 			tcpstat.tcps_sndctrl++;
935 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
936 			tcpstat.tcps_sndurg++;
937 		else
938 			tcpstat.tcps_sndwinup++;
939 
940 		MGETHDR(m, M_NOWAIT, MT_HEADER);
941 		if (m == NULL) {
942 			error = ENOBUFS;
943 			goto after_th;
944 		}
945 		if (isipv6 &&
946 		    (hdrlen + max_linkhdr > MHLEN) && hdrlen <= MHLEN)
947 			MH_ALIGN(m, hdrlen);
948 		else
949 			m->m_data += max_linkhdr;
950 		m->m_len = hdrlen;
951 
952 		/*
953 		 * Prioritize SYN, SYN|ACK and pure ACK.
954 		 * Leave FIN and RST as they are.
955 		 */
956 		if (tcp_prio_synack && (flags & (TH_FIN | TH_RST)) == 0)
957 			m->m_flags |= M_PRIO;
958 	}
959 	m->m_pkthdr.rcvif = NULL;
960 	if (isipv6) {
961 		ip6 = mtod(m, struct ip6_hdr *);
962 		th = (struct tcphdr *)(ip6 + 1);
963 		tcp_fillheaders(tp, ip6, th, use_tso);
964 	} else {
965 		ip = mtod(m, struct ip *);
966 		th = (struct tcphdr *)(ip + 1);
967 		/* this picks up the pseudo header (w/o the length) */
968 		tcp_fillheaders(tp, ip, th, use_tso);
969 	}
970 after_th:
971 	/*
972 	 * Fill in fields, remembering maximum advertised
973 	 * window for use in delaying messages about window sizes.
974 	 * If resending a FIN, be sure not to use a new sequence number.
975 	 */
976 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
977 	    tp->snd_nxt == tp->snd_max)
978 		tp->snd_nxt--;
979 
980 	if (th != NULL) {
981 		/*
982 		 * If we are doing retransmissions, then snd_nxt will
983 		 * not reflect the first unsent octet.  For ACK only
984 		 * packets, we do not want the sequence number of the
985 		 * retransmitted packet, we want the sequence number
986 		 * of the next unsent octet.  So, if there is no data
987 		 * (and no SYN or FIN), use snd_max instead of snd_nxt
988 		 * when filling in ti_seq.  But if we are in persist
989 		 * state, snd_max might reflect one byte beyond the
990 		 * right edge of the window, so use snd_nxt in that
991 		 * case, since we know we aren't doing a retransmission.
992 		 * (retransmit and persist are mutually exclusive...)
993 		 */
994 		if (len || (flags & (TH_SYN|TH_FIN)) ||
995 		    tcp_callout_active(tp, tp->tt_persist))
996 			th->th_seq = htonl(tp->snd_nxt);
997 		else
998 			th->th_seq = htonl(tp->snd_max);
999 		th->th_ack = htonl(tp->rcv_nxt);
1000 		if (optlen) {
1001 			bcopy(opt, th + 1, optlen);
1002 			th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1003 		}
1004 		th->th_flags = flags;
1005 	}
1006 
1007 	/*
1008 	 * Calculate receive window.  Don't shrink window, but avoid
1009 	 * silly window syndrome by sending a 0 window if the actual
1010 	 * window is less than one segment.
1011 	 */
1012 	if (recvwin < (long)(so->so_rcv.ssb_hiwat / 4) &&
1013 	    recvwin < (long)segsz)
1014 		recvwin = 0;
1015 	if (recvwin < (tcp_seq_diff_t)(tp->rcv_adv - tp->rcv_nxt))
1016 		recvwin = (tcp_seq_diff_t)(tp->rcv_adv - tp->rcv_nxt);
1017 	if (recvwin > (long)TCP_MAXWIN << tp->rcv_scale)
1018 		recvwin = (long)TCP_MAXWIN << tp->rcv_scale;
1019 
1020 	/*
1021 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
1022 	 * a 0 window.  This may cause the remote transmitter to stall.  This
1023 	 * flag tells soreceive() to disable delayed acknowledgements when
1024 	 * draining the buffer.  This can occur if the receiver is attempting
1025 	 * to read more data than can be buffered prior to transmitting on
1026 	 * the connection.
1027 	 */
1028 	if (recvwin == 0)
1029 		tp->t_flags |= TF_RXWIN0SENT;
1030 	else
1031 		tp->t_flags &= ~TF_RXWIN0SENT;
1032 
1033 	if (th != NULL)
1034 		th->th_win = htons((u_short) (recvwin>>tp->rcv_scale));
1035 
1036 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1037 		KASSERT(!use_tso, ("URG with TSO"));
1038 		if (th != NULL) {
1039 			th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
1040 			th->th_flags |= TH_URG;
1041 		}
1042 	} else {
1043 		/*
1044 		 * If no urgent pointer to send, then we pull
1045 		 * the urgent pointer to the left edge of the send window
1046 		 * so that it doesn't drift into the send window on sequence
1047 		 * number wraparound.
1048 		 */
1049 		tp->snd_up = tp->snd_una;		/* drag it along */
1050 	}
1051 
1052 	if (th != NULL) {
1053 #ifdef TCP_SIGNATURE
1054 		if (tp->t_flags & TF_SIGNATURE) {
1055 			tcpsignature_compute(m, len, optlen,
1056 			    (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
1057 		}
1058 #endif /* TCP_SIGNATURE */
1059 
1060 		/*
1061 		 * Put TCP length in extended header, and then
1062 		 * checksum extended header and data.
1063 		 */
1064 		m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
1065 		if (isipv6) {
1066 			/*
1067 			 * ip6_plen does not need to be filled now, and will be
1068 			 * filled in ip6_output().
1069 			 */
1070 			th->th_sum = in6_cksum(m, IPPROTO_TCP,
1071 			    sizeof(struct ip6_hdr),
1072 			    sizeof(struct tcphdr) + optlen + len);
1073 		} else {
1074 			m->m_pkthdr.csum_thlen = sizeof(struct tcphdr) + optlen;
1075 			if (use_tso) {
1076 				m->m_pkthdr.csum_flags = CSUM_TSO;
1077 				m->m_pkthdr.tso_segsz = segsz;
1078 			} else {
1079 				m->m_pkthdr.csum_flags = CSUM_TCP;
1080 				m->m_pkthdr.csum_data =
1081 				    offsetof(struct tcphdr, th_sum);
1082 				if (len + optlen) {
1083 					th->th_sum = in_addword(th->th_sum,
1084 					    htons((u_short)(optlen + len)));
1085 				}
1086 			}
1087 
1088 			/*
1089 			 * IP version must be set here for ipv4/ipv6 checking
1090 			 * later
1091 			 */
1092 			KASSERT(ip->ip_v == IPVERSION,
1093 			    ("%s: IP version incorrect: %d",
1094 			     __func__, ip->ip_v));
1095 		}
1096 	}
1097 
1098 	/*
1099 	 * In transmit state, time the transmission and arrange for
1100 	 * the retransmit.  In persist state, just set snd_max.
1101 	 */
1102 	if (!(tp->t_flags & TF_FORCE) ||
1103 	    !tcp_callout_active(tp, tp->tt_persist)) {
1104 		tcp_seq startseq = tp->snd_nxt;
1105 
1106 		/*
1107 		 * Advance snd_nxt over sequence space of this segment.
1108 		 */
1109 		if (flags & (TH_SYN | TH_FIN)) {
1110 			if (flags & TH_SYN)
1111 				tp->snd_nxt++;
1112 			if (flags & TH_FIN) {
1113 				tp->snd_nxt++;
1114 				tp->t_flags |= TF_SENTFIN;
1115 			}
1116 		}
1117 		tp->snd_nxt += len;
1118 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1119 			tp->snd_max = tp->snd_nxt;
1120 			/*
1121 			 * Time this transmission if not a retransmission and
1122 			 * not currently timing anything.
1123 			 */
1124 			if (tp->t_rtttime == 0) {
1125 				tp->t_rtttime = ticks;
1126 				tp->t_rtseq = startseq;
1127 				tcpstat.tcps_segstimed++;
1128 			}
1129 		}
1130 
1131 		/*
1132 		 * Set retransmit timer if not currently set,
1133 		 * and not doing a pure ack or a keep-alive probe.
1134 		 * Initial value for retransmit timer is smoothed
1135 		 * round-trip time + 2 * round-trip time variance.
1136 		 * Initialize shift counter which is used for backoff
1137 		 * of retransmit time.
1138 		 */
1139 		if (!tcp_callout_active(tp, tp->tt_rexmt) &&
1140 		    tp->snd_nxt != tp->snd_una) {
1141 			if (tcp_callout_active(tp, tp->tt_persist)) {
1142 				tcp_callout_stop(tp, tp->tt_persist);
1143 				tp->t_rxtshift = 0;
1144 			}
1145 			tcp_callout_reset(tp, tp->tt_rexmt, tp->t_rxtcur,
1146 			    tcp_timer_rexmt);
1147 		} else if (len == 0 && so->so_snd.ssb_cc &&
1148 			   tp->t_state > TCPS_SYN_RECEIVED &&
1149 			   !tcp_callout_active(tp, tp->tt_rexmt) &&
1150 			   !tcp_callout_active(tp, tp->tt_persist)) {
1151 			/*
1152 			 * Avoid a situation where we do not set persist timer
1153 			 * after a zero window condition. For example:
1154 			 * 1) A -> B: packet with enough data to fill the window
1155 			 * 2) B -> A: ACK for #1 + new data (0 window
1156 			 *    advertisement)
1157 			 * 3) A -> B: ACK for #2, 0 len packet
1158 			 *
1159 			 * In this case, A will not activate the persist timer,
1160 			 * because it chose to send a packet. Unless tcp_output
1161 			 * is called for some other reason (delayed ack timer,
1162 			 * another input packet from B, socket syscall), A will
1163 			 * not send zero window probes.
1164 			 *
1165 			 * So, if you send a 0-length packet, but there is data
1166 			 * in the socket buffer, and neither the rexmt or
1167 			 * persist timer is already set, then activate the
1168 			 * persist timer.
1169 			 */
1170 			tp->t_rxtshift = 0;
1171 			tcp_setpersist(tp);
1172 		}
1173 	} else {
1174 		/*
1175 		 * Persist case, update snd_max but since we are in
1176 		 * persist mode (no window) we do not update snd_nxt.
1177 		 */
1178 		int xlen = len;
1179 		if (flags & TH_SYN)
1180 			panic("tcp_output: persist timer to send SYN");
1181 		if (flags & TH_FIN) {
1182 			++xlen;
1183 			tp->t_flags |= TF_SENTFIN;
1184 		}
1185 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
1186 			tp->snd_max = tp->snd_nxt + xlen;
1187 	}
1188 
1189 	if (th != NULL) {
1190 #ifdef TCPDEBUG
1191 		/* Trace. */
1192 		if (so->so_options & SO_DEBUG) {
1193 			tcp_trace(TA_OUTPUT, tp->t_state, tp,
1194 			    mtod(m, void *), th, 0);
1195 		}
1196 #endif
1197 
1198 		/*
1199 		 * Fill in IP length and desired time to live and
1200 		 * send to IP level.  There should be a better way
1201 		 * to handle ttl and tos; we could keep them in
1202 		 * the template, but need a way to checksum without them.
1203 		 */
1204 		/*
1205 		 * m->m_pkthdr.len should have been set before cksum
1206 		 * calculation, because in6_cksum() needs it.
1207 		 */
1208 		if (isipv6) {
1209 			/*
1210 			 * we separately set hoplimit for every segment,
1211 			 * since the user might want to change the value
1212 			 * via setsockopt.  Also, desired default hop
1213 			 * limit might be changed via Neighbor Discovery.
1214 			 */
1215 			ip6->ip6_hlim = in6_selecthlim(inp,
1216 			    (inp->in6p_route.ro_rt ?
1217 			     inp->in6p_route.ro_rt->rt_ifp : NULL));
1218 
1219 			/* TODO: IPv6 IP6TOS_ECT bit on */
1220 			error = ip6_output(m, inp->in6p_outputopts,
1221 			    &inp->in6p_route, (so->so_options & SO_DONTROUTE),
1222 			    NULL, NULL, inp);
1223 		} else {
1224 			struct rtentry *rt;
1225 
1226 			KASSERT(!INP_CHECK_SOCKAF(so, AF_INET6), ("inet6 pcb"));
1227 
1228 			ip->ip_len = m->m_pkthdr.len;
1229 			ip->ip_ttl = inp->inp_ip_ttl;	/* XXX */
1230 			ip->ip_tos = inp->inp_ip_tos;	/* XXX */
1231 			/*
1232 			 * See if we should do MTU discovery.
1233 			 * We do it only if the following are true:
1234 			 *	1) we have a valid route to the destination
1235 			 *	2) the MTU is not locked (if it is,
1236 			 *	   then discovery has been disabled)
1237 			 */
1238 			if (path_mtu_discovery &&
1239 			    (rt = inp->inp_route.ro_rt) &&
1240 			    (rt->rt_flags & RTF_UP) &&
1241 			    !(rt->rt_rmx.rmx_locks & RTV_MTU))
1242 				ip->ip_off |= IP_DF;
1243 
1244 			KASSERT(inp->inp_flags & INP_HASH,
1245 			    ("inpcb has no hash"));
1246 			m_sethash(m, inp->inp_hashval);
1247 			error = ip_output(m, inp->inp_options, &inp->inp_route,
1248 					  (so->so_options & SO_DONTROUTE) |
1249 					  IP_DEBUGROUTE, NULL, inp);
1250 		}
1251 	} else {
1252 		KASSERT(error != 0, ("no error, but th not set"));
1253 	}
1254 	if (error) {
1255 		tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
1256 
1257 		/*
1258 		 * We know that the packet was lost, so back out the
1259 		 * sequence number advance, if any.
1260 		 */
1261 		if (!(tp->t_flags & TF_FORCE) ||
1262 		    !tcp_callout_active(tp, tp->tt_persist)) {
1263 			/*
1264 			 * No need to check for TH_FIN here because
1265 			 * the TF_SENTFIN flag handles that case.
1266 			 */
1267 			if (!(flags & TH_SYN))
1268 				tp->snd_nxt -= len;
1269 		}
1270 
1271 out:
1272 		if (error == ENOBUFS) {
1273 			KASSERT((len == 0 && (flags & (TH_SYN | TH_FIN)) == 0) ||
1274 			    tcp_callout_active(tp, tp->tt_rexmt) ||
1275 			    tcp_callout_active(tp, tp->tt_persist),
1276 			    ("neither rexmt nor persist timer is set"));
1277 			return (0);
1278 		}
1279 		if (error == EMSGSIZE) {
1280 			/*
1281 			 * ip_output() will have already fixed the route
1282 			 * for us.  tcp_mtudisc() will, as its last action,
1283 			 * initiate retransmission, so it is important to
1284 			 * not do so here.
1285 			 */
1286 			tcp_mtudisc(inp, 0);
1287 			return 0;
1288 		}
1289 		if ((error == EHOSTUNREACH || error == ENETDOWN) &&
1290 		    TCPS_HAVERCVDSYN(tp->t_state)) {
1291 			tp->t_softerror = error;
1292 			return (0);
1293 		}
1294 		return (error);
1295 	}
1296 	tcpstat.tcps_sndtotal++;
1297 
1298 	/*
1299 	 * Data sent (as far as we can tell).
1300 	 *
1301 	 * If this advertises a larger window than any other segment,
1302 	 * then remember the size of the advertised window.
1303 	 *
1304 	 * Any pending ACK has now been sent.
1305 	 */
1306 	if (recvwin > 0 && SEQ_GT(tp->rcv_nxt + recvwin, tp->rcv_adv)) {
1307 		tp->rcv_adv = tp->rcv_nxt + recvwin;
1308 		tp->t_flags &= ~TF_RXRESIZED;
1309 	}
1310 	tp->last_ack_sent = tp->rcv_nxt;
1311 	tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
1312 	if (tcp_delack_enabled)
1313 		tcp_callout_stop(tp, tp->tt_delack);
1314 	if (sendalot) {
1315 		if (tcp_fairsend > 0 && (tp->t_flags & TF_FAIRSEND) &&
1316 		    segcnt >= tcp_fairsend)
1317 			need_sched = TRUE;
1318 		goto again;
1319 	}
1320 	return (0);
1321 }
1322 
1323 void
1324 tcp_setpersist(struct tcpcb *tp)
1325 {
1326 	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
1327 	int tt;
1328 
1329 	if (tp->t_state == TCPS_SYN_SENT ||
1330 	    tp->t_state == TCPS_SYN_RECEIVED) {
1331 		panic("tcp_setpersist: not established yet, current %s",
1332 		      tp->t_state == TCPS_SYN_SENT ?
1333 		      "SYN_SENT" : "SYN_RECEIVED");
1334 	}
1335 
1336 	if (tcp_callout_active(tp, tp->tt_rexmt))
1337 		panic("tcp_setpersist: retransmit pending");
1338 	/*
1339 	 * Start/restart persistance timer.
1340 	 */
1341 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN,
1342 		      TCPTV_PERSMAX);
1343 	tcp_callout_reset(tp, tp->tt_persist, tt, tcp_timer_persist);
1344 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1345 		tp->t_rxtshift++;
1346 }
1347 
1348 static void
1349 tcp_idle_cwnd_validate(struct tcpcb *tp)
1350 {
1351 	u_long initial_cwnd = tcp_initial_window(tp);
1352 	u_long min_cwnd;
1353 
1354 	tcpstat.tcps_sndidle++;
1355 
1356 	/* According to RFC5681: RW=min(IW,cwnd) */
1357 	min_cwnd = min(tp->snd_cwnd, initial_cwnd);
1358 
1359 	if (tcp_idle_cwv) {
1360 		u_long idle_time, decay_cwnd;
1361 
1362 		/*
1363 		 * RFC2861, but only after idle period.
1364 		 */
1365 
1366 		/*
1367 		 * Before the congestion window is reduced, ssthresh
1368 		 * is set to the maximum of its current value and 3/4
1369 		 * cwnd.  If the sender then has more data to send
1370 		 * than the decayed cwnd allows, the TCP will slow-
1371 		 * start (perform exponential increase) at least
1372 		 * half-way back up to the old value of cwnd.
1373 		 */
1374 		tp->snd_ssthresh = max(tp->snd_ssthresh,
1375 		    (3 * tp->snd_cwnd) / 4);
1376 
1377 		/*
1378 		 * Decay the congestion window by half for every RTT
1379 		 * that the flow remains inactive.
1380 		 *
1381 		 * The difference between our implementation and
1382 		 * RFC2861 is that we don't allow cwnd to go below
1383 		 * the value allowed by RFC5681 (min_cwnd).
1384 		 */
1385 		idle_time = ticks - tp->snd_last;
1386 		decay_cwnd = tp->snd_cwnd;
1387 		while (idle_time >= tp->t_rxtcur &&
1388 		    decay_cwnd > min_cwnd) {
1389 			decay_cwnd >>= 1;
1390 			idle_time -= tp->t_rxtcur;
1391 		}
1392 		tp->snd_cwnd = max(decay_cwnd, min_cwnd);
1393 	} else {
1394 		/*
1395 		 * Slow-start from scratch to re-determine the send
1396 		 * congestion window.
1397 		 */
1398 		tp->snd_cwnd = min_cwnd;
1399 	}
1400 
1401 	/* Restart ABC counting during congestion avoidance */
1402 	tp->snd_wacked = 0;
1403 }
1404 
1405 static int
1406 tcp_tso_getsize(struct tcpcb *tp, u_int *segsz, u_int *hlen0)
1407 {
1408 	struct inpcb * const inp = tp->t_inpcb;
1409 #ifdef INET6
1410 	const boolean_t isipv6 = INP_ISIPV6(inp);
1411 #else
1412 	const boolean_t isipv6 = FALSE;
1413 #endif
1414 	unsigned int ipoptlen, optlen;
1415 	u_int hlen;
1416 
1417 	hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1418 
1419 	if (isipv6) {
1420 		ipoptlen = ip6_optlen(inp);
1421 	} else {
1422 		if (inp->inp_options) {
1423 			ipoptlen = inp->inp_options->m_len -
1424 			    offsetof(struct ipoption, ipopt_list);
1425 		} else {
1426 			ipoptlen = 0;
1427 		}
1428 	}
1429 	hlen += ipoptlen;
1430 
1431 	optlen = 0;
1432 	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
1433 	    (tp->t_flags & TF_RCVD_TSTMP))
1434 		optlen += TCPOLEN_TSTAMP_APPA;
1435 	hlen += optlen;
1436 
1437 	if (tp->t_maxopd <= optlen + ipoptlen)
1438 		return EHOSTUNREACH;
1439 
1440 	*segsz = tp->t_maxopd - optlen - ipoptlen;
1441 	*hlen0 = hlen;
1442 	return 0;
1443 }
1444 
1445 static void
1446 tcp_output_sched_handler(netmsg_t nmsg)
1447 {
1448 	struct tcpcb *tp = nmsg->lmsg.u.ms_resultp;
1449 
1450 	/* Reply ASAP */
1451 	crit_enter();
1452 	lwkt_replymsg(&nmsg->lmsg, 0);
1453 	crit_exit();
1454 
1455 	tcp_output_fair(tp);
1456 }
1457 
1458 void
1459 tcp_output_init(struct tcpcb *tp)
1460 {
1461 	netmsg_init(tp->tt_sndmore, NULL, &netisr_adone_rport, MSGF_DROPABLE,
1462 	    tcp_output_sched_handler);
1463 	tp->tt_sndmore->lmsg.u.ms_resultp = tp;
1464 }
1465 
1466 void
1467 tcp_output_cancel(struct tcpcb *tp)
1468 {
1469 	/*
1470 	 * This message is still pending to be processed;
1471 	 * drop it.  Optimized.
1472 	 */
1473 	crit_enter();
1474 	if ((tp->tt_sndmore->lmsg.ms_flags & MSGF_DONE) == 0) {
1475 		lwkt_dropmsg(&tp->tt_sndmore->lmsg);
1476 	}
1477 	crit_exit();
1478 }
1479 
1480 boolean_t
1481 tcp_output_pending(struct tcpcb *tp)
1482 {
1483 	if ((tp->tt_sndmore->lmsg.ms_flags & MSGF_DONE) == 0)
1484 		return TRUE;
1485 	else
1486 		return FALSE;
1487 }
1488 
1489 static void
1490 tcp_output_sched(struct tcpcb *tp)
1491 {
1492 	crit_enter();
1493 	if (tp->tt_sndmore->lmsg.ms_flags & MSGF_DONE)
1494 		lwkt_sendmsg(netisr_cpuport(mycpuid), &tp->tt_sndmore->lmsg);
1495 	crit_exit();
1496 }
1497 
1498 /*
1499  * Fairsend
1500  *
1501  * Yield to other senders or receivers on the same netisr if the current
1502  * TCP stream has sent tcp_fairsend segments and is going to burst more
1503  * segments.  Bursting large amount of segements in a single TCP stream
1504  * could delay other senders' segments and receivers' ACKs quite a lot,
1505  * if others segments and ACKs are queued on to the same hardware transmit
1506  * queue; thus cause unfairness between senders and suppress receiving
1507  * performance.
1508  *
1509  * Fairsend should be performed at the places that do not affect segment
1510  * sending during congestion control, e.g.
1511  * - User requested output
1512  * - ACK input triggered output
1513  *
1514  * NOTE:
1515  * For devices that are TSO capable, their TSO aggregation size limit could
1516  * affect fairsend.
1517  */
1518 int
1519 tcp_output_fair(struct tcpcb *tp)
1520 {
1521 	int ret;
1522 
1523 	tp->t_flags |= TF_FAIRSEND;
1524 	ret = tcp_output(tp);
1525 	tp->t_flags &= ~TF_FAIRSEND;
1526 
1527 	return ret;
1528 }
1529