1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
25  * Copyright 2019 Joyent, Inc.
26  */
27 
28 /* This file contains all TCP output processing functions. */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/strsun.h>
33 #include <sys/strsubr.h>
34 #include <sys/stropts.h>
35 #include <sys/strlog.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/suntpi.h>
39 #include <sys/xti_inet.h>
40 #include <sys/timod.h>
41 #include <sys/pattr.h>
42 #include <sys/squeue_impl.h>
43 #include <sys/squeue.h>
44 #include <sys/sockio.h>
45 #include <sys/tsol/tnet.h>
46 
47 #include <inet/common.h>
48 #include <inet/ip.h>
49 #include <inet/tcp.h>
50 #include <inet/tcp_impl.h>
51 #include <inet/snmpcom.h>
52 #include <inet/proto_set.h>
53 #include <inet/ipsec_impl.h>
54 #include <inet/ip_ndp.h>
55 
56 static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
57 static void	tcp_wput_cmdblk(queue_t *, mblk_t *);
58 static void	tcp_wput_flush(tcp_t *, mblk_t *);
59 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
60 static int	tcp_xmit_end(tcp_t *);
61 static int	tcp_send(tcp_t *, const int, const int, const int,
62 		    const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
63 static void	tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
64 		    int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
65 static boolean_t	tcp_send_rst_chk(tcp_stack_t *);
66 static void	tcp_process_shrunk_swnd(tcp_t *, uint32_t);
67 static void	tcp_fill_header(tcp_t *, uchar_t *, int);
68 
69 /*
70  * Functions called directly via squeue having a prototype of edesc_t.
71  */
72 static void	tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void	tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
74 static void	tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
75 
76 /*
77  * This controls how tiny a write must be before we try to copy it
78  * into the mblk on the tail of the transmit queue.  Not much
79  * speedup is observed for values larger than sixteen.  Zero will
80  * disable the optimisation.
81  */
82 static int tcp_tx_pull_len = 16;
83 
84 int
85 tcp_wput(queue_t *q, mblk_t *mp)
86 {
87 	conn_t	*connp = Q_TO_CONN(q);
88 	tcp_t	*tcp;
89 	void (*output_proc)();
90 	t_scalar_t type;
91 	uchar_t *rptr;
92 	struct iocblk	*iocp;
93 	size_t size;
94 
95 	ASSERT(connp->conn_ref >= 2);
96 
97 	switch (DB_TYPE(mp)) {
98 	case M_DATA:
99 		tcp = connp->conn_tcp;
100 		ASSERT(tcp != NULL);
101 
102 		size = msgdsize(mp);
103 
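		/*
		 * Charge the payload size to tcp_squeue_bytes here;
		 * tcp_output() credits it back once the mblk has been
		 * processed on the squeue.
		 */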
104 		mutex_enter(&tcp->tcp_non_sq_lock);
105 		tcp->tcp_squeue_bytes += size;
106 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
107 			tcp_setqfull(tcp);
108 		}
109 		mutex_exit(&tcp->tcp_non_sq_lock);
110 
111 		CONN_INC_REF(connp);
112 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
113 		    NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
114 		return (0);
115 
116 	case M_CMD:
117 		tcp_wput_cmdblk(q, mp);
118 		return (0);
119 
120 	case M_PROTO:
121 	case M_PCPROTO:
122 		/*
		 * If it is an SNMP message, don't get behind the squeue.
124 		 */
125 		tcp = connp->conn_tcp;
126 		rptr = mp->b_rptr;
127 		if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
128 			type = ((union T_primitives *)rptr)->type;
129 		} else {
130 			if (connp->conn_debug) {
131 				(void) strlog(TCP_MOD_ID, 0, 1,
132 				    SL_ERROR|SL_TRACE,
133 				    "tcp_wput_proto, dropping one...");
134 			}
135 			freemsg(mp);
136 			return (0);
137 		}
138 		if (type == T_SVR4_OPTMGMT_REQ) {
139 			/*
140 			 * All Solaris components should pass a db_credp
141 			 * for this TPI message, hence we ASSERT.
142 			 * But in case there is some other M_PROTO that looks
143 			 * like a TPI message sent by some other kernel
144 			 * component, we check and return an error.
145 			 */
146 			cred_t	*cr = msg_getcred(mp, NULL);
147 
148 			ASSERT(cr != NULL);
149 			if (cr == NULL) {
150 				tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
151 				return (0);
152 			}
153 			if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get,
154 			    cr)) {
155 				/*
156 				 * This was a SNMP request
157 				 */
158 				return (0);
159 			} else {
160 				output_proc = tcp_wput_proto;
161 			}
162 		} else {
163 			output_proc = tcp_wput_proto;
164 		}
165 		break;
166 	case M_IOCTL:
167 		/*
		 * Most ioctls can be processed right away without going via
		 * squeues - process them right here. Those that do require
		 * the squeue (currently _SIOCSOCKFALLBACK) are processed by
		 * tcp_wput_ioctl().
172 		 */
173 		iocp = (struct iocblk *)mp->b_rptr;
174 		tcp = connp->conn_tcp;
175 
176 		switch (iocp->ioc_cmd) {
177 		case TCP_IOC_ABORT_CONN:
178 			tcp_ioctl_abort_conn(q, mp);
179 			return (0);
180 		case TI_GETPEERNAME:
181 		case TI_GETMYNAME:
182 			mi_copyin(q, mp, NULL,
183 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
184 			return (0);
185 
186 		default:
187 			output_proc = tcp_wput_ioctl;
188 			break;
189 		}
190 		break;
191 	default:
192 		output_proc = tcp_wput_nondata;
193 		break;
194 	}
195 
196 	CONN_INC_REF(connp);
197 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
198 	    NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
199 	return (0);
200 }
201 
202 /*
203  * The TCP normal data output path.
204  * NOTE: the logic of the fast path is duplicated from this function.
205  */
206 void
207 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
208 {
209 	int		len;
210 	mblk_t		*local_time;
211 	mblk_t		*mp1;
212 	uint32_t	snxt;
213 	int		tail_unsent;
214 	int		tcpstate;
215 	int		usable = 0;
216 	mblk_t		*xmit_tail;
217 	int32_t		mss;
218 	int32_t		num_sack_blk = 0;
219 	int32_t		total_hdr_len;
220 	int32_t		tcp_hdr_len;
221 	int		rc;
222 	tcp_stack_t	*tcps = tcp->tcp_tcps;
223 	conn_t		*connp = tcp->tcp_connp;
224 	clock_t		now = LBOLT_FASTPATH;
225 
226 	tcpstate = tcp->tcp_state;
227 	if (mp == NULL) {
228 		/*
229 		 * tcp_wput_data() with NULL mp should only be called when
230 		 * there is unsent data.
231 		 */
232 		ASSERT(tcp->tcp_unsent > 0);
233 		/* Really tacky... but we need this for detached closes. */
234 		len = tcp->tcp_unsent;
235 		goto data_null;
236 	}
237 
238 	ASSERT(mp->b_datap->db_type == M_DATA);
239 	/*
240 	 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
241 	 * or before a connection attempt has begun.
242 	 */
243 	if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
244 	    (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
245 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
246 #ifdef DEBUG
247 			cmn_err(CE_WARN,
248 			    "tcp_wput_data: data after ordrel, %s",
249 			    tcp_display(tcp, NULL,
250 			    DISP_ADDR_AND_PORT));
251 #else
252 			if (connp->conn_debug) {
253 				(void) strlog(TCP_MOD_ID, 0, 1,
254 				    SL_TRACE|SL_ERROR,
255 				    "tcp_wput_data: data after ordrel, %s\n",
256 				    tcp_display(tcp, NULL,
257 				    DISP_ADDR_AND_PORT));
258 			}
259 #endif /* DEBUG */
260 		}
261 		if (tcp->tcp_snd_zcopy_aware &&
262 		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
263 			tcp_zcopy_notify(tcp);
264 		freemsg(mp);
265 		mutex_enter(&tcp->tcp_non_sq_lock);
266 		if (tcp->tcp_flow_stopped &&
267 		    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
268 			tcp_clrqfull(tcp);
269 		}
270 		mutex_exit(&tcp->tcp_non_sq_lock);
271 		return;
272 	}
273 
274 	/* Strip empties */
275 	for (;;) {
276 		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
277 		    (uintptr_t)INT_MAX);
278 		len = (int)(mp->b_wptr - mp->b_rptr);
279 		if (len > 0)
280 			break;
281 		mp1 = mp;
282 		mp = mp->b_cont;
283 		freeb(mp1);
284 		if (mp == NULL) {
285 			return;
286 		}
287 	}
288 
289 	/* If we are the first on the list ... */
290 	if (tcp->tcp_xmit_head == NULL) {
291 		tcp->tcp_xmit_head = mp;
292 		tcp->tcp_xmit_tail = mp;
293 		tcp->tcp_xmit_tail_unsent = len;
294 	} else {
295 		/* If tiny tx and room in txq tail, pullup to save mblks. */
296 		struct datab *dp;
297 
298 		mp1 = tcp->tcp_xmit_last;
299 		if (len < tcp_tx_pull_len &&
300 		    (dp = mp1->b_datap)->db_ref == 1 &&
301 		    dp->db_lim - mp1->b_wptr >= len) {
302 			ASSERT(len > 0);
303 			ASSERT(!mp1->b_cont);
304 			if (len == 1) {
305 				*mp1->b_wptr++ = *mp->b_rptr;
306 			} else {
307 				bcopy(mp->b_rptr, mp1->b_wptr, len);
308 				mp1->b_wptr += len;
309 			}
310 			if (mp1 == tcp->tcp_xmit_tail)
311 				tcp->tcp_xmit_tail_unsent += len;
312 			mp1->b_cont = mp->b_cont;
313 			if (tcp->tcp_snd_zcopy_aware &&
314 			    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
315 				mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
316 			freeb(mp);
317 			mp = mp1;
318 		} else {
319 			tcp->tcp_xmit_last->b_cont = mp;
320 		}
321 		len += tcp->tcp_unsent;
322 	}
323 
324 	/* Tack on however many more positive length mblks we have */
325 	if ((mp1 = mp->b_cont) != NULL) {
326 		do {
327 			int tlen;
328 			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
329 			    (uintptr_t)INT_MAX);
330 			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
331 			if (tlen <= 0) {
332 				mp->b_cont = mp1->b_cont;
333 				freeb(mp1);
334 			} else {
335 				len += tlen;
336 				mp = mp1;
337 			}
338 		} while ((mp1 = mp->b_cont) != NULL);
339 	}
340 	tcp->tcp_xmit_last = mp;
341 	tcp->tcp_unsent = len;
342 
343 	if (urgent)
344 		usable = 1;
345 
346 data_null:
347 	snxt = tcp->tcp_snxt;
348 	xmit_tail = tcp->tcp_xmit_tail;
349 	tail_unsent = tcp->tcp_xmit_tail_unsent;
350 
351 	/*
352 	 * Note that tcp_mss has been adjusted to take into account the
353 	 * timestamp option if applicable.  Because SACK options do not
	 * appear in every TCP segment and they are of variable length,
355 	 * they cannot be included in tcp_mss.  Thus we need to calculate
356 	 * the actual segment length when we need to send a segment which
357 	 * includes SACK options.
358 	 */
359 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
360 		int32_t	opt_len;
361 
362 		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
363 		    tcp->tcp_num_sack_blk);
364 		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
365 		    2 + TCPOPT_HEADER_LEN;
366 		mss = tcp->tcp_mss - opt_len;
367 		total_hdr_len = connp->conn_ht_iphc_len + opt_len;
368 		tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
369 	} else {
370 		mss = tcp->tcp_mss;
371 		total_hdr_len = connp->conn_ht_iphc_len;
372 		tcp_hdr_len = connp->conn_ht_ulp_len;
373 	}
374 
375 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
376 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
377 		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
378 	}
379 	if (tcpstate == TCPS_SYN_RCVD) {
380 		/*
381 		 * The three-way connection establishment handshake is not
382 		 * complete yet. We want to queue the data for transmission
		 * after entering ESTABLISHED state (RFC 793).  A jump to the
		 * "done" label effectively leaves data on the queue.
385 		 */
386 		goto done;
387 	} else {
388 		int usable_r;
389 
390 		/*
391 		 * In the special case when cwnd is zero, which can only
392 		 * happen if the connection is ECN capable, return now.
		 * New segments are sent using tcp_timer().  The timer
394 		 * is set in tcp_input_data().
395 		 */
396 		if (tcp->tcp_cwnd == 0) {
397 			/*
398 			 * Note that tcp_cwnd is 0 before 3-way handshake is
399 			 * finished.
400 			 */
401 			ASSERT(tcp->tcp_ecn_ok ||
402 			    tcp->tcp_state < TCPS_ESTABLISHED);
403 			return;
404 		}
405 
406 		/* NOTE: trouble if xmitting while SYN not acked? */
407 		usable_r = snxt - tcp->tcp_suna;
408 		usable_r = tcp->tcp_swnd - usable_r;
409 
410 		/*
411 		 * Check if the receiver has shrunk the window.  If
412 		 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
413 		 * cannot be set as there is unsent data, so FIN cannot
		 * be sent out.  Otherwise, we need to take the FIN into
		 * account as it consumes an "invisible" sequence number.
416 		 */
417 		ASSERT(tcp->tcp_fin_sent == 0);
418 		if (usable_r < 0) {
419 			/*
			 * The receiver has shrunk the window and we have sent
			 * -usable_r bytes of data beyond the window; re-adjust.
			 *
			 * If TCP window scaling is enabled, there can be a
			 * round-down error as the advertised receive window
425 			 * is actually right shifted n bits.  This means that
426 			 * the lower n bits info is wiped out.  It will look
427 			 * like the window is shrunk.  Do a check here to
428 			 * see if the shrunk amount is actually within the
429 			 * error in window calculation.  If it is, just
430 			 * return.  Note that this check is inside the
431 			 * shrunk window check.  This makes sure that even
432 			 * though tcp_process_shrunk_swnd() is not called,
433 			 * we will stop further processing.
434 			 */
435 			if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
436 				tcp_process_shrunk_swnd(tcp, -usable_r);
437 			}
438 			return;
439 		}
440 
441 		/* usable = MIN(swnd, cwnd) - unacked_bytes */
442 		if (tcp->tcp_swnd > tcp->tcp_cwnd)
443 			usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
444 
445 		/* usable = MIN(usable, unsent) */
446 		if (usable_r > len)
447 			usable_r = len;
448 
449 		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
450 		if (usable_r > 0) {
451 			usable = usable_r;
452 		} else {
453 			/* Bypass all other unnecessary processing. */
454 			goto done;
455 		}
456 	}
457 
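	/*
	 * Snapshot the send time once; tcp_send() stashes it in the b_prev
	 * field of each transmitted mblk so the RTT can be estimated when
	 * the data is acknowledged.
	 */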
458 	local_time = (mblk_t *)(intptr_t)gethrtime();
459 
460 	/*
461 	 * "Our" Nagle Algorithm.  This is not the same as in the old
462 	 * BSD.  This is more in line with the true intent of Nagle.
463 	 *
464 	 * The conditions are:
465 	 * 1. The amount of unsent data (or amount of data which can be
466 	 *    sent, whichever is smaller) is less than Nagle limit.
467 	 * 2. The last sent size is also less than Nagle limit.
468 	 * 3. There is unack'ed data.
469 	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
470 	 *    Nagle algorithm.  This reduces the probability that urgent
471 	 *    bytes get "merged" together.
472 	 * 5. The app has not closed the connection.  This eliminates the
473 	 *    wait time of the receiving side waiting for the last piece of
474 	 *    (small) data.
475 	 *
	 * If all are satisfied, exit without sending anything.  Note
	 * that the Nagle limit can be smaller than 1 MSS.  The Nagle
	 * limit is the smaller of 1 MSS and the global tcp_naglim_def
	 * (default 4095).
480 	 */
481 	if (usable < (int)tcp->tcp_naglim &&
482 	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
483 	    snxt != tcp->tcp_suna &&
484 	    !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
485 	    !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
486 		goto done;
487 	}
488 
489 	/*
490 	 * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
	 * is set, then we have to force TCP not to send partial segments
	 * (smaller than MSS bytes).  We calculate the usable window now
	 * based on full mss and save the remaining data for later.  When
	 * tcp_zero_win_probe is set, TCP needs to send out something to
	 * do a zero window probe.
496 	 */
497 	if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) {
498 		if (usable < mss)
499 			goto done;
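		/* Round usable down to a whole number of full-sized segments. */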
500 		usable = (usable / mss) * mss;
501 	}
502 
503 	/* Update the latest receive window size in TCP header. */
504 	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
505 
506 	/* Send the packet. */
507 	rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
508 	    num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
509 	    local_time);
510 
511 	/* Pretend that all we were trying to send really got sent */
512 	if (rc < 0 && tail_unsent < 0) {
513 		do {
514 			xmit_tail = xmit_tail->b_cont;
515 			xmit_tail->b_prev = local_time;
516 			ASSERT((uintptr_t)(xmit_tail->b_wptr -
517 			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
518 			tail_unsent += (int)(xmit_tail->b_wptr -
519 			    xmit_tail->b_rptr);
520 		} while (tail_unsent < 0);
521 	}
522 done:;
523 	tcp->tcp_xmit_tail = xmit_tail;
524 	tcp->tcp_xmit_tail_unsent = tail_unsent;
525 	len = tcp->tcp_snxt - snxt;
526 	if (len) {
527 		/*
528 		 * If new data was sent, need to update the notsack
		 * list, which is, after all, the data blocks that have
530 		 * not been sack'ed by the receiver.  New data is
531 		 * not sack'ed.
532 		 */
533 		if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
534 			/* len is a negative value. */
535 			tcp->tcp_pipe -= len;
536 			tcp_notsack_update(&(tcp->tcp_notsack_list),
537 			    tcp->tcp_snxt, snxt,
538 			    &(tcp->tcp_num_notsack_blk),
539 			    &(tcp->tcp_cnt_notsack_list));
540 		}
541 		tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
542 		tcp->tcp_rack = tcp->tcp_rnxt;
543 		tcp->tcp_rack_cnt = 0;
544 		if ((snxt + len) == tcp->tcp_suna) {
545 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
546 		}
547 	} else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
548 		/*
549 		 * Didn't send anything. Make sure the timer is running
550 		 * so that we will probe a zero window.
551 		 */
552 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
553 	}
554 	/* Note that len is the amount we just sent but with a negative sign */
555 	tcp->tcp_unsent += len;
556 	mutex_enter(&tcp->tcp_non_sq_lock);
557 	if (tcp->tcp_flow_stopped) {
558 		if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
559 			tcp_clrqfull(tcp);
560 		}
561 	} else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
562 		if (!(tcp->tcp_detached))
563 			tcp_setqfull(tcp);
564 	}
565 	mutex_exit(&tcp->tcp_non_sq_lock);
566 }
567 
568 /*
569  * Initial STREAMS write side put() procedure for sockets. It tries to
570  * handle the T_CAPABILITY_REQ which sockfs sends down while setting
571  * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
572  * are handled by tcp_wput() as usual.
573  *
574  * All further messages will also be handled by tcp_wput() because we cannot
 * be sure that the above shortcut is safe later.
576  */
577 int
578 tcp_wput_sock(queue_t *wq, mblk_t *mp)
579 {
580 	conn_t			*connp = Q_TO_CONN(wq);
581 	tcp_t			*tcp = connp->conn_tcp;
582 	struct T_capability_req	*car = (struct T_capability_req *)mp->b_rptr;
583 
584 	ASSERT(wq->q_qinfo == &tcp_sock_winit);
585 	wq->q_qinfo = &tcp_winit;
586 
587 	ASSERT(IPCL_IS_TCP(connp));
588 	ASSERT(TCP_IS_SOCKET(tcp));
589 
590 	if (DB_TYPE(mp) == M_PCPROTO &&
591 	    MBLKL(mp) == sizeof (struct T_capability_req) &&
592 	    car->PRIM_type == T_CAPABILITY_REQ) {
593 		tcp_capability_req(tcp, mp);
594 		return (0);
595 	}
596 
597 	tcp_wput(wq, mp);
598 	return (0);
599 }
600 
601 /* ARGSUSED */
602 int
603 tcp_wput_fallback(queue_t *wq, mblk_t *mp)
604 {
605 #ifdef DEBUG
606 	cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
607 #endif
608 	freemsg(mp);
609 	return (0);
610 }
611 
612 /*
 * Called by tcp_wput() to handle miscellaneous non-M_DATA messages.
614  */
615 /* ARGSUSED */
616 static void
617 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
618 {
619 	conn_t	*connp = (conn_t *)arg;
620 	tcp_t	*tcp = connp->conn_tcp;
621 
622 	ASSERT(DB_TYPE(mp) != M_IOCTL);
623 	/*
	 * TCP is D_MP and qprocsoff() is done towards the end of tcp_close.
	 * Once the close starts, streamhead and sockfs will not let any data
	 * packets come down (close ensures that there are no threads using the
	 * queue and no new threads will come down) but since qprocsoff()
	 * hasn't happened yet, an M_FLUSH or some non-data message might
	 * get reflected back (in response to our own FLUSHRW) and get
	 * processed after tcp_close() is done. The conn would still be valid
	 * because a ref would have been added but we need to check the state
	 * before actually processing the packet.
633 	 */
634 	if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
635 		freemsg(mp);
636 		return;
637 	}
638 
639 	switch (DB_TYPE(mp)) {
640 	case M_IOCDATA:
641 		tcp_wput_iocdata(tcp, mp);
642 		break;
643 	case M_FLUSH:
644 		tcp_wput_flush(tcp, mp);
645 		break;
646 	default:
647 		ip_wput_nondata(connp->conn_wq, mp);
648 		break;
649 	}
650 }
651 
652 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
653 static void
654 tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
655 {
656 	uchar_t	fval = *mp->b_rptr;
657 	mblk_t	*tail;
658 	conn_t	*connp = tcp->tcp_connp;
659 	queue_t	*q = connp->conn_wq;
660 
661 	/* TODO: How should flush interact with urgent data? */
662 	if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL &&
663 	    !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
664 		/*
665 		 * Flush only data that has not yet been put on the wire.  If
666 		 * we flush data that we have already transmitted, life, as we
667 		 * know it, may come to an end.
668 		 */
669 		tail = tcp->tcp_xmit_tail;
670 		tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
671 		tcp->tcp_xmit_tail_unsent = 0;
672 		tcp->tcp_unsent = 0;
673 		if (tail->b_wptr != tail->b_rptr)
674 			tail = tail->b_cont;
675 		if (tail) {
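			/*
			 * Walk the transmit list up to 'tail'; everything from
			 * 'tail' onward has never been sent and can be freed.
			 */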
676 			mblk_t **excess = &tcp->tcp_xmit_head;
677 			for (;;) {
678 				mblk_t *mp1 = *excess;
679 				if (mp1 == tail)
680 					break;
681 				tcp->tcp_xmit_tail = mp1;
682 				tcp->tcp_xmit_last = mp1;
683 				excess = &mp1->b_cont;
684 			}
685 			*excess = NULL;
686 			tcp_close_mpp(&tail);
687 			if (tcp->tcp_snd_zcopy_aware)
688 				tcp_zcopy_notify(tcp);
689 		}
690 		/*
691 		 * We have no unsent data, so unsent must be less than
692 		 * conn_sndlowat, so re-enable flow.
693 		 */
694 		mutex_enter(&tcp->tcp_non_sq_lock);
695 		if (tcp->tcp_flow_stopped) {
696 			tcp_clrqfull(tcp);
697 		}
698 		mutex_exit(&tcp->tcp_non_sq_lock);
699 	}
700 	/*
701 	 * TODO: you can't just flush these, you have to increase rwnd for one
702 	 * thing.  For another, how should urgent data interact?
703 	 */
704 	if (fval & FLUSHR) {
705 		*mp->b_rptr = fval & ~FLUSHW;
706 		/* XXX */
707 		qreply(q, mp);
708 		return;
709 	}
710 	freemsg(mp);
711 }
712 
713 /*
714  * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
715  * messages.
716  */
717 static void
718 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
719 {
720 	mblk_t		*mp1;
721 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
722 	STRUCT_HANDLE(strbuf, sb);
723 	uint_t		addrlen;
724 	conn_t		*connp = tcp->tcp_connp;
725 	queue_t		*q = connp->conn_wq;
726 
727 	/* Make sure it is one of ours. */
728 	switch (iocp->ioc_cmd) {
729 	case TI_GETMYNAME:
730 	case TI_GETPEERNAME:
731 		break;
732 	default:
733 		/*
734 		 * If the conn is closing, then error the ioctl here. Otherwise
735 		 * use the CONN_IOCTLREF_* macros to hold off tcp_close until
736 		 * we're done here.
737 		 */
738 		mutex_enter(&connp->conn_lock);
739 		if (connp->conn_state_flags & CONN_CLOSING) {
740 			mutex_exit(&connp->conn_lock);
741 			iocp->ioc_error = EINVAL;
742 			mp->b_datap->db_type = M_IOCNAK;
743 			iocp->ioc_count = 0;
744 			qreply(q, mp);
745 			return;
746 		}
747 
748 		CONN_INC_IOCTLREF_LOCKED(connp);
749 		ip_wput_nondata(q, mp);
750 		CONN_DEC_IOCTLREF(connp);
751 		return;
752 	}
753 	switch (mi_copy_state(q, mp, &mp1)) {
754 	case -1:
755 		return;
756 	case MI_COPY_CASE(MI_COPY_IN, 1):
757 		break;
758 	case MI_COPY_CASE(MI_COPY_OUT, 1):
759 		/* Copy out the strbuf. */
760 		mi_copyout(q, mp);
761 		return;
762 	case MI_COPY_CASE(MI_COPY_OUT, 2):
763 		/* All done. */
764 		mi_copy_done(q, mp, 0);
765 		return;
766 	default:
767 		mi_copy_done(q, mp, EPROTO);
768 		return;
769 	}
770 	/* Check alignment of the strbuf */
771 	if (!OK_32PTR(mp1->b_rptr)) {
772 		mi_copy_done(q, mp, EINVAL);
773 		return;
774 	}
775 
776 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
777 
778 	if (connp->conn_family == AF_INET)
779 		addrlen = sizeof (sin_t);
780 	else
781 		addrlen = sizeof (sin6_t);
782 
783 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
784 		mi_copy_done(q, mp, EINVAL);
785 		return;
786 	}
787 
788 	switch (iocp->ioc_cmd) {
789 	case TI_GETMYNAME:
790 		break;
791 	case TI_GETPEERNAME:
792 		if (tcp->tcp_state < TCPS_SYN_RCVD) {
793 			mi_copy_done(q, mp, ENOTCONN);
794 			return;
795 		}
796 		break;
797 	}
798 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
799 	if (!mp1)
800 		return;
801 
802 	STRUCT_FSET(sb, len, addrlen);
803 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
804 	case TI_GETMYNAME:
805 		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
806 		    &addrlen);
807 		break;
808 	case TI_GETPEERNAME:
809 		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
810 		    &addrlen);
811 		break;
812 	}
813 	mp1->b_wptr += addrlen;
814 	/* Copy out the address */
815 	mi_copyout(q, mp);
816 }
817 
818 /*
819  * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
820  * messages.
821  */
822 /* ARGSUSED */
823 static void
824 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
825 {
826 	conn_t		*connp = (conn_t *)arg;
827 	tcp_t		*tcp = connp->conn_tcp;
828 	queue_t		*q = connp->conn_wq;
829 	struct iocblk	*iocp;
830 
831 	ASSERT(DB_TYPE(mp) == M_IOCTL);
832 	/*
833 	 * Try and ASSERT the minimum possible references on the
834 	 * conn early enough. Since we are executing on write side,
835 	 * the connection is obviously not detached and that means
836 	 * there is a ref each for TCP and IP. Since we are behind
837 	 * the squeue, the minimum references needed are 3. If the
838 	 * conn is in classifier hash list, there should be an
839 	 * extra ref for that (we check both the possibilities).
840 	 */
841 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
842 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
843 
844 	iocp = (struct iocblk *)mp->b_rptr;
845 	switch (iocp->ioc_cmd) {
846 	case _SIOCSOCKFALLBACK:
847 		/*
848 		 * Either sockmod is about to be popped and the socket
849 		 * would now be treated as a plain stream, or a module
850 		 * is about to be pushed so we could no longer use read-
851 		 * side synchronous streams for fused loopback tcp.
852 		 * Drain any queued data and disable direct sockfs
853 		 * interface from now on.
854 		 */
855 		if (!tcp->tcp_issocket) {
856 			DB_TYPE(mp) = M_IOCNAK;
857 			iocp->ioc_error = EINVAL;
858 		} else {
859 			tcp_use_pure_tpi(tcp);
860 			DB_TYPE(mp) = M_IOCACK;
861 			iocp->ioc_error = 0;
862 		}
863 		iocp->ioc_count = 0;
864 		iocp->ioc_rval = 0;
865 		qreply(q, mp);
866 		return;
867 	}
868 
869 	/*
870 	 * If the conn is closing, then error the ioctl here. Otherwise bump the
871 	 * conn_ioctlref to hold off tcp_close until we're done here.
872 	 */
873 	mutex_enter(&(connp)->conn_lock);
874 	if ((connp)->conn_state_flags & CONN_CLOSING) {
875 		mutex_exit(&(connp)->conn_lock);
876 		iocp->ioc_error = EINVAL;
877 		mp->b_datap->db_type = M_IOCNAK;
878 		iocp->ioc_count = 0;
879 		qreply(q, mp);
880 		return;
881 	}
882 
883 	CONN_INC_IOCTLREF_LOCKED(connp);
884 	ip_wput_nondata(q, mp);
885 	CONN_DEC_IOCTLREF(connp);
886 }
887 
888 /*
889  * This routine is called by tcp_wput() to handle all TPI requests.
890  */
891 /* ARGSUSED */
892 static void
893 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
894 {
895 	conn_t		*connp = (conn_t *)arg;
896 	tcp_t		*tcp = connp->conn_tcp;
897 	union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
898 	uchar_t		*rptr;
899 	t_scalar_t	type;
900 	cred_t		*cr;
901 
902 	/*
903 	 * Try and ASSERT the minimum possible references on the
904 	 * conn early enough. Since we are executing on write side,
905 	 * the connection is obviously not detached and that means
906 	 * there is a ref each for TCP and IP. Since we are behind
907 	 * the squeue, the minimum references needed are 3. If the
908 	 * conn is in classifier hash list, there should be an
909 	 * extra ref for that (we check both the possibilities).
910 	 */
911 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
912 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
913 
914 	rptr = mp->b_rptr;
915 	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
916 	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
917 		type = ((union T_primitives *)rptr)->type;
918 		if (type == T_EXDATA_REQ) {
919 			tcp_output_urgent(connp, mp, arg2, NULL);
920 		} else if (type != T_DATA_REQ) {
921 			goto non_urgent_data;
922 		} else {
923 			/* TODO: options, flags, ... from user */
924 			/* Set length to zero for reclamation below */
925 			tcp_wput_data(tcp, mp->b_cont, B_TRUE);
926 			freeb(mp);
927 		}
928 		return;
929 	} else {
930 		if (connp->conn_debug) {
931 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
932 			    "tcp_wput_proto, dropping one...");
933 		}
934 		freemsg(mp);
935 		return;
936 	}
937 
938 non_urgent_data:
939 
940 	switch ((int)tprim->type) {
941 	case O_T_BIND_REQ:	/* bind request */
942 	case T_BIND_REQ:	/* new semantics bind request */
943 		tcp_tpi_bind(tcp, mp);
944 		break;
945 	case T_UNBIND_REQ:	/* unbind request */
946 		tcp_tpi_unbind(tcp, mp);
947 		break;
948 	case O_T_CONN_RES:	/* old connection response XXX */
949 	case T_CONN_RES:	/* connection response */
950 		tcp_tli_accept(tcp, mp);
951 		break;
952 	case T_CONN_REQ:	/* connection request */
953 		tcp_tpi_connect(tcp, mp);
954 		break;
955 	case T_DISCON_REQ:	/* disconnect request */
956 		tcp_disconnect(tcp, mp);
957 		break;
958 	case T_CAPABILITY_REQ:
959 		tcp_capability_req(tcp, mp);	/* capability request */
960 		break;
961 	case T_INFO_REQ:	/* information request */
962 		tcp_info_req(tcp, mp);
963 		break;
964 	case T_SVR4_OPTMGMT_REQ:	/* manage options req */
965 	case T_OPTMGMT_REQ:
966 		/*
967 		 * Note:  no support for snmpcom_req() through new
968 		 * T_OPTMGMT_REQ. See comments in ip.c
969 		 */
970 
971 		/*
972 		 * All Solaris components should pass a db_credp
973 		 * for this TPI message, hence we ASSERT.
974 		 * But in case there is some other M_PROTO that looks
975 		 * like a TPI message sent by some other kernel
976 		 * component, we check and return an error.
977 		 */
978 		cr = msg_getcred(mp, NULL);
979 		ASSERT(cr != NULL);
980 		if (cr == NULL) {
981 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
982 			return;
983 		}
984 		/*
985 		 * If EINPROGRESS is returned, the request has been queued
986 		 * for subsequent processing by ip_restart_optmgmt(), which
987 		 * will do the CONN_DEC_REF().
988 		 */
989 		if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
990 			svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
991 		} else {
992 			tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
993 		}
994 		break;
995 
996 	case T_UNITDATA_REQ:	/* unitdata request */
997 		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
998 		break;
999 	case T_ORDREL_REQ:	/* orderly release req */
1000 		freemsg(mp);
1001 
1002 		if (tcp->tcp_fused)
1003 			tcp_unfuse(tcp);
1004 
1005 		if (tcp_xmit_end(tcp) != 0) {
1006 			/*
1007 			 * We were crossing FINs and got a reset from
1008 			 * the other side. Just ignore it.
1009 			 */
1010 			if (connp->conn_debug) {
1011 				(void) strlog(TCP_MOD_ID, 0, 1,
1012 				    SL_ERROR|SL_TRACE,
1013 				    "tcp_wput_proto, T_ORDREL_REQ out of "
1014 				    "state %s",
1015 				    tcp_display(tcp, NULL,
1016 				    DISP_ADDR_AND_PORT));
1017 			}
1018 		}
1019 		break;
1020 	case T_ADDR_REQ:
1021 		tcp_addr_req(tcp, mp);
1022 		break;
1023 	default:
1024 		if (connp->conn_debug) {
1025 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
1026 			    "tcp_wput_proto, bogus TPI msg, type %d",
1027 			    tprim->type);
1028 		}
1029 		/*
		 * We used to send M_ERROR.  Sending TNOTSUPPORT gives the
		 * user a chance to recover.
1032 		 */
1033 		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
1034 		break;
1035 	}
1036 }
1037 
1038 /*
1039  * Handle special out-of-band ioctl requests (see PSARC/2008/265).
1040  */
1041 static void
1042 tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
1043 {
1044 	void	*data;
1045 	mblk_t	*datamp = mp->b_cont;
1046 	conn_t	*connp = Q_TO_CONN(q);
1047 	tcp_t	*tcp = connp->conn_tcp;
1048 	cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
1049 
1050 	if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
1051 		cmdp->cb_error = EPROTO;
1052 		qreply(q, mp);
1053 		return;
1054 	}
1055 
1056 	data = datamp->b_rptr;
1057 
1058 	switch (cmdp->cb_cmd) {
1059 	case TI_GETPEERNAME:
1060 		if (tcp->tcp_state < TCPS_SYN_RCVD)
1061 			cmdp->cb_error = ENOTCONN;
1062 		else
1063 			cmdp->cb_error = conn_getpeername(connp, data,
1064 			    &cmdp->cb_len);
1065 		break;
1066 	case TI_GETMYNAME:
1067 		cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
1068 		break;
1069 	default:
1070 		cmdp->cb_error = EINVAL;
1071 		break;
1072 	}
1073 
1074 	qreply(q, mp);
1075 }
1076 
1077 /*
1078  * The TCP fast path write put procedure.
1079  * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
1080  */
1081 /* ARGSUSED */
1082 void
1083 tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1084 {
1085 	int		len;
1086 	int		hdrlen;
1087 	int		plen;
1088 	mblk_t		*mp1;
1089 	uchar_t		*rptr;
1090 	uint32_t	snxt;
1091 	tcpha_t		*tcpha;
1092 	struct datab	*db;
1093 	uint32_t	suna;
1094 	uint32_t	mss;
1095 	ipaddr_t	*dst;
1096 	ipaddr_t	*src;
1097 	uint32_t	sum;
1098 	int		usable;
1099 	conn_t		*connp = (conn_t *)arg;
1100 	tcp_t		*tcp = connp->conn_tcp;
1101 	uint32_t	msize;
1102 	tcp_stack_t	*tcps = tcp->tcp_tcps;
1103 	ip_xmit_attr_t	*ixa;
1104 	clock_t		now;
1105 
1106 	/*
1107 	 * Try and ASSERT the minimum possible references on the
1108 	 * conn early enough. Since we are executing on write side,
1109 	 * the connection is obviously not detached and that means
1110 	 * there is a ref each for TCP and IP. Since we are behind
1111 	 * the squeue, the minimum references needed are 3. If the
1112 	 * conn is in classifier hash list, there should be an
1113 	 * extra ref for that (we check both the possibilities).
1114 	 */
1115 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1116 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1117 
1118 	ASSERT(DB_TYPE(mp) == M_DATA);
1119 	msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1120 
1121 	mutex_enter(&tcp->tcp_non_sq_lock);
1122 	tcp->tcp_squeue_bytes -= msize;
1123 	mutex_exit(&tcp->tcp_non_sq_lock);
1124 
1125 	/* Bypass tcp protocol for fused tcp loopback */
1126 	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1127 		return;
1128 
1129 	mss = tcp->tcp_mss;
1130 	/*
	 * If ZEROCOPY has been turned off, try not to send any zero-copy
	 * message down.  Back off now.
1133 	 */
1134 	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
1135 		mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);

	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1139 	len = (int)(mp->b_wptr - mp->b_rptr);
1140 
1141 	/*
1142 	 * Criteria for fast path:
1143 	 *
1144 	 *   1. no unsent data
1145 	 *   2. single mblk in request
1146 	 *   3. connection established
1147 	 *   4. data in mblk
1148 	 *   5. len <= mss
1149 	 *   6. no tcp_valid bits
1150 	 */
1151 	if ((tcp->tcp_unsent != 0) ||
1152 	    (tcp->tcp_cork) ||
1153 	    (mp->b_cont != NULL) ||
1154 	    (tcp->tcp_state != TCPS_ESTABLISHED) ||
1155 	    (len == 0) ||
1156 	    (len > mss) ||
1157 	    (tcp->tcp_valid_bits != 0)) {
1158 		tcp_wput_data(tcp, mp, B_FALSE);
1159 		return;
1160 	}
1161 
1162 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
1163 	ASSERT(tcp->tcp_fin_sent == 0);
1164 
1165 	/* queue new packet onto retransmission queue */
1166 	if (tcp->tcp_xmit_head == NULL) {
1167 		tcp->tcp_xmit_head = mp;
1168 	} else {
1169 		tcp->tcp_xmit_last->b_cont = mp;
1170 	}
1171 	tcp->tcp_xmit_last = mp;
1172 	tcp->tcp_xmit_tail = mp;
1173 
1174 	/* find out how much we can send */
1175 	/* BEGIN CSTYLED */
1176 	/*
1177 	 *    un-acked	   usable
1178 	 *  |--------------|-----------------|
1179 	 *  tcp_suna       tcp_snxt	  tcp_suna+tcp_swnd
1180 	 */
1181 	/* END CSTYLED */
1182 
1183 	/* start sending from tcp_snxt */
1184 	snxt = tcp->tcp_snxt;
1185 
1186 	/*
1187 	 * Check to see if this connection has been idle for some time and no
1188 	 * ACK is expected. If so, then the congestion window size is no longer
1189 	 * meaningfully tied to current network conditions.
1190 	 *
1191 	 * We reinitialize tcp_cwnd, and slow start again to get back the
1192 	 * connection's "self-clock" as described in Van Jacobson's 1988 paper
1193 	 * "Congestion avoidance and control".
1194 	 */
1195 	now = LBOLT_FASTPATH;
1196 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1197 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1198 		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1199 	}
1200 
1201 	usable = tcp->tcp_swnd;		/* tcp window size */
1202 	if (usable > tcp->tcp_cwnd)
1203 		usable = tcp->tcp_cwnd;	/* congestion window smaller */
1204 	usable -= snxt;		/* subtract stuff already sent */
1205 	suna = tcp->tcp_suna;
1206 	usable += suna;
1207 	/* usable can be < 0 if the congestion window is smaller */
1208 	if (len > usable) {
1209 		/* Can't send complete M_DATA in one shot */
1210 		goto slow;
1211 	}
1212 
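	/*
	 * The new data fits within the send window; re-enable flow from
	 * the stream head if the unsent backlog has dropped below the
	 * low-water mark.
	 */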
1213 	mutex_enter(&tcp->tcp_non_sq_lock);
1214 	if (tcp->tcp_flow_stopped &&
1215 	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1216 		tcp_clrqfull(tcp);
1217 	}
1218 	mutex_exit(&tcp->tcp_non_sq_lock);
1219 
1220 	/*
	 * Determine if there is anything to send (Nagle).
1222 	 *
1223 	 *   1. len < tcp_mss (i.e. small)
1224 	 *   2. unacknowledged data present
1225 	 *   3. len < nagle limit
1226 	 *   4. last packet sent < nagle limit (previous packet sent)
1227 	 */
1228 	if ((len < mss) && (snxt != suna) &&
1229 	    (len < (int)tcp->tcp_naglim) &&
1230 	    (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
1231 		/*
1232 		 * This was the first unsent packet and normally
1233 		 * mss < xmit_hiwater so there is no need to worry
1234 		 * about flow control. The next packet will go
1235 		 * through the flow control check in tcp_wput_data().
1236 		 */
1237 		/* leftover work from above */
1238 		tcp->tcp_unsent = len;
1239 		tcp->tcp_xmit_tail_unsent = len;
1240 
1241 		return;
1242 	}
1243 
1244 	/*
1245 	 * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
1246 	 * send now.
1247 	 */
1248 
1249 	if (snxt == suna) {
1250 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1251 	}
1252 
1253 	/* we have always sent something */
1254 	tcp->tcp_rack_cnt = 0;
1255 
1256 	tcp->tcp_snxt = snxt + len;
1257 	tcp->tcp_rack = tcp->tcp_rnxt;
1258 
1259 	if ((mp1 = dupb(mp)) == 0)
1260 		goto no_memory;
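	/*
	 * Record the transmit timestamp in b_prev and the starting sequence
	 * number in b_next; ACK processing uses these for RTT estimation
	 * and retransmit bookkeeping.
	 */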
1261 	mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
1262 	mp->b_next = (mblk_t *)(uintptr_t)snxt;
1263 
1264 	/* adjust tcp header information */
1265 	tcpha = tcp->tcp_tcpha;
1266 	tcpha->tha_flags = (TH_ACK|TH_PUSH);
1267 
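	/*
	 * Start the pseudo-header partial sum: TCP length (header plus
	 * data) added to the precomputed address/protocol sum, with the
	 * carry folded back into the low 16 bits.  IP completes the
	 * checksum on output.
	 */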
1268 	sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1269 	sum = (sum >> 16) + (sum & 0xFFFF);
1270 	tcpha->tha_sum = htons(sum);
1271 
1272 	tcpha->tha_seq = htonl(snxt);
1273 
1274 	TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1275 	TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1276 	TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1277 	tcp->tcp_cs.tcp_out_data_segs++;
1278 	tcp->tcp_cs.tcp_out_data_bytes += len;
1279 
1280 	/* Update the latest receive window size in TCP header. */
1281 	tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1282 
1283 	tcp->tcp_last_sent_len = (ushort_t)len;
1284 
1285 	plen = len + connp->conn_ht_iphc_len;
1286 
1287 	ixa = connp->conn_ixa;
1288 	ixa->ixa_pktlen = plen;
1289 
1290 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
1291 		tcp->tcp_ipha->ipha_length = htons(plen);
1292 	} else {
1293 		tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1294 	}
1295 
	/* see if we need to allocate an mblk for the headers */
1297 	hdrlen = connp->conn_ht_iphc_len;
1298 	rptr = mp1->b_rptr - hdrlen;
1299 	db = mp1->b_datap;
1300 	if ((db->db_ref != 2) || rptr < db->db_base ||
1301 	    (!OK_32PTR(rptr))) {
1302 		/* NOTE: we assume allocb returns an OK_32PTR */
1303 		mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1304 		if (!mp) {
1305 			freemsg(mp1);
1306 			goto no_memory;
1307 		}
1308 		mp->b_cont = mp1;
1309 		mp1 = mp;
1310 		/* Leave room for Link Level header */
1311 		rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1312 		mp1->b_wptr = &rptr[hdrlen];
1313 	}
1314 	mp1->b_rptr = rptr;
1315 
1316 	/* Fill in the timestamp option. */
1317 	if (tcp->tcp_snd_ts_ok) {
1318 		U32_TO_BE32(now,
1319 		    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
1320 		U32_TO_BE32(tcp->tcp_ts_recent,
1321 		    (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
1322 	} else {
1323 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1324 	}
1325 
1326 	/* copy header into outgoing packet */
1327 	dst = (ipaddr_t *)rptr;
1328 	src = (ipaddr_t *)connp->conn_ht_iphc;
1329 	dst[0] = src[0];
1330 	dst[1] = src[1];
1331 	dst[2] = src[2];
1332 	dst[3] = src[3];
1333 	dst[4] = src[4];
1334 	dst[5] = src[5];
1335 	dst[6] = src[6];
1336 	dst[7] = src[7];
1337 	dst[8] = src[8];
1338 	dst[9] = src[9];
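	/*
	 * The unrolled copy above covers the first 40 bytes of the header
	 * template; copy any remainder (the rest of an IPv6 header and/or
	 * TCP options) one 32-bit word at a time.
	 */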
1339 	if (hdrlen -= 40) {
1340 		hdrlen >>= 2;
1341 		dst += 10;
1342 		src += 10;
1343 		do {
1344 			*dst++ = *src++;
1345 		} while (--hdrlen);
1346 	}
1347 
1348 	/*
1349 	 * Set the ECN info in the TCP header.  Note that this
1350 	 * is not the template header.
1351 	 */
1352 	if (tcp->tcp_ecn_ok) {
1353 		TCP_SET_ECT(tcp, rptr);
1354 
1355 		tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
1356 		if (tcp->tcp_ecn_echo_on)
1357 			tcpha->tha_flags |= TH_ECE;
1358 		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
1359 			tcpha->tha_flags |= TH_CWR;
1360 			tcp->tcp_ecn_cwr_sent = B_TRUE;
1361 		}
1362 	}
1363 
1364 	if (tcp->tcp_ip_forward_progress) {
1365 		tcp->tcp_ip_forward_progress = B_FALSE;
1366 		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
1367 	} else {
1368 		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
1369 	}
1370 	tcp_send_data(tcp, mp1);
1371 	return;
1372 
1373 	/*
1374 	 * If we ran out of memory, we pretend to have sent the packet
1375 	 * and that it was lost on the wire.
1376 	 */
1377 no_memory:
1378 	return;
1379 
1380 slow:
1381 	/* leftover work from above */
1382 	tcp->tcp_unsent = len;
1383 	tcp->tcp_xmit_tail_unsent = len;
1384 	tcp_wput_data(tcp, NULL, B_FALSE);
1385 }
1386 
1387 /* ARGSUSED2 */
1388 void
1389 tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1390 {
1391 	int len;
1392 	uint32_t msize;
1393 	conn_t *connp = (conn_t *)arg;
1394 	tcp_t *tcp = connp->conn_tcp;
1395 
1396 	msize = msgdsize(mp);
1397 
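	/*
	 * len is the offset of the urgent byte, which is defined to be the
	 * last byte of this message.
	 */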
1398 	len = msize - 1;
1399 	if (len < 0) {
1400 		freemsg(mp);
1401 		return;
1402 	}
1403 
1404 	/*
1405 	 * Try to force urgent data out on the wire. Even if we have unsent
1406 	 * data this will at least send the urgent flag.
1407 	 * XXX does not handle more flag correctly.
1408 	 */
1409 	len += tcp->tcp_unsent;
1410 	len += tcp->tcp_snxt;
1411 	tcp->tcp_urg = len;
1412 	tcp->tcp_valid_bits |= TCP_URG_VALID;
1413 
1414 	/* Bypass tcp protocol for fused tcp loopback */
1415 	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1416 		return;
1417 
1418 	/* Strip off the T_EXDATA_REQ if the data is from TPI */
1419 	if (DB_TYPE(mp) != M_DATA) {
1420 		mblk_t *mp1 = mp;
1421 		ASSERT(!IPCL_IS_NONSTR(connp));
1422 		mp = mp->b_cont;
1423 		freeb(mp1);
1424 	}
1425 	tcp_wput_data(tcp, mp, B_TRUE);
1426 }
1427 
1428 /*
 * Called by the streams close routine via squeues when our client blows off
 * its descriptor.  We take this to mean: "close the stream state NOW, close
 * the tcp connection politely."  When SO_LINGER is set (with a non-zero linger
 * time and it is not a nonblocking socket) then this routine sleeps until the
 * FIN is acked.
1434  *
1435  * NOTE: tcp_close potentially returns error when lingering.
1436  * However, the stream head currently does not pass these errors
1437  * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
1438  * errors to the application (from tsleep()) and not errors
1439  * like ECONNRESET caused by receiving a reset packet.
1440  */
1441 
1442 /* ARGSUSED */
1443 void
1444 tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1445 {
1446 	char	*msg;
1447 	conn_t	*connp = (conn_t *)arg;
1448 	tcp_t	*tcp = connp->conn_tcp;
1449 	clock_t	delta = 0;
1450 	tcp_stack_t	*tcps = tcp->tcp_tcps;
1451 
1452 	/*
1453 	 * When a non-STREAMS socket is being closed, it does not always
1454 	 * stick around waiting for tcp_close_output to run and can therefore
1455 	 * have dropped a reference already. So adjust the asserts accordingly.
1456 	 */
1457 	ASSERT((connp->conn_fanout != NULL &&
1458 	    connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) ||
1459 	    (connp->conn_fanout == NULL &&
1460 	    connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)));
1461 
1462 	mutex_enter(&tcp->tcp_eager_lock);
1463 	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
1464 		/*
1465 		 * Cleanup for listener. For non-STREAM sockets sockfs will
1466 		 * close all the eagers on 'q', so in that case only deal
1467 		 * with 'q0'.
1468 		 */
1469 		tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0);
1470 		tcp->tcp_wait_for_eagers = 1;
1471 	}
1472 	mutex_exit(&tcp->tcp_eager_lock);
1473 
1474 	tcp->tcp_lso = B_FALSE;
1475 
1476 	msg = NULL;
1477 	switch (tcp->tcp_state) {
1478 	case TCPS_CLOSED:
1479 	case TCPS_IDLE:
1480 		break;
1481 	case TCPS_BOUND:
1482 		if (tcp->tcp_listener != NULL) {
1483 			ASSERT(IPCL_IS_NONSTR(connp));
1484 			/*
1485 			 * Unlink from the listener and drop the reference
1486 			 * put on it by the eager. tcp_closei_local will not
1487 			 * do it because tcp_tconnind_started is TRUE.
1488 			 */
1489 			mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
1490 			tcp_eager_unlink(tcp);
1491 			mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
1492 			CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1493 		}
1494 		break;
1495 	case TCPS_LISTEN:
1496 		break;
1497 	case TCPS_SYN_SENT:
1498 		msg = "tcp_close, during connect";
1499 		break;
1500 	case TCPS_SYN_RCVD:
1501 		/*
1502 		 * Close during the connect 3-way handshake
1503 		 * but here there may or may not be pending data
		 * already on queue. Process it almost the same as in
1505 		 * the ESTABLISHED state.
1506 		 */
1507 		/* FALLTHRU */
1508 	default:
1509 		if (tcp->tcp_fused)
1510 			tcp_unfuse(tcp);
1511 
1512 		/*
1513 		 * If SO_LINGER has set a zero linger time, abort the
1514 		 * connection with a reset.
1515 		 */
1516 		if (connp->conn_linger && connp->conn_lingertime == 0) {
1517 			msg = "tcp_close, zero lingertime";
1518 			break;
1519 		}
1520 
1521 		/*
1522 		 * Abort connection if there is unread data queued.
1523 		 */
1524 		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
1525 			msg = "tcp_close, unread data";
1526 			break;
1527 		}
1528 
1529 		/*
1530 		 * Abort connection if it is being closed without first
1531 		 * being accepted. This can happen if a listening non-STREAM
1532 		 * socket wants to get rid of the socket, for example, if the
1533 		 * listener is closing.
1534 		 */
1535 		if (tcp->tcp_listener != NULL) {
1536 			ASSERT(IPCL_IS_NONSTR(connp));
1537 			msg = "tcp_close, close before accept";
1538 
1539 			/*
1540 			 * Unlink from the listener and drop the reference
1541 			 * put on it by the eager. tcp_closei_local will not
1542 			 * do it because tcp_tconnind_started is TRUE.
1543 			 */
1544 			mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
1545 			tcp_eager_unlink(tcp);
1546 			mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
1547 			CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1548 			break;
1549 		}
1550 
1551 		/*
1552 		 * Transmit the FIN before detaching the tcp_t.
1553 		 * After tcp_detach returns this queue/perimeter
		 * no longer owns the tcp_t, thus others can modify it.
1555 		 */
1556 		(void) tcp_xmit_end(tcp);
1557 
1558 		/*
1559 		 * If lingering on close then wait until the fin is acked,
1560 		 * the SO_LINGER time passes, or a reset is sent/received.
1561 		 */
1562 		if (connp->conn_linger && connp->conn_lingertime > 0 &&
1563 		    !(tcp->tcp_fin_acked) &&
1564 		    tcp->tcp_state >= TCPS_ESTABLISHED) {
1565 			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
1566 				tcp->tcp_client_errno = EWOULDBLOCK;
1567 			} else if (tcp->tcp_client_errno == 0) {
1568 
1569 				ASSERT(tcp->tcp_linger_tid == 0);
1570 
1571 				/* conn_lingertime is in sec. */
1572 				tcp->tcp_linger_tid = TCP_TIMER(tcp,
1573 				    tcp_close_linger_timeout,
1574 				    connp->conn_lingertime * MILLISEC);
1575 
1576 				/* tcp_close_linger_timeout will finish close */
1577 				if (tcp->tcp_linger_tid == 0)
1578 					tcp->tcp_client_errno = ENOSR;
1579 				else
1580 					return;
1581 			}
1582 
1583 			/*
1584 			 * Check if we need to detach or just close
1585 			 * the instance.
1586 			 */
1587 			if (tcp->tcp_state <= TCPS_LISTEN)
1588 				break;
1589 		}
1590 
1591 		/*
1592 		 * Make sure that no other thread will access the conn_rq of
1593 		 * this instance (through lookups etc.) as conn_rq will go
1594 		 * away shortly.
1595 		 */
1596 		tcp_acceptor_hash_remove(tcp);
1597 
1598 		mutex_enter(&tcp->tcp_non_sq_lock);
1599 		if (tcp->tcp_flow_stopped) {
1600 			tcp_clrqfull(tcp);
1601 		}
1602 		mutex_exit(&tcp->tcp_non_sq_lock);
1603 
1604 		if (tcp->tcp_timer_tid != 0) {
1605 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
1606 			tcp->tcp_timer_tid = 0;
1607 		}
1608 		/*
1609 		 * Need to cancel those timers which will not be used when
1610 		 * TCP is detached.  This has to be done before the conn_wq
1611 		 * is set to NULL.
1612 		 */
1613 		tcp_timers_stop(tcp);
1614 
1615 		tcp->tcp_detached = B_TRUE;
1616 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
1617 			tcp_time_wait_append(tcp);
1618 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
1619 			ASSERT(connp->conn_ref >=
1620 			    (IPCL_IS_NONSTR(connp) ? 2 : 3));
1621 			goto finish;
1622 		}
1623 
1624 		/*
1625 		 * If delta is zero the timer event wasn't executed and was
1626 		 * successfully canceled. In this case we need to restart it
1627 		 * with the minimal delta possible.
1628 		 */
1629 		if (delta >= 0)
1630 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
1631 			    delta ? delta : 1);
1632 
1633 		ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3));
1634 		goto finish;
1635 	}
1636 
1637 	/* Detach did not complete. Still need to remove q from stream. */
1638 	if (msg) {
1639 		if (tcp->tcp_state == TCPS_ESTABLISHED ||
1640 		    tcp->tcp_state == TCPS_CLOSE_WAIT)
1641 			TCPS_BUMP_MIB(tcps, tcpEstabResets);
1642 		if (tcp->tcp_state == TCPS_SYN_SENT ||
1643 		    tcp->tcp_state == TCPS_SYN_RCVD)
1644 			TCPS_BUMP_MIB(tcps, tcpAttemptFails);
1645 		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
1646 	}
1647 
1648 	tcp_closei_local(tcp);
1649 	CONN_DEC_REF(connp);
1650 	ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2));
1651 
1652 finish:
1653 	/*
1654 	 * Don't change the queues in the case of a listener that has
1655 	 * eagers in its q or q0. It could surprise the eagers.
1656 	 * Instead wait for the eagers outside the squeue.
1657 	 *
1658 	 * For non-STREAMS sockets tcp_wait_for_eagers implies that
1659 	 * we should delay the su_closed upcall until all eagers have
1660 	 * dropped their references.
1661 	 */
1662 	if (!tcp->tcp_wait_for_eagers) {
1663 		tcp->tcp_detached = B_TRUE;
1664 		connp->conn_rq = NULL;
1665 		connp->conn_wq = NULL;
1666 
1667 		/* non-STREAM socket, release the upper handle */
1668 		if (IPCL_IS_NONSTR(connp)) {
1669 			ASSERT(connp->conn_upper_handle != NULL);
1670 			(*connp->conn_upcalls->su_closed)
1671 			    (connp->conn_upper_handle);
1672 			connp->conn_upper_handle = NULL;
1673 			connp->conn_upcalls = NULL;
1674 		}
1675 	}
1676 
1677 	/* Signal tcp_close() to finish closing. */
1678 	mutex_enter(&tcp->tcp_closelock);
1679 	tcp->tcp_closed = 1;
1680 	cv_signal(&tcp->tcp_closecv);
1681 	mutex_exit(&tcp->tcp_closelock);
1682 }
1683 
1684 /* ARGSUSED */
1685 void
1686 tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1687 {
1688 	conn_t	*connp = (conn_t *)arg;
1689 	tcp_t	*tcp = connp->conn_tcp;
1690 
1691 	freemsg(mp);
1692 
1693 	if (tcp->tcp_fused)
1694 		tcp_unfuse(tcp);
1695 
1696 	if (tcp_xmit_end(tcp) != 0) {
1697 		/*
1698 		 * We were crossing FINs and got a reset from
1699 		 * the other side. Just ignore it.
1700 		 */
1701 		if (connp->conn_debug) {
1702 			(void) strlog(TCP_MOD_ID, 0, 1,
1703 			    SL_ERROR|SL_TRACE,
1704 			    "tcp_shutdown_output() out of state %s",
1705 			    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
1706 		}
1707 	}
1708 }
1709 
1710 #pragma inline(tcp_send_data)
1711 
1712 void
1713 tcp_send_data(tcp_t *tcp, mblk_t *mp)
1714 {
1715 	conn_t		*connp = tcp->tcp_connp;
1716 
1717 	/*
	 * Check here to avoid sending a zero-copy message down to IP when
	 * the ZEROCOPY capability has been turned off. We only need to deal
	 * with the race condition between sockfs and the notification here.
	 * Since we have tried to back off the tcp_xmit_head when turning
	 * zero-copy off, and to back off new messages in tcp_output(), we
	 * simply drop the dup'ed packet here and let tcp retransmit, if
	 * tcp_xmit_zc_clean is not true.
1725 	 */
1726 	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
1727 	    !tcp->tcp_xmit_zc_clean) {
1728 		ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
1729 		freemsg(mp);
1730 		return;
1731 	}
1732 
1733 	DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
1734 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, tcp,
1735 	    __dtrace_tcp_tcph_t *,
1736 	    &mp->b_rptr[connp->conn_ixa->ixa_ip_hdr_length]);
1737 
1738 	ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
1739 	(void) conn_ip_output(mp, connp->conn_ixa);
1740 }
1741 
1742 /* ARGSUSED2 */
1743 void
1744 tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1745 {
1746 	conn_t	*econnp = (conn_t *)arg;
1747 	tcp_t	*tcp = econnp->conn_tcp;
1748 	ip_xmit_attr_t *ixa = econnp->conn_ixa;
1749 
1750 	/* Guard against a RST having blown it away while on the squeue */
1751 	if (tcp->tcp_state == TCPS_CLOSED) {
1752 		freemsg(mp);
1753 		return;
1754 	}
1755 
1756 	/*
	 * On the off chance that the eager received and responded to
1758 	 * some other packet while the SYN|ACK was queued, we recalculate
1759 	 * the ixa_pktlen. It would be better to fix the SYN/accept
1760 	 * multithreading scheme to avoid this complexity.
1761 	 */
1762 	ixa->ixa_pktlen = msgdsize(mp);
1763 	(void) conn_ip_output(mp, ixa);
1764 }
1765 
1766 /*
1767  * tcp_send() is called by tcp_wput_data() and returns one of the following:
1768  *
1769  * -1 = failed allocation.
1770  *  0 = We've either successfully sent data, or our usable send window is too
1771  *      small and we'd rather wait until later before sending again.
1772  */
1773 static int
1774 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1775     const int tcp_hdr_len, const int num_sack_blk, int *usable,
1776     uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1777 {
1778 	int		num_lso_seg = 1;
1779 	uint_t		lso_usable;
1780 	boolean_t	do_lso_send = B_FALSE;
1781 	tcp_stack_t	*tcps = tcp->tcp_tcps;
1782 	conn_t		*connp = tcp->tcp_connp;
1783 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
1784 
1785 	/*
1786 	 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
	 * the underlying connection is LSO capable. We check whether there
	 * is enough available data to initiate LSO transmission inside the
	 * for(){} loop below.
1790 	 */
1791 	if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1792 		do_lso_send = B_TRUE;
1793 
1794 	for (;;) {
1795 		struct datab	*db;
1796 		tcpha_t		*tcpha;
1797 		uint32_t	sum;
1798 		mblk_t		*mp, *mp1;
1799 		uchar_t		*rptr;
1800 		int		len;
1801 
1802 		/*
1803 		 * Calculate the maximum payload length we can send at one
1804 		 * time.
1805 		 */
1806 		if (do_lso_send) {
1807 			/*
1808 			 * Determine whether or not it's possible to do LSO,
1809 			 * and if so, how much data we can send.
1810 			 */
1811 			if ((*usable - 1) / mss >= 1) {
1812 				lso_usable = MIN(tcp->tcp_lso_max, *usable);
1813 				num_lso_seg = lso_usable / mss;
1814 				if (lso_usable % mss) {
1815 					num_lso_seg++;
1816 					tcp->tcp_last_sent_len = (ushort_t)
1817 					    (lso_usable % mss);
1818 				} else {
1819 					tcp->tcp_last_sent_len = (ushort_t)mss;
1820 				}
1821 			} else {
1822 				do_lso_send = B_FALSE;
1823 				num_lso_seg = 1;
1824 				lso_usable = mss;
1825 			}
1826 		}
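		/*
		 * For example, with an mss of 1460 and *usable of 10000
		 * (and tcp_lso_max >= 10000), lso_usable is 10000 and
		 * num_lso_seg becomes 7: six full segments plus a final
		 * 1240-byte segment recorded in tcp_last_sent_len.
		 */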
1827 
1828 		ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1829 
1830 		len = mss;
1831 		if (len > *usable) {
1832 			ASSERT(do_lso_send == B_FALSE);
1833 
1834 			len = *usable;
1835 			if (len <= 0) {
1836 				/* Terminate the loop */
1837 				break;	/* success; too small */
1838 			}
1839 			/*
1840 			 * Sender silly-window avoidance.
1841 			 * Ignore this if we are going to send a
1842 			 * zero window probe out.
1843 			 *
1844 			 * TODO: force data into microscopic window?
1845 			 *	==> (!pushed || (unsent > usable))
1846 			 */
1847 			if (len < (tcp->tcp_max_swnd >> 1) &&
1848 			    (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
1849 			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
1850 			    len == 1) && (! tcp->tcp_zero_win_probe)) {
1851 				/*
1852 				 * If the retransmit timer is not running
1853 				 * we start it so that we will retransmit
1854 				 * in the case when the receiver has
1855 				 * decremented the window.
1856 				 */
1857 				if (*snxt == tcp->tcp_snxt &&
1858 				    *snxt == tcp->tcp_suna) {
1859 					/*
1860 					 * We are not supposed to send
1861 					 * anything.  So let's wait a little
1862 					 * bit longer before breaking SWS
1863 					 * avoidance.
1864 					 *
1865 					 * What should the value be?
1866 					 * Suggestion: MAX(init rexmit time,
1867 					 * tcp->tcp_rto)
1868 					 */
1869 					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1870 				}
1871 				break;	/* success; too small */
1872 			}
1873 		}
1874 
1875 		tcpha = tcp->tcp_tcpha;
1876 
		/*
		 * Adjust len here because the flag and checksum settings
		 * below depend on the actual payload length.
		 */
1881 		if (do_lso_send)
1882 			len = lso_usable;
1883 
1884 		*usable -= len; /* Approximate - can be adjusted later */
1885 		if (*usable > 0)
1886 			tcpha->tha_flags = TH_ACK;
1887 		else
1888 			tcpha->tha_flags = (TH_ACK | TH_PUSH);
1889 
		/*
		 * Prime the pump for IP's checksumming on our behalf,
		 * including the adjustment for a source route, if any.
		 * For LSO, the partial pseudo-header checksum must exclude
		 * the TCP length, so zero tha_sum and let IP calculate the
		 * pseudo-header checksum for partial checksum offload.
		 */
1897 		if (do_lso_send) {
1898 			sum = 0;
1899 		} else {
1900 			sum = len + tcp_hdr_len + connp->conn_sum;
1901 			sum = (sum >> 16) + (sum & 0xFFFF);
1902 		}
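		/*
		 * The fold above is ordinary ones-complement carry
		 * folding: e.g. a 32-bit sum of 0x12345 becomes
		 * 0x1 + 0x2345 = 0x2346.  The result only primes the
		 * pseudo-header sum; IP (or the hardware) completes
		 * the checksum.
		 */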
1903 		tcpha->tha_sum = htons(sum);
1904 		tcpha->tha_seq = htonl(*snxt);
1905 
1906 		/*
1907 		 * Branch off to tcp_xmit_mp() if any of the VALID bits is
1908 		 * set.  For the case when TCP_FSS_VALID is the only valid
1909 		 * bit (normal active close), branch off only when we think
1910 		 * that the FIN flag needs to be set.  Note for this case,
1911 		 * that (snxt + len) may not reflect the actual seg_len,
1912 		 * as len may be further reduced in tcp_xmit_mp().  If len
1913 		 * gets modified, we will end up here again.
1914 		 */
1915 		if (tcp->tcp_valid_bits != 0 &&
1916 		    (tcp->tcp_valid_bits != TCP_FSS_VALID ||
1917 		    ((*snxt + len) == tcp->tcp_fss))) {
1918 			uchar_t		*prev_rptr;
1919 			uint32_t	prev_snxt = tcp->tcp_snxt;
1920 
1921 			if (*tail_unsent == 0) {
1922 				ASSERT((*xmit_tail)->b_cont != NULL);
1923 				*xmit_tail = (*xmit_tail)->b_cont;
1924 				prev_rptr = (*xmit_tail)->b_rptr;
1925 				*tail_unsent = (int)((*xmit_tail)->b_wptr -
1926 				    (*xmit_tail)->b_rptr);
1927 			} else {
1928 				prev_rptr = (*xmit_tail)->b_rptr;
1929 				(*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
1930 				    *tail_unsent;
1931 			}
1932 			mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
1933 			    *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
1934 			/* Restore tcp_snxt so we get amount sent right. */
1935 			tcp->tcp_snxt = prev_snxt;
1936 			if (prev_rptr == (*xmit_tail)->b_rptr) {
1937 				/*
1938 				 * If the previous timestamp is still in use,
1939 				 * don't stomp on it.
1940 				 */
1941 				if ((*xmit_tail)->b_next == NULL) {
1942 					(*xmit_tail)->b_prev = local_time;
1943 					(*xmit_tail)->b_next =
1944 					    (mblk_t *)(uintptr_t)(*snxt);
1945 				}
1946 			} else
1947 				(*xmit_tail)->b_rptr = prev_rptr;
1948 
1949 			if (mp == NULL) {
1950 				return (-1);
1951 			}
1952 			mp1 = mp->b_cont;
1953 
1954 			if (len <= mss) /* LSO is unusable (!do_lso_send) */
1955 				tcp->tcp_last_sent_len = (ushort_t)len;
1956 			while (mp1->b_cont) {
1957 				*xmit_tail = (*xmit_tail)->b_cont;
1958 				(*xmit_tail)->b_prev = local_time;
1959 				(*xmit_tail)->b_next =
1960 				    (mblk_t *)(uintptr_t)(*snxt);
1961 				mp1 = mp1->b_cont;
1962 			}
1963 			*snxt += len;
1964 			*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1965 			TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1966 			TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1967 			TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1968 			tcp->tcp_cs.tcp_out_data_segs++;
1969 			tcp->tcp_cs.tcp_out_data_bytes += len;
1970 			tcp_send_data(tcp, mp);
1971 			continue;
1972 		}
1973 
1974 		*snxt += len;	/* Adjust later if we don't send all of len */
1975 		TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
1976 		TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1977 		TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1978 		tcp->tcp_cs.tcp_out_data_segs++;
1979 		tcp->tcp_cs.tcp_out_data_bytes += len;
1980 
1981 		if (*tail_unsent) {
1982 			/* Are the bytes above us in flight? */
1983 			rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1984 			if (rptr != (*xmit_tail)->b_rptr) {
1985 				*tail_unsent -= len;
1986 				if (len <= mss) /* LSO is unusable */
1987 					tcp->tcp_last_sent_len = (ushort_t)len;
1988 				len += total_hdr_len;
1989 				ixa->ixa_pktlen = len;
1990 
1991 				if (ixa->ixa_flags & IXAF_IS_IPV4) {
1992 					tcp->tcp_ipha->ipha_length = htons(len);
1993 				} else {
1994 					tcp->tcp_ip6h->ip6_plen =
1995 					    htons(len - IPV6_HDR_LEN);
1996 				}
1997 
1998 				mp = dupb(*xmit_tail);
1999 				if (mp == NULL) {
2000 					return (-1);	/* out_of_mem */
2001 				}
2002 				mp->b_rptr = rptr;
2003 				/*
2004 				 * If the old timestamp is no longer in use,
2005 				 * sample a new timestamp now.
2006 				 */
2007 				if ((*xmit_tail)->b_next == NULL) {
2008 					(*xmit_tail)->b_prev = local_time;
2009 					(*xmit_tail)->b_next =
2010 					    (mblk_t *)(uintptr_t)(*snxt-len);
2011 				}
2012 				goto must_alloc;
2013 			}
2014 		} else {
2015 			*xmit_tail = (*xmit_tail)->b_cont;
2016 			ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
2017 			    (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
2018 			*tail_unsent = (int)((*xmit_tail)->b_wptr -
2019 			    (*xmit_tail)->b_rptr);
2020 		}
2021 
2022 		(*xmit_tail)->b_prev = local_time;
2023 		(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);
2024 
2025 		*tail_unsent -= len;
2026 		if (len <= mss) /* LSO is unusable (!do_lso_send) */
2027 			tcp->tcp_last_sent_len = (ushort_t)len;
2028 
2029 		len += total_hdr_len;
2030 		ixa->ixa_pktlen = len;
2031 
2032 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
2033 			tcp->tcp_ipha->ipha_length = htons(len);
2034 		} else {
2035 			tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2036 		}
2037 
2038 		mp = dupb(*xmit_tail);
2039 		if (mp == NULL) {
2040 			return (-1);	/* out_of_mem */
2041 		}
2042 
2043 		len = total_hdr_len;
2044 		/*
2045 		 * There are four reasons to allocate a new hdr mblk:
2046 		 *  1) The bytes above us are in use by another packet
2047 		 *  2) We don't have good alignment
2048 		 *  3) The mblk is being shared
2049 		 *  4) We don't have enough room for a header
2050 		 */
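		/*
		 * The test below covers reasons 2-4; reason 1 reaches
		 * must_alloc via the goto above.  After our dupb(),
		 * db_ref == 2 means nobody else holds the dblk.
		 */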
2051 		rptr = mp->b_rptr - len;
2052 		if (!OK_32PTR(rptr) ||
2053 		    ((db = mp->b_datap), db->db_ref != 2) ||
2054 		    rptr < db->db_base) {
2055 			/* NOTE: we assume allocb returns an OK_32PTR */
2056 
2057 		must_alloc:;
2058 			mp1 = allocb(connp->conn_ht_iphc_allocated +
2059 			    tcps->tcps_wroff_xtra, BPRI_MED);
2060 			if (mp1 == NULL) {
2061 				freemsg(mp);
2062 				return (-1);	/* out_of_mem */
2063 			}
2064 			mp1->b_cont = mp;
2065 			mp = mp1;
2066 			/* Leave room for Link Level header */
2067 			len = total_hdr_len;
2068 			rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2069 			mp->b_wptr = &rptr[len];
2070 		}
2071 
2072 		/*
2073 		 * Fill in the header using the template header, and add
2074 		 * options such as time-stamp, ECN and/or SACK, as needed.
2075 		 */
2076 		tcp_fill_header(tcp, rptr, num_sack_blk);
2077 
2078 		mp->b_rptr = rptr;
2079 
2080 		if (*tail_unsent) {
2081 			int spill = *tail_unsent;
2082 
2083 			mp1 = mp->b_cont;
2084 			if (mp1 == NULL)
2085 				mp1 = mp;
2086 
2087 			/*
2088 			 * If we're a little short, tack on more mblks until
2089 			 * there is no more spillover.
2090 			 */
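			/*
			 * spill is negative by the number of bytes still
			 * owed: e.g. spill == -100 with a 300-byte nmp
			 * leaves spill == 200, which is trimmed back below.
			 */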
2091 			while (spill < 0) {
2092 				mblk_t *nmp;
2093 				int nmpsz;
2094 
2095 				nmp = (*xmit_tail)->b_cont;
2096 				nmpsz = MBLKL(nmp);
2097 
2098 				/*
2099 				 * Excess data in mblk; can we split it?
2100 				 * If LSO is enabled for the connection,
2101 				 * keep on splitting as this is a transient
2102 				 * send path.
2103 				 */
2104 				if (!do_lso_send && (spill + nmpsz > 0)) {
2105 					/*
2106 					 * Don't split if stream head was
2107 					 * told to break up larger writes
2108 					 * into smaller ones.
2109 					 */
2110 					if (tcp->tcp_maxpsz_multiplier > 0)
2111 						break;
2112 
					/*
					 * Next mblk is less than SMSS/2
					 * rounded up to the nearest 64
					 * bytes; let it get sent as part
					 * of the next segment.
					 */
2119 					if (tcp->tcp_localnet &&
2120 					    !tcp->tcp_cork &&
2121 					    (nmpsz < roundup((mss >> 1), 64)))
2122 						break;
2123 				}
2124 
2125 				*xmit_tail = nmp;
2126 				ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX);
2127 				/* Stash for rtt use later */
2128 				(*xmit_tail)->b_prev = local_time;
2129 				(*xmit_tail)->b_next =
2130 				    (mblk_t *)(uintptr_t)(*snxt - len);
2131 				mp1->b_cont = dupb(*xmit_tail);
2132 				mp1 = mp1->b_cont;
2133 
2134 				spill += nmpsz;
2135 				if (mp1 == NULL) {
2136 					*tail_unsent = spill;
2137 					freemsg(mp);
2138 					return (-1);	/* out_of_mem */
2139 				}
2140 			}
2141 
2142 			/* Trim back any surplus on the last mblk */
2143 			if (spill >= 0) {
2144 				mp1->b_wptr -= spill;
2145 				*tail_unsent = spill;
2146 			} else {
2147 				/*
2148 				 * We did not send everything we could in
2149 				 * order to remain within the b_cont limit.
2150 				 */
2151 				*usable -= spill;
2152 				*snxt += spill;
2153 				tcp->tcp_last_sent_len += spill;
2154 				TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
2155 				tcp->tcp_cs.tcp_out_data_bytes += spill;
2156 				/*
2157 				 * Adjust the checksum
2158 				 */
2159 				tcpha = (tcpha_t *)(rptr +
2160 				    ixa->ixa_ip_hdr_length);
2161 				sum += spill;
2162 				sum = (sum >> 16) + (sum & 0xFFFF);
2163 				tcpha->tha_sum = htons(sum);
2164 				if (connp->conn_ipversion == IPV4_VERSION) {
2165 					sum = ntohs(
2166 					    ((ipha_t *)rptr)->ipha_length) +
2167 					    spill;
2168 					((ipha_t *)rptr)->ipha_length =
2169 					    htons(sum);
2170 				} else {
2171 					sum = ntohs(
2172 					    ((ip6_t *)rptr)->ip6_plen) +
2173 					    spill;
2174 					((ip6_t *)rptr)->ip6_plen =
2175 					    htons(sum);
2176 				}
2177 				ixa->ixa_pktlen += spill;
2178 				*tail_unsent = 0;
2179 			}
2180 		}
2181 		if (tcp->tcp_ip_forward_progress) {
2182 			tcp->tcp_ip_forward_progress = B_FALSE;
2183 			ixa->ixa_flags |= IXAF_REACH_CONF;
2184 		} else {
2185 			ixa->ixa_flags &= ~IXAF_REACH_CONF;
2186 		}
2187 
2188 		if (do_lso_send) {
2189 			/* Append LSO information to the mp. */
2190 			lso_info_set(mp, mss, HW_LSO);
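			/*
			 * Presumably ixa_fragsize lets the whole LSO blob
			 * through IP, while ixa_extra_ident reserves IP
			 * ident values for the extra packets the NIC will
			 * carve out of it.
			 */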
2191 			ixa->ixa_fragsize = IP_MAXPACKET;
2192 			ixa->ixa_extra_ident = num_lso_seg - 1;
2193 
2194 			DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2195 			    boolean_t, B_TRUE);
2196 
2197 			tcp_send_data(tcp, mp);
2198 
2199 			/*
2200 			 * Restore values of ixa_fragsize and ixa_extra_ident.
2201 			 */
2202 			ixa->ixa_fragsize = ixa->ixa_pmtu;
2203 			ixa->ixa_extra_ident = 0;
2204 			TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2205 			TCP_STAT(tcps, tcp_lso_times);
2206 			TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2207 		} else {
2208 			/*
2209 			 * Make sure to clean up LSO information. Wherever a
2210 			 * new mp uses the prepended header room after dupb(),
2211 			 * lso_info_cleanup() should be called.
2212 			 */
2213 			lso_info_cleanup(mp);
2214 			tcp_send_data(tcp, mp);
2215 			TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2216 		}
2217 	}
2218 
2219 	return (0);
2220 }
2221 
2222 /*
2223  * Initiate closedown sequence on an active connection.  (May be called as
2224  * writer.)  Return value zero for OK return, non-zero for error return.
2225  */
2226 static int
2227 tcp_xmit_end(tcp_t *tcp)
2228 {
2229 	mblk_t		*mp;
2230 	tcp_stack_t	*tcps = tcp->tcp_tcps;
2231 	iulp_t		uinfo;
2232 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
2233 	conn_t		*connp = tcp->tcp_connp;
2234 
2235 	if (tcp->tcp_state < TCPS_SYN_RCVD ||
2236 	    tcp->tcp_state > TCPS_CLOSE_WAIT) {
		/*
		 * Invalid state; only TCPS_SYN_RCVD, TCPS_ESTABLISHED
		 * and TCPS_CLOSE_WAIT are valid.
		 */
2241 		return (-1);
2242 	}
2243 
2244 	tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
2245 	tcp->tcp_valid_bits |= TCP_FSS_VALID;
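	/*
	 * The FIN itself consumes sequence number tcp_fss, which sits
	 * just past all currently queued unsent data.
	 */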
2246 	/*
2247 	 * If there is nothing more unsent, send the FIN now.
2248 	 * Otherwise, it will go out with the last segment.
2249 	 */
2250 	if (tcp->tcp_unsent == 0) {
2251 		mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
2252 		    tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
2253 
2254 		if (mp) {
2255 			tcp_send_data(tcp, mp);
2256 		} else {
2257 			/*
2258 			 * Couldn't allocate msg.  Pretend we got it out.
2259 			 * Wait for rexmit timeout.
2260 			 */
2261 			tcp->tcp_snxt = tcp->tcp_fss + 1;
2262 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
2263 		}
2264 
2265 		/*
2266 		 * If needed, update tcp_rexmit_snxt as tcp_snxt is
2267 		 * changed.
2268 		 */
2269 		if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
2270 			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2271 		}
2272 	} else {
2273 		/*
2274 		 * If tcp->tcp_cork is set, then the data will not get sent,
2275 		 * so we have to check that and unset it first.
2276 		 */
2277 		if (tcp->tcp_cork)
2278 			tcp->tcp_cork = B_FALSE;
2279 		tcp_wput_data(tcp, NULL, B_FALSE);
2280 	}
2281 
	/*
	 * If TCP has not gotten enough RTT samples, or tcp_rtt_updates
	 * is 0, don't update the destination cache.
	 */
2286 	if (tcps->tcps_rtt_updates == 0 ||
2287 	    tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2288 		return (0);
2289 
2290 	/*
2291 	 * We do not have a good algorithm to update ssthresh at this time.
2292 	 * So don't do any update.
2293 	 */
2294 	bzero(&uinfo, sizeof (uinfo));
2295 	uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
2296 	uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
2297 
	/*
	 * Note that uinfo is kept for conn_faddr in the DCE.  We could
	 * update it even when the connection is source routed, but we don't.
	 */
2302 	if (connp->conn_ipversion == IPV4_VERSION) {
2303 		if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2304 			return (0);
2305 		}
2306 		(void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2307 	} else {
2308 		uint_t ifindex;
2309 
2310 		if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2311 		    &tcp->tcp_ip6h->ip6_dst))) {
2312 			return (0);
2313 		}
2314 		ifindex = 0;
2315 		if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2316 			ip_xmit_attr_t *ixa = connp->conn_ixa;
2317 
2318 			/*
2319 			 * If we are going to create a DCE we'd better have
2320 			 * an ifindex
2321 			 */
2322 			if (ixa->ixa_nce != NULL) {
2323 				ifindex = ixa->ixa_nce->nce_common->ncec_ill->
2324 				    ill_phyint->phyint_ifindex;
2325 			} else {
2326 				return (0);
2327 			}
2328 		}
2329 
2330 		(void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo,
2331 		    ipst);
2332 	}
2333 	return (0);
2334 }
2335 
2336 /*
2337  * Send out a control packet on the tcp connection specified.  This routine
2338  * is typically called where we need a simple ACK or RST generated.
2339  */
2340 void
2341 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
2342 {
2343 	uchar_t		*rptr;
2344 	tcpha_t		*tcpha;
2345 	ipha_t		*ipha = NULL;
2346 	ip6_t		*ip6h = NULL;
2347 	uint32_t	sum;
2348 	int		total_hdr_len;
2349 	int		ip_hdr_len;
2350 	mblk_t		*mp;
2351 	tcp_stack_t	*tcps = tcp->tcp_tcps;
2352 	conn_t		*connp = tcp->tcp_connp;
2353 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
2354 
2355 	/*
2356 	 * Save sum for use in source route later.
2357 	 */
2358 	sum = connp->conn_ht_ulp_len + connp->conn_sum;
2359 	total_hdr_len = connp->conn_ht_iphc_len;
2360 	ip_hdr_len = ixa->ixa_ip_hdr_length;
2361 
2362 	/* If a text string is passed in with the request, pass it to strlog. */
2363 	if (str != NULL && connp->conn_debug) {
2364 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2365 		    "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
2366 		    str, seq, ack, ctl);
2367 	}
2368 	mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
2369 	    BPRI_MED);
2370 	if (mp == NULL) {
2371 		return;
2372 	}
2373 	rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2374 	mp->b_rptr = rptr;
2375 	mp->b_wptr = &rptr[total_hdr_len];
2376 	bcopy(connp->conn_ht_iphc, rptr, total_hdr_len);
2377 
2378 	ixa->ixa_pktlen = total_hdr_len;
2379 
2380 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2381 		ipha = (ipha_t *)rptr;
2382 		ipha->ipha_length = htons(total_hdr_len);
2383 	} else {
2384 		ip6h = (ip6_t *)rptr;
2385 		ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
2386 	}
2387 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2388 	tcpha->tha_flags = (uint8_t)ctl;
2389 	if (ctl & TH_RST) {
2390 		TCPS_BUMP_MIB(tcps, tcpOutRsts);
2391 		TCPS_BUMP_MIB(tcps, tcpOutControl);
2392 		/*
2393 		 * Don't send TSopt w/ TH_RST packets per RFC 1323.
2394 		 */
2395 		if (tcp->tcp_snd_ts_ok &&
2396 		    tcp->tcp_state > TCPS_SYN_SENT) {
2397 			mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN];
2398 			*(mp->b_wptr) = TCPOPT_EOL;
2399 
2400 			ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN;
2401 
2402 			if (connp->conn_ipversion == IPV4_VERSION) {
2403 				ipha->ipha_length = htons(total_hdr_len -
2404 				    TCPOPT_REAL_TS_LEN);
2405 			} else {
2406 				ip6h->ip6_plen = htons(total_hdr_len -
2407 				    IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN);
2408 			}
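			/*
			 * Dropping the 12-byte timestamp option shrinks
			 * the TCP data offset (the upper four bits of
			 * this field) by three words.
			 */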
2409 			tcpha->tha_offset_and_reserved -= (3 << 4);
2410 			sum -= TCPOPT_REAL_TS_LEN;
2411 		}
2412 	}
2413 	if (ctl & TH_ACK) {
2414 		if (tcp->tcp_snd_ts_ok) {
2415 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2416 
2417 			U32_TO_BE32(llbolt,
2418 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2419 			U32_TO_BE32(tcp->tcp_ts_recent,
2420 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2421 		}
2422 
2423 		/* Update the latest receive window size in TCP header. */
2424 		tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2425 		/* Track what we sent to the peer */
2426 		tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2427 		tcp->tcp_rack = ack;
2428 		tcp->tcp_rack_cnt = 0;
2429 		TCPS_BUMP_MIB(tcps, tcpOutAck);
2430 	}
2431 	TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2432 	tcpha->tha_seq = htonl(seq);
2433 	tcpha->tha_ack = htonl(ack);
2434 	/*
2435 	 * Include the adjustment for a source route if any.
2436 	 */
2437 	sum = (sum >> 16) + (sum & 0xFFFF);
2438 	tcpha->tha_sum = htons(sum);
2439 	tcp_send_data(tcp, mp);
2440 }
2441 
2442 /*
 * Generate a reset based on an inbound packet; connp is set by the caller
 * when the RST is in response to an unexpected inbound packet for which
 * there is active tcp state in the system.
2446  *
2447  * IPSEC NOTE : Try to send the reply with the same protection as it came
2448  * in.  We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2449  * That way the packet will go out at the same level of protection as it
2450  * came in with.
2451  */
2452 static void
2453 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl,
2454     ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp)
2455 {
2456 	ipha_t		*ipha = NULL;
2457 	ip6_t		*ip6h = NULL;
2458 	ushort_t	len;
2459 	tcpha_t		*tcpha;
2460 	int		i;
2461 	ipaddr_t	v4addr;
2462 	in6_addr_t	v6addr;
2463 	netstack_t	*ns = ipst->ips_netstack;
2464 	tcp_stack_t	*tcps = ns->netstack_tcp;
2465 	ip_xmit_attr_t	ixas, *ixa;
2466 	uint_t		ip_hdr_len = ira->ira_ip_hdr_length;
2467 	boolean_t	need_refrele = B_FALSE;		/* ixa_refrele(ixa) */
2468 	ushort_t	port;
2469 
2470 	if (!tcp_send_rst_chk(tcps)) {
2471 		TCP_STAT(tcps, tcp_rst_unsent);
2472 		freemsg(mp);
2473 		return;
2474 	}
2475 
2476 	/*
2477 	 * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
2478 	 * options from the listener. In that case the caller must ensure that
2479 	 * we are running on the listener = connp squeue.
2480 	 *
2481 	 * We get a safe copy of conn_ixa so we don't need to restore anything
2482 	 * we or ip_output_simple might change in the ixa.
2483 	 */
2484 	if (connp != NULL) {
2485 		ASSERT(connp->conn_on_sqp);
2486 
2487 		ixa = conn_get_ixa_exclusive(connp);
2488 		if (ixa == NULL) {
2489 			TCP_STAT(tcps, tcp_rst_unsent);
2490 			freemsg(mp);
2491 			return;
2492 		}
2493 		need_refrele = B_TRUE;
2494 	} else {
2495 		bzero(&ixas, sizeof (ixas));
2496 		ixa = &ixas;
2497 		/*
2498 		 * IXAF_VERIFY_SOURCE is overkill since we know the
2499 		 * packet was for us.
2500 		 */
2501 		ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE;
2502 		ixa->ixa_protocol = IPPROTO_TCP;
2503 		ixa->ixa_zoneid = ira->ira_zoneid;
2504 		ixa->ixa_ifindex = 0;
2505 		ixa->ixa_ipst = ipst;
2506 		ixa->ixa_cred = kcred;
2507 		ixa->ixa_cpid = NOPID;
2508 	}
2509 
2510 	if (str && tcps->tcps_dbg) {
2511 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2512 		    "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
2513 		    "flags 0x%x",
2514 		    str, seq, ack, ctl);
2515 	}
2516 	if (mp->b_datap->db_ref != 1) {
2517 		mblk_t *mp1 = copyb(mp);
2518 		freemsg(mp);
2519 		mp = mp1;
2520 		if (mp == NULL)
2521 			goto done;
2522 	} else if (mp->b_cont) {
2523 		freemsg(mp->b_cont);
2524 		mp->b_cont = NULL;
2525 		DB_CKSUMFLAGS(mp) = 0;
2526 	}
	/*
	 * We skip reversing the source route here; for now we replace
	 * all IP options with EOL.
	 */
2531 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2532 		ipha = (ipha_t *)mp->b_rptr;
2533 		for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
2534 			mp->b_rptr[i] = IPOPT_EOL;
2535 		/*
2536 		 * Make sure that src address isn't flagrantly invalid.
2537 		 * Not all broadcast address checking for the src address
2538 		 * is possible, since we don't know the netmask of the src
2539 		 * addr.  No check for destination address is done, since
2540 		 * IP will not pass up a packet with a broadcast dest
2541 		 * address to TCP.  Similar checks are done below for IPv6.
2542 		 */
2543 		if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
2544 		    CLASSD(ipha->ipha_src)) {
2545 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2546 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2547 			freemsg(mp);
2548 			goto done;
2549 		}
2550 	} else {
2551 		ip6h = (ip6_t *)mp->b_rptr;
2552 
2553 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
2554 		    IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
2555 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
2556 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2557 			freemsg(mp);
2558 			goto done;
2559 		}
2560 
2561 		/* Remove any extension headers assuming partial overlay */
2562 		if (ip_hdr_len > IPV6_HDR_LEN) {
2563 			uint8_t *to;
2564 
2565 			to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
2566 			ovbcopy(ip6h, to, IPV6_HDR_LEN);
2567 			mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
2568 			ip_hdr_len = IPV6_HDR_LEN;
2569 			ip6h = (ip6_t *)mp->b_rptr;
2570 			ip6h->ip6_nxt = IPPROTO_TCP;
2571 		}
2572 	}
2573 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
2574 	if (tcpha->tha_flags & TH_RST) {
2575 		freemsg(mp);
2576 		goto done;
2577 	}
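	/*
	 * Force a minimal 20-byte TCP header: the data offset lives in
	 * the upper four bits of this field, and five words means no
	 * TCP options.
	 */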
2578 	tcpha->tha_offset_and_reserved = (5 << 4);
2579 	len = ip_hdr_len + sizeof (tcpha_t);
2580 	mp->b_wptr = &mp->b_rptr[len];
2581 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2582 		ipha->ipha_length = htons(len);
2583 		/* Swap addresses */
2584 		v4addr = ipha->ipha_src;
2585 		ipha->ipha_src = ipha->ipha_dst;
2586 		ipha->ipha_dst = v4addr;
2587 		ipha->ipha_ident = 0;
2588 		ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
2589 		ixa->ixa_flags |= IXAF_IS_IPV4;
2590 		ixa->ixa_ip_hdr_length = ip_hdr_len;
2591 	} else {
2592 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2593 		/* Swap addresses */
2594 		v6addr = ip6h->ip6_src;
2595 		ip6h->ip6_src = ip6h->ip6_dst;
2596 		ip6h->ip6_dst = v6addr;
2597 		ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit;
2598 		ixa->ixa_flags &= ~IXAF_IS_IPV4;
2599 
2600 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
2601 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
2602 			ixa->ixa_scopeid = ira->ira_ruifindex;
2603 		}
2604 		ixa->ixa_ip_hdr_length = IPV6_HDR_LEN;
2605 	}
2606 	ixa->ixa_pktlen = len;
2607 
2608 	/* Swap the ports */
2609 	port = tcpha->tha_fport;
2610 	tcpha->tha_fport = tcpha->tha_lport;
2611 	tcpha->tha_lport = port;
2612 
2613 	tcpha->tha_ack = htonl(ack);
2614 	tcpha->tha_seq = htonl(seq);
2615 	tcpha->tha_win = 0;
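	/*
	 * Seed tha_sum with the TCP length (just the 20-byte header);
	 * the rest of the pseudo-header checksum is presumably filled
	 * in by IP when it sends the packet.
	 */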
2616 	tcpha->tha_sum = htons(sizeof (tcpha_t));
2617 	tcpha->tha_flags = (uint8_t)ctl;
2618 	if (ctl & TH_RST) {
2619 		if (ctl & TH_ACK) {
2620 			/*
2621 			 * Probe connection rejection here.
2622 			 * tcp_xmit_listeners_reset() drops non-SYN segments
2623 			 * that do not specify TH_ACK in their flags without
2624 			 * calling this function.  As a consequence, if this
2625 			 * function is called with a TH_RST|TH_ACK ctl argument,
2626 			 * it is being called in response to a SYN segment
2627 			 * and thus the tcp:::accept-refused probe point
2628 			 * is valid here.
2629 			 */
2630 			DTRACE_TCP5(accept__refused, mblk_t *, NULL,
2631 			    void, NULL, void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2632 			    tcph_t *, tcpha);
2633 		}
2634 		TCPS_BUMP_MIB(tcps, tcpOutRsts);
2635 		TCPS_BUMP_MIB(tcps, tcpOutControl);
2636 	}
2637 
2638 	/* Discard any old label */
2639 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
2640 		ASSERT(ixa->ixa_tsl != NULL);
2641 		label_rele(ixa->ixa_tsl);
2642 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
2643 	}
2644 	ixa->ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
2645 
2646 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2647 		/*
2648 		 * Apply IPsec based on how IPsec was applied to
2649 		 * the packet that caused the RST.
2650 		 */
2651 		if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) {
2652 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2653 			/* Note: mp already consumed and ip_drop_packet done */
2654 			goto done;
2655 		}
2656 	} else {
2657 		/*
2658 		 * This is in clear. The RST message we are building
2659 		 * here should go out in clear, independent of our policy.
2660 		 */
2661 		ixa->ixa_flags |= IXAF_NO_IPSEC;
2662 	}
2663 
2664 	DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, ixa,
2665 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2666 	    __dtrace_tcp_tcph_t *, tcpha);
2667 
2668 	/*
2669 	 * NOTE:  one might consider tracing a TCP packet here, but
2670 	 * this function has no active TCP state and no tcp structure
2671 	 * that has a trace buffer.  If we traced here, we would have
2672 	 * to keep a local trace buffer in tcp_record_trace().
2673 	 */
2674 
2675 	(void) ip_output_simple(mp, ixa);
2676 done:
2677 	ixa_cleanup(ixa);
2678 	if (need_refrele) {
2679 		ASSERT(ixa != &ixas);
2680 		ixa_refrele(ixa);
2681 	}
2682 }
2683 
2684 /*
2685  * Generate a "no listener here" RST in response to an "unknown" segment.
2686  * connp is set by caller when RST is in response to an unexpected
2687  * inbound packet for which there is active tcp state in the system.
2688  * Note that we are reusing the incoming mp to construct the outgoing RST.
2689  */
2690 void
2691 tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
2692     conn_t *connp)
2693 {
2694 	uchar_t		*rptr;
2695 	uint32_t	seg_len;
2696 	tcpha_t		*tcpha;
2697 	uint32_t	seg_seq;
2698 	uint32_t	seg_ack;
2699 	uint_t		flags;
2700 	ipha_t		*ipha;
2701 	ip6_t		*ip6h;
2702 	boolean_t	policy_present;
2703 	netstack_t	*ns = ipst->ips_netstack;
2704 	tcp_stack_t	*tcps = ns->netstack_tcp;
2705 	ipsec_stack_t	*ipss = tcps->tcps_netstack->netstack_ipsec;
2706 	uint_t		ip_hdr_len = ira->ira_ip_hdr_length;
2707 
2708 	TCP_STAT(tcps, tcp_no_listener);
2709 
2710 	/*
2711 	 * DTrace this "unknown" segment as a tcp:::receive, as we did
2712 	 * just receive something that was TCP.
2713 	 */
2714 	DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, NULL,
2715 	    __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, NULL,
2716 	    __dtrace_tcp_tcph_t *, &mp->b_rptr[ip_hdr_len]);
2717 
2718 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2719 		policy_present = ipss->ipsec_inbound_v4_policy_present;
2720 		ipha = (ipha_t *)mp->b_rptr;
2721 		ip6h = NULL;
2722 	} else {
2723 		policy_present = ipss->ipsec_inbound_v6_policy_present;
2724 		ipha = NULL;
2725 		ip6h = (ip6_t *)mp->b_rptr;
2726 	}
2727 
2728 	if (policy_present) {
2729 		/*
2730 		 * The conn_t parameter is NULL because we already know
2731 		 * nobody's home.
2732 		 */
2733 		mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h,
2734 		    ira, ns);
2735 		if (mp == NULL)
2736 			return;
2737 	}
2738 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
2739 		DTRACE_PROBE2(
2740 		    tx__ip__log__error__nolistener__tcp,
2741 		    char *, "Could not reply with RST to mp(1)",
2742 		    mblk_t *, mp);
2743 		ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
2744 		freemsg(mp);
2745 		return;
2746 	}
2747 
2748 	rptr = mp->b_rptr;
2749 
2750 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2751 	seg_seq = ntohl(tcpha->tha_seq);
2752 	seg_ack = ntohl(tcpha->tha_ack);
2753 	flags = tcpha->tha_flags;
2754 
2755 	seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len);
2756 	if (flags & TH_RST) {
2757 		freemsg(mp);
2758 	} else if (flags & TH_ACK) {
2759 		tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST,
2760 		    ira, ipst, connp);
2761 	} else {
2762 		if (flags & TH_SYN) {
2763 			seg_len++;
2764 		} else {
2765 			/*
2766 			 * Here we violate the RFC.  Note that a normal
2767 			 * TCP will never send a segment without the ACK
2768 			 * flag, except for RST or SYN segment.  This
2769 			 * segment is neither.  Just drop it on the
2770 			 * floor.
2771 			 */
2772 			freemsg(mp);
2773 			TCP_STAT(tcps, tcp_rst_unsent);
2774 			return;
2775 		}
2776 
2777 		tcp_xmit_early_reset("no tcp, reset/ack", mp, 0,
2778 		    seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp);
2779 	}
2780 }
2781 
2782 /*
2783  * Helper function for tcp_xmit_mp() in handling connection set up flag
2784  * options setting.
2785  */
2786 static void
2787 tcp_xmit_mp_aux_iss(tcp_t *tcp, conn_t *connp, tcpha_t *tcpha, mblk_t *mp,
2788     uint_t *flags)
2789 {
2790 	uint32_t u1;
2791 	uint8_t	*wptr = mp->b_wptr;
2792 	tcp_stack_t *tcps = tcp->tcp_tcps;
2793 	boolean_t add_sack = B_FALSE;
2794 
2795 	/*
2796 	 * If TCP_ISS_VALID and the seq number is tcp_iss,
2797 	 * TCP can only be in SYN-SENT, SYN-RCVD or
2798 	 * FIN-WAIT-1 state.  It can be FIN-WAIT-1 if
2799 	 * our SYN is not ack'ed but the app closes this
2800 	 * TCP connection.
2801 	 */
2802 	ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
2803 	    tcp->tcp_state == TCPS_SYN_RCVD ||
2804 	    tcp->tcp_state == TCPS_FIN_WAIT_1);
2805 
	/*
	 * Tack on the MSS option.  It is always needed
	 * for both active and passive open.
	 *
	 * Per RFC 793 the MSS option value should be the
	 * interface MTU minus the minimum TCP/IP header,
	 * since it represents the maximum segment size TCP
	 * can receive.  But to get around some broken
	 * middle boxes/end hosts out there, we allow the
	 * option value to be the same as the peer's MSS
	 * option value.  In this way, the other side will
	 * not send anything larger than it can receive.
	 *
	 * Note that for SYN_SENT state, the ndd param
	 * tcp_use_smss_as_mss_opt has no effect as we
	 * don't know the peer's MSS option value.  So
	 * the only case we need to take care of is in
	 * SYN_RCVD state, which is done later.
	 */
2825 	wptr[0] = TCPOPT_MAXSEG;
2826 	wptr[1] = TCPOPT_MAXSEG_LEN;
2827 	wptr += 2;
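	/*
	 * Advertised MSS = initial PMTU - IP header - minimum TCP
	 * header; e.g. IPv4 with a 1500-byte PMTU yields 1460.
	 */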
2828 	u1 = tcp->tcp_initial_pmtu - (connp->conn_ipversion == IPV4_VERSION ?
2829 	    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH;
2830 	U16_TO_BE16(u1, wptr);
2831 	wptr += 2;
2832 
2833 	/* Update the offset to cover the additional word */
2834 	tcpha->tha_offset_and_reserved += (1 << 4);
2835 
2836 	switch (tcp->tcp_state) {
2837 	case TCPS_SYN_SENT:
2838 		*flags = TH_SYN;
2839 
2840 		if (tcp->tcp_snd_sack_ok)
2841 			add_sack = B_TRUE;
2842 
2843 		if (tcp->tcp_snd_ts_ok) {
2844 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2845 
2846 			if (add_sack) {
2847 				wptr[0] = TCPOPT_SACK_PERMITTED;
2848 				wptr[1] = TCPOPT_SACK_OK_LEN;
2849 				add_sack = B_FALSE;
2850 			} else {
2851 				wptr[0] = TCPOPT_NOP;
2852 				wptr[1] = TCPOPT_NOP;
2853 			}
2854 			wptr[2] = TCPOPT_TSTAMP;
2855 			wptr[3] = TCPOPT_TSTAMP_LEN;
2856 			wptr += 4;
2857 			U32_TO_BE32(llbolt, wptr);
2858 			wptr += 4;
2859 			ASSERT(tcp->tcp_ts_recent == 0);
2860 			U32_TO_BE32(0L, wptr);
2861 			wptr += 4;
2862 			tcpha->tha_offset_and_reserved += (3 << 4);
2863 		}
2864 
2865 		/*
		 * Set up all the bits to tell the other side
2867 		 * we are ECN capable.
2868 		 */
2869 		if (tcp->tcp_ecn_ok)
2870 			*flags |= (TH_ECE | TH_CWR);
2871 
2872 		break;
2873 
2874 	case TCPS_SYN_RCVD:
2875 		*flags |= TH_SYN;
2876 
		/*
		 * Reset the MSS option value to be SMSS.
		 * We should probably add back the bytes for
		 * the timestamp option and IPsec.  We don't
		 * do that because this is a workaround for
		 * broken middle boxes/end hosts, and it is
		 * better for us to be more cautious: they
		 * may not take these things into account in
		 * their SMSS calculation, so the peer's
		 * calculated SMSS may be smaller than what
		 * it could be.  This should be OK.
		 */
2889 		if (tcps->tcps_use_smss_as_mss_opt) {
2890 			u1 = tcp->tcp_mss;
2891 			/*
2892 			 * Note that wptr points just past the MSS
2893 			 * option value.
2894 			 */
2895 			U16_TO_BE16(u1, wptr - 2);
2896 		}
2897 
2898 		/*
2899 		 * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD
		 * when the peer also uses the timestamps option, and
2901 		 * the TCP header template must have already been
2902 		 * updated to include the timestamps option.
2903 		 */
2904 		if (tcp->tcp_snd_sack_ok) {
2905 			if (tcp->tcp_snd_ts_ok) {
2906 				uint8_t *tmp_wptr;
2907 
				/*
				 * Use the NOP in the header just
				 * before the timestamps option.
				 */
2912 				tmp_wptr = (uint8_t *)tcpha +
2913 				    TCP_MIN_HEADER_LENGTH;
2914 				ASSERT(tmp_wptr[0] == TCPOPT_NOP &&
2915 				    tmp_wptr[1] == TCPOPT_NOP);
2916 				tmp_wptr[0] = TCPOPT_SACK_PERMITTED;
2917 				tmp_wptr[1] = TCPOPT_SACK_OK_LEN;
2918 			} else {
2919 				add_sack = B_TRUE;
2920 			}
		}

2924 		/*
2925 		 * If the other side is ECN capable, reply
2926 		 * that we are also ECN capable.
2927 		 */
2928 		if (tcp->tcp_ecn_ok)
2929 			*flags |= TH_ECE;
2930 		break;
2931 
2932 	default:
2933 		/*
		 * The ASSERT() above ensures that we must be
		 * in the FIN-WAIT-1 state here.  Our SYN has
		 * not been ack'ed, so retransmit it.
2937 		 */
2938 		*flags |= TH_SYN;
2939 		break;
2940 	}
2941 
2942 	if (add_sack) {
2943 		wptr[0] = TCPOPT_NOP;
2944 		wptr[1] = TCPOPT_NOP;
2945 		wptr[2] = TCPOPT_SACK_PERMITTED;
2946 		wptr[3] = TCPOPT_SACK_OK_LEN;
2947 		wptr += TCPOPT_REAL_SACK_OK_LEN;
2948 		tcpha->tha_offset_and_reserved += (1 << 4);
2949 	}
2950 
2951 	if (tcp->tcp_snd_ws_ok) {
2952 		wptr[0] =  TCPOPT_NOP;
2953 		wptr[1] =  TCPOPT_WSCALE;
2954 		wptr[2] =  TCPOPT_WS_LEN;
2955 		wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
2956 		wptr += TCPOPT_REAL_WS_LEN;
2957 		tcpha->tha_offset_and_reserved += (1 << 4);
2958 	}
2959 
2960 	mp->b_wptr = wptr;
2961 	u1 = (int)(mp->b_wptr - mp->b_rptr);
2962 	/*
2963 	 * Get IP set to checksum on our behalf
2964 	 * Include the adjustment for a source route if any.
2965 	 */
2966 	u1 += connp->conn_sum;
2967 	u1 = (u1 >> 16) + (u1 & 0xFFFF);
2968 	tcpha->tha_sum = htons(u1);
2969 	TCPS_BUMP_MIB(tcps, tcpOutControl);
2970 }
2971 
2972 /*
2973  * Helper function for tcp_xmit_mp() in handling connection tear down
2974  * flag setting and state changes.
2975  */
2976 static void
2977 tcp_xmit_mp_aux_fss(tcp_t *tcp, ip_xmit_attr_t *ixa, uint_t *flags)
2978 {
2979 	if (!tcp->tcp_fin_acked) {
2980 		*flags |= TH_FIN;
2981 		TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutControl);
2982 	}
2983 	if (!tcp->tcp_fin_sent) {
2984 		tcp->tcp_fin_sent = B_TRUE;
2985 		switch (tcp->tcp_state) {
2986 		case TCPS_SYN_RCVD:
2987 			tcp->tcp_state = TCPS_FIN_WAIT_1;
2988 			DTRACE_TCP6(state__change, void, NULL,
2989 			    ip_xmit_attr_t *, ixa, void, NULL,
2990 			    tcp_t *, tcp, void, NULL,
2991 			    int32_t, TCPS_SYN_RCVD);
2992 			break;
2993 		case TCPS_ESTABLISHED:
2994 			tcp->tcp_state = TCPS_FIN_WAIT_1;
2995 			DTRACE_TCP6(state__change, void, NULL,
2996 			    ip_xmit_attr_t *, ixa, void, NULL,
2997 			    tcp_t *, tcp, void, NULL,
2998 			    int32_t, TCPS_ESTABLISHED);
2999 			break;
3000 		case TCPS_CLOSE_WAIT:
3001 			tcp->tcp_state = TCPS_LAST_ACK;
3002 			DTRACE_TCP6(state__change, void, NULL,
3003 			    ip_xmit_attr_t *, ixa, void, NULL,
3004 			    tcp_t *, tcp, void, NULL,
3005 			    int32_t, TCPS_CLOSE_WAIT);
3006 			break;
3007 		}
3008 		if (tcp->tcp_suna == tcp->tcp_snxt)
3009 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3010 		tcp->tcp_snxt = tcp->tcp_fss + 1;
3011 	}
3012 }
3013 
3014 /*
3015  * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
 * IP and TCP headers, ready to pass down to IP.  If the mp passed in is
 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
 * mblk.  (If sendall is not set, the dup'ing will stop at an mblk boundary;
 * otherwise it will dup partial mblks.)
3020  * Otherwise, an appropriate ACK packet will be generated.  This
3021  * routine is not usually called to send new data for the first time.  It
3022  * is mostly called out of the timer for retransmits, and to generate ACKs.
3023  *
3024  * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
3025  * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
3026  * of the original mblk chain will be returned in *offset and *end_mp.
3027  */
3028 mblk_t *
3029 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
3030     mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
3031     boolean_t rexmit)
3032 {
3033 	int	data_length;
3034 	int32_t	off = 0;
3035 	uint_t	flags;
3036 	mblk_t	*mp1;
3037 	mblk_t	*mp2;
3038 	uchar_t	*rptr;
3039 	tcpha_t	*tcpha;
3040 	int32_t	num_sack_blk = 0;
3041 	int32_t	sack_opt_len = 0;
3042 	tcp_stack_t	*tcps = tcp->tcp_tcps;
3043 	conn_t		*connp = tcp->tcp_connp;
3044 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
3045 
3046 	/* Allocate for our maximum TCP header + link-level */
3047 	mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
3048 	    BPRI_MED);
3049 	if (mp1 == NULL)
3050 		return (NULL);
3051 	data_length = 0;
3052 
3053 	/*
3054 	 * Note that tcp_mss has been adjusted to take into account the
3055 	 * timestamp option if applicable.  Because SACK options do not
	 * appear in every TCP segment and are of variable length,
3057 	 * they cannot be included in tcp_mss.  Thus we need to calculate
3058 	 * the actual segment length when we need to send a segment which
3059 	 * includes SACK options.
3060 	 */
3061 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
3062 		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
3063 		    tcp->tcp_num_sack_blk);
3064 		sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
3065 		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
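		/*
		 * Each SACK block is a pair of 32-bit sequence numbers
		 * (8 bytes), so e.g. three blocks cost 3 * 8 + 2 NOPs +
		 * a 2-byte kind/length header = 28 bytes.
		 */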
3066 		if (max_to_send + sack_opt_len > tcp->tcp_mss)
3067 			max_to_send -= sack_opt_len;
3068 	}
3069 
3070 	if (offset != NULL) {
3071 		off = *offset;
3072 		/* We use offset as an indicator that end_mp is not NULL. */
3073 		*end_mp = NULL;
3074 	}
3075 	for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
3076 		/* This could be faster with cooperation from downstream */
3077 		if (mp2 != mp1 && !sendall &&
3078 		    data_length + (int)(mp->b_wptr - mp->b_rptr) >
3079 		    max_to_send)
3080 			/*
3081 			 * Don't send the next mblk since the whole mblk
3082 			 * does not fit.
3083 			 */
3084 			break;
3085 		mp2->b_cont = dupb(mp);
3086 		mp2 = mp2->b_cont;
3087 		if (!mp2) {
3088 			freemsg(mp1);
3089 			return (NULL);
3090 		}
3091 		mp2->b_rptr += off;
3092 		ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
3093 		    (uintptr_t)INT_MAX);
3094 
3095 		data_length += (int)(mp2->b_wptr - mp2->b_rptr);
3096 		if (data_length > max_to_send) {
3097 			mp2->b_wptr -= data_length - max_to_send;
3098 			data_length = max_to_send;
3099 			off = mp2->b_wptr - mp->b_rptr;
3100 			break;
3101 		} else {
3102 			off = 0;
3103 		}
3104 	}
3105 	if (offset != NULL) {
3106 		*offset = off;
3107 		*end_mp = mp;
3108 	}
3109 	if (seg_len != NULL) {
3110 		*seg_len = data_length;
3111 	}
3112 
3113 	/* Update the latest receive window size in TCP header. */
3114 	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
3115 
3116 	rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
3117 	mp1->b_rptr = rptr;
3118 	mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len;
3119 	bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
3120 	tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
3121 	tcpha->tha_seq = htonl(seq);
3122 
	/*
	 * Using tcp_unsent to determine whether the PUSH bit should be set
	 * assumes that this function was called from tcp_wput_data().  Thus,
	 * when called to retransmit data, the setting of the PUSH bit may
	 * appear somewhat random in that it might get set when it should
	 * not.  This should not pose any performance issues.
	 */
3130 	if (data_length != 0 && (tcp->tcp_unsent == 0 ||
3131 	    tcp->tcp_unsent == data_length)) {
3132 		flags = TH_ACK | TH_PUSH;
3133 	} else {
3134 		flags = TH_ACK;
3135 	}
3136 
3137 	if (tcp->tcp_ecn_ok) {
3138 		if (tcp->tcp_ecn_echo_on)
3139 			flags |= TH_ECE;
3140 
		/*
		 * Only set the ECT bit and ECN_CWR if a segment contains new
		 * data.  There is no TCP flow control for non-data segments,
		 * and only data segments are transmitted reliably.
		 */
3146 		if (data_length > 0 && !rexmit) {
3147 			TCP_SET_ECT(tcp, rptr);
3148 			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3149 				flags |= TH_CWR;
3150 				tcp->tcp_ecn_cwr_sent = B_TRUE;
3151 			}
3152 		}
3153 	}
3154 
3155 	/* Check if there is any special processing needs to be done. */
3156 	if (tcp->tcp_valid_bits) {
3157 		uint32_t u1;
3158 
3159 		/* We don't allow having SYN and FIN in the same segment... */
3160 		if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
3161 		    seq == tcp->tcp_iss) {
3162 			/* Need to do connection set up processing. */
3163 			tcp_xmit_mp_aux_iss(tcp, connp, tcpha, mp1, &flags);
3164 		} else if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3165 		    (seq + data_length) == tcp->tcp_fss) {
3166 			/* Need to do connection tear down processing. */
3167 			tcp_xmit_mp_aux_fss(tcp, ixa, &flags);
3168 		}
3169 
		/*
		 * Need to do urgent pointer processing.
		 *
		 * Note the trick here.  u1 is unsigned.  When tcp_urg
		 * is smaller than seq, u1 will become a very large value,
		 * so the comparison will fail.  Also note that tcp_urp
		 * should be positive; see RFC 793, page 17.
		 */
3178 		u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
3179 		if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
3180 		    u1 < (uint32_t)(64 * 1024)) {
3181 			flags |= TH_URG;
3182 			TCPS_BUMP_MIB(tcps, tcpOutUrg);
3183 			tcpha->tha_urp = htons(u1);
3184 		}
3185 	}
3186 	tcpha->tha_flags = (uchar_t)flags;
3187 	tcp->tcp_rack = tcp->tcp_rnxt;
3188 	tcp->tcp_rack_cnt = 0;
3189 
3190 	/* Fill in the current value of timestamps option. */
3191 	if (tcp->tcp_snd_ts_ok) {
3192 		if (tcp->tcp_state != TCPS_SYN_SENT) {
3193 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
3194 
3195 			U32_TO_BE32(llbolt,
3196 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
3197 			U32_TO_BE32(tcp->tcp_ts_recent,
3198 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
3199 		}
3200 	}
3201 
3202 	/* Fill in the SACK blocks. */
3203 	if (num_sack_blk > 0) {
3204 		uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
3205 		sack_blk_t *tmp;
3206 		int32_t	i;
3207 
3208 		wptr[0] = TCPOPT_NOP;
3209 		wptr[1] = TCPOPT_NOP;
3210 		wptr[2] = TCPOPT_SACK;
3211 		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3212 		    sizeof (sack_blk_t);
3213 		wptr += TCPOPT_REAL_SACK_LEN;
3214 
3215 		tmp = tcp->tcp_sack_list;
3216 		for (i = 0; i < num_sack_blk; i++) {
3217 			U32_TO_BE32(tmp[i].begin, wptr);
3218 			wptr += sizeof (tcp_seq);
3219 			U32_TO_BE32(tmp[i].end, wptr);
3220 			wptr += sizeof (tcp_seq);
3221 		}
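		/*
		 * Grow the data offset by one word for the NOP/NOP/
		 * kind/length header plus two words per SACK block.
		 */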
3222 		tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4);
3223 	}
3224 	ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
3225 	data_length += (int)(mp1->b_wptr - rptr);
3226 
3227 	ixa->ixa_pktlen = data_length;
3228 
3229 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3230 		((ipha_t *)rptr)->ipha_length = htons(data_length);
3231 	} else {
3232 		ip6_t *ip6 = (ip6_t *)rptr;
3233 
3234 		ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN);
3235 	}
3236 
3237 	/*
3238 	 * Prime pump for IP
3239 	 * Include the adjustment for a source route if any.
3240 	 */
3241 	data_length -= ixa->ixa_ip_hdr_length;
3242 	data_length += connp->conn_sum;
3243 	data_length = (data_length >> 16) + (data_length & 0xFFFF);
3244 	tcpha->tha_sum = htons(data_length);
3245 	if (tcp->tcp_ip_forward_progress) {
3246 		tcp->tcp_ip_forward_progress = B_FALSE;
3247 		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
3248 	} else {
3249 		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
3250 	}
3251 	return (mp1);
3252 }
3253 
3254 /*
3255  * If this routine returns B_TRUE, TCP can generate a RST in response
3256  * to a segment.  If it returns B_FALSE, TCP should not respond.
3257  */
3258 static boolean_t
3259 tcp_send_rst_chk(tcp_stack_t *tcps)
3260 {
3261 	int64_t	now;
3262 
3263 	/*
3264 	 * TCP needs to protect itself from generating too many RSTs.
3265 	 * This can be a DoS attack by sending us random segments
3266 	 * soliciting RSTs.
3267 	 *
3268 	 * What we do here is to have a limit of tcp_rst_sent_rate RSTs
	 * in each 1-second interval.  In this way, TCP still generates
	 * RSTs in normal cases, but when under attack the impact is
	 * limited.
3272 	 */
3273 	if (tcps->tcps_rst_sent_rate_enabled != 0) {
3274 		now = ddi_get_lbolt64();
3275 		if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) >
3276 		    1*SECONDS) {
3277 			tcps->tcps_last_rst_intrvl = now;
3278 			tcps->tcps_rst_cnt = 1;
3279 		} else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) {
3280 			return (B_FALSE);
3281 		}
3282 	}
3283 	return (B_TRUE);
3284 }
3285 
3286 /*
3287  * This function handles all retransmissions if SACK is enabled for this
3288  * connection.  First it calculates how many segments can be retransmitted
 * based on tcp_pipe.  Then it goes through the notsack list to find eligible
 * segments.  A segment is eligible if sack_cnt for that segment is greater
 * than or equal to tcp_dupack_fast_retransmit.  After it has retransmitted
3292  * all eligible segments, it checks to see if TCP can send some new segments
3293  * (fast recovery).  If it can, set the appropriate flag for tcp_input_data().
3294  *
3295  * Parameters:
3296  *	tcp_t *tcp: the tcp structure of the connection.
3297  *	uint_t *flags: in return, appropriate value will be set for
3298  *	tcp_input_data().
3299  */
3300 void
3301 tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
3302 {
3303 	notsack_blk_t	*notsack_blk;
3304 	int32_t		usable_swnd;
3305 	int32_t		mss;
3306 	uint32_t	seg_len;
3307 	mblk_t		*xmit_mp;
3308 	tcp_stack_t	*tcps = tcp->tcp_tcps;
3309 
3310 	ASSERT(tcp->tcp_notsack_list != NULL);
3311 	ASSERT(tcp->tcp_rexmit == B_FALSE);
3312 
3313 	/* Defensive coding in case there is a bug... */
3314 	if (tcp->tcp_notsack_list == NULL) {
3315 		return;
3316 	}
3317 	notsack_blk = tcp->tcp_notsack_list;
3318 	mss = tcp->tcp_mss;
3319 
	/*
	 * Limit the amount of outstanding data in the network to
	 * tcp_cwnd_ssthresh, which is half of the original congestion window.
	 */
3324 	usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3325 
3326 	/* At least retransmit 1 MSS of data. */
3327 	if (usable_swnd <= 0) {
3328 		usable_swnd = mss;
3329 	}
3330 
3331 	/* Make sure no new RTT samples will be taken. */
3332 	tcp->tcp_csuna = tcp->tcp_snxt;
3333 
3334 	notsack_blk = tcp->tcp_notsack_list;
3335 	while (usable_swnd > 0) {
3336 		mblk_t		*snxt_mp, *tmp_mp;
3337 		tcp_seq		begin = tcp->tcp_sack_snxt;
3338 		tcp_seq		end;
3339 		int32_t		off;
3340 
3341 		for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
3342 			if (SEQ_GT(notsack_blk->end, begin) &&
3343 			    (notsack_blk->sack_cnt >=
3344 			    tcps->tcps_dupack_fast_retransmit)) {
3345 				end = notsack_blk->end;
3346 				if (SEQ_LT(begin, notsack_blk->begin)) {
3347 					begin = notsack_blk->begin;
3348 				}
3349 				break;
3350 			}
3351 		}
3352 		/*
3353 		 * All holes are filled.  Manipulate tcp_cwnd to send more
3354 		 * if we can.  Note that after the SACK recovery, tcp_cwnd is
3355 		 * set to tcp_cwnd_ssthresh.
3356 		 */
3357 		if (notsack_blk == NULL) {
3358 			usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3359 			if (usable_swnd <= 0 || tcp->tcp_unsent == 0) {
3360 				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
3361 				ASSERT(tcp->tcp_cwnd > 0);
3362 				return;
3363 			} else {
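				/*
				 * Round usable_swnd down to a whole number
				 * of segments and open tcp_cwnd just enough
				 * beyond the outstanding data to send at
				 * least one MSS of new data.
				 */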
3364 				usable_swnd = usable_swnd / mss;
3365 				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
3366 				    MAX(usable_swnd * mss, mss);
3367 				*flags |= TH_XMIT_NEEDED;
3368 				return;
3369 			}
3370 		}
3371 
3372 		/*
3373 		 * Note that we may send more than usable_swnd allows here
3374 		 * because of round off, but no more than 1 MSS of data.
3375 		 */
3376 		seg_len = end - begin;
3377 		if (seg_len > mss)
3378 			seg_len = mss;
3379 		snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
3380 		ASSERT(snxt_mp != NULL);
3381 		/* This should not happen.  Defensive coding again... */
3382 		if (snxt_mp == NULL) {
3383 			return;
3384 		}
3385 
3386 		xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3387 		    &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3388 		if (xmit_mp == NULL)
3389 			return;
3390 
3391 		usable_swnd -= seg_len;
3392 		tcp->tcp_pipe += seg_len;
3393 		tcp->tcp_sack_snxt = begin + seg_len;
3394 
3395 		tcp_send_data(tcp, xmit_mp);
3396 
3397 		/*
3398 		 * Update the send timestamp to avoid false retransmission.
3399 		 */
3400 		snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3401 
3402 		TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3403 		TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3404 		TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3405 		tcp->tcp_cs.tcp_out_retrans_segs++;
3406 		tcp->tcp_cs.tcp_out_retrans_bytes += seg_len;
		/*
		 * Update tcp_rexmit_max to extend this SACK recovery phase.
		 * This happens when new data sent during fast recovery is
		 * also lost.  If TCP retransmits that new data, it needs
		 * to extend the SACK recovery phase to avoid starting another
		 * fast retransmit/recovery unnecessarily.
		 */
3414 		if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3415 			tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3416 		}
3417 	}
3418 }
3419 
3420 /*
3421  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3422  * or ICMP errors.
3423  */
3424 void
3425 tcp_ss_rexmit(tcp_t *tcp)
3426 {
3427 	uint32_t	snxt;
3428 	uint32_t	smax;
3429 	int32_t		win;
3430 	int32_t		mss;
3431 	int32_t		off;
3432 	mblk_t		*snxt_mp;
3433 	tcp_stack_t	*tcps = tcp->tcp_tcps;
3434 
3435 	/*
3436 	 * Note that tcp_rexmit can be set even though TCP has retransmitted
3437 	 * all unack'ed segments.
3438 	 */
3439 	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3440 		smax = tcp->tcp_rexmit_max;
3441 		snxt = tcp->tcp_rexmit_nxt;
3442 		if (SEQ_LT(snxt, tcp->tcp_suna)) {
3443 			snxt = tcp->tcp_suna;
3444 		}
3445 		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3446 		win -= snxt - tcp->tcp_suna;
3447 		mss = tcp->tcp_mss;
3448 		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3449 
3450 		while (SEQ_LT(snxt, smax) && (win > 0) && (snxt_mp != NULL)) {
3451 			mblk_t	*xmit_mp;
3452 			mblk_t	*old_snxt_mp = snxt_mp;
3453 			uint32_t cnt = mss;
3454 
3455 			if (win < cnt) {
3456 				cnt = win;
3457 			}
3458 			if (SEQ_GT(snxt + cnt, smax)) {
3459 				cnt = smax - snxt;
3460 			}
3461 			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3462 			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3463 			if (xmit_mp == NULL)
3464 				return;
3465 
3466 			tcp_send_data(tcp, xmit_mp);
3467 
3468 			snxt += cnt;
3469 			win -= cnt;
3470 			/*
3471 			 * Update the send timestamp to avoid false
3472 			 * retransmission.
3473 			 */
3474 			old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
3475 			TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3476 			TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3477 			tcp->tcp_cs.tcp_out_retrans_segs++;
3478 			tcp->tcp_cs.tcp_out_retrans_bytes += cnt;
3479 
3480 			tcp->tcp_rexmit_nxt = snxt;
3481 		}
		/*
		 * If we have transmitted all we had at the time
		 * we started the retransmission, we can leave
		 * the rest of the job to tcp_wput_data().  But we
		 * need to check the send window first.  If the
		 * win is not 0, go on with tcp_wput_data().
		 */
3489 		if (SEQ_LT(snxt, smax) || win == 0) {
3490 			return;
3491 		}
3492 	}
3493 	/* Only call tcp_wput_data() if there is data to be sent. */
3494 	if (tcp->tcp_unsent) {
3495 		tcp_wput_data(tcp, NULL, B_FALSE);
3496 	}
3497 }
3498 
3499 /*
 * Do slow start retransmission after ICMP errors or PMTU changes.
3501  */
3502 void
3503 tcp_rexmit_after_error(tcp_t *tcp)
3504 {
	/*
	 * If all sent data has been acknowledged or there is no data left
	 * to send, just return.
	 */
3509 	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3510 	    (tcp->tcp_xmit_head == NULL))
3511 		return;
3512 
3513 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3514 		tcp->tcp_rexmit_max = tcp->tcp_fss;
3515 	else
3516 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
3517 
3518 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3519 	tcp->tcp_rexmit = B_TRUE;
3520 	tcp->tcp_dupack_cnt = 0;
3521 	tcp_ss_rexmit(tcp);
3522 }
3523 
3524 /*
3525  * tcp_get_seg_mp() is called to get the pointer to a segment in the
3526  * send queue which starts at the given sequence number. If the given
 * sequence number is equal to the last valid sequence number (tcp_snxt), the
3528  * returned mblk is the last valid mblk, and off is set to the length of
3529  * that mblk.
3532  *
3533  * Parameters:
3534  *	tcp_t *tcp: the tcp instance pointer.
3535  *	uint32_t seq: the starting seq. no of the requested segment.
 *	int32_t *off: after execution, *off will be the offset within
 *		the returned mblk of the requested seq. no.  It is the
 *		caller's responsibility to pass in a non-NULL off.
3539  *
3540  * Return:
3541  *	A mblk_t pointer pointing to the requested segment in send queue.
3542  */
3543 static mblk_t *
3544 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
3545 {
3546 	int32_t	cnt;
3547 	mblk_t	*mp;
3548 
3549 	/* Defensive coding.  Make sure we don't send incorrect data. */
3550 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt))
3551 		return (NULL);
3552 
3553 	cnt = seq - tcp->tcp_suna;
3554 	mp = tcp->tcp_xmit_head;
3555 	while (cnt > 0 && mp != NULL) {
3556 		cnt -= mp->b_wptr - mp->b_rptr;
3557 		if (cnt <= 0) {
3558 			cnt += mp->b_wptr - mp->b_rptr;
3559 			break;
3560 		}
3561 		mp = mp->b_cont;
3562 	}
3563 	ASSERT(mp != NULL);
3564 	*off = cnt;
3565 	return (mp);
3566 }
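
/*
 * Usage sketch (illustrative only; this mirrors the calls made by
 * tcp_ss_rexmit() above and tcp_update_xmit_tail() below):
 *
 *	int32_t	off;
 *	mblk_t	*mp = tcp_get_seg_mp(tcp, seq, &off);
 *
 * On success, sequence number seq is found at mp->b_rptr + off, and the
 * (mp, off) pair can be handed to tcp_xmit_mp() to carve out a segment
 * starting there.  A NULL return means seq fell outside the range
 * [tcp_suna, tcp_snxt] and must not be transmitted.
 */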
3567 
/*
 * This routine adjusts the next-to-send sequence number variables in the
 * case where the receiver has shrunk its window.
 */
3572 void
3573 tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
3574 {
3575 	mblk_t *xmit_tail;
3576 	int32_t offset;
3577 
3578 	tcp->tcp_snxt = snxt;
3579 
3580 	/* Get the mblk, and the offset in it, as per the shrunk window */
3581 	xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
3582 	ASSERT(xmit_tail != NULL);
3583 	tcp->tcp_xmit_tail = xmit_tail;
3584 	tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
3585 	    xmit_tail->b_rptr - offset;
3586 }
3587 
/*
 * This handles the case when the receiver has shrunk its window.  Per
 * RFC 1122, if the receiver shrinks the window, i.e. moves the right
 * window edge to the left, we should not send new data, but should
 * retransmit normally the old unacked data between suna and suna + swnd.
 * We might have sent data that is now outside the new window; pretend
 * that we didn't send it.
 */
3595 static void
3596 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
3597 {
3598 	uint32_t	snxt = tcp->tcp_snxt;
3599 
3600 	ASSERT(shrunk_count > 0);
3601 
3602 	if (!tcp->tcp_is_wnd_shrnk) {
3603 		tcp->tcp_snxt_shrunk = snxt;
3604 		tcp->tcp_is_wnd_shrnk = B_TRUE;
3605 	} else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) {
3606 		tcp->tcp_snxt_shrunk = snxt;
3607 	}
3608 
3609 	/* Pretend we didn't send the data outside the window */
3610 	snxt -= shrunk_count;
3611 
3612 	/* Reset all the values per the now shrunk window */
3613 	tcp_update_xmit_tail(tcp, snxt);
3614 	tcp->tcp_unsent += shrunk_count;
3615 
3616 	/*
3617 	 * If the SACK option is set, delete the entire list of
3618 	 * notsack'ed blocks.
3619 	 */
3620 	TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3621 
3622 	if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3623 		/*
3624 		 * Make sure the timer is running so that we will probe a zero
3625 		 * window.
3626 		 */
3627 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3628 }
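
/*
 * Worked example (illustrative numbers only): suppose tcp_snxt = 5000
 * and the receiver's new right window edge is 4000, so shrunk_count =
 * 1000.  tcp_process_shrunk_swnd() then does, in effect:
 *
 *	snxt = 5000 - 1000 = 4000	(pretend bytes 4000..4999 were
 *					 never sent)
 *	tcp_update_xmit_tail(tcp, 4000)	(tcp_xmit_tail now points at the
 *					 mblk containing byte 4000)
 *	tcp_unsent += 1000		(those bytes count as unsent again
 *					 and will go out via tcp_wput_data()
 *					 as the window reopens)
 */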
3629 
3630 /*
3631  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3632  * with the template header, as well as other options such as time-stamp,
3633  * ECN and/or SACK.
3634  */
3635 static void
3636 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
3637 {
3638 	tcpha_t *tcp_tmpl, *tcpha;
3639 	uint32_t *dst, *src;
3640 	int hdrlen;
3641 	conn_t *connp = tcp->tcp_connp;
3642 
3643 	ASSERT(OK_32PTR(rptr));
3644 
3645 	/* Template header */
3646 	tcp_tmpl = tcp->tcp_tcpha;
3647 
3648 	/* Header of outgoing packet */
3649 	tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3650 
3651 	/* dst and src are opaque 32-bit fields, used for copying */
3652 	dst = (uint32_t *)rptr;
3653 	src = (uint32_t *)connp->conn_ht_iphc;
3654 	hdrlen = connp->conn_ht_iphc_len;
3655 
3656 	/* Fill time-stamp option if needed */
3657 	if (tcp->tcp_snd_ts_ok) {
3658 		U32_TO_BE32(LBOLT_FASTPATH,
3659 		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3660 		U32_TO_BE32(tcp->tcp_ts_recent,
3661 		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3662 	} else {
3663 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3664 	}
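
	/*
	 * Layout note (assuming the usual template format): with
	 * timestamps negotiated, the template holds NOP, NOP, kind=8,
	 * len=10 starting at TCP_MIN_HEADER_LENGTH, so TSval sits at
	 * offset +4 and TSecr at offset +8, as written above.
	 */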
3665 
3666 	/*
3667 	 * Copy the template header; is this really more efficient than
3668 	 * calling bcopy()?  For simple IPv4/TCP, it may be the case,
3669 	 * but perhaps not for other scenarios.
3670 	 */
3671 	dst[0] = src[0];
3672 	dst[1] = src[1];
3673 	dst[2] = src[2];
3674 	dst[3] = src[3];
3675 	dst[4] = src[4];
3676 	dst[5] = src[5];
3677 	dst[6] = src[6];
3678 	dst[7] = src[7];
3679 	dst[8] = src[8];
3680 	dst[9] = src[9];
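
	/*
	 * The ten words copied above cover the 40 bytes of a minimal
	 * IPv4 + TCP header; any remainder (IPv6 and/or TCP options in
	 * the template) is copied a word at a time below.
	 */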
3681 	if (hdrlen -= 40) {
3682 		hdrlen >>= 2;
3683 		dst += 10;
3684 		src += 10;
3685 		do {
3686 			*dst++ = *src++;
3687 		} while (--hdrlen);
3688 	}
3689 
3690 	/*
3691 	 * Set the ECN info in the TCP header if it is not a zero
3692 	 * window probe.  Zero window probe is only sent in
3693 	 * tcp_wput_data() and tcp_timer().
3694 	 */
3695 	if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
3696 		TCP_SET_ECT(tcp, rptr);
3697 
3698 		if (tcp->tcp_ecn_echo_on)
3699 			tcpha->tha_flags |= TH_ECE;
3700 		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3701 			tcpha->tha_flags |= TH_CWR;
3702 			tcp->tcp_ecn_cwr_sent = B_TRUE;
3703 		}
3704 	}
3705 
3706 	/* Fill in SACK options */
3707 	if (num_sack_blk > 0) {
3708 		uchar_t *wptr = rptr + connp->conn_ht_iphc_len;
3709 		sack_blk_t *tmp;
3710 		int32_t	i;
3711 
3712 		wptr[0] = TCPOPT_NOP;
3713 		wptr[1] = TCPOPT_NOP;
3714 		wptr[2] = TCPOPT_SACK;
3715 		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3716 		    sizeof (sack_blk_t);
3717 		wptr += TCPOPT_REAL_SACK_LEN;
3718 
3719 		tmp = tcp->tcp_sack_list;
3720 		for (i = 0; i < num_sack_blk; i++) {
3721 			U32_TO_BE32(tmp[i].begin, wptr);
3722 			wptr += sizeof (tcp_seq);
3723 			U32_TO_BE32(tmp[i].end, wptr);
3724 			wptr += sizeof (tcp_seq);
3725 		}
3726 		tcpha->tha_offset_and_reserved +=
3727 		    ((num_sack_blk * 2 + 1) << 4);
3728 	}
3729 }
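
/*
 * For reference, the SACK option as laid out above (RFC 2018), with
 * n = num_sack_blk blocks:
 *
 *	+--------+--------+--------+--------+
 *	|  NOP   |  NOP   | kind=5 |  len   |	len = 2 + 8 * n
 *	+--------+--------+--------+--------+
 *	|          begin of block 1         |
 *	+-----------------------------------+
 *	|           end of block 1          |
 *	+-----------------------------------+
 *	|                ...                |
 *
 * Each block adds two 32-bit sequence numbers (8 bytes), and the TCP
 * data offset is bumped by (2 * n + 1) words to cover the option.
 */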
3730