xref: /openbsd/sys/netinet/tcp_input.c (revision 133306f0)
1 /*	$OpenBSD: tcp_input.c,v 1.81 2000/12/13 09:47:08 provos Exp $	*/
2 /*	$NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)tcp_input.c	8.5 (Berkeley) 4/10/94
37  */
38 
39 /*
40 %%% portions-copyright-nrl-95
41 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
43 Reserved. All rights under this copyright have been assigned to the US
44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
46 software.
47 You should have received a copy of the license with this software. If you
48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>.
49 */
50 
51 #ifndef TUBA_INCLUDE
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/malloc.h>
55 #include <sys/mbuf.h>
56 #include <sys/protosw.h>
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
59 #include <sys/errno.h>
60 #include <sys/domain.h>
61 
62 #include <net/if.h>
63 #include <net/route.h>
64 
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/ip.h>
68 #include <netinet/in_pcb.h>
69 #include <netinet/ip_var.h>
70 #include <netinet/tcp.h>
71 #include <netinet/tcp_fsm.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_timer.h>
74 #include <netinet/tcp_var.h>
75 #include <netinet/tcpip.h>
76 #include <netinet/tcp_debug.h>
77 #include <dev/rndvar.h>
78 #include <machine/stdarg.h>
79 #include <sys/md5k.h>
80 
81 #ifdef IPSEC
82 #include <netinet/ip_ipsp.h>
83 #endif /* IPSEC */
84 
85 #define PI_MAGIC 0xdeadbeef  /* XXX the horror! */
86 
87 #ifdef INET6
88 #include <netinet6/in6_var.h>
89 #include <netinet/ip6.h>
90 #include <netinet6/ip6_var.h>
91 #include <netinet6/tcpipv6.h>
92 #include <netinet/icmp6.h>
93 #include <netinet6/nd6.h>
94 
96 struct	tcpipv6hdr tcp_saveti6;
97 
98 /* for the packet header length in the mbuf */
99 #define M_PH_LEN(m)      (((struct mbuf *)(m))->m_pkthdr.len)
100 #define M_V6_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip6_hdr))
101 #define M_V4_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip))
102 #endif /* INET6 */
103 
104 int	tcprexmtthresh = 3;
105 struct	tcpiphdr tcp_saveti;
106 int	tcptv_keep_init = TCPTV_KEEP_INIT;
107 
108 extern u_long sb_max;
109 
110 int tcp_rst_ppslim = 100;		/* 100pps */
111 int tcp_rst_ppslim_count = 0;
112 struct timeval tcp_rst_ppslim_last;
113 
114 #endif /* TUBA_INCLUDE */
115 #define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)
116 
117 /* for modulo comparisons of timestamps */
118 #define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
119 #define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
120 
121 /*
122  * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
123  */
124 #ifdef INET6
125 #define ND6_HINT(tp) \
126 do { \
127 	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
128 	    tp->t_inpcb->inp_route6.ro_rt) { \
129 		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
130 	} \
131 } while (0)
132 #else
133 #define ND6_HINT(tp)
134 #endif
135 
136 /*
137  * Insert segment th into reassembly queue of tcp with
138  * control block tp.  Return TH_FIN if reassembly now includes
139  * a segment with FIN.  The macro form does the common case inline
140  * (segment is the next to be received on an established connection,
141  * and the queue is empty), avoiding linkage into and removal
142  * from the queue and repetition of various conversions.
143  * Set DELACK for segments received in order, but ack immediately
144  * when segments are out of order (so fast retransmit can work).
145  */
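/*
 * A sketch of the trimming below, with made-up sequence numbers:
 * if the queue already holds a segment covering [100, 200) and a new
 * segment arrives covering [150, 250), the preceding-segment check
 * computes i = 100 + 100 - 150 = 50, so 50 duplicate bytes are cut
 * from the front (m_adj(m, 50)), th_seq becomes 200, and [200, 250)
 * is linked into the queue.  A segment lying entirely inside
 * [100, 200) would instead be counted as a duplicate and freed.
 */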
146 
147 #ifndef TUBA_INCLUDE
148 
149 int
150 tcp_reass(tp, th, m, tlen)
151 	register struct tcpcb *tp;
152 	register struct tcphdr *th;
153 	struct mbuf *m;
154 	int *tlen;
155 {
156 	register struct ipqent *p, *q, *nq, *tiqe;
157 	struct socket *so = tp->t_inpcb->inp_socket;
158 	int flags;
159 
160 	/*
161 	 * Call with th==0 after becoming established to
162 	 * force pre-ESTABLISHED data up to user socket.
163 	 */
164 	if (th == 0)
165 		goto present;
166 
167 	/*
168 	 * Allocate a new queue entry, before we throw away any data.
169 	 * If we can't, just drop the packet.  XXX
170 	 */
171 	MALLOC(tiqe, struct ipqent *, sizeof(struct ipqent), M_IPQ, M_NOWAIT);
172 	if (tiqe == NULL) {
173 		tcpstat.tcps_rcvmemdrop++;
174 		m_freem(m);
175 		return (0);
176 	}
177 
178 	/*
179 	 * Find a segment which begins after this one does.
180 	 */
181 	for (p = NULL, q = tp->segq.lh_first; q != NULL;
182 	    p = q, q = q->ipqe_q.le_next)
183 		if (SEQ_GT(q->ipqe_tcp->th_seq, th->th_seq))
184 			break;
185 
186 	/*
187 	 * If there is a preceding segment, it may provide some of
188 	 * our data already.  If so, drop the data from the incoming
189 	 * segment.  If it provides all of our data, drop us.
190 	 */
191 	if (p != NULL) {
192 		register struct tcphdr *phdr = p->ipqe_tcp;
193 		register int i;
194 
195 		/* conversion to int (in i) handles seq wraparound */
196 		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
197 		if (i > 0) {
198 			if (i >= *tlen) {
199 				tcpstat.tcps_rcvduppack++;
200 				tcpstat.tcps_rcvdupbyte += *tlen;
201 				m_freem(m);
202 				FREE(tiqe, M_IPQ);
203 				return (0);
204 			}
205 			m_adj(m, i);
206 			*tlen -= i;
207 			th->th_seq += i;
208 		}
209 	}
210 	tcpstat.tcps_rcvoopack++;
211 	tcpstat.tcps_rcvoobyte += *tlen;
212 
213 	/*
214 	 * While we overlap succeeding segments trim them or,
215 	 * if they are completely covered, dequeue them.
216 	 */
217 	for (; q != NULL; q = nq) {
218 		register struct tcphdr *qhdr = q->ipqe_tcp;
219 		register int i = (th->th_seq + *tlen) - qhdr->th_seq;
220 
221 		if (i <= 0)
222 			break;
223 		if (i < qhdr->th_reseqlen) {
224 			qhdr->th_seq += i;
225 			qhdr->th_reseqlen -= i;
226 			m_adj(q->ipqe_m, i);
227 			break;
228 		}
229 		nq = q->ipqe_q.le_next;
230 		m_freem(q->ipqe_m);
231 		LIST_REMOVE(q, ipqe_q);
232 		FREE(q, M_IPQ);
233 	}
234 
235 	/* Insert the new fragment queue entry into place. */
236 	tiqe->ipqe_m = m;
237 	th->th_reseqlen = *tlen;
238 	tiqe->ipqe_tcp = th;
239 	if (p == NULL) {
240 		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
241 	} else {
242 		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
243 	}
244 
245 present:
246 	/*
247 	 * Present data to user, advancing rcv_nxt through
248 	 * completed sequence space.
249 	 */
250 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
251 		return (0);
252 	q = tp->segq.lh_first;
253 	if (q == NULL || q->ipqe_tcp->th_seq != tp->rcv_nxt)
254 		return (0);
255 	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->th_reseqlen)
256 		return (0);
257 	do {
258 		tp->rcv_nxt += q->ipqe_tcp->th_reseqlen;
259 		flags = q->ipqe_tcp->th_flags & TH_FIN;
260 
261 		nq = q->ipqe_q.le_next;
262 		LIST_REMOVE(q, ipqe_q);
263 		ND6_HINT(tp);
264 		if (so->so_state & SS_CANTRCVMORE)
265 			m_freem(q->ipqe_m);
266 		else
267 			sbappend(&so->so_rcv, q->ipqe_m);
268 		FREE(q, M_IPQ);
269 		q = nq;
270 	} while (q != NULL && q->ipqe_tcp->th_seq == tp->rcv_nxt);
271 	sorwakeup(so);
272 	return (flags);
273 }
274 
275 /*
276  * First check for a port-specific bomb.  We do not want to drop half-opens
277  * for other ports if this is the only port being bombed.  We only check
278  * the bottom 40 half-open connections, to avoid wasting too much time.
279  *
280  * Otherwise, it is more likely a generic SYN bomb, so delete the oldest
281  * half-open connection.
282  */
283 void
284 tcpdropoldhalfopen(avoidtp, port)
285 	struct tcpcb *avoidtp;
286 	u_int16_t port;
287 {
288 	register struct inpcb *inp;
289 	register struct tcpcb *tp;
290 	int ncheck = 40;
291 	int s;
292 
293 	s = splnet();
294 	inp = tcbtable.inpt_queue.cqh_first;
295 	if (inp)						/* XXX */
296 	for (; inp != (struct inpcb *)&tcbtable.inpt_queue && --ncheck;
297 	    inp = inp->inp_queue.cqe_prev) {
298 		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
299 		    tp != avoidtp &&
300 		    tp->t_state == TCPS_SYN_RECEIVED &&
301 		    port == inp->inp_lport) {
302 			tcp_close(tp);
303 			goto done;
304 		}
305 	}
306 
307 	inp = tcbtable.inpt_queue.cqh_first;
308 	if (inp)						/* XXX */
309 	for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
310 	    inp = inp->inp_queue.cqe_prev) {
311 		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
312 		    tp != avoidtp &&
313 		    tp->t_state == TCPS_SYN_RECEIVED) {
314 			tcp_close(tp);
315 			goto done;
316 		}
317 	}
318 done:
319 	splx(s);
320 }
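/*
 * Note: tcpdropoldhalfopen() is invoked from tcp_input() below when
 * sonewconn() fails on a listening socket; one half-open connection
 * is reclaimed and sonewconn() is retried exactly once (see the
 * SO_ACCEPTCONN handling in tcp_input()).
 */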
321 
322 #ifdef INET6
323 int
324 tcp6_input(mp, offp, proto)
325 	struct mbuf **mp;
326 	int *offp, proto;
327 {
328 	struct mbuf *m = *mp;
329 
330 #if defined(NFAITH) && 0 < NFAITH
331 	if (m->m_pkthdr.rcvif) {
332 		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
333 			/* XXX send icmp6 host/port unreach? */
334 			m_freem(m);
335 			return IPPROTO_DONE;
336 		}
337 	}
338 #endif
339 
340 	/*
341 	 * draft-itojun-ipv6-tcp-to-anycast
342 	 * is there a better place to put this?
343 	 */
344 	if (m->m_flags & M_ANYCAST6) {
345 		if (m->m_len >= sizeof(struct ip6_hdr)) {
346 			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
347 			icmp6_error(m, ICMP6_DST_UNREACH,
348 				ICMP6_DST_UNREACH_ADDR,
349 				(caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
350 		} else
351 			m_freem(m);
352 		return IPPROTO_DONE;
353 	}
354 
355 	tcp_input(m, *offp, proto);
356 	return IPPROTO_DONE;
357 }
358 #endif
359 
360 /*
361  * TCP input routine, follows pages 65-76 of the
362  * protocol specification dated September, 1981 very closely.
363  */
364 void
365 #if __STDC__
366 tcp_input(struct mbuf *m, ...)
367 #else
368 tcp_input(m, va_alist)
369 	register struct mbuf *m;
370 #endif
371 {
372 	struct ip *ip;
373 	register struct inpcb *inp;
374 	caddr_t optp = NULL;
375 	int optlen = 0;
376 	int len, tlen, off;
377 	register struct tcpcb *tp = 0;
378 	register int tiflags;
379 	struct socket *so = NULL;
380 	int todrop, acked, ourfinisacked, needoutput = 0;
381 	int hdroptlen = 0;
382 	short ostate = 0;
383 	struct in_addr laddr;
384 	int dropsocket = 0;
385 	int iss = 0;
386 	u_long tiwin;
387 	u_int32_t ts_val, ts_ecr;
388 	int ts_present = 0;
389 	int iphlen;
390 	va_list ap;
391 	register struct tcphdr *th;
392 #ifdef INET6
393 	struct in6_addr laddr6;
394 	struct ip6_hdr *ipv6 = NULL;
395 #endif /* INET6 */
396 #ifdef IPSEC
397 	struct tdb_ident *tdbi;
398 	struct tdb *tdb;
399 	int error, s;
400 #endif /* IPSEC */
401 	int af;
402 
403 #ifdef IPSEC
404 	tdbi = (struct tdb_ident *) m->m_pkthdr.tdbi;
405 	if (tdbi == (void *) PI_MAGIC)
406 		tdbi = NULL;
407 #endif /* IPSEC */
408 
409 	va_start(ap, m);
410 	iphlen = va_arg(ap, int);
411 	va_end(ap);
412 
413 	tcpstat.tcps_rcvtotal++;
414 
415 	/*
416 	 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
417 	 * TCP/IPv4.
418 	 */
419 	switch (mtod(m, struct ip *)->ip_v) {
420 #ifdef INET6
421 	case 6:
422 		af = AF_INET6;
423 		break;
424 #endif
425 	case 4:
426 		af = AF_INET;
427 		break;
428 	default:
429 #ifdef IPSEC
430 		if (tdbi)
431 			free(tdbi, M_TEMP);
432 #endif /* IPSEC */
433 		m_freem(m);
434 		return;	/*EAFNOSUPPORT*/
435 	}
436 
437 	/*
438 	 * Get IP and TCP header together in first mbuf.
439 	 * Note: IP leaves IP header in first mbuf.
440 	 */
441 	switch (af) {
442 	case AF_INET:
443 #ifdef DIAGNOSTIC
444 		if (iphlen < sizeof(struct ip)) {
445 #ifdef IPSEC
446 			if (tdbi)
447 				free(tdbi, M_TEMP);
448 #endif /* IPSEC */
449 			m_freem(m);
450 			return;
451 		}
452 #endif /* DIAGNOSTIC */
453 		if (iphlen > sizeof(struct ip)) {
454 #if 0	/*XXX*/
455 			ip_stripoptions(m, (struct mbuf *)0);
456 			iphlen = sizeof(struct ip);
457 #else
458 			printf("IPv4 options are not allowed\n");
459 #ifdef IPSEC
460 			if (tdbi)
461 				free(tdbi, M_TEMP);
462 #endif /* IPSEC */
463 			m_freem(m);
464 			return;
465 #endif
466 		}
467 		break;
468 #ifdef INET6
469 	case AF_INET6:
470 #ifdef DIAGNOSTIC
471 		if (iphlen < sizeof(struct ip6_hdr)) {
472 			m_freem(m);
473 #ifdef IPSEC
474 			if (tdbi)
475 				free(tdbi, M_TEMP);
476 #endif /* IPSEC */
477 			return;
478 		}
479 #endif /* DIAGNOSTIC */
480 		if (iphlen > sizeof(struct ip6_hdr)) {
481 #if 0 /*XXX*/
482 			ipv6_stripoptions(m, iphlen);
483 			iphlen = sizeof(struct ip6_hdr);
484 #else
485 			printf("extension headers are not allowed\n");
486 #ifdef IPSEC
487 			if (tdbi)
488 				free(tdbi, M_TEMP);
489 #endif /* IPSEC */
490 			m_freem(m);
491 			return;
492 #endif
493 		}
494 		break;
495 #endif
496 	default:
497 #ifdef IPSEC
498 		if (tdbi)
499 			free(tdbi, M_TEMP);
500 #endif /* IPSEC */
501 		m_freem(m);
502 		return;
503 	}
504 
505 	if (m->m_len < iphlen + sizeof(struct tcphdr)) {
506 		m = m_pullup2(m, iphlen + sizeof(struct tcphdr));
507 		if (m == 0) {
508 			tcpstat.tcps_rcvshort++;
509 #ifdef IPSEC
510 			if (tdbi)
511 				free(tdbi, M_TEMP);
512 #endif /* IPSEC */
513 			return;
514 		}
515 	}
516 
517 	ip = NULL;
518 #ifdef INET6
519 	ipv6 = NULL;
520 #endif
521 	switch (af) {
522 	case AF_INET:
523 	    {
524 		struct tcpiphdr *ti;
525 
526 		ip = mtod(m, struct ip *);
527 #if 1
528 		tlen = m->m_pkthdr.len - iphlen;
529 #else
530 		tlen = ((struct ip *)ti)->ip_len;
531 #endif
532 		ti = mtod(m, struct tcpiphdr *);
533 
534 		/*
535 		 * Checksum extended TCP header and data.
536 		 */
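		/*
		 * The tcpiphdr overlay is rewritten in place to form the
		 * pseudo-header: ti_x1 is zeroed and ti_len set to the TCP
		 * length, so a single in_cksum() pass over pseudo-header,
		 * TCP header and data must yield 0 for a valid checksum.
		 */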
537 		len = sizeof(struct ip) + tlen;
538 		bzero(ti->ti_x1, sizeof ti->ti_x1);
539 		ti->ti_len = (u_int16_t)tlen;
540 		HTONS(ti->ti_len);
541 		if ((ti->ti_sum = in_cksum(m, len)) != 0) {
542 			tcpstat.tcps_rcvbadsum++;
543 			goto drop;
544 		}
545 		break;
546 	    }
547 #ifdef INET6
548 	case AF_INET6:
549 		ipv6 = mtod(m, struct ip6_hdr *);
550 		tlen = m->m_pkthdr.len - iphlen;
551 
552 		/* Be proactive about malicious use of IPv4-mapped addresses */
553 		if (IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_src) ||
554 		    IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_dst)) {
555 			/* XXX stat */
556 			goto drop;
557 		}
558 
559 		/*
560 		 * Be proactive about an unspecified IPv6 source address.
561 		 * Since we use the all-zero address to indicate an unbound/
562 		 * unconnected pcb, an unspecified source could confuse us.
563 		 *
564 		 * Note that packets with an unspecified IPv6 destination
565 		 * are already dropped in ip6_input.
566 		 */
567 		if (IN6_IS_ADDR_UNSPECIFIED(&ipv6->ip6_src)) {
568 			/* XXX stat */
569 			goto drop;
570 		}
571 
572 		/*
573 		 * Checksum extended TCP header and data.
574 		 */
575 		if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
576 			tcpstat.tcps_rcvbadsum++;
577 			goto drop;
578 		}
579 		break;
580 #endif
581 	}
582 #endif /* TUBA_INCLUDE */
583 
584 	th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);
585 
586 	/*
587 	 * Check that TCP offset makes sense,
588 	 * pull out TCP options and adjust length.		XXX
589 	 */
590 	off = th->th_off << 2;
591 	if (off < sizeof(struct tcphdr) || off > tlen) {
592 		tcpstat.tcps_rcvbadoff++;
593 		goto drop;
594 	}
595 	tlen -= off;
596 	if (off > sizeof(struct tcphdr)) {
597 		if (m->m_len < iphlen + off) {
598 			if ((m = m_pullup2(m, iphlen + off)) == 0) {
599 				tcpstat.tcps_rcvshort++;
600 #ifdef IPSEC
601 				if (tdbi)
602 					free(tdbi, M_TEMP);
603 #endif /* IPSEC */
604 				return;
605 			}
606 			switch (af) {
607 			case AF_INET:
608 				ip = mtod(m, struct ip *);
609 				break;
610 #ifdef INET6
611 			case AF_INET6:
612 				ipv6 = mtod(m, struct ip6_hdr *);
613 				break;
614 #endif
615 			}
616 			th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);
617 		}
618 		optlen = off - sizeof(struct tcphdr);
619 		optp = mtod(m, caddr_t) + iphlen + sizeof(struct tcphdr);
620 		/*
621 		 * Do quick retrieval of timestamp options ("options
622 		 * prediction?").  If timestamp is the only option and it's
623 		 * formatted as recommended in RFC 1323 appendix A, we
624 		 * quickly get the values now and not bother calling
625 		 * tcp_dooptions(), etc.
626 		 */
627 		if ((optlen == TCPOLEN_TSTAMP_APPA ||
628 		     (optlen > TCPOLEN_TSTAMP_APPA &&
629 			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
630 		     *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
631 		     (th->th_flags & TH_SYN) == 0) {
632 			ts_present = 1;
633 			ts_val = ntohl(*(u_int32_t *)(optp + 4));
634 			ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
635 			optp = NULL;	/* we've parsed the options */
636 		}
637 	}
638 	tiflags = th->th_flags;
639 
640 	/*
641 	 * Convert TCP protocol specific fields to host format.
642 	 */
643 	NTOHL(th->th_seq);
644 	NTOHL(th->th_ack);
645 	NTOHS(th->th_win);
646 	NTOHS(th->th_urp);
647 
648 	/*
649 	 * Locate pcb for segment.
650 	 */
651 findpcb:
652 	switch (af) {
653 #ifdef INET6
654 	case AF_INET6:
655 		inp = in6_pcbhashlookup(&tcbtable, &ipv6->ip6_src, th->th_sport,
656 		    &ipv6->ip6_dst, th->th_dport);
657 		break;
658 #endif
659 	case AF_INET:
660 		inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
661 		    ip->ip_dst, th->th_dport);
662 		break;
663 	}
664 	if (inp == 0) {
665 		++tcpstat.tcps_pcbhashmiss;
666 		switch (af) {
667 #ifdef INET6
668 		case AF_INET6:
669 			inp = in_pcblookup(&tcbtable, &ipv6->ip6_src,
670 			    th->th_sport, &ipv6->ip6_dst, th->th_dport,
671 			    INPLOOKUP_WILDCARD | INPLOOKUP_IPV6);
672 			break;
673 #endif /* INET6 */
674 		case AF_INET:
675 			inp = in_pcblookup(&tcbtable, &ip->ip_src, th->th_sport,
676 			    &ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD);
677 			break;
678 		}
679 		/*
680 		 * If the state is CLOSED (i.e., TCB does not exist) then
681 		 * all data in the incoming segment is discarded.
682 		 * If the TCB exists but is in CLOSED state, it is embryonic,
683 		 * but should either do a listen or a connect soon.
684 		 */
685 		if (inp == 0) {
686 			++tcpstat.tcps_noport;
687 			goto dropwithreset_ratelim;
688 		}
689 	}
690 
691 	tp = intotcpcb(inp);
692 	if (tp == 0)
693 		goto dropwithreset_ratelim;
694 	if (tp->t_state == TCPS_CLOSED)
695 		goto drop;
696 
697 	/* Unscale the window into a 32-bit value. */
698 	if ((tiflags & TH_SYN) == 0)
699 		tiwin = th->th_win << tp->snd_scale;
700 	else
701 		tiwin = th->th_win;
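	/*
	 * Example: with snd_scale = 3, an advertised th_win of 4096
	 * unscales to a 32768-byte send window.  The window field of a
	 * SYN segment is never scaled (RFC 1323), hence the TH_SYN test.
	 */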
702 
703 	so = inp->inp_socket;
704 	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
705 		if (so->so_options & SO_DEBUG) {
706 			ostate = tp->t_state;
707 			switch (af) {
708 #ifdef INET6
709 			case AF_INET6:
710 				tcp_saveti6 = *(mtod(m, struct tcpipv6hdr *));
711 				break;
712 #endif
713 			case AF_INET:
714 				tcp_saveti = *(mtod(m, struct tcpiphdr *));
715 				break;
716 			}
717 		}
718 		if (so->so_options & SO_ACCEPTCONN) {
719 			struct socket *so1;
720 
721 			so1 = sonewconn(so, 0);
722 			if (so1 == NULL) {
723 				tcpdropoldhalfopen(tp, th->th_dport);
724 				so1 = sonewconn(so, 0);
725 				if (so1 == NULL)
726 					goto drop;
727 			}
728 			so = so1;
729 			/*
730 			 * This is ugly, but ....
731 			 *
732 			 * Mark socket as temporary until we're
733 			 * committed to keeping it.  The code at
734 			 * ``drop'' and ``dropwithreset'' check the
735 			 * flag dropsocket to see if the temporary
736 			 * socket created here should be discarded.
737 			 * We mark the socket as discardable until
738 			 * we're committed to it below in TCPS_LISTEN.
739 			 */
740 			dropsocket++;
741 #ifdef IPSEC
742 			/*
743 			 * We need to copy the required security levels
744 			 * from the old pcb.
745 			 */
746 			{
747 			  struct inpcb *newinp = (struct inpcb *)so->so_pcb;
748 			  bcopy(inp->inp_seclevel, newinp->inp_seclevel,
749 				sizeof(inp->inp_seclevel));
750 			  newinp->inp_secrequire = inp->inp_secrequire;
751 			}
752 #endif /* IPSEC */
753 #ifdef INET6
754 			/*
755 			 * inp still has the OLD in_pcb stuff; set the
756 			 * v6-related flags on the new pcb, too.  This is
757 			 * done particularly for the case where an AF_INET6
758 			 * socket is bound only to a port, and a v4 connection
759 			 * comes in on that port.
760 			 * We also copy the flowinfo from the original pcb
761 			 * to the new one.
762 			 */
763 			{
764 			  int flags = inp->inp_flags;
765 			  struct inpcb *oldinpcb = inp;
766 
767 			  inp = (struct inpcb *)so->so_pcb;
768 			  inp->inp_flags |= (flags & INP_IPV6);
769 			  if ((inp->inp_flags & INP_IPV6) != 0) {
770 			    inp->inp_ipv6.ip6_hlim =
771 			      oldinpcb->inp_ipv6.ip6_hlim;
772 			    inp->inp_ipv6.ip6_flow =
773 			      oldinpcb->inp_ipv6.ip6_flow;
774 			  }
775 			}
776 #else /* INET6 */
777 			inp = (struct inpcb *)so->so_pcb;
778 #endif /* INET6 */
779 			inp->inp_lport = th->th_dport;
780 			switch (af) {
781 #ifdef INET6
782 			case AF_INET6:
783 				inp->inp_laddr6 = ipv6->ip6_dst;
784 				inp->inp_fflowinfo =
785 				    htonl(0x0fffffff) & ipv6->ip6_flow;
786 
787 				/*inp->inp_options = ip6_srcroute();*/ /* soon. */
788 				/*
789 				 * still need to tweak outbound options
790 				 * processing to include this mbuf in
791 				 * the right place and put the correct
792 				 * NextHdr values in the right places.
793 				 * XXX  rja
794 				 */
795 				break;
796 #endif /* INET6 */
797 			case AF_INET:
798 				inp->inp_laddr = ip->ip_dst;
799 				inp->inp_options = ip_srcroute();
800 				break;
801 			}
802 			in_pcbrehash(inp);
803 			tp = intotcpcb(inp);
804 			tp->t_state = TCPS_LISTEN;
805 
806 			/* Compute proper scaling value from buffer space. */
808 			tcp_rscale(tp, so->so_rcv.sb_hiwat);
809 		}
810 	}
811 
812 #ifdef IPSEC
813 	s = splnet();
814 	if (tdbi == NULL)
815 		tdb = NULL;
816 	else
817 		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
818 
819 	ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
820 	    tdb, inp);
821 	splx(s);
822 
823 	if (tdbi)
824 		free(tdbi, M_TEMP);
825 	tdbi = NULL;
826 
827 	/* Error or otherwise drop-packet indication */
828 	if (error)
829 		goto drop;
830 #endif /* IPSEC */
831 
832 	/*
833 	 * Segment received on connection.
834 	 * Reset idle time and keep-alive timer.
835 	 */
836 	tp->t_idle = 0;
837 	if (tp->t_state != TCPS_SYN_RECEIVED)
838 		tp->t_timer[TCPT_KEEP] = tcp_keepidle;
839 
840 #ifdef TCP_SACK
841 	if (!tp->sack_disable)
842 		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
843 #endif /* TCP_SACK */
844 
845 	/*
846 	 * Process options if not in LISTEN state,
847 	 * else do it below (after getting remote address).
848 	 */
849 	if (optp && tp->t_state != TCPS_LISTEN)
850 		tcp_dooptions(tp, optp, optlen, th,
851 			&ts_present, &ts_val, &ts_ecr);
852 
853 #ifdef TCP_SACK
854 	if (!tp->sack_disable) {
855 		tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
856 		tp->rcv_lastend = th->th_seq + tlen;
857 	}
858 #endif /* TCP_SACK */
859 	/*
860 	 * Header prediction: check for the two common cases
861 	 * of a uni-directional data xfer.  If the packet has
862 	 * no control flags, is in-sequence, the window didn't
863 	 * change and we're not retransmitting, it's a
864 	 * candidate.  If the length is zero and the ack moved
865 	 * forward, we're the sender side of the xfer.  Just
866 	 * free the data acked & wake any higher level process
867 	 * that was blocked waiting for space.  If the length
868 	 * is non-zero and the ack didn't move, we're the
869 	 * receiver side.  If we're getting packets in-order
870 	 * (the reassembly queue is empty), add the data to
871 	 * the socket buffer and note that we need a delayed ack.
872 	 */
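	/*
	 * Informally, the tests below accept only: an ESTABLISHED
	 * connection, a segment with nothing but ACK set, no timestamp
	 * or one no older than the last seen, exactly the next expected
	 * sequence number, an unchanged nonzero window, and nothing
	 * retransmitted and outstanding (snd_nxt == snd_max).
	 */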
873 	if (tp->t_state == TCPS_ESTABLISHED &&
874 	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
875 	    (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) &&
876 	    th->th_seq == tp->rcv_nxt &&
877 	    tiwin && tiwin == tp->snd_wnd &&
878 	    tp->snd_nxt == tp->snd_max) {
879 
880 		/*
881 		 * If last ACK falls within this segment's sequence numbers,
882 		 *  record the timestamp.
883 		 * Fix from Braden, see Stevens p. 870
884 		 */
885 		if (ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
886 			tp->ts_recent_age = tcp_now;
887 			tp->ts_recent = ts_val;
888 		}
889 
890 		if (tlen == 0) {
891 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
892 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
893 			    tp->snd_cwnd >= tp->snd_wnd &&
894 			    tp->t_dupacks == 0) {
895 				/*
896 				 * this is a pure ack for outstanding data.
897 				 */
898 				++tcpstat.tcps_predack;
899 				if (ts_present)
900 					tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
901 				else if (tp->t_rtt &&
902 					    SEQ_GT(th->th_ack, tp->t_rtseq))
903 					tcp_xmit_timer(tp, tp->t_rtt);
904 				acked = th->th_ack - tp->snd_una;
905 				tcpstat.tcps_rcvackpack++;
906 				tcpstat.tcps_rcvackbyte += acked;
907 				ND6_HINT(tp);
908 				sbdrop(&so->so_snd, acked);
909 				tp->snd_una = th->th_ack;
910 #if defined(TCP_SACK)
911 				/*
912 				 * We want snd_last to track snd_una so
913 				 * as to avoid sequence wraparound problems
914 				 * for very large transfers.
915 				 */
916 				tp->snd_last = tp->snd_una;
917 #endif /* TCP_SACK */
918 #if defined(TCP_SACK) && defined(TCP_FACK)
919 				tp->snd_fack = tp->snd_una;
920 				tp->retran_data = 0;
921 #endif /* TCP_FACK */
922 				m_freem(m);
923 
924 				/*
925 				 * If all outstanding data are acked, stop
926 				 * retransmit timer, otherwise restart timer
927 				 * using current (possibly backed-off) value.
928 				 * If process is waiting for space,
929 				 * wakeup/selwakeup/signal.  If data
930 				 * are ready to send, let tcp_output
931 				 * decide between more output or persist.
932 				 */
933 				if (tp->snd_una == tp->snd_max)
934 					tp->t_timer[TCPT_REXMT] = 0;
935 				else if (tp->t_timer[TCPT_PERSIST] == 0)
936 					tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
937 
938 				if (sb_notify(&so->so_snd))
939 					sowwakeup(so);
940 				if (so->so_snd.sb_cc)
941 					(void) tcp_output(tp);
942 				return;
943 			}
944 		} else if (th->th_ack == tp->snd_una &&
945 		    tp->segq.lh_first == NULL &&
946 		    tlen <= sbspace(&so->so_rcv)) {
947 			/*
948 			 * This is a pure, in-sequence data packet
949 			 * with nothing on the reassembly queue and
950 			 * we have enough buffer space to take it.
951 			 */
952 #ifdef TCP_SACK
953 			/* Clean receiver SACK report if present */
954 			if (!tp->sack_disable && tp->rcv_numsacks)
955 				tcp_clean_sackreport(tp);
956 #endif /* TCP_SACK */
957 			++tcpstat.tcps_preddat;
958 			tp->rcv_nxt += tlen;
959 			tcpstat.tcps_rcvpack++;
960 			tcpstat.tcps_rcvbyte += tlen;
961 			ND6_HINT(tp);
962 			/*
963 			 * Drop TCP, IP headers and TCP options then add data
964 			 * to socket buffer.
965 			 */
966 			if (th->th_flags & TH_PUSH)
967 				tp->t_flags |= TF_ACKNOW;
968 			else
969 				tp->t_flags |= TF_DELACK;
970 			m_adj(m, iphlen + off);
971 			sbappend(&so->so_rcv, m);
972 			sorwakeup(so);
973 			return;
974 		}
975 	}
976 
977 	/*
978 	 * Compute mbuf offset to TCP data segment.
979 	 */
980 	hdroptlen = iphlen + off;
981 
982 	/*
983 	 * Calculate amount of space in receive window,
984 	 * and then do TCP input processing.
985 	 * Receive window is amount of space in rcv queue,
986 	 * but not less than advertised window.
987 	 */
988 	{ int win;
989 
990 	win = sbspace(&so->so_rcv);
991 	if (win < 0)
992 		win = 0;
993 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
994 	}
995 
996 	switch (tp->t_state) {
997 
998 	/*
999 	 * If the state is LISTEN then ignore segment if it contains an RST.
1000 	 * If the segment contains an ACK then it is bad and send a RST.
1001 	 * If it does not contain a SYN then it is not interesting; drop it.
1002 	 * If it is from this socket, drop it, it must be forged.
1003 	 * Don't bother responding if the destination was a broadcast.
1004 	 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
1005 	 * tp->iss, and send a segment:
1006 	 *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
1007 	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
1008 	 * Fill in remote peer address fields if not previously specified.
1009 	 * Enter SYN_RECEIVED state, and process any other fields of this
1010 	 * segment in this state.
1011 	 */
1012 	case TCPS_LISTEN: {
1013 		struct mbuf *am;
1014 		register struct sockaddr_in *sin;
1015 #ifdef INET6
1016 		register struct sockaddr_in6 *sin6;
1017 #endif /* INET6 */
1018 
1019 		if (tiflags & TH_RST)
1020 			goto drop;
1021 		if (tiflags & TH_ACK)
1022 			goto dropwithreset;
1023 		if ((tiflags & TH_SYN) == 0)
1024 			goto drop;
1025 		if (th->th_dport == th->th_sport) {
1026 			switch (af) {
1027 #ifdef INET6
1028 			case AF_INET6:
1029 				if (IN6_ARE_ADDR_EQUAL(&ipv6->ip6_src,
1030 				    &ipv6->ip6_dst))
1031 					goto drop;
1032 				break;
1033 #endif /* INET6 */
1034 			case AF_INET:
1035 				if (ip->ip_dst.s_addr == ip->ip_src.s_addr)
1036 					goto drop;
1037 				break;
1038 			}
1039 		}
1040 
1041 		/*
1042 		 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1043 		 * in_broadcast() should never return true on a received
1044 		 * packet with M_BCAST not set.
1045 		 */
1046 		if (m->m_flags & (M_BCAST|M_MCAST))
1047 			goto drop;
1048 		switch (af) {
1049 #ifdef INET6
1050 		case AF_INET6:
1051 			/* XXX What about IPv6 Anycasting ?? :-(  rja */
1052 			if (IN6_IS_ADDR_MULTICAST(&ipv6->ip6_dst))
1053 				goto drop;
1054 			break;
1055 #endif /* INET6 */
1056 		case AF_INET:
1057 			if (IN_MULTICAST(ip->ip_dst.s_addr))
1058 				goto drop;
1059 			break;
1060 		}
1061 		am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
1062 		if (am == NULL)
1063 			goto drop;
1064 		switch (af) {
1065 #ifdef INET6
1066 		case AF_INET6:
1067 			/*
1068 			 * This is probably the place to set the tp->pf value.
1069 			 * (Don't forget to do it in the v4 code as well!)
1070 			 *
1071 			 * Also, remember to blank out things like flowlabel, or
1072 			 * set flowlabel for accepted sockets in v6.
1073 			 *
1074 			 * FURTHERMORE, this is PROBABLY the place where the
1075 			 * whole business of key munging is set up for passive
1076 			 * connections.
1077 			 */
1078 			am->m_len = sizeof(struct sockaddr_in6);
1079 			sin6 = mtod(am, struct sockaddr_in6 *);
1080 			sin6->sin6_family = AF_INET6;
1081 			sin6->sin6_len = sizeof(struct sockaddr_in6);
1082 			sin6->sin6_addr = ipv6->ip6_src;
1083 			sin6->sin6_port = th->th_sport;
1084 			sin6->sin6_flowinfo = htonl(0x0fffffff) &
1085 				inp->inp_ipv6.ip6_flow;
1086 			laddr6 = inp->inp_laddr6;
1087 			if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6))
1088 				inp->inp_laddr6 = ipv6->ip6_dst;
1089 			/* This is a good optimization. */
1090 			if (in6_pcbconnect(inp, am)) {
1091 				inp->inp_laddr6 = laddr6;
1092 				(void) m_free(am);
1093 				goto drop;
1094 			}
1095 			break;
1096 #endif
1097 		case AF_INET:
1098 			/* drop IPv4 packet to AF_INET6 socket */
1099 			if (inp->inp_flags & INP_IPV6) {
1100 				(void) m_free(am);
1101 				goto drop;
1102 			}
1103 			am->m_len = sizeof(struct sockaddr_in);
1104 			sin = mtod(am, struct sockaddr_in *);
1105 			sin->sin_family = AF_INET;
1106 			sin->sin_len = sizeof(*sin);
1107 			sin->sin_addr = ip->ip_src;
1108 			sin->sin_port = th->th_sport;
1109 			bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1110 			laddr = inp->inp_laddr;
1111 			if (inp->inp_laddr.s_addr == INADDR_ANY)
1112 				inp->inp_laddr = ip->ip_dst;
1113 			if (in_pcbconnect(inp, am)) {
1114 				inp->inp_laddr = laddr;
1115 				(void) m_free(am);
1116 				goto drop;
1117 			}
1118 			(void) m_free(am);
1119 			break;
1120 		}
1121 		tp->t_template = tcp_template(tp);
1122 		if (tp->t_template == 0) {
1123 			tp = tcp_drop(tp, ENOBUFS);
1124 			dropsocket = 0;		/* socket is already gone */
1125 			goto drop;
1126 		}
1127 		if (optp)
1128 			tcp_dooptions(tp, optp, optlen, th,
1129 				&ts_present, &ts_val, &ts_ecr);
1130 #ifdef TCP_SACK
1131 		/*
1132 		 * If peer did not send a SACK_PERMITTED option (i.e., if
1133 		 * tcp_dooptions() did not set TF_SACK_PERMIT), set
1134 		 * sack_disable to 1 if it is currently 0.
1135 		 */
1136 		if (!tp->sack_disable &&
1137 		    (tp->t_flags & TF_SACK_PERMIT) == 0)
1138 			tp->sack_disable = 1;
1139 #endif
1140 
1141 		if (iss)
1142 			tp->iss = iss;
1143 		else {
1144 #ifdef TCP_COMPAT_42
1145 			tcp_iss += TCP_ISSINCR/2;
1146 			tp->iss = tcp_iss;
1147 #else /* TCP_COMPAT_42 */
1148 			tp->iss = tcp_rndiss_next();
1149 #endif /* !TCP_COMPAT_42 */
1150 		}
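		/*
		 * tcp_rndiss_next() above yields a randomized initial send
		 * sequence number, making ISS-prediction attacks harder
		 * than the fixed TCP_ISSINCR increment of the
		 * TCP_COMPAT_42 branch.
		 */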
1151 		tp->irs = th->th_seq;
1152 		tcp_sendseqinit(tp);
1153 #if defined (TCP_SACK)
1154 		tp->snd_last = tp->snd_una;
1155 #endif /* TCP_SACK */
1156 #if defined(TCP_SACK) && defined(TCP_FACK)
1157 		tp->snd_fack = tp->snd_una;
1158 		tp->retran_data = 0;
1159 		tp->snd_awnd = 0;
1160 #endif /* TCP_FACK */
1161 		tcp_rcvseqinit(tp);
1162 		tp->t_flags |= TF_ACKNOW;
1163 		tp->t_state = TCPS_SYN_RECEIVED;
1164 		tp->t_timer[TCPT_KEEP] = tcptv_keep_init;
1165 		dropsocket = 0;		/* committed to socket */
1166 		tcpstat.tcps_accepts++;
1167 		goto trimthenstep6;
1168 		}
1169 
1170 	/*
1171 	 * If the state is SYN_RECEIVED:
1172 	 *	if seg contains a SYN/ACK, send an RST.
1173 	 *	if seg contains an ACK, but not for our SYN/ACK, send an RST.
1174 	 */
1175 
1176 	case TCPS_SYN_RECEIVED:
1177 		if (tiflags & TH_ACK) {
1178 			if (tiflags & TH_SYN) {
1179 				tcpstat.tcps_badsyn++;
1180 				goto dropwithreset;
1181 			}
1182 			if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1183 			    SEQ_GT(th->th_ack, tp->snd_max))
1184 				goto dropwithreset;
1185 		}
1186 		break;
1187 
1188 	/*
1189 	 * If the state is SYN_SENT:
1190 	 *	if seg contains an ACK, but not for our SYN, drop the input.
1191 	 *	if seg contains a RST, then drop the connection.
1192 	 *	if seg does not contain SYN, then drop it.
1193 	 * Otherwise this is an acceptable SYN segment
1194 	 *	initialize tp->rcv_nxt and tp->irs
1195 	 *	if seg contains ack then advance tp->snd_una
1196 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1197 	 *	arrange for segment to be acked (eventually)
1198 	 *	continue processing rest of data/controls, beginning with URG
1199 	 */
1200 	case TCPS_SYN_SENT:
1201 		if ((tiflags & TH_ACK) &&
1202 		    (SEQ_LEQ(th->th_ack, tp->iss) ||
1203 		     SEQ_GT(th->th_ack, tp->snd_max)))
1204 			goto dropwithreset;
1205 		if (tiflags & TH_RST) {
1206 			if (tiflags & TH_ACK)
1207 				tp = tcp_drop(tp, ECONNREFUSED);
1208 			goto drop;
1209 		}
1210 		if ((tiflags & TH_SYN) == 0)
1211 			goto drop;
1212 		if (tiflags & TH_ACK) {
1213 			tp->snd_una = th->th_ack;
1214 			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1215 				tp->snd_nxt = tp->snd_una;
1216 		}
1217 		tp->t_timer[TCPT_REXMT] = 0;
1218 		tp->irs = th->th_seq;
1219 		tcp_rcvseqinit(tp);
1220 		tp->t_flags |= TF_ACKNOW;
1221 #ifdef TCP_SACK
1222 		/*
1223 		 * If we've sent a SACK_PERMITTED option, and the peer
1224 		 * also replied with one, then TF_SACK_PERMIT should have
1225 		 * been set in tcp_dooptions().  If it was not, disable SACKs.
1226 		 */
1227 		if (!tp->sack_disable &&
1228 		    (tp->t_flags & TF_SACK_PERMIT) == 0)
1229 			tp->sack_disable = 1;
1230 #endif
1231 		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
1232 			tcpstat.tcps_connects++;
1233 			soisconnected(so);
1234 			tp->t_state = TCPS_ESTABLISHED;
1235 			/* Do window scaling on this connection? */
1236 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1237 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1238 				tp->snd_scale = tp->requested_s_scale;
1239 				tp->rcv_scale = tp->request_r_scale;
1240 			}
1241 			(void) tcp_reass(tp, (struct tcphdr *)0,
1242 				(struct mbuf *)0, &tlen);
1243 			/*
1244 			 * if we didn't have to retransmit the SYN,
1245 			 * use its rtt as our initial srtt & rtt var.
1246 			 */
1247 			if (tp->t_rtt)
1248 				tcp_xmit_timer(tp, tp->t_rtt);
1249 			/*
1250 			 * Since new data was acked (the SYN), open the
1251 			 * congestion window by one MSS.  We do this
1252 			 * here, because we won't go through the normal
1253 			 * ACK processing below.  And since this is the
1254 			 * start of the connection, we know we are in
1255 			 * the exponential phase of slow-start.
1256 			 */
1257 			tp->snd_cwnd += tp->t_maxseg;
1258 		} else
1259 			tp->t_state = TCPS_SYN_RECEIVED;
1260 
1261 trimthenstep6:
1262 		/*
1263 		 * Advance th->th_seq to correspond to first data byte.
1264 		 * If data, trim to stay within window,
1265 		 * dropping FIN if necessary.
1266 		 */
1267 		th->th_seq++;
1268 		if (tlen > tp->rcv_wnd) {
1269 			todrop = tlen - tp->rcv_wnd;
1270 			m_adj(m, -todrop);
1271 			tlen = tp->rcv_wnd;
1272 			tiflags &= ~TH_FIN;
1273 			tcpstat.tcps_rcvpackafterwin++;
1274 			tcpstat.tcps_rcvbyteafterwin += todrop;
1275 		}
1276 		tp->snd_wl1 = th->th_seq - 1;
1277 		tp->rcv_up = th->th_seq;
1278 		goto step6;
1279 	}
1280 
1281 	/*
1282 	 * States other than LISTEN or SYN_SENT.
1283 	 * First check timestamp, if present.
1284 	 * Then check that at least some bytes of segment are within
1285 	 * receive window.  If segment begins before rcv_nxt,
1286 	 * drop leading data (and SYN); if nothing left, just ack.
1287 	 *
1288 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1289 	 * and it's less than ts_recent, drop it.
1290 	 */
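	/*
	 * TCP_PAWS_IDLE, defined near the top of this file, is 24 days in
	 * slow-timeout ticks: with the usual PR_SLOWHZ of 2 that is
	 * 24 * 24 * 60 * 60 * 2 = 4,147,200 ticks.  A ts_recent older
	 * than that is assumed to be stale (e.g. a restarted timestamp
	 * clock) and is invalidated rather than used to reject segments.
	 */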
1291 	if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
1292 	    TSTMP_LT(ts_val, tp->ts_recent)) {
1293 
1294 		/* Check to see if ts_recent is over 24 days old.  */
1295 		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1296 			/*
1297 			 * Invalidate ts_recent.  If this segment updates
1298 			 * ts_recent, the age will be reset later and ts_recent
1299 			 * will get a valid value.  If it does not, setting
1300 			 * ts_recent to zero will at least satisfy the
1301 			 * requirement that zero be placed in the timestamp
1302 			 * echo reply when ts_recent isn't valid.  The
1303 			 * age isn't reset until we get a valid ts_recent
1304 			 * because we don't want out-of-order segments to be
1305 			 * dropped when ts_recent is old.
1306 			 */
1307 			tp->ts_recent = 0;
1308 		} else {
1309 			tcpstat.tcps_rcvduppack++;
1310 			tcpstat.tcps_rcvdupbyte += tlen;
1311 			tcpstat.tcps_pawsdrop++;
1312 			goto dropafterack;
1313 		}
1314 	}
1315 
1316 	todrop = tp->rcv_nxt - th->th_seq;
1317 	if (todrop > 0) {
1318 		if (tiflags & TH_SYN) {
1319 			tiflags &= ~TH_SYN;
1320 			th->th_seq++;
1321 			if (th->th_urp > 1)
1322 				th->th_urp--;
1323 			else
1324 				tiflags &= ~TH_URG;
1325 			todrop--;
1326 		}
1327 		if (todrop >= tlen ||
1328 		    (todrop == tlen && (tiflags & TH_FIN) == 0)) {
1329 			/*
1330 			 * Any valid FIN must be to the left of the
1331 			 * window.  At this point, FIN must be a
1332 			 * duplicate or out-of-sequence, so drop it.
1333 			 */
1334 			tiflags &= ~TH_FIN;
1335 			/*
1336 			 * Send ACK to resynchronize, and drop any data,
1337 			 * but keep on processing for RST or ACK.
1338 			 */
1339 			tp->t_flags |= TF_ACKNOW;
1340 			tcpstat.tcps_rcvdupbyte += todrop = tlen;
1341 			tcpstat.tcps_rcvduppack++;
1342 		} else {
1343 			tcpstat.tcps_rcvpartduppack++;
1344 			tcpstat.tcps_rcvpartdupbyte += todrop;
1345 		}
1346 		hdroptlen += todrop;	/* drop from head afterwards */
1347 		th->th_seq += todrop;
1348 		tlen -= todrop;
1349 		if (th->th_urp > todrop)
1350 			th->th_urp -= todrop;
1351 		else {
1352 			tiflags &= ~TH_URG;
1353 			th->th_urp = 0;
1354 		}
1355 	}
1356 
1357 	/*
1358 	 * If new data are received on a connection after the
1359 	 * user processes are gone, then RST the other end.
1360 	 */
1361 	if ((so->so_state & SS_NOFDREF) &&
1362 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1363 		tp = tcp_close(tp);
1364 		tcpstat.tcps_rcvafterclose++;
1365 		goto dropwithreset;
1366 	}
1367 
1368 	/*
1369 	 * If segment ends after window, drop trailing data
1370 	 * (and PUSH and FIN); if nothing left, just ACK.
1371 	 */
1372 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1373 	if (todrop > 0) {
1374 		tcpstat.tcps_rcvpackafterwin++;
1375 		if (todrop >= tlen) {
1376 			tcpstat.tcps_rcvbyteafterwin += tlen;
1377 			/*
1378 			 * If a new connection request is received
1379 			 * while in TIME_WAIT, drop the old connection
1380 			 * and start over if the sequence numbers
1381 			 * are above the previous ones.
1382 			 */
1383 			if (tiflags & TH_SYN &&
1384 			    tp->t_state == TCPS_TIME_WAIT &&
1385 			    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1386 				iss = tp->snd_nxt + TCP_ISSINCR;
1387 				tp = tcp_close(tp);
1388 				goto findpcb;
1389 			}
1390 			/*
1391 			 * If window is closed can only take segments at
1392 			 * window edge, and have to drop data and PUSH from
1393 			 * incoming segments.  Continue processing, but
1394 			 * remember to ack.  Otherwise, drop segment
1395 			 * and ack.
1396 			 */
1397 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1398 				tp->t_flags |= TF_ACKNOW;
1399 				tcpstat.tcps_rcvwinprobe++;
1400 			} else
1401 				goto dropafterack;
1402 		} else
1403 			tcpstat.tcps_rcvbyteafterwin += todrop;
1404 		m_adj(m, -todrop);
1405 		tlen -= todrop;
1406 		tiflags &= ~(TH_PUSH|TH_FIN);
1407 	}
1408 
1409 	/*
1410 	 * If last ACK falls within this segment's sequence numbers,
1411 	 * record its timestamp.
1412 	 * Fix from Braden, see Stevens p. 870
1413 	 */
1414 	if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) &&
1415 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1416 		tp->ts_recent_age = tcp_now;
1417 		tp->ts_recent = ts_val;
1418 	}
1419 
1420 	/*
1421 	 * If the RST bit is set examine the state:
1422 	 *    SYN_RECEIVED STATE:
1423 	 *	If passive open, return to LISTEN state.
1424 	 *	If active open, inform user that connection was refused.
1425 	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1426 	 *	Inform user that connection was reset, and close tcb.
1427 	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
1428 	 *	Close the tcb.
1429 	 */
1430 	if (tiflags & TH_RST) {
1431 		if (th->th_seq != tp->last_ack_sent)
1432 			goto drop;
1433 
1434 		switch (tp->t_state) {
1435 		case TCPS_SYN_RECEIVED:
1436 			so->so_error = ECONNREFUSED;
1437 			goto close;
1438 
1439 		case TCPS_ESTABLISHED:
1440 		case TCPS_FIN_WAIT_1:
1441 		case TCPS_FIN_WAIT_2:
1442 		case TCPS_CLOSE_WAIT:
1443 			so->so_error = ECONNRESET;
1444 		close:
1445 			tp->t_state = TCPS_CLOSED;
1446 			tcpstat.tcps_drops++;
1447 			tp = tcp_close(tp);
1448 			goto drop;
1449 		case TCPS_CLOSING:
1450 		case TCPS_LAST_ACK:
1451 		case TCPS_TIME_WAIT:
1452 			tp = tcp_close(tp);
1453 			goto drop;
1454 		}
1455 	}
1456 
1457 	/*
1458 	 * If a SYN is in the window, then this is an
1459 	 * error and we send an RST and drop the connection.
1460 	 */
1461 	if (tiflags & TH_SYN) {
1462 		tp = tcp_drop(tp, ECONNRESET);
1463 		goto dropwithreset;
1464 	}
1465 
1466 	/*
1467 	 * If the ACK bit is off we drop the segment and return.
1468 	 */
1469 	if ((tiflags & TH_ACK) == 0) {
1470 		if (tp->t_flags & TF_ACKNOW)
1471 			goto dropafterack;
1472 		else
1473 			goto drop;
1474 	}
1475 
1476 	/*
1477 	 * Ack processing.
1478 	 */
1479 	switch (tp->t_state) {
1480 
1481 	/*
1482 	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1483 	 * ESTABLISHED state and continue processing.
1484 	 * The ACK was checked above.
1485 	 */
1486 	case TCPS_SYN_RECEIVED:
1487 		tcpstat.tcps_connects++;
1488 		soisconnected(so);
1489 		tp->t_state = TCPS_ESTABLISHED;
1490 		/* Do window scaling? */
1491 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1492 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
1493 			tp->snd_scale = tp->requested_s_scale;
1494 			tp->rcv_scale = tp->request_r_scale;
1495 		}
1496 		(void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
1497 				 &tlen);
1498 		tp->snd_wl1 = th->th_seq - 1;
1499 		/* fall into ... */
1500 
1501 	/*
1502 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1503 	 * ACKs.  If the ack is in the range
1504 	 *	tp->snd_una < th->th_ack <= tp->snd_max
1505 	 * then advance tp->snd_una to th->th_ack and drop
1506 	 * data from the retransmission queue.  If this ACK reflects
1507 	 * more up to date window information we update our window information.
1508 	 */
1509 	case TCPS_ESTABLISHED:
1510 	case TCPS_FIN_WAIT_1:
1511 	case TCPS_FIN_WAIT_2:
1512 	case TCPS_CLOSE_WAIT:
1513 	case TCPS_CLOSING:
1514 	case TCPS_LAST_ACK:
1515 	case TCPS_TIME_WAIT:
1516 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1517 			/*
1518 			 * Duplicate/old ACK processing.
1519 			 * Increments t_dupacks:
1520 			 *	Pure duplicate (same seq/ack/window, no data)
1521 			 * Doesn't affect t_dupacks:
1522 			 *	Data packets.
1523 			 *	Normal window updates (window opens)
1524 			 * Resets t_dupacks:
1525 			 *	New data ACKed.
1526 			 *	Window shrinks
1527 			 *	Old ACK
1528 			 */
1529 			if (tlen)
1530 				break;
1531 			/*
1532 			 * If we get an old ACK, there is probably packet
1533 			 * reordering going on.  Be conservative and reset
1534 			 * t_dupacks so that we are less aggressive in
1535 			 * doing a fast retransmit.
1536 			 */
1537 			if (th->th_ack != tp->snd_una) {
1538 				tp->t_dupacks = 0;
1539 				break;
1540 			}
1541 			if (tiwin == tp->snd_wnd) {
1542 				tcpstat.tcps_rcvdupack++;
1543 				/*
1544 				 * If we have outstanding data (other than
1545 				 * a window probe), this is a completely
1546 				 * duplicate ack (ie, window info didn't
1547 				 * change), the ack is the biggest we've
1548 				 * seen and we've seen exactly our rexmt
1549 				 * threshold of them, assume a packet
1550 				 * has been dropped and retransmit it.
1551 				 * Kludge snd_nxt & the congestion
1552 				 * window so we send only this one
1553 				 * packet.
1554 				 *
1555 				 * We know we're losing at the current
1556 				 * window size so do congestion avoidance
1557 				 * (set ssthresh to half the current window
1558 				 * and pull our congestion window back to
1559 				 * the new ssthresh).
1560 				 *
1561 				 * Dup acks mean that packets have left the
1562 				 * network (they're now cached at the receiver)
1563 				 * so bump cwnd by the amount in the receiver
1564 				 * to keep the number of packets in the
1565 				 * network constant.
1566 				 */
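				/*
				 * Worked example, assuming t_maxseg = 1460:
				 * with snd_wnd = 32768 and snd_cwnd = 16384,
				 * win = 16384 / 2 / 1460 = 5 segments below,
				 * so snd_ssthresh becomes 5 * 1460 = 7300;
				 * in the non-SACK path snd_cwnd is then
				 * 7300 + 3 * 1460 = 11680 after the
				 * retransmission.
				 */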
1567 				if (tp->t_timer[TCPT_REXMT] == 0)
1568 					tp->t_dupacks = 0;
1569 #if defined(TCP_SACK) && defined(TCP_FACK)
1570 				/*
1571 				 * In FACK, enter fast recovery if the receiver
1572 				 * reports a reassembly queue longer than 3 segments.
1573 				 */
1574 				else if (++tp->t_dupacks == tcprexmtthresh ||
1575 				    ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
1576 				    tp->t_maxseg + tp->snd_una)) &&
1577 				    SEQ_GT(tp->snd_una, tp->snd_last))) {
1578 #else
1579 				else if (++tp->t_dupacks == tcprexmtthresh) {
1580 #endif /* TCP_FACK */
1581 					tcp_seq onxt = tp->snd_nxt;
1582 					u_long win =
1583 					    ulmin(tp->snd_wnd, tp->snd_cwnd) /
1584 						2 / tp->t_maxseg;
1585 
1586 #if defined(TCP_SACK)
1587 					if (SEQ_LT(th->th_ack, tp->snd_last)) {
1588 						/*
1589 						 * False fast retx after
1590 						 * timeout.  Do not cut window.
1591 						 */
1592 						tp->t_dupacks = 0;
1593 						goto drop;
1594 					}
1595 #endif
1596 					if (win < 2)
1597 						win = 2;
1598 					tp->snd_ssthresh = win * tp->t_maxseg;
1599 #if defined(TCP_SACK)
1600 					tp->snd_last = tp->snd_max;
1601 #endif
1602 #ifdef TCP_SACK
1603 					if (!tp->sack_disable) {
1604 						tp->t_timer[TCPT_REXMT] = 0;
1605 						tp->t_rtt = 0;
1606 						tcpstat.tcps_sndrexmitfast++;
1607 #if defined(TCP_SACK) && defined(TCP_FACK)
1608 						tp->t_dupacks = tcprexmtthresh;
1609 						(void) tcp_output(tp);
1610 						/*
1611 						 * During FR, snd_cwnd is held
1612 						 * constant for FACK.
1613 						 */
1614 						tp->snd_cwnd = tp->snd_ssthresh;
1615 #else
1616 						/*
1617 						 * tcp_output() will send
1618 						 * oldest SACK-eligible rtx.
1619 						 */
1620 						(void) tcp_output(tp);
1621 						tp->snd_cwnd = tp->snd_ssthresh+
1622 					           tp->t_maxseg * tp->t_dupacks;
1623 #endif /* TCP_FACK */
1624 						goto drop;
1625 					}
1626 #endif /* TCP_SACK */
1627 					tp->t_timer[TCPT_REXMT] = 0;
1628 					tp->t_rtt = 0;
1629 					tp->snd_nxt = th->th_ack;
1630 					tp->snd_cwnd = tp->t_maxseg;
1631 					tcpstat.tcps_sndrexmitfast++;
1632 					(void) tcp_output(tp);
1633 
1634 					tp->snd_cwnd = tp->snd_ssthresh +
1635 					    tp->t_maxseg * tp->t_dupacks;
1636 					if (SEQ_GT(onxt, tp->snd_nxt))
1637 						tp->snd_nxt = onxt;
1638 					goto drop;
1639 				} else if (tp->t_dupacks > tcprexmtthresh) {
1640 #if defined(TCP_SACK) && defined(TCP_FACK)
1641 					/*
1642 					 * while (awnd < cwnd)
1643 					 *         sendsomething();
1644 					 */
1645 					if (!tp->sack_disable) {
1646 						if (tp->snd_awnd < tp->snd_cwnd)
1647 							tcp_output(tp);
1648 						goto drop;
1649 					}
1650 #endif /* TCP_FACK */
1651 					tp->snd_cwnd += tp->t_maxseg;
1652 					(void) tcp_output(tp);
1653 					goto drop;
1654 				}
1655 			} else if (tiwin < tp->snd_wnd) {
1656 				/*
1657 				 * The window was retracted!  Previous dup
1658 				 * ACKs may have been due to packets arriving
1659 				 * after the shrunken window, not a missing
1660 				 * packet, so play it safe and reset t_dupacks.
1661 				 */
1662 				tp->t_dupacks = 0;
1663 			}
1664 			break;
1665 		}
1666 		/*
1667 		 * If the congestion window was inflated to account
1668 		 * for the other side's cached packets, retract it.
1669 		 */
1670 #if defined(TCP_SACK)
1671 		if (!tp->sack_disable) {
1672 			if (tp->t_dupacks >= tcprexmtthresh) {
1673 				/* Check for a partial ACK */
1674 				if (tcp_sack_partialack(tp, th)) {
1675 #if defined(TCP_SACK) && defined(TCP_FACK)
1676 					/* Force call to tcp_output */
1677 					if (tp->snd_awnd < tp->snd_cwnd)
1678 						needoutput = 1;
1679 #else
1680 					tp->snd_cwnd += tp->t_maxseg;
1681 					needoutput = 1;
1682 #endif /* TCP_FACK */
1683 				} else {
1684 					/* Out of fast recovery */
1685 					tp->snd_cwnd = tp->snd_ssthresh;
1686 					if (tcp_seq_subtract(tp->snd_max,
1687 					    th->th_ack) < tp->snd_ssthresh)
1688 						tp->snd_cwnd =
1689 						   tcp_seq_subtract(tp->snd_max,
1690 					           th->th_ack);
1691 					tp->t_dupacks = 0;
1692 #if defined(TCP_SACK) && defined(TCP_FACK)
1693 					if (SEQ_GT(th->th_ack, tp->snd_fack))
1694 						tp->snd_fack = th->th_ack;
1695 #endif /* TCP_FACK */
1696 				}
1697 			}
1698 		} else {
1699 			if (tp->t_dupacks >= tcprexmtthresh &&
1700 			    !tcp_newreno(tp, th)) {
1701 				/* Out of fast recovery */
1702 				tp->snd_cwnd = tp->snd_ssthresh;
1703 				if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
1704 				    tp->snd_ssthresh)
1705 					tp->snd_cwnd =
1706 					    tcp_seq_subtract(tp->snd_max,
1707 					    th->th_ack);
1708 				tp->t_dupacks = 0;
1709 			}
1710 		}
1711 		if (tp->t_dupacks < tcprexmtthresh)
1712 			tp->t_dupacks = 0;
1713 #else /* else no TCP_SACK */
1714 		if (tp->t_dupacks >= tcprexmtthresh &&
1715 		    tp->snd_cwnd > tp->snd_ssthresh)
1716 			tp->snd_cwnd = tp->snd_ssthresh;
1717 		tp->t_dupacks = 0;
1718 #endif
1719 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
1720 			tcpstat.tcps_rcvacktoomuch++;
1721 			goto dropafterack;
1722 		}
1723 		acked = th->th_ack - tp->snd_una;
1724 		tcpstat.tcps_rcvackpack++;
1725 		tcpstat.tcps_rcvackbyte += acked;
1726 
1727 		/*
1728 		 * If we have a timestamp reply, update smoothed
1729 		 * round trip time.  If no timestamp is present but
1730 		 * transmit timer is running and timed sequence
1731 		 * number was acked, update smoothed round trip time.
1732 		 * Since we now have an rtt measurement, cancel the
1733 		 * timer backoff (cf., Phil Karn's retransmit alg.).
1734 		 * Recompute the initial retransmit timer.
1735 		 */
1736 		if (ts_present)
1737 			tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1738 		else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq))
1739 			tcp_xmit_timer(tp, tp->t_rtt);
1740 
1741 		/*
1742 		 * If all outstanding data is acked, stop retransmit
1743 		 * timer and remember to restart (more output or persist).
1744 		 * If there is more data to be acked, restart retransmit
1745 		 * timer, using current (possibly backed-off) value.
1746 		 */
1747 		if (th->th_ack == tp->snd_max) {
1748 			tp->t_timer[TCPT_REXMT] = 0;
1749 			needoutput = 1;
1750 		} else if (tp->t_timer[TCPT_PERSIST] == 0)
1751 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1752 		/*
1753 		 * When new data is acked, open the congestion window.
1754 		 * If the window gives us less than ssthresh packets
1755 		 * in flight, open exponentially (maxseg per packet).
1756 		 * Otherwise open linearly: maxseg per window
1757 		 * (maxseg^2 / cwnd per packet).
1758 		 */
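		/*
		 * Example, assuming t_maxseg = 1460: below ssthresh each
		 * ACK grows cwnd by a full 1460 bytes (exponential growth
		 * per RTT); above ssthresh with snd_cwnd = 14600, each ACK
		 * adds 1460 * 1460 / 14600 = 146 bytes, i.e. roughly one
		 * maxseg per window's worth of ACKs (linear growth).
		 */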
1759 		{
1760 		register u_int cw = tp->snd_cwnd;
1761 		register u_int incr = tp->t_maxseg;
1762 
1763 		if (cw > tp->snd_ssthresh)
1764 			incr = incr * incr / cw;
1765 #if defined (TCP_SACK)
1766 		if (tp->t_dupacks < tcprexmtthresh)
1767 #endif
1768 		tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1769 		}
1770 		ND6_HINT(tp);
1771 		if (acked > so->so_snd.sb_cc) {
1772 			tp->snd_wnd -= so->so_snd.sb_cc;
1773 			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1774 			ourfinisacked = 1;
1775 		} else {
1776 			sbdrop(&so->so_snd, acked);
1777 			tp->snd_wnd -= acked;
1778 			ourfinisacked = 0;
1779 		}
1780 		if (sb_notify(&so->so_snd))
1781 			sowwakeup(so);
1782 		tp->snd_una = th->th_ack;
1783 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1784 			tp->snd_nxt = tp->snd_una;
1785 #if defined (TCP_SACK) && defined (TCP_FACK)
1786 		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
1787 			tp->snd_fack = tp->snd_una;
1788 			/* Update snd_awnd for partial ACK
1789 			 * without any SACK blocks.
1790 			 */
1791 			tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
1792 				tp->snd_fack) + tp->retran_data;
1793 		}
1794 #endif
1795 
1796 		switch (tp->t_state) {
1797 
1798 		/*
1799 		 * In FIN_WAIT_1 STATE in addition to the processing
1800 		 * for the ESTABLISHED state if our FIN is now acknowledged
1801 		 * then enter FIN_WAIT_2.
1802 		 */
1803 		case TCPS_FIN_WAIT_1:
1804 			if (ourfinisacked) {
1805 				/*
1806 				 * If we can't receive any more
1807 				 * data, then closing user can proceed.
1808 				 * Starting the timer is contrary to the
1809 				 * specification, but if we don't get a FIN
1810 				 * we'll hang forever.
1811 				 */
1812 				if (so->so_state & SS_CANTRCVMORE) {
1813 					soisdisconnected(so);
1814 					tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1815 				}
1816 				tp->t_state = TCPS_FIN_WAIT_2;
1817 			}
1818 			break;
1819 
1820 		/*
1821 		 * In CLOSING STATE in addition to the processing for
1822 		 * the ESTABLISHED state if the ACK acknowledges our FIN
1823 		 * then enter the TIME-WAIT state, otherwise ignore
1824 		 * the segment.
1825 		 */
1826 		case TCPS_CLOSING:
1827 			if (ourfinisacked) {
1828 				tp->t_state = TCPS_TIME_WAIT;
1829 				tcp_canceltimers(tp);
1830 				tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1831 				soisdisconnected(so);
1832 			}
1833 			break;
1834 
1835 		/*
1836 		 * In LAST_ACK, we may still be waiting for data to drain
1837 		 * and/or to be acked, as well as for the ack of our FIN.
1838 		 * If our FIN is now acknowledged, delete the TCB,
1839 		 * enter the closed state and return.
1840 		 */
1841 		case TCPS_LAST_ACK:
1842 			if (ourfinisacked) {
1843 				tp = tcp_close(tp);
1844 				goto drop;
1845 			}
1846 			break;
1847 
1848 		/*
1849 		 * In TIME_WAIT state the only thing that should arrive
1850 		 * is a retransmission of the remote FIN.  Acknowledge
1851 		 * it and restart the finack timer.
1852 		 */
1853 		case TCPS_TIME_WAIT:
1854 			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1855 			goto dropafterack;
1856 		}
1857 	}
1858 
1859 step6:
1860 	/*
1861 	 * Update window information.
1862 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1863 	 */
1864 	if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1865 	    (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) ||
1866 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) {
1867 		/* keep track of pure window updates */
1868 		if (tlen == 0 &&
1869 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1870 			tcpstat.tcps_rcvwinupd++;
1871 		tp->snd_wnd = tiwin;
1872 		tp->snd_wl1 = th->th_seq;
1873 		tp->snd_wl2 = th->th_ack;
1874 		if (tp->snd_wnd > tp->max_sndwnd)
1875 			tp->max_sndwnd = tp->snd_wnd;
1876 		needoutput = 1;
1877 	}
1878 
1879 	/*
1880 	 * Process segments with URG.
1881 	 */
1882 	if ((tiflags & TH_URG) && th->th_urp &&
1883 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1884 		/*
1885 		 * This is a kludge, but if we receive and accept
1886 		 * random urgent pointers, we'll crash in
1887 		 * soreceive.  It's hard to imagine someone
1888 		 * actually wanting to send this much urgent data.
1889 		 */
1890 		if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
1891 			th->th_urp = 0;			/* XXX */
1892 			tiflags &= ~TH_URG;		/* XXX */
1893 			goto dodata;			/* XXX */
1894 		}
1895 		/*
1896 		 * If this segment advances the known urgent pointer,
1897 		 * then mark the data stream.  This should not happen
1898 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1899 		 * a FIN has been received from the remote side.
1900 		 * In these states we ignore the URG.
1901 		 *
1902 		 * According to RFC961 (Assigned Protocols),
1903 		 * the urgent pointer points to the last octet
1904 		 * of urgent data.  We continue, however,
1905 		 * to consider it to indicate the first octet
1906 		 * of data past the urgent section as the original
1907 		 * spec states (in one of two places).
1908 		 */
1909 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1910 			tp->rcv_up = th->th_seq + th->th_urp;
1911 			so->so_oobmark = so->so_rcv.sb_cc +
1912 			    (tp->rcv_up - tp->rcv_nxt) - 1;
1913 			if (so->so_oobmark == 0)
1914 				so->so_state |= SS_RCVATMARK;
1915 			sohasoutofband(so);
1916 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1917 		}
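		/*
		 * Worked example (assumed values): with rcv_nxt = 1000,
		 * th_seq = 1000, th_urp = 4 and an empty receive buffer
		 * (sb_cc = 0), rcv_up becomes 1004 and so_oobmark =
		 * 0 + (1004 - 1000) - 1 = 3, i.e. the urgent byte is the
		 * fourth octet the application will read.
		 */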
1918 		/*
1919 		 * Remove out of band data so doesn't get presented to user.
1920 		 * This can happen independent of advancing the URG pointer,
1921 		 * but if two URG's are pending at once, some out-of-band
1922 		 * data may creep in... ick.
1923 		 */
1924 		if (th->th_urp <= (u_int16_t) tlen
1925 #ifdef SO_OOBINLINE
1926 		     && (so->so_options & SO_OOBINLINE) == 0
1927 #endif
1928 		     )
1929 			tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
1930 	} else
1931 		/*
1932 		 * If no out of band data is expected,
1933 		 * pull receive urgent pointer along
1934 		 * with the receive window.
1935 		 */
1936 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1937 			tp->rcv_up = tp->rcv_nxt;
1938 dodata:							/* XXX */
1939 
1940 	/*
1941 	 * Process the segment text, merging it into the TCP sequencing queue,
1942 	 * and arranging for acknowledgment of receipt if necessary.
1943 	 * This process logically involves adjusting tp->rcv_wnd as data
1944 	 * is presented to the user (this happens in tcp_usrreq.c,
1945 	 * case PRU_RCVD).  If a FIN has already been received on this
1946 	 * connection then we just ignore the text.
1947 	 */
1948 	if ((tlen || (tiflags & TH_FIN)) &&
1949 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1950 		if (th->th_seq == tp->rcv_nxt && tp->segq.lh_first == NULL &&
1951 		    tp->t_state == TCPS_ESTABLISHED) {
1952 			if (th->th_flags & TH_PUSH)
1953 				tp->t_flags |= TF_ACKNOW;
1954 			else
1955 				tp->t_flags |= TF_DELACK;
1956 			tp->rcv_nxt += tlen;
1957 			tiflags = th->th_flags & TH_FIN;
1958 			tcpstat.tcps_rcvpack++;
1959 			tcpstat.tcps_rcvbyte += tlen;
1960 			ND6_HINT(tp);
1961 			m_adj(m, hdroptlen);
1962 			sbappend(&so->so_rcv, m);
1963 			sorwakeup(so);
1964 		} else {
1965 			m_adj(m, hdroptlen);
1966 			tiflags = tcp_reass(tp, th, m, &tlen);
1967 			tp->t_flags |= TF_ACKNOW;
1968 		}
1969 #ifdef TCP_SACK
1970 		if (!tp->sack_disable)
1971 			tcp_update_sack_list(tp);
1972 #endif
1973 
1974 		/*
1975 		 * variable len is never referenced again in modern BSD,
1976 		 * so why bother computing it?
1977 		 */
1978 #if 0
1979 		/*
1980 		 * Note the amount of data that peer has sent into
1981 		 * our window, in order to estimate the sender's
1982 		 * buffer size.
1983 		 */
1984 		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1985 #endif /* 0 */
1986 	} else {
1987 		m_freem(m);
1988 		tiflags &= ~TH_FIN;
1989 	}
1990 
1991 	/*
1992 	 * If FIN is received ACK the FIN and let the user know
1993 	 * that the connection is closing.  Ignore a FIN received before
1994 	 * the connection is fully established.
1995 	 */
1996 	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
1997 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1998 			socantrcvmore(so);
1999 			tp->t_flags |= TF_ACKNOW;
2000 			tp->rcv_nxt++;
2001 		}
2002 		switch (tp->t_state) {
2003 
2004 		/*
2005 		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2006 		 */
2007 		case TCPS_ESTABLISHED:
2008 			tp->t_state = TCPS_CLOSE_WAIT;
2009 			break;
2010 
2011 		/*
2012 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2013 		 * enter the CLOSING state.
2014 		 */
2015 		case TCPS_FIN_WAIT_1:
2016 			tp->t_state = TCPS_CLOSING;
2017 			break;
2018 
2019 		/*
2020 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2021 		 * starting the time-wait timer, turning off the other
2022 		 * standard timers.
2023 		 */
2024 		case TCPS_FIN_WAIT_2:
2025 			tp->t_state = TCPS_TIME_WAIT;
2026 			tcp_canceltimers(tp);
2027 			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
2028 			soisdisconnected(so);
2029 			break;
2030 
2031 		/*
2032 		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2033 		 */
2034 		case TCPS_TIME_WAIT:
2035 			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
2036 			break;
2037 		}
2038 	}
2039 	if (so->so_options & SO_DEBUG) {
2040 		switch (tp->pf) {
2041 #ifdef INET6
2042 		case PF_INET6:
2043 			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
2044 			    0, tlen);
2045 			break;
2046 #endif /* INET6 */
2047 		case PF_INET:
2048 			tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
2049 			    0, tlen);
2050 			break;
2051 		}
2052 	}
2053 
2054 	/*
2055 	 * Return any desired output.
2056 	 */
2057 	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2058 		(void) tcp_output(tp);
2059 	}
2060 	return;
2061 
2062 dropafterack:
2063 	/*
2064 	 * Generate an ACK dropping incoming segment if it occupies
2065 	 * sequence space, where the ACK reflects our state.
2066 	 */
2067 	if (tiflags & TH_RST)
2068 		goto drop;
2069 	m_freem(m);
2070 	tp->t_flags |= TF_ACKNOW;
2071 	(void) tcp_output(tp);
2072 	return;
2073 
2074 dropwithreset_ratelim:
2075 	/*
2076 	 * We may want to rate-limit RSTs in certain situations,
2077 	 * particularly if we are sending an RST in response to
2078 	 * an attempt to connect to or otherwise communicate with
2079 	 * a port for which we have no socket.
2080 	 */
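	/*
	 * ppsratecheck() is assumed to return zero once more than
	 * tcp_rst_ppslim responses have been sent within the current
	 * second, in which case the segment is dropped silently.
	 */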
2081 	if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2082 	    tcp_rst_ppslim) == 0) {
2083 		/* XXX stat */
2084 		goto drop;
2085 	}
2086 	/* ...fall into dropwithreset... */
2087 
2088 dropwithreset:
2089 	/*
2090 	 * Generate a RST, dropping incoming segment.
2091 	 * Make ACK acceptable to originator of segment.
2092 	 * Don't bother to respond if destination was broadcast/multicast.
2093 	 */
2094 	if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
2095 		goto drop;
2096 	switch (af) {
2097 #ifdef INET6
2098 	case AF_INET6:
2099 		/* For following calls to tcp_respond */
2100 		if (IN6_IS_ADDR_MULTICAST(&ipv6->ip6_dst))
2101 			goto drop;
2102 		break;
2103 #endif /* INET6 */
2104 	case AF_INET:
2105 		if (IN_MULTICAST(ip->ip_dst.s_addr))
2106 			goto drop;
2107 	}
2108 	if (tiflags & TH_ACK) {
2109 		tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack,
2110 		    TH_RST);
2111 	} else {
2112 		if (tiflags & TH_SYN)
2113 			tlen++;
2114 		tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
2115 		    (tcp_seq)0, TH_RST|TH_ACK);
2116 	}
2117 	/* destroy temporarily created socket */
2118 	if (dropsocket)
2119 		(void) soabort(so);
2120 	return;
2121 
2122 drop:
2123 #ifdef IPSEC
2124 	if (tdbi)
2125 	        free(tdbi, M_TEMP);
2126 #endif
2127 
2128 	/*
2129 	 * Drop space held by incoming segment and return.
2130 	 */
2131 	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
2132 		switch (tp->pf) {
2133 #ifdef INET6
2134 		case PF_INET6:
2135 			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
2136 			    0, tlen);
2137 			break;
2138 #endif /* INET6 */
2139 		case PF_INET:
2140 			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
2141 			    0, tlen);
2142 			break;
2143 		}
2144 	}
2145 
2146 	m_freem(m);
2147 	/* destroy temporarily created socket */
2148 	if (dropsocket)
2149 		(void) soabort(so);
2150 	return;
2151 #ifndef TUBA_INCLUDE
2152 }
2153 
2154 void
2155 tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
2156 	struct tcpcb *tp;
2157 	u_char *cp;
2158 	int cnt;
2159 	struct tcphdr *th;
2160 	int *ts_present;
2161 	u_int32_t *ts_val, *ts_ecr;
2162 {
2163 	u_int16_t mss = 0;
2164 	int opt, optlen;
2165 
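	/*
	 * Options are parsed as (kind, length, value) triples; EOL and
	 * NOP are single bytes.  E.g. (illustrative) a timestamps-only
	 * option block is NOP, NOP, kind 8, length 10, then two 32-bit
	 * timestamps: 12 bytes in all (TCPOLEN_TSTAMP_APPA).
	 */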
2166 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
2167 		opt = cp[0];
2168 		if (opt == TCPOPT_EOL)
2169 			break;
2170 		if (opt == TCPOPT_NOP)
2171 			optlen = 1;
2172 		else {
2173 			if (cnt < 2)
2174 				break;
2175 			optlen = cp[1];
2176 			if (optlen < 2 || optlen > cnt)
2177 				break;
2178 		}
2179 		switch (opt) {
2180 
2181 		default:
2182 			continue;
2183 
2184 		case TCPOPT_MAXSEG:
2185 			if (optlen != TCPOLEN_MAXSEG)
2186 				continue;
2187 			if (!(th->th_flags & TH_SYN))
2188 				continue;
2189 			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
2190 			NTOHS(mss);
2191 			break;
2192 
2193 		case TCPOPT_WINDOW:
2194 			if (optlen != TCPOLEN_WINDOW)
2195 				continue;
2196 			if (!(th->th_flags & TH_SYN))
2197 				continue;
2198 			tp->t_flags |= TF_RCVD_SCALE;
2199 			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
2200 			break;
2201 
2202 		case TCPOPT_TIMESTAMP:
2203 			if (optlen != TCPOLEN_TIMESTAMP)
2204 				continue;
2205 			*ts_present = 1;
2206 			bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val));
2207 			NTOHL(*ts_val);
2208 			bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr));
2209 			NTOHL(*ts_ecr);
2210 
2211 			/*
2212 			 * A timestamp received in a SYN makes
2213 			 * it ok to send timestamp requests and replies.
2214 			 */
2215 			if (th->th_flags & TH_SYN) {
2216 				tp->t_flags |= TF_RCVD_TSTMP;
2217 				tp->ts_recent = *ts_val;
2218 				tp->ts_recent_age = tcp_now;
2219 			}
2220 			break;
2221 
2222 #ifdef TCP_SACK
2223 		case TCPOPT_SACK_PERMITTED:
2224 			if (tp->sack_disable || optlen != TCPOLEN_SACK_PERMITTED)
2225 				continue;
2226 			if (th->th_flags & TH_SYN)
2227 				/* MUST only be set on SYN */
2228 				tp->t_flags |= TF_SACK_PERMIT;
2229 			break;
2230 		case TCPOPT_SACK:
2231 			if (tcp_sack_option(tp, th, cp, optlen))
2232 				continue;
2233 			break;
2234 #endif
2235 		}
2236 	}
2237 	/* Update t_maxopd and t_maxseg after all options are processed */
2238 	if (th->th_flags & TH_SYN) {
2239 		(void) tcp_mss(tp, mss);	/* sets t_maxseg */
2240 
2241 		if (mss)
2242 			tcp_mss_update(tp);
2243 	}
2244 }
2245 
2246 #if defined(TCP_SACK)
2247 u_long
2248 tcp_seq_subtract(a, b)
2249 	u_long a, b;
2250 {
2251 	return ((long)(a - b));
2252 }
2253 #endif
2254 
2255 
2256 #ifdef TCP_SACK
2257 /*
2258  * This function is called upon receipt of new valid data (while not in header
2259  * prediction mode), and it updates the ordered list of sacks.
2260  */
2261 void
2262 tcp_update_sack_list(tp)
2263 	struct tcpcb *tp;
2264 {
2265 	/*
2266 	 * First reported block MUST be the most recent one.  Subsequent
2267 	 * blocks SHOULD be in the order in which they arrived at the
2268 	 * receiver.  These two conditions make the implementation fully
2269 	 * compliant with RFC 2018.
2270 	 */
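	/*
	 * Example (assumed sequence numbers): if blocks [10,20) and
	 * [30,40) are already reported and the segment just received
	 * fills [20,30), the merge loop below coalesces all three into
	 * a single first block [10,40).
	 */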
2271 	int i, j = 0, count = 0, lastpos = -1;
2272 	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
2273 
2274 	/* First clean up current list of sacks */
2275 	for (i = 0; i < tp->rcv_numsacks; i++) {
2276 		sack = tp->sackblks[i];
2277 		if (sack.start == 0 && sack.end == 0) {
2278 			count++; /* count = number of blocks to be discarded */
2279 			continue;
2280 		}
2281 		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
2282 			tp->sackblks[i].start = tp->sackblks[i].end = 0;
2283 			count++;
2284 		} else {
2285 			temp[j].start = tp->sackblks[i].start;
2286 			temp[j++].end = tp->sackblks[i].end;
2287 		}
2288 	}
2289 	tp->rcv_numsacks -= count;
2290 	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
2291 		tcp_clean_sackreport(tp);
2292 		if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
2293 			/* ==> need first sack block */
2294 			tp->sackblks[0].start = tp->rcv_laststart;
2295 			tp->sackblks[0].end = tp->rcv_lastend;
2296 			tp->rcv_numsacks = 1;
2297 		}
2298 		return;
2299 	}
2300 	/* Otherwise, sack blocks are already present. */
2301 	for (i = 0; i < tp->rcv_numsacks; i++)
2302 		tp->sackblks[i] = temp[i]; /* first copy back sack list */
2303 	if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
2304 		return;     /* sack list remains unchanged */
2305 	/*
2306 	 * From here, segment just received should be (part of) the 1st sack.
2307 	 * Go through list, possibly coalescing sack block entries.
2308 	 */
2309 	firstsack.start = tp->rcv_laststart;
2310 	firstsack.end = tp->rcv_lastend;
2311 	for (i = 0; i < tp->rcv_numsacks; i++) {
2312 		sack = tp->sackblks[i];
2313 		if (SEQ_LT(sack.end, firstsack.start) ||
2314 		    SEQ_GT(sack.start, firstsack.end))
2315 			continue; /* no overlap */
2316 		if (sack.start == firstsack.start && sack.end == firstsack.end){
2317 			/*
2318 			 * identical block; delete it here since we will
2319 			 * move it to the front of the list.
2320 			 */
2321 			tp->sackblks[i].start = tp->sackblks[i].end = 0;
2322 			lastpos = i;    /* last posn with a zero entry */
2323 			continue;
2324 		}
2325 		if (SEQ_LEQ(sack.start, firstsack.start))
2326 			firstsack.start = sack.start; /* merge blocks */
2327 		if (SEQ_GEQ(sack.end, firstsack.end))
2328 			firstsack.end = sack.end;     /* merge blocks */
2329 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2330 		lastpos = i;    /* last posn with a zero entry */
2331 	}
2332 	if (lastpos != -1) {    /* at least one merge */
2333 		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
2334 			sack = tp->sackblks[i];
2335 			if (sack.start == 0 && sack.end == 0)
2336 				continue;
2337 			temp[j++] = sack;
2338 		}
2339 		tp->rcv_numsacks = j; /* including first blk (added later) */
2340 		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
2341 			tp->sackblks[i] = temp[i];
2342 	} else {        /* no merges -- shift sacks by 1 */
2343 		if (tp->rcv_numsacks < MAX_SACK_BLKS)
2344 			tp->rcv_numsacks++;
2345 		for (i = tp->rcv_numsacks-1; i > 0; i--)
2346 			tp->sackblks[i] = tp->sackblks[i-1];
2347 	}
2348 	tp->sackblks[0] = firstsack;
2349 	return;
2350 }
2351 
2352 /*
2353  * Process the TCP SACK option.  Returns 1 if tcp_dooptions() should skip
2354  * the option (SACK disabled or option malformed), 0 if the option was fine.
2355  * tp->snd_holes is an ordered list of holes (oldest to newest in seq space).
2356  */
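/*
 * Example (assumed numbers): with snd_una = th_ack = 100, a first SACK
 * block [200,300) creates the hole [100,200); a later block [400,500)
 * then appends a second hole [300,400) for the still-missing data.
 */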
2357 int
2358 tcp_sack_option(tp, th, cp, optlen)
2359 	struct tcpcb *tp;
2360 	struct tcphdr *th;
2361 	u_char *cp;
2362 	int    optlen;
2363 {
2364 	int tmp_olen;
2365 	u_char *tmp_cp;
2366 	struct sackhole *cur, *p, *temp;
2367 
2368 	if (tp->sack_disable)
2369 		return 1;
2370 
2371 	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2372 	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2373 		return 1;
2374 	tmp_cp = cp + 2;
2375 	tmp_olen = optlen - 2;
2376 	if (tp->snd_numholes < 0)
2377 		tp->snd_numholes = 0;
2378 	if (tp->t_maxseg == 0)
2379 		panic("tcp_sack_option"); /* Should never happen */
2380 	while (tmp_olen > 0) {
2381 		struct sackblk sack;
2382 
2383 		bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
2384 		NTOHL(sack.start);
2385 		bcopy((char *) tmp_cp + sizeof(tcp_seq),
2386 		    (char *) &(sack.end), sizeof(tcp_seq));
2387 		NTOHL(sack.end);
2388 		tmp_olen -= TCPOLEN_SACK;
2389 		tmp_cp += TCPOLEN_SACK;
2390 		if (SEQ_LEQ(sack.end, sack.start))
2391 			continue; /* bad SACK fields */
2392 		if (SEQ_LEQ(sack.end, tp->snd_una))
2393 			continue; /* old block */
2394 #if defined(TCP_SACK) && defined(TCP_FACK)
2395 		/* Updates snd_fack.  */
2396 		if (SEQ_GEQ(sack.end, tp->snd_fack))
2397 			tp->snd_fack = sack.end;
2398 #endif /* TCP_FACK */
2399 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
2400 			if (SEQ_LT(sack.start, th->th_ack))
2401 				continue;
2402 		} else {
2403 			if (SEQ_LT(sack.start, tp->snd_una))
2404 				continue;
2405 		}
2406 		if (SEQ_GT(sack.end, tp->snd_max))
2407 			continue;
2408 		if (tp->snd_holes == 0) { /* first hole */
2409 			tp->snd_holes = (struct sackhole *)
2410 			    malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT);
2411 			if (tp->snd_holes == NULL) {
2412 				/* ENOBUFS, so ignore SACKed block for now */
2413 				continue;
2414 			}
2415 			cur = tp->snd_holes;
2416 			cur->start = th->th_ack;
2417 			cur->end = sack.start;
2418 			cur->rxmit = cur->start;
2419 			cur->next = 0;
2420 			tp->snd_numholes = 1;
2421 			tp->rcv_lastsack = sack.end;
2422 			/*
2423 			 * dups is at least one.  If more data has been
2424 			 * SACKed, it can be greater than one.
2425 			 */
2426 			cur->dups = min(tcprexmtthresh,
2427 			    ((sack.end - cur->end)/tp->t_maxseg));
2428 			if (cur->dups < 1)
2429 				cur->dups = 1;
2430 			continue; /* with next sack block */
2431 		}
2432 		/* Go thru list of holes:  p = previous,  cur = current */
2433 		p = cur = tp->snd_holes;
2434 		while (cur) {
2435 			if (SEQ_LEQ(sack.end, cur->start))
2436 				/* SACKs data before the current hole */
2437 				break; /* no use going through more holes */
2438 			if (SEQ_GEQ(sack.start, cur->end)) {
2439 				/* SACKs data beyond the current hole */
2440 				cur->dups++;
2441 				if (((sack.end - cur->end) / tp->t_maxseg) >=
2442 					tcprexmtthresh)
2443 					cur->dups = tcprexmtthresh;
2444 				p = cur;
2445 				cur = cur->next;
2446 				continue;
2447 			}
2448 			if (SEQ_LEQ(sack.start, cur->start)) {
2449 				/* Data acks at least the beginning of hole */
2450 #if defined(TCP_SACK) && defined(TCP_FACK)
2451 				if (SEQ_GT(sack.end, cur->rxmit))
2452 					tp->retran_data -=
2453 				    	    tcp_seq_subtract(cur->rxmit,
2454 					    cur->start);
2455 				else
2456 					tp->retran_data -=
2457 					    tcp_seq_subtract(sack.end,
2458 					    cur->start);
2459 #endif /* TCP_FACK */
2460 				if (SEQ_GEQ(sack.end, cur->end)) {
2461 					/* Acks entire hole, so delete hole */
2462 					if (p != cur) {
2463 						p->next = cur->next;
2464 						free(cur, M_PCB);
2465 						cur = p->next;
2466 					} else {
2467 						cur = cur->next;
2468 						free(p, M_PCB);
2469 						p = cur;
2470 						tp->snd_holes = p;
2471 					}
2472 					tp->snd_numholes--;
2473 					continue;
2474 				}
2475 				/* otherwise, move start of hole forward */
2476 				cur->start = sack.end;
2477 				cur->rxmit = max(cur->rxmit, cur->start);
2478 				p = cur;
2479 				cur = cur->next;
2480 				continue;
2481 			}
2482 			/* move end of hole backward */
2483 			if (SEQ_GEQ(sack.end, cur->end)) {
2484 #if defined(TCP_SACK) && defined(TCP_FACK)
2485 				if (SEQ_GT(cur->rxmit, sack.start))
2486 					tp->retran_data -=
2487 					    tcp_seq_subtract(cur->rxmit,
2488 					    sack.start);
2489 #endif /* TCP_FACK */
2490 				cur->end = sack.start;
2491 				cur->rxmit = min(cur->rxmit, cur->end);
2492 				cur->dups++;
2493 				if (((sack.end - cur->end) / tp->t_maxseg) >=
2494 					tcprexmtthresh)
2495 					cur->dups = tcprexmtthresh;
2496 				p = cur;
2497 				cur = cur->next;
2498 				continue;
2499 			}
2500 			if (SEQ_LT(cur->start, sack.start) &&
2501 			    SEQ_GT(cur->end, sack.end)) {
2502 				/*
2503 				 * ACKs some data in middle of a hole; need to
2504 				 * split current hole
2505 				 */
2506 				temp = (struct sackhole *)malloc(sizeof(*temp),
2507 				    M_PCB,M_NOWAIT);
2508 				if (temp == NULL)
2509 					continue; /* ENOBUFS */
2510 #if defined(TCP_SACK) && defined(TCP_FACK)
2511 				if (SEQ_GT(cur->rxmit, sack.end))
2512 					tp->retran_data -=
2513 					    tcp_seq_subtract(sack.end,
2514 					    sack.start);
2515 				else if (SEQ_GT(cur->rxmit, sack.start))
2516 					tp->retran_data -=
2517 					    tcp_seq_subtract(cur->rxmit,
2518 					    sack.start);
2519 #endif /* TCP_FACK */
2520 				temp->next = cur->next;
2521 				temp->start = sack.end;
2522 				temp->end = cur->end;
2523 				temp->dups = cur->dups;
2524 				temp->rxmit = max(cur->rxmit, temp->start);
2525 				cur->end = sack.start;
2526 				cur->rxmit = min(cur->rxmit, cur->end);
2527 				cur->dups++;
2528 				if (((sack.end - cur->end) / tp->t_maxseg) >=
2529 					tcprexmtthresh)
2530 					cur->dups = tcprexmtthresh;
2531 				cur->next = temp;
2532 				p = temp;
2533 				cur = p->next;
2534 				tp->snd_numholes++;
2535 			}
2536 		}
2537 		/* At this point, p points to the last hole on the list */
2538 		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2539 			/*
2540 			 * Need to append new hole at end.
2541 			 * Last hole is p (and it's not NULL).
2542 			 */
2543 			temp = (struct sackhole *) malloc(sizeof(*temp),
2544 			    M_PCB, M_NOWAIT);
2545 			if (temp == NULL)
2546 				continue; /* ENOBUFS */
2547 			temp->start = tp->rcv_lastsack;
2548 			temp->end = sack.start;
2549 			temp->dups = min(tcprexmtthresh,
2550 			    ((sack.end - sack.start)/tp->t_maxseg));
2551 			if (temp->dups < 1)
2552 				temp->dups = 1;
2553 			temp->rxmit = temp->start;
2554 			temp->next = 0;
2555 			p->next = temp;
2556 			tp->rcv_lastsack = sack.end;
2557 			tp->snd_numholes++;
2558 		}
2559 	}
2560 #if defined(TCP_SACK) && defined(TCP_FACK)
2561 	/*
2562 	 * Update retran_data and snd_awnd.  Go through the list of
2563 	 * holes.   Increment retran_data by (hole->rxmit - hole->start).
2564 	 */
2565 	tp->retran_data = 0;
2566 	cur = tp->snd_holes;
2567 	while (cur) {
2568 		tp->retran_data += cur->rxmit - cur->start;
2569 		cur = cur->next;
2570 	}
2571 	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2572 	    tp->retran_data;
2573 #endif /* TCP_FACK */
2574 
2575 	return 0;
2576 }
2577 
2578 /*
2579  * Delete stale (i.e., cumulatively ack'd) holes.  A hole is deleted only if
2580  * it is completely acked; otherwise, tcp_sack_option(), called from
2581  * tcp_dooptions(), will fix up the hole.
2582  */
2583 void
2584 tcp_del_sackholes(tp, th)
2585 	struct tcpcb *tp;
2586 	struct tcphdr *th;
2587 {
2588 	if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
2589 		/* max because this could be an older ack just arrived */
2590 		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2591 			th->th_ack : tp->snd_una;
2592 		struct sackhole *cur = tp->snd_holes;
2593 		struct sackhole *prev = cur;
2594 		while (cur)
2595 			if (SEQ_LEQ(cur->end, lastack)) {
2596 				cur = cur->next;
2597 				free(prev, M_PCB);
2598 				prev = cur;
2599 				tp->snd_numholes--;
2600 			} else if (SEQ_LT(cur->start, lastack)) {
2601 				cur->start = lastack;
2602 				if (SEQ_LT(cur->rxmit, cur->start))
2603 					cur->rxmit = cur->start;
2604 				break;
2605 			} else
2606 				break;
2607 		tp->snd_holes = cur;
2608 	}
2609 }
2610 
2611 /*
2612  * Delete all receiver-side SACK information.
2613  */
2614 void
2615 tcp_clean_sackreport(tp)
2616 	struct tcpcb *tp;
2617 {
2618 	int i;
2619 
2620 	tp->rcv_numsacks = 0;
2621 	for (i = 0; i < MAX_SACK_BLKS; i++)
2622 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
2623 
2624 }
2625 
2626 /*
2627  * Checks for partial ack.  If partial ack arrives, turn off retransmission
2628  * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
2629  * If the ack advances at least to tp->snd_last, return 0.
2630  */
2631 int
2632 tcp_sack_partialack(tp, th)
2633 	struct tcpcb *tp;
2634 	struct tcphdr *th;
2635 {
2636 	if (SEQ_LT(th->th_ack, tp->snd_last)) {
2637 		/* Turn off retx. timer (will start again next segment) */
2638 		tp->t_timer[TCPT_REXMT] = 0;
2639 		tp->t_rtt = 0;
2640 #ifndef TCP_FACK
2641 		/*
2642 		 * Partial window deflation.  This statement relies on the
2643 		 * fact that tp->snd_una has not been updated yet.  In FACK
2644 		 * hold snd_cwnd constant during fast recovery.
2645 		 */
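		/*
		 * E.g. (illustrative): if snd_cwnd = 10000 and the partial
		 * ack covers 3000 bytes, the window becomes
		 * 10000 - 3000 + t_maxseg, keeping the amount of data in
		 * flight roughly constant during recovery.
		 */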
2646 		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2647 			tp->snd_cwnd -= th->th_ack - tp->snd_una;
2648 			tp->snd_cwnd += tp->t_maxseg;
2649 		} else
2650 			tp->snd_cwnd = tp->t_maxseg;
2651 #endif
2652 		return 1;
2653 	}
2654 	return 0;
2655 }
2656 #endif /* TCP_SACK */
2657 
2658 /*
2659  * Pull out of band byte out of a segment so
2660  * it doesn't appear in the user's data queue.
2661  * It is still reflected in the segment length for
2662  * sequencing purposes.
2663  */
2664 void
2665 tcp_pulloutofband(so, urgent, m, off)
2666 	struct socket *so;
2667 	u_int urgent;
2668 	register struct mbuf *m;
2669 	int off;
2670 {
2671 	int cnt = off + urgent - 1;
2672 
2673 	while (cnt >= 0) {
2674 		if (m->m_len > cnt) {
2675 			char *cp = mtod(m, caddr_t) + cnt;
2676 			struct tcpcb *tp = sototcpcb(so);
2677 
2678 			tp->t_iobc = *cp;
2679 			tp->t_oobflags |= TCPOOB_HAVEDATA;
2680 			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2681 			m->m_len--;
2682 			return;
2683 		}
2684 		cnt -= m->m_len;
2685 		m = m->m_next;
2686 		if (m == 0)
2687 			break;
2688 	}
2689 	panic("tcp_pulloutofband");
2690 }
2691 
2692 /*
2693  * Collect new round-trip time estimate
2694  * and update averages and current timeout.
2695  */
2696 void
2697 tcp_xmit_timer(tp, rtt)
2698 	register struct tcpcb *tp;
2699 	short rtt;
2700 {
2701 	register short delta;
2702 	short rttmin;
2703 
2704 	tcpstat.tcps_rttupdated++;
2705 	--rtt;
2706 	if (tp->t_srtt != 0) {
2707 		/*
2708 		 * srtt is stored as fixed point with 3 bits after the
2709 		 * binary point (i.e., scaled by 8).  The following magic
2710 		 * is equivalent to the smoothing algorithm in rfc793 with
2711 		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2712 		 * point).  Adjust rtt to origin 0.
2713 		 */
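		/*
		 * Worked example of the alpha = .875 smoothing (unscaled,
		 * assumed values): a smoothed rtt of 8 ticks and a new
		 * measurement of 16 ticks give 8*7/8 + 16/8 = 9 ticks,
		 * so a single outlier moves the estimate only slightly.
		 */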
2714 		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
2715 		if ((tp->t_srtt += delta) <= 0)
2716 			tp->t_srtt = 1;
2717 		/*
2718 		 * We accumulate a smoothed rtt variance (actually, a
2719 		 * smoothed mean difference), then set the retransmit
2720 		 * timer to smoothed rtt + 4 times the smoothed variance.
2721 		 * rttvar is stored as fixed point with 2 bits after the
2722 		 * binary point (scaled by 4).  The following is
2723 		 * equivalent to rfc793 smoothing with an alpha of .75
2724 		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
2725 		 * rfc793's wired-in beta.
2726 		 */
2727 		if (delta < 0)
2728 			delta = -delta;
2729 		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
2730 		if ((tp->t_rttvar += delta) <= 0)
2731 			tp->t_rttvar = 1;
2732 	} else {
2733 		/*
2734 		 * No rtt measurement yet - use the unsmoothed rtt.
2735 		 * Set the variance to half the rtt (so our first
2736 		 * retransmit happens at 3*rtt).
2737 		 */
2738 		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
2739 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
2740 	}
2741 	tp->t_rtt = 0;
2742 	tp->t_rxtshift = 0;
2743 
2744 	/*
2745 	 * the retransmit should happen at rtt + 4 * rttvar.
2746 	 * Because of the way we do the smoothing, srtt and rttvar
2747 	 * will each average +1/2 tick of bias.  When we compute
2748 	 * the retransmit timer, we want 1/2 tick of rounding and
2749 	 * 1 extra tick because of +-1/2 tick uncertainty in the
2750 	 * firing of the timer.  The bias will give us exactly the
2751 	 * 1.5 tick we need.  But, because the bias is
2752 	 * statistical, we have to test that we don't drop below
2753 	 * the minimum feasible timer (which is 2 ticks).
2754 	 */
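	/*
	 * E.g. (illustrative): srtt = 6 ticks and rttvar = 2 ticks yield
	 * a retransmit timeout of 6 + 4*2 = 14 ticks, clamped between
	 * rttmin and TCPTV_REXMTMAX below.
	 */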
2755 	if (tp->t_rttmin > rtt + 2)
2756 		rttmin = tp->t_rttmin;
2757 	else
2758 		rttmin = rtt + 2;
2759 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
2760 
2761 	/*
2762 	 * We received an ack for a packet that wasn't retransmitted;
2763 	 * it is probably safe to discard any error indications we've
2764 	 * received recently.  This isn't quite right, but close enough
2765 	 * for now (a route might have failed after we sent a segment,
2766 	 * and the return path might not be symmetrical).
2767 	 */
2768 	tp->t_softerror = 0;
2769 }
2770 
2771 /*
2772  * Determine a reasonable value for maxseg size.
2773  * If the route is known, check route for mtu.
2774  * If none, use an mss that can be handled on the outgoing
2775  * interface without forcing IP to fragment; if bigger than
2776  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2777  * to utilize large mbufs.  If no route is found, route has no mtu,
2778  * or the destination isn't local, use a default, hopefully conservative
2779  * size (usually 512 or the default IP max size, but no more than the mtu
2780  * of the interface), as we can't discover anything about intervening
2781  * gateways or networks.  We also initialize the congestion/slow start
2782  * window to be a single segment if the destination isn't local.
2783  * While looking at the routing entry, we also initialize other path-dependent
2784  * parameters from pre-set or cached values in the routing entry.
2785  *
2786  * Also take into account the space needed for options that we
2787  * send regularly.  Make maxseg shorter by that amount to assure
2788  * that we can send maxseg amount of data even when the options
2789  * are present.  Store the upper limit of the length of options plus
2790  * data in maxopd.
2791  *
2792  * NOTE: offer == -1 indicates that the maxseg size changed due to
2793  * Path MTU discovery.
2794  */
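/*
 * Example (assumed values): a route with rmx_mtu = 1500 gives
 * mss = 1500 - 20 (struct ip) - 20 (struct tcphdr) = 1460; if both
 * sides negotiated timestamps, TCPOLEN_TSTAMP_APPA (12) bytes are
 * subtracted further below, leaving 1448 data bytes per segment.
 */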
2795 int
2796 tcp_mss(tp, offer)
2797 	register struct tcpcb *tp;
2798 	int offer;
2799 {
2800 	struct rtentry *rt;
2801 	struct ifnet *ifp;
2802 	int mss, mssopt;
2803 	int iphlen;
2804 #ifdef INET6
2805 	int is_ipv6 = 0;
2806 #endif
2807 	struct inpcb *inp;
2808 
2809 	inp = tp->t_inpcb;
2810 
2811 	mssopt = mss = tcp_mssdflt;
2812 
2813 	rt = in_pcbrtentry(inp);
2814 
2815 	if (rt == NULL)
2816 		goto out;
2817 
2818 	ifp = rt->rt_ifp;
2819 
2820 	switch (tp->pf) {
2821 #ifdef INET6
2822 	case AF_INET6:
2823 		iphlen = sizeof(struct ip6_hdr);
2824 		is_ipv6 = 1;
2825 		break;
2826 #endif
2827 	case AF_INET:
2828 		iphlen = sizeof(struct ip);
2829 		break;
2830 	default:
2831 		/* the family does not support path MTU discovery */
2832 		goto out;
2833 	}
2834 
2835 #ifdef RTV_MTU
2836 	/*
2837 	 * if there's an mtu associated with the route and we support
2838 	 * path MTU discovery for the underlying protocol family, use it.
2839 	 */
2840 	if (rt->rt_rmx.rmx_mtu) {
2841 		/*
2842 		 * One may wish to lower MSS to take into account options,
2843 		 * especially security-related options.
2844 		 */
2845 		mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
2846 	} else
2847 #endif /* RTV_MTU */
2848 	if (!ifp)
2849 		/*
2850 		 * ifp may be null and rmx_mtu may be zero in certain
2851 		 * v6 cases (e.g., if ND wasn't able to resolve the
2852 		 * destination host.
2853 		 * destination host).
2854 		goto out;
2855 	else if (ip_mtudisc || ifp->if_flags & IFF_LOOPBACK)
2856 		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
2857 #ifdef INET6
2858 	else if (is_ipv6) {
2859 		if (IN6_IS_ADDR_V4MAPPED(&inp->inp_faddr6)) {
2860 			/* mapped addr case */
2861 			struct in_addr d;
2862 			bcopy(&inp->inp_faddr6.s6_addr32[3], &d, sizeof(d));
2863 			if (in_localaddr(d))
2864 				mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
2865 		} else {
2866 			if (in6_localaddr(&inp->inp_faddr6))
2867 				mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
2868 		}
2869 	}
2870 #endif /* INET6 */
2871 	else if (inp && in_localaddr(inp->inp_faddr))
2872 		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
2873 
2874 	/* Calculate the value that we offer in TCPOPT_MAXSEG */
2875 	if (offer != -1) {
2876 		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
2877 		mssopt = max(tcp_mssdflt, mssopt);
2878 	}
2879 
2880  out:
2881 	/*
2882 	 * The current mss, t_maxseg, is initialized to the default value.
2883 	 * If we compute a smaller value, reduce the current mss.
2884 	 * If we compute a larger value, return it for use in sending
2885 	 * a max seg size option, but don't store it for use
2886 	 * unless we received an offer at least that large from peer.
2887 	 * However, do not accept offers under 64 bytes.
2888 	 */
2889 	if (offer > 0)
2890 		tp->t_peermss = offer;
2891 	if (tp->t_peermss)
2892 		mss = min(mss, tp->t_peermss);
2893 	mss = max(mss, 64);		/* sanity - at least max opt. space */
2894 
2895 	/*
2896 	 * maxopd stores the maximum length of data AND options
2897 	 * in a segment; maxseg is the amount of data in a normal
2898 	 * segment.  We need to store this value (maxopd) apart
2899 	 * from maxseg, because now every segment carries options
2900 	 * and thus we normally have somewhat less data in segments.
2901 	 */
2902 	tp->t_maxopd = mss;
2903 
2904  	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2905 	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2906 		mss -= TCPOLEN_TSTAMP_APPA;
2907 
2908 	if (offer == -1) {
2909 		/* mss changed due to Path MTU discovery */
2910 		if (mss < tp->t_maxseg) {
2911 			/*
2912 			 * Follow suggestion in RFC 2414 to reduce the
2913 			 * congestion window by the ratio of the old
2914 			 * segment size to the new segment size.
2915 			 */
2916 			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
2917 					     mss, mss);
2918 		}
2919 	} else
2920 		tp->snd_cwnd = mss;
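	/*
	 * E.g. (illustrative): if path MTU discovery shrinks the segment
	 * size from 1460 to 1220 while snd_cwnd = 8*1460 = 11680, the
	 * window above is rescaled to (11680/1460)*1220 = 9760 bytes,
	 * keeping the number of segments in flight constant.
	 */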
2921 
2922 	tp->t_maxseg = mss;
2923 
2924 	return (offer != -1 ? mssopt : mss);
2925 }
2926 
2927 /*
2928  * Set connection variables based on the effective MSS.
2929  * We are passed the TCPCB for the actual connection.  If we
2930  * are the server, we are called by the compressed state engine
2931  * when the 3-way handshake is complete.  If we are the client,
2932  * we are called when we receive the SYN,ACK from the server.
2933  *
2934  * NOTE: The t_maxseg value must be initialized in the TCPCB
2935  * before this routine is called!
2936  */
2937 void
2938 tcp_mss_update(tp)
2939 	struct tcpcb *tp;
2940 {
2941 	int mss, rtt;
2942 	u_long bufsize;
2943 	struct rtentry *rt;
2944 	struct socket *so;
2945 
2946 	so = tp->t_inpcb->inp_socket;
2947 	mss = tp->t_maxseg;
2948 
2949 	rt = in_pcbrtentry(tp->t_inpcb);
2950 
2951 	if (rt == NULL)
2952 		return;
2953 
2954 #ifdef RTV_MTU	/* if route characteristics exist ... */
2955 	/*
2956 	 * While we're here, check if there's an initial rtt
2957 	 * or rttvar.  Convert from the route-table units
2958 	 * to scaled multiples of the slow timeout timer.
2959 	 */
2960 	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2961 		/*
2962 		 * XXX the lock bit for RTT indicates that the value
2963 		 * is also a minimum value; this is subject to time.
2964 		 */
2965 		if (rt->rt_rmx.rmx_locks & RTV_RTT)
2966 			TCPT_RANGESET(tp->t_rttmin,
2967 			    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
2968 			    TCPTV_MIN, TCPTV_REXMTMAX);
2969 		tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
2970 		if (rt->rt_rmx.rmx_rttvar)
2971 			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2972 			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
2973 		else
2974 			/* default variation is +- 1 rtt */
2975 			tp->t_rttvar =
2976 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2977 		TCPT_RANGESET((long) tp->t_rxtcur,
2978 		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2979 		    tp->t_rttmin, TCPTV_REXMTMAX);
2980 	}
2981 #endif
2982 
2983 	/*
2984 	 * If there's a pipesize, change the socket buffer
2985 	 * to that size.  Make the socket buffers an integral
2986 	 * number of mss units; if the mss is larger than
2987 	 * the socket buffer, decrease the mss.
2988 	 */
2989 #ifdef RTV_SPIPE
2990 	if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2991 #endif
2992 		bufsize = so->so_snd.sb_hiwat;
2993 	if (bufsize < mss) {
2994 		mss = bufsize;
2995 		/* Update t_maxseg and t_maxopd */
2996 		tcp_mss(tp, mss);
2997 	} else {
2998 		bufsize = roundup(bufsize, mss);
2999 		if (bufsize > sb_max)
3000 			bufsize = sb_max;
3001 		(void)sbreserve(&so->so_snd, bufsize);
3002 	}
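	/*
	 * E.g. (illustrative): a 16384-byte sendpipe with mss = 1460
	 * rounds up to 12 * 1460 = 17520 bytes, so the buffer always
	 * holds a whole number of segments.
	 */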
3003 
3004 #ifdef RTV_RPIPE
3005 	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
3006 #endif
3007 		bufsize = so->so_rcv.sb_hiwat;
3008 	if (bufsize > mss) {
3009 		bufsize = roundup(bufsize, mss);
3010 		if (bufsize > sb_max)
3011 			bufsize = sb_max;
3012 		(void)sbreserve(&so->so_rcv, bufsize);
3013 #ifdef RTV_RPIPE
3014 		if (rt->rt_rmx.rmx_recvpipe > 0)
3015 			tcp_rscale(tp, so->so_rcv.sb_hiwat);
3016 #endif
3017 	}
3018 
3019 #ifdef RTV_SSTHRESH
3020 	if (rt->rt_rmx.rmx_ssthresh) {
3021 		/*
3022 		 * There's some sort of gateway or interface
3023 		 * buffer limit on the path.  Use this to set
3024 		 * the slow start threshold, but set the
3025 		 * threshold to no less than 2*mss.
3026 		 */
3027 		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
3028 	}
3029 #endif /* RTV_SSTHRESH */
3030 }
3031 #endif /* TUBA_INCLUDE */
3032 
3033 #if defined (TCP_SACK)
3034 /*
3035  * Checks for partial ack.  If partial ack arrives, force the retransmission
3036  * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3037  * 1.  By setting snd_nxt to th_ack, this forces the retransmission timer to
3038  * be started again.  If the ack advances at least to tp->snd_last, return 0.
3039  */
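/*
 * Example (assumed values): with snd_una = 1000, snd_last = 9000 and a
 * partial ack at 3000, snd_nxt is pulled back to 3000 and one segment is
 * retransmitted with a temporary cwnd of t_maxseg + 2000; the window is
 * then deflated by 2000 - t_maxseg so the data in flight stays roughly
 * constant.
 */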
3040 int
3041 tcp_newreno(tp, th)
3042 	struct tcpcb *tp;
3043 	struct tcphdr *th;
3044 {
3045 	if (SEQ_LT(th->th_ack, tp->snd_last)) {
3046 		/*
3047 		 * snd_una has not been updated and the socket send buffer
3048 		 * not yet drained of the acked data, so we have to leave
3049 		 * snd_una as it was to get the correct data offset in
3050 		 * tcp_output().
3051 		 */
3052 		tcp_seq onxt = tp->snd_nxt;
3053 		u_long  ocwnd = tp->snd_cwnd;
3054 		tp->t_timer[TCPT_REXMT] = 0;
3055 		tp->t_rtt = 0;
3056 		tp->snd_nxt = th->th_ack;
3057 		/*
3058 		 * Set snd_cwnd to one segment beyond acknowledged offset
3059 		 * (tp->snd_una not yet updated when this function is called)
3060 		 */
3061 		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
3062 		(void) tcp_output(tp);
3063 		tp->snd_cwnd = ocwnd;
3064 		if (SEQ_GT(onxt, tp->snd_nxt))
3065 			tp->snd_nxt = onxt;
3066 		/*
3067 		 * Partial window deflation.  Relies on fact that tp->snd_una
3068 		 * not updated yet.
3069 		 */
3070 		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
3071 		return 1;
3072 	}
3073 	return 0;
3074 }
3075 #endif /* TCP_SACK */
3076