1 /* $OpenBSD: tcp_input.c,v 1.81 2000/12/13 09:47:08 provos Exp $ */ 2 /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94 37 */ 38 39 /* 40 %%% portions-copyright-nrl-95 41 Portions of this software are Copyright 1995-1998 by Randall Atkinson, 42 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights 43 Reserved. All rights under this copyright have been assigned to the US 44 Naval Research Laboratory (NRL). The NRL Copyright Notice and License 45 Agreement Version 1.1 (January 17, 1995) applies to these portions of the 46 software. 47 You should have received a copy of the license with this software. If you 48 didn't get a copy, you may request one from <license@ipv6.nrl.navy.mil>. 
 */

#ifndef TUBA_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/domain.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>
#include <dev/rndvar.h>
#include <machine/stdarg.h>
#include <sys/md5k.h>

#ifdef IPSEC
#include <netinet/ip_ipsp.h>
#endif /* IPSEC */

/*
 * Sentinel value stored in m_pkthdr.tdbi to mean "no tdb_ident attached";
 * tcp_input() compares against this before freeing the pointer.
 */
#define PI_MAGIC	0xdeadbeef	/* XXX the horror! */

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/tcpipv6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>

/* Saved packet headers for SO_DEBUG tracing (see tcp_input()). */
struct	tcpiphdr tcp_saveti;
struct	tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)	(((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)	(M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

/*
 * NOTE(review): presumably the duplicate-ACK threshold for fast
 * retransmit (users are outside this chunk) -- confirm against the
 * ACK-processing code.
 */
int	tcprexmtthresh = 3;
/*
 * NOTE(review): tcp_saveti is also tentatively defined above when INET6
 * is on; C merges the two tentative definitions into a single object,
 * but one of the declarations is redundant.
 */
struct	tcpiphdr tcp_saveti;
int	tcptv_keep_init = TCPTV_KEEP_INIT;	/* keepalive timer for embryonic connections */

extern u_long sb_max;

/* Rate limit state for outgoing RSTs (pps = packets per second). */
int	tcp_rst_ppslim = 100;		/* 100pps */
int	tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

#endif /* TUBA_INCLUDE */
/* PAWS: a ts_recent older than 24 days is considered invalid (RFC 1323). */
#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps (signed subtract handles wrap) */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/*
 * Neighbor Discovery, Neighbor
 * Unreachability Detection Upper layer hint.
 */
#ifdef INET6
/*
 * Confirm reachability of the next hop for this connection's cached
 * IPv6 route; only meaningful for INP_IPV6 pcbs with a route attached.
 */
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */

#ifndef TUBA_INCLUDE

int
tcp_reass(tp, th, m, tlen)
	register struct tcpcb *tp;
	register struct tcphdr *th;
	struct mbuf *m;
	int *tlen;
{
	register struct ipqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	MALLOC(tiqe, struct ipqent *, sizeof(struct ipqent), M_IPQ, M_NOWAIT);
	if (tiqe == NULL) {
		tcpstat.tcps_rcvmemdrop++;
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 * The queue is kept sorted by th_seq; p trails q so p is the
	 * last entry that starts at or before this segment.
	 */
	for (p = NULL, q = tp->segq.lh_first; q != NULL;
	    p = q, q = q->ipqe_q.le_next)
		if (SEQ_GT(q->ipqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		register struct tcphdr *phdr = p->ipqe_tcp;
		register int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			/* predecessor covers us entirely: pure duplicate */
			if (i >= *tlen) {
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				FREE(tiqe, M_IPQ);
				return (0);
			}
			/* trim the overlapping prefix off our mbuf chain */
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		register struct tcphdr *qhdr = q->ipqe_tcp;
		register int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			/* partial overlap: shave the front of the queued one */
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->ipqe_m, i);
			break;
		}
		/* queued segment fully covered by us: drop it */
		nq = q->ipqe_q.le_next;
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		FREE(q, M_IPQ);
	}

	/*
	 * Insert the new fragment queue entry into place.
	 * Note: th points into m's data; its th_reseqlen field is
	 * reused to carry the (trimmed) segment length while queued.
	 */
	tiqe->ipqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->ipqe_tcp = th;
	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = tp->segq.lh_first;
	/* nothing contiguous with rcv_nxt yet */
	if (q == NULL || q->ipqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	/* don't deliver data before the handshake completes */
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->ipqe_tcp->th_reseqlen;
		flags = q->ipqe_tcp->th_flags & TH_FIN;

		nq = q->ipqe_q.le_next;
		LIST_REMOVE(q, ipqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->ipqe_m);	/* receiver shut down: discard */
		else
			sbappend(&so->so_rcv, q->ipqe_m);
		FREE(q, M_IPQ);
		q = nq;
	} while (q != NULL && q->ipqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	/* TH_FIN of the last delivered segment, 0 otherwise */
	return (flags);
}

/*
 * First check for a port-specific bomb. We do not want to drop half-opens
 * for other ports if this is the only port being bombed.  We only check
 * the bottom 40 half open connections, to avoid wasting too much time.
 *
 * Or, otherwise it is more likely a generic syn bomb, so delete the oldest
 * half-open connection.
 */
void
tcpdropoldhalfopen(avoidtp, port)
	struct tcpcb *avoidtp;	/* the pcb we must not drop (the new arrival) */
	u_int16_t port;		/* local port being SYN-bombed */
{
	register struct inpcb *inp;
	register struct tcpcb *tp;
	int ncheck = 40;	/* bound the port-specific scan */
	int s;

	/* Block network interrupts while walking the global tcb queue. */
	s = splnet();
	/*
	 * Pass 1: scan up to 40 entries from the tail of the queue for a
	 * SYN_RECEIVED pcb on the same local port, and close the first hit.
	 */
	inp = tcbtable.inpt_queue.cqh_first;
	if (inp)						/* XXX */
	for (; inp != (struct inpcb *)&tcbtable.inpt_queue && --ncheck;
	    inp = inp->inp_queue.cqe_prev) {
		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
		    tp != avoidtp &&
		    tp->t_state == TCPS_SYN_RECEIVED &&
		    port == inp->inp_lport) {
			tcp_close(tp);
			goto done;
		}
	}

	/*
	 * Pass 2: no per-port victim found, so assume a generic SYN bomb
	 * and close the first half-open connection on any port.
	 */
	inp = tcbtable.inpt_queue.cqh_first;
	if (inp)						/* XXX */
	for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
	    inp = inp->inp_queue.cqe_prev) {
		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
		    tp != avoidtp &&
		    tp->t_state == TCPS_SYN_RECEIVED) {
			tcp_close(tp);
			goto done;
		}
	}
done:
	splx(s);
}

#ifdef INET6
/*
 * IPv6 protocol-switch entry point for TCP: reject packets we refuse to
 * serve (FAITH interface, anycast destinations), then hand the mbuf to
 * the common tcp_input().  Always consumes the packet; returns
 * IPPROTO_DONE so ip6 input processing stops here.
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;

#if defined(NFAITH) && 0 < NFAITH
	/* Packets arriving on a FAITH (translator) interface are dropped. */
	if (m->m_pkthdr.rcvif) {
		if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
			/* XXX send icmp6 host/port unreach? */
			m_freem(m);
			return IPPROTO_DONE;
		}
	}
#endif

	/*
	 * TCP to an anycast address is refused with an ICMPv6 address
	 * unreachable, per draft-itojun-ipv6-tcp-to-anycast.
	 * better place to put this in?
	 */
	if (m->m_flags & M_ANYCAST6) {
		if (m->m_len >= sizeof(struct ip6_hdr)) {
			struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
			/* last arg: offset of the offending field (dst) */
			icmp6_error(m, ICMP6_DST_UNREACH,
			    ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		} else
			m_freem(m);
		return IPPROTO_DONE;
	}

	tcp_input(m, *offp, proto);
	return IPPROTO_DONE;
}
#endif

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
#if __STDC__
tcp_input(struct mbuf *m, ...)
367 #else 368 tcp_input(m, va_alist) 369 register struct mbuf *m; 370 #endif 371 { 372 struct ip *ip; 373 register struct inpcb *inp; 374 caddr_t optp = NULL; 375 int optlen = 0; 376 int len, tlen, off; 377 register struct tcpcb *tp = 0; 378 register int tiflags; 379 struct socket *so = NULL; 380 int todrop, acked, ourfinisacked, needoutput = 0; 381 int hdroptlen = 0; 382 short ostate = 0; 383 struct in_addr laddr; 384 int dropsocket = 0; 385 int iss = 0; 386 u_long tiwin; 387 u_int32_t ts_val, ts_ecr; 388 int ts_present = 0; 389 int iphlen; 390 va_list ap; 391 register struct tcphdr *th; 392 #ifdef INET6 393 struct in6_addr laddr6; 394 struct ip6_hdr *ipv6 = NULL; 395 #endif /* INET6 */ 396 #ifdef IPSEC 397 struct tdb_ident *tdbi; 398 struct tdb *tdb; 399 int error, s; 400 #endif /* IPSEC */ 401 int af; 402 403 #ifdef IPSEC 404 tdbi = (struct tdb_ident *) m->m_pkthdr.tdbi; 405 if (tdbi == (void *) PI_MAGIC) 406 tdbi = NULL; 407 #endif /* IPSEC */ 408 409 va_start(ap, m); 410 iphlen = va_arg(ap, int); 411 va_end(ap); 412 413 tcpstat.tcps_rcvtotal++; 414 415 /* 416 * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or 417 * TCP/IPv4. 418 */ 419 switch (mtod(m, struct ip *)->ip_v) { 420 #ifdef INET6 421 case 6: 422 af = AF_INET6; 423 break; 424 #endif 425 case 4: 426 af = AF_INET; 427 break; 428 default: 429 #ifdef IPSEC 430 if (tdbi) 431 free(tdbi, M_TEMP); 432 #endif /* IPSEC */ 433 m_freem(m); 434 return; /*EAFNOSUPPORT*/ 435 } 436 437 /* 438 * Get IP and TCP header together in first mbuf. 439 * Note: IP leaves IP header in first mbuf. 
440 */ 441 switch (af) { 442 case AF_INET: 443 #ifdef DIAGNOSTIC 444 if (iphlen < sizeof(struct ip)) { 445 #ifdef IPSEC 446 if (tdbi) 447 free(tdbi, M_TEMP); 448 #endif /* IPSEC */ 449 m_freem(m); 450 return; 451 } 452 #endif /* DIAGNOSTIC */ 453 if (iphlen > sizeof(struct ip)) { 454 #if 0 /*XXX*/ 455 ip_stripoptions(m, (struct mbuf *)0); 456 iphlen = sizeof(struct ip); 457 #else 458 printf("extension headers are not allowed\n"); 459 #ifdef IPSEC 460 if (tdbi) 461 free(tdbi, M_TEMP); 462 #endif /* IPSEC */ 463 m_freem(m); 464 return; 465 #endif 466 } 467 break; 468 #ifdef INET6 469 case AF_INET6: 470 #ifdef DIAGNOSTIC 471 if (iphlen < sizeof(struct ip6_hdr)) { 472 m_freem(m); 473 #ifdef IPSEC 474 if (tdbi) 475 free(tdbi, M_TEMP); 476 #endif /* IPSEC */ 477 return; 478 } 479 #endif /* DIAGNOSTIC */ 480 if (iphlen > sizeof(struct ip6_hdr)) { 481 #if 0 /*XXX*/ 482 ipv6_stripoptions(m, iphlen); 483 iphlen = sizeof(struct ip6_hdr); 484 #else 485 printf("extension headers are not allowed\n"); 486 #ifdef IPSEC 487 if (tdbi) 488 free(tdbi, M_TEMP); 489 #endif /* IPSEC */ 490 m_freem(m); 491 return; 492 #endif 493 } 494 break; 495 #endif 496 default: 497 #ifdef IPSEC 498 if (tdbi) 499 free(tdbi, M_TEMP); 500 #endif /* IPSEC */ 501 m_freem(m); 502 return; 503 } 504 505 if (m->m_len < iphlen + sizeof(struct tcphdr)) { 506 m = m_pullup2(m, iphlen + sizeof(struct tcphdr)); 507 if (m == 0) { 508 tcpstat.tcps_rcvshort++; 509 #ifdef IPSEC 510 if (tdbi) 511 free(tdbi, M_TEMP); 512 #endif /* IPSEC */ 513 return; 514 } 515 } 516 517 ip = NULL; 518 #ifdef INET6 519 ipv6 = NULL; 520 #endif 521 switch (af) { 522 case AF_INET: 523 { 524 struct tcpiphdr *ti; 525 526 ip = mtod(m, struct ip *); 527 #if 1 528 tlen = m->m_pkthdr.len - iphlen; 529 #else 530 tlen = ((struct ip *)ti)->ip_len; 531 #endif 532 ti = mtod(m, struct tcpiphdr *); 533 534 /* 535 * Checksum extended TCP header and data. 
536 */ 537 len = sizeof(struct ip) + tlen; 538 bzero(ti->ti_x1, sizeof ti->ti_x1); 539 ti->ti_len = (u_int16_t)tlen; 540 HTONS(ti->ti_len); 541 if ((ti->ti_sum = in_cksum(m, len)) != 0) { 542 tcpstat.tcps_rcvbadsum++; 543 goto drop; 544 } 545 break; 546 } 547 #ifdef INET6 548 case AF_INET6: 549 ipv6 = mtod(m, struct ip6_hdr *); 550 tlen = m->m_pkthdr.len - iphlen; 551 552 /* Be proactive about malicious use of IPv4 mapped address */ 553 if (IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_src) || 554 IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_dst)) { 555 /* XXX stat */ 556 goto drop; 557 } 558 559 /* 560 * Be proactive about unspecified IPv6 address in source. 561 * As we use all-zero to indicate unbounded/unconnected pcb, 562 * unspecified IPv6 address can be used to confuse us. 563 * 564 * Note that packets with unspecified IPv6 destination is 565 * already dropped in ip6_input. 566 */ 567 if (IN6_IS_ADDR_UNSPECIFIED(&ipv6->ip6_src)) { 568 /* XXX stat */ 569 goto drop; 570 } 571 572 /* 573 * Checksum extended TCP header and data. 574 */ 575 if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) { 576 tcpstat.tcps_rcvbadsum++; 577 goto drop; 578 } 579 break; 580 #endif 581 } 582 #endif /* TUBA_INCLUDE */ 583 584 th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); 585 586 /* 587 * Check that TCP offset makes sense, 588 * pull out TCP options and adjust length. 
XXX 589 */ 590 off = th->th_off << 2; 591 if (off < sizeof(struct tcphdr) || off > tlen) { 592 tcpstat.tcps_rcvbadoff++; 593 goto drop; 594 } 595 tlen -= off; 596 if (off > sizeof(struct tcphdr)) { 597 if (m->m_len < iphlen + off) { 598 if ((m = m_pullup2(m, iphlen + off)) == 0) { 599 tcpstat.tcps_rcvshort++; 600 #ifdef IPSEC 601 if (tdbi) 602 free(tdbi, M_TEMP); 603 #endif /* IPSEC */ 604 return; 605 } 606 switch (af) { 607 case AF_INET: 608 ip = mtod(m, struct ip *); 609 break; 610 #ifdef INET6 611 case AF_INET6: 612 ipv6 = mtod(m, struct ip6_hdr *); 613 break; 614 #endif 615 } 616 th = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); 617 } 618 optlen = off - sizeof(struct tcphdr); 619 optp = mtod(m, caddr_t) + iphlen + sizeof(struct tcphdr); 620 /* 621 * Do quick retrieval of timestamp options ("options 622 * prediction?"). If timestamp is the only option and it's 623 * formatted as recommended in RFC 1323 appendix A, we 624 * quickly get the values now and not bother calling 625 * tcp_dooptions(), etc. 626 */ 627 if ((optlen == TCPOLEN_TSTAMP_APPA || 628 (optlen > TCPOLEN_TSTAMP_APPA && 629 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 630 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 631 (th->th_flags & TH_SYN) == 0) { 632 ts_present = 1; 633 ts_val = ntohl(*(u_int32_t *)(optp + 4)); 634 ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 635 optp = NULL; /* we've parsed the options */ 636 } 637 } 638 tiflags = th->th_flags; 639 640 /* 641 * Convert TCP protocol specific fields to host format. 642 */ 643 NTOHL(th->th_seq); 644 NTOHL(th->th_ack); 645 NTOHS(th->th_win); 646 NTOHS(th->th_urp); 647 648 /* 649 * Locate pcb for segment. 
650 */ 651 findpcb: 652 switch (af) { 653 #ifdef INET6 654 case AF_INET6: 655 inp = in6_pcbhashlookup(&tcbtable, &ipv6->ip6_src, th->th_sport, 656 &ipv6->ip6_dst, th->th_dport); 657 break; 658 #endif 659 case AF_INET: 660 inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport, 661 ip->ip_dst, th->th_dport); 662 break; 663 } 664 if (inp == 0) { 665 ++tcpstat.tcps_pcbhashmiss; 666 switch (af) { 667 #ifdef INET6 668 case AF_INET6: 669 inp = in_pcblookup(&tcbtable, &ipv6->ip6_src, 670 th->th_sport, &ipv6->ip6_dst, th->th_dport, 671 INPLOOKUP_WILDCARD | INPLOOKUP_IPV6); 672 break; 673 #endif /* INET6 */ 674 case AF_INET: 675 inp = in_pcblookup(&tcbtable, &ip->ip_src, th->th_sport, 676 &ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD); 677 break; 678 } 679 /* 680 * If the state is CLOSED (i.e., TCB does not exist) then 681 * all data in the incoming segment is discarded. 682 * If the TCB exists but is in CLOSED state, it is embryonic, 683 * but should either do a listen or a connect soon. 684 */ 685 if (inp == 0) { 686 ++tcpstat.tcps_noport; 687 goto dropwithreset_ratelim; 688 } 689 } 690 691 tp = intotcpcb(inp); 692 if (tp == 0) 693 goto dropwithreset_ratelim; 694 if (tp->t_state == TCPS_CLOSED) 695 goto drop; 696 697 /* Unscale the window into a 32-bit value. 
*/ 698 if ((tiflags & TH_SYN) == 0) 699 tiwin = th->th_win << tp->snd_scale; 700 else 701 tiwin = th->th_win; 702 703 so = inp->inp_socket; 704 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 705 if (so->so_options & SO_DEBUG) { 706 ostate = tp->t_state; 707 switch (af) { 708 #ifdef INET6 709 case AF_INET6: 710 tcp_saveti6 = *(mtod(m, struct tcpipv6hdr *)); 711 break; 712 #endif 713 case AF_INET: 714 tcp_saveti = *(mtod(m, struct tcpiphdr *)); 715 break; 716 } 717 } 718 if (so->so_options & SO_ACCEPTCONN) { 719 struct socket *so1; 720 721 so1 = sonewconn(so, 0); 722 if (so1 == NULL) { 723 tcpdropoldhalfopen(tp, th->th_dport); 724 so1 = sonewconn(so, 0); 725 if (so1 == NULL) 726 goto drop; 727 } 728 so = so1; 729 /* 730 * This is ugly, but .... 731 * 732 * Mark socket as temporary until we're 733 * committed to keeping it. The code at 734 * ``drop'' and ``dropwithreset'' check the 735 * flag dropsocket to see if the temporary 736 * socket created here should be discarded. 737 * We mark the socket as discardable until 738 * we're committed to it below in TCPS_LISTEN. 739 */ 740 dropsocket++; 741 #ifdef IPSEC 742 /* 743 * We need to copy the required security levels 744 * from the old pcb. 745 */ 746 { 747 struct inpcb *newinp = (struct inpcb *)so->so_pcb; 748 bcopy(inp->inp_seclevel, newinp->inp_seclevel, 749 sizeof(inp->inp_seclevel)); 750 newinp->inp_secrequire = inp->inp_secrequire; 751 } 752 #endif /* IPSEC */ 753 #ifdef INET6 754 /* 755 * inp still has the OLD in_pcb stuff, set the 756 * v6-related flags on the new guy, too. This is 757 * done particularly for the case where an AF_INET6 758 * socket is bound only to a port, and a v4 connection 759 * comes in on that port. 760 * we also copy the flowinfo from the original pcb 761 * to the new one. 
762 */ 763 { 764 int flags = inp->inp_flags; 765 struct inpcb *oldinpcb = inp; 766 767 inp = (struct inpcb *)so->so_pcb; 768 inp->inp_flags |= (flags & INP_IPV6); 769 if ((inp->inp_flags & INP_IPV6) != 0) { 770 inp->inp_ipv6.ip6_hlim = 771 oldinpcb->inp_ipv6.ip6_hlim; 772 inp->inp_ipv6.ip6_flow = 773 oldinpcb->inp_ipv6.ip6_flow; 774 } 775 } 776 #else /* INET6 */ 777 inp = (struct inpcb *)so->so_pcb; 778 #endif /* INET6 */ 779 inp->inp_lport = th->th_dport; 780 switch (af) { 781 #ifdef INET6 782 case AF_INET6: 783 inp->inp_laddr6 = ipv6->ip6_dst; 784 inp->inp_fflowinfo = 785 htonl(0x0fffffff) & ipv6->ip6_flow; 786 787 /*inp->inp_options = ip6_srcroute();*/ /* soon. */ 788 /* 789 * still need to tweak outbound options 790 * processing to include this mbuf in 791 * the right place and put the correct 792 * NextHdr values in the right places. 793 * XXX rja 794 */ 795 break; 796 #endif /* INET6 */ 797 case AF_INET: 798 inp->inp_laddr = ip->ip_dst; 799 inp->inp_options = ip_srcroute(); 800 break; 801 } 802 in_pcbrehash(inp); 803 tp = intotcpcb(inp); 804 tp->t_state = TCPS_LISTEN; 805 806 /* Compute proper scaling value from buffer space 807 */ 808 tcp_rscale(tp, so->so_rcv.sb_hiwat); 809 } 810 } 811 812 #ifdef IPSEC 813 s = splnet(); 814 if (tdbi == NULL) 815 tdb = NULL; 816 else 817 tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto); 818 819 ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN, 820 tdb, inp); 821 splx(s); 822 823 if (tdbi) 824 free(tdbi, M_TEMP); 825 tdbi = NULL; 826 827 /* Error or otherwise drop-packet indication */ 828 if (error) 829 goto drop; 830 #endif /* IPSEC */ 831 832 /* 833 * Segment received on connection. 834 * Reset idle time and keep-alive timer. 
835 */ 836 tp->t_idle = 0; 837 if (tp->t_state != TCPS_SYN_RECEIVED) 838 tp->t_timer[TCPT_KEEP] = tcp_keepidle; 839 840 #ifdef TCP_SACK 841 if (!tp->sack_disable) 842 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ 843 #endif /* TCP_SACK */ 844 845 /* 846 * Process options if not in LISTEN state, 847 * else do it below (after getting remote address). 848 */ 849 if (optp && tp->t_state != TCPS_LISTEN) 850 tcp_dooptions(tp, optp, optlen, th, 851 &ts_present, &ts_val, &ts_ecr); 852 853 #ifdef TCP_SACK 854 if (!tp->sack_disable) { 855 tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/ 856 tp->rcv_lastend = th->th_seq + tlen; 857 } 858 #endif /* TCP_SACK */ 859 /* 860 * Header prediction: check for the two common cases 861 * of a uni-directional data xfer. If the packet has 862 * no control flags, is in-sequence, the window didn't 863 * change and we're not retransmitting, it's a 864 * candidate. If the length is zero and the ack moved 865 * forward, we're the sender side of the xfer. Just 866 * free the data acked & wake any higher level process 867 * that was blocked waiting for space. If the length 868 * is non-zero and the ack didn't move, we're the 869 * receiver side. If we're getting packets in-order 870 * (the reassembly queue is empty), add the data to 871 * the socket buffer and note that we need a delayed ack. 872 */ 873 if (tp->t_state == TCPS_ESTABLISHED && 874 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 875 (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && 876 th->th_seq == tp->rcv_nxt && 877 tiwin && tiwin == tp->snd_wnd && 878 tp->snd_nxt == tp->snd_max) { 879 880 /* 881 * If last ACK falls within this segment's sequence numbers, 882 * record the timestamp. 883 * Fix from Braden, see Stevens p. 
870 884 */ 885 if (ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 886 tp->ts_recent_age = tcp_now; 887 tp->ts_recent = ts_val; 888 } 889 890 if (tlen == 0) { 891 if (SEQ_GT(th->th_ack, tp->snd_una) && 892 SEQ_LEQ(th->th_ack, tp->snd_max) && 893 tp->snd_cwnd >= tp->snd_wnd && 894 tp->t_dupacks == 0) { 895 /* 896 * this is a pure ack for outstanding data. 897 */ 898 ++tcpstat.tcps_predack; 899 if (ts_present) 900 tcp_xmit_timer(tp, tcp_now-ts_ecr+1); 901 else if (tp->t_rtt && 902 SEQ_GT(th->th_ack, tp->t_rtseq)) 903 tcp_xmit_timer(tp, tp->t_rtt); 904 acked = th->th_ack - tp->snd_una; 905 tcpstat.tcps_rcvackpack++; 906 tcpstat.tcps_rcvackbyte += acked; 907 ND6_HINT(tp); 908 sbdrop(&so->so_snd, acked); 909 tp->snd_una = th->th_ack; 910 #if defined(TCP_SACK) 911 /* 912 * We want snd_last to track snd_una so 913 * as to avoid sequence wraparound problems 914 * for very large transfers. 915 */ 916 tp->snd_last = tp->snd_una; 917 #endif /* TCP_SACK */ 918 #if defined(TCP_SACK) && defined(TCP_FACK) 919 tp->snd_fack = tp->snd_una; 920 tp->retran_data = 0; 921 #endif /* TCP_FACK */ 922 m_freem(m); 923 924 /* 925 * If all outstanding data are acked, stop 926 * retransmit timer, otherwise restart timer 927 * using current (possibly backed-off) value. 928 * If process is waiting for space, 929 * wakeup/selwakeup/signal. If data 930 * are ready to send, let tcp_output 931 * decide between more output or persist. 932 */ 933 if (tp->snd_una == tp->snd_max) 934 tp->t_timer[TCPT_REXMT] = 0; 935 else if (tp->t_timer[TCPT_PERSIST] == 0) 936 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 937 938 if (sb_notify(&so->so_snd)) 939 sowwakeup(so); 940 if (so->so_snd.sb_cc) 941 (void) tcp_output(tp); 942 return; 943 } 944 } else if (th->th_ack == tp->snd_una && 945 tp->segq.lh_first == NULL && 946 tlen <= sbspace(&so->so_rcv)) { 947 /* 948 * This is a pure, in-sequence data packet 949 * with nothing on the reassembly queue and 950 * we have enough buffer space to take it. 
951 */ 952 #ifdef TCP_SACK 953 /* Clean receiver SACK report if present */ 954 if (!tp->sack_disable && tp->rcv_numsacks) 955 tcp_clean_sackreport(tp); 956 #endif /* TCP_SACK */ 957 ++tcpstat.tcps_preddat; 958 tp->rcv_nxt += tlen; 959 tcpstat.tcps_rcvpack++; 960 tcpstat.tcps_rcvbyte += tlen; 961 ND6_HINT(tp); 962 /* 963 * Drop TCP, IP headers and TCP options then add data 964 * to socket buffer. 965 */ 966 if (th->th_flags & TH_PUSH) 967 tp->t_flags |= TF_ACKNOW; 968 else 969 tp->t_flags |= TF_DELACK; 970 m_adj(m, iphlen + off); 971 sbappend(&so->so_rcv, m); 972 sorwakeup(so); 973 return; 974 } 975 } 976 977 /* 978 * Compute mbuf offset to TCP data segment. 979 */ 980 hdroptlen = iphlen + off; 981 982 /* 983 * Calculate amount of space in receive window, 984 * and then do TCP input processing. 985 * Receive window is amount of space in rcv queue, 986 * but not less than advertised window. 987 */ 988 { int win; 989 990 win = sbspace(&so->so_rcv); 991 if (win < 0) 992 win = 0; 993 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 994 } 995 996 switch (tp->t_state) { 997 998 /* 999 * If the state is LISTEN then ignore segment if it contains an RST. 1000 * If the segment contains an ACK then it is bad and send a RST. 1001 * If it does not contain a SYN then it is not interesting; drop it. 1002 * If it is from this socket, drop it, it must be forged. 1003 * Don't bother responding if the destination was a broadcast. 1004 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial 1005 * tp->iss, and send a segment: 1006 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 1007 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. 1008 * Fill in remote peer address fields if not previously specified. 1009 * Enter SYN_RECEIVED state, and process any other fields of this 1010 * segment in this state. 
1011 */ 1012 case TCPS_LISTEN: { 1013 struct mbuf *am; 1014 register struct sockaddr_in *sin; 1015 #ifdef INET6 1016 register struct sockaddr_in6 *sin6; 1017 #endif /* INET6 */ 1018 1019 if (tiflags & TH_RST) 1020 goto drop; 1021 if (tiflags & TH_ACK) 1022 goto dropwithreset; 1023 if ((tiflags & TH_SYN) == 0) 1024 goto drop; 1025 if (th->th_dport == th->th_sport) { 1026 switch (af) { 1027 #ifdef INET6 1028 case AF_INET6: 1029 if (IN6_ARE_ADDR_EQUAL(&ipv6->ip6_src, 1030 &ipv6->ip6_dst)) 1031 goto drop; 1032 break; 1033 #endif /* INET6 */ 1034 case AF_INET: 1035 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) 1036 goto drop; 1037 break; 1038 } 1039 } 1040 1041 /* 1042 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 1043 * in_broadcast() should never return true on a received 1044 * packet with M_BCAST not set. 1045 */ 1046 if (m->m_flags & (M_BCAST|M_MCAST)) 1047 goto drop; 1048 switch (af) { 1049 #ifdef INET6 1050 case AF_INET6: 1051 /* XXX What about IPv6 Anycasting ?? :-( rja */ 1052 if (IN6_IS_ADDR_MULTICAST(&ipv6->ip6_dst)) 1053 goto drop; 1054 break; 1055 #endif /* INET6 */ 1056 case AF_INET: 1057 if (IN_MULTICAST(ip->ip_dst.s_addr)) 1058 goto drop; 1059 break; 1060 } 1061 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 1062 if (am == NULL) 1063 goto drop; 1064 switch (af) { 1065 #ifdef INET6 1066 case AF_INET6: 1067 /* 1068 * This is probably the place to set the tp->pf value. 1069 * (Don't forget to do it in the v4 code as well!) 1070 * 1071 * Also, remember to blank out things like flowlabel, or 1072 * set flowlabel for accepted sockets in v6. 1073 * 1074 * FURTHERMORE, this is PROBABLY the place where the 1075 * whole business of key munging is set up for passive 1076 * connections. 
1077 */ 1078 am->m_len = sizeof(struct sockaddr_in6); 1079 sin6 = mtod(am, struct sockaddr_in6 *); 1080 sin6->sin6_family = AF_INET6; 1081 sin6->sin6_len = sizeof(struct sockaddr_in6); 1082 sin6->sin6_addr = ipv6->ip6_src; 1083 sin6->sin6_port = th->th_sport; 1084 sin6->sin6_flowinfo = htonl(0x0fffffff) & 1085 inp->inp_ipv6.ip6_flow; 1086 laddr6 = inp->inp_laddr6; 1087 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) 1088 inp->inp_laddr6 = ipv6->ip6_dst; 1089 /* This is a good optimization. */ 1090 if (in6_pcbconnect(inp, am)) { 1091 inp->inp_laddr6 = laddr6; 1092 (void) m_free(am); 1093 goto drop; 1094 } 1095 break; 1096 #endif 1097 case AF_INET: 1098 /* drop IPv4 packet to AF_INET6 socket */ 1099 if (inp->inp_flags & INP_IPV6) { 1100 (void) m_free(am); 1101 goto drop; 1102 } 1103 am->m_len = sizeof(struct sockaddr_in); 1104 sin = mtod(am, struct sockaddr_in *); 1105 sin->sin_family = AF_INET; 1106 sin->sin_len = sizeof(*sin); 1107 sin->sin_addr = ip->ip_src; 1108 sin->sin_port = th->th_sport; 1109 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); 1110 laddr = inp->inp_laddr; 1111 if (inp->inp_laddr.s_addr == INADDR_ANY) 1112 inp->inp_laddr = ip->ip_dst; 1113 if (in_pcbconnect(inp, am)) { 1114 inp->inp_laddr = laddr; 1115 (void) m_free(am); 1116 goto drop; 1117 } 1118 (void) m_free(am); 1119 break; 1120 } 1121 tp->t_template = tcp_template(tp); 1122 if (tp->t_template == 0) { 1123 tp = tcp_drop(tp, ENOBUFS); 1124 dropsocket = 0; /* socket is already gone */ 1125 goto drop; 1126 } 1127 if (optp) 1128 tcp_dooptions(tp, optp, optlen, th, 1129 &ts_present, &ts_val, &ts_ecr); 1130 #ifdef TCP_SACK 1131 /* 1132 * If peer did not send a SACK_PERMITTED option (i.e., if 1133 * tcp_dooptions() did not set TF_SACK_PERMIT), set 1134 * sack_disable to 1 if it is currently 0. 
1135 */ 1136 if (!tp->sack_disable) 1137 if ((tp->t_flags & TF_SACK_PERMIT) == 0) 1138 tp->sack_disable = 1; 1139 #endif 1140 1141 if (iss) 1142 tp->iss = iss; 1143 else { 1144 #ifdef TCP_COMPAT_42 1145 tcp_iss += TCP_ISSINCR/2; 1146 tp->iss = tcp_iss; 1147 #else /* TCP_COMPAT_42 */ 1148 tp->iss = tcp_rndiss_next(); 1149 #endif /* !TCP_COMPAT_42 */ 1150 } 1151 tp->irs = th->th_seq; 1152 tcp_sendseqinit(tp); 1153 #if defined (TCP_SACK) 1154 tp->snd_last = tp->snd_una; 1155 #endif /* TCP_SACK */ 1156 #if defined(TCP_SACK) && defined(TCP_FACK) 1157 tp->snd_fack = tp->snd_una; 1158 tp->retran_data = 0; 1159 tp->snd_awnd = 0; 1160 #endif /* TCP_FACK */ 1161 tcp_rcvseqinit(tp); 1162 tp->t_flags |= TF_ACKNOW; 1163 tp->t_state = TCPS_SYN_RECEIVED; 1164 tp->t_timer[TCPT_KEEP] = tcptv_keep_init; 1165 dropsocket = 0; /* committed to socket */ 1166 tcpstat.tcps_accepts++; 1167 goto trimthenstep6; 1168 } 1169 1170 /* 1171 * If the state is SYN_RECEIVED: 1172 * if seg contains SYN/ACK, send an RST. 1173 * if seg contains an ACK, but not for our SYN/ACK, send an RST 1174 */ 1175 1176 case TCPS_SYN_RECEIVED: 1177 if (tiflags & TH_ACK) { 1178 if (tiflags & TH_SYN) { 1179 tcpstat.tcps_badsyn++; 1180 goto dropwithreset; 1181 } 1182 if (SEQ_LEQ(th->th_ack, tp->snd_una) || 1183 SEQ_GT(th->th_ack, tp->snd_max)) 1184 goto dropwithreset; 1185 } 1186 break; 1187 1188 /* 1189 * If the state is SYN_SENT: 1190 * if seg contains an ACK, but not for our SYN, drop the input. 1191 * if seg contains a RST, then drop the connection. 1192 * if seg does not contain SYN, then drop it. 
1193 * Otherwise this is an acceptable SYN segment 1194 * initialize tp->rcv_nxt and tp->irs 1195 * if seg contains ack then advance tp->snd_una 1196 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1197 * arrange for segment to be acked (eventually) 1198 * continue processing rest of data/controls, beginning with URG 1199 */ 1200 case TCPS_SYN_SENT: 1201 if ((tiflags & TH_ACK) && 1202 (SEQ_LEQ(th->th_ack, tp->iss) || 1203 SEQ_GT(th->th_ack, tp->snd_max))) 1204 goto dropwithreset; 1205 if (tiflags & TH_RST) { 1206 if (tiflags & TH_ACK) 1207 tp = tcp_drop(tp, ECONNREFUSED); 1208 goto drop; 1209 } 1210 if ((tiflags & TH_SYN) == 0) 1211 goto drop; 1212 if (tiflags & TH_ACK) { 1213 tp->snd_una = th->th_ack; 1214 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1215 tp->snd_nxt = tp->snd_una; 1216 } 1217 tp->t_timer[TCPT_REXMT] = 0; 1218 tp->irs = th->th_seq; 1219 tcp_rcvseqinit(tp); 1220 tp->t_flags |= TF_ACKNOW; 1221 #ifdef TCP_SACK 1222 /* 1223 * If we've sent a SACK_PERMITTED option, and the peer 1224 * also replied with one, then TF_SACK_PERMIT should have 1225 * been set in tcp_dooptions(). If it was not, disable SACKs. 1226 */ 1227 if (!tp->sack_disable) 1228 if ((tp->t_flags & TF_SACK_PERMIT) == 0) 1229 tp->sack_disable = 1; 1230 #endif 1231 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1232 tcpstat.tcps_connects++; 1233 soisconnected(so); 1234 tp->t_state = TCPS_ESTABLISHED; 1235 /* Do window scaling on this connection? */ 1236 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1237 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1238 tp->snd_scale = tp->requested_s_scale; 1239 tp->rcv_scale = tp->request_r_scale; 1240 } 1241 (void) tcp_reass(tp, (struct tcphdr *)0, 1242 (struct mbuf *)0, &tlen); 1243 /* 1244 * if we didn't have to retransmit the SYN, 1245 * use its rtt as our initial srtt & rtt var. 1246 */ 1247 if (tp->t_rtt) 1248 tcp_xmit_timer(tp, tp->t_rtt); 1249 /* 1250 * Since new data was acked (the SYN), open the 1251 * congestion window by one MSS. 
We do this 1252 * here, because we won't go through the normal 1253 * ACK processing below. And since this is the 1254 * start of the connection, we know we are in 1255 * the exponential phase of slow-start. 1256 */ 1257 tp->snd_cwnd += tp->t_maxseg; 1258 } else 1259 tp->t_state = TCPS_SYN_RECEIVED; 1260 1261 trimthenstep6: 1262 /* 1263 * Advance th->th_seq to correspond to first data byte. 1264 * If data, trim to stay within window, 1265 * dropping FIN if necessary. 1266 */ 1267 th->th_seq++; 1268 if (tlen > tp->rcv_wnd) { 1269 todrop = tlen - tp->rcv_wnd; 1270 m_adj(m, -todrop); 1271 tlen = tp->rcv_wnd; 1272 tiflags &= ~TH_FIN; 1273 tcpstat.tcps_rcvpackafterwin++; 1274 tcpstat.tcps_rcvbyteafterwin += todrop; 1275 } 1276 tp->snd_wl1 = th->th_seq - 1; 1277 tp->rcv_up = th->th_seq; 1278 goto step6; 1279 } 1280 1281 /* 1282 * States other than LISTEN or SYN_SENT. 1283 * First check timestamp, if present. 1284 * Then check that at least some bytes of segment are within 1285 * receive window. If segment begins before rcv_nxt, 1286 * drop leading data (and SYN); if nothing left, just ack. 1287 * 1288 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1289 * and it's less than ts_recent, drop it. 1290 */ 1291 if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1292 TSTMP_LT(ts_val, tp->ts_recent)) { 1293 1294 /* Check to see if ts_recent is over 24 days old. */ 1295 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { 1296 /* 1297 * Invalidate ts_recent. If this segment updates 1298 * ts_recent, the age will be reset later and ts_recent 1299 * will get a valid value. If it does not, setting 1300 * ts_recent to zero will at least satisfy the 1301 * requirement that zero be placed in the timestamp 1302 * echo reply when ts_recent isn't valid. The 1303 * age isn't reset until we get a valid ts_recent 1304 * because we don't want out-of-order segments to be 1305 * dropped when ts_recent is old. 
1306 */ 1307 tp->ts_recent = 0; 1308 } else { 1309 tcpstat.tcps_rcvduppack++; 1310 tcpstat.tcps_rcvdupbyte += tlen; 1311 tcpstat.tcps_pawsdrop++; 1312 goto dropafterack; 1313 } 1314 } 1315 1316 todrop = tp->rcv_nxt - th->th_seq; 1317 if (todrop > 0) { 1318 if (tiflags & TH_SYN) { 1319 tiflags &= ~TH_SYN; 1320 th->th_seq++; 1321 if (th->th_urp > 1) 1322 th->th_urp--; 1323 else 1324 tiflags &= ~TH_URG; 1325 todrop--; 1326 } 1327 if (todrop >= tlen || 1328 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1329 /* 1330 * Any valid FIN must be to the left of the 1331 * window. At this point, FIN must be a 1332 * duplicate or out-of-sequence, so drop it. 1333 */ 1334 tiflags &= ~TH_FIN; 1335 /* 1336 * Send ACK to resynchronize, and drop any data, 1337 * but keep on processing for RST or ACK. 1338 */ 1339 tp->t_flags |= TF_ACKNOW; 1340 tcpstat.tcps_rcvdupbyte += todrop = tlen; 1341 tcpstat.tcps_rcvduppack++; 1342 } else { 1343 tcpstat.tcps_rcvpartduppack++; 1344 tcpstat.tcps_rcvpartdupbyte += todrop; 1345 } 1346 hdroptlen += todrop; /* drop from head afterwards */ 1347 th->th_seq += todrop; 1348 tlen -= todrop; 1349 if (th->th_urp > todrop) 1350 th->th_urp -= todrop; 1351 else { 1352 tiflags &= ~TH_URG; 1353 th->th_urp = 0; 1354 } 1355 } 1356 1357 /* 1358 * If new data are received on a connection after the 1359 * user processes are gone, then RST the other end. 1360 */ 1361 if ((so->so_state & SS_NOFDREF) && 1362 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1363 tp = tcp_close(tp); 1364 tcpstat.tcps_rcvafterclose++; 1365 goto dropwithreset; 1366 } 1367 1368 /* 1369 * If segment ends after window, drop trailing data 1370 * (and PUSH and FIN); if nothing left, just ACK. 
1371 */ 1372 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1373 if (todrop > 0) { 1374 tcpstat.tcps_rcvpackafterwin++; 1375 if (todrop >= tlen) { 1376 tcpstat.tcps_rcvbyteafterwin += tlen; 1377 /* 1378 * If a new connection request is received 1379 * while in TIME_WAIT, drop the old connection 1380 * and start over if the sequence numbers 1381 * are above the previous ones. 1382 */ 1383 if (tiflags & TH_SYN && 1384 tp->t_state == TCPS_TIME_WAIT && 1385 SEQ_GT(th->th_seq, tp->rcv_nxt)) { 1386 iss = tp->snd_nxt + TCP_ISSINCR; 1387 tp = tcp_close(tp); 1388 goto findpcb; 1389 } 1390 /* 1391 * If window is closed can only take segments at 1392 * window edge, and have to drop data and PUSH from 1393 * incoming segments. Continue processing, but 1394 * remember to ack. Otherwise, drop segment 1395 * and ack. 1396 */ 1397 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1398 tp->t_flags |= TF_ACKNOW; 1399 tcpstat.tcps_rcvwinprobe++; 1400 } else 1401 goto dropafterack; 1402 } else 1403 tcpstat.tcps_rcvbyteafterwin += todrop; 1404 m_adj(m, -todrop); 1405 tlen -= todrop; 1406 tiflags &= ~(TH_PUSH|TH_FIN); 1407 } 1408 1409 /* 1410 * If last ACK falls within this segment's sequence numbers, 1411 * record its timestamp. 1412 * Fix from Braden, see Stevens p. 870 1413 */ 1414 if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && 1415 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1416 tp->ts_recent_age = tcp_now; 1417 tp->ts_recent = ts_val; 1418 } 1419 1420 /* 1421 * If the RST bit is set examine the state: 1422 * SYN_RECEIVED STATE: 1423 * If passive open, return to LISTEN state. 1424 * If active open, inform user that connection was refused. 1425 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1426 * Inform user that connection was reset, and close tcb. 1427 * CLOSING, LAST_ACK, TIME_WAIT STATES 1428 * Close the tcb. 
1429 */ 1430 if (tiflags & TH_RST) { 1431 if (th->th_seq != tp->last_ack_sent) 1432 goto drop; 1433 1434 switch (tp->t_state) { 1435 case TCPS_SYN_RECEIVED: 1436 so->so_error = ECONNREFUSED; 1437 goto close; 1438 1439 case TCPS_ESTABLISHED: 1440 case TCPS_FIN_WAIT_1: 1441 case TCPS_FIN_WAIT_2: 1442 case TCPS_CLOSE_WAIT: 1443 so->so_error = ECONNRESET; 1444 close: 1445 tp->t_state = TCPS_CLOSED; 1446 tcpstat.tcps_drops++; 1447 tp = tcp_close(tp); 1448 goto drop; 1449 case TCPS_CLOSING: 1450 case TCPS_LAST_ACK: 1451 case TCPS_TIME_WAIT: 1452 tp = tcp_close(tp); 1453 goto drop; 1454 } 1455 } 1456 1457 /* 1458 * If a SYN is in the window, then this is an 1459 * error and we send an RST and drop the connection. 1460 */ 1461 if (tiflags & TH_SYN) { 1462 tp = tcp_drop(tp, ECONNRESET); 1463 goto dropwithreset; 1464 } 1465 1466 /* 1467 * If the ACK bit is off we drop the segment and return. 1468 */ 1469 if ((tiflags & TH_ACK) == 0) { 1470 if (tp->t_flags & TF_ACKNOW) 1471 goto dropafterack; 1472 else 1473 goto drop; 1474 } 1475 1476 /* 1477 * Ack processing. 1478 */ 1479 switch (tp->t_state) { 1480 1481 /* 1482 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1483 * ESTABLISHED state and continue processing. 1484 * The ACK was checked above. 1485 */ 1486 case TCPS_SYN_RECEIVED: 1487 tcpstat.tcps_connects++; 1488 soisconnected(so); 1489 tp->t_state = TCPS_ESTABLISHED; 1490 /* Do window scaling? */ 1491 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1492 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1493 tp->snd_scale = tp->requested_s_scale; 1494 tp->rcv_scale = tp->request_r_scale; 1495 } 1496 (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0, 1497 &tlen); 1498 tp->snd_wl1 = th->th_seq - 1; 1499 /* fall into ... */ 1500 1501 /* 1502 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1503 * ACKs. 
If the ack is in the range 1504 * tp->snd_una < th->th_ack <= tp->snd_max 1505 * then advance tp->snd_una to th->th_ack and drop 1506 * data from the retransmission queue. If this ACK reflects 1507 * more up to date window information we update our window information. 1508 */ 1509 case TCPS_ESTABLISHED: 1510 case TCPS_FIN_WAIT_1: 1511 case TCPS_FIN_WAIT_2: 1512 case TCPS_CLOSE_WAIT: 1513 case TCPS_CLOSING: 1514 case TCPS_LAST_ACK: 1515 case TCPS_TIME_WAIT: 1516 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1517 /* 1518 * Duplicate/old ACK processing. 1519 * Increments t_dupacks: 1520 * Pure duplicate (same seq/ack/window, no data) 1521 * Doesn't affect t_dupacks: 1522 * Data packets. 1523 * Normal window updates (window opens) 1524 * Resets t_dupacks: 1525 * New data ACKed. 1526 * Window shrinks 1527 * Old ACK 1528 */ 1529 if (tlen) 1530 break; 1531 /* 1532 * If we get an old ACK, there is probably packet 1533 * reordering going on. Be conservative and reset 1534 * t_dupacks so that we are less agressive in 1535 * doing a fast retransmit. 1536 */ 1537 if (th->th_ack != tp->snd_una) { 1538 tp->t_dupacks = 0; 1539 break; 1540 } 1541 if (tiwin == tp->snd_wnd) { 1542 tcpstat.tcps_rcvdupack++; 1543 /* 1544 * If we have outstanding data (other than 1545 * a window probe), this is a completely 1546 * duplicate ack (ie, window info didn't 1547 * change), the ack is the biggest we've 1548 * seen and we've seen exactly our rexmt 1549 * threshhold of them, assume a packet 1550 * has been dropped and retransmit it. 1551 * Kludge snd_nxt & the congestion 1552 * window so we send only this one 1553 * packet. 1554 * 1555 * We know we're losing at the current 1556 * window size so do congestion avoidance 1557 * (set ssthresh to half the current window 1558 * and pull our congestion window back to 1559 * the new ssthresh). 
1560 * 1561 * Dup acks mean that packets have left the 1562 * network (they're now cached at the receiver) 1563 * so bump cwnd by the amount in the receiver 1564 * to keep a constant cwnd packets in the 1565 * network. 1566 */ 1567 if (tp->t_timer[TCPT_REXMT] == 0) 1568 tp->t_dupacks = 0; 1569 #if defined(TCP_SACK) && defined(TCP_FACK) 1570 /* 1571 * In FACK, can enter fast rec. if the receiver 1572 * reports a reass. queue longer than 3 segs. 1573 */ 1574 else if (++tp->t_dupacks == tcprexmtthresh || 1575 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 1576 tp->t_maxseg + tp->snd_una)) && 1577 SEQ_GT(tp->snd_una, tp->snd_last))) { 1578 #else 1579 else if (++tp->t_dupacks == tcprexmtthresh) { 1580 #endif /* TCP_FACK */ 1581 tcp_seq onxt = tp->snd_nxt; 1582 u_long win = 1583 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1584 2 / tp->t_maxseg; 1585 1586 #if defined(TCP_SACK) 1587 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1588 /* 1589 * False fast retx after 1590 * timeout. Do not cut window. 1591 */ 1592 tp->t_dupacks = 0; 1593 goto drop; 1594 } 1595 #endif 1596 if (win < 2) 1597 win = 2; 1598 tp->snd_ssthresh = win * tp->t_maxseg; 1599 #if defined(TCP_SACK) 1600 tp->snd_last = tp->snd_max; 1601 #endif 1602 #ifdef TCP_SACK 1603 if (!tp->sack_disable) { 1604 tp->t_timer[TCPT_REXMT] = 0; 1605 tp->t_rtt = 0; 1606 tcpstat.tcps_sndrexmitfast++; 1607 #if defined(TCP_SACK) && defined(TCP_FACK) 1608 tp->t_dupacks = tcprexmtthresh; 1609 (void) tcp_output(tp); 1610 /* 1611 * During FR, snd_cwnd is held 1612 * constant for FACK. 1613 */ 1614 tp->snd_cwnd = tp->snd_ssthresh; 1615 #else 1616 /* 1617 * tcp_output() will send 1618 * oldest SACK-eligible rtx. 
1619 */ 1620 (void) tcp_output(tp); 1621 tp->snd_cwnd = tp->snd_ssthresh+ 1622 tp->t_maxseg * tp->t_dupacks; 1623 #endif /* TCP_FACK */ 1624 goto drop; 1625 } 1626 #endif /* TCP_SACK */ 1627 tp->t_timer[TCPT_REXMT] = 0; 1628 tp->t_rtt = 0; 1629 tp->snd_nxt = th->th_ack; 1630 tp->snd_cwnd = tp->t_maxseg; 1631 tcpstat.tcps_sndrexmitfast++; 1632 (void) tcp_output(tp); 1633 1634 tp->snd_cwnd = tp->snd_ssthresh + 1635 tp->t_maxseg * tp->t_dupacks; 1636 if (SEQ_GT(onxt, tp->snd_nxt)) 1637 tp->snd_nxt = onxt; 1638 goto drop; 1639 } else if (tp->t_dupacks > tcprexmtthresh) { 1640 #if defined(TCP_SACK) && defined(TCP_FACK) 1641 /* 1642 * while (awnd < cwnd) 1643 * sendsomething(); 1644 */ 1645 if (!tp->sack_disable) { 1646 if (tp->snd_awnd < tp->snd_cwnd) 1647 tcp_output(tp); 1648 goto drop; 1649 } 1650 #endif /* TCP_FACK */ 1651 tp->snd_cwnd += tp->t_maxseg; 1652 (void) tcp_output(tp); 1653 goto drop; 1654 } 1655 } else if (tiwin < tp->snd_wnd) { 1656 /* 1657 * The window was retracted! Previous dup 1658 * ACKs may have been due to packets arriving 1659 * after the shrunken window, not a missing 1660 * packet, so play it safe and reset t_dupacks 1661 */ 1662 tp->t_dupacks = 0; 1663 } 1664 break; 1665 } 1666 /* 1667 * If the congestion window was inflated to account 1668 * for the other side's cached packets, retract it. 
1669 */ 1670 #if defined(TCP_SACK) 1671 if (!tp->sack_disable) { 1672 if (tp->t_dupacks >= tcprexmtthresh) { 1673 /* Check for a partial ACK */ 1674 if (tcp_sack_partialack(tp, th)) { 1675 #if defined(TCP_SACK) && defined(TCP_FACK) 1676 /* Force call to tcp_output */ 1677 if (tp->snd_awnd < tp->snd_cwnd) 1678 needoutput = 1; 1679 #else 1680 tp->snd_cwnd += tp->t_maxseg; 1681 needoutput = 1; 1682 #endif /* TCP_FACK */ 1683 } else { 1684 /* Out of fast recovery */ 1685 tp->snd_cwnd = tp->snd_ssthresh; 1686 if (tcp_seq_subtract(tp->snd_max, 1687 th->th_ack) < tp->snd_ssthresh) 1688 tp->snd_cwnd = 1689 tcp_seq_subtract(tp->snd_max, 1690 th->th_ack); 1691 tp->t_dupacks = 0; 1692 #if defined(TCP_SACK) && defined(TCP_FACK) 1693 if (SEQ_GT(th->th_ack, tp->snd_fack)) 1694 tp->snd_fack = th->th_ack; 1695 #endif /* TCP_FACK */ 1696 } 1697 } 1698 } else { 1699 if (tp->t_dupacks >= tcprexmtthresh && 1700 !tcp_newreno(tp, th)) { 1701 /* Out of fast recovery */ 1702 tp->snd_cwnd = tp->snd_ssthresh; 1703 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1704 tp->snd_ssthresh) 1705 tp->snd_cwnd = 1706 tcp_seq_subtract(tp->snd_max, 1707 th->th_ack); 1708 tp->t_dupacks = 0; 1709 } 1710 } 1711 if (tp->t_dupacks < tcprexmtthresh) 1712 tp->t_dupacks = 0; 1713 #else /* else no TCP_SACK */ 1714 if (tp->t_dupacks >= tcprexmtthresh && 1715 tp->snd_cwnd > tp->snd_ssthresh) 1716 tp->snd_cwnd = tp->snd_ssthresh; 1717 tp->t_dupacks = 0; 1718 #endif 1719 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1720 tcpstat.tcps_rcvacktoomuch++; 1721 goto dropafterack; 1722 } 1723 acked = th->th_ack - tp->snd_una; 1724 tcpstat.tcps_rcvackpack++; 1725 tcpstat.tcps_rcvackbyte += acked; 1726 1727 /* 1728 * If we have a timestamp reply, update smoothed 1729 * round trip time. If no timestamp is present but 1730 * transmit timer is running and timed sequence 1731 * number was acked, update smoothed round trip time. 
1732 * Since we now have an rtt measurement, cancel the 1733 * timer backoff (cf., Phil Karn's retransmit alg.). 1734 * Recompute the initial retransmit timer. 1735 */ 1736 if (ts_present) 1737 tcp_xmit_timer(tp, tcp_now-ts_ecr+1); 1738 else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq)) 1739 tcp_xmit_timer(tp,tp->t_rtt); 1740 1741 /* 1742 * If all outstanding data is acked, stop retransmit 1743 * timer and remember to restart (more output or persist). 1744 * If there is more data to be acked, restart retransmit 1745 * timer, using current (possibly backed-off) value. 1746 */ 1747 if (th->th_ack == tp->snd_max) { 1748 tp->t_timer[TCPT_REXMT] = 0; 1749 needoutput = 1; 1750 } else if (tp->t_timer[TCPT_PERSIST] == 0) 1751 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 1752 /* 1753 * When new data is acked, open the congestion window. 1754 * If the window gives us less than ssthresh packets 1755 * in flight, open exponentially (maxseg per packet). 1756 * Otherwise open linearly: maxseg per window 1757 * (maxseg^2 / cwnd per packet). 1758 */ 1759 { 1760 register u_int cw = tp->snd_cwnd; 1761 register u_int incr = tp->t_maxseg; 1762 1763 if (cw > tp->snd_ssthresh) 1764 incr = incr * incr / cw; 1765 #if defined (TCP_SACK) 1766 if (tp->t_dupacks < tcprexmtthresh) 1767 #endif 1768 tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale); 1769 } 1770 ND6_HINT(tp); 1771 if (acked > so->so_snd.sb_cc) { 1772 tp->snd_wnd -= so->so_snd.sb_cc; 1773 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 1774 ourfinisacked = 1; 1775 } else { 1776 sbdrop(&so->so_snd, acked); 1777 tp->snd_wnd -= acked; 1778 ourfinisacked = 0; 1779 } 1780 if (sb_notify(&so->so_snd)) 1781 sowwakeup(so); 1782 tp->snd_una = th->th_ack; 1783 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1784 tp->snd_nxt = tp->snd_una; 1785 #if defined (TCP_SACK) && defined (TCP_FACK) 1786 if (SEQ_GT(tp->snd_una, tp->snd_fack)) { 1787 tp->snd_fack = tp->snd_una; 1788 /* Update snd_awnd for partial ACK 1789 * without any SACK blocks. 
1790 */ 1791 tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, 1792 tp->snd_fack) + tp->retran_data; 1793 } 1794 #endif 1795 1796 switch (tp->t_state) { 1797 1798 /* 1799 * In FIN_WAIT_1 STATE in addition to the processing 1800 * for the ESTABLISHED state if our FIN is now acknowledged 1801 * then enter FIN_WAIT_2. 1802 */ 1803 case TCPS_FIN_WAIT_1: 1804 if (ourfinisacked) { 1805 /* 1806 * If we can't receive any more 1807 * data, then closing user can proceed. 1808 * Starting the timer is contrary to the 1809 * specification, but if we don't get a FIN 1810 * we'll hang forever. 1811 */ 1812 if (so->so_state & SS_CANTRCVMORE) { 1813 soisdisconnected(so); 1814 tp->t_timer[TCPT_2MSL] = tcp_maxidle; 1815 } 1816 tp->t_state = TCPS_FIN_WAIT_2; 1817 } 1818 break; 1819 1820 /* 1821 * In CLOSING STATE in addition to the processing for 1822 * the ESTABLISHED state if the ACK acknowledges our FIN 1823 * then enter the TIME-WAIT state, otherwise ignore 1824 * the segment. 1825 */ 1826 case TCPS_CLOSING: 1827 if (ourfinisacked) { 1828 tp->t_state = TCPS_TIME_WAIT; 1829 tcp_canceltimers(tp); 1830 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1831 soisdisconnected(so); 1832 } 1833 break; 1834 1835 /* 1836 * In LAST_ACK, we may still be waiting for data to drain 1837 * and/or to be acked, as well as for the ack of our FIN. 1838 * If our FIN is now acknowledged, delete the TCB, 1839 * enter the closed state and return. 1840 */ 1841 case TCPS_LAST_ACK: 1842 if (ourfinisacked) { 1843 tp = tcp_close(tp); 1844 goto drop; 1845 } 1846 break; 1847 1848 /* 1849 * In TIME_WAIT state the only thing that should arrive 1850 * is a retransmission of the remote FIN. Acknowledge 1851 * it and restart the finack timer. 1852 */ 1853 case TCPS_TIME_WAIT: 1854 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 1855 goto dropafterack; 1856 } 1857 } 1858 1859 step6: 1860 /* 1861 * Update window information. 1862 * Don't look at window if no ACK: TAC's send garbage on first SYN. 
1863 */ 1864 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 1865 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) || 1866 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) { 1867 /* keep track of pure window updates */ 1868 if (tlen == 0 && 1869 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1870 tcpstat.tcps_rcvwinupd++; 1871 tp->snd_wnd = tiwin; 1872 tp->snd_wl1 = th->th_seq; 1873 tp->snd_wl2 = th->th_ack; 1874 if (tp->snd_wnd > tp->max_sndwnd) 1875 tp->max_sndwnd = tp->snd_wnd; 1876 needoutput = 1; 1877 } 1878 1879 /* 1880 * Process segments with URG. 1881 */ 1882 if ((tiflags & TH_URG) && th->th_urp && 1883 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1884 /* 1885 * This is a kludge, but if we receive and accept 1886 * random urgent pointers, we'll crash in 1887 * soreceive. It's hard to imagine someone 1888 * actually wanting to send this much urgent data. 1889 */ 1890 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1891 th->th_urp = 0; /* XXX */ 1892 tiflags &= ~TH_URG; /* XXX */ 1893 goto dodata; /* XXX */ 1894 } 1895 /* 1896 * If this segment advances the known urgent pointer, 1897 * then mark the data stream. This should not happen 1898 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1899 * a FIN has been received from the remote side. 1900 * In these states we ignore the URG. 1901 * 1902 * According to RFC961 (Assigned Protocols), 1903 * the urgent pointer points to the last octet 1904 * of urgent data. We continue, however, 1905 * to consider it to indicate the first octet 1906 * of data past the urgent section as the original 1907 * spec states (in one of two places). 
1908 */ 1909 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1910 tp->rcv_up = th->th_seq + th->th_urp; 1911 so->so_oobmark = so->so_rcv.sb_cc + 1912 (tp->rcv_up - tp->rcv_nxt) - 1; 1913 if (so->so_oobmark == 0) 1914 so->so_state |= SS_RCVATMARK; 1915 sohasoutofband(so); 1916 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1917 } 1918 /* 1919 * Remove out of band data so doesn't get presented to user. 1920 * This can happen independent of advancing the URG pointer, 1921 * but if two URG's are pending at once, some out-of-band 1922 * data may creep in... ick. 1923 */ 1924 if (th->th_urp <= (u_int16_t) tlen 1925 #ifdef SO_OOBINLINE 1926 && (so->so_options & SO_OOBINLINE) == 0 1927 #endif 1928 ) 1929 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 1930 } else 1931 /* 1932 * If no out of band data is expected, 1933 * pull receive urgent pointer along 1934 * with the receive window. 1935 */ 1936 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1937 tp->rcv_up = tp->rcv_nxt; 1938 dodata: /* XXX */ 1939 1940 /* 1941 * Process the segment text, merging it into the TCP sequencing queue, 1942 * and arranging for acknowledgment of receipt if necessary. 1943 * This process logically involves adjusting tp->rcv_wnd as data 1944 * is presented to the user (this happens in tcp_usrreq.c, 1945 * case PRU_RCVD). If a FIN has already been received on this 1946 * connection then we just ignore the text. 
1947 */ 1948 if ((tlen || (tiflags & TH_FIN)) && 1949 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1950 if (th->th_seq == tp->rcv_nxt && tp->segq.lh_first == NULL && 1951 tp->t_state == TCPS_ESTABLISHED) { 1952 if (th->th_flags & TH_PUSH) 1953 tp->t_flags |= TF_ACKNOW; 1954 else 1955 tp->t_flags |= TF_DELACK; 1956 tp->rcv_nxt += tlen; 1957 tiflags = th->th_flags & TH_FIN; 1958 tcpstat.tcps_rcvpack++; 1959 tcpstat.tcps_rcvbyte += tlen; 1960 ND6_HINT(tp); 1961 m_adj(m, hdroptlen); 1962 sbappend(&so->so_rcv, m); 1963 sorwakeup(so); 1964 } else { 1965 m_adj(m, hdroptlen); 1966 tiflags = tcp_reass(tp, th, m, &tlen); 1967 tp->t_flags |= TF_ACKNOW; 1968 } 1969 #ifdef TCP_SACK 1970 if (!tp->sack_disable) 1971 tcp_update_sack_list(tp); 1972 #endif 1973 1974 /* 1975 * variable len never referenced again in modern BSD, 1976 * so why bother computing it ?? 1977 */ 1978 #if 0 1979 /* 1980 * Note the amount of data that peer has sent into 1981 * our window, in order to estimate the sender's 1982 * buffer size. 1983 */ 1984 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1985 #endif /* 0 */ 1986 } else { 1987 m_freem(m); 1988 tiflags &= ~TH_FIN; 1989 } 1990 1991 /* 1992 * If FIN is received ACK the FIN and let the user know 1993 * that the connection is closing. Ignore a FIN received before 1994 * the connection is fully established. 1995 */ 1996 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1997 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1998 socantrcvmore(so); 1999 tp->t_flags |= TF_ACKNOW; 2000 tp->rcv_nxt++; 2001 } 2002 switch (tp->t_state) { 2003 2004 /* 2005 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2006 */ 2007 case TCPS_ESTABLISHED: 2008 tp->t_state = TCPS_CLOSE_WAIT; 2009 break; 2010 2011 /* 2012 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2013 * enter the CLOSING state. 
2014 */ 2015 case TCPS_FIN_WAIT_1: 2016 tp->t_state = TCPS_CLOSING; 2017 break; 2018 2019 /* 2020 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2021 * starting the time-wait timer, turning off the other 2022 * standard timers. 2023 */ 2024 case TCPS_FIN_WAIT_2: 2025 tp->t_state = TCPS_TIME_WAIT; 2026 tcp_canceltimers(tp); 2027 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 2028 soisdisconnected(so); 2029 break; 2030 2031 /* 2032 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2033 */ 2034 case TCPS_TIME_WAIT: 2035 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; 2036 break; 2037 } 2038 } 2039 if (so->so_options & SO_DEBUG) { 2040 switch (tp->pf == PF_INET6) { 2041 #ifdef INET6 2042 case PF_INET6: 2043 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6, 2044 0, tlen); 2045 break; 2046 #endif /* INET6 */ 2047 case PF_INET: 2048 tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti, 2049 0, tlen); 2050 break; 2051 } 2052 } 2053 2054 /* 2055 * Return any desired output. 2056 */ 2057 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2058 (void) tcp_output(tp); 2059 } 2060 return; 2061 2062 dropafterack: 2063 /* 2064 * Generate an ACK dropping incoming segment if it occupies 2065 * sequence space, where the ACK reflects our state. 2066 */ 2067 if (tiflags & TH_RST) 2068 goto drop; 2069 m_freem(m); 2070 tp->t_flags |= TF_ACKNOW; 2071 (void) tcp_output(tp); 2072 return; 2073 2074 dropwithreset_ratelim: 2075 /* 2076 * We may want to rate-limit RSTs in certain situations, 2077 * particularly if we are sending an RST in response to 2078 * an attempt to connect to or otherwise communicate with 2079 * a port for which we have no socket. 2080 */ 2081 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2082 tcp_rst_ppslim) == 0) { 2083 /* XXX stat */ 2084 goto drop; 2085 } 2086 /* ...fall into dropwithreset... */ 2087 2088 dropwithreset: 2089 /* 2090 * Generate a RST, dropping incoming segment. 2091 * Make ACK acceptable to originator of segment. 
	 * Don't bother to respond if destination was broadcast/multicast.
	 */
	if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
		goto drop;
	switch (af) {
#ifdef INET6
	case AF_INET6:
		/* For following calls to tcp_respond */
		if (IN6_IS_ADDR_MULTICAST(&ipv6->ip6_dst))
			goto drop;
		break;
#endif /* INET6 */
	case AF_INET:
		if (IN_MULTICAST(ip->ip_dst.s_addr))
			goto drop;
	}
	if (tiflags & TH_ACK) {
		/* Segment carried an ACK: RST must use its ack number. */
		tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack,
		    TH_RST);
	} else {
		/* SYN counts as one octet of sequence space. */
		if (tiflags & TH_SYN)
			tlen++;
		tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
		    (tcp_seq)0, TH_RST|TH_ACK);
	}
	/* destroy temporarily created socket */
	if (dropsocket)
		(void) soabort(so);
	return;

drop:
#ifdef IPSEC
	if (tdbi)
		free(tdbi, M_TEMP);
#endif

	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
		switch (tp->pf) {
#ifdef INET6
		case PF_INET6:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
			    0, tlen);
			break;
#endif /* INET6 */
		case PF_INET:
			tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
			    0, tlen);
			break;
		}
	}

	m_freem(m);
	/* destroy temporarily created socket */
	if (dropsocket)
		(void) soabort(so);
	return;
#ifndef TUBA_INCLUDE
}

/*
 * Parse the TCP options found at cp (cnt bytes of option data).
 * Side effects on tp: may set TF_RCVD_SCALE/requested_s_scale (window
 * scale), TF_RCVD_TSTMP/ts_recent/ts_recent_age (timestamps), and
 * TF_SACK_PERMIT (SACK permitted).  A timestamp option is also returned
 * to the caller through *ts_present/*ts_val/*ts_ecr.  On a SYN,
 * t_maxseg is (re)computed from any MSS option via tcp_mss().
 */
void
tcp_dooptions(tp, cp, cnt, th, ts_present, ts_val, ts_ecr)
	struct tcpcb *tp;
	u_char *cp;
	int cnt;
	struct tcphdr *th;
	int *ts_present;
	u_int32_t *ts_val, *ts_ecr;
{
	u_int16_t mss = 0;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			/* malformed option length: stop parsing */
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			/* MSS is only legal on a SYN */
			if (!(th->th_flags & TH_SYN))
				continue;
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			/* window scale is only legal on a SYN */
			if (!(th->th_flags & TH_SYN))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			*ts_present = 1;
			bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val));
			NTOHL(*ts_val);
			bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr));
			NTOHL(*ts_ecr);

			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			if (th->th_flags & TH_SYN) {
				tp->t_flags |= TF_RCVD_TSTMP;
				tp->ts_recent = *ts_val;
				tp->ts_recent_age = tcp_now;
			}
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED)
				continue;
			if (th->th_flags & TH_SYN)
				/* MUST only be set on SYN */
				tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			if (tcp_sack_option(tp, th, cp, optlen))
				continue;
			break;
#endif
		}
	}
	/* Update t_maxopd and t_maxseg after all options are processed */
	if (th->th_flags & TH_SYN) {
		(void) tcp_mss(tp, mss);	/* sets t_maxseg */

		if (mss)
			tcp_mss_update(tp);
	}
}

#if defined(TCP_SACK)
/*
 * Sequence-space subtraction a - b (modulo 2^32 wraparound).
 * NOTE(review): the intermediate (long) cast matters only where long is
 * wider than 32 bits; the value is returned as u_long either way.
 */
u_long
tcp_seq_subtract(a, b)
	u_long a, b;
{
	return ((long)(a - b));
}
#endif


#ifdef TCP_SACK
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 * NOTE(review): tp->rcv_laststart/rcv_lastend appear to hold the sequence
 * span of the segment just received -- set by the caller; confirm.
 */
void
tcp_update_sack_list(tp)
	struct tcpcb *tp;
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	int i, j = 0, count = 0, lastpos = -1;
	/* temp[] collects surviving blocks while the list is compacted */
	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];

	/* First clean up current list of sacks */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		/* (0,0) marks an empty slot */
		if (sack.start == 0 && sack.end == 0) {
			count++;	/* count = number of blocks to be discarded */
			continue;
		}
		/* block already fully delivered in-order: discard it */
		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			count++;
		} else {
			temp[j].start = tp->sackblks[i].start;
			temp[j++].end = tp->sackblks[i].end;
		}
	}
	tp->rcv_numsacks -= count;
	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
		tcp_clean_sackreport(tp);
		if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
			/* ==> need first sack block */
			tp->sackblks[0].start = tp->rcv_laststart;
			tp->sackblks[0].end = tp->rcv_lastend;
			tp->rcv_numsacks = 1;
		}
		return;
	}
	/* Otherwise, sack blocks are already present. */
	for (i = 0; i < tp->rcv_numsacks; i++)
		tp->sackblks[i] = temp[i]; /* first copy back sack list */
	if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
		return;     /* sack list remains unchanged */
	/*
	 * From here, segment just received should be (part of) the 1st sack.
	 * Go through list, possibly coalescing sack block entries.
	 */
	firstsack.start = tp->rcv_laststart;
	firstsack.end = tp->rcv_lastend;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		sack = tp->sackblks[i];
		if (SEQ_LT(sack.end, firstsack.start) ||
		    SEQ_GT(sack.start, firstsack.end))
			continue; /* no overlap */
		if (sack.start == firstsack.start && sack.end == firstsack.end){
			/*
			 * identical block; delete it here since we will
			 * move it to the front of the list.
			 */
			tp->sackblks[i].start = tp->sackblks[i].end = 0;
			lastpos = i;	/* last posn with a zero entry */
			continue;
		}
		if (SEQ_LEQ(sack.start, firstsack.start))
			firstsack.start = sack.start; /* merge blocks */
		if (SEQ_GEQ(sack.end, firstsack.end))
			firstsack.end = sack.end;     /* merge blocks */
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
		lastpos = i;    /* last posn with a zero entry */
	}
	if (lastpos != -1) {	/* at least one merge */
		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
			sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			temp[j++] = sack;
		}
		tp->rcv_numsacks = j; /* including first blk (added later) */
		for (i = 1; i < tp->rcv_numsacks; i++)	/* now copy back */
			tp->sackblks[i] = temp[i];
	} else {	/* no merges -- shift sacks by 1 */
		if (tp->rcv_numsacks < MAX_SACK_BLKS)
			tp->rcv_numsacks++;
		for (i = tp->rcv_numsacks-1; i > 0; i--)
			tp->sackblks[i] = tp->sackblks[i-1];
	}
	/* newly received span always reported first (RFC 2018) */
	tp->sackblks[0] = firstsack;
	return;
}

/*
 * Process the TCP SACK option.  Returns 1 if tcp_dooptions() should continue,
 * and 0 otherwise, if the option was fine.  tp->snd_holes is an ordered list
 * of holes (oldest to newest, in terms of the sequence space).
 */
int
tcp_sack_option(tp, th, cp, optlen)
	struct tcpcb *tp;
	struct tcphdr *th;
	u_char *cp;
	int optlen;
{
	int tmp_olen;
	u_char *tmp_cp;
	struct sackhole *cur, *p, *temp;

	if (tp->sack_disable)
		return 1;

	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
		return 1;
	tmp_cp = cp + 2;
	tmp_olen = optlen - 2;
	if (tp->snd_numholes < 0)
		tp->snd_numholes = 0;
	if (tp->t_maxseg == 0)
		panic("tcp_sack_option"); /* Should never happen */
	/* Walk each SACK block carried in the option. */
	while (tmp_olen > 0) {
		struct sackblk sack;

		/* Option data is unaligned: copy out, then byte-swap. */
		bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
		NTOHL(sack.start);
		bcopy((char *) tmp_cp + sizeof(tcp_seq),
		    (char *) &(sack.end), sizeof(tcp_seq));
		NTOHL(sack.end);
		tmp_olen -= TCPOLEN_SACK;
		tmp_cp += TCPOLEN_SACK;
		if (SEQ_LEQ(sack.end, sack.start))
			continue; /* bad SACK fields */
		if (SEQ_LEQ(sack.end, tp->snd_una))
			continue; /* old block */
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Updates snd_fack. */
		if (SEQ_GEQ(sack.end, tp->snd_fack))
			tp->snd_fack = sack.end;
#endif /* TCP_FACK */
		/* Ignore blocks at or below the highest cumulative ack. */
		if (SEQ_GT(th->th_ack, tp->snd_una)) {
			if (SEQ_LT(sack.start, th->th_ack))
				continue;
		} else {
			if (SEQ_LT(sack.start, tp->snd_una))
				continue;
		}
		if (SEQ_GT(sack.end, tp->snd_max))
			continue;
		if (tp->snd_holes == 0) { /* first hole */
			tp->snd_holes = (struct sackhole *)
			    malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT);
			if (tp->snd_holes == NULL) {
				/* ENOBUFS, so ignore SACKed block for now*/
				continue;
			}
			/* Hole spans from the cumulative ack to sack.start. */
			cur = tp->snd_holes;
			cur->start = th->th_ack;
			cur->end = sack.start;
			cur->rxmit = cur->start;
			cur->next = 0;
			tp->snd_numholes = 1;
			tp->rcv_lastsack = sack.end;
			/*
			 * dups is at least one.  If more data has been
			 * SACKed, it can be greater than one.
			 */
			cur->dups = min(tcprexmtthresh,
			    ((sack.end - cur->end)/tp->t_maxseg));
			if (cur->dups < 1)
				cur->dups = 1;
			continue; /* with next sack block */
		}
		/* Go thru list of holes:  p = previous,  cur = current */
		p = cur = tp->snd_holes;
		while (cur) {
			if (SEQ_LEQ(sack.end, cur->start))
				/* SACKs data before the current hole */
				break; /* no use going through more holes */
			if (SEQ_GEQ(sack.start, cur->end)) {
				/* SACKs data beyond the current hole */
				cur->dups++;
				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
					tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LEQ(sack.start, cur->start)) {
				/* Data acks at least the beginning of hole */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(sack.end, cur->rxmit))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    cur->start);
				else
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    cur->start);
#endif /* TCP_FACK */
				if (SEQ_GEQ(sack.end,cur->end)){
					/* Acks entire hole, so delete hole */
					if (p != cur) {
						p->next = cur->next;
						free(cur, M_PCB);
						cur = p->next;
					} else {
						/* cur is the list head. */
						cur=cur->next;
						free(p, M_PCB);
						p = cur;
						tp->snd_holes = p;
					}
					tp->snd_numholes--;
					continue;
				}
				/* otherwise, move start of hole forward */
				cur->start = sack.end;
				cur->rxmit = max (cur->rxmit, cur->start);
				p = cur;
				cur = cur->next;
				continue;
			}
			/* move end of hole backward */
			if (SEQ_GEQ(sack.end, cur->end)) {
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				cur->end = sack.start;
				cur->rxmit = min (cur->rxmit, cur->end);
				cur->dups++;
				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
					tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				p = cur;
				cur = cur->next;
				continue;
			}
			if (SEQ_LT(cur->start, sack.start) &&
			    SEQ_GT(cur->end, sack.end)) {
				/*
				 * ACKs some data in middle of a hole; need to
				 * split current hole
				 */
				temp = (struct sackhole *)malloc(sizeof(*temp),
				    M_PCB,M_NOWAIT);
				if (temp == NULL)
					continue; /* ENOBUFS */
#if defined(TCP_SACK) && defined(TCP_FACK)
				if (SEQ_GT(cur->rxmit, sack.end))
					tp->retran_data -=
					    tcp_seq_subtract(sack.end,
					    sack.start);
				else if (SEQ_GT(cur->rxmit, sack.start))
					tp->retran_data -=
					    tcp_seq_subtract(cur->rxmit,
					    sack.start);
#endif /* TCP_FACK */
				/* New hole takes the upper fragment ... */
				temp->next = cur->next;
				temp->start = sack.end;
				temp->end = cur->end;
				temp->dups = cur->dups;
				temp->rxmit = max (cur->rxmit, temp->start);
				/* ... old hole shrinks to the lower one. */
				cur->end = sack.start;
				cur->rxmit = min (cur->rxmit, cur->end);
				cur->dups++;
				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
					tcprexmtthresh)
					cur->dups = tcprexmtthresh;
				cur->next = temp;
				p = temp;
				cur = p->next;
				tp->snd_numholes++;
			}
		}
		/* At this point, p points to the last hole on the list */
		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
			/*
			 * Need to append new hole at end.
			 * Last hole is p (and it's not NULL).
			 */
			temp = (struct sackhole *) malloc(sizeof(*temp),
			    M_PCB, M_NOWAIT);
			if (temp == NULL)
				continue; /* ENOBUFS */
			temp->start = tp->rcv_lastsack;
			temp->end = sack.start;
			temp->dups = min(tcprexmtthresh,
				((sack.end - sack.start)/tp->t_maxseg));
			if (temp->dups < 1)
				temp->dups = 1;
			temp->rxmit = temp->start;
			temp->next = 0;
			p->next = temp;
			tp->rcv_lastsack = sack.end;
			tp->snd_numholes++;
		}
	}
#if defined(TCP_SACK) && defined(TCP_FACK)
	/*
	 * Update retran_data and snd_awnd.  Go through the list of
	 * holes.   Increment retran_data by (hole->rxmit - hole->start).
	 */
	tp->retran_data = 0;
	cur = tp->snd_holes;
	while (cur) {
		tp->retran_data += cur->rxmit - cur->start;
		cur = cur->next;
	}
	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
	    tp->retran_data;
#endif /* TCP_FACK */

	return 0;
}

/*
 * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
 * it is completely acked; otherwise, tcp_sack_option(), called from
 * tcp_dooptions(), will fix up the hole.
 */
void
tcp_del_sackholes(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
		/* max because this could be an older ack just arrived */
		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
			th->th_ack : tp->snd_una;
		struct sackhole *cur = tp->snd_holes;
		struct sackhole *prev = cur;
		while (cur)
			if (SEQ_LEQ(cur->end, lastack)) {
				/* Hole fully acked: unlink and free it. */
				cur = cur->next;
				free(prev, M_PCB);
				prev = cur;
				tp->snd_numholes--;
			} else if (SEQ_LT(cur->start, lastack)) {
				/* Hole partially acked: trim its start. */
				cur->start = lastack;
				if (SEQ_LT(cur->rxmit, cur->start))
					cur->rxmit = cur->start;
				break;
			} else
				break;
		tp->snd_holes = cur;
	}
}

/*
 * Delete all receiver-side SACK information.
2613 */ 2614 void 2615 tcp_clean_sackreport(tp) 2616 struct tcpcb *tp; 2617 { 2618 int i; 2619 2620 tp->rcv_numsacks = 0; 2621 for (i = 0; i < MAX_SACK_BLKS; i++) 2622 tp->sackblks[i].start = tp->sackblks[i].end=0; 2623 2624 } 2625 2626 /* 2627 * Checks for partial ack. If partial ack arrives, turn off retransmission 2628 * timer, deflate the window, do not clear tp->t_dupacks, and return 1. 2629 * If the ack advances at least to tp->snd_last, return 0. 2630 */ 2631 int 2632 tcp_sack_partialack(tp, th) 2633 struct tcpcb *tp; 2634 struct tcphdr *th; 2635 { 2636 if (SEQ_LT(th->th_ack, tp->snd_last)) { 2637 /* Turn off retx. timer (will start again next segment) */ 2638 tp->t_timer[TCPT_REXMT] = 0; 2639 tp->t_rtt = 0; 2640 #ifndef TCP_FACK 2641 /* 2642 * Partial window deflation. This statement relies on the 2643 * fact that tp->snd_una has not been updated yet. In FACK 2644 * hold snd_cwnd constant during fast recovery. 2645 */ 2646 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2647 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2648 tp->snd_cwnd += tp->t_maxseg; 2649 } else 2650 tp->snd_cwnd = tp->t_maxseg; 2651 #endif 2652 return 1; 2653 } 2654 return 0; 2655 } 2656 #endif TCP_SACK 2657 2658 /* 2659 * Pull out of band byte out of a segment so 2660 * it doesn't appear in the user's data queue. 2661 * It is still reflected in the segment length for 2662 * sequencing purposes. 
 */
void
tcp_pulloutofband(so, urgent, m, off)
	struct socket *so;
	u_int urgent;
	register struct mbuf *m;
	int off;
{
	int cnt = off + urgent - 1;

	/* Walk the mbuf chain to the mbuf holding the OOB byte. */
	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			/* Stash the urgent byte in the tcpcb ... */
			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* ... then splice it out of the mbuf data. */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	/* Urgent offset pointed past the segment: caller error. */
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
void
tcp_xmit_timer(tp, rtt)
	register struct tcpcb *tp;
	short rtt;
{
	register short delta;
	short rttmin;

	tcpstat.tcps_rttupdated++;
	--rtt;	/* adjust rtt to origin 0 */
	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 3 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 2 bits after the
		 * binary point (scaled by 4).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
	}
	tp->t_rtt = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	if (tp->t_rttmin > rtt + 2)
		rttmin = tp->t_rttmin;
	else
		rttmin = rtt + 2;
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing
 * interface without forcing IP to fragment; if bigger than
 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 * to utilize large mbufs.
 * If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * Also take into account the space needed for options that we
 * send regularly.  Make maxseg shorter by that amount to assure
 * that we can send maxseg amount of data even when the options
 * are present.  Store the upper limit of the length of options plus
 * data in maxopd.
 *
 * NOTE: offer == -1 indicates that the maxseg size changed due to
 * Path MTU discovery.
 */
int
tcp_mss(tp, offer)
	register struct tcpcb *tp;
	int offer;
{
	struct rtentry *rt;
	struct ifnet *ifp;
	int mss, mssopt;
	int iphlen;
#ifdef INET6
	int is_ipv6 = 0;
#endif
	struct inpcb *inp;

	inp = tp->t_inpcb;

	/* Start both values at the conservative default. */
	mssopt = mss = tcp_mssdflt;

	rt = in_pcbrtentry(inp);

	if (rt == NULL)
		goto out;

	ifp = rt->rt_ifp;

	/* Pick the IP header size for the connection's address family. */
	switch (tp->pf) {
#ifdef INET6
	case AF_INET6:
		iphlen = sizeof(struct ip6_hdr);
		is_ipv6 = 1;
		break;
#endif
	case AF_INET:
		iphlen = sizeof(struct ip);
		break;
	default:
		/* the family does not support path MTU discovery */
		goto out;
	}

#ifdef RTV_MTU
	/*
	 * if there's an mtu associated with the route and we support
	 * path MTU discovery for the underlying protocol family, use it.
	 */
	if (rt->rt_rmx.rmx_mtu) {
		/*
		 * One may wish to lower MSS to take into account options,
		 * especially security-related options.
		 */
		mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
	} else
#endif /* RTV_MTU */
	if (!ifp)
		/*
		 * ifp may be null and rmx_mtu may be zero in certain
		 * v6 cases (e.g., if ND wasn't able to resolve the
		 * destination host.
		 */
		goto out;
	else if (ip_mtudisc || ifp->if_flags & IFF_LOOPBACK)
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
#ifdef INET6
	else if (is_ipv6) {
		if (IN6_IS_ADDR_V4MAPPED(&inp->inp_faddr6)) {
			/* mapped addr case */
			struct in_addr d;
			bcopy(&inp->inp_faddr6.s6_addr32[3], &d, sizeof(d));
			if (in_localaddr(d))
				mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		} else {
			if (in6_localaddr(&inp->inp_faddr6))
				mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		}
	}
#endif /* INET6 */
	else if (inp && in_localaddr(inp->inp_faddr))
		mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);

	/* Calculate the value that we offer in TCPOPT_MAXSEG */
	if (offer != -1) {
		mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
		mssopt = max(tcp_mssdflt, mssopt);
	}

 out:
	/*
	 * The current mss, t_maxseg, is initialized to the default value.
	 * If we compute a smaller value, reduce the current mss.
	 * If we compute a larger value, return it for use in sending
	 * a max seg size option, but don't store it for use
	 * unless we received an offer at least that large from peer.
	 * However, do not accept offers under 32 bytes.
	 */
	if (offer > 0)
		tp->t_peermss = offer;
	if (tp->t_peermss)
		mss = min(mss, tp->t_peermss);
	mss = max(mss, 64);	/* sanity - at least max opt. space */

	/*
	 * maxopd stores the maximum length of data AND options
	 * in a segment; maxseg is the amount of data in a normal
	 * segment.  We need to store this value (maxopd) apart
	 * from maxseg, because now every segment carries options
	 * and thus we normally have somewhat less data in segments.
	 */
	tp->t_maxopd = mss;

	/* Reserve room for a timestamp option if both sides use them. */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
		mss -= TCPOLEN_TSTAMP_APPA;

	if (offer == -1) {
		/* mss changed due to Path MTU discovery */
		if (mss < tp->t_maxseg) {
			/*
			 * Follow suggestion in RFC 2414 to reduce the
			 * congestion window by the ratio of the old
			 * segment size to the new segment size.
			 */
			tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
					      mss, mss);
		}
	} else
		tp->snd_cwnd = mss;

	tp->t_maxseg = mss;

	return (offer != -1 ? mssopt : mss);
}

/*
 * Set connection variables based on the effective MSS.
 * We are passed the TCPCB for the actual connection.  If we
 * are the server, we are called by the compressed state engine
 * when the 3-way handshake is complete.  If we are the client,
 * we are called when we receive the SYN,ACK from the server.
 *
 * NOTE: The t_maxseg value must be initialized in the TCPCB
 * before this routine is called!
 */
void
tcp_mss_update(tp)
	struct tcpcb *tp;
{
	int mss, rtt;
	u_long bufsize;
	struct rtentry *rt;
	struct socket *so;

	so = tp->t_inpcb->inp_socket;
	mss = tp->t_maxseg;

	rt = in_pcbrtentry(tp->t_inpcb);

	if (rt == NULL)
		return;

#ifdef RTV_MTU	/* if route characteristics exist ... */
	/*
	 * While we're here, check if there's an initial rtt
	 * or rttvar.  Convert from the route-table units
	 * to scaled multiples of the slow timeout timer.
 */
	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
		/*
		 * XXX the lock bit for MTU indicates that the value
		 * is also a minimum value; this is subject to time.
		 */
		if (rt->rt_rmx.rmx_locks & RTV_RTT)
			TCPT_RANGESET(tp->t_rttmin,
			    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
			    TCPTV_MIN, TCPTV_REXMTMAX);
		tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
		if (rt->rt_rmx.rmx_rttvar)
			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
		else
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		/* Seed the retransmit timer from the cached estimates. */
		TCPT_RANGESET((long) tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
#endif

	/*
	 * If there's a pipesize, change the socket buffer
	 * to that size.  Make the socket buffers an integral
	 * number of mss units; if the mss is larger than
	 * the socket buffer, decrease the mss.
	 */
#ifdef RTV_SPIPE
	if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
#endif
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss) {
		mss = bufsize;
		/* Update t_maxseg and t_maxopd */
		tcp_mss(tp, mss);
	} else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}

#ifdef RTV_RPIPE
	if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
#endif
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
#ifdef RTV_RPIPE
		/* Recompute the receive window scale for the new buffer. */
		if (rt->rt_rmx.rmx_recvpipe > 0)
			tcp_rscale(tp, so->so_rcv.sb_hiwat);
#endif
	}

#ifdef RTV_SSTHRESH
	if (rt->rt_rmx.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
	}
#endif /* RTV_SSTHRESH */
}
#endif /* TUBA_INCLUDE */

#if defined (TCP_SACK)
/*
 * Checks for partial ack.  If partial ack arrives, force the retransmission
 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 * 1.  By setting snd_nxt to ti_ack, this forces retransmission timer to
 * be started again.  If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_newreno(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/*
		 * snd_una has not been updated and the socket send buffer
		 * not yet drained of the acked data, so we have to leave
		 * snd_una as it was to get the correct data offset in
		 * tcp_output().
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long  ocwnd = tp->snd_cwnd;
		tp->t_timer[TCPT_REXMT] = 0;
		tp->t_rtt = 0;
		/* Force retransmission of the first unacked segment. */
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset
		 * (tp->snd_una not yet updated when this function is called)
		 */
		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
		tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg);
		return 1;
	}
	return 0;
}
#endif /* TCP_SACK */