1 /* $OpenBSD: tcp_input.c,v 1.398 2024/01/11 13:49:49 bluhm Exp $ */ 2 /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include "pf.h" 72 73 #include <sys/param.h> 74 #include <sys/systm.h> 75 #include <sys/mbuf.h> 76 #include <sys/protosw.h> 77 #include <sys/socket.h> 78 #include <sys/socketvar.h> 79 #include <sys/timeout.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/ip.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/ip_var.h> 91 #include <netinet/tcp.h> 92 #include <netinet/tcp_fsm.h> 93 #include <netinet/tcp_seq.h> 94 #include <netinet/tcp_timer.h> 95 #include <netinet/tcp_var.h> 96 #include <netinet/tcp_debug.h> 97 98 #if NPF > 0 99 #include <net/pfvar.h> 100 #endif 101 102 struct tcpiphdr tcp_saveti; 103 104 int tcp_mss_adv(struct mbuf *, int); 105 int tcp_flush_queue(struct tcpcb *); 106 107 #ifdef INET6 108 #include <netinet6/in6_var.h> 109 #include <netinet6/nd6.h> 110 111 struct tcpipv6hdr tcp_saveti6; 112 113 /* for the packet header length in the mbuf */ 114 #define M_PH_LEN(m) (((struct mbuf *)(m))->m_pkthdr.len) 115 #define M_V6_LEN(m) (M_PH_LEN(m) - sizeof(struct ip6_hdr)) 116 #define M_V4_LEN(m) (M_PH_LEN(m) - sizeof(struct ip)) 117 #endif /* INET6 */ 118 119 int tcprexmtthresh = 3; 120 int tcptv_keep_init = TCPTV_KEEP_INIT; 121 122 int tcp_rst_ppslim = 100; /* 100pps */ 123 int tcp_rst_ppslim_count = 0; 124 struct timeval tcp_rst_ppslim_last; 125 126 int tcp_ackdrop_ppslim = 100; /* 100pps */ 127 int tcp_ackdrop_ppslim_count = 0; 128 struct timeval tcp_ackdrop_ppslim_last; 129 130 #define TCP_PAWS_IDLE TCP_TIME(24 * 24 * 60 * 60) 131 132 /* for modulo comparisons of timestamps */ 133 #define TSTMP_LT(a,b) ((int32_t)((a)-(b)) < 0) 134 #define TSTMP_GEQ(a,b) ((int32_t)((a)-(b)) >= 0) 135 136 /* for TCP SACK comparisons */ 137 #define SEQ_MIN(a,b) (SEQ_LT(a,b) ? (a) : (b)) 138 #define SEQ_MAX(a,b) (SEQ_GT(a,b) ? (a) : (b)) 139 140 /* 141 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 142 */ 143 #ifdef INET6 144 #define ND6_HINT(tp) \ 145 do { \ 146 if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \ 147 rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) { \ 148 nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt); \ 149 } \ 150 } while (0) 151 #else 152 #define ND6_HINT(tp) 153 #endif 154 155 #ifdef TCP_ECN 156 /* 157 * ECN (Explicit Congestion Notification) support based on RFC3168 158 * implementation note: 159 * snd_last is used to track a recovery phase. 160 * when cwnd is reduced, snd_last is set to snd_max. 161 * while snd_last > snd_una, the sender is in a recovery phase and 162 * its cwnd should not be reduced again. 163 * snd_last follows snd_una when not in a recovery phase. 164 */ 165 #endif 166 167 /* 168 * Macro to compute ACK transmission behavior. Delay the ACK unless 169 * we have already delayed an ACK (must send an ACK every two segments). 170 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 171 * option is enabled or when the packet is coming from a loopback 172 * interface. 173 */ 174 #define TCP_SETUP_ACK(tp, tiflags, m) \ 175 do { \ 176 struct ifnet *ifp = NULL; \ 177 if (m && (m->m_flags & M_PKTHDR)) \ 178 ifp = if_get(m->m_pkthdr.ph_ifidx); \ 179 if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \ 180 (tcp_ack_on_push && (tiflags) & TH_PUSH) || \ 181 (ifp && (ifp->if_flags & IFF_LOOPBACK))) \ 182 tp->t_flags |= TF_ACKNOW; \ 183 else \ 184 TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \ 185 if_put(ifp); \ 186 } while (0) 187 188 void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); 189 void tcp_newreno_partialack(struct tcpcb *, struct tcphdr *); 190 191 void syn_cache_put(struct syn_cache *); 192 void syn_cache_rm(struct syn_cache *); 193 int syn_cache_respond(struct syn_cache *, struct mbuf *, uint64_t); 194 void syn_cache_timer(void *); 195 void syn_cache_insert(struct syn_cache *, struct tcpcb *); 196 void syn_cache_reset(struct sockaddr *, struct sockaddr *, 197 struct tcphdr *, u_int); 198 int syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *, 199 unsigned int, struct socket *, struct mbuf *, u_char *, int, 200 struct tcp_opt_info *, tcp_seq *, uint64_t); 201 struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *, 202 struct tcphdr *, unsigned int, unsigned int, struct socket *, 203 struct mbuf *, uint64_t); 204 struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *, 205 struct syn_cache_head **, u_int); 206 207 /* 208 * Insert segment ti into reassembly queue of tcp with 209 * control block tp. Return TH_FIN if reassembly now includes 210 * a segment with FIN. The macro form does the common case inline 211 * (segment is the next to be received on an established connection, 212 * and the queue is empty), avoiding linkage into and removal 213 * from the queue and repetition of various conversions. 214 * Set DELACK for segments received in order, but ack immediately 215 * when segments are out of order (so fast retransmit can work). 216 */ 217 218 int 219 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen) 220 { 221 struct tcpqent *p, *q, *nq, *tiqe; 222 223 /* 224 * Allocate a new queue entry, before we throw away any data. 225 * If we can't, just drop the packet. XXX 226 */ 227 tiqe = pool_get(&tcpqe_pool, PR_NOWAIT); 228 if (tiqe == NULL) { 229 tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead); 230 if (tiqe != NULL && th->th_seq == tp->rcv_nxt) { 231 /* Reuse last entry since new segment fills a hole */ 232 m_freem(tiqe->tcpqe_m); 233 TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q); 234 } 235 if (tiqe == NULL || th->th_seq != tp->rcv_nxt) { 236 /* Flush segment queue for this connection */ 237 tcp_freeq(tp); 238 tcpstat_inc(tcps_rcvmemdrop); 239 m_freem(m); 240 return (0); 241 } 242 } 243 244 /* 245 * Find a segment which begins after this one does. 246 */ 247 for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL; 248 p = q, q = TAILQ_NEXT(q, tcpqe_q)) 249 if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq)) 250 break; 251 252 /* 253 * If there is a preceding segment, it may provide some of 254 * our data already. If so, drop the data from the incoming 255 * segment. If it provides all of our data, drop us. 256 */ 257 if (p != NULL) { 258 struct tcphdr *phdr = p->tcpqe_tcp; 259 int i; 260 261 /* conversion to int (in i) handles seq wraparound */ 262 i = phdr->th_seq + phdr->th_reseqlen - th->th_seq; 263 if (i > 0) { 264 if (i >= *tlen) { 265 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, 266 *tlen); 267 m_freem(m); 268 pool_put(&tcpqe_pool, tiqe); 269 return (0); 270 } 271 m_adj(m, i); 272 *tlen -= i; 273 th->th_seq += i; 274 } 275 } 276 tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen); 277 tp->t_rcvoopack++; 278 279 /* 280 * While we overlap succeeding segments trim them or, 281 * if they are completely covered, dequeue them. 282 */ 283 for (; q != NULL; q = nq) { 284 struct tcphdr *qhdr = q->tcpqe_tcp; 285 int i = (th->th_seq + *tlen) - qhdr->th_seq; 286 287 if (i <= 0) 288 break; 289 if (i < qhdr->th_reseqlen) { 290 qhdr->th_seq += i; 291 qhdr->th_reseqlen -= i; 292 m_adj(q->tcpqe_m, i); 293 break; 294 } 295 nq = TAILQ_NEXT(q, tcpqe_q); 296 m_freem(q->tcpqe_m); 297 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); 298 pool_put(&tcpqe_pool, q); 299 } 300 301 /* Insert the new segment queue entry into place. */ 302 tiqe->tcpqe_m = m; 303 th->th_reseqlen = *tlen; 304 tiqe->tcpqe_tcp = th; 305 if (p == NULL) { 306 TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q); 307 } else { 308 TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q); 309 } 310 311 if (th->th_seq != tp->rcv_nxt) 312 return (0); 313 314 return (tcp_flush_queue(tp)); 315 } 316 317 int 318 tcp_flush_queue(struct tcpcb *tp) 319 { 320 struct socket *so = tp->t_inpcb->inp_socket; 321 struct tcpqent *q, *nq; 322 int flags; 323 324 /* 325 * Present data to user, advancing rcv_nxt through 326 * completed sequence space. 327 */ 328 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 329 return (0); 330 q = TAILQ_FIRST(&tp->t_segq); 331 if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt) 332 return (0); 333 if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen) 334 return (0); 335 do { 336 tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen; 337 flags = q->tcpqe_tcp->th_flags & TH_FIN; 338 339 nq = TAILQ_NEXT(q, tcpqe_q); 340 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q); 341 ND6_HINT(tp); 342 if (so->so_rcv.sb_state & SS_CANTRCVMORE) 343 m_freem(q->tcpqe_m); 344 else 345 sbappendstream(so, &so->so_rcv, q->tcpqe_m); 346 pool_put(&tcpqe_pool, q); 347 q = nq; 348 } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt); 349 tp->t_flags |= TF_BLOCKOUTPUT; 350 sorwakeup(so); 351 tp->t_flags &= ~TF_BLOCKOUTPUT; 352 return (flags); 353 } 354 355 /* 356 * TCP input routine, follows pages 65-76 of the 357 * protocol specification dated September, 1981 very closely. 358 */ 359 int 360 tcp_input(struct mbuf **mp, int *offp, int proto, int af) 361 { 362 struct mbuf *m = *mp; 363 int iphlen = *offp; 364 struct ip *ip = NULL; 365 struct inpcb *inp = NULL; 366 u_int8_t *optp = NULL; 367 int optlen = 0; 368 int tlen, off; 369 struct tcpcb *otp = NULL, *tp = NULL; 370 int tiflags; 371 struct socket *so = NULL; 372 int todrop, acked, ourfinisacked; 373 int hdroptlen = 0; 374 short ostate; 375 caddr_t saveti; 376 tcp_seq iss, *reuse = NULL; 377 uint64_t now; 378 u_long tiwin; 379 struct tcp_opt_info opti; 380 struct tcphdr *th; 381 #ifdef INET6 382 struct ip6_hdr *ip6 = NULL; 383 #endif /* INET6 */ 384 #ifdef TCP_ECN 385 u_char iptos; 386 #endif 387 388 tcpstat_inc(tcps_rcvtotal); 389 390 opti.ts_present = 0; 391 opti.maxseg = 0; 392 now = tcp_now(); 393 394 /* 395 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 396 */ 397 if (m->m_flags & (M_BCAST|M_MCAST)) 398 goto drop; 399 400 /* 401 * Get IP and TCP header together in first mbuf. 402 * Note: IP leaves IP header in first mbuf. 403 */ 404 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th)); 405 if (!th) { 406 tcpstat_inc(tcps_rcvshort); 407 return IPPROTO_DONE; 408 } 409 410 tlen = m->m_pkthdr.len - iphlen; 411 switch (af) { 412 case AF_INET: 413 ip = mtod(m, struct ip *); 414 #ifdef TCP_ECN 415 /* save ip_tos before clearing it for checksum */ 416 iptos = ip->ip_tos; 417 #endif 418 break; 419 #ifdef INET6 420 case AF_INET6: 421 ip6 = mtod(m, struct ip6_hdr *); 422 #ifdef TCP_ECN 423 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 424 #endif 425 426 /* 427 * Be proactive about unspecified IPv6 address in source. 428 * As we use all-zero to indicate unbounded/unconnected pcb, 429 * unspecified IPv6 address can be used to confuse us. 430 * 431 * Note that packets with unspecified IPv6 destination is 432 * already dropped in ip6_input. 433 */ 434 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 435 /* XXX stat */ 436 goto drop; 437 } 438 439 /* Discard packets to multicast */ 440 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 441 /* XXX stat */ 442 goto drop; 443 } 444 break; 445 #endif 446 default: 447 unhandled_af(af); 448 } 449 450 /* 451 * Checksum extended TCP header and data. 452 */ 453 if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) { 454 int sum; 455 456 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) { 457 tcpstat_inc(tcps_rcvbadsum); 458 goto drop; 459 } 460 tcpstat_inc(tcps_inswcsum); 461 switch (af) { 462 case AF_INET: 463 sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen); 464 break; 465 #ifdef INET6 466 case AF_INET6: 467 sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), 468 tlen); 469 break; 470 #endif 471 } 472 if (sum != 0) { 473 tcpstat_inc(tcps_rcvbadsum); 474 goto drop; 475 } 476 } 477 478 /* 479 * Check that TCP offset makes sense, 480 * pull out TCP options and adjust length. XXX 481 */ 482 off = th->th_off << 2; 483 if (off < sizeof(struct tcphdr) || off > tlen) { 484 tcpstat_inc(tcps_rcvbadoff); 485 goto drop; 486 } 487 tlen -= off; 488 if (off > sizeof(struct tcphdr)) { 489 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off); 490 if (!th) { 491 tcpstat_inc(tcps_rcvshort); 492 return IPPROTO_DONE; 493 } 494 optlen = off - sizeof(struct tcphdr); 495 optp = (u_int8_t *)(th + 1); 496 /* 497 * Do quick retrieval of timestamp options ("options 498 * prediction?"). If timestamp is the only option and it's 499 * formatted as recommended in RFC 1323 appendix A, we 500 * quickly get the values now and not bother calling 501 * tcp_dooptions(), etc. 502 */ 503 if ((optlen == TCPOLEN_TSTAMP_APPA || 504 (optlen > TCPOLEN_TSTAMP_APPA && 505 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 506 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) && 507 (th->th_flags & TH_SYN) == 0) { 508 opti.ts_present = 1; 509 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4)); 510 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8)); 511 optp = NULL; /* we've parsed the options */ 512 } 513 } 514 tiflags = th->th_flags; 515 516 /* 517 * Convert TCP protocol specific fields to host format. 518 */ 519 th->th_seq = ntohl(th->th_seq); 520 th->th_ack = ntohl(th->th_ack); 521 th->th_win = ntohs(th->th_win); 522 th->th_urp = ntohs(th->th_urp); 523 524 if (th->th_dport == 0) { 525 tcpstat_inc(tcps_noport); 526 goto dropwithreset_ratelim; 527 } 528 529 /* 530 * Locate pcb for segment. 531 */ 532 #if NPF > 0 533 inp = pf_inp_lookup(m); 534 #endif 535 findpcb: 536 if (inp == NULL) { 537 switch (af) { 538 #ifdef INET6 539 case AF_INET6: 540 inp = in6_pcblookup(&tcbtable, &ip6->ip6_src, 541 th->th_sport, &ip6->ip6_dst, th->th_dport, 542 m->m_pkthdr.ph_rtableid); 543 break; 544 #endif 545 case AF_INET: 546 inp = in_pcblookup(&tcbtable, ip->ip_src, 547 th->th_sport, ip->ip_dst, th->th_dport, 548 m->m_pkthdr.ph_rtableid); 549 break; 550 } 551 } 552 if (inp == NULL) { 553 tcpstat_inc(tcps_pcbhashmiss); 554 switch (af) { 555 #ifdef INET6 556 case AF_INET6: 557 inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst, 558 th->th_dport, m, m->m_pkthdr.ph_rtableid); 559 break; 560 #endif /* INET6 */ 561 case AF_INET: 562 inp = in_pcblookup_listen(&tcbtable, ip->ip_dst, 563 th->th_dport, m, m->m_pkthdr.ph_rtableid); 564 break; 565 } 566 /* 567 * If the state is CLOSED (i.e., TCB does not exist) then 568 * all data in the incoming segment is discarded. 569 * If the TCB exists but is in CLOSED state, it is embryonic, 570 * but should either do a listen or a connect soon. 571 */ 572 } 573 #ifdef IPSEC 574 if (ipsec_in_use) { 575 struct m_tag *mtag; 576 struct tdb *tdb = NULL; 577 int error; 578 579 /* Find most recent IPsec tag */ 580 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); 581 if (mtag != NULL) { 582 struct tdb_ident *tdbi; 583 584 tdbi = (struct tdb_ident *)(mtag + 1); 585 tdb = gettdb(tdbi->rdomain, tdbi->spi, 586 &tdbi->dst, tdbi->proto); 587 } 588 error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN, 589 tdb, inp ? inp->inp_seclevel : NULL, NULL, NULL); 590 tdb_unref(tdb); 591 if (error) { 592 tcpstat_inc(tcps_rcvnosec); 593 goto drop; 594 } 595 } 596 #endif /* IPSEC */ 597 598 if (inp == NULL) { 599 tcpstat_inc(tcps_noport); 600 goto dropwithreset_ratelim; 601 } 602 603 KASSERT(sotoinpcb(inp->inp_socket) == inp); 604 KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp); 605 soassertlocked(inp->inp_socket); 606 607 /* Check the minimum TTL for socket. */ 608 switch (af) { 609 case AF_INET: 610 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) 611 goto drop; 612 break; 613 #ifdef INET6 614 case AF_INET6: 615 if (inp->inp_ip6_minhlim && 616 inp->inp_ip6_minhlim > ip6->ip6_hlim) 617 goto drop; 618 break; 619 #endif 620 } 621 622 tp = intotcpcb(inp); 623 if (tp == NULL) 624 goto dropwithreset_ratelim; 625 if (tp->t_state == TCPS_CLOSED) 626 goto drop; 627 628 /* Unscale the window into a 32-bit value. */ 629 if ((tiflags & TH_SYN) == 0) 630 tiwin = th->th_win << tp->snd_scale; 631 else 632 tiwin = th->th_win; 633 634 so = inp->inp_socket; 635 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { 636 union syn_cache_sa src; 637 union syn_cache_sa dst; 638 639 bzero(&src, sizeof(src)); 640 bzero(&dst, sizeof(dst)); 641 switch (af) { 642 case AF_INET: 643 src.sin.sin_len = sizeof(struct sockaddr_in); 644 src.sin.sin_family = AF_INET; 645 src.sin.sin_addr = ip->ip_src; 646 src.sin.sin_port = th->th_sport; 647 648 dst.sin.sin_len = sizeof(struct sockaddr_in); 649 dst.sin.sin_family = AF_INET; 650 dst.sin.sin_addr = ip->ip_dst; 651 dst.sin.sin_port = th->th_dport; 652 break; 653 #ifdef INET6 654 case AF_INET6: 655 src.sin6.sin6_len = sizeof(struct sockaddr_in6); 656 src.sin6.sin6_family = AF_INET6; 657 src.sin6.sin6_addr = ip6->ip6_src; 658 src.sin6.sin6_port = th->th_sport; 659 660 dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 661 dst.sin6.sin6_family = AF_INET6; 662 dst.sin6.sin6_addr = ip6->ip6_dst; 663 dst.sin6.sin6_port = th->th_dport; 664 break; 665 #endif /* INET6 */ 666 } 667 668 if (so->so_options & SO_DEBUG) { 669 otp = tp; 670 ostate = tp->t_state; 671 switch (af) { 672 #ifdef INET6 673 case AF_INET6: 674 saveti = (caddr_t) &tcp_saveti6; 675 memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6)); 676 memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th)); 677 break; 678 #endif 679 case AF_INET: 680 saveti = (caddr_t) &tcp_saveti; 681 memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip)); 682 memcpy(&tcp_saveti.ti_t, th, sizeof(*th)); 683 break; 684 } 685 } 686 if (so->so_options & SO_ACCEPTCONN) { 687 switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) { 688 689 case TH_SYN|TH_ACK|TH_RST: 690 case TH_SYN|TH_RST: 691 case TH_ACK|TH_RST: 692 case TH_RST: 693 syn_cache_reset(&src.sa, &dst.sa, th, 694 inp->inp_rtableid); 695 goto drop; 696 697 case TH_SYN|TH_ACK: 698 /* 699 * Received a SYN,ACK. This should 700 * never happen while we are in 701 * LISTEN. Send an RST. 702 */ 703 goto badsyn; 704 705 case TH_ACK: 706 so = syn_cache_get(&src.sa, &dst.sa, 707 th, iphlen, tlen, so, m, now); 708 if (so == NULL) { 709 /* 710 * We don't have a SYN for 711 * this ACK; send an RST. 712 */ 713 goto badsyn; 714 } else if (so == (struct socket *)(-1)) { 715 /* 716 * We were unable to create 717 * the connection. If the 718 * 3-way handshake was 719 * completed, and RST has 720 * been sent to the peer. 721 * Since the mbuf might be 722 * in use for the reply, 723 * do not free it. 724 */ 725 m = *mp = NULL; 726 goto drop; 727 } else { 728 /* 729 * We have created a 730 * full-blown connection. 731 */ 732 tp = NULL; 733 in_pcbunref(inp); 734 inp = in_pcbref(sotoinpcb(so)); 735 tp = intotcpcb(inp); 736 if (tp == NULL) 737 goto badsyn; /*XXX*/ 738 739 } 740 break; 741 742 default: 743 /* 744 * None of RST, SYN or ACK was set. 745 * This is an invalid packet for a 746 * TCB in LISTEN state. Send a RST. 747 */ 748 goto badsyn; 749 750 case TH_SYN: 751 /* 752 * Received a SYN. 753 */ 754 #ifdef INET6 755 /* 756 * If deprecated address is forbidden, we do 757 * not accept SYN to deprecated interface 758 * address to prevent any new inbound 759 * connection from getting established. 760 * When we do not accept SYN, we send a TCP 761 * RST, with deprecated source address (instead 762 * of dropping it). We compromise it as it is 763 * much better for peer to send a RST, and 764 * RST will be the final packet for the 765 * exchange. 766 * 767 * If we do not forbid deprecated addresses, we 768 * accept the SYN packet. RFC2462 does not 769 * suggest dropping SYN in this case. 770 * If we decipher RFC2462 5.5.4, it says like 771 * this: 772 * 1. use of deprecated addr with existing 773 * communication is okay - "SHOULD continue 774 * to be used" 775 * 2. use of it with new communication: 776 * (2a) "SHOULD NOT be used if alternate 777 * address with sufficient scope is 778 * available" 779 * (2b) nothing mentioned otherwise. 780 * Here we fall into (2b) case as we have no 781 * choice in our source address selection - we 782 * must obey the peer. 783 * 784 * The wording in RFC2462 is confusing, and 785 * there are multiple description text for 786 * deprecated address handling - worse, they 787 * are not exactly the same. I believe 5.5.4 788 * is the best one, so we follow 5.5.4. 789 */ 790 if (ip6 && !ip6_use_deprecated) { 791 struct in6_ifaddr *ia6; 792 struct ifnet *ifp = 793 if_get(m->m_pkthdr.ph_ifidx); 794 795 if (ifp && 796 (ia6 = in6ifa_ifpwithaddr(ifp, 797 &ip6->ip6_dst)) && 798 (ia6->ia6_flags & 799 IN6_IFF_DEPRECATED)) { 800 tp = NULL; 801 if_put(ifp); 802 goto dropwithreset; 803 } 804 if_put(ifp); 805 } 806 #endif 807 808 /* 809 * LISTEN socket received a SYN 810 * from itself? This can't possibly 811 * be valid; drop the packet. 812 */ 813 if (th->th_dport == th->th_sport) { 814 switch (af) { 815 #ifdef INET6 816 case AF_INET6: 817 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, 818 &ip6->ip6_dst)) { 819 tcpstat_inc(tcps_badsyn); 820 goto drop; 821 } 822 break; 823 #endif /* INET6 */ 824 case AF_INET: 825 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { 826 tcpstat_inc(tcps_badsyn); 827 goto drop; 828 } 829 break; 830 } 831 } 832 833 /* 834 * SYN looks ok; create compressed TCP 835 * state for it. 836 */ 837 if (so->so_qlen > so->so_qlimit || 838 syn_cache_add(&src.sa, &dst.sa, th, iphlen, 839 so, m, optp, optlen, &opti, reuse, now) 840 == -1) { 841 tcpstat_inc(tcps_dropsyn); 842 goto drop; 843 } 844 in_pcbunref(inp); 845 return IPPROTO_DONE; 846 } 847 } 848 } 849 850 #ifdef DIAGNOSTIC 851 /* 852 * Should not happen now that all embryonic connections 853 * are handled with compressed state. 854 */ 855 if (tp->t_state == TCPS_LISTEN) 856 panic("tcp_input: TCPS_LISTEN"); 857 #endif 858 859 #if NPF > 0 860 pf_inp_link(m, inp); 861 #endif 862 863 /* 864 * Segment received on connection. 865 * Reset idle time and keep-alive timer. 866 */ 867 tp->t_rcvtime = now; 868 if (TCPS_HAVEESTABLISHED(tp->t_state)) 869 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 870 871 if (tp->sack_enable) 872 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ 873 874 /* 875 * Process options. 876 */ 877 #ifdef TCP_SIGNATURE 878 if (optp || (tp->t_flags & TF_SIGNATURE)) 879 #else 880 if (optp) 881 #endif 882 if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti, 883 m->m_pkthdr.ph_rtableid, now)) 884 goto drop; 885 886 if (opti.ts_present && opti.ts_ecr) { 887 int32_t rtt_test; 888 889 /* subtract out the tcp timestamp modulator */ 890 opti.ts_ecr -= tp->ts_modulate; 891 892 /* make sure ts_ecr is sensible */ 893 rtt_test = now - opti.ts_ecr; 894 if (rtt_test < 0 || rtt_test > TCP_RTT_MAX) 895 opti.ts_ecr = 0; 896 } 897 898 #ifdef TCP_ECN 899 /* if congestion experienced, set ECE bit in subsequent packets. */ 900 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { 901 tp->t_flags |= TF_RCVD_CE; 902 tcpstat_inc(tcps_ecn_rcvce); 903 } 904 #endif 905 /* 906 * Header prediction: check for the two common cases 907 * of a uni-directional data xfer. If the packet has 908 * no control flags, is in-sequence, the window didn't 909 * change and we're not retransmitting, it's a 910 * candidate. If the length is zero and the ack moved 911 * forward, we're the sender side of the xfer. Just 912 * free the data acked & wake any higher level process 913 * that was blocked waiting for space. If the length 914 * is non-zero and the ack didn't move, we're the 915 * receiver side. If we're getting packets in-order 916 * (the reassembly queue is empty), add the data to 917 * the socket buffer and note that we need a delayed ack. 918 */ 919 if (tp->t_state == TCPS_ESTABLISHED && 920 #ifdef TCP_ECN 921 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK && 922 #else 923 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 924 #endif 925 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 926 th->th_seq == tp->rcv_nxt && 927 tiwin && tiwin == tp->snd_wnd && 928 tp->snd_nxt == tp->snd_max) { 929 930 /* 931 * If last ACK falls within this segment's sequence numbers, 932 * record the timestamp. 933 * Fix from Braden, see Stevens p. 870 934 */ 935 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 936 tp->ts_recent_age = now; 937 tp->ts_recent = opti.ts_val; 938 } 939 940 if (tlen == 0) { 941 if (SEQ_GT(th->th_ack, tp->snd_una) && 942 SEQ_LEQ(th->th_ack, tp->snd_max) && 943 tp->snd_cwnd >= tp->snd_wnd && 944 tp->t_dupacks == 0) { 945 /* 946 * this is a pure ack for outstanding data. 947 */ 948 tcpstat_inc(tcps_predack); 949 if (opti.ts_present && opti.ts_ecr) 950 tcp_xmit_timer(tp, now - opti.ts_ecr); 951 else if (tp->t_rtttime && 952 SEQ_GT(th->th_ack, tp->t_rtseq)) 953 tcp_xmit_timer(tp, now - tp->t_rtttime); 954 acked = th->th_ack - tp->snd_una; 955 tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, 956 acked); 957 tp->t_rcvacktime = now; 958 ND6_HINT(tp); 959 sbdrop(so, &so->so_snd, acked); 960 961 /* 962 * If we had a pending ICMP message that 963 * refers to data that have just been 964 * acknowledged, disregard the recorded ICMP 965 * message. 966 */ 967 if ((tp->t_flags & TF_PMTUD_PEND) && 968 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 969 tp->t_flags &= ~TF_PMTUD_PEND; 970 971 /* 972 * Keep track of the largest chunk of data 973 * acknowledged since last PMTU update 974 */ 975 if (tp->t_pmtud_mss_acked < acked) 976 tp->t_pmtud_mss_acked = acked; 977 978 tp->snd_una = th->th_ack; 979 /* Pull snd_wl2 up to prevent seq wrap. */ 980 tp->snd_wl2 = th->th_ack; 981 /* 982 * We want snd_last to track snd_una so 983 * as to avoid sequence wraparound problems 984 * for very large transfers. 985 */ 986 #ifdef TCP_ECN 987 if (SEQ_GT(tp->snd_una, tp->snd_last)) 988 #endif 989 tp->snd_last = tp->snd_una; 990 m_freem(m); 991 992 /* 993 * If all outstanding data are acked, stop 994 * retransmit timer, otherwise restart timer 995 * using current (possibly backed-off) value. 996 * If process is waiting for space, 997 * wakeup/selwakeup/signal. If data 998 * are ready to send, let tcp_output 999 * decide between more output or persist. 1000 */ 1001 if (tp->snd_una == tp->snd_max) 1002 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1003 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1004 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1005 1006 tcp_update_sndspace(tp); 1007 if (sb_notify(so, &so->so_snd)) { 1008 tp->t_flags |= TF_BLOCKOUTPUT; 1009 sowwakeup(so); 1010 tp->t_flags &= ~TF_BLOCKOUTPUT; 1011 } 1012 if (so->so_snd.sb_cc || 1013 tp->t_flags & TF_NEEDOUTPUT) 1014 (void) tcp_output(tp); 1015 in_pcbunref(inp); 1016 return IPPROTO_DONE; 1017 } 1018 } else if (th->th_ack == tp->snd_una && 1019 TAILQ_EMPTY(&tp->t_segq) && 1020 tlen <= sbspace(so, &so->so_rcv)) { 1021 /* 1022 * This is a pure, in-sequence data packet 1023 * with nothing on the reassembly queue and 1024 * we have enough buffer space to take it. 1025 */ 1026 /* Clean receiver SACK report if present */ 1027 if (tp->sack_enable && tp->rcv_numsacks) 1028 tcp_clean_sackreport(tp); 1029 tcpstat_inc(tcps_preddat); 1030 tp->rcv_nxt += tlen; 1031 /* Pull snd_wl1 and rcv_up up to prevent seq wrap. */ 1032 tp->snd_wl1 = th->th_seq; 1033 /* Packet has most recent segment, no urgent exists. */ 1034 tp->rcv_up = tp->rcv_nxt; 1035 tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); 1036 ND6_HINT(tp); 1037 1038 TCP_SETUP_ACK(tp, tiflags, m); 1039 /* 1040 * Drop TCP, IP headers and TCP options then add data 1041 * to socket buffer. 1042 */ 1043 if (so->so_rcv.sb_state & SS_CANTRCVMORE) 1044 m_freem(m); 1045 else { 1046 if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 && 1047 now - tp->rfbuf_ts > (tp->t_srtt >> 1048 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) { 1049 tcp_update_rcvspace(tp); 1050 /* Start over with next RTT. */ 1051 tp->rfbuf_cnt = 0; 1052 tp->rfbuf_ts = 0; 1053 } else 1054 tp->rfbuf_cnt += tlen; 1055 m_adj(m, iphlen + off); 1056 sbappendstream(so, &so->so_rcv, m); 1057 } 1058 tp->t_flags |= TF_BLOCKOUTPUT; 1059 sorwakeup(so); 1060 tp->t_flags &= ~TF_BLOCKOUTPUT; 1061 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 1062 (void) tcp_output(tp); 1063 in_pcbunref(inp); 1064 return IPPROTO_DONE; 1065 } 1066 } 1067 1068 /* 1069 * Compute mbuf offset to TCP data segment. 1070 */ 1071 hdroptlen = iphlen + off; 1072 1073 /* 1074 * Calculate amount of space in receive window, 1075 * and then do TCP input processing. 1076 * Receive window is amount of space in rcv queue, 1077 * but not less than advertised window. 1078 */ 1079 { int win; 1080 1081 win = sbspace(so, &so->so_rcv); 1082 if (win < 0) 1083 win = 0; 1084 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1085 } 1086 1087 switch (tp->t_state) { 1088 1089 /* 1090 * If the state is SYN_RECEIVED: 1091 * if seg contains SYN/ACK, send an RST. 1092 * if seg contains an ACK, but not for our SYN/ACK, send an RST 1093 */ 1094 1095 case TCPS_SYN_RECEIVED: 1096 if (tiflags & TH_ACK) { 1097 if (tiflags & TH_SYN) { 1098 tcpstat_inc(tcps_badsyn); 1099 goto dropwithreset; 1100 } 1101 if (SEQ_LEQ(th->th_ack, tp->snd_una) || 1102 SEQ_GT(th->th_ack, tp->snd_max)) 1103 goto dropwithreset; 1104 } 1105 break; 1106 1107 /* 1108 * If the state is SYN_SENT: 1109 * if seg contains an ACK, but not for our SYN, drop the input. 1110 * if seg contains a RST, then drop the connection. 1111 * if seg does not contain SYN, then drop it. 1112 * Otherwise this is an acceptable SYN segment 1113 * initialize tp->rcv_nxt and tp->irs 1114 * if seg contains ack then advance tp->snd_una 1115 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1116 * arrange for segment to be acked (eventually) 1117 * continue processing rest of data/controls, beginning with URG 1118 */ 1119 case TCPS_SYN_SENT: 1120 if ((tiflags & TH_ACK) && 1121 (SEQ_LEQ(th->th_ack, tp->iss) || 1122 SEQ_GT(th->th_ack, tp->snd_max))) 1123 goto dropwithreset; 1124 if (tiflags & TH_RST) { 1125 #ifdef TCP_ECN 1126 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1127 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1128 goto drop; 1129 #endif 1130 if (tiflags & TH_ACK) 1131 tp = tcp_drop(tp, ECONNREFUSED); 1132 goto drop; 1133 } 1134 if ((tiflags & TH_SYN) == 0) 1135 goto drop; 1136 if (tiflags & TH_ACK) { 1137 tp->snd_una = th->th_ack; 1138 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1139 tp->snd_nxt = tp->snd_una; 1140 } 1141 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1142 tp->irs = th->th_seq; 1143 tcp_mss(tp, opti.maxseg); 1144 /* Reset initial window to 1 segment for retransmit */ 1145 if (tp->t_rxtshift > 0) 1146 tp->snd_cwnd = tp->t_maxseg; 1147 tcp_rcvseqinit(tp); 1148 tp->t_flags |= TF_ACKNOW; 1149 /* 1150 * If we've sent a SACK_PERMITTED option, and the peer 1151 * also replied with one, then TF_SACK_PERMIT should have 1152 * been set in tcp_dooptions(). If it was not, disable SACKs. 1153 */ 1154 if (tp->sack_enable) 1155 tp->sack_enable = tp->t_flags & TF_SACK_PERMIT; 1156 #ifdef TCP_ECN 1157 /* 1158 * if ECE is set but CWR is not set for SYN-ACK, or 1159 * both ECE and CWR are set for simultaneous open, 1160 * peer is ECN capable. 1161 */ 1162 if (tcp_do_ecn) { 1163 switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) { 1164 case TH_ACK|TH_ECE: 1165 case TH_ECE|TH_CWR: 1166 tp->t_flags |= TF_ECN_PERMIT; 1167 tiflags &= ~(TH_ECE|TH_CWR); 1168 tcpstat_inc(tcps_ecn_accepts); 1169 } 1170 } 1171 #endif 1172 1173 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { 1174 tcpstat_inc(tcps_connects); 1175 tp->t_flags |= TF_BLOCKOUTPUT; 1176 soisconnected(so); 1177 tp->t_flags &= ~TF_BLOCKOUTPUT; 1178 tp->t_state = TCPS_ESTABLISHED; 1179 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1180 /* Do window scaling on this connection? */ 1181 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1182 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1183 tp->snd_scale = tp->requested_s_scale; 1184 tp->rcv_scale = tp->request_r_scale; 1185 } 1186 tcp_flush_queue(tp); 1187 1188 /* 1189 * if we didn't have to retransmit the SYN, 1190 * use its rtt as our initial srtt & rtt var. 1191 */ 1192 if (tp->t_rtttime) 1193 tcp_xmit_timer(tp, now - tp->t_rtttime); 1194 /* 1195 * Since new data was acked (the SYN), open the 1196 * congestion window by one MSS. We do this 1197 * here, because we won't go through the normal 1198 * ACK processing below. And since this is the 1199 * start of the connection, we know we are in 1200 * the exponential phase of slow-start. 1201 */ 1202 tp->snd_cwnd += tp->t_maxseg; 1203 } else 1204 tp->t_state = TCPS_SYN_RECEIVED; 1205 1206 #if 0 1207 trimthenstep6: 1208 #endif 1209 /* 1210 * Advance th->th_seq to correspond to first data byte. 1211 * If data, trim to stay within window, 1212 * dropping FIN if necessary. 1213 */ 1214 th->th_seq++; 1215 if (tlen > tp->rcv_wnd) { 1216 todrop = tlen - tp->rcv_wnd; 1217 m_adj(m, -todrop); 1218 tlen = tp->rcv_wnd; 1219 tiflags &= ~TH_FIN; 1220 tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin, 1221 todrop); 1222 } 1223 tp->snd_wl1 = th->th_seq - 1; 1224 tp->rcv_up = th->th_seq; 1225 goto step6; 1226 /* 1227 * If a new connection request is received while in TIME_WAIT, 1228 * drop the old connection and start over if the if the 1229 * timestamp or the sequence numbers are above the previous 1230 * ones. 1231 */ 1232 case TCPS_TIME_WAIT: 1233 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) && 1234 ((opti.ts_present && 1235 TSTMP_LT(tp->ts_recent, opti.ts_val)) || 1236 SEQ_GT(th->th_seq, tp->rcv_nxt))) { 1237 #if NPF > 0 1238 /* 1239 * The socket will be recreated but the new state 1240 * has already been linked to the socket. Remove the 1241 * link between old socket and new state. 1242 */ 1243 pf_inp_unlink(inp); 1244 #endif 1245 /* 1246 * Advance the iss by at least 32768, but 1247 * clear the msb in order to make sure 1248 * that SEG_LT(snd_nxt, iss). 1249 */ 1250 iss = tp->snd_nxt + 1251 ((arc4random() & 0x7fffffff) | 0x8000); 1252 reuse = &iss; 1253 tp = tcp_close(tp); 1254 in_pcbunref(inp); 1255 inp = NULL; 1256 goto findpcb; 1257 } 1258 } 1259 1260 /* 1261 * States other than LISTEN or SYN_SENT. 1262 * First check timestamp, if present. 1263 * Then check that at least some bytes of segment are within 1264 * receive window. If segment begins before rcv_nxt, 1265 * drop leading data (and SYN); if nothing left, just ack. 1266 * 1267 * RFC 1323 PAWS: If we have a timestamp reply on this segment 1268 * and it's less than opti.ts_recent, drop it. 1269 */ 1270 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 1271 TSTMP_LT(opti.ts_val, tp->ts_recent)) { 1272 1273 /* Check to see if ts_recent is over 24 days old. */ 1274 if (now - tp->ts_recent_age > TCP_PAWS_IDLE) { 1275 /* 1276 * Invalidate ts_recent. If this segment updates 1277 * ts_recent, the age will be reset later and ts_recent 1278 * will get a valid value. If it does not, setting 1279 * ts_recent to zero will at least satisfy the 1280 * requirement that zero be placed in the timestamp 1281 * echo reply when ts_recent isn't valid. The 1282 * age isn't reset until we get a valid ts_recent 1283 * because we don't want out-of-order segments to be 1284 * dropped when ts_recent is old. 1285 */ 1286 tp->ts_recent = 0; 1287 } else { 1288 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen); 1289 tcpstat_inc(tcps_pawsdrop); 1290 if (tlen) 1291 goto dropafterack; 1292 goto drop; 1293 } 1294 } 1295 1296 todrop = tp->rcv_nxt - th->th_seq; 1297 if (todrop > 0) { 1298 if (tiflags & TH_SYN) { 1299 tiflags &= ~TH_SYN; 1300 th->th_seq++; 1301 if (th->th_urp > 1) 1302 th->th_urp--; 1303 else 1304 tiflags &= ~TH_URG; 1305 todrop--; 1306 } 1307 if (todrop > tlen || 1308 (todrop == tlen && (tiflags & TH_FIN) == 0)) { 1309 /* 1310 * Any valid FIN must be to the left of the 1311 * window. At this point, FIN must be a 1312 * duplicate or out-of-sequence, so drop it. 1313 */ 1314 tiflags &= ~TH_FIN; 1315 /* 1316 * Send ACK to resynchronize, and drop any data, 1317 * but keep on processing for RST or ACK. 1318 */ 1319 tp->t_flags |= TF_ACKNOW; 1320 todrop = tlen; 1321 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop); 1322 } else { 1323 tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte, 1324 todrop); 1325 } 1326 hdroptlen += todrop; /* drop from head afterwards */ 1327 th->th_seq += todrop; 1328 tlen -= todrop; 1329 if (th->th_urp > todrop) 1330 th->th_urp -= todrop; 1331 else { 1332 tiflags &= ~TH_URG; 1333 th->th_urp = 0; 1334 } 1335 } 1336 1337 /* 1338 * If new data are received on a connection after the 1339 * user processes are gone, then RST the other end. 1340 */ 1341 if ((so->so_state & SS_NOFDREF) && 1342 tp->t_state > TCPS_CLOSE_WAIT && tlen) { 1343 tp = tcp_close(tp); 1344 tcpstat_inc(tcps_rcvafterclose); 1345 goto dropwithreset; 1346 } 1347 1348 /* 1349 * If segment ends after window, drop trailing data 1350 * (and PUSH and FIN); if nothing left, just ACK. 1351 */ 1352 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd); 1353 if (todrop > 0) { 1354 tcpstat_inc(tcps_rcvpackafterwin); 1355 if (todrop >= tlen) { 1356 tcpstat_add(tcps_rcvbyteafterwin, tlen); 1357 /* 1358 * If window is closed can only take segments at 1359 * window edge, and have to drop data and PUSH from 1360 * incoming segments. Continue processing, but 1361 * remember to ack. Otherwise, drop segment 1362 * and ack. 1363 */ 1364 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1365 tp->t_flags |= TF_ACKNOW; 1366 tcpstat_inc(tcps_rcvwinprobe); 1367 } else 1368 goto dropafterack; 1369 } else 1370 tcpstat_add(tcps_rcvbyteafterwin, todrop); 1371 m_adj(m, -todrop); 1372 tlen -= todrop; 1373 tiflags &= ~(TH_PUSH|TH_FIN); 1374 } 1375 1376 /* 1377 * If last ACK falls within this segment's sequence numbers, 1378 * record its timestamp if it's more recent. 1379 * NOTE that the test is modified according to the latest 1380 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1381 */ 1382 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) && 1383 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1384 tp->ts_recent_age = now; 1385 tp->ts_recent = opti.ts_val; 1386 } 1387 1388 /* 1389 * If the RST bit is set examine the state: 1390 * SYN_RECEIVED STATE: 1391 * If passive open, return to LISTEN state. 1392 * If active open, inform user that connection was refused. 1393 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: 1394 * Inform user that connection was reset, and close tcb. 1395 * CLOSING, LAST_ACK, TIME_WAIT STATES 1396 * Close the tcb. 1397 */ 1398 if (tiflags & TH_RST) { 1399 if (th->th_seq != tp->last_ack_sent && 1400 th->th_seq != tp->rcv_nxt && 1401 th->th_seq != (tp->rcv_nxt + 1)) 1402 goto drop; 1403 1404 switch (tp->t_state) { 1405 case TCPS_SYN_RECEIVED: 1406 #ifdef TCP_ECN 1407 /* if ECN is enabled, fall back to non-ecn at rexmit */ 1408 if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) 1409 goto drop; 1410 #endif 1411 so->so_error = ECONNREFUSED; 1412 goto close; 1413 1414 case TCPS_ESTABLISHED: 1415 case TCPS_FIN_WAIT_1: 1416 case TCPS_FIN_WAIT_2: 1417 case TCPS_CLOSE_WAIT: 1418 so->so_error = ECONNRESET; 1419 close: 1420 tp->t_state = TCPS_CLOSED; 1421 tcpstat_inc(tcps_drops); 1422 tp = tcp_close(tp); 1423 goto drop; 1424 case TCPS_CLOSING: 1425 case TCPS_LAST_ACK: 1426 case TCPS_TIME_WAIT: 1427 tp = tcp_close(tp); 1428 goto drop; 1429 } 1430 } 1431 1432 /* 1433 * If a SYN is in the window, then this is an 1434 * error and we ACK and drop the packet. 1435 */ 1436 if (tiflags & TH_SYN) 1437 goto dropafterack_ratelim; 1438 1439 /* 1440 * If the ACK bit is off we drop the segment and return. 1441 */ 1442 if ((tiflags & TH_ACK) == 0) { 1443 if (tp->t_flags & TF_ACKNOW) 1444 goto dropafterack; 1445 else 1446 goto drop; 1447 } 1448 1449 /* 1450 * Ack processing. 1451 */ 1452 switch (tp->t_state) { 1453 1454 /* 1455 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 1456 * ESTABLISHED state and continue processing. 1457 * The ACK was checked above. 1458 */ 1459 case TCPS_SYN_RECEIVED: 1460 tcpstat_inc(tcps_connects); 1461 tp->t_flags |= TF_BLOCKOUTPUT; 1462 soisconnected(so); 1463 tp->t_flags &= ~TF_BLOCKOUTPUT; 1464 tp->t_state = TCPS_ESTABLISHED; 1465 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle); 1466 /* Do window scaling? */ 1467 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 1468 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 1469 tp->snd_scale = tp->requested_s_scale; 1470 tp->rcv_scale = tp->request_r_scale; 1471 tiwin = th->th_win << tp->snd_scale; 1472 } 1473 tcp_flush_queue(tp); 1474 tp->snd_wl1 = th->th_seq - 1; 1475 /* fall into ... */ 1476 1477 /* 1478 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 1479 * ACKs. If the ack is in the range 1480 * tp->snd_una < th->th_ack <= tp->snd_max 1481 * then advance tp->snd_una to th->th_ack and drop 1482 * data from the retransmission queue. If this ACK reflects 1483 * more up to date window information we update our window information. 1484 */ 1485 case TCPS_ESTABLISHED: 1486 case TCPS_FIN_WAIT_1: 1487 case TCPS_FIN_WAIT_2: 1488 case TCPS_CLOSE_WAIT: 1489 case TCPS_CLOSING: 1490 case TCPS_LAST_ACK: 1491 case TCPS_TIME_WAIT: 1492 #ifdef TCP_ECN 1493 /* 1494 * if we receive ECE and are not already in recovery phase, 1495 * reduce cwnd by half but don't slow-start. 1496 * advance snd_last to snd_max not to reduce cwnd again 1497 * until all outstanding packets are acked. 1498 */ 1499 if (tcp_do_ecn && (tiflags & TH_ECE)) { 1500 if ((tp->t_flags & TF_ECN_PERMIT) && 1501 SEQ_GEQ(tp->snd_una, tp->snd_last)) { 1502 u_int win; 1503 1504 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; 1505 if (win > 1) { 1506 tp->snd_ssthresh = win / 2 * tp->t_maxseg; 1507 tp->snd_cwnd = tp->snd_ssthresh; 1508 tp->snd_last = tp->snd_max; 1509 tp->t_flags |= TF_SEND_CWR; 1510 tcpstat_inc(tcps_cwr_ecn); 1511 } 1512 } 1513 tcpstat_inc(tcps_ecn_rcvece); 1514 } 1515 /* 1516 * if we receive CWR, we know that the peer has reduced 1517 * its congestion window. stop sending ecn-echo. 1518 */ 1519 if ((tiflags & TH_CWR)) { 1520 tp->t_flags &= ~TF_RCVD_CE; 1521 tcpstat_inc(tcps_ecn_rcvcwr); 1522 } 1523 #endif /* TCP_ECN */ 1524 1525 if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 1526 /* 1527 * Duplicate/old ACK processing. 1528 * Increments t_dupacks: 1529 * Pure duplicate (same seq/ack/window, no data) 1530 * Doesn't affect t_dupacks: 1531 * Data packets. 1532 * Normal window updates (window opens) 1533 * Resets t_dupacks: 1534 * New data ACKed. 1535 * Window shrinks 1536 * Old ACK 1537 */ 1538 if (tlen) { 1539 /* Drop very old ACKs unless th_seq matches */ 1540 if (th->th_seq != tp->rcv_nxt && 1541 SEQ_LT(th->th_ack, 1542 tp->snd_una - tp->max_sndwnd)) { 1543 tcpstat_inc(tcps_rcvacktooold); 1544 goto drop; 1545 } 1546 break; 1547 } 1548 /* 1549 * If we get an old ACK, there is probably packet 1550 * reordering going on. Be conservative and reset 1551 * t_dupacks so that we are less aggressive in 1552 * doing a fast retransmit. 1553 */ 1554 if (th->th_ack != tp->snd_una) { 1555 tp->t_dupacks = 0; 1556 break; 1557 } 1558 if (tiwin == tp->snd_wnd) { 1559 tcpstat_inc(tcps_rcvdupack); 1560 /* 1561 * If we have outstanding data (other than 1562 * a window probe), this is a completely 1563 * duplicate ack (ie, window info didn't 1564 * change), the ack is the biggest we've 1565 * seen and we've seen exactly our rexmt 1566 * threshold of them, assume a packet 1567 * has been dropped and retransmit it. 1568 * Kludge snd_nxt & the congestion 1569 * window so we send only this one 1570 * packet. 1571 * 1572 * We know we're losing at the current 1573 * window size so do congestion avoidance 1574 * (set ssthresh to half the current window 1575 * and pull our congestion window back to 1576 * the new ssthresh). 1577 * 1578 * Dup acks mean that packets have left the 1579 * network (they're now cached at the receiver) 1580 * so bump cwnd by the amount in the receiver 1581 * to keep a constant cwnd packets in the 1582 * network. 1583 */ 1584 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) 1585 tp->t_dupacks = 0; 1586 else if (++tp->t_dupacks == tcprexmtthresh) { 1587 tcp_seq onxt = tp->snd_nxt; 1588 u_long win = 1589 ulmin(tp->snd_wnd, tp->snd_cwnd) / 1590 2 / tp->t_maxseg; 1591 1592 if (SEQ_LT(th->th_ack, tp->snd_last)){ 1593 /* 1594 * False fast retx after 1595 * timeout. Do not cut window. 1596 */ 1597 tp->t_dupacks = 0; 1598 goto drop; 1599 } 1600 if (win < 2) 1601 win = 2; 1602 tp->snd_ssthresh = win * tp->t_maxseg; 1603 tp->snd_last = tp->snd_max; 1604 if (tp->sack_enable) { 1605 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1606 tp->t_rtttime = 0; 1607 #ifdef TCP_ECN 1608 tp->t_flags |= TF_SEND_CWR; 1609 #endif 1610 tcpstat_inc(tcps_cwr_frecovery); 1611 tcpstat_inc(tcps_sack_recovery_episode); 1612 /* 1613 * tcp_output() will send 1614 * oldest SACK-eligible rtx. 1615 */ 1616 (void) tcp_output(tp); 1617 tp->snd_cwnd = tp->snd_ssthresh+ 1618 tp->t_maxseg * tp->t_dupacks; 1619 goto drop; 1620 } 1621 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1622 tp->t_rtttime = 0; 1623 tp->snd_nxt = th->th_ack; 1624 tp->snd_cwnd = tp->t_maxseg; 1625 #ifdef TCP_ECN 1626 tp->t_flags |= TF_SEND_CWR; 1627 #endif 1628 tcpstat_inc(tcps_cwr_frecovery); 1629 tcpstat_inc(tcps_sndrexmitfast); 1630 (void) tcp_output(tp); 1631 1632 tp->snd_cwnd = tp->snd_ssthresh + 1633 tp->t_maxseg * tp->t_dupacks; 1634 if (SEQ_GT(onxt, tp->snd_nxt)) 1635 tp->snd_nxt = onxt; 1636 goto drop; 1637 } else if (tp->t_dupacks > tcprexmtthresh) { 1638 tp->snd_cwnd += tp->t_maxseg; 1639 (void) tcp_output(tp); 1640 goto drop; 1641 } 1642 } else if (tiwin < tp->snd_wnd) { 1643 /* 1644 * The window was retracted! Previous dup 1645 * ACKs may have been due to packets arriving 1646 * after the shrunken window, not a missing 1647 * packet, so play it safe and reset t_dupacks 1648 */ 1649 tp->t_dupacks = 0; 1650 } 1651 break; 1652 } 1653 /* 1654 * If the congestion window was inflated to account 1655 * for the other side's cached packets, retract it. 1656 */ 1657 if (tp->t_dupacks >= tcprexmtthresh) { 1658 /* Check for a partial ACK */ 1659 if (SEQ_LT(th->th_ack, tp->snd_last)) { 1660 if (tp->sack_enable) 1661 tcp_sack_partialack(tp, th); 1662 else 1663 tcp_newreno_partialack(tp, th); 1664 } else { 1665 /* Out of fast recovery */ 1666 tp->snd_cwnd = tp->snd_ssthresh; 1667 if (tcp_seq_subtract(tp->snd_max, th->th_ack) < 1668 tp->snd_ssthresh) 1669 tp->snd_cwnd = 1670 tcp_seq_subtract(tp->snd_max, 1671 th->th_ack); 1672 tp->t_dupacks = 0; 1673 } 1674 } else { 1675 /* 1676 * Reset the duplicate ACK counter if we 1677 * were not in fast recovery. 1678 */ 1679 tp->t_dupacks = 0; 1680 } 1681 if (SEQ_GT(th->th_ack, tp->snd_max)) { 1682 tcpstat_inc(tcps_rcvacktoomuch); 1683 goto dropafterack_ratelim; 1684 } 1685 acked = th->th_ack - tp->snd_una; 1686 tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked); 1687 tp->t_rcvacktime = now; 1688 1689 /* 1690 * If we have a timestamp reply, update smoothed 1691 * round trip time. If no timestamp is present but 1692 * transmit timer is running and timed sequence 1693 * number was acked, update smoothed round trip time. 1694 * Since we now have an rtt measurement, cancel the 1695 * timer backoff (cf., Phil Karn's retransmit alg.). 1696 * Recompute the initial retransmit timer. 1697 */ 1698 if (opti.ts_present && opti.ts_ecr) 1699 tcp_xmit_timer(tp, now - opti.ts_ecr); 1700 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 1701 tcp_xmit_timer(tp, now - tp->t_rtttime); 1702 1703 /* 1704 * If all outstanding data is acked, stop retransmit 1705 * timer and remember to restart (more output or persist). 1706 * If there is more data to be acked, restart retransmit 1707 * timer, using current (possibly backed-off) value. 1708 */ 1709 if (th->th_ack == tp->snd_max) { 1710 TCP_TIMER_DISARM(tp, TCPT_REXMT); 1711 tp->t_flags |= TF_NEEDOUTPUT; 1712 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 1713 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 1714 /* 1715 * When new data is acked, open the congestion window. 1716 * If the window gives us less than ssthresh packets 1717 * in flight, open exponentially (maxseg per packet). 1718 * Otherwise open linearly: maxseg per window 1719 * (maxseg^2 / cwnd per packet). 1720 */ 1721 { 1722 u_int cw = tp->snd_cwnd; 1723 u_int incr = tp->t_maxseg; 1724 1725 if (cw > tp->snd_ssthresh) 1726 incr = max(incr * incr / cw, 1); 1727 if (tp->t_dupacks < tcprexmtthresh) 1728 tp->snd_cwnd = ulmin(cw + incr, 1729 TCP_MAXWIN << tp->snd_scale); 1730 } 1731 ND6_HINT(tp); 1732 if (acked > so->so_snd.sb_cc) { 1733 if (tp->snd_wnd > so->so_snd.sb_cc) 1734 tp->snd_wnd -= so->so_snd.sb_cc; 1735 else 1736 tp->snd_wnd = 0; 1737 sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc); 1738 ourfinisacked = 1; 1739 } else { 1740 sbdrop(so, &so->so_snd, acked); 1741 if (tp->snd_wnd > acked) 1742 tp->snd_wnd -= acked; 1743 else 1744 tp->snd_wnd = 0; 1745 ourfinisacked = 0; 1746 } 1747 1748 tcp_update_sndspace(tp); 1749 if (sb_notify(so, &so->so_snd)) { 1750 tp->t_flags |= TF_BLOCKOUTPUT; 1751 sowwakeup(so); 1752 tp->t_flags &= ~TF_BLOCKOUTPUT; 1753 } 1754 1755 /* 1756 * If we had a pending ICMP message that referred to data 1757 * that have just been acknowledged, disregard the recorded 1758 * ICMP message. 1759 */ 1760 if ((tp->t_flags & TF_PMTUD_PEND) && 1761 SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 1762 tp->t_flags &= ~TF_PMTUD_PEND; 1763 1764 /* 1765 * Keep track of the largest chunk of data acknowledged 1766 * since last PMTU update 1767 */ 1768 if (tp->t_pmtud_mss_acked < acked) 1769 tp->t_pmtud_mss_acked = acked; 1770 1771 tp->snd_una = th->th_ack; 1772 #ifdef TCP_ECN 1773 /* sync snd_last with snd_una */ 1774 if (SEQ_GT(tp->snd_una, tp->snd_last)) 1775 tp->snd_last = tp->snd_una; 1776 #endif 1777 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 1778 tp->snd_nxt = tp->snd_una; 1779 1780 switch (tp->t_state) { 1781 1782 /* 1783 * In FIN_WAIT_1 STATE in addition to the processing 1784 * for the ESTABLISHED state if our FIN is now acknowledged 1785 * then enter FIN_WAIT_2. 1786 */ 1787 case TCPS_FIN_WAIT_1: 1788 if (ourfinisacked) { 1789 /* 1790 * If we can't receive any more 1791 * data, then closing user can proceed. 1792 * Starting the timer is contrary to the 1793 * specification, but if we don't get a FIN 1794 * we'll hang forever. 1795 */ 1796 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 1797 tp->t_flags |= TF_BLOCKOUTPUT; 1798 soisdisconnected(so); 1799 tp->t_flags &= ~TF_BLOCKOUTPUT; 1800 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 1801 } 1802 tp->t_state = TCPS_FIN_WAIT_2; 1803 } 1804 break; 1805 1806 /* 1807 * In CLOSING STATE in addition to the processing for 1808 * the ESTABLISHED state if the ACK acknowledges our FIN 1809 * then enter the TIME-WAIT state, otherwise ignore 1810 * the segment. 1811 */ 1812 case TCPS_CLOSING: 1813 if (ourfinisacked) { 1814 tp->t_state = TCPS_TIME_WAIT; 1815 tcp_canceltimers(tp); 1816 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1817 tp->t_flags |= TF_BLOCKOUTPUT; 1818 soisdisconnected(so); 1819 tp->t_flags &= ~TF_BLOCKOUTPUT; 1820 } 1821 break; 1822 1823 /* 1824 * In LAST_ACK, we may still be waiting for data to drain 1825 * and/or to be acked, as well as for the ack of our FIN. 1826 * If our FIN is now acknowledged, delete the TCB, 1827 * enter the closed state and return. 1828 */ 1829 case TCPS_LAST_ACK: 1830 if (ourfinisacked) { 1831 tp = tcp_close(tp); 1832 goto drop; 1833 } 1834 break; 1835 1836 /* 1837 * In TIME_WAIT state the only thing that should arrive 1838 * is a retransmission of the remote FIN. Acknowledge 1839 * it and restart the finack timer. 1840 */ 1841 case TCPS_TIME_WAIT: 1842 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 1843 goto dropafterack; 1844 } 1845 } 1846 1847 step6: 1848 /* 1849 * Update window information. 1850 * Don't look at window if no ACK: TAC's send garbage on first SYN. 1851 */ 1852 if ((tiflags & TH_ACK) && 1853 (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && 1854 (SEQ_LT(tp->snd_wl2, th->th_ack) || 1855 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 1856 /* keep track of pure window updates */ 1857 if (tlen == 0 && 1858 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 1859 tcpstat_inc(tcps_rcvwinupd); 1860 tp->snd_wnd = tiwin; 1861 tp->snd_wl1 = th->th_seq; 1862 tp->snd_wl2 = th->th_ack; 1863 if (tp->snd_wnd > tp->max_sndwnd) 1864 tp->max_sndwnd = tp->snd_wnd; 1865 tp->t_flags |= TF_NEEDOUTPUT; 1866 } 1867 1868 /* 1869 * Process segments with URG. 1870 */ 1871 if ((tiflags & TH_URG) && th->th_urp && 1872 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1873 /* 1874 * This is a kludge, but if we receive and accept 1875 * random urgent pointers, we'll crash in 1876 * soreceive. It's hard to imagine someone 1877 * actually wanting to send this much urgent data. 1878 */ 1879 if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 1880 th->th_urp = 0; /* XXX */ 1881 tiflags &= ~TH_URG; /* XXX */ 1882 goto dodata; /* XXX */ 1883 } 1884 /* 1885 * If this segment advances the known urgent pointer, 1886 * then mark the data stream. This should not happen 1887 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 1888 * a FIN has been received from the remote side. 1889 * In these states we ignore the URG. 1890 * 1891 * According to RFC961 (Assigned Protocols), 1892 * the urgent pointer points to the last octet 1893 * of urgent data. We continue, however, 1894 * to consider it to indicate the first octet 1895 * of data past the urgent section as the original 1896 * spec states (in one of two places). 1897 */ 1898 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 1899 tp->rcv_up = th->th_seq + th->th_urp; 1900 so->so_oobmark = so->so_rcv.sb_cc + 1901 (tp->rcv_up - tp->rcv_nxt) - 1; 1902 if (so->so_oobmark == 0) 1903 so->so_rcv.sb_state |= SS_RCVATMARK; 1904 sohasoutofband(so); 1905 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1906 } 1907 /* 1908 * Remove out of band data so doesn't get presented to user. 1909 * This can happen independent of advancing the URG pointer, 1910 * but if two URG's are pending at once, some out-of-band 1911 * data may creep in... ick. 1912 */ 1913 if (th->th_urp <= (u_int16_t) tlen && 1914 (so->so_options & SO_OOBINLINE) == 0) 1915 tcp_pulloutofband(so, th->th_urp, m, hdroptlen); 1916 } else 1917 /* 1918 * If no out of band data is expected, 1919 * pull receive urgent pointer along 1920 * with the receive window. 1921 */ 1922 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 1923 tp->rcv_up = tp->rcv_nxt; 1924 dodata: /* XXX */ 1925 1926 /* 1927 * Process the segment text, merging it into the TCP sequencing queue, 1928 * and arranging for acknowledgment of receipt if necessary. 1929 * This process logically involves adjusting tp->rcv_wnd as data 1930 * is presented to the user (this happens in tcp_usrreq.c, 1931 * case PRU_RCVD). If a FIN has already been received on this 1932 * connection then we just ignore the text. 1933 */ 1934 if ((tlen || (tiflags & TH_FIN)) && 1935 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1936 tcp_seq laststart = th->th_seq; 1937 tcp_seq lastend = th->th_seq + tlen; 1938 1939 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) && 1940 tp->t_state == TCPS_ESTABLISHED) { 1941 TCP_SETUP_ACK(tp, tiflags, m); 1942 tp->rcv_nxt += tlen; 1943 tiflags = th->th_flags & TH_FIN; 1944 tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen); 1945 ND6_HINT(tp); 1946 if (so->so_rcv.sb_state & SS_CANTRCVMORE) 1947 m_freem(m); 1948 else { 1949 m_adj(m, hdroptlen); 1950 sbappendstream(so, &so->so_rcv, m); 1951 } 1952 tp->t_flags |= TF_BLOCKOUTPUT; 1953 sorwakeup(so); 1954 tp->t_flags &= ~TF_BLOCKOUTPUT; 1955 } else { 1956 m_adj(m, hdroptlen); 1957 tiflags = tcp_reass(tp, th, m, &tlen); 1958 tp->t_flags |= TF_ACKNOW; 1959 } 1960 if (tp->sack_enable) 1961 tcp_update_sack_list(tp, laststart, lastend); 1962 1963 /* 1964 * variable len never referenced again in modern BSD, 1965 * so why bother computing it ?? 1966 */ 1967 #if 0 1968 /* 1969 * Note the amount of data that peer has sent into 1970 * our window, in order to estimate the sender's 1971 * buffer size. 1972 */ 1973 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 1974 #endif /* 0 */ 1975 } else { 1976 m_freem(m); 1977 tiflags &= ~TH_FIN; 1978 } 1979 1980 /* 1981 * If FIN is received ACK the FIN and let the user know 1982 * that the connection is closing. Ignore a FIN received before 1983 * the connection is fully established. 1984 */ 1985 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 1986 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 1987 tp->t_flags |= TF_BLOCKOUTPUT; 1988 socantrcvmore(so); 1989 tp->t_flags &= ~TF_BLOCKOUTPUT; 1990 tp->t_flags |= TF_ACKNOW; 1991 tp->rcv_nxt++; 1992 } 1993 switch (tp->t_state) { 1994 1995 /* 1996 * In ESTABLISHED STATE enter the CLOSE_WAIT state. 1997 */ 1998 case TCPS_ESTABLISHED: 1999 tp->t_state = TCPS_CLOSE_WAIT; 2000 break; 2001 2002 /* 2003 * If still in FIN_WAIT_1 STATE FIN has not been acked so 2004 * enter the CLOSING state. 2005 */ 2006 case TCPS_FIN_WAIT_1: 2007 tp->t_state = TCPS_CLOSING; 2008 break; 2009 2010 /* 2011 * In FIN_WAIT_2 state enter the TIME_WAIT state, 2012 * starting the time-wait timer, turning off the other 2013 * standard timers. 2014 */ 2015 case TCPS_FIN_WAIT_2: 2016 tp->t_state = TCPS_TIME_WAIT; 2017 tcp_canceltimers(tp); 2018 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2019 tp->t_flags |= TF_BLOCKOUTPUT; 2020 soisdisconnected(so); 2021 tp->t_flags &= ~TF_BLOCKOUTPUT; 2022 break; 2023 2024 /* 2025 * In TIME_WAIT state restart the 2 MSL time_wait timer. 2026 */ 2027 case TCPS_TIME_WAIT: 2028 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL); 2029 break; 2030 } 2031 } 2032 if (otp) 2033 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen); 2034 2035 /* 2036 * Return any desired output. 2037 */ 2038 if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT)) 2039 (void) tcp_output(tp); 2040 in_pcbunref(inp); 2041 return IPPROTO_DONE; 2042 2043 badsyn: 2044 /* 2045 * Received a bad SYN. Increment counters and dropwithreset. 2046 */ 2047 tcpstat_inc(tcps_badsyn); 2048 tp = NULL; 2049 goto dropwithreset; 2050 2051 dropafterack_ratelim: 2052 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2053 tcp_ackdrop_ppslim) == 0) { 2054 /* XXX stat */ 2055 goto drop; 2056 } 2057 /* ...fall into dropafterack... */ 2058 2059 dropafterack: 2060 /* 2061 * Generate an ACK dropping incoming segment if it occupies 2062 * sequence space, where the ACK reflects our state. 2063 */ 2064 if (tiflags & TH_RST) 2065 goto drop; 2066 m_freem(m); 2067 tp->t_flags |= TF_ACKNOW; 2068 (void) tcp_output(tp); 2069 in_pcbunref(inp); 2070 return IPPROTO_DONE; 2071 2072 dropwithreset_ratelim: 2073 /* 2074 * We may want to rate-limit RSTs in certain situations, 2075 * particularly if we are sending an RST in response to 2076 * an attempt to connect to or otherwise communicate with 2077 * a port for which we have no socket. 2078 */ 2079 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2080 tcp_rst_ppslim) == 0) { 2081 /* XXX stat */ 2082 goto drop; 2083 } 2084 /* ...fall into dropwithreset... */ 2085 2086 dropwithreset: 2087 /* 2088 * Generate a RST, dropping incoming segment. 2089 * Make ACK acceptable to originator of segment. 2090 * Don't bother to respond to RST. 2091 */ 2092 if (tiflags & TH_RST) 2093 goto drop; 2094 if (tiflags & TH_ACK) { 2095 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, 2096 TH_RST, m->m_pkthdr.ph_rtableid, now); 2097 } else { 2098 if (tiflags & TH_SYN) 2099 tlen++; 2100 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen, 2101 (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now); 2102 } 2103 m_freem(m); 2104 in_pcbunref(inp); 2105 return IPPROTO_DONE; 2106 2107 drop: 2108 /* 2109 * Drop space held by incoming segment and return. 2110 */ 2111 if (otp) 2112 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen); 2113 2114 m_freem(m); 2115 in_pcbunref(inp); 2116 return IPPROTO_DONE; 2117 } 2118 2119 int 2120 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, 2121 struct mbuf *m, int iphlen, struct tcp_opt_info *oi, 2122 u_int rtableid, uint64_t now) 2123 { 2124 u_int16_t mss = 0; 2125 int opt, optlen; 2126 #ifdef TCP_SIGNATURE 2127 caddr_t sigp = NULL; 2128 struct tdb *tdb = NULL; 2129 #endif /* TCP_SIGNATURE */ 2130 2131 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 2132 opt = cp[0]; 2133 if (opt == TCPOPT_EOL) 2134 break; 2135 if (opt == TCPOPT_NOP) 2136 optlen = 1; 2137 else { 2138 if (cnt < 2) 2139 break; 2140 optlen = cp[1]; 2141 if (optlen < 2 || optlen > cnt) 2142 break; 2143 } 2144 switch (opt) { 2145 2146 default: 2147 continue; 2148 2149 case TCPOPT_MAXSEG: 2150 if (optlen != TCPOLEN_MAXSEG) 2151 continue; 2152 if (!(th->th_flags & TH_SYN)) 2153 continue; 2154 if (TCPS_HAVERCVDSYN(tp->t_state)) 2155 continue; 2156 memcpy(&mss, cp + 2, sizeof(mss)); 2157 mss = ntohs(mss); 2158 oi->maxseg = mss; 2159 break; 2160 2161 case TCPOPT_WINDOW: 2162 if (optlen != TCPOLEN_WINDOW) 2163 continue; 2164 if (!(th->th_flags & TH_SYN)) 2165 continue; 2166 if (TCPS_HAVERCVDSYN(tp->t_state)) 2167 continue; 2168 tp->t_flags |= TF_RCVD_SCALE; 2169 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); 2170 break; 2171 2172 case TCPOPT_TIMESTAMP: 2173 if (optlen != TCPOLEN_TIMESTAMP) 2174 continue; 2175 oi->ts_present = 1; 2176 memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 2177 oi->ts_val = ntohl(oi->ts_val); 2178 memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 2179 oi->ts_ecr = ntohl(oi->ts_ecr); 2180 2181 if (!(th->th_flags & TH_SYN)) 2182 continue; 2183 if (TCPS_HAVERCVDSYN(tp->t_state)) 2184 continue; 2185 /* 2186 * A timestamp received in a SYN makes 2187 * it ok to send timestamp requests and replies. 2188 */ 2189 tp->t_flags |= TF_RCVD_TSTMP; 2190 tp->ts_recent = oi->ts_val; 2191 tp->ts_recent_age = now; 2192 break; 2193 2194 case TCPOPT_SACK_PERMITTED: 2195 if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED) 2196 continue; 2197 if (!(th->th_flags & TH_SYN)) 2198 continue; 2199 if (TCPS_HAVERCVDSYN(tp->t_state)) 2200 continue; 2201 /* MUST only be set on SYN */ 2202 tp->t_flags |= TF_SACK_PERMIT; 2203 break; 2204 case TCPOPT_SACK: 2205 tcp_sack_option(tp, th, cp, optlen); 2206 break; 2207 #ifdef TCP_SIGNATURE 2208 case TCPOPT_SIGNATURE: 2209 if (optlen != TCPOLEN_SIGNATURE) 2210 continue; 2211 2212 if (sigp && timingsafe_bcmp(sigp, cp + 2, 16)) 2213 goto bad; 2214 2215 sigp = cp + 2; 2216 break; 2217 #endif /* TCP_SIGNATURE */ 2218 } 2219 } 2220 2221 #ifdef TCP_SIGNATURE 2222 if (tp->t_flags & TF_SIGNATURE) { 2223 union sockaddr_union src, dst; 2224 2225 memset(&src, 0, sizeof(union sockaddr_union)); 2226 memset(&dst, 0, sizeof(union sockaddr_union)); 2227 2228 switch (tp->pf) { 2229 case 0: 2230 case AF_INET: 2231 src.sa.sa_len = sizeof(struct sockaddr_in); 2232 src.sa.sa_family = AF_INET; 2233 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 2234 dst.sa.sa_len = sizeof(struct sockaddr_in); 2235 dst.sa.sa_family = AF_INET; 2236 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 2237 break; 2238 #ifdef INET6 2239 case AF_INET6: 2240 src.sa.sa_len = sizeof(struct sockaddr_in6); 2241 src.sa.sa_family = AF_INET6; 2242 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 2243 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2244 dst.sa.sa_family = AF_INET6; 2245 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 2246 break; 2247 #endif /* INET6 */ 2248 } 2249 2250 tdb = gettdbbysrcdst(rtable_l2(rtableid), 2251 0, &src, &dst, IPPROTO_TCP); 2252 2253 /* 2254 * We don't have an SA for this peer, so we turn off 2255 * TF_SIGNATURE on the listen socket 2256 */ 2257 if (tdb == NULL && tp->t_state == TCPS_LISTEN) 2258 tp->t_flags &= ~TF_SIGNATURE; 2259 2260 } 2261 2262 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) { 2263 tcpstat_inc(tcps_rcvbadsig); 2264 goto bad; 2265 } 2266 2267 if (sigp) { 2268 char sig[16]; 2269 2270 if (tdb == NULL) { 2271 tcpstat_inc(tcps_rcvbadsig); 2272 goto bad; 2273 } 2274 2275 if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0) 2276 goto bad; 2277 2278 if (timingsafe_bcmp(sig, sigp, 16)) { 2279 tcpstat_inc(tcps_rcvbadsig); 2280 goto bad; 2281 } 2282 2283 tcpstat_inc(tcps_rcvgoodsig); 2284 } 2285 2286 tdb_unref(tdb); 2287 #endif /* TCP_SIGNATURE */ 2288 2289 return (0); 2290 2291 #ifdef TCP_SIGNATURE 2292 bad: 2293 tdb_unref(tdb); 2294 #endif /* TCP_SIGNATURE */ 2295 return (-1); 2296 } 2297 2298 u_long 2299 tcp_seq_subtract(u_long a, u_long b) 2300 { 2301 return ((long)(a - b)); 2302 } 2303 2304 /* 2305 * This function is called upon receipt of new valid data (while not in header 2306 * prediction mode), and it updates the ordered list of sacks. 2307 */ 2308 void 2309 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, 2310 tcp_seq rcv_lastend) 2311 { 2312 /* 2313 * First reported block MUST be the most recent one. Subsequent 2314 * blocks SHOULD be in the order in which they arrived at the 2315 * receiver. These two conditions make the implementation fully 2316 * compliant with RFC 2018. 2317 */ 2318 int i, j = 0, count = 0, lastpos = -1; 2319 struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; 2320 2321 /* First clean up current list of sacks */ 2322 for (i = 0; i < tp->rcv_numsacks; i++) { 2323 sack = tp->sackblks[i]; 2324 if (sack.start == 0 && sack.end == 0) { 2325 count++; /* count = number of blocks to be discarded */ 2326 continue; 2327 } 2328 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { 2329 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2330 count++; 2331 } else { 2332 temp[j].start = tp->sackblks[i].start; 2333 temp[j++].end = tp->sackblks[i].end; 2334 } 2335 } 2336 tp->rcv_numsacks -= count; 2337 if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ 2338 tcp_clean_sackreport(tp); 2339 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) { 2340 /* ==> need first sack block */ 2341 tp->sackblks[0].start = rcv_laststart; 2342 tp->sackblks[0].end = rcv_lastend; 2343 tp->rcv_numsacks = 1; 2344 } 2345 return; 2346 } 2347 /* Otherwise, sack blocks are already present. */ 2348 for (i = 0; i < tp->rcv_numsacks; i++) 2349 tp->sackblks[i] = temp[i]; /* first copy back sack list */ 2350 if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend)) 2351 return; /* sack list remains unchanged */ 2352 /* 2353 * From here, segment just received should be (part of) the 1st sack. 2354 * Go through list, possibly coalescing sack block entries. 2355 */ 2356 firstsack.start = rcv_laststart; 2357 firstsack.end = rcv_lastend; 2358 for (i = 0; i < tp->rcv_numsacks; i++) { 2359 sack = tp->sackblks[i]; 2360 if (SEQ_LT(sack.end, firstsack.start) || 2361 SEQ_GT(sack.start, firstsack.end)) 2362 continue; /* no overlap */ 2363 if (sack.start == firstsack.start && sack.end == firstsack.end){ 2364 /* 2365 * identical block; delete it here since we will 2366 * move it to the front of the list. 2367 */ 2368 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2369 lastpos = i; /* last posn with a zero entry */ 2370 continue; 2371 } 2372 if (SEQ_LEQ(sack.start, firstsack.start)) 2373 firstsack.start = sack.start; /* merge blocks */ 2374 if (SEQ_GEQ(sack.end, firstsack.end)) 2375 firstsack.end = sack.end; /* merge blocks */ 2376 tp->sackblks[i].start = tp->sackblks[i].end = 0; 2377 lastpos = i; /* last posn with a zero entry */ 2378 } 2379 if (lastpos != -1) { /* at least one merge */ 2380 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { 2381 sack = tp->sackblks[i]; 2382 if (sack.start == 0 && sack.end == 0) 2383 continue; 2384 temp[j++] = sack; 2385 } 2386 tp->rcv_numsacks = j; /* including first blk (added later) */ 2387 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ 2388 tp->sackblks[i] = temp[i]; 2389 } else { /* no merges -- shift sacks by 1 */ 2390 if (tp->rcv_numsacks < MAX_SACK_BLKS) 2391 tp->rcv_numsacks++; 2392 for (i = tp->rcv_numsacks-1; i > 0; i--) 2393 tp->sackblks[i] = tp->sackblks[i-1]; 2394 } 2395 tp->sackblks[0] = firstsack; 2396 return; 2397 } 2398 2399 /* 2400 * Process the TCP SACK option. tp->snd_holes is an ordered list 2401 * of holes (oldest to newest, in terms of the sequence space). 2402 */ 2403 void 2404 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen) 2405 { 2406 int tmp_olen; 2407 u_char *tmp_cp; 2408 struct sackhole *cur, *p, *temp; 2409 2410 if (!tp->sack_enable) 2411 return; 2412 /* SACK without ACK doesn't make sense. */ 2413 if ((th->th_flags & TH_ACK) == 0) 2414 return; 2415 /* Make sure the ACK on this segment is in [snd_una, snd_max]. */ 2416 if (SEQ_LT(th->th_ack, tp->snd_una) || 2417 SEQ_GT(th->th_ack, tp->snd_max)) 2418 return; 2419 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2420 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 2421 return; 2422 /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ 2423 tmp_cp = cp + 2; 2424 tmp_olen = optlen - 2; 2425 tcpstat_inc(tcps_sack_rcv_opts); 2426 if (tp->snd_numholes < 0) 2427 tp->snd_numholes = 0; 2428 if (tp->t_maxseg == 0) 2429 panic("tcp_sack_option"); /* Should never happen */ 2430 while (tmp_olen > 0) { 2431 struct sackblk sack; 2432 2433 memcpy(&sack.start, tmp_cp, sizeof(tcp_seq)); 2434 sack.start = ntohl(sack.start); 2435 memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq)); 2436 sack.end = ntohl(sack.end); 2437 tmp_olen -= TCPOLEN_SACK; 2438 tmp_cp += TCPOLEN_SACK; 2439 if (SEQ_LEQ(sack.end, sack.start)) 2440 continue; /* bad SACK fields */ 2441 if (SEQ_LEQ(sack.end, tp->snd_una)) 2442 continue; /* old block */ 2443 if (SEQ_GT(th->th_ack, tp->snd_una)) { 2444 if (SEQ_LT(sack.start, th->th_ack)) 2445 continue; 2446 } 2447 if (SEQ_GT(sack.end, tp->snd_max)) 2448 continue; 2449 if (tp->snd_holes == NULL) { /* first hole */ 2450 tp->snd_holes = (struct sackhole *) 2451 pool_get(&sackhl_pool, PR_NOWAIT); 2452 if (tp->snd_holes == NULL) { 2453 /* ENOBUFS, so ignore SACKed block for now */ 2454 goto dropped; 2455 } 2456 cur = tp->snd_holes; 2457 cur->start = th->th_ack; 2458 cur->end = sack.start; 2459 cur->rxmit = cur->start; 2460 cur->next = NULL; 2461 tp->snd_numholes = 1; 2462 tp->rcv_lastsack = sack.end; 2463 /* 2464 * dups is at least one. If more data has been 2465 * SACKed, it can be greater than one. 2466 */ 2467 cur->dups = min(tcprexmtthresh, 2468 ((sack.end - cur->end)/tp->t_maxseg)); 2469 if (cur->dups < 1) 2470 cur->dups = 1; 2471 continue; /* with next sack block */ 2472 } 2473 /* Go thru list of holes: p = previous, cur = current */ 2474 p = cur = tp->snd_holes; 2475 while (cur) { 2476 if (SEQ_LEQ(sack.end, cur->start)) 2477 /* SACKs data before the current hole */ 2478 break; /* no use going through more holes */ 2479 if (SEQ_GEQ(sack.start, cur->end)) { 2480 /* SACKs data beyond the current hole */ 2481 cur->dups++; 2482 if (((sack.end - cur->end)/tp->t_maxseg) >= 2483 tcprexmtthresh) 2484 cur->dups = tcprexmtthresh; 2485 p = cur; 2486 cur = cur->next; 2487 continue; 2488 } 2489 if (SEQ_LEQ(sack.start, cur->start)) { 2490 /* Data acks at least the beginning of hole */ 2491 if (SEQ_GEQ(sack.end, cur->end)) { 2492 /* Acks entire hole, so delete hole */ 2493 if (p != cur) { 2494 p->next = cur->next; 2495 pool_put(&sackhl_pool, cur); 2496 cur = p->next; 2497 } else { 2498 cur = cur->next; 2499 pool_put(&sackhl_pool, p); 2500 p = cur; 2501 tp->snd_holes = p; 2502 } 2503 tp->snd_numholes--; 2504 continue; 2505 } 2506 /* otherwise, move start of hole forward */ 2507 cur->start = sack.end; 2508 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 2509 p = cur; 2510 cur = cur->next; 2511 continue; 2512 } 2513 /* move end of hole backward */ 2514 if (SEQ_GEQ(sack.end, cur->end)) { 2515 cur->end = sack.start; 2516 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2517 cur->dups++; 2518 if (((sack.end - cur->end)/tp->t_maxseg) >= 2519 tcprexmtthresh) 2520 cur->dups = tcprexmtthresh; 2521 p = cur; 2522 cur = cur->next; 2523 continue; 2524 } 2525 if (SEQ_LT(cur->start, sack.start) && 2526 SEQ_GT(cur->end, sack.end)) { 2527 /* 2528 * ACKs some data in middle of a hole; need to 2529 * split current hole 2530 */ 2531 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2532 goto dropped; 2533 temp = (struct sackhole *) 2534 pool_get(&sackhl_pool, PR_NOWAIT); 2535 if (temp == NULL) 2536 goto dropped; /* ENOBUFS */ 2537 temp->next = cur->next; 2538 temp->start = sack.end; 2539 temp->end = cur->end; 2540 temp->dups = cur->dups; 2541 temp->rxmit = SEQ_MAX(cur->rxmit, temp->start); 2542 cur->end = sack.start; 2543 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 2544 cur->dups++; 2545 if (((sack.end - cur->end)/tp->t_maxseg) >= 2546 tcprexmtthresh) 2547 cur->dups = tcprexmtthresh; 2548 cur->next = temp; 2549 p = temp; 2550 cur = p->next; 2551 tp->snd_numholes++; 2552 } 2553 } 2554 /* At this point, p points to the last hole on the list */ 2555 if (SEQ_LT(tp->rcv_lastsack, sack.start)) { 2556 /* 2557 * Need to append new hole at end. 2558 * Last hole is p (and it's not NULL). 2559 */ 2560 if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT) 2561 goto dropped; 2562 temp = (struct sackhole *) 2563 pool_get(&sackhl_pool, PR_NOWAIT); 2564 if (temp == NULL) 2565 goto dropped; /* ENOBUFS */ 2566 temp->start = tp->rcv_lastsack; 2567 temp->end = sack.start; 2568 temp->dups = min(tcprexmtthresh, 2569 ((sack.end - sack.start)/tp->t_maxseg)); 2570 if (temp->dups < 1) 2571 temp->dups = 1; 2572 temp->rxmit = temp->start; 2573 temp->next = 0; 2574 p->next = temp; 2575 tp->rcv_lastsack = sack.end; 2576 tp->snd_numholes++; 2577 } 2578 } 2579 return; 2580 dropped: 2581 tcpstat_inc(tcps_sack_drop_opts); 2582 } 2583 2584 /* 2585 * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if 2586 * it is completely acked; otherwise, tcp_sack_option(), called from 2587 * tcp_dooptions(), will fix up the hole. 2588 */ 2589 void 2590 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th) 2591 { 2592 if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { 2593 /* max because this could be an older ack just arrived */ 2594 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 2595 th->th_ack : tp->snd_una; 2596 struct sackhole *cur = tp->snd_holes; 2597 struct sackhole *prev; 2598 while (cur) 2599 if (SEQ_LEQ(cur->end, lastack)) { 2600 prev = cur; 2601 cur = cur->next; 2602 pool_put(&sackhl_pool, prev); 2603 tp->snd_numholes--; 2604 } else if (SEQ_LT(cur->start, lastack)) { 2605 cur->start = lastack; 2606 if (SEQ_LT(cur->rxmit, cur->start)) 2607 cur->rxmit = cur->start; 2608 break; 2609 } else 2610 break; 2611 tp->snd_holes = cur; 2612 } 2613 } 2614 2615 /* 2616 * Delete all receiver-side SACK information. 2617 */ 2618 void 2619 tcp_clean_sackreport(struct tcpcb *tp) 2620 { 2621 int i; 2622 2623 tp->rcv_numsacks = 0; 2624 for (i = 0; i < MAX_SACK_BLKS; i++) 2625 tp->sackblks[i].start = tp->sackblks[i].end=0; 2626 2627 } 2628 2629 /* 2630 * Partial ack handling within a sack recovery episode. When a partial ack 2631 * arrives, turn off retransmission timer, deflate the window, do not clear 2632 * tp->t_dupacks. 2633 */ 2634 void 2635 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) 2636 { 2637 /* Turn off retx. timer (will start again next segment) */ 2638 TCP_TIMER_DISARM(tp, TCPT_REXMT); 2639 tp->t_rtttime = 0; 2640 /* 2641 * Partial window deflation. This statement relies on the 2642 * fact that tp->snd_una has not been updated yet. 2643 */ 2644 if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { 2645 tp->snd_cwnd -= th->th_ack - tp->snd_una; 2646 tp->snd_cwnd += tp->t_maxseg; 2647 } else 2648 tp->snd_cwnd = tp->t_maxseg; 2649 tp->snd_cwnd += tp->t_maxseg; 2650 tp->t_flags |= TF_NEEDOUTPUT; 2651 } 2652 2653 /* 2654 * Pull out of band byte out of a segment so 2655 * it doesn't appear in the user's data queue. 2656 * It is still reflected in the segment length for 2657 * sequencing purposes. 2658 */ 2659 void 2660 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off) 2661 { 2662 int cnt = off + urgent - 1; 2663 2664 while (cnt >= 0) { 2665 if (m->m_len > cnt) { 2666 char *cp = mtod(m, caddr_t) + cnt; 2667 struct tcpcb *tp = sototcpcb(so); 2668 2669 tp->t_iobc = *cp; 2670 tp->t_oobflags |= TCPOOB_HAVEDATA; 2671 memmove(cp, cp + 1, m->m_len - cnt - 1); 2672 m->m_len--; 2673 return; 2674 } 2675 cnt -= m->m_len; 2676 m = m->m_next; 2677 if (m == NULL) 2678 break; 2679 } 2680 panic("tcp_pulloutofband"); 2681 } 2682 2683 /* 2684 * Collect new round-trip time estimate 2685 * and update averages and current timeout. 2686 */ 2687 void 2688 tcp_xmit_timer(struct tcpcb *tp, int32_t rtt) 2689 { 2690 int delta, rttmin; 2691 2692 if (rtt < 0) 2693 rtt = 0; 2694 else if (rtt > TCP_RTT_MAX) 2695 rtt = TCP_RTT_MAX; 2696 2697 tcpstat_inc(tcps_rttupdated); 2698 if (tp->t_srtt != 0) { 2699 /* 2700 * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits 2701 * after the binary point (scaled by 4), whereas 2702 * srtt is stored as fixed point with 5 bits after the 2703 * binary point (i.e., scaled by 32). The following magic 2704 * is equivalent to the smoothing algorithm in rfc793 with 2705 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 2706 * point). 2707 */ 2708 delta = (rtt << TCP_RTT_BASE_SHIFT) - 2709 (tp->t_srtt >> TCP_RTT_SHIFT); 2710 if ((tp->t_srtt += delta) <= 0) 2711 tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT; 2712 /* 2713 * We accumulate a smoothed rtt variance (actually, a 2714 * smoothed mean difference), then set the retransmit 2715 * timer to smoothed rtt + 4 times the smoothed variance. 2716 * rttvar is stored as fixed point with 4 bits after the 2717 * binary point (scaled by 16). The following is 2718 * equivalent to rfc793 smoothing with an alpha of .75 2719 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 2720 * rfc793's wired-in beta. 2721 */ 2722 if (delta < 0) 2723 delta = -delta; 2724 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 2725 if ((tp->t_rttvar += delta) <= 0) 2726 tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT; 2727 } else { 2728 /* 2729 * No rtt measurement yet - use the unsmoothed rtt. 2730 * Set the variance to half the rtt (so our first 2731 * retransmit happens at 3*rtt). 2732 */ 2733 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 2734 tp->t_rttvar = (rtt + 1) << 2735 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 2736 } 2737 tp->t_rtttime = 0; 2738 tp->t_rxtshift = 0; 2739 2740 /* 2741 * the retransmit should happen at rtt + 4 * rttvar. 2742 * Because of the way we do the smoothing, srtt and rttvar 2743 * will each average +1/2 tick of bias. When we compute 2744 * the retransmit timer, we want 1/2 tick of rounding and 2745 * 1 extra tick because of +-1/2 tick uncertainty in the 2746 * firing of the timer. The bias will give us exactly the 2747 * 1.5 tick we need. But, because the bias is 2748 * statistical, we have to test that we don't drop below 2749 * the minimum feasible timer (which is 2 ticks). 2750 */ 2751 rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)), 2752 TCPTV_REXMTMAX); 2753 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX); 2754 2755 /* 2756 * We received an ack for a packet that wasn't retransmitted; 2757 * it is probably safe to discard any error indications we've 2758 * received recently. This isn't quite right, but close enough 2759 * for now (a route might have failed after we sent a segment, 2760 * and the return path might not be symmetrical). 2761 */ 2762 tp->t_softerror = 0; 2763 } 2764 2765 /* 2766 * Determine a reasonable value for maxseg size. 2767 * If the route is known, check route for mtu. 2768 * If none, use an mss that can be handled on the outgoing 2769 * interface without forcing IP to fragment; if bigger than 2770 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES 2771 * to utilize large mbufs. If no route is found, route has no mtu, 2772 * or the destination isn't local, use a default, hopefully conservative 2773 * size (usually 512 or the default IP max size, but no more than the mtu 2774 * of the interface), as we can't discover anything about intervening 2775 * gateways or networks. We also initialize the congestion/slow start 2776 * window to be a single segment if the destination isn't local. 2777 * While looking at the routing entry, we also initialize other path-dependent 2778 * parameters from pre-set or cached values in the routing entry. 2779 * 2780 * Also take into account the space needed for options that we 2781 * send regularly. Make maxseg shorter by that amount to assure 2782 * that we can send maxseg amount of data even when the options 2783 * are present. Store the upper limit of the length of options plus 2784 * data in maxopd. 2785 * 2786 * NOTE: offer == -1 indicates that the maxseg size changed due to 2787 * Path MTU discovery. 2788 */ 2789 int 2790 tcp_mss(struct tcpcb *tp, int offer) 2791 { 2792 struct rtentry *rt; 2793 struct ifnet *ifp = NULL; 2794 int mss, mssopt; 2795 int iphlen; 2796 struct inpcb *inp; 2797 2798 inp = tp->t_inpcb; 2799 2800 mssopt = mss = tcp_mssdflt; 2801 2802 rt = in_pcbrtentry(inp); 2803 2804 if (rt == NULL) 2805 goto out; 2806 2807 ifp = if_get(rt->rt_ifidx); 2808 if (ifp == NULL) 2809 goto out; 2810 2811 switch (tp->pf) { 2812 #ifdef INET6 2813 case AF_INET6: 2814 iphlen = sizeof(struct ip6_hdr); 2815 break; 2816 #endif 2817 case AF_INET: 2818 iphlen = sizeof(struct ip); 2819 break; 2820 default: 2821 /* the family does not support path MTU discovery */ 2822 goto out; 2823 } 2824 2825 /* 2826 * if there's an mtu associated with the route and we support 2827 * path MTU discovery for the underlying protocol family, use it. 2828 */ 2829 if (rt->rt_mtu) { 2830 /* 2831 * One may wish to lower MSS to take into account options, 2832 * especially security-related options. 2833 */ 2834 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) { 2835 /* 2836 * RFC2460 section 5, last paragraph: if path MTU is 2837 * smaller than 1280, use 1280 as packet size and 2838 * attach fragment header. 2839 */ 2840 mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) - 2841 sizeof(struct tcphdr); 2842 } else { 2843 mss = rt->rt_mtu - iphlen - 2844 sizeof(struct tcphdr); 2845 } 2846 } else if (ifp->if_flags & IFF_LOOPBACK) { 2847 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2848 } else if (tp->pf == AF_INET) { 2849 if (ip_mtudisc) 2850 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2851 } 2852 #ifdef INET6 2853 else if (tp->pf == AF_INET6) { 2854 /* 2855 * for IPv6, path MTU discovery is always turned on, 2856 * or the node must use packet size <= 1280. 2857 */ 2858 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2859 } 2860 #endif /* INET6 */ 2861 2862 /* Calculate the value that we offer in TCPOPT_MAXSEG */ 2863 if (offer != -1) { 2864 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr); 2865 mssopt = max(tcp_mssdflt, mssopt); 2866 } 2867 out: 2868 if_put(ifp); 2869 /* 2870 * The current mss, t_maxseg, is initialized to the default value. 2871 * If we compute a smaller value, reduce the current mss. 2872 * If we compute a larger value, return it for use in sending 2873 * a max seg size option, but don't store it for use 2874 * unless we received an offer at least that large from peer. 2875 * 2876 * However, do not accept offers lower than the minimum of 2877 * the interface MTU and 216. 2878 */ 2879 if (offer > 0) 2880 tp->t_peermss = offer; 2881 if (tp->t_peermss) 2882 mss = min(mss, max(tp->t_peermss, 216)); 2883 2884 /* sanity - at least max opt. space */ 2885 mss = max(mss, 64); 2886 2887 /* 2888 * maxopd stores the maximum length of data AND options 2889 * in a segment; maxseg is the amount of data in a normal 2890 * segment. We need to store this value (maxopd) apart 2891 * from maxseg, because now every segment carries options 2892 * and thus we normally have somewhat less data in segments. 2893 */ 2894 tp->t_maxopd = mss; 2895 2896 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2897 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2898 mss -= TCPOLEN_TSTAMP_APPA; 2899 #ifdef TCP_SIGNATURE 2900 if (tp->t_flags & TF_SIGNATURE) 2901 mss -= TCPOLEN_SIGLEN; 2902 #endif 2903 2904 if (offer == -1) { 2905 /* mss changed due to Path MTU discovery */ 2906 tp->t_flags &= ~TF_PMTUD_PEND; 2907 tp->t_pmtud_mtu_sent = 0; 2908 tp->t_pmtud_mss_acked = 0; 2909 if (mss < tp->t_maxseg) { 2910 /* 2911 * Follow suggestion in RFC 2414 to reduce the 2912 * congestion window by the ratio of the old 2913 * segment size to the new segment size. 2914 */ 2915 tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) * 2916 mss, mss); 2917 } 2918 } else if (tcp_do_rfc3390 == 2) { 2919 /* increase initial window */ 2920 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600)); 2921 } else if (tcp_do_rfc3390) { 2922 /* increase initial window */ 2923 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380)); 2924 } else 2925 tp->snd_cwnd = mss; 2926 2927 tp->t_maxseg = mss; 2928 2929 return (offer != -1 ? mssopt : mss); 2930 } 2931 2932 u_int 2933 tcp_hdrsz(struct tcpcb *tp) 2934 { 2935 u_int hlen; 2936 2937 switch (tp->pf) { 2938 #ifdef INET6 2939 case AF_INET6: 2940 hlen = sizeof(struct ip6_hdr); 2941 break; 2942 #endif 2943 case AF_INET: 2944 hlen = sizeof(struct ip); 2945 break; 2946 default: 2947 hlen = 0; 2948 break; 2949 } 2950 hlen += sizeof(struct tcphdr); 2951 2952 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 2953 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 2954 hlen += TCPOLEN_TSTAMP_APPA; 2955 #ifdef TCP_SIGNATURE 2956 if (tp->t_flags & TF_SIGNATURE) 2957 hlen += TCPOLEN_SIGLEN; 2958 #endif 2959 return (hlen); 2960 } 2961 2962 /* 2963 * Set connection variables based on the effective MSS. 2964 * We are passed the TCPCB for the actual connection. If we 2965 * are the server, we are called by the compressed state engine 2966 * when the 3-way handshake is complete. If we are the client, 2967 * we are called when we receive the SYN,ACK from the server. 2968 * 2969 * NOTE: The t_maxseg value must be initialized in the TCPCB 2970 * before this routine is called! 2971 */ 2972 void 2973 tcp_mss_update(struct tcpcb *tp) 2974 { 2975 int mss; 2976 u_long bufsize; 2977 struct rtentry *rt; 2978 struct socket *so; 2979 2980 so = tp->t_inpcb->inp_socket; 2981 mss = tp->t_maxseg; 2982 2983 rt = in_pcbrtentry(tp->t_inpcb); 2984 2985 if (rt == NULL) 2986 return; 2987 2988 bufsize = so->so_snd.sb_hiwat; 2989 if (bufsize < mss) { 2990 mss = bufsize; 2991 /* Update t_maxseg and t_maxopd */ 2992 tcp_mss(tp, mss); 2993 } else { 2994 bufsize = roundup(bufsize, mss); 2995 if (bufsize > sb_max) 2996 bufsize = sb_max; 2997 (void)sbreserve(so, &so->so_snd, bufsize); 2998 } 2999 3000 bufsize = so->so_rcv.sb_hiwat; 3001 if (bufsize > mss) { 3002 bufsize = roundup(bufsize, mss); 3003 if (bufsize > sb_max) 3004 bufsize = sb_max; 3005 (void)sbreserve(so, &so->so_rcv, bufsize); 3006 } 3007 3008 } 3009 3010 /* 3011 * When a partial ack arrives, force the retransmission of the 3012 * next unacknowledged segment. Do not clear tp->t_dupacks. 3013 * By setting snd_nxt to ti_ack, this forces retransmission timer 3014 * to be started again. 3015 */ 3016 void 3017 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th) 3018 { 3019 /* 3020 * snd_una has not been updated and the socket send buffer 3021 * not yet drained of the acked data, so we have to leave 3022 * snd_una as it was to get the correct data offset in 3023 * tcp_output(). 3024 */ 3025 tcp_seq onxt = tp->snd_nxt; 3026 u_long ocwnd = tp->snd_cwnd; 3027 3028 TCP_TIMER_DISARM(tp, TCPT_REXMT); 3029 tp->t_rtttime = 0; 3030 tp->snd_nxt = th->th_ack; 3031 /* 3032 * Set snd_cwnd to one segment beyond acknowledged offset 3033 * (tp->snd_una not yet updated when this function is called) 3034 */ 3035 tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); 3036 (void)tcp_output(tp); 3037 tp->snd_cwnd = ocwnd; 3038 if (SEQ_GT(onxt, tp->snd_nxt)) 3039 tp->snd_nxt = onxt; 3040 /* 3041 * Partial window deflation. Relies on fact that tp->snd_una 3042 * not updated yet. 3043 */ 3044 if (tp->snd_cwnd > th->th_ack - tp->snd_una) 3045 tp->snd_cwnd -= th->th_ack - tp->snd_una; 3046 else 3047 tp->snd_cwnd = 0; 3048 tp->snd_cwnd += tp->t_maxseg; 3049 } 3050 3051 int 3052 tcp_mss_adv(struct mbuf *m, int af) 3053 { 3054 int mss = 0; 3055 int iphlen; 3056 struct ifnet *ifp = NULL; 3057 3058 if (m && (m->m_flags & M_PKTHDR)) 3059 ifp = if_get(m->m_pkthdr.ph_ifidx); 3060 3061 switch (af) { 3062 case AF_INET: 3063 if (ifp != NULL) 3064 mss = ifp->if_mtu; 3065 iphlen = sizeof(struct ip); 3066 break; 3067 #ifdef INET6 3068 case AF_INET6: 3069 if (ifp != NULL) 3070 mss = ifp->if_mtu; 3071 iphlen = sizeof(struct ip6_hdr); 3072 break; 3073 #endif 3074 default: 3075 unhandled_af(af); 3076 } 3077 if_put(ifp); 3078 mss = mss - iphlen - sizeof(struct tcphdr); 3079 return (max(mss, tcp_mssdflt)); 3080 } 3081 3082 /* 3083 * TCP compressed state engine. Currently used to hold compressed 3084 * state for SYN_RECEIVED. 3085 */ 3086 3087 /* 3088 * Locks used to protect global data and struct members: 3089 * N net lock 3090 * S syn_cache_mtx tcp syn cache global mutex 3091 */ 3092 3093 /* syn hash parameters */ 3094 int tcp_syn_hash_size = TCP_SYN_HASH_SIZE; /* [N] size of hash table */ 3095 int tcp_syn_cache_limit = /* [N] global entry limit */ 3096 TCP_SYN_HASH_SIZE * TCP_SYN_BUCKET_SIZE; 3097 int tcp_syn_bucket_limit = /* [N] per bucket limit */ 3098 3 * TCP_SYN_BUCKET_SIZE; 3099 int tcp_syn_use_limit = 100000; /* [N] reseed after uses */ 3100 3101 struct pool syn_cache_pool; 3102 struct syn_cache_set tcp_syn_cache[2]; 3103 int tcp_syn_cache_active; 3104 struct mutex syn_cache_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); 3105 3106 #define SYN_HASH(sa, sp, dp, rand) \ 3107 (((sa)->s_addr ^ (rand)[0]) * \ 3108 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3109 #ifndef INET6 3110 #define SYN_HASHALL(hash, src, dst, rand) \ 3111 do { \ 3112 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3113 satosin(src)->sin_port, \ 3114 satosin(dst)->sin_port, (rand)); \ 3115 } while (/*CONSTCOND*/ 0) 3116 #else 3117 #define SYN_HASH6(sa, sp, dp, rand) \ 3118 (((sa)->s6_addr32[0] ^ (rand)[0]) * \ 3119 ((sa)->s6_addr32[1] ^ (rand)[1]) * \ 3120 ((sa)->s6_addr32[2] ^ (rand)[2]) * \ 3121 ((sa)->s6_addr32[3] ^ (rand)[3]) * \ 3122 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4])) 3123 3124 #define SYN_HASHALL(hash, src, dst, rand) \ 3125 do { \ 3126 switch ((src)->sa_family) { \ 3127 case AF_INET: \ 3128 hash = SYN_HASH(&satosin(src)->sin_addr, \ 3129 satosin(src)->sin_port, \ 3130 satosin(dst)->sin_port, (rand)); \ 3131 break; \ 3132 case AF_INET6: \ 3133 hash = SYN_HASH6(&satosin6(src)->sin6_addr, \ 3134 satosin6(src)->sin6_port, \ 3135 satosin6(dst)->sin6_port, (rand)); \ 3136 break; \ 3137 default: \ 3138 hash = 0; \ 3139 } \ 3140 } while (/*CONSTCOND*/0) 3141 #endif /* INET6 */ 3142 3143 void 3144 syn_cache_rm(struct syn_cache *sc) 3145 { 3146 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3147 3148 KASSERT(!ISSET(sc->sc_dynflags, SCF_DEAD)); 3149 SET(sc->sc_dynflags, SCF_DEAD); 3150 TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq); 3151 sc->sc_tp = NULL; 3152 LIST_REMOVE(sc, sc_tpq); 3153 refcnt_rele(&sc->sc_refcnt); 3154 sc->sc_buckethead->sch_length--; 3155 if (timeout_del(&sc->sc_timer)) 3156 refcnt_rele(&sc->sc_refcnt); 3157 sc->sc_set->scs_count--; 3158 } 3159 3160 void 3161 syn_cache_put(struct syn_cache *sc) 3162 { 3163 if (refcnt_rele(&sc->sc_refcnt) == 0) 3164 return; 3165 3166 /* Dealing with last reference, no lock needed. */ 3167 m_free(sc->sc_ipopts); 3168 rtfree(sc->sc_route4.ro_rt); 3169 3170 pool_put(&syn_cache_pool, sc); 3171 } 3172 3173 void 3174 syn_cache_init(void) 3175 { 3176 int i; 3177 3178 /* Initialize the hash buckets. */ 3179 tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size, 3180 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3181 tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size, 3182 sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO); 3183 tcp_syn_cache[0].scs_size = tcp_syn_hash_size; 3184 tcp_syn_cache[1].scs_size = tcp_syn_hash_size; 3185 for (i = 0; i < tcp_syn_hash_size; i++) { 3186 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket); 3187 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket); 3188 } 3189 3190 /* Initialize the syn cache pool. */ 3191 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET, 3192 0, "syncache", NULL); 3193 } 3194 3195 void 3196 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp) 3197 { 3198 struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active]; 3199 struct syn_cache_head *scp; 3200 struct syn_cache *sc2; 3201 int i; 3202 3203 NET_ASSERT_LOCKED(); 3204 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3205 3206 /* 3207 * If there are no entries in the hash table, reinitialize 3208 * the hash secrets. To avoid useless cache swaps and 3209 * reinitialization, use it until the limit is reached. 3210 * An empty cache is also the opportunity to resize the hash. 3211 */ 3212 if (set->scs_count == 0 && set->scs_use <= 0) { 3213 set->scs_use = tcp_syn_use_limit; 3214 if (set->scs_size != tcp_syn_hash_size) { 3215 scp = mallocarray(tcp_syn_hash_size, sizeof(struct 3216 syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO); 3217 if (scp == NULL) { 3218 /* Try again next time. */ 3219 set->scs_use = 0; 3220 } else { 3221 free(set->scs_buckethead, M_SYNCACHE, 3222 set->scs_size * 3223 sizeof(struct syn_cache_head)); 3224 set->scs_buckethead = scp; 3225 set->scs_size = tcp_syn_hash_size; 3226 for (i = 0; i < tcp_syn_hash_size; i++) 3227 TAILQ_INIT(&scp[i].sch_bucket); 3228 } 3229 } 3230 arc4random_buf(set->scs_random, sizeof(set->scs_random)); 3231 tcpstat_inc(tcps_sc_seedrandom); 3232 } 3233 3234 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa, 3235 set->scs_random); 3236 scp = &set->scs_buckethead[sc->sc_hash % set->scs_size]; 3237 sc->sc_buckethead = scp; 3238 3239 /* 3240 * Make sure that we don't overflow the per-bucket 3241 * limit or the total cache size limit. 3242 */ 3243 if (scp->sch_length >= tcp_syn_bucket_limit) { 3244 tcpstat_inc(tcps_sc_bucketoverflow); 3245 /* 3246 * Someone might attack our bucket hash function. Reseed 3247 * with random as soon as the passive syn cache gets empty. 3248 */ 3249 set->scs_use = 0; 3250 /* 3251 * The bucket is full. Toss the oldest element in the 3252 * bucket. This will be the first entry in the bucket. 3253 */ 3254 sc2 = TAILQ_FIRST(&scp->sch_bucket); 3255 #ifdef DIAGNOSTIC 3256 /* 3257 * This should never happen; we should always find an 3258 * entry in our bucket. 3259 */ 3260 if (sc2 == NULL) 3261 panic("%s: bucketoverflow: impossible", __func__); 3262 #endif 3263 syn_cache_rm(sc2); 3264 syn_cache_put(sc2); 3265 } else if (set->scs_count >= tcp_syn_cache_limit) { 3266 struct syn_cache_head *scp2, *sce; 3267 3268 tcpstat_inc(tcps_sc_overflowed); 3269 /* 3270 * The cache is full. Toss the oldest entry in the 3271 * first non-empty bucket we can find. 3272 * 3273 * XXX We would really like to toss the oldest 3274 * entry in the cache, but we hope that this 3275 * condition doesn't happen very often. 3276 */ 3277 scp2 = scp; 3278 if (TAILQ_EMPTY(&scp2->sch_bucket)) { 3279 sce = &set->scs_buckethead[set->scs_size]; 3280 for (++scp2; scp2 != scp; scp2++) { 3281 if (scp2 >= sce) 3282 scp2 = &set->scs_buckethead[0]; 3283 if (! TAILQ_EMPTY(&scp2->sch_bucket)) 3284 break; 3285 } 3286 #ifdef DIAGNOSTIC 3287 /* 3288 * This should never happen; we should always find a 3289 * non-empty bucket. 3290 */ 3291 if (scp2 == scp) 3292 panic("%s: cacheoverflow: impossible", 3293 __func__); 3294 #endif 3295 } 3296 sc2 = TAILQ_FIRST(&scp2->sch_bucket); 3297 syn_cache_rm(sc2); 3298 syn_cache_put(sc2); 3299 } 3300 3301 /* 3302 * Initialize the entry's timer. We don't estimate RTT 3303 * with SYNs, so each packet starts with the default RTT 3304 * and each timer step has a fixed timeout value. 3305 */ 3306 sc->sc_rxttot = 0; 3307 sc->sc_rxtshift = 0; 3308 TCPT_RANGESET(sc->sc_rxtcur, 3309 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3310 TCPTV_REXMTMAX); 3311 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3312 refcnt_take(&sc->sc_refcnt); 3313 3314 /* Link it from tcpcb entry */ 3315 refcnt_take(&sc->sc_refcnt); 3316 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq); 3317 3318 /* Put it into the bucket. */ 3319 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq); 3320 scp->sch_length++; 3321 sc->sc_set = set; 3322 set->scs_count++; 3323 set->scs_use--; 3324 3325 tcpstat_inc(tcps_sc_added); 3326 3327 /* 3328 * If the active cache has exceeded its use limit and 3329 * the passive syn cache is empty, exchange their roles. 3330 */ 3331 if (set->scs_use <= 0 && 3332 tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0) 3333 tcp_syn_cache_active = !tcp_syn_cache_active; 3334 } 3335 3336 /* 3337 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 3338 * If we have retransmitted an entry the maximum number of times, expire 3339 * that entry. 3340 */ 3341 void 3342 syn_cache_timer(void *arg) 3343 { 3344 struct syn_cache *sc = arg; 3345 uint64_t now; 3346 int lastref; 3347 3348 mtx_enter(&syn_cache_mtx); 3349 if (ISSET(sc->sc_dynflags, SCF_DEAD)) 3350 goto freeit; 3351 3352 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) { 3353 /* Drop it -- too many retransmissions. */ 3354 goto dropit; 3355 } 3356 3357 /* 3358 * Compute the total amount of time this entry has 3359 * been on a queue. If this entry has been on longer 3360 * than the keep alive timer would allow, expire it. 3361 */ 3362 sc->sc_rxttot += sc->sc_rxtcur; 3363 if (sc->sc_rxttot >= READ_ONCE(tcptv_keep_init)) 3364 goto dropit; 3365 3366 /* Advance the timer back-off. */ 3367 sc->sc_rxtshift++; 3368 TCPT_RANGESET(sc->sc_rxtcur, 3369 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN, 3370 TCPTV_REXMTMAX); 3371 if (timeout_add_msec(&sc->sc_timer, sc->sc_rxtcur)) 3372 refcnt_take(&sc->sc_refcnt); 3373 mtx_leave(&syn_cache_mtx); 3374 3375 NET_LOCK(); 3376 now = tcp_now(); 3377 (void) syn_cache_respond(sc, NULL, now); 3378 tcpstat_inc(tcps_sc_retransmitted); 3379 NET_UNLOCK(); 3380 3381 syn_cache_put(sc); 3382 return; 3383 3384 dropit: 3385 tcpstat_inc(tcps_sc_timed_out); 3386 syn_cache_rm(sc); 3387 /* Decrement reference of the timer and free object after remove. */ 3388 lastref = refcnt_rele(&sc->sc_refcnt); 3389 KASSERT(lastref == 0); 3390 (void)lastref; 3391 freeit: 3392 mtx_leave(&syn_cache_mtx); 3393 syn_cache_put(sc); 3394 } 3395 3396 /* 3397 * Remove syn cache created by the specified tcb entry, 3398 * because this does not make sense to keep them 3399 * (if there's no tcb entry, syn cache entry will never be used) 3400 */ 3401 void 3402 syn_cache_cleanup(struct tcpcb *tp) 3403 { 3404 struct syn_cache *sc, *nsc; 3405 3406 NET_ASSERT_LOCKED(); 3407 3408 mtx_enter(&syn_cache_mtx); 3409 LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) { 3410 #ifdef DIAGNOSTIC 3411 if (sc->sc_tp != tp) 3412 panic("invalid sc_tp in syn_cache_cleanup"); 3413 #endif 3414 syn_cache_rm(sc); 3415 syn_cache_put(sc); 3416 } 3417 mtx_leave(&syn_cache_mtx); 3418 3419 KASSERT(LIST_EMPTY(&tp->t_sc)); 3420 } 3421 3422 /* 3423 * Find an entry in the syn cache. 3424 */ 3425 struct syn_cache * 3426 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst, 3427 struct syn_cache_head **headp, u_int rtableid) 3428 { 3429 struct syn_cache_set *sets[2]; 3430 struct syn_cache *sc; 3431 struct syn_cache_head *scp; 3432 u_int32_t hash; 3433 int i; 3434 3435 NET_ASSERT_LOCKED(); 3436 MUTEX_ASSERT_LOCKED(&syn_cache_mtx); 3437 3438 /* Check the active cache first, the passive cache is likely empty. */ 3439 sets[0] = &tcp_syn_cache[tcp_syn_cache_active]; 3440 sets[1] = &tcp_syn_cache[!tcp_syn_cache_active]; 3441 for (i = 0; i < 2; i++) { 3442 if (sets[i]->scs_count == 0) 3443 continue; 3444 SYN_HASHALL(hash, src, dst, sets[i]->scs_random); 3445 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size]; 3446 *headp = scp; 3447 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) { 3448 if (sc->sc_hash != hash) 3449 continue; 3450 if (!bcmp(&sc->sc_src, src, src->sa_len) && 3451 !bcmp(&sc->sc_dst, dst, dst->sa_len) && 3452 rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid)) 3453 return (sc); 3454 } 3455 } 3456 return (NULL); 3457 } 3458 3459 /* 3460 * This function gets called when we receive an ACK for a 3461 * socket in the LISTEN state. We look up the connection 3462 * in the syn cache, and if its there, we pull it out of 3463 * the cache and turn it into a full-blown connection in 3464 * the SYN-RECEIVED state. 3465 * 3466 * The return values may not be immediately obvious, and their effects 3467 * can be subtle, so here they are: 3468 * 3469 * NULL SYN was not found in cache; caller should drop the 3470 * packet and send an RST. 3471 * 3472 * -1 We were unable to create the new connection, and are 3473 * aborting it. An ACK,RST is being sent to the peer 3474 * (unless we got screwy sequence numbers; see below), 3475 * because the 3-way handshake has been completed. Caller 3476 * should not free the mbuf, since we may be using it. If 3477 * we are not, we will free it. 3478 * 3479 * Otherwise, the return value is a pointer to the new socket 3480 * associated with the connection. 3481 */ 3482 struct socket * 3483 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3484 u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint64_t now) 3485 { 3486 struct syn_cache *sc; 3487 struct syn_cache_head *scp; 3488 struct inpcb *inp, *oldinp; 3489 struct tcpcb *tp = NULL; 3490 struct mbuf *am; 3491 struct socket *oso; 3492 u_int rtableid; 3493 3494 NET_ASSERT_LOCKED(); 3495 3496 mtx_enter(&syn_cache_mtx); 3497 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3498 if (sc == NULL) { 3499 mtx_leave(&syn_cache_mtx); 3500 return (NULL); 3501 } 3502 3503 /* 3504 * Verify the sequence and ack numbers. Try getting the correct 3505 * response again. 3506 */ 3507 if ((th->th_ack != sc->sc_iss + 1) || 3508 SEQ_LEQ(th->th_seq, sc->sc_irs) || 3509 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) { 3510 refcnt_take(&sc->sc_refcnt); 3511 mtx_leave(&syn_cache_mtx); 3512 (void) syn_cache_respond(sc, m, now); 3513 syn_cache_put(sc); 3514 return ((struct socket *)(-1)); 3515 } 3516 3517 /* Remove this cache entry */ 3518 syn_cache_rm(sc); 3519 mtx_leave(&syn_cache_mtx); 3520 3521 /* 3522 * Ok, create the full blown connection, and set things up 3523 * as they would have been set up if we had created the 3524 * connection when the SYN arrived. If we can't create 3525 * the connection, abort it. 3526 */ 3527 oso = so; 3528 so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT); 3529 if (so == NULL) 3530 goto resetandabort; 3531 3532 oldinp = sotoinpcb(oso); 3533 inp = sotoinpcb(so); 3534 3535 #ifdef IPSEC 3536 /* 3537 * We need to copy the required security levels 3538 * from the old pcb. Ditto for any other 3539 * IPsec-related information. 3540 */ 3541 memcpy(inp->inp_seclevel, oldinp->inp_seclevel, 3542 sizeof(oldinp->inp_seclevel)); 3543 #endif /* IPSEC */ 3544 #ifdef INET6 3545 /* 3546 * inp still has the OLD in_pcb stuff, set the 3547 * v6-related flags on the new guy, too. 3548 */ 3549 inp->inp_flags |= (oldinp->inp_flags & INP_IPV6); 3550 if (inp->inp_flags & INP_IPV6) { 3551 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim; 3552 inp->inp_hops = oldinp->inp_hops; 3553 } else 3554 #endif /* INET6 */ 3555 { 3556 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl; 3557 inp->inp_options = ip_srcroute(m); 3558 if (inp->inp_options == NULL) { 3559 inp->inp_options = sc->sc_ipopts; 3560 sc->sc_ipopts = NULL; 3561 } 3562 } 3563 3564 /* inherit rtable from listening socket */ 3565 rtableid = sc->sc_rtableid; 3566 #if NPF > 0 3567 if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 3568 struct pf_divert *divert; 3569 3570 divert = pf_find_divert(m); 3571 KASSERT(divert != NULL); 3572 rtableid = divert->rdomain; 3573 } 3574 #endif 3575 in_pcbset_laddr(inp, dst, rtableid); 3576 3577 /* 3578 * Give the new socket our cached route reference. 3579 */ 3580 if (src->sa_family == AF_INET) 3581 inp->inp_route = sc->sc_route4; /* struct assignment */ 3582 #ifdef INET6 3583 else 3584 inp->inp_route6 = sc->sc_route6; 3585 #endif 3586 sc->sc_route4.ro_rt = NULL; 3587 3588 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ 3589 if (am == NULL) 3590 goto resetandabort; 3591 am->m_len = src->sa_len; 3592 memcpy(mtod(am, caddr_t), src, src->sa_len); 3593 if (in_pcbconnect(inp, am)) { 3594 (void) m_free(am); 3595 goto resetandabort; 3596 } 3597 (void) m_free(am); 3598 3599 tp = intotcpcb(inp); 3600 tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY); 3601 if (sc->sc_request_r_scale != 15) { 3602 tp->requested_s_scale = sc->sc_requested_s_scale; 3603 tp->request_r_scale = sc->sc_request_r_scale; 3604 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 3605 } 3606 if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) 3607 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 3608 3609 tp->t_template = tcp_template(tp); 3610 if (tp->t_template == 0) { 3611 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */ 3612 so = NULL; 3613 goto abort; 3614 } 3615 tp->sack_enable = ISSET(sc->sc_fixflags, SCF_SACK_PERMIT); 3616 tp->ts_modulate = sc->sc_modulate; 3617 tp->ts_recent = sc->sc_timestamp; 3618 tp->iss = sc->sc_iss; 3619 tp->irs = sc->sc_irs; 3620 tcp_sendseqinit(tp); 3621 tp->snd_last = tp->snd_una; 3622 #ifdef TCP_ECN 3623 if (ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) { 3624 tp->t_flags |= TF_ECN_PERMIT; 3625 tcpstat_inc(tcps_ecn_accepts); 3626 } 3627 #endif 3628 if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) 3629 tp->t_flags |= TF_SACK_PERMIT; 3630 #ifdef TCP_SIGNATURE 3631 if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) 3632 tp->t_flags |= TF_SIGNATURE; 3633 #endif 3634 tcp_rcvseqinit(tp); 3635 tp->t_state = TCPS_SYN_RECEIVED; 3636 tp->t_rcvtime = now; 3637 tp->t_sndtime = now; 3638 tp->t_rcvacktime = now; 3639 tp->t_sndacktime = now; 3640 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 3641 tcpstat_inc(tcps_accepts); 3642 3643 tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */ 3644 if (sc->sc_peermaxseg) 3645 tcp_mss_update(tp); 3646 /* Reset initial window to 1 segment for retransmit */ 3647 if (READ_ONCE(sc->sc_rxtshift) > 0) 3648 tp->snd_cwnd = tp->t_maxseg; 3649 tp->snd_wl1 = sc->sc_irs; 3650 tp->rcv_up = sc->sc_irs + 1; 3651 3652 /* 3653 * This is what would have happened in tcp_output() when 3654 * the SYN,ACK was sent. 3655 */ 3656 tp->snd_up = tp->snd_una; 3657 tp->snd_max = tp->snd_nxt = tp->iss+1; 3658 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 3659 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv)) 3660 tp->rcv_adv = tp->rcv_nxt + sc->sc_win; 3661 tp->last_ack_sent = tp->rcv_nxt; 3662 3663 tcpstat_inc(tcps_sc_completed); 3664 syn_cache_put(sc); 3665 return (so); 3666 3667 resetandabort: 3668 tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST, 3669 m->m_pkthdr.ph_rtableid, now); 3670 abort: 3671 m_freem(m); 3672 if (so != NULL) 3673 soabort(so); 3674 syn_cache_put(sc); 3675 tcpstat_inc(tcps_sc_aborted); 3676 return ((struct socket *)(-1)); 3677 } 3678 3679 /* 3680 * This function is called when we get a RST for a 3681 * non-existent connection, so that we can see if the 3682 * connection is in the syn cache. If it is, zap it. 3683 */ 3684 3685 void 3686 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3687 u_int rtableid) 3688 { 3689 struct syn_cache *sc; 3690 struct syn_cache_head *scp; 3691 3692 NET_ASSERT_LOCKED(); 3693 3694 mtx_enter(&syn_cache_mtx); 3695 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3696 if (sc == NULL) { 3697 mtx_leave(&syn_cache_mtx); 3698 return; 3699 } 3700 if (SEQ_LT(th->th_seq, sc->sc_irs) || 3701 SEQ_GT(th->th_seq, sc->sc_irs + 1)) { 3702 mtx_leave(&syn_cache_mtx); 3703 return; 3704 } 3705 syn_cache_rm(sc); 3706 mtx_leave(&syn_cache_mtx); 3707 tcpstat_inc(tcps_sc_reset); 3708 syn_cache_put(sc); 3709 } 3710 3711 void 3712 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3713 u_int rtableid) 3714 { 3715 struct syn_cache *sc; 3716 struct syn_cache_head *scp; 3717 3718 NET_ASSERT_LOCKED(); 3719 3720 mtx_enter(&syn_cache_mtx); 3721 sc = syn_cache_lookup(src, dst, &scp, rtableid); 3722 if (sc == NULL) { 3723 mtx_leave(&syn_cache_mtx); 3724 return; 3725 } 3726 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 3727 if (ntohl (th->th_seq) != sc->sc_iss) { 3728 mtx_leave(&syn_cache_mtx); 3729 return; 3730 } 3731 3732 /* 3733 * If we've retransmitted 3 times and this is our second error, 3734 * we remove the entry. Otherwise, we allow it to continue on. 3735 * This prevents us from incorrectly nuking an entry during a 3736 * spurious network outage. 3737 * 3738 * See tcp_notify(). 3739 */ 3740 if (!ISSET(sc->sc_dynflags, SCF_UNREACH) || sc->sc_rxtshift < 3) { 3741 SET(sc->sc_dynflags, SCF_UNREACH); 3742 mtx_leave(&syn_cache_mtx); 3743 return; 3744 } 3745 3746 syn_cache_rm(sc); 3747 mtx_leave(&syn_cache_mtx); 3748 tcpstat_inc(tcps_sc_unreach); 3749 syn_cache_put(sc); 3750 } 3751 3752 /* 3753 * Given a LISTEN socket and an inbound SYN request, add 3754 * this to the syn cache, and send back a segment: 3755 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 3756 * to the source. 3757 * 3758 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 3759 * Doing so would require that we hold onto the data and deliver it 3760 * to the application. However, if we are the target of a SYN-flood 3761 * DoS attack, an attacker could send data which would eventually 3762 * consume all available buffer space if it were ACKed. By not ACKing 3763 * the data, we avoid this DoS scenario. 3764 */ 3765 3766 int 3767 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th, 3768 u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen, 3769 struct tcp_opt_info *oi, tcp_seq *issp, uint64_t now) 3770 { 3771 struct tcpcb tb, *tp; 3772 long win; 3773 struct syn_cache *sc; 3774 struct syn_cache_head *scp; 3775 struct mbuf *ipopts; 3776 3777 NET_ASSERT_LOCKED(); 3778 3779 tp = sototcpcb(so); 3780 3781 /* 3782 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN 3783 * 3784 * Note this check is performed in tcp_input() very early on. 3785 */ 3786 3787 /* 3788 * Initialize some local state. 3789 */ 3790 win = sbspace(so, &so->so_rcv); 3791 if (win > TCP_MAXWIN) 3792 win = TCP_MAXWIN; 3793 3794 bzero(&tb, sizeof(tb)); 3795 #ifdef TCP_SIGNATURE 3796 if (optp || (tp->t_flags & TF_SIGNATURE)) { 3797 #else 3798 if (optp) { 3799 #endif 3800 tb.pf = tp->pf; 3801 tb.sack_enable = tp->sack_enable; 3802 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 3803 #ifdef TCP_SIGNATURE 3804 if (tp->t_flags & TF_SIGNATURE) 3805 tb.t_flags |= TF_SIGNATURE; 3806 #endif 3807 tb.t_state = TCPS_LISTEN; 3808 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi, 3809 sotoinpcb(so)->inp_rtableid, now)) 3810 return (-1); 3811 } 3812 3813 switch (src->sa_family) { 3814 case AF_INET: 3815 /* 3816 * Remember the IP options, if any. 3817 */ 3818 ipopts = ip_srcroute(m); 3819 break; 3820 default: 3821 ipopts = NULL; 3822 } 3823 3824 /* 3825 * See if we already have an entry for this connection. 3826 * If we do, resend the SYN,ACK. We do not count this 3827 * as a retransmission (XXX though maybe we should). 3828 */ 3829 mtx_enter(&syn_cache_mtx); 3830 sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid); 3831 if (sc != NULL) { 3832 refcnt_take(&sc->sc_refcnt); 3833 mtx_leave(&syn_cache_mtx); 3834 tcpstat_inc(tcps_sc_dupesyn); 3835 if (ipopts) { 3836 /* 3837 * If we were remembering a previous source route, 3838 * forget it and use the new one we've been given. 3839 */ 3840 m_free(sc->sc_ipopts); 3841 sc->sc_ipopts = ipopts; 3842 } 3843 sc->sc_timestamp = tb.ts_recent; 3844 if (syn_cache_respond(sc, m, now) == 0) { 3845 tcpstat_inc(tcps_sndacks); 3846 tcpstat_inc(tcps_sndtotal); 3847 } 3848 syn_cache_put(sc); 3849 return (0); 3850 } 3851 mtx_leave(&syn_cache_mtx); 3852 3853 sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO); 3854 if (sc == NULL) { 3855 m_free(ipopts); 3856 return (-1); 3857 } 3858 refcnt_init_trace(&sc->sc_refcnt, DT_REFCNT_IDX_SYNCACHE); 3859 timeout_set_flags(&sc->sc_timer, syn_cache_timer, sc, 3860 KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE); 3861 3862 /* 3863 * Fill in the cache, and put the necessary IP and TCP 3864 * options into the reply. 3865 */ 3866 memcpy(&sc->sc_src, src, src->sa_len); 3867 memcpy(&sc->sc_dst, dst, dst->sa_len); 3868 sc->sc_rtableid = sotoinpcb(so)->inp_rtableid; 3869 sc->sc_ipopts = ipopts; 3870 sc->sc_irs = th->th_seq; 3871 3872 sc->sc_iss = issp ? *issp : arc4random(); 3873 sc->sc_peermaxseg = oi->maxseg; 3874 sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family); 3875 sc->sc_win = win; 3876 sc->sc_timestamp = tb.ts_recent; 3877 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) == 3878 (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { 3879 SET(sc->sc_fixflags, SCF_TIMESTAMP); 3880 sc->sc_modulate = arc4random(); 3881 } 3882 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3883 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 3884 sc->sc_requested_s_scale = tb.requested_s_scale; 3885 sc->sc_request_r_scale = 0; 3886 /* 3887 * Pick the smallest possible scaling factor that 3888 * will still allow us to scale up to sb_max. 3889 * 3890 * We do this because there are broken firewalls that 3891 * will corrupt the window scale option, leading to 3892 * the other endpoint believing that our advertised 3893 * window is unscaled. At scale factors larger than 3894 * 5 the unscaled window will drop below 1500 bytes, 3895 * leading to serious problems when traversing these 3896 * broken firewalls. 3897 * 3898 * With the default sbmax of 256K, a scale factor 3899 * of 3 will be chosen by this algorithm. Those who 3900 * choose a larger sbmax should watch out 3901 * for the compatibility problems mentioned above. 3902 * 3903 * RFC1323: The Window field in a SYN (i.e., a <SYN> 3904 * or <SYN,ACK>) segment itself is never scaled. 3905 */ 3906 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT && 3907 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max) 3908 sc->sc_request_r_scale++; 3909 } else { 3910 sc->sc_requested_s_scale = 15; 3911 sc->sc_request_r_scale = 15; 3912 } 3913 #ifdef TCP_ECN 3914 /* 3915 * if both ECE and CWR flag bits are set, peer is ECN capable. 3916 */ 3917 if (tcp_do_ecn && 3918 (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) 3919 SET(sc->sc_fixflags, SCF_ECN_PERMIT); 3920 #endif 3921 /* 3922 * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option 3923 * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT). 3924 */ 3925 if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT)) 3926 SET(sc->sc_fixflags, SCF_SACK_PERMIT); 3927 #ifdef TCP_SIGNATURE 3928 if (tb.t_flags & TF_SIGNATURE) 3929 SET(sc->sc_fixflags, SCF_SIGNATURE); 3930 #endif 3931 sc->sc_tp = tp; 3932 if (syn_cache_respond(sc, m, now) == 0) { 3933 mtx_enter(&syn_cache_mtx); 3934 /* 3935 * XXXSMP Currently exclusive netlock prevents another insert 3936 * after our syn_cache_lookup() and before syn_cache_insert(). 3937 * Double insert should be handled and not rely on netlock. 3938 */ 3939 syn_cache_insert(sc, tp); 3940 mtx_leave(&syn_cache_mtx); 3941 tcpstat_inc(tcps_sndacks); 3942 tcpstat_inc(tcps_sndtotal); 3943 } else { 3944 syn_cache_put(sc); 3945 tcpstat_inc(tcps_sc_dropped); 3946 } 3947 3948 return (0); 3949 } 3950 3951 int 3952 syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint64_t now) 3953 { 3954 u_int8_t *optp; 3955 int optlen, error; 3956 u_int16_t tlen; 3957 struct ip *ip = NULL; 3958 #ifdef INET6 3959 struct ip6_hdr *ip6 = NULL; 3960 #endif 3961 struct tcphdr *th; 3962 u_int hlen; 3963 struct inpcb *inp; 3964 3965 NET_ASSERT_LOCKED(); 3966 3967 switch (sc->sc_src.sa.sa_family) { 3968 case AF_INET: 3969 hlen = sizeof(struct ip); 3970 break; 3971 #ifdef INET6 3972 case AF_INET6: 3973 hlen = sizeof(struct ip6_hdr); 3974 break; 3975 #endif 3976 default: 3977 m_freem(m); 3978 return (EAFNOSUPPORT); 3979 } 3980 3981 /* Compute the size of the TCP options. */ 3982 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) + 3983 (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT) ? 4 : 0) + 3984 #ifdef TCP_SIGNATURE 3985 (ISSET(sc->sc_fixflags, SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) + 3986 #endif 3987 (ISSET(sc->sc_fixflags, SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0); 3988 3989 tlen = hlen + sizeof(struct tcphdr) + optlen; 3990 3991 /* 3992 * Create the IP+TCP header from scratch. 3993 */ 3994 m_freem(m); 3995 #ifdef DIAGNOSTIC 3996 if (max_linkhdr + tlen > MCLBYTES) 3997 return (ENOBUFS); 3998 #endif 3999 MGETHDR(m, M_DONTWAIT, MT_DATA); 4000 if (m && max_linkhdr + tlen > MHLEN) { 4001 MCLGET(m, M_DONTWAIT); 4002 if ((m->m_flags & M_EXT) == 0) { 4003 m_freem(m); 4004 m = NULL; 4005 } 4006 } 4007 if (m == NULL) 4008 return (ENOBUFS); 4009 4010 /* Fixup the mbuf. */ 4011 m->m_data += max_linkhdr; 4012 m->m_len = m->m_pkthdr.len = tlen; 4013 m->m_pkthdr.ph_ifidx = 0; 4014 m->m_pkthdr.ph_rtableid = sc->sc_rtableid; 4015 memset(mtod(m, u_char *), 0, tlen); 4016 4017 switch (sc->sc_src.sa.sa_family) { 4018 case AF_INET: 4019 ip = mtod(m, struct ip *); 4020 ip->ip_dst = sc->sc_src.sin.sin_addr; 4021 ip->ip_src = sc->sc_dst.sin.sin_addr; 4022 ip->ip_p = IPPROTO_TCP; 4023 th = (struct tcphdr *)(ip + 1); 4024 th->th_dport = sc->sc_src.sin.sin_port; 4025 th->th_sport = sc->sc_dst.sin.sin_port; 4026 break; 4027 #ifdef INET6 4028 case AF_INET6: 4029 ip6 = mtod(m, struct ip6_hdr *); 4030 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr; 4031 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr; 4032 ip6->ip6_nxt = IPPROTO_TCP; 4033 th = (struct tcphdr *)(ip6 + 1); 4034 th->th_dport = sc->sc_src.sin6.sin6_port; 4035 th->th_sport = sc->sc_dst.sin6.sin6_port; 4036 break; 4037 #endif 4038 } 4039 4040 th->th_seq = htonl(sc->sc_iss); 4041 th->th_ack = htonl(sc->sc_irs + 1); 4042 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 4043 th->th_flags = TH_SYN|TH_ACK; 4044 #ifdef TCP_ECN 4045 /* Set ECE for SYN-ACK if peer supports ECN. */ 4046 if (tcp_do_ecn && ISSET(sc->sc_fixflags, SCF_ECN_PERMIT)) 4047 th->th_flags |= TH_ECE; 4048 #endif 4049 th->th_win = htons(sc->sc_win); 4050 /* th_sum already 0 */ 4051 /* th_urp already 0 */ 4052 4053 /* Tack on the TCP options. */ 4054 optp = (u_int8_t *)(th + 1); 4055 *optp++ = TCPOPT_MAXSEG; 4056 *optp++ = 4; 4057 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff; 4058 *optp++ = sc->sc_ourmaxseg & 0xff; 4059 4060 /* Include SACK_PERMIT_HDR option if peer has already done so. */ 4061 if (ISSET(sc->sc_fixflags, SCF_SACK_PERMIT)) { 4062 *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR); 4063 optp += 4; 4064 } 4065 4066 if (sc->sc_request_r_scale != 15) { 4067 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 | 4068 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | 4069 sc->sc_request_r_scale); 4070 optp += 4; 4071 } 4072 4073 if (ISSET(sc->sc_fixflags, SCF_TIMESTAMP)) { 4074 u_int32_t *lp = (u_int32_t *)(optp); 4075 /* Form timestamp option as shown in appendix A of RFC 1323. */ 4076 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 4077 *lp++ = htonl(now + sc->sc_modulate); 4078 *lp = htonl(sc->sc_timestamp); 4079 optp += TCPOLEN_TSTAMP_APPA; 4080 } 4081 4082 #ifdef TCP_SIGNATURE 4083 if (ISSET(sc->sc_fixflags, SCF_SIGNATURE)) { 4084 union sockaddr_union src, dst; 4085 struct tdb *tdb; 4086 4087 bzero(&src, sizeof(union sockaddr_union)); 4088 bzero(&dst, sizeof(union sockaddr_union)); 4089 src.sa.sa_len = sc->sc_src.sa.sa_len; 4090 src.sa.sa_family = sc->sc_src.sa.sa_family; 4091 dst.sa.sa_len = sc->sc_dst.sa.sa_len; 4092 dst.sa.sa_family = sc->sc_dst.sa.sa_family; 4093 4094 switch (sc->sc_src.sa.sa_family) { 4095 case 0: /*default to PF_INET*/ 4096 case AF_INET: 4097 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 4098 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 4099 break; 4100 #ifdef INET6 4101 case AF_INET6: 4102 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 4103 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 4104 break; 4105 #endif /* INET6 */ 4106 } 4107 4108 tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid), 4109 0, &src, &dst, IPPROTO_TCP); 4110 if (tdb == NULL) { 4111 m_freem(m); 4112 return (EPERM); 4113 } 4114 4115 /* Send signature option */ 4116 *(optp++) = TCPOPT_SIGNATURE; 4117 *(optp++) = TCPOLEN_SIGNATURE; 4118 4119 if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th, 4120 hlen, 0, optp) < 0) { 4121 m_freem(m); 4122 tdb_unref(tdb); 4123 return (EINVAL); 4124 } 4125 tdb_unref(tdb); 4126 optp += 16; 4127 4128 /* Pad options list to the next 32 bit boundary and 4129 * terminate it. 4130 */ 4131 *optp++ = TCPOPT_NOP; 4132 *optp++ = TCPOPT_EOL; 4133 } 4134 #endif /* TCP_SIGNATURE */ 4135 4136 SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); 4137 4138 /* use IPsec policy and ttl from listening socket, on SYN ACK */ 4139 mtx_enter(&syn_cache_mtx); 4140 inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL; 4141 mtx_leave(&syn_cache_mtx); 4142 4143 /* 4144 * Fill in some straggling IP bits. Note the stack expects 4145 * ip_len to be in host order, for convenience. 4146 */ 4147 switch (sc->sc_src.sa.sa_family) { 4148 case AF_INET: 4149 ip->ip_len = htons(tlen); 4150 ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl; 4151 if (inp != NULL) 4152 ip->ip_tos = inp->inp_ip.ip_tos; 4153 4154 error = ip_output(m, sc->sc_ipopts, &sc->sc_route4, 4155 (ip_mtudisc ? IP_MTUDISC : 0), NULL, 4156 inp ? inp->inp_seclevel : NULL, 0); 4157 break; 4158 #ifdef INET6 4159 case AF_INET6: 4160 ip6->ip6_vfc &= ~IPV6_VERSION_MASK; 4161 ip6->ip6_vfc |= IPV6_VERSION; 4162 /* ip6_plen will be updated in ip6_output() */ 4163 ip6->ip6_hlim = in6_selecthlim(inp); 4164 /* leave flowlabel = 0, it is legal and require no state mgmt */ 4165 4166 error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0, 4167 NULL, inp ? inp->inp_seclevel : NULL); 4168 break; 4169 #endif 4170 } 4171 return (error); 4172 } 4173