1 /*- 2 * Copyright (c) 2016-2018 3 * Netflix Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 */ 28 /* 29 * Author: Randall Stewart <rrs@netflix.com> 30 * This work is based on the ACM Queue paper 31 * BBR - Congestion Based Congestion Control 32 * and also numerous discussions with Neal, Yuchung and Van. 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_inet.h" 39 #include "opt_inet6.h" 40 #include "opt_ipsec.h" 41 #include "opt_tcpdebug.h" 42 #include "opt_ratelimit.h" 43 /*#include "opt_kern_tls.h"*/ 44 #include <sys/param.h> 45 #include <sys/module.h> 46 #include <sys/kernel.h> 47 #ifdef TCP_HHOOK 48 #include <sys/hhook.h> 49 #endif 50 #include <sys/malloc.h> 51 #include <sys/mbuf.h> 52 #include <sys/proc.h> 53 #include <sys/socket.h> 54 #include <sys/socketvar.h> 55 #ifdef KERN_TLS 56 #include <sys/sockbuf_tls.h> 57 #endif 58 #include <sys/sysctl.h> 59 #include <sys/systm.h> 60 #include <sys/tree.h> 61 #include <sys/refcount.h> 62 #include <sys/queue.h> 63 #include <sys/smp.h> 64 #include <sys/kthread.h> 65 #include <sys/lock.h> 66 #include <sys/mutex.h> 67 #include <sys/time.h> 68 #include <vm/uma.h> 69 #include <sys/kern_prefetch.h> 70 71 #include <net/route.h> 72 #include <net/vnet.h> 73 #include <net/ethernet.h> 74 #include <net/bpf.h> 75 76 #define TCPSTATES /* for logging */ 77 78 #include <netinet/in.h> 79 #include <netinet/in_kdtrace.h> 80 #include <netinet/in_pcb.h> 81 #include <netinet/ip.h> 82 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 83 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 84 #include <netinet/ip_var.h> 85 #include <netinet/ip6.h> 86 #include <netinet6/in6_pcb.h> 87 #include <netinet6/ip6_var.h> 88 #define TCPOUTFLAGS 89 #include <netinet/tcp.h> 90 #include <netinet/tcp_fsm.h> 91 #include <netinet/tcp_seq.h> 92 #include <netinet/tcp_timer.h> 93 #include <netinet/tcp_var.h> 94 #include <netinet/tcpip.h> 95 #include <netinet/tcp_hpts.h> 96 #include <netinet/cc/cc.h> 97 #include <netinet/tcp_log_buf.h> 98 #ifdef TCPDEBUG 99 #include <netinet/tcp_debug.h> 100 #endif /* TCPDEBUG */ 101 #ifdef TCP_OFFLOAD 102 #include <netinet/tcp_offload.h> 103 #endif 104 #ifdef INET6 105 #include <netinet6/tcp6_var.h> 106 #endif 107 #include <netinet/tcp_fastopen.h> 108 109 #include <netipsec/ipsec_support.h> 110 #include <net/if.h> 111 #include <net/if_var.h> 112 113 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 114 #include <netipsec/ipsec.h> 115 #include <netipsec/ipsec6.h> 116 #endif /* IPSEC */ 117 118 #include <netinet/udp.h> 119 #include <netinet/udp_var.h> 120 #include <machine/in_cksum.h> 121 122 #ifdef MAC 123 #include <security/mac/mac_framework.h> 124 #endif 125 #include "rack_bbr_common.h" 126 127 /* 128 * Common TCP Functions - These are shared by borth 129 * rack and BBR. 130 */ 131 132 133 #ifdef KERN_TLS 134 uint32_t 135 ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) 136 { 137 struct sbtls_info *tls; 138 uint32_t len; 139 140 again: 141 tls = so->so_snd.sb_tls_info; 142 len = tls->sb_params.sb_maxlen; /* max tls payload */ 143 len += tls->sb_params.sb_tls_hlen; /* tls header len */ 144 len += tls->sb_params.sb_tls_tlen; /* tls trailer len */ 145 if ((len * 4) > rwnd) { 146 /* 147 * Stroke this will suck counter and what 148 * else should we do Drew? From the 149 * TCP perspective I am not sure 150 * what should be done... 151 */ 152 if (tls->sb_params.sb_maxlen > 4096) { 153 tls->sb_params.sb_maxlen -= 4096; 154 if (tls->sb_params.sb_maxlen < 4096) 155 tls->sb_params.sb_maxlen = 4096; 156 goto again; 157 } 158 } 159 return (len); 160 } 161 #endif 162 163 int 164 ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt) 165 { 166 /* 167 * We are passed a raw change of mbuf packets 168 * that arrived in LRO. They are linked via 169 * the m_nextpkt link in the pkt-headers. 170 * 171 * We process each one by: 172 * a) saving off the next 173 * b) stripping off the ether-header 174 * c) formulating the arguments for 175 * the tfb_tcp_hpts_do_segment 176 * d) calling each mbuf to tfb_tcp_hpts_do_segment 177 * after adjusting the time to match the arrival time. 178 * Note that the LRO code assures no IP options are present. 179 * 180 * The symantics for calling tfb_tcp_hpts_do_segment are the 181 * following: 182 * 1) It returns 0 if all went well and you (the caller) need 183 * to release the lock. 184 * 2) If nxt_pkt is set, then the function will surpress calls 185 * to tfb_tcp_output() since you are promising to call again 186 * with another packet. 187 * 3) If it returns 1, then you must free all the packets being 188 * shipped in, the tcb has been destroyed (or about to be destroyed). 189 */ 190 struct mbuf *m_save; 191 struct ether_header *eh; 192 struct epoch_tracker et; 193 struct tcphdr *th; 194 #ifdef INET6 195 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ 196 #endif 197 #ifdef INET 198 struct ip *ip = NULL; /* Keep compiler happy. */ 199 #endif 200 struct ifnet *ifp; 201 struct timeval tv; 202 int32_t retval, nxt_pkt, tlen, off; 203 uint16_t etype; 204 uint16_t drop_hdrlen; 205 uint8_t iptos, no_vn=0, bpf_req=0; 206 207 /* 208 * This is a bit deceptive, we get the 209 * "info epoch" which is really the network 210 * epoch. This covers us on both any INP 211 * type change but also if the ifp goes 212 * away it covers us as well. 213 */ 214 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 215 if (m && m->m_pkthdr.rcvif) 216 ifp = m->m_pkthdr.rcvif; 217 else 218 ifp = NULL; 219 if (ifp) { 220 bpf_req = bpf_peers_present(ifp->if_bpf); 221 } else { 222 /* 223 * We probably should not work around 224 * but kassert, since lro alwasy sets rcvif. 225 */ 226 no_vn = 1; 227 goto skip_vnet; 228 } 229 CURVNET_SET(ifp->if_vnet); 230 skip_vnet: 231 while (m) { 232 m_save = m->m_nextpkt; 233 m->m_nextpkt = NULL; 234 /* Now lets get the ether header */ 235 eh = mtod(m, struct ether_header *); 236 etype = ntohs(eh->ether_type); 237 /* Let the BPF see the packet */ 238 if (bpf_req && ifp) 239 ETHER_BPF_MTAP(ifp, m); 240 m_adj(m, sizeof(*eh)); 241 /* Trim off the ethernet header */ 242 switch (etype) { 243 #ifdef INET6 244 case ETHERTYPE_IPV6: 245 { 246 if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { 247 m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); 248 if (m == NULL) { 249 TCPSTAT_INC(tcps_rcvshort); 250 m_freem(m); 251 goto skipped_pkt; 252 } 253 } 254 ip6 = (struct ip6_hdr *)(eh + 1); 255 th = (struct tcphdr *)(ip6 + 1); 256 tlen = ntohs(ip6->ip6_plen); 257 drop_hdrlen = sizeof(*ip6); 258 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { 259 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 260 th->th_sum = m->m_pkthdr.csum_data; 261 else 262 th->th_sum = in6_cksum_pseudo(ip6, tlen, 263 IPPROTO_TCP, m->m_pkthdr.csum_data); 264 th->th_sum ^= 0xffff; 265 } else 266 th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); 267 if (th->th_sum) { 268 TCPSTAT_INC(tcps_rcvbadsum); 269 m_freem(m); 270 goto skipped_pkt; 271 } 272 /* 273 * Be proactive about unspecified IPv6 address in source. 274 * As we use all-zero to indicate unbounded/unconnected pcb, 275 * unspecified IPv6 address can be used to confuse us. 276 * 277 * Note that packets with unspecified IPv6 destination is 278 * already dropped in ip6_input. 279 */ 280 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 281 /* XXX stat */ 282 m_freem(m); 283 goto skipped_pkt; 284 } 285 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 286 break; 287 } 288 #endif 289 #ifdef INET 290 case ETHERTYPE_IP: 291 { 292 if (m->m_len < sizeof (struct tcpiphdr)) { 293 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 294 == NULL) { 295 TCPSTAT_INC(tcps_rcvshort); 296 m_freem(m); 297 goto skipped_pkt; 298 } 299 } 300 ip = (struct ip *)(eh + 1); 301 th = (struct tcphdr *)(ip + 1); 302 drop_hdrlen = sizeof(*ip); 303 iptos = ip->ip_tos; 304 tlen = ntohs(ip->ip_len) - sizeof(struct ip); 305 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 306 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 307 th->th_sum = m->m_pkthdr.csum_data; 308 else 309 th->th_sum = in_pseudo(ip->ip_src.s_addr, 310 ip->ip_dst.s_addr, 311 htonl(m->m_pkthdr.csum_data + tlen + 312 IPPROTO_TCP)); 313 th->th_sum ^= 0xffff; 314 } else { 315 int len; 316 struct ipovly *ipov = (struct ipovly *)ip; 317 /* 318 * Checksum extended TCP header and data. 319 */ 320 len = drop_hdrlen + tlen; 321 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 322 ipov->ih_len = htons(tlen); 323 th->th_sum = in_cksum(m, len); 324 /* Reset length for SDT probes. */ 325 ip->ip_len = htons(len); 326 /* Reset TOS bits */ 327 ip->ip_tos = iptos; 328 /* Re-initialization for later version check */ 329 ip->ip_v = IPVERSION; 330 ip->ip_hl = sizeof(*ip) >> 2; 331 } 332 if (th->th_sum) { 333 TCPSTAT_INC(tcps_rcvbadsum); 334 m_freem(m); 335 goto skipped_pkt; 336 } 337 break; 338 } 339 #endif 340 } 341 /* 342 * Convert TCP protocol specific fields to host format. 343 */ 344 tcp_fields_to_host(th); 345 346 off = th->th_off << 2; 347 if (off < sizeof (struct tcphdr) || off > tlen) { 348 TCPSTAT_INC(tcps_rcvbadoff); 349 m_freem(m); 350 goto skipped_pkt; 351 } 352 tlen -= off; 353 drop_hdrlen += off; 354 /* 355 * Now lets setup the timeval to be when we should 356 * have been called (if we can). 357 */ 358 m->m_pkthdr.lro_nsegs = 1; 359 if (m->m_flags & M_TSTMP_LRO) { 360 tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; 361 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; 362 } else { 363 /* Should not be should we kassert instead? */ 364 tcp_get_usecs(&tv); 365 } 366 /* Now what about next packet? */ 367 if (m_save || has_pkt) 368 nxt_pkt = 1; 369 else 370 nxt_pkt = 0; 371 retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, 372 iptos, nxt_pkt, &tv); 373 if (retval) { 374 /* We lost the lock and tcb probably */ 375 m = m_save; 376 while (m) { 377 m_save = m->m_nextpkt; 378 m->m_nextpkt = NULL; 379 m_freem(m); 380 m = m_save; 381 } 382 if (no_vn == 0) 383 CURVNET_RESTORE(); 384 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 385 return (retval); 386 } 387 skipped_pkt: 388 m = m_save; 389 } 390 if (no_vn == 0) 391 CURVNET_RESTORE(); 392 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 393 return (retval); 394 } 395 396 int 397 ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) 398 { 399 struct mbuf *m; 400 401 /* First lets see if we have old packets */ 402 if (tp->t_in_pkt) { 403 m = tp->t_in_pkt; 404 tp->t_in_pkt = NULL; 405 tp->t_tail_pkt = NULL; 406 if (ctf_process_inbound_raw(tp, so, m, have_pkt)) { 407 /* We lost the tcpcb (maybe a RST came in)? */ 408 return (1); 409 } 410 } 411 return (0); 412 } 413 414 uint32_t 415 ctf_outstanding(struct tcpcb *tp) 416 { 417 return (tp->snd_max - tp->snd_una); 418 } 419 420 uint32_t 421 ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) 422 { 423 if (rc_sacked <= ctf_outstanding(tp)) 424 return (ctf_outstanding(tp) - rc_sacked); 425 else { 426 /* TSNH */ 427 #ifdef INVARIANTS 428 panic("tp:%p rc_sacked:%d > out:%d", 429 tp, rc_sacked, ctf_outstanding(tp)); 430 #endif 431 return (0); 432 } 433 } 434 435 void 436 ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, 437 int32_t rstreason, int32_t tlen) 438 { 439 if (tp != NULL) { 440 tcp_dropwithreset(m, th, tp, tlen, rstreason); 441 INP_WUNLOCK(tp->t_inpcb); 442 } else 443 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 444 } 445 446 /* 447 * ctf_drop_checks returns 1 for you should not proceed. It places 448 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 449 * that the TCB is unlocked and probably dropped. The 0 indicates the 450 * TCB is still valid and locked. 451 */ 452 int 453 ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 454 { 455 int32_t todrop; 456 int32_t thflags; 457 int32_t tlen; 458 459 thflags = *thf; 460 tlen = *tlenp; 461 todrop = tp->rcv_nxt - th->th_seq; 462 if (todrop > 0) { 463 if (thflags & TH_SYN) { 464 thflags &= ~TH_SYN; 465 th->th_seq++; 466 if (th->th_urp > 1) 467 th->th_urp--; 468 else 469 thflags &= ~TH_URG; 470 todrop--; 471 } 472 /* 473 * Following if statement from Stevens, vol. 2, p. 960. 474 */ 475 if (todrop > tlen 476 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 477 /* 478 * Any valid FIN must be to the left of the window. 479 * At this point the FIN must be a duplicate or out 480 * of sequence; drop it. 481 */ 482 thflags &= ~TH_FIN; 483 /* 484 * Send an ACK to resynchronize and drop any data. 485 * But keep on processing for RST or ACK. 486 */ 487 tp->t_flags |= TF_ACKNOW; 488 todrop = tlen; 489 TCPSTAT_INC(tcps_rcvduppack); 490 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 491 } else { 492 TCPSTAT_INC(tcps_rcvpartduppack); 493 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 494 } 495 /* 496 * DSACK - add SACK block for dropped range 497 */ 498 if (tp->t_flags & TF_SACK_PERMIT) { 499 tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); 500 /* 501 * ACK now, as the next in-sequence segment 502 * will clear the DSACK block again 503 */ 504 tp->t_flags |= TF_ACKNOW; 505 } 506 *drop_hdrlen += todrop; /* drop from the top afterwards */ 507 th->th_seq += todrop; 508 tlen -= todrop; 509 if (th->th_urp > todrop) 510 th->th_urp -= todrop; 511 else { 512 thflags &= ~TH_URG; 513 th->th_urp = 0; 514 } 515 } 516 /* 517 * If segment ends after window, drop trailing data (and PUSH and 518 * FIN); if nothing left, just ACK. 519 */ 520 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 521 if (todrop > 0) { 522 TCPSTAT_INC(tcps_rcvpackafterwin); 523 if (todrop >= tlen) { 524 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 525 /* 526 * If window is closed can only take segments at 527 * window edge, and have to drop data and PUSH from 528 * incoming segments. Continue processing, but 529 * remember to ack. Otherwise, drop segment and 530 * ack. 531 */ 532 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 533 tp->t_flags |= TF_ACKNOW; 534 TCPSTAT_INC(tcps_rcvwinprobe); 535 } else { 536 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 537 return (1); 538 } 539 } else 540 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 541 m_adj(m, -todrop); 542 tlen -= todrop; 543 thflags &= ~(TH_PUSH | TH_FIN); 544 } 545 *thf = thflags; 546 *tlenp = tlen; 547 return (0); 548 } 549 550 /* 551 * The value in ret_val informs the caller 552 * if we dropped the tcb (and lock) or not. 553 * 1 = we dropped it, 0 = the TCB is still locked 554 * and valid. 555 */ 556 void 557 ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) 558 { 559 /* 560 * Generate an ACK dropping incoming segment if it occupies sequence 561 * space, where the ACK reflects our state. 562 * 563 * We can now skip the test for the RST flag since all paths to this 564 * code happen after packets containing RST have been dropped. 565 * 566 * In the SYN-RECEIVED state, don't send an ACK unless the segment 567 * we received passes the SYN-RECEIVED ACK test. If it fails send a 568 * RST. This breaks the loop in the "LAND" DoS attack, and also 569 * prevents an ACK storm between two listening ports that have been 570 * sent forged SYN segments, each with the source address of the 571 * other. 572 */ 573 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 574 (SEQ_GT(tp->snd_una, th->th_ack) || 575 SEQ_GT(th->th_ack, tp->snd_max))) { 576 *ret_val = 1; 577 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 578 return; 579 } else 580 *ret_val = 0; 581 tp->t_flags |= TF_ACKNOW; 582 if (m) 583 m_freem(m); 584 } 585 586 void 587 ctf_do_drop(struct mbuf *m, struct tcpcb *tp) 588 { 589 590 /* 591 * Drop space held by incoming segment and return. 592 */ 593 if (tp != NULL) 594 INP_WUNLOCK(tp->t_inpcb); 595 if (m) 596 m_freem(m); 597 } 598 599 int 600 ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) 601 { 602 /* 603 * RFC5961 Section 3.2 604 * 605 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in 606 * window, we send challenge ACK. 607 * 608 * Note: to take into account delayed ACKs, we should test against 609 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case 610 * of closed window, not covered by the RFC. 611 */ 612 int dropped = 0; 613 614 if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && 615 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 616 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 617 618 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 619 KASSERT(tp->t_state != TCPS_SYN_SENT, 620 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 621 __func__, th, tp)); 622 623 if (V_tcp_insecure_rst || 624 (tp->last_ack_sent == th->th_seq) || 625 (tp->rcv_nxt == th->th_seq) || 626 ((tp->last_ack_sent - 1) == th->th_seq)) { 627 TCPSTAT_INC(tcps_drops); 628 /* Drop the connection. */ 629 switch (tp->t_state) { 630 case TCPS_SYN_RECEIVED: 631 so->so_error = ECONNREFUSED; 632 goto close; 633 case TCPS_ESTABLISHED: 634 case TCPS_FIN_WAIT_1: 635 case TCPS_FIN_WAIT_2: 636 case TCPS_CLOSE_WAIT: 637 case TCPS_CLOSING: 638 case TCPS_LAST_ACK: 639 so->so_error = ECONNRESET; 640 close: 641 tcp_state_change(tp, TCPS_CLOSED); 642 /* FALLTHROUGH */ 643 default: 644 tp = tcp_close(tp); 645 } 646 dropped = 1; 647 ctf_do_drop(m, tp); 648 } else { 649 TCPSTAT_INC(tcps_badrst); 650 /* Send challenge ACK. */ 651 tcp_respond(tp, mtod(m, void *), th, m, 652 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 653 tp->last_ack_sent = tp->rcv_nxt; 654 } 655 } else { 656 m_freem(m); 657 } 658 return (dropped); 659 } 660 661 /* 662 * The value in ret_val informs the caller 663 * if we dropped the tcb (and lock) or not. 664 * 1 = we dropped it, 0 = the TCB is still locked 665 * and valid. 666 */ 667 void 668 ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) 669 { 670 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 671 672 TCPSTAT_INC(tcps_badsyn); 673 if (V_tcp_insecure_syn && 674 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 675 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 676 tp = tcp_drop(tp, ECONNRESET); 677 *ret_val = 1; 678 ctf_do_drop(m, tp); 679 } else { 680 /* Send challenge ACK. */ 681 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 682 tp->snd_nxt, TH_ACK); 683 tp->last_ack_sent = tp->rcv_nxt; 684 m = NULL; 685 *ret_val = 0; 686 ctf_do_drop(m, NULL); 687 } 688 } 689 690 /* 691 * bbr_ts_check returns 1 for you should not proceed, the state 692 * machine should return. It places in ret_val what should 693 * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates 694 * that the TCB is unlocked and probably dropped. The 0 indicates the 695 * TCB is still valid and locked. 696 */ 697 int 698 ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, 699 int32_t tlen, int32_t thflags, int32_t * ret_val) 700 { 701 702 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 703 /* 704 * Invalidate ts_recent. If this segment updates ts_recent, 705 * the age will be reset later and ts_recent will get a 706 * valid value. If it does not, setting ts_recent to zero 707 * will at least satisfy the requirement that zero be placed 708 * in the timestamp echo reply when ts_recent isn't valid. 709 * The age isn't reset until we get a valid ts_recent 710 * because we don't want out-of-order segments to be dropped 711 * when ts_recent is old. 712 */ 713 tp->ts_recent = 0; 714 } else { 715 TCPSTAT_INC(tcps_rcvduppack); 716 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 717 TCPSTAT_INC(tcps_pawsdrop); 718 *ret_val = 0; 719 if (tlen) { 720 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 721 } else { 722 ctf_do_drop(m, NULL); 723 } 724 return (1); 725 } 726 return (0); 727 } 728 729 void 730 ctf_calc_rwin(struct socket *so, struct tcpcb *tp) 731 { 732 int32_t win; 733 734 /* 735 * Calculate amount of space in receive window, and then do TCP 736 * input processing. Receive window is amount of space in rcv queue, 737 * but not less than advertised window. 738 */ 739 win = sbspace(&so->so_rcv); 740 if (win < 0) 741 win = 0; 742 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 743 } 744 745 void 746 ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, 747 int32_t rstreason, int32_t tlen) 748 { 749 750 if (tp->t_inpcb) { 751 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 752 } 753 tcp_dropwithreset(m, th, tp, tlen, rstreason); 754 INP_WUNLOCK(tp->t_inpcb); 755 } 756 757 uint32_t 758 ctf_fixed_maxseg(struct tcpcb *tp) 759 { 760 int optlen; 761 762 if (tp->t_flags & TF_NOOPT) 763 return (tp->t_maxseg); 764 765 /* 766 * Here we have a simplified code from tcp_addoptions(), 767 * without a proper loop, and having most of paddings hardcoded. 768 * We only consider fixed options that we would send every 769 * time I.e. SACK is not considered. 770 * 771 */ 772 #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) 773 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 774 if (tp->t_flags & TF_RCVD_TSTMP) 775 optlen = TCPOLEN_TSTAMP_APPA; 776 else 777 optlen = 0; 778 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 779 if (tp->t_flags & TF_SIGNATURE) 780 optlen += PAD(TCPOLEN_SIGNATURE); 781 #endif 782 } else { 783 if (tp->t_flags & TF_REQ_TSTMP) 784 optlen = TCPOLEN_TSTAMP_APPA; 785 else 786 optlen = PAD(TCPOLEN_MAXSEG); 787 if (tp->t_flags & TF_REQ_SCALE) 788 optlen += PAD(TCPOLEN_WINDOW); 789 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 790 if (tp->t_flags & TF_SIGNATURE) 791 optlen += PAD(TCPOLEN_SIGNATURE); 792 #endif 793 if (tp->t_flags & TF_SACK_PERMIT) 794 optlen += PAD(TCPOLEN_SACK_PERMITTED); 795 } 796 #undef PAD 797 optlen = min(optlen, TCP_MAXOLEN); 798 return (tp->t_maxseg - optlen); 799 } 800 801 void 802 ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks) 803 { 804 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 805 union tcp_log_stackspecific log; 806 struct timeval tv; 807 808 memset(&log, 0, sizeof(log)); 809 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 810 log.u_bbr.flex8 = num_sack_blks; 811 if (num_sack_blks > 0) { 812 log.u_bbr.flex1 = sack_blocks[0].start; 813 log.u_bbr.flex2 = sack_blocks[0].end; 814 } 815 if (num_sack_blks > 1) { 816 log.u_bbr.flex3 = sack_blocks[1].start; 817 log.u_bbr.flex4 = sack_blocks[1].end; 818 } 819 if (num_sack_blks > 2) { 820 log.u_bbr.flex5 = sack_blocks[2].start; 821 log.u_bbr.flex6 = sack_blocks[2].end; 822 } 823 if (num_sack_blks > 3) { 824 log.u_bbr.applimited = sack_blocks[3].start; 825 log.u_bbr.pkts_out = sack_blocks[3].end; 826 } 827 TCP_LOG_EVENTP(tp, NULL, 828 &tp->t_inpcb->inp_socket->so_rcv, 829 &tp->t_inpcb->inp_socket->so_snd, 830 TCP_SACK_FILTER_RES, 0, 831 0, &log, false, &tv); 832 } 833 } 834 835 uint32_t 836 ctf_decay_count(uint32_t count, uint32_t decay) 837 { 838 /* 839 * Given a count, decay it by a set percentage. The 840 * percentage is in thousands i.e. 100% = 1000, 841 * 19.3% = 193. 842 */ 843 uint64_t perc_count, decay_per; 844 uint32_t decayed_count; 845 if (decay > 1000) { 846 /* We don't raise it */ 847 return (count); 848 } 849 perc_count = count; 850 decay_per = decay; 851 perc_count *= decay_per; 852 perc_count /= 1000; 853 /* 854 * So now perc_count holds the 855 * count decay value. 856 */ 857 decayed_count = count - (uint32_t)perc_count; 858 return (decayed_count); 859 } 860