1 /* $OpenBSD: tcp_usrreq.c,v 1.180 2021/03/10 10:21:49 jsg Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 static int pr_slowhz = PR_SLOWHZ; 114 const struct sysctl_bounded_args tcpctl_vars[] = { 115 { TCPCTL_SLOWHZ, &pr_slowhz, 1, 0 }, 116 { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 }, 117 { TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT }, 118 { TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE }, 119 { TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL }, 120 { TCPCTL_SACK, &tcp_do_sack, 0, 1 }, 121 { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 }, 122 { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 }, 123 { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 }, 124 #ifdef TCP_ECN 125 { TCPCTL_ECN, &tcp_do_ecn, 0, 1 }, 126 #endif 127 { TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 }, 128 { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, 129 { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, 130 { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, 131 }; 132 133 struct inpcbtable tcbtable; 134 135 int tcp_ident(void *, size_t *, void *, size_t, int); 136 137 /* 138 * Process a TCP user request for TCP tb. If this is a send request 139 * then m is the mbuf chain of send data. If this is a timer expiration 140 * (called from the software clock routine), then timertype tells which timer. 141 */ 142 /*ARGSUSED*/ 143 int 144 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 145 struct mbuf *control, struct proc *p) 146 { 147 struct inpcb *inp; 148 struct tcpcb *otp = NULL, *tp = NULL; 149 int error = 0; 150 short ostate; 151 152 if (req == PRU_CONTROL) { 153 #ifdef INET6 154 if (sotopf(so) == PF_INET6) 155 return in6_control(so, (u_long)m, (caddr_t)nam, 156 (struct ifnet *)control); 157 else 158 #endif /* INET6 */ 159 return (in_control(so, (u_long)m, (caddr_t)nam, 160 (struct ifnet *)control)); 161 } 162 163 soassertlocked(so); 164 165 if (control && control->m_len) { 166 error = EINVAL; 167 goto release; 168 } 169 170 inp = sotoinpcb(so); 171 /* 172 * When a TCP is attached to a socket, then there will be 173 * a (struct inpcb) pointed at by the socket, and this 174 * structure will point at a subsidiary (struct tcpcb). 175 */ 176 if (inp == NULL) { 177 error = so->so_error; 178 if (error == 0) 179 error = EINVAL; 180 goto release; 181 } 182 tp = intotcpcb(inp); 183 /* tp might get 0 when using socket splicing */ 184 if (tp == NULL) 185 goto release; 186 if (so->so_options & SO_DEBUG) { 187 otp = tp; 188 ostate = tp->t_state; 189 } 190 191 switch (req) { 192 193 /* 194 * Give the socket an address. 195 */ 196 case PRU_BIND: 197 error = in_pcbbind(inp, nam, p); 198 break; 199 200 /* 201 * Prepare to accept connections. 202 */ 203 case PRU_LISTEN: 204 if (inp->inp_lport == 0) 205 error = in_pcbbind(inp, NULL, p); 206 /* If the in_pcbbind() above is called, the tp->pf 207 should still be whatever it was before. */ 208 if (error == 0) 209 tp->t_state = TCPS_LISTEN; 210 break; 211 212 /* 213 * Initiate connection to peer. 214 * Create a template for use in transmissions on this connection. 215 * Enter SYN_SENT state, and mark socket as connecting. 216 * Start keep-alive timer, and seed output sequence space. 217 * Send initial segment on connection. 218 */ 219 case PRU_CONNECT: 220 #ifdef INET6 221 if (inp->inp_flags & INP_IPV6) { 222 struct sockaddr_in6 *sin6; 223 224 if ((error = in6_nam2sin6(nam, &sin6))) 225 break; 226 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 227 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 228 error = EINVAL; 229 break; 230 } 231 error = in6_pcbconnect(inp, nam); 232 } else 233 #endif /* INET6 */ 234 { 235 struct sockaddr_in *sin; 236 237 if ((error = in_nam2sin(nam, &sin))) 238 break; 239 if ((sin->sin_addr.s_addr == INADDR_ANY) || 240 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 241 IN_MULTICAST(sin->sin_addr.s_addr) || 242 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 243 error = EINVAL; 244 break; 245 } 246 error = in_pcbconnect(inp, nam); 247 } 248 if (error) 249 break; 250 251 tp->t_template = tcp_template(tp); 252 if (tp->t_template == 0) { 253 in_pcbdisconnect(inp); 254 error = ENOBUFS; 255 break; 256 } 257 258 so->so_state |= SS_CONNECTOUT; 259 260 /* Compute window scaling to request. */ 261 tcp_rscale(tp, sb_max); 262 263 soisconnecting(so); 264 tcpstat_inc(tcps_connattempt); 265 tp->t_state = TCPS_SYN_SENT; 266 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 267 tcp_set_iss_tsm(tp); 268 tcp_sendseqinit(tp); 269 tp->snd_last = tp->snd_una; 270 error = tcp_output(tp); 271 break; 272 273 /* 274 * Create a TCP connection between two sockets. 275 */ 276 case PRU_CONNECT2: 277 error = EOPNOTSUPP; 278 break; 279 280 /* 281 * Initiate disconnect from peer. 282 * If connection never passed embryonic stage, just drop; 283 * else if don't need to let data drain, then can just drop anyways, 284 * else have to begin TCP shutdown process: mark socket disconnecting, 285 * drain unread data, state switch to reflect user close, and 286 * send segment (e.g. FIN) to peer. Socket will be really disconnected 287 * when peer sends FIN and acks ours. 288 * 289 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 290 */ 291 case PRU_DISCONNECT: 292 tp = tcp_disconnect(tp); 293 break; 294 295 /* 296 * Accept a connection. Essentially all the work is 297 * done at higher levels; just return the address 298 * of the peer, storing through addr. 299 */ 300 case PRU_ACCEPT: 301 #ifdef INET6 302 if (inp->inp_flags & INP_IPV6) 303 in6_setpeeraddr(inp, nam); 304 else 305 #endif 306 in_setpeeraddr(inp, nam); 307 break; 308 309 /* 310 * Mark the connection as being incapable of further output. 311 */ 312 case PRU_SHUTDOWN: 313 if (so->so_state & SS_CANTSENDMORE) 314 break; 315 socantsendmore(so); 316 tp = tcp_usrclosed(tp); 317 if (tp) 318 error = tcp_output(tp); 319 break; 320 321 /* 322 * After a receive, possibly send window update to peer. 323 */ 324 case PRU_RCVD: 325 /* 326 * soreceive() calls this function when a user receives 327 * ancillary data on a listening socket. We don't call 328 * tcp_output in such a case, since there is no header 329 * template for a listening socket and hence the kernel 330 * will panic. 331 */ 332 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 333 (void) tcp_output(tp); 334 break; 335 336 /* 337 * Do a send by putting data in output queue and updating urgent 338 * marker if URG set. Possibly send more data. 339 */ 340 case PRU_SEND: 341 sbappendstream(so, &so->so_snd, m); 342 error = tcp_output(tp); 343 break; 344 345 /* 346 * Abort the TCP. 347 */ 348 case PRU_ABORT: 349 tp = tcp_drop(tp, ECONNABORTED); 350 break; 351 352 case PRU_SENSE: 353 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 354 break; 355 356 case PRU_RCVOOB: 357 if ((so->so_oobmark == 0 && 358 (so->so_state & SS_RCVATMARK) == 0) || 359 so->so_options & SO_OOBINLINE || 360 tp->t_oobflags & TCPOOB_HADDATA) { 361 error = EINVAL; 362 break; 363 } 364 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 365 error = EWOULDBLOCK; 366 break; 367 } 368 m->m_len = 1; 369 *mtod(m, caddr_t) = tp->t_iobc; 370 if (((long)nam & MSG_PEEK) == 0) 371 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 372 break; 373 374 case PRU_SENDOOB: 375 if (sbspace(so, &so->so_snd) < -512) { 376 m_freem(m); 377 error = ENOBUFS; 378 break; 379 } 380 /* 381 * According to RFC961 (Assigned Protocols), 382 * the urgent pointer points to the last octet 383 * of urgent data. We continue, however, 384 * to consider it to indicate the first octet 385 * of data past the urgent section. 386 * Otherwise, snd_up should be one lower. 387 */ 388 sbappendstream(so, &so->so_snd, m); 389 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 390 tp->t_force = 1; 391 error = tcp_output(tp); 392 tp->t_force = 0; 393 break; 394 395 case PRU_SOCKADDR: 396 #ifdef INET6 397 if (inp->inp_flags & INP_IPV6) 398 in6_setsockaddr(inp, nam); 399 else 400 #endif 401 in_setsockaddr(inp, nam); 402 break; 403 404 case PRU_PEERADDR: 405 #ifdef INET6 406 if (inp->inp_flags & INP_IPV6) 407 in6_setpeeraddr(inp, nam); 408 else 409 #endif 410 in_setpeeraddr(inp, nam); 411 break; 412 413 default: 414 panic("tcp_usrreq"); 415 } 416 if (otp) 417 tcp_trace(TA_USER, ostate, tp, otp, NULL, req, 0); 418 return (error); 419 420 release: 421 if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) { 422 m_freem(control); 423 m_freem(m); 424 } 425 return (error); 426 } 427 428 int 429 tcp_ctloutput(int op, struct socket *so, int level, int optname, 430 struct mbuf *m) 431 { 432 int error = 0; 433 struct inpcb *inp; 434 struct tcpcb *tp; 435 int i; 436 437 inp = sotoinpcb(so); 438 if (inp == NULL) 439 return (ECONNRESET); 440 if (level != IPPROTO_TCP) { 441 switch (so->so_proto->pr_domain->dom_family) { 442 #ifdef INET6 443 case PF_INET6: 444 error = ip6_ctloutput(op, so, level, optname, m); 445 break; 446 #endif /* INET6 */ 447 case PF_INET: 448 error = ip_ctloutput(op, so, level, optname, m); 449 break; 450 default: 451 error = EAFNOSUPPORT; /*?*/ 452 break; 453 } 454 return (error); 455 } 456 tp = intotcpcb(inp); 457 458 switch (op) { 459 460 case PRCO_SETOPT: 461 switch (optname) { 462 463 case TCP_NODELAY: 464 if (m == NULL || m->m_len < sizeof (int)) 465 error = EINVAL; 466 else if (*mtod(m, int *)) 467 tp->t_flags |= TF_NODELAY; 468 else 469 tp->t_flags &= ~TF_NODELAY; 470 break; 471 472 case TCP_NOPUSH: 473 if (m == NULL || m->m_len < sizeof (int)) 474 error = EINVAL; 475 else if (*mtod(m, int *)) 476 tp->t_flags |= TF_NOPUSH; 477 else if (tp->t_flags & TF_NOPUSH) { 478 tp->t_flags &= ~TF_NOPUSH; 479 if (TCPS_HAVEESTABLISHED(tp->t_state)) 480 error = tcp_output(tp); 481 } 482 break; 483 484 case TCP_MAXSEG: 485 if (m == NULL || m->m_len < sizeof (int)) { 486 error = EINVAL; 487 break; 488 } 489 490 i = *mtod(m, int *); 491 if (i > 0 && i <= tp->t_maxseg) 492 tp->t_maxseg = i; 493 else 494 error = EINVAL; 495 break; 496 497 case TCP_SACK_ENABLE: 498 if (m == NULL || m->m_len < sizeof (int)) { 499 error = EINVAL; 500 break; 501 } 502 503 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 504 error = EPERM; 505 break; 506 } 507 508 if (tp->t_flags & TF_SIGNATURE) { 509 error = EPERM; 510 break; 511 } 512 513 if (*mtod(m, int *)) 514 tp->sack_enable = 1; 515 else 516 tp->sack_enable = 0; 517 break; 518 #ifdef TCP_SIGNATURE 519 case TCP_MD5SIG: 520 if (m == NULL || m->m_len < sizeof (int)) { 521 error = EINVAL; 522 break; 523 } 524 525 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 526 error = EPERM; 527 break; 528 } 529 530 if (*mtod(m, int *)) { 531 tp->t_flags |= TF_SIGNATURE; 532 tp->sack_enable = 0; 533 } else 534 tp->t_flags &= ~TF_SIGNATURE; 535 break; 536 #endif /* TCP_SIGNATURE */ 537 default: 538 error = ENOPROTOOPT; 539 break; 540 } 541 break; 542 543 case PRCO_GETOPT: 544 m->m_len = sizeof(int); 545 546 switch (optname) { 547 case TCP_NODELAY: 548 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 549 break; 550 case TCP_NOPUSH: 551 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 552 break; 553 case TCP_MAXSEG: 554 *mtod(m, int *) = tp->t_maxseg; 555 break; 556 case TCP_SACK_ENABLE: 557 *mtod(m, int *) = tp->sack_enable; 558 break; 559 #ifdef TCP_SIGNATURE 560 case TCP_MD5SIG: 561 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 562 break; 563 #endif 564 default: 565 error = ENOPROTOOPT; 566 break; 567 } 568 break; 569 } 570 return (error); 571 } 572 573 /* 574 * Attach TCP protocol to socket, allocating 575 * internet protocol control block, tcp control block, 576 * buffer space, and entering LISTEN state to accept connections. 577 */ 578 int 579 tcp_attach(struct socket *so, int proto) 580 { 581 struct tcpcb *tp; 582 struct inpcb *inp; 583 int error; 584 585 if (so->so_pcb) 586 return EISCONN; 587 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 588 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 589 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 590 error = soreserve(so, tcp_sendspace, tcp_recvspace); 591 if (error) 592 return (error); 593 } 594 595 NET_ASSERT_LOCKED(); 596 error = in_pcballoc(so, &tcbtable); 597 if (error) 598 return (error); 599 inp = sotoinpcb(so); 600 tp = tcp_newtcpcb(inp); 601 if (tp == NULL) { 602 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ 603 604 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 605 in_pcbdetach(inp); 606 so->so_state |= nofd; 607 return (ENOBUFS); 608 } 609 tp->t_state = TCPS_CLOSED; 610 #ifdef INET6 611 /* we disallow IPv4 mapped address completely. */ 612 if (inp->inp_flags & INP_IPV6) 613 tp->pf = PF_INET6; 614 else 615 tp->pf = PF_INET; 616 #else 617 tp->pf = PF_INET; 618 #endif 619 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 620 so->so_linger = TCP_LINGERTIME; 621 622 if (so->so_options & SO_DEBUG) 623 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0); 624 return (0); 625 } 626 627 int 628 tcp_detach(struct socket *so) 629 { 630 struct inpcb *inp; 631 struct tcpcb *otp = NULL, *tp = NULL; 632 int error = 0; 633 short ostate; 634 635 soassertlocked(so); 636 637 inp = sotoinpcb(so); 638 /* 639 * When a TCP is attached to a socket, then there will be 640 * a (struct inpcb) pointed at by the socket, and this 641 * structure will point at a subsidiary (struct tcpcb). 642 */ 643 if (inp == NULL) { 644 error = so->so_error; 645 if (error == 0) 646 error = EINVAL; 647 return (error); 648 } 649 tp = intotcpcb(inp); 650 /* tp might get 0 when using socket splicing */ 651 if (tp == NULL) 652 return (0); 653 if (so->so_options & SO_DEBUG) { 654 otp = tp; 655 ostate = tp->t_state; 656 } 657 658 /* 659 * Detach the TCP protocol from the socket. 660 * If the protocol state is non-embryonic, then can't 661 * do this directly: have to initiate a PRU_DISCONNECT, 662 * which may finish later; embryonic TCB's can just 663 * be discarded here. 664 */ 665 tp = tcp_disconnect(tp); 666 667 if (otp) 668 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0); 669 return (error); 670 } 671 672 /* 673 * Initiate (or continue) disconnect. 674 * If embryonic state, just send reset (once). 675 * If in ``let data drain'' option and linger null, just drop. 676 * Otherwise (hard), mark socket disconnecting and drop 677 * current input data; switch states based on user close, and 678 * send segment to peer (with FIN). 679 */ 680 struct tcpcb * 681 tcp_disconnect(struct tcpcb *tp) 682 { 683 struct socket *so = tp->t_inpcb->inp_socket; 684 685 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 686 tp = tcp_close(tp); 687 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 688 tp = tcp_drop(tp, 0); 689 else { 690 soisdisconnecting(so); 691 sbflush(so, &so->so_rcv); 692 tp = tcp_usrclosed(tp); 693 if (tp) 694 (void) tcp_output(tp); 695 } 696 return (tp); 697 } 698 699 /* 700 * User issued close, and wish to trail through shutdown states: 701 * if never received SYN, just forget it. If got a SYN from peer, 702 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 703 * If already got a FIN from peer, then almost done; go to LAST_ACK 704 * state. In all other cases, have already sent FIN to peer (e.g. 705 * after PRU_SHUTDOWN), and just have to play tedious game waiting 706 * for peer to send FIN or not respond to keep-alives, etc. 707 * We can let the user exit from the close as soon as the FIN is acked. 708 */ 709 struct tcpcb * 710 tcp_usrclosed(struct tcpcb *tp) 711 { 712 713 switch (tp->t_state) { 714 715 case TCPS_CLOSED: 716 case TCPS_LISTEN: 717 case TCPS_SYN_SENT: 718 tp->t_state = TCPS_CLOSED; 719 tp = tcp_close(tp); 720 break; 721 722 case TCPS_SYN_RECEIVED: 723 case TCPS_ESTABLISHED: 724 tp->t_state = TCPS_FIN_WAIT_1; 725 break; 726 727 case TCPS_CLOSE_WAIT: 728 tp->t_state = TCPS_LAST_ACK; 729 break; 730 } 731 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 732 soisdisconnected(tp->t_inpcb->inp_socket); 733 /* 734 * If we are in FIN_WAIT_2, we arrived here because the 735 * application did a shutdown of the send side. Like the 736 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 737 * a full close, we start a timer to make sure sockets are 738 * not left in FIN_WAIT_2 forever. 739 */ 740 if (tp->t_state == TCPS_FIN_WAIT_2) 741 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 742 } 743 return (tp); 744 } 745 746 /* 747 * Look up a socket for ident or tcpdrop, ... 748 */ 749 int 750 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 751 { 752 int error = 0; 753 struct tcp_ident_mapping tir; 754 struct inpcb *inp; 755 struct tcpcb *tp = NULL; 756 struct sockaddr_in *fin, *lin; 757 #ifdef INET6 758 struct sockaddr_in6 *fin6, *lin6; 759 struct in6_addr f6, l6; 760 #endif 761 762 NET_ASSERT_LOCKED(); 763 764 if (dodrop) { 765 if (oldp != NULL || *oldlenp != 0) 766 return (EINVAL); 767 if (newp == NULL) 768 return (EPERM); 769 if (newlen < sizeof(tir)) 770 return (ENOMEM); 771 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 772 return (error); 773 } else { 774 if (oldp == NULL) 775 return (EINVAL); 776 if (*oldlenp < sizeof(tir)) 777 return (ENOMEM); 778 if (newp != NULL || newlen != 0) 779 return (EINVAL); 780 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 781 return (error); 782 } 783 switch (tir.faddr.ss_family) { 784 #ifdef INET6 785 case AF_INET6: 786 fin6 = (struct sockaddr_in6 *)&tir.faddr; 787 error = in6_embedscope(&f6, fin6, NULL); 788 if (error) 789 return EINVAL; /*?*/ 790 lin6 = (struct sockaddr_in6 *)&tir.laddr; 791 error = in6_embedscope(&l6, lin6, NULL); 792 if (error) 793 return EINVAL; /*?*/ 794 break; 795 #endif 796 case AF_INET: 797 fin = (struct sockaddr_in *)&tir.faddr; 798 lin = (struct sockaddr_in *)&tir.laddr; 799 break; 800 default: 801 return (EINVAL); 802 } 803 804 switch (tir.faddr.ss_family) { 805 #ifdef INET6 806 case AF_INET6: 807 inp = in6_pcbhashlookup(&tcbtable, &f6, 808 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 809 break; 810 #endif 811 case AF_INET: 812 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 813 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 814 break; 815 default: 816 unhandled_af(tir.faddr.ss_family); 817 } 818 819 if (dodrop) { 820 if (inp && (tp = intotcpcb(inp)) && 821 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 822 tp = tcp_drop(tp, ECONNABORTED); 823 else 824 error = ESRCH; 825 return (error); 826 } 827 828 if (inp == NULL) { 829 tcpstat_inc(tcps_pcbhashmiss); 830 switch (tir.faddr.ss_family) { 831 #ifdef INET6 832 case AF_INET6: 833 inp = in6_pcblookup_listen(&tcbtable, 834 &l6, lin6->sin6_port, NULL, tir.rdomain); 835 break; 836 #endif 837 case AF_INET: 838 inp = in_pcblookup_listen(&tcbtable, 839 lin->sin_addr, lin->sin_port, NULL, tir.rdomain); 840 break; 841 } 842 } 843 844 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 845 tir.ruid = inp->inp_socket->so_ruid; 846 tir.euid = inp->inp_socket->so_euid; 847 } else { 848 tir.ruid = -1; 849 tir.euid = -1; 850 } 851 852 *oldlenp = sizeof (tir); 853 error = copyout((void *)&tir, oldp, sizeof (tir)); 854 return (error); 855 } 856 857 int 858 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 859 { 860 uint64_t counters[tcps_ncounters]; 861 struct tcpstat tcpstat; 862 struct syn_cache_set *set; 863 int i = 0; 864 865 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 866 867 memset(&tcpstat, 0, sizeof tcpstat); 868 counters_read(tcpcounters, counters, nitems(counters)); 869 ASSIGN(tcps_connattempt); 870 ASSIGN(tcps_accepts); 871 ASSIGN(tcps_connects); 872 ASSIGN(tcps_drops); 873 ASSIGN(tcps_conndrops); 874 ASSIGN(tcps_closed); 875 ASSIGN(tcps_segstimed); 876 ASSIGN(tcps_rttupdated); 877 ASSIGN(tcps_delack); 878 ASSIGN(tcps_timeoutdrop); 879 ASSIGN(tcps_rexmttimeo); 880 ASSIGN(tcps_persisttimeo); 881 ASSIGN(tcps_persistdrop); 882 ASSIGN(tcps_keeptimeo); 883 ASSIGN(tcps_keepprobe); 884 ASSIGN(tcps_keepdrops); 885 ASSIGN(tcps_sndtotal); 886 ASSIGN(tcps_sndpack); 887 ASSIGN(tcps_sndbyte); 888 ASSIGN(tcps_sndrexmitpack); 889 ASSIGN(tcps_sndrexmitbyte); 890 ASSIGN(tcps_sndrexmitfast); 891 ASSIGN(tcps_sndacks); 892 ASSIGN(tcps_sndprobe); 893 ASSIGN(tcps_sndurg); 894 ASSIGN(tcps_sndwinup); 895 ASSIGN(tcps_sndctrl); 896 ASSIGN(tcps_rcvtotal); 897 ASSIGN(tcps_rcvpack); 898 ASSIGN(tcps_rcvbyte); 899 ASSIGN(tcps_rcvbadsum); 900 ASSIGN(tcps_rcvbadoff); 901 ASSIGN(tcps_rcvmemdrop); 902 ASSIGN(tcps_rcvnosec); 903 ASSIGN(tcps_rcvshort); 904 ASSIGN(tcps_rcvduppack); 905 ASSIGN(tcps_rcvdupbyte); 906 ASSIGN(tcps_rcvpartduppack); 907 ASSIGN(tcps_rcvpartdupbyte); 908 ASSIGN(tcps_rcvoopack); 909 ASSIGN(tcps_rcvoobyte); 910 ASSIGN(tcps_rcvpackafterwin); 911 ASSIGN(tcps_rcvbyteafterwin); 912 ASSIGN(tcps_rcvafterclose); 913 ASSIGN(tcps_rcvwinprobe); 914 ASSIGN(tcps_rcvdupack); 915 ASSIGN(tcps_rcvacktoomuch); 916 ASSIGN(tcps_rcvacktooold); 917 ASSIGN(tcps_rcvackpack); 918 ASSIGN(tcps_rcvackbyte); 919 ASSIGN(tcps_rcvwinupd); 920 ASSIGN(tcps_pawsdrop); 921 ASSIGN(tcps_predack); 922 ASSIGN(tcps_preddat); 923 ASSIGN(tcps_pcbhashmiss); 924 ASSIGN(tcps_noport); 925 ASSIGN(tcps_badsyn); 926 ASSIGN(tcps_dropsyn); 927 ASSIGN(tcps_rcvbadsig); 928 ASSIGN(tcps_rcvgoodsig); 929 ASSIGN(tcps_inswcsum); 930 ASSIGN(tcps_outswcsum); 931 ASSIGN(tcps_ecn_accepts); 932 ASSIGN(tcps_ecn_rcvece); 933 ASSIGN(tcps_ecn_rcvcwr); 934 ASSIGN(tcps_ecn_rcvce); 935 ASSIGN(tcps_ecn_sndect); 936 ASSIGN(tcps_ecn_sndece); 937 ASSIGN(tcps_ecn_sndcwr); 938 ASSIGN(tcps_cwr_ecn); 939 ASSIGN(tcps_cwr_frecovery); 940 ASSIGN(tcps_cwr_timeout); 941 ASSIGN(tcps_sc_added); 942 ASSIGN(tcps_sc_completed); 943 ASSIGN(tcps_sc_timed_out); 944 ASSIGN(tcps_sc_overflowed); 945 ASSIGN(tcps_sc_reset); 946 ASSIGN(tcps_sc_unreach); 947 ASSIGN(tcps_sc_bucketoverflow); 948 ASSIGN(tcps_sc_aborted); 949 ASSIGN(tcps_sc_dupesyn); 950 ASSIGN(tcps_sc_dropped); 951 ASSIGN(tcps_sc_collisions); 952 ASSIGN(tcps_sc_retransmitted); 953 ASSIGN(tcps_sc_seedrandom); 954 ASSIGN(tcps_sc_hash_size); 955 ASSIGN(tcps_sc_entry_count); 956 ASSIGN(tcps_sc_entry_limit); 957 ASSIGN(tcps_sc_bucket_maxlen); 958 ASSIGN(tcps_sc_bucket_limit); 959 ASSIGN(tcps_sc_uses_left); 960 ASSIGN(tcps_conndrained); 961 ASSIGN(tcps_sack_recovery_episode); 962 ASSIGN(tcps_sack_rexmits); 963 ASSIGN(tcps_sack_rexmit_bytes); 964 ASSIGN(tcps_sack_rcv_opts); 965 ASSIGN(tcps_sack_snd_opts); 966 ASSIGN(tcps_sack_drop_opts); 967 968 #undef ASSIGN 969 970 set = &tcp_syn_cache[tcp_syn_cache_active]; 971 tcpstat.tcps_sc_hash_size = set->scs_size; 972 tcpstat.tcps_sc_entry_count = set->scs_count; 973 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 974 tcpstat.tcps_sc_bucket_maxlen = 0; 975 for (i = 0; i < set->scs_size; i++) { 976 if (tcpstat.tcps_sc_bucket_maxlen < 977 set->scs_buckethead[i].sch_length) 978 tcpstat.tcps_sc_bucket_maxlen = 979 set->scs_buckethead[i].sch_length; 980 } 981 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 982 tcpstat.tcps_sc_uses_left = set->scs_use; 983 984 return (sysctl_rdstruct(oldp, oldlenp, newp, 985 &tcpstat, sizeof(tcpstat))); 986 } 987 988 /* 989 * Sysctl for tcp variables. 990 */ 991 int 992 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 993 size_t newlen) 994 { 995 int error, nval; 996 997 /* All sysctl names at this level are terminal. */ 998 if (namelen != 1) 999 return (ENOTDIR); 1000 1001 switch (name[0]) { 1002 case TCPCTL_BADDYNAMIC: 1003 NET_LOCK(); 1004 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1005 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 1006 NET_UNLOCK(); 1007 return (error); 1008 1009 case TCPCTL_ROOTONLY: 1010 if (newp && securelevel > 0) 1011 return (EPERM); 1012 NET_LOCK(); 1013 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1014 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 1015 NET_UNLOCK(); 1016 return (error); 1017 1018 case TCPCTL_IDENT: 1019 NET_LOCK(); 1020 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1021 NET_UNLOCK(); 1022 return (error); 1023 1024 case TCPCTL_DROP: 1025 NET_LOCK(); 1026 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1027 NET_UNLOCK(); 1028 return (error); 1029 1030 case TCPCTL_REASS_LIMIT: 1031 NET_LOCK(); 1032 nval = tcp_reass_limit; 1033 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1034 if (!error && nval != tcp_reass_limit) { 1035 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1036 if (!error) 1037 tcp_reass_limit = nval; 1038 } 1039 NET_UNLOCK(); 1040 return (error); 1041 1042 case TCPCTL_SACKHOLE_LIMIT: 1043 NET_LOCK(); 1044 nval = tcp_sackhole_limit; 1045 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1046 if (!error && nval != tcp_sackhole_limit) { 1047 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1048 if (!error) 1049 tcp_sackhole_limit = nval; 1050 } 1051 NET_UNLOCK(); 1052 return (error); 1053 1054 case TCPCTL_STATS: 1055 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1056 1057 case TCPCTL_SYN_USE_LIMIT: 1058 NET_LOCK(); 1059 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1060 &tcp_syn_use_limit, 0, INT_MAX); 1061 if (!error && newp != NULL) { 1062 /* 1063 * Global tcp_syn_use_limit is used when reseeding a 1064 * new cache. Also update the value in active cache. 1065 */ 1066 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1067 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1068 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1069 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1070 } 1071 NET_UNLOCK(); 1072 return (error); 1073 1074 case TCPCTL_SYN_HASH_SIZE: 1075 NET_LOCK(); 1076 nval = tcp_syn_hash_size; 1077 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1078 &nval, 1, 100000); 1079 if (!error && nval != tcp_syn_hash_size) { 1080 /* 1081 * If global hash size has been changed, 1082 * switch sets as soon as possible. Then 1083 * the actual hash array will be reallocated. 1084 */ 1085 if (tcp_syn_cache[0].scs_size != nval) 1086 tcp_syn_cache[0].scs_use = 0; 1087 if (tcp_syn_cache[1].scs_size != nval) 1088 tcp_syn_cache[1].scs_use = 0; 1089 tcp_syn_hash_size = nval; 1090 } 1091 NET_UNLOCK(); 1092 return (error); 1093 1094 default: 1095 NET_LOCK(); 1096 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name, 1097 namelen, oldp, oldlenp, newp, newlen); 1098 NET_UNLOCK(); 1099 return (error); 1100 } 1101 /* NOTREACHED */ 1102 } 1103 1104 /* 1105 * Scale the send buffer so that inflight data is not accounted against 1106 * the limit. The buffer will scale with the congestion window, if the 1107 * the receiver stops acking data the window will shrink and therefore 1108 * the buffer size will shrink as well. 1109 * In low memory situation try to shrink the buffer to the initial size 1110 * disabling the send buffer scaling as long as the situation persists. 1111 */ 1112 void 1113 tcp_update_sndspace(struct tcpcb *tp) 1114 { 1115 struct socket *so = tp->t_inpcb->inp_socket; 1116 u_long nmax = so->so_snd.sb_hiwat; 1117 1118 if (sbchecklowmem()) { 1119 /* low on memory try to get rid of some */ 1120 if (tcp_sendspace < nmax) 1121 nmax = tcp_sendspace; 1122 } else if (so->so_snd.sb_wat != tcp_sendspace) 1123 /* user requested buffer size, auto-scaling disabled */ 1124 nmax = so->so_snd.sb_wat; 1125 else 1126 /* automatic buffer scaling */ 1127 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1128 tp->snd_una); 1129 1130 /* a writable socket must be preserved because of poll(2) semantics */ 1131 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1132 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1133 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1134 /* keep in sync with sbreserve() calculation */ 1135 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1136 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8; 1137 } 1138 1139 /* round to MSS boundary */ 1140 nmax = roundup(nmax, tp->t_maxseg); 1141 1142 if (nmax != so->so_snd.sb_hiwat) 1143 sbreserve(so, &so->so_snd, nmax); 1144 } 1145 1146 /* 1147 * Scale the recv buffer by looking at how much data was transferred in 1148 * on approximated RTT. If more than a big part of the recv buffer was 1149 * transferred during that time we increase the buffer by a constant. 1150 * In low memory situation try to shrink the buffer to the initial size. 1151 */ 1152 void 1153 tcp_update_rcvspace(struct tcpcb *tp) 1154 { 1155 struct socket *so = tp->t_inpcb->inp_socket; 1156 u_long nmax = so->so_rcv.sb_hiwat; 1157 1158 if (sbchecklowmem()) { 1159 /* low on memory try to get rid of some */ 1160 if (tcp_recvspace < nmax) 1161 nmax = tcp_recvspace; 1162 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1163 /* user requested buffer size, auto-scaling disabled */ 1164 nmax = so->so_rcv.sb_wat; 1165 else { 1166 /* automatic buffer scaling */ 1167 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1168 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1169 tcp_autorcvbuf_inc); 1170 } 1171 1172 /* a readable socket must be preserved because of poll(2) semantics */ 1173 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1174 nmax < so->so_snd.sb_lowat) 1175 nmax = so->so_snd.sb_lowat; 1176 1177 if (nmax == so->so_rcv.sb_hiwat) 1178 return; 1179 1180 /* round to MSS boundary */ 1181 nmax = roundup(nmax, tp->t_maxseg); 1182 sbreserve(so, &so->so_rcv, nmax); 1183 } 1184