1 /* $OpenBSD: tcp_usrreq.c,v 1.152 2017/06/26 09:32:32 mpi Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 114 115 struct inpcbtable tcbtable; 116 117 int tcp_ident(void *, size_t *, void *, size_t, int); 118 119 /* 120 * Process a TCP user request for TCP tb. If this is a send request 121 * then m is the mbuf chain of send data. If this is a timer expiration 122 * (called from the software clock routine), then timertype tells which timer. 123 */ 124 /*ARGSUSED*/ 125 int 126 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 127 struct mbuf *control, struct proc *p) 128 { 129 struct inpcb *inp; 130 struct tcpcb *tp = NULL; 131 int error = 0; 132 short ostate; 133 134 NET_ASSERT_LOCKED(); 135 136 if (req == PRU_CONTROL) { 137 #ifdef INET6 138 if (sotopf(so) == PF_INET6) 139 return in6_control(so, (u_long)m, (caddr_t)nam, 140 (struct ifnet *)control); 141 else 142 #endif /* INET6 */ 143 return (in_control(so, (u_long)m, (caddr_t)nam, 144 (struct ifnet *)control)); 145 } 146 if (control && control->m_len) { 147 m_freem(control); 148 m_freem(m); 149 return (EINVAL); 150 } 151 152 inp = sotoinpcb(so); 153 /* 154 * When a TCP is attached to a socket, then there will be 155 * a (struct inpcb) pointed at by the socket, and this 156 * structure will point at a subsidiary (struct tcpcb). 157 */ 158 if (inp == NULL) { 159 error = so->so_error; 160 if (error == 0) 161 error = EINVAL; 162 /* 163 * The following corrects an mbuf leak under rare 164 * circumstances 165 */ 166 if (req == PRU_SEND || req == PRU_SENDOOB) 167 m_freem(m); 168 return (error); 169 } 170 if (inp) { 171 tp = intotcpcb(inp); 172 /* tp might get 0 when using socket splicing */ 173 if (tp == NULL) { 174 return (0); 175 } 176 #ifdef KPROF 177 tcp_acounts[tp->t_state][req]++; 178 #endif 179 ostate = tp->t_state; 180 } else 181 ostate = 0; 182 switch (req) { 183 184 /* 185 * PRU_DETACH detaches the TCP protocol from the socket. 186 * If the protocol state is non-embryonic, then can't 187 * do this directly: have to initiate a PRU_DISCONNECT, 188 * which may finish later; embryonic TCB's can just 189 * be discarded here. 190 */ 191 case PRU_DETACH: 192 tp = tcp_disconnect(tp); 193 break; 194 195 /* 196 * Give the socket an address. 197 */ 198 case PRU_BIND: 199 error = in_pcbbind(inp, nam, p); 200 break; 201 202 /* 203 * Prepare to accept connections. 204 */ 205 case PRU_LISTEN: 206 if (inp->inp_lport == 0) 207 error = in_pcbbind(inp, NULL, p); 208 /* If the in_pcbbind() above is called, the tp->pf 209 should still be whatever it was before. */ 210 if (error == 0) 211 tp->t_state = TCPS_LISTEN; 212 break; 213 214 /* 215 * Initiate connection to peer. 216 * Create a template for use in transmissions on this connection. 217 * Enter SYN_SENT state, and mark socket as connecting. 218 * Start keep-alive timer, and seed output sequence space. 219 * Send initial segment on connection. 220 */ 221 case PRU_CONNECT: 222 switch (mtod(nam, struct sockaddr *)->sa_family) { 223 case AF_INET: { 224 struct in_addr *addr = 225 &mtod(nam, struct sockaddr_in *)->sin_addr; 226 227 if ((addr->s_addr == INADDR_ANY) || 228 (addr->s_addr == INADDR_BROADCAST) || 229 IN_MULTICAST(addr->s_addr) || 230 in_broadcast(*addr, inp->inp_rtableid)) { 231 error = EINVAL; 232 break; 233 } 234 235 error = in_pcbconnect(inp, nam); 236 break; 237 } 238 #ifdef INET6 239 case AF_INET6: { 240 struct in6_addr *addr6 = 241 &mtod(nam, struct sockaddr_in6 *)->sin6_addr; 242 243 if (IN6_IS_ADDR_UNSPECIFIED(addr6) || 244 IN6_IS_ADDR_MULTICAST(addr6)) { 245 error = EINVAL; 246 break; 247 } 248 249 error = in6_pcbconnect(inp, nam); 250 break; 251 } 252 #endif /* INET6 */ 253 default: 254 error = EAFNOSUPPORT; 255 break; 256 } 257 258 if (error) 259 break; 260 261 tp->t_template = tcp_template(tp); 262 if (tp->t_template == 0) { 263 in_pcbdisconnect(inp); 264 error = ENOBUFS; 265 break; 266 } 267 268 so->so_state |= SS_CONNECTOUT; 269 270 /* Compute window scaling to request. */ 271 tcp_rscale(tp, sb_max); 272 273 soisconnecting(so); 274 tcpstat_inc(tcps_connattempt); 275 tp->t_state = TCPS_SYN_SENT; 276 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 277 tcp_set_iss_tsm(tp); 278 tcp_sendseqinit(tp); 279 #if defined(TCP_SACK) 280 tp->snd_last = tp->snd_una; 281 #endif 282 #if defined(TCP_SACK) && defined(TCP_FACK) 283 tp->snd_fack = tp->snd_una; 284 tp->retran_data = 0; 285 tp->snd_awnd = 0; 286 #endif 287 error = tcp_output(tp); 288 break; 289 290 /* 291 * Create a TCP connection between two sockets. 292 */ 293 case PRU_CONNECT2: 294 error = EOPNOTSUPP; 295 break; 296 297 /* 298 * Initiate disconnect from peer. 299 * If connection never passed embryonic stage, just drop; 300 * else if don't need to let data drain, then can just drop anyways, 301 * else have to begin TCP shutdown process: mark socket disconnecting, 302 * drain unread data, state switch to reflect user close, and 303 * send segment (e.g. FIN) to peer. Socket will be really disconnected 304 * when peer sends FIN and acks ours. 305 * 306 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 307 */ 308 case PRU_DISCONNECT: 309 tp = tcp_disconnect(tp); 310 break; 311 312 /* 313 * Accept a connection. Essentially all the work is 314 * done at higher levels; just return the address 315 * of the peer, storing through addr. 316 */ 317 case PRU_ACCEPT: 318 #ifdef INET6 319 if (inp->inp_flags & INP_IPV6) 320 in6_setpeeraddr(inp, nam); 321 else 322 #endif 323 in_setpeeraddr(inp, nam); 324 break; 325 326 /* 327 * Mark the connection as being incapable of further output. 328 */ 329 case PRU_SHUTDOWN: 330 if (so->so_state & SS_CANTSENDMORE) 331 break; 332 socantsendmore(so); 333 tp = tcp_usrclosed(tp); 334 if (tp) 335 error = tcp_output(tp); 336 break; 337 338 /* 339 * After a receive, possibly send window update to peer. 340 */ 341 case PRU_RCVD: 342 /* 343 * soreceive() calls this function when a user receives 344 * ancillary data on a listening socket. We don't call 345 * tcp_output in such a case, since there is no header 346 * template for a listening socket and hence the kernel 347 * will panic. 348 */ 349 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 350 (void) tcp_output(tp); 351 break; 352 353 /* 354 * Do a send by putting data in output queue and updating urgent 355 * marker if URG set. Possibly send more data. 356 */ 357 case PRU_SEND: 358 sbappendstream(so, &so->so_snd, m); 359 error = tcp_output(tp); 360 break; 361 362 /* 363 * Abort the TCP. 364 */ 365 case PRU_ABORT: 366 tp = tcp_drop(tp, ECONNABORTED); 367 break; 368 369 case PRU_SENSE: 370 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 371 return (0); 372 373 case PRU_RCVOOB: 374 if ((so->so_oobmark == 0 && 375 (so->so_state & SS_RCVATMARK) == 0) || 376 so->so_options & SO_OOBINLINE || 377 tp->t_oobflags & TCPOOB_HADDATA) { 378 error = EINVAL; 379 break; 380 } 381 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 382 error = EWOULDBLOCK; 383 break; 384 } 385 m->m_len = 1; 386 *mtod(m, caddr_t) = tp->t_iobc; 387 if (((long)nam & MSG_PEEK) == 0) 388 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 389 break; 390 391 case PRU_SENDOOB: 392 if (sbspace(so, &so->so_snd) < -512) { 393 m_freem(m); 394 error = ENOBUFS; 395 break; 396 } 397 /* 398 * According to RFC961 (Assigned Protocols), 399 * the urgent pointer points to the last octet 400 * of urgent data. We continue, however, 401 * to consider it to indicate the first octet 402 * of data past the urgent section. 403 * Otherwise, snd_up should be one lower. 404 */ 405 sbappendstream(so, &so->so_snd, m); 406 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 407 tp->t_force = 1; 408 error = tcp_output(tp); 409 tp->t_force = 0; 410 break; 411 412 case PRU_SOCKADDR: 413 #ifdef INET6 414 if (inp->inp_flags & INP_IPV6) 415 in6_setsockaddr(inp, nam); 416 else 417 #endif 418 in_setsockaddr(inp, nam); 419 break; 420 421 case PRU_PEERADDR: 422 #ifdef INET6 423 if (inp->inp_flags & INP_IPV6) 424 in6_setpeeraddr(inp, nam); 425 else 426 #endif 427 in_setpeeraddr(inp, nam); 428 break; 429 430 default: 431 panic("tcp_usrreq"); 432 } 433 if (tp && (so->so_options & SO_DEBUG)) 434 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 435 return (error); 436 } 437 438 int 439 tcp_ctloutput(int op, struct socket *so, int level, int optname, 440 struct mbuf *m) 441 { 442 int error = 0; 443 struct inpcb *inp; 444 struct tcpcb *tp; 445 int i; 446 447 inp = sotoinpcb(so); 448 if (inp == NULL) { 449 if (op == PRCO_SETOPT) 450 (void) m_free(m); 451 return (ECONNRESET); 452 } 453 if (level != IPPROTO_TCP) { 454 switch (so->so_proto->pr_domain->dom_family) { 455 #ifdef INET6 456 case PF_INET6: 457 error = ip6_ctloutput(op, so, level, optname, m); 458 break; 459 #endif /* INET6 */ 460 case PF_INET: 461 error = ip_ctloutput(op, so, level, optname, m); 462 break; 463 default: 464 error = EAFNOSUPPORT; /*?*/ 465 break; 466 } 467 return (error); 468 } 469 tp = intotcpcb(inp); 470 471 switch (op) { 472 473 case PRCO_SETOPT: 474 switch (optname) { 475 476 case TCP_NODELAY: 477 if (m == NULL || m->m_len < sizeof (int)) 478 error = EINVAL; 479 else if (*mtod(m, int *)) 480 tp->t_flags |= TF_NODELAY; 481 else 482 tp->t_flags &= ~TF_NODELAY; 483 break; 484 485 case TCP_NOPUSH: 486 if (m == NULL || m->m_len < sizeof (int)) 487 error = EINVAL; 488 else if (*mtod(m, int *)) 489 tp->t_flags |= TF_NOPUSH; 490 else if (tp->t_flags & TF_NOPUSH) { 491 tp->t_flags &= ~TF_NOPUSH; 492 if (TCPS_HAVEESTABLISHED(tp->t_state)) 493 error = tcp_output(tp); 494 } 495 break; 496 497 case TCP_MAXSEG: 498 if (m == NULL || m->m_len < sizeof (int)) { 499 error = EINVAL; 500 break; 501 } 502 503 i = *mtod(m, int *); 504 if (i > 0 && i <= tp->t_maxseg) 505 tp->t_maxseg = i; 506 else 507 error = EINVAL; 508 break; 509 510 #ifdef TCP_SACK 511 case TCP_SACK_ENABLE: 512 if (m == NULL || m->m_len < sizeof (int)) { 513 error = EINVAL; 514 break; 515 } 516 517 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 518 error = EPERM; 519 break; 520 } 521 522 if (tp->t_flags & TF_SIGNATURE) { 523 error = EPERM; 524 break; 525 } 526 527 if (*mtod(m, int *)) 528 tp->sack_enable = 1; 529 else 530 tp->sack_enable = 0; 531 break; 532 #endif 533 #ifdef TCP_SIGNATURE 534 case TCP_MD5SIG: 535 if (m == NULL || m->m_len < sizeof (int)) { 536 error = EINVAL; 537 break; 538 } 539 540 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 541 error = EPERM; 542 break; 543 } 544 545 if (*mtod(m, int *)) { 546 tp->t_flags |= TF_SIGNATURE; 547 #ifdef TCP_SACK 548 tp->sack_enable = 0; 549 #endif /* TCP_SACK */ 550 } else 551 tp->t_flags &= ~TF_SIGNATURE; 552 break; 553 #endif /* TCP_SIGNATURE */ 554 default: 555 error = ENOPROTOOPT; 556 break; 557 } 558 m_free(m); 559 break; 560 561 case PRCO_GETOPT: 562 m->m_len = sizeof(int); 563 564 switch (optname) { 565 case TCP_NODELAY: 566 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 567 break; 568 case TCP_NOPUSH: 569 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 570 break; 571 case TCP_MAXSEG: 572 *mtod(m, int *) = tp->t_maxseg; 573 break; 574 #ifdef TCP_SACK 575 case TCP_SACK_ENABLE: 576 *mtod(m, int *) = tp->sack_enable; 577 break; 578 #endif 579 #ifdef TCP_SIGNATURE 580 case TCP_MD5SIG: 581 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 582 break; 583 #endif 584 default: 585 error = ENOPROTOOPT; 586 break; 587 } 588 break; 589 } 590 return (error); 591 } 592 593 /* 594 * Attach TCP protocol to socket, allocating 595 * internet protocol control block, tcp control block, 596 * bufer space, and entering LISTEN state if to accept connections. 597 */ 598 int 599 tcp_attach(struct socket *so, int proto) 600 { 601 struct tcpcb *tp; 602 struct inpcb *inp; 603 int error; 604 605 if (so->so_pcb) 606 return EISCONN; 607 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 608 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 609 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 610 error = soreserve(so, tcp_sendspace, tcp_recvspace); 611 if (error) 612 return (error); 613 } 614 615 error = in_pcballoc(so, &tcbtable); 616 if (error) 617 return (error); 618 inp = sotoinpcb(so); 619 tp = tcp_newtcpcb(inp); 620 if (tp == NULL) { 621 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 622 623 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 624 in_pcbdetach(inp); 625 so->so_state |= nofd; 626 return (ENOBUFS); 627 } 628 tp->t_state = TCPS_CLOSED; 629 #ifdef INET6 630 /* we disallow IPv4 mapped address completely. */ 631 if (inp->inp_flags & INP_IPV6) 632 tp->pf = PF_INET6; 633 else 634 tp->pf = PF_INET; 635 #else 636 tp->pf = PF_INET; 637 #endif 638 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 639 so->so_linger = TCP_LINGERTIME; 640 641 if (tp && (so->so_options & SO_DEBUG)) 642 tcp_trace(TA_USER, 0, tp, (caddr_t)0, 0 /* XXX */, 0); 643 return (0); 644 } 645 646 /* 647 * Initiate (or continue) disconnect. 648 * If embryonic state, just send reset (once). 649 * If in ``let data drain'' option and linger null, just drop. 650 * Otherwise (hard), mark socket disconnecting and drop 651 * current input data; switch states based on user close, and 652 * send segment to peer (with FIN). 653 */ 654 struct tcpcb * 655 tcp_disconnect(struct tcpcb *tp) 656 { 657 struct socket *so = tp->t_inpcb->inp_socket; 658 659 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 660 tp = tcp_close(tp); 661 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 662 tp = tcp_drop(tp, 0); 663 else { 664 soisdisconnecting(so); 665 sbflush(so, &so->so_rcv); 666 tp = tcp_usrclosed(tp); 667 if (tp) 668 (void) tcp_output(tp); 669 } 670 return (tp); 671 } 672 673 /* 674 * User issued close, and wish to trail through shutdown states: 675 * if never received SYN, just forget it. If got a SYN from peer, 676 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 677 * If already got a FIN from peer, then almost done; go to LAST_ACK 678 * state. In all other cases, have already sent FIN to peer (e.g. 679 * after PRU_SHUTDOWN), and just have to play tedious game waiting 680 * for peer to send FIN or not respond to keep-alives, etc. 681 * We can let the user exit from the close as soon as the FIN is acked. 682 */ 683 struct tcpcb * 684 tcp_usrclosed(struct tcpcb *tp) 685 { 686 687 switch (tp->t_state) { 688 689 case TCPS_CLOSED: 690 case TCPS_LISTEN: 691 case TCPS_SYN_SENT: 692 tp->t_state = TCPS_CLOSED; 693 tp = tcp_close(tp); 694 break; 695 696 case TCPS_SYN_RECEIVED: 697 case TCPS_ESTABLISHED: 698 tp->t_state = TCPS_FIN_WAIT_1; 699 break; 700 701 case TCPS_CLOSE_WAIT: 702 tp->t_state = TCPS_LAST_ACK; 703 break; 704 } 705 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 706 soisdisconnected(tp->t_inpcb->inp_socket); 707 /* 708 * If we are in FIN_WAIT_2, we arrived here because the 709 * application did a shutdown of the send side. Like the 710 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 711 * a full close, we start a timer to make sure sockets are 712 * not left in FIN_WAIT_2 forever. 713 */ 714 if (tp->t_state == TCPS_FIN_WAIT_2) 715 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 716 } 717 return (tp); 718 } 719 720 /* 721 * Look up a socket for ident or tcpdrop, ... 722 */ 723 int 724 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 725 { 726 int error = 0; 727 struct tcp_ident_mapping tir; 728 struct inpcb *inp; 729 struct tcpcb *tp = NULL; 730 struct sockaddr_in *fin, *lin; 731 #ifdef INET6 732 struct sockaddr_in6 *fin6, *lin6; 733 struct in6_addr f6, l6; 734 #endif 735 736 NET_ASSERT_LOCKED(); 737 738 if (dodrop) { 739 if (oldp != NULL || *oldlenp != 0) 740 return (EINVAL); 741 if (newp == NULL) 742 return (EPERM); 743 if (newlen < sizeof(tir)) 744 return (ENOMEM); 745 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 746 return (error); 747 } else { 748 if (oldp == NULL) 749 return (EINVAL); 750 if (*oldlenp < sizeof(tir)) 751 return (ENOMEM); 752 if (newp != NULL || newlen != 0) 753 return (EINVAL); 754 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 755 return (error); 756 } 757 switch (tir.faddr.ss_family) { 758 #ifdef INET6 759 case AF_INET6: 760 fin6 = (struct sockaddr_in6 *)&tir.faddr; 761 error = in6_embedscope(&f6, fin6, NULL); 762 if (error) 763 return EINVAL; /*?*/ 764 lin6 = (struct sockaddr_in6 *)&tir.laddr; 765 error = in6_embedscope(&l6, lin6, NULL); 766 if (error) 767 return EINVAL; /*?*/ 768 break; 769 #endif 770 case AF_INET: 771 fin = (struct sockaddr_in *)&tir.faddr; 772 lin = (struct sockaddr_in *)&tir.laddr; 773 break; 774 default: 775 return (EINVAL); 776 } 777 778 switch (tir.faddr.ss_family) { 779 #ifdef INET6 780 case AF_INET6: 781 inp = in6_pcbhashlookup(&tcbtable, &f6, 782 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 783 break; 784 #endif 785 case AF_INET: 786 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 787 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 788 break; 789 default: 790 unhandled_af(tir.faddr.ss_family); 791 } 792 793 if (dodrop) { 794 if (inp && (tp = intotcpcb(inp)) && 795 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 796 tp = tcp_drop(tp, ECONNABORTED); 797 else 798 error = ESRCH; 799 return (error); 800 } 801 802 if (inp == NULL) { 803 tcpstat_inc(tcps_pcbhashmiss); 804 switch (tir.faddr.ss_family) { 805 #ifdef INET6 806 case AF_INET6: 807 inp = in6_pcblookup_listen(&tcbtable, 808 &l6, lin6->sin6_port, 0, NULL, tir.rdomain); 809 break; 810 #endif 811 case AF_INET: 812 inp = in_pcblookup_listen(&tcbtable, 813 lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain); 814 break; 815 } 816 } 817 818 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 819 tir.ruid = inp->inp_socket->so_ruid; 820 tir.euid = inp->inp_socket->so_euid; 821 } else { 822 tir.ruid = -1; 823 tir.euid = -1; 824 } 825 826 *oldlenp = sizeof (tir); 827 error = copyout((void *)&tir, oldp, sizeof (tir)); 828 return (error); 829 } 830 831 int 832 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 833 { 834 uint64_t counters[tcps_ncounters]; 835 struct tcpstat tcpstat; 836 struct syn_cache_set *set; 837 int i = 0; 838 839 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 840 841 memset(&tcpstat, 0, sizeof tcpstat); 842 counters_read(tcpcounters, counters, nitems(counters)); 843 ASSIGN(tcps_connattempt); 844 ASSIGN(tcps_accepts); 845 ASSIGN(tcps_connects); 846 ASSIGN(tcps_drops); 847 ASSIGN(tcps_conndrops); 848 ASSIGN(tcps_closed); 849 ASSIGN(tcps_segstimed); 850 ASSIGN(tcps_rttupdated); 851 ASSIGN(tcps_delack); 852 ASSIGN(tcps_timeoutdrop); 853 ASSIGN(tcps_rexmttimeo); 854 ASSIGN(tcps_persisttimeo); 855 ASSIGN(tcps_persistdrop); 856 ASSIGN(tcps_keeptimeo); 857 ASSIGN(tcps_keepprobe); 858 ASSIGN(tcps_keepdrops); 859 ASSIGN(tcps_sndtotal); 860 ASSIGN(tcps_sndpack); 861 ASSIGN(tcps_sndbyte); 862 ASSIGN(tcps_sndrexmitpack); 863 ASSIGN(tcps_sndrexmitbyte); 864 ASSIGN(tcps_sndrexmitfast); 865 ASSIGN(tcps_sndacks); 866 ASSIGN(tcps_sndprobe); 867 ASSIGN(tcps_sndurg); 868 ASSIGN(tcps_sndwinup); 869 ASSIGN(tcps_sndctrl); 870 ASSIGN(tcps_rcvtotal); 871 ASSIGN(tcps_rcvpack); 872 ASSIGN(tcps_rcvbyte); 873 ASSIGN(tcps_rcvbadsum); 874 ASSIGN(tcps_rcvbadoff); 875 ASSIGN(tcps_rcvmemdrop); 876 ASSIGN(tcps_rcvnosec); 877 ASSIGN(tcps_rcvshort); 878 ASSIGN(tcps_rcvduppack); 879 ASSIGN(tcps_rcvdupbyte); 880 ASSIGN(tcps_rcvpartduppack); 881 ASSIGN(tcps_rcvpartdupbyte); 882 ASSIGN(tcps_rcvoopack); 883 ASSIGN(tcps_rcvoobyte); 884 ASSIGN(tcps_rcvpackafterwin); 885 ASSIGN(tcps_rcvbyteafterwin); 886 ASSIGN(tcps_rcvafterclose); 887 ASSIGN(tcps_rcvwinprobe); 888 ASSIGN(tcps_rcvdupack); 889 ASSIGN(tcps_rcvacktoomuch); 890 ASSIGN(tcps_rcvacktooold); 891 ASSIGN(tcps_rcvackpack); 892 ASSIGN(tcps_rcvackbyte); 893 ASSIGN(tcps_rcvwinupd); 894 ASSIGN(tcps_pawsdrop); 895 ASSIGN(tcps_predack); 896 ASSIGN(tcps_preddat); 897 ASSIGN(tcps_pcbhashmiss); 898 ASSIGN(tcps_noport); 899 ASSIGN(tcps_badsyn); 900 ASSIGN(tcps_dropsyn); 901 ASSIGN(tcps_rcvbadsig); 902 ASSIGN(tcps_rcvgoodsig); 903 ASSIGN(tcps_inswcsum); 904 ASSIGN(tcps_outswcsum); 905 ASSIGN(tcps_ecn_accepts); 906 ASSIGN(tcps_ecn_rcvece); 907 ASSIGN(tcps_ecn_rcvcwr); 908 ASSIGN(tcps_ecn_rcvce); 909 ASSIGN(tcps_ecn_sndect); 910 ASSIGN(tcps_ecn_sndece); 911 ASSIGN(tcps_ecn_sndcwr); 912 ASSIGN(tcps_cwr_ecn); 913 ASSIGN(tcps_cwr_frecovery); 914 ASSIGN(tcps_cwr_timeout); 915 ASSIGN(tcps_sc_added); 916 ASSIGN(tcps_sc_completed); 917 ASSIGN(tcps_sc_timed_out); 918 ASSIGN(tcps_sc_overflowed); 919 ASSIGN(tcps_sc_reset); 920 ASSIGN(tcps_sc_unreach); 921 ASSIGN(tcps_sc_bucketoverflow); 922 ASSIGN(tcps_sc_aborted); 923 ASSIGN(tcps_sc_dupesyn); 924 ASSIGN(tcps_sc_dropped); 925 ASSIGN(tcps_sc_collisions); 926 ASSIGN(tcps_sc_retransmitted); 927 ASSIGN(tcps_sc_seedrandom); 928 ASSIGN(tcps_sc_hash_size); 929 ASSIGN(tcps_sc_entry_count); 930 ASSIGN(tcps_sc_entry_limit); 931 ASSIGN(tcps_sc_bucket_maxlen); 932 ASSIGN(tcps_sc_bucket_limit); 933 ASSIGN(tcps_sc_uses_left); 934 ASSIGN(tcps_conndrained); 935 ASSIGN(tcps_sack_recovery_episode); 936 ASSIGN(tcps_sack_rexmits); 937 ASSIGN(tcps_sack_rexmit_bytes); 938 ASSIGN(tcps_sack_rcv_opts); 939 ASSIGN(tcps_sack_snd_opts); 940 941 #undef ASSIGN 942 943 set = &tcp_syn_cache[tcp_syn_cache_active]; 944 tcpstat.tcps_sc_hash_size = set->scs_size; 945 tcpstat.tcps_sc_entry_count = set->scs_count; 946 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 947 tcpstat.tcps_sc_bucket_maxlen = 0; 948 for (i = 0; i < set->scs_size; i++) { 949 if (tcpstat.tcps_sc_bucket_maxlen < 950 set->scs_buckethead[i].sch_length) 951 tcpstat.tcps_sc_bucket_maxlen = 952 set->scs_buckethead[i].sch_length; 953 } 954 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 955 tcpstat.tcps_sc_uses_left = set->scs_use; 956 957 return (sysctl_rdstruct(oldp, oldlenp, newp, 958 &tcpstat, sizeof(tcpstat))); 959 } 960 961 /* 962 * Sysctl for tcp variables. 963 */ 964 int 965 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 966 size_t newlen) 967 { 968 int error, nval; 969 970 NET_ASSERT_LOCKED(); 971 972 /* All sysctl names at this level are terminal. */ 973 if (namelen != 1) 974 return (ENOTDIR); 975 976 switch (name[0]) { 977 #ifdef TCP_SACK 978 case TCPCTL_SACK: 979 return (sysctl_int(oldp, oldlenp, newp, newlen, 980 &tcp_do_sack)); 981 #endif 982 case TCPCTL_SLOWHZ: 983 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 984 985 case TCPCTL_BADDYNAMIC: 986 return (sysctl_struct(oldp, oldlenp, newp, newlen, 987 baddynamicports.tcp, sizeof(baddynamicports.tcp))); 988 989 case TCPCTL_ROOTONLY: 990 if (newp && securelevel > 0) 991 return (EPERM); 992 return (sysctl_struct(oldp, oldlenp, newp, newlen, 993 rootonlyports.tcp, sizeof(rootonlyports.tcp))); 994 995 case TCPCTL_IDENT: 996 return (tcp_ident(oldp, oldlenp, newp, newlen, 0)); 997 998 case TCPCTL_DROP: 999 return (tcp_ident(oldp, oldlenp, newp, newlen, 1)); 1000 1001 case TCPCTL_ALWAYS_KEEPALIVE: 1002 return (sysctl_int(oldp, oldlenp, newp, newlen, 1003 &tcp_always_keepalive)); 1004 1005 #ifdef TCP_ECN 1006 case TCPCTL_ECN: 1007 return (sysctl_int(oldp, oldlenp, newp, newlen, 1008 &tcp_do_ecn)); 1009 #endif 1010 case TCPCTL_REASS_LIMIT: 1011 nval = tcp_reass_limit; 1012 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1013 if (error) 1014 return (error); 1015 if (nval != tcp_reass_limit) { 1016 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1017 if (error) 1018 return (error); 1019 tcp_reass_limit = nval; 1020 } 1021 return (0); 1022 #ifdef TCP_SACK 1023 case TCPCTL_SACKHOLE_LIMIT: 1024 nval = tcp_sackhole_limit; 1025 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1026 if (error) 1027 return (error); 1028 if (nval != tcp_sackhole_limit) { 1029 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1030 if (error) 1031 return (error); 1032 tcp_sackhole_limit = nval; 1033 } 1034 return (0); 1035 #endif 1036 1037 case TCPCTL_STATS: 1038 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1039 1040 case TCPCTL_SYN_USE_LIMIT: 1041 error = sysctl_int(oldp, oldlenp, newp, newlen, 1042 &tcp_syn_use_limit); 1043 if (error) 1044 return (error); 1045 if (newp != NULL) { 1046 /* 1047 * Global tcp_syn_use_limit is used when reseeding a 1048 * new cache. Also update the value in active cache. 1049 */ 1050 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1051 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1052 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1053 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1054 } 1055 return (0); 1056 1057 case TCPCTL_SYN_HASH_SIZE: 1058 nval = tcp_syn_hash_size; 1059 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1060 if (error) 1061 return (error); 1062 if (nval != tcp_syn_hash_size) { 1063 if (nval < 1 || nval > 100000) 1064 return (EINVAL); 1065 /* 1066 * If global hash size has been changed, switch sets as 1067 * soon as possible. Then the actual hash array will 1068 * be reallocated. 1069 */ 1070 if (tcp_syn_cache[0].scs_size != nval) 1071 tcp_syn_cache[0].scs_use = 0; 1072 if (tcp_syn_cache[1].scs_size != nval) 1073 tcp_syn_cache[1].scs_use = 0; 1074 tcp_syn_hash_size = nval; 1075 } 1076 return (0); 1077 1078 default: 1079 if (name[0] < TCPCTL_MAXID) 1080 return (sysctl_int_arr(tcpctl_vars, name, namelen, 1081 oldp, oldlenp, newp, newlen)); 1082 return (ENOPROTOOPT); 1083 } 1084 /* NOTREACHED */ 1085 } 1086 1087 /* 1088 * Scale the send buffer so that inflight data is not accounted against 1089 * the limit. The buffer will scale with the congestion window, if the 1090 * the receiver stops acking data the window will shrink and therefor 1091 * the buffer size will shrink as well. 1092 * In low memory situation try to shrink the buffer to the initial size 1093 * disabling the send buffer scaling as long as the situation persists. 1094 */ 1095 void 1096 tcp_update_sndspace(struct tcpcb *tp) 1097 { 1098 struct socket *so = tp->t_inpcb->inp_socket; 1099 u_long nmax = so->so_snd.sb_hiwat; 1100 1101 if (sbchecklowmem()) { 1102 /* low on memory try to get rid of some */ 1103 if (tcp_sendspace < nmax) 1104 nmax = tcp_sendspace; 1105 } else if (so->so_snd.sb_wat != tcp_sendspace) 1106 /* user requested buffer size, auto-scaling disabled */ 1107 nmax = so->so_snd.sb_wat; 1108 else 1109 /* automatic buffer scaling */ 1110 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1111 tp->snd_una); 1112 1113 /* a writable socket must be preserved because of poll(2) semantics */ 1114 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1115 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1116 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1117 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1118 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1119 } 1120 1121 /* round to MSS boundary */ 1122 nmax = roundup(nmax, tp->t_maxseg); 1123 1124 if (nmax != so->so_snd.sb_hiwat) 1125 sbreserve(so, &so->so_snd, nmax); 1126 } 1127 1128 /* 1129 * Scale the recv buffer by looking at how much data was transferred in 1130 * on approximated RTT. If more than a big part of the recv buffer was 1131 * transferred during that time we increase the buffer by a constant. 1132 * In low memory situation try to shrink the buffer to the initial size. 1133 */ 1134 void 1135 tcp_update_rcvspace(struct tcpcb *tp) 1136 { 1137 struct socket *so = tp->t_inpcb->inp_socket; 1138 u_long nmax = so->so_rcv.sb_hiwat; 1139 1140 if (sbchecklowmem()) { 1141 /* low on memory try to get rid of some */ 1142 if (tcp_recvspace < nmax) 1143 nmax = tcp_recvspace; 1144 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1145 /* user requested buffer size, auto-scaling disabled */ 1146 nmax = so->so_rcv.sb_wat; 1147 else { 1148 /* automatic buffer scaling */ 1149 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1150 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1151 tcp_autorcvbuf_inc); 1152 } 1153 1154 /* a readable socket must be preserved because of poll(2) semantics */ 1155 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1156 nmax < so->so_snd.sb_lowat) 1157 nmax = so->so_snd.sb_lowat; 1158 1159 if (nmax == so->so_rcv.sb_hiwat) 1160 return; 1161 1162 /* round to MSS boundary */ 1163 nmax = roundup(nmax, tp->t_maxseg); 1164 sbreserve(so, &so->so_rcv, nmax); 1165 } 1166