1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1993 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 67 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $ 68 */ 69 70 #include "opt_ipsec.h" 71 #include "opt_inet.h" 72 #include "opt_inet6.h" 73 #include "opt_tcpdebug.h" 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/kernel.h> 78 #include <sys/malloc.h> 79 #include <sys/sysctl.h> 80 #include <sys/globaldata.h> 81 #include <sys/thread.h> 82 83 #include <sys/mbuf.h> 84 #ifdef INET6 85 #include <sys/domain.h> 86 #endif /* INET6 */ 87 #include <sys/socket.h> 88 #include <sys/socketvar.h> 89 #include <sys/socketops.h> 90 #include <sys/protosw.h> 91 92 #include <sys/thread2.h> 93 #include <sys/msgport2.h> 94 #include <sys/socketvar2.h> 95 96 #include <net/if.h> 97 #include <net/netisr.h> 98 #include <net/route.h> 99 100 #include <net/netmsg2.h> 101 102 #include <netinet/in.h> 103 #include <netinet/in_systm.h> 104 #ifdef INET6 105 #include <netinet/ip6.h> 106 #endif 107 #include <netinet/in_pcb.h> 108 #ifdef INET6 109 #include <netinet6/in6_pcb.h> 110 #endif 111 #include <netinet/in_var.h> 112 #include <netinet/ip_var.h> 113 #ifdef INET6 114 #include <netinet6/ip6_var.h> 115 #include <netinet6/tcp6_var.h> 116 #endif 117 #include <netinet/tcp.h> 118 #include <netinet/tcp_fsm.h> 119 #include <netinet/tcp_seq.h> 120 #include <netinet/tcp_timer.h> 121 #include <netinet/tcp_timer2.h> 122 #include <netinet/tcp_var.h> 123 #include <netinet/tcpip.h> 124 #ifdef TCPDEBUG 125 #include <netinet/tcp_debug.h> 126 #endif 127 128 #ifdef IPSEC 129 #include <netinet6/ipsec.h> 130 #endif /*IPSEC*/ 131 132 /* 133 * TCP protocol interface to socket abstraction. 134 */ 135 extern char *tcpstates[]; /* XXX ??? */ 136 137 static int tcp_attach (struct socket *, struct pru_attach_info *); 138 static void tcp_connect (netmsg_t msg); 139 #ifdef INET6 140 static void tcp6_connect (netmsg_t msg); 141 static int tcp6_connect_oncpu(struct tcpcb *tp, int flags, 142 struct mbuf **mp, 143 struct sockaddr_in6 *sin6, 144 struct in6_addr *addr6); 145 #endif /* INET6 */ 146 static struct tcpcb * 147 tcp_disconnect (struct tcpcb *); 148 static struct tcpcb * 149 tcp_usrclosed (struct tcpcb *); 150 151 #ifdef TCPDEBUG 152 #define TCPDEBUG0 int ostate = 0 153 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 154 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 155 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 156 #else 157 #define TCPDEBUG0 158 #define TCPDEBUG1() 159 #define TCPDEBUG2(req) 160 #endif 161 162 static int tcp_lport_extension = 1; 163 SYSCTL_INT(_net_inet_tcp, OID_AUTO, lportext, CTLFLAG_RW, 164 &tcp_lport_extension, 0, ""); 165 166 /* 167 * For some ill optimized programs, which try to use TCP_NOPUSH 168 * to improve performance, will have small amount of data sits 169 * in the sending buffer. These small amount of data will _not_ 170 * be pushed into the network until more data are written into 171 * the socket or the socket write side is shutdown. 172 */ 173 static int tcp_disable_nopush = 1; 174 SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW, 175 &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect"); 176 177 /* 178 * TCP attaches to socket via pru_attach(), reserving space, 179 * and an internet control block. This is likely occuring on 180 * cpu0 and may have to move later when we bind/connect. 181 */ 182 static void 183 tcp_usr_attach(netmsg_t msg) 184 { 185 struct socket *so = msg->base.nm_so; 186 struct pru_attach_info *ai = msg->attach.nm_ai; 187 int error; 188 struct inpcb *inp; 189 struct tcpcb *tp = NULL; 190 TCPDEBUG0; 191 192 soreference(so); 193 inp = so->so_pcb; 194 TCPDEBUG1(); 195 if (inp) { 196 error = EISCONN; 197 goto out; 198 } 199 200 error = tcp_attach(so, ai); 201 if (error) 202 goto out; 203 204 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 205 so->so_linger = TCP_LINGERTIME; 206 tp = sototcpcb(so); 207 out: 208 sofree(so); /* from ref above */ 209 TCPDEBUG2(PRU_ATTACH); 210 lwkt_replymsg(&msg->lmsg, error); 211 } 212 213 /* 214 * pru_detach() detaches the TCP protocol from the socket. 215 * If the protocol state is non-embryonic, then can't 216 * do this directly: have to initiate a pru_disconnect(), 217 * which may finish later; embryonic TCB's can just 218 * be discarded here. 219 */ 220 static void 221 tcp_usr_detach(netmsg_t msg) 222 { 223 struct socket *so = msg->base.nm_so; 224 int error = 0; 225 struct inpcb *inp; 226 struct tcpcb *tp; 227 TCPDEBUG0; 228 229 inp = so->so_pcb; 230 231 /* 232 * If the inp is already detached it may have been due to an async 233 * close. Just return as if no error occured. 234 * 235 * It's possible for the tcpcb (tp) to disconnect from the inp due 236 * to tcp_drop()->tcp_close() being called. This may occur *after* 237 * the detach message has been queued so we may find a NULL tp here. 238 */ 239 if (inp) { 240 if ((tp = intotcpcb(inp)) != NULL) { 241 TCPDEBUG1(); 242 tp = tcp_disconnect(tp); 243 TCPDEBUG2(PRU_DETACH); 244 } 245 } 246 lwkt_replymsg(&msg->lmsg, error); 247 } 248 249 /* 250 * NOTE: ignore_error is non-zero for certain disconnection races 251 * which we want to silently allow, otherwise close() may return 252 * an unexpected error. 253 * 254 * NOTE: The variables (msg) and (tp) are assumed. 255 */ 256 #define COMMON_START(so, inp, ignore_error) \ 257 TCPDEBUG0; \ 258 \ 259 inp = so->so_pcb; \ 260 do { \ 261 if (inp == NULL) { \ 262 error = ignore_error ? 0 : EINVAL; \ 263 tp = NULL; \ 264 goto out; \ 265 } \ 266 tp = intotcpcb(inp); \ 267 TCPDEBUG1(); \ 268 } while(0) 269 270 #define COMMON_END1(req, noreply) \ 271 out: do { \ 272 TCPDEBUG2(req); \ 273 if (!(noreply)) \ 274 lwkt_replymsg(&msg->lmsg, error); \ 275 return; \ 276 } while(0) 277 278 #define COMMON_END(req) COMMON_END1((req), 0) 279 280 /* 281 * Give the socket an address. 282 */ 283 static void 284 tcp_usr_bind(netmsg_t msg) 285 { 286 struct socket *so = msg->bind.base.nm_so; 287 struct sockaddr *nam = msg->bind.nm_nam; 288 struct thread *td = msg->bind.nm_td; 289 int error = 0; 290 struct inpcb *inp; 291 struct tcpcb *tp; 292 struct sockaddr_in *sinp; 293 294 COMMON_START(so, inp, 0); 295 296 /* 297 * Must check for multicast addresses and disallow binding 298 * to them. 299 */ 300 sinp = (struct sockaddr_in *)nam; 301 if (sinp->sin_family == AF_INET && 302 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 303 error = EAFNOSUPPORT; 304 goto out; 305 } 306 error = in_pcbbind(inp, nam, td); 307 if (error) 308 goto out; 309 COMMON_END(PRU_BIND); 310 311 } 312 313 #ifdef INET6 314 315 static void 316 tcp6_usr_bind(netmsg_t msg) 317 { 318 struct socket *so = msg->bind.base.nm_so; 319 struct sockaddr *nam = msg->bind.nm_nam; 320 struct thread *td = msg->bind.nm_td; 321 int error = 0; 322 struct inpcb *inp; 323 struct tcpcb *tp; 324 struct sockaddr_in6 *sin6p; 325 326 COMMON_START(so, inp, 0); 327 328 /* 329 * Must check for multicast addresses and disallow binding 330 * to them. 331 */ 332 sin6p = (struct sockaddr_in6 *)nam; 333 if (sin6p->sin6_family == AF_INET6 && 334 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 335 error = EAFNOSUPPORT; 336 goto out; 337 } 338 inp->inp_vflag &= ~INP_IPV4; 339 inp->inp_vflag |= INP_IPV6; 340 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 341 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 342 inp->inp_vflag |= INP_IPV4; 343 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 344 struct sockaddr_in sin; 345 346 in6_sin6_2_sin(&sin, sin6p); 347 inp->inp_vflag |= INP_IPV4; 348 inp->inp_vflag &= ~INP_IPV6; 349 error = in_pcbbind(inp, (struct sockaddr *)&sin, td); 350 goto out; 351 } 352 } 353 error = in6_pcbbind(inp, nam, td); 354 if (error) 355 goto out; 356 COMMON_END(PRU_BIND); 357 } 358 #endif /* INET6 */ 359 360 struct netmsg_inswildcard { 361 struct netmsg_base base; 362 struct inpcb *nm_inp; 363 }; 364 365 static void 366 in_pcbinswildcardhash_handler(netmsg_t msg) 367 { 368 struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg; 369 int cpu = mycpuid, nextcpu; 370 371 in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]); 372 373 nextcpu = cpu + 1; 374 if (nextcpu < ncpus2) 375 lwkt_forwardmsg(netisr_portfn(nextcpu), &nm->base.lmsg); 376 else 377 lwkt_replymsg(&nm->base.lmsg, 0); 378 } 379 380 /* 381 * Prepare to accept connections. 382 */ 383 static void 384 tcp_usr_listen(netmsg_t msg) 385 { 386 struct socket *so = msg->listen.base.nm_so; 387 struct thread *td = msg->listen.nm_td; 388 int error = 0; 389 struct inpcb *inp; 390 struct tcpcb *tp; 391 struct netmsg_inswildcard nm; 392 393 COMMON_START(so, inp, 0); 394 395 if (tp->t_flags & TF_LISTEN) 396 goto out; 397 398 if (inp->inp_lport == 0) { 399 error = in_pcbbind(inp, NULL, td); 400 if (error) 401 goto out; 402 } 403 404 tp->t_state = TCPS_LISTEN; 405 tp->t_flags |= TF_LISTEN; 406 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 407 408 if (ncpus > 1) { 409 /* 410 * We have to set the flag because we can't have other cpus 411 * messing with our inp's flags. 412 */ 413 KASSERT(!(inp->inp_flags & INP_CONNECTED), 414 ("already on connhash")); 415 KASSERT(!(inp->inp_flags & INP_WILDCARD), 416 ("already on wildcardhash")); 417 KASSERT(!(inp->inp_flags & INP_WILDCARD_MP), 418 ("already on MP wildcardhash")); 419 inp->inp_flags |= INP_WILDCARD_MP; 420 421 KKASSERT(so->so_port == netisr_portfn(0)); 422 KKASSERT(&curthread->td_msgport == netisr_portfn(0)); 423 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 424 425 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 426 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 427 nm.nm_inp = inp; 428 lwkt_domsg(netisr_portfn(1), &nm.base.lmsg, 0); 429 } 430 in_pcbinswildcardhash(inp); 431 COMMON_END(PRU_LISTEN); 432 } 433 434 #ifdef INET6 435 436 static void 437 tcp6_usr_listen(netmsg_t msg) 438 { 439 struct socket *so = msg->listen.base.nm_so; 440 struct thread *td = msg->listen.nm_td; 441 int error = 0; 442 struct inpcb *inp; 443 struct tcpcb *tp; 444 struct netmsg_inswildcard nm; 445 446 COMMON_START(so, inp, 0); 447 448 if (tp->t_flags & TF_LISTEN) 449 goto out; 450 451 if (inp->inp_lport == 0) { 452 if (!(inp->inp_flags & IN6P_IPV6_V6ONLY)) 453 inp->inp_vflag |= INP_IPV4; 454 else 455 inp->inp_vflag &= ~INP_IPV4; 456 error = in6_pcbbind(inp, NULL, td); 457 if (error) 458 goto out; 459 } 460 461 tp->t_state = TCPS_LISTEN; 462 tp->t_flags |= TF_LISTEN; 463 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 464 465 if (ncpus > 1) { 466 /* 467 * We have to set the flag because we can't have other cpus 468 * messing with our inp's flags. 469 */ 470 KASSERT(!(inp->inp_flags & INP_CONNECTED), 471 ("already on connhash")); 472 KASSERT(!(inp->inp_flags & INP_WILDCARD), 473 ("already on wildcardhash")); 474 KASSERT(!(inp->inp_flags & INP_WILDCARD_MP), 475 ("already on MP wildcardhash")); 476 inp->inp_flags |= INP_WILDCARD_MP; 477 478 KKASSERT(so->so_port == netisr_portfn(0)); 479 KKASSERT(&curthread->td_msgport == netisr_portfn(0)); 480 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 481 482 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 483 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 484 nm.nm_inp = inp; 485 lwkt_domsg(netisr_portfn(1), &nm.base.lmsg, 0); 486 } 487 in_pcbinswildcardhash(inp); 488 COMMON_END(PRU_LISTEN); 489 } 490 #endif /* INET6 */ 491 492 /* 493 * Initiate connection to peer. 494 * Create a template for use in transmissions on this connection. 495 * Enter SYN_SENT state, and mark socket as connecting. 496 * Start keep-alive timer, and seed output sequence space. 497 * Send initial segment on connection. 498 */ 499 static void 500 tcp_usr_connect(netmsg_t msg) 501 { 502 struct socket *so = msg->connect.base.nm_so; 503 struct sockaddr *nam = msg->connect.nm_nam; 504 struct thread *td = msg->connect.nm_td; 505 int error = 0; 506 struct inpcb *inp; 507 struct tcpcb *tp; 508 struct sockaddr_in *sinp; 509 510 COMMON_START(so, inp, 0); 511 512 /* 513 * Must disallow TCP ``connections'' to multicast addresses. 514 */ 515 sinp = (struct sockaddr_in *)nam; 516 if (sinp->sin_family == AF_INET 517 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 518 error = EAFNOSUPPORT; 519 goto out; 520 } 521 522 if (!prison_remote_ip(td, (struct sockaddr*)sinp)) { 523 error = EAFNOSUPPORT; /* IPv6 only jail */ 524 goto out; 525 } 526 527 tcp_connect(msg); 528 /* msg is invalid now */ 529 return; 530 out: 531 if (msg->connect.nm_m) { 532 m_freem(msg->connect.nm_m); 533 msg->connect.nm_m = NULL; 534 } 535 lwkt_replymsg(&msg->lmsg, error); 536 } 537 538 #ifdef INET6 539 540 static void 541 tcp6_usr_connect(netmsg_t msg) 542 { 543 struct socket *so = msg->connect.base.nm_so; 544 struct sockaddr *nam = msg->connect.nm_nam; 545 struct thread *td = msg->connect.nm_td; 546 int error = 0; 547 struct inpcb *inp; 548 struct tcpcb *tp; 549 struct sockaddr_in6 *sin6p; 550 551 COMMON_START(so, inp, 0); 552 553 /* 554 * Must disallow TCP ``connections'' to multicast addresses. 555 */ 556 sin6p = (struct sockaddr_in6 *)nam; 557 if (sin6p->sin6_family == AF_INET6 558 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 559 error = EAFNOSUPPORT; 560 goto out; 561 } 562 563 if (!prison_remote_ip(td, nam)) { 564 error = EAFNOSUPPORT; /* IPv4 only jail */ 565 goto out; 566 } 567 568 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 569 struct sockaddr_in *sinp; 570 571 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 572 error = EINVAL; 573 goto out; 574 } 575 sinp = kmalloc(sizeof(*sinp), M_LWKTMSG, M_INTWAIT); 576 in6_sin6_2_sin(sinp, sin6p); 577 inp->inp_vflag |= INP_IPV4; 578 inp->inp_vflag &= ~INP_IPV6; 579 msg->connect.nm_nam = (struct sockaddr *)sinp; 580 msg->connect.nm_reconnect |= NMSG_RECONNECT_NAMALLOC; 581 tcp_connect(msg); 582 /* msg is invalid now */ 583 return; 584 } 585 inp->inp_vflag &= ~INP_IPV4; 586 inp->inp_vflag |= INP_IPV6; 587 inp->inp_inc.inc_isipv6 = 1; 588 589 msg->connect.nm_reconnect |= NMSG_RECONNECT_FALLBACK; 590 tcp6_connect(msg); 591 /* msg is invalid now */ 592 return; 593 out: 594 if (msg->connect.nm_m) { 595 m_freem(msg->connect.nm_m); 596 msg->connect.nm_m = NULL; 597 } 598 lwkt_replymsg(&msg->lmsg, error); 599 } 600 601 #endif /* INET6 */ 602 603 /* 604 * Initiate disconnect from peer. 605 * If connection never passed embryonic stage, just drop; 606 * else if don't need to let data drain, then can just drop anyways, 607 * else have to begin TCP shutdown process: mark socket disconnecting, 608 * drain unread data, state switch to reflect user close, and 609 * send segment (e.g. FIN) to peer. Socket will be really disconnected 610 * when peer sends FIN and acks ours. 611 * 612 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 613 */ 614 static void 615 tcp_usr_disconnect(netmsg_t msg) 616 { 617 struct socket *so = msg->disconnect.base.nm_so; 618 int error = 0; 619 struct inpcb *inp; 620 struct tcpcb *tp; 621 622 COMMON_START(so, inp, 1); 623 tp = tcp_disconnect(tp); 624 COMMON_END(PRU_DISCONNECT); 625 } 626 627 /* 628 * Accept a connection. Essentially all the work is 629 * done at higher levels; just return the address 630 * of the peer, storing through addr. 631 */ 632 static void 633 tcp_usr_accept(netmsg_t msg) 634 { 635 struct socket *so = msg->accept.base.nm_so; 636 struct sockaddr **nam = msg->accept.nm_nam; 637 int error = 0; 638 struct inpcb *inp; 639 struct tcpcb *tp = NULL; 640 TCPDEBUG0; 641 642 inp = so->so_pcb; 643 if (so->so_state & SS_ISDISCONNECTED) { 644 error = ECONNABORTED; 645 goto out; 646 } 647 if (inp == 0) { 648 error = EINVAL; 649 goto out; 650 } 651 652 tp = intotcpcb(inp); 653 TCPDEBUG1(); 654 in_setpeeraddr(so, nam); 655 COMMON_END(PRU_ACCEPT); 656 } 657 658 #ifdef INET6 659 static void 660 tcp6_usr_accept(netmsg_t msg) 661 { 662 struct socket *so = msg->accept.base.nm_so; 663 struct sockaddr **nam = msg->accept.nm_nam; 664 int error = 0; 665 struct inpcb *inp; 666 struct tcpcb *tp = NULL; 667 TCPDEBUG0; 668 669 inp = so->so_pcb; 670 671 if (so->so_state & SS_ISDISCONNECTED) { 672 error = ECONNABORTED; 673 goto out; 674 } 675 if (inp == 0) { 676 error = EINVAL; 677 goto out; 678 } 679 tp = intotcpcb(inp); 680 TCPDEBUG1(); 681 in6_mapped_peeraddr(so, nam); 682 COMMON_END(PRU_ACCEPT); 683 } 684 #endif /* INET6 */ 685 /* 686 * Mark the connection as being incapable of further output. 687 */ 688 static void 689 tcp_usr_shutdown(netmsg_t msg) 690 { 691 struct socket *so = msg->shutdown.base.nm_so; 692 int error = 0; 693 struct inpcb *inp; 694 struct tcpcb *tp; 695 696 COMMON_START(so, inp, 0); 697 socantsendmore(so); 698 tp = tcp_usrclosed(tp); 699 if (tp) 700 error = tcp_output(tp); 701 COMMON_END(PRU_SHUTDOWN); 702 } 703 704 /* 705 * After a receive, possibly send window update to peer. 706 */ 707 static void 708 tcp_usr_rcvd(netmsg_t msg) 709 { 710 struct socket *so = msg->rcvd.base.nm_so; 711 int error = 0, noreply = 0; 712 struct inpcb *inp; 713 struct tcpcb *tp; 714 715 COMMON_START(so, inp, 0); 716 717 if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) { 718 noreply = 1; 719 so_async_rcvd_reply(so); 720 } 721 tcp_output(tp); 722 723 COMMON_END1(PRU_RCVD, noreply); 724 } 725 726 /* 727 * Do a send by putting data in output queue and updating urgent 728 * marker if URG set. Possibly send more data. Unlike the other 729 * pru_*() routines, the mbuf chains are our responsibility. We 730 * must either enqueue them or free them. The other pru_* routines 731 * generally are caller-frees. 732 */ 733 static void 734 tcp_usr_send(netmsg_t msg) 735 { 736 struct socket *so = msg->send.base.nm_so; 737 int flags = msg->send.nm_flags; 738 struct mbuf *m = msg->send.nm_m; 739 int error = 0; 740 struct inpcb *inp; 741 struct tcpcb *tp; 742 TCPDEBUG0; 743 744 KKASSERT(msg->send.nm_control == NULL); 745 KKASSERT(msg->send.nm_addr == NULL); 746 KKASSERT((flags & PRUS_FREEADDR) == 0); 747 748 inp = so->so_pcb; 749 750 if (inp == NULL) { 751 /* 752 * OOPS! we lost a race, the TCP session got reset after 753 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a 754 * network interrupt in the non-critical section of sosend(). 755 */ 756 m_freem(m); 757 error = ECONNRESET; /* XXX EPIPE? */ 758 tp = NULL; 759 TCPDEBUG1(); 760 goto out; 761 } 762 tp = intotcpcb(inp); 763 TCPDEBUG1(); 764 765 #ifdef foo 766 /* 767 * This is no longer necessary, since: 768 * - sosendtcp() has already checked it for us 769 * - It does not work with asynchronized send 770 */ 771 772 /* 773 * Don't let too much OOB data build up 774 */ 775 if (flags & PRUS_OOB) { 776 if (ssb_space(&so->so_snd) < -512) { 777 m_freem(m); 778 error = ENOBUFS; 779 goto out; 780 } 781 } 782 #endif 783 784 /* 785 * Pump the data into the socket. 786 */ 787 if (m) 788 ssb_appendstream(&so->so_snd, m); 789 if (flags & PRUS_OOB) { 790 /* 791 * According to RFC961 (Assigned Protocols), 792 * the urgent pointer points to the last octet 793 * of urgent data. We continue, however, 794 * to consider it to indicate the first octet 795 * of data past the urgent section. 796 * Otherwise, snd_up should be one lower. 797 */ 798 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 799 tp->t_flags |= TF_FORCE; 800 error = tcp_output(tp); 801 tp->t_flags &= ~TF_FORCE; 802 } else { 803 if (flags & PRUS_EOF) { 804 /* 805 * Close the send side of the connection after 806 * the data is sent. 807 */ 808 socantsendmore(so); 809 tp = tcp_usrclosed(tp); 810 } 811 if (tp != NULL && !tcp_output_pending(tp)) { 812 if (flags & PRUS_MORETOCOME) 813 tp->t_flags |= TF_MORETOCOME; 814 error = tcp_output_fair(tp); 815 if (flags & PRUS_MORETOCOME) 816 tp->t_flags &= ~TF_MORETOCOME; 817 } 818 } 819 COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB : 820 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), 821 (flags & PRUS_NOREPLY)); 822 } 823 824 /* 825 * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort() 826 * will sofree() it when we return. 827 */ 828 static void 829 tcp_usr_abort(netmsg_t msg) 830 { 831 struct socket *so = msg->abort.base.nm_so; 832 int error = 0; 833 struct inpcb *inp; 834 struct tcpcb *tp; 835 836 COMMON_START(so, inp, 1); 837 tp = tcp_drop(tp, ECONNABORTED); 838 COMMON_END(PRU_ABORT); 839 } 840 841 /* 842 * Receive out-of-band data. 843 */ 844 static void 845 tcp_usr_rcvoob(netmsg_t msg) 846 { 847 struct socket *so = msg->rcvoob.base.nm_so; 848 struct mbuf *m = msg->rcvoob.nm_m; 849 int flags = msg->rcvoob.nm_flags; 850 int error = 0; 851 struct inpcb *inp; 852 struct tcpcb *tp; 853 854 COMMON_START(so, inp, 0); 855 if ((so->so_oobmark == 0 && 856 (so->so_state & SS_RCVATMARK) == 0) || 857 so->so_options & SO_OOBINLINE || 858 tp->t_oobflags & TCPOOB_HADDATA) { 859 error = EINVAL; 860 goto out; 861 } 862 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 863 error = EWOULDBLOCK; 864 goto out; 865 } 866 m->m_len = 1; 867 *mtod(m, caddr_t) = tp->t_iobc; 868 if ((flags & MSG_PEEK) == 0) 869 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 870 COMMON_END(PRU_RCVOOB); 871 } 872 873 static void 874 tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 875 { 876 in_savefaddr(so, faddr); 877 } 878 879 #ifdef INET6 880 static void 881 tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 882 { 883 in6_mapped_savefaddr(so, faddr); 884 } 885 #endif 886 887 /* xxx - should be const */ 888 struct pr_usrreqs tcp_usrreqs = { 889 .pru_abort = tcp_usr_abort, 890 .pru_accept = tcp_usr_accept, 891 .pru_attach = tcp_usr_attach, 892 .pru_bind = tcp_usr_bind, 893 .pru_connect = tcp_usr_connect, 894 .pru_connect2 = pr_generic_notsupp, 895 .pru_control = in_control_dispatch, 896 .pru_detach = tcp_usr_detach, 897 .pru_disconnect = tcp_usr_disconnect, 898 .pru_listen = tcp_usr_listen, 899 .pru_peeraddr = in_setpeeraddr_dispatch, 900 .pru_rcvd = tcp_usr_rcvd, 901 .pru_rcvoob = tcp_usr_rcvoob, 902 .pru_send = tcp_usr_send, 903 .pru_sense = pru_sense_null, 904 .pru_shutdown = tcp_usr_shutdown, 905 .pru_sockaddr = in_setsockaddr_dispatch, 906 .pru_sosend = sosendtcp, 907 .pru_soreceive = sorecvtcp, 908 .pru_savefaddr = tcp_usr_savefaddr 909 }; 910 911 #ifdef INET6 912 struct pr_usrreqs tcp6_usrreqs = { 913 .pru_abort = tcp_usr_abort, 914 .pru_accept = tcp6_usr_accept, 915 .pru_attach = tcp_usr_attach, 916 .pru_bind = tcp6_usr_bind, 917 .pru_connect = tcp6_usr_connect, 918 .pru_connect2 = pr_generic_notsupp, 919 .pru_control = in6_control_dispatch, 920 .pru_detach = tcp_usr_detach, 921 .pru_disconnect = tcp_usr_disconnect, 922 .pru_listen = tcp6_usr_listen, 923 .pru_peeraddr = in6_mapped_peeraddr_dispatch, 924 .pru_rcvd = tcp_usr_rcvd, 925 .pru_rcvoob = tcp_usr_rcvoob, 926 .pru_send = tcp_usr_send, 927 .pru_sense = pru_sense_null, 928 .pru_shutdown = tcp_usr_shutdown, 929 .pru_sockaddr = in6_mapped_sockaddr_dispatch, 930 .pru_sosend = sosendtcp, 931 .pru_soreceive = sorecvtcp, 932 .pru_savefaddr = tcp6_usr_savefaddr 933 }; 934 #endif /* INET6 */ 935 936 static int 937 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m, 938 struct sockaddr_in *sin, struct sockaddr_in *if_sin) 939 { 940 struct inpcb *inp = tp->t_inpcb, *oinp; 941 struct socket *so = inp->inp_socket; 942 struct route *ro = &inp->inp_route; 943 944 oinp = in_pcblookup_hash(&tcbinfo[mycpu->gd_cpuid], 945 sin->sin_addr, sin->sin_port, 946 (inp->inp_laddr.s_addr != INADDR_ANY ? 947 inp->inp_laddr : if_sin->sin_addr), 948 inp->inp_lport, 0, NULL); 949 if (oinp != NULL) { 950 m_freem(m); 951 return (EADDRINUSE); 952 } 953 if (inp->inp_laddr.s_addr == INADDR_ANY) 954 inp->inp_laddr = if_sin->sin_addr; 955 inp->inp_faddr = sin->sin_addr; 956 inp->inp_fport = sin->sin_port; 957 inp->inp_cpcbinfo = &tcbinfo[mycpu->gd_cpuid]; 958 in_pcbinsconnhash(inp); 959 960 /* 961 * We are now on the inpcb's owner CPU, if the cached route was 962 * freed because the rtentry's owner CPU is not the current CPU 963 * (e.g. in tcp_connect()), then we try to reallocate it here with 964 * the hope that a rtentry may be cloned from a RTF_PRCLONING 965 * rtentry. 966 */ 967 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 968 ro->ro_rt == NULL) { 969 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 970 ro->ro_dst.sa_family = AF_INET; 971 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 972 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = 973 sin->sin_addr; 974 rtalloc(ro); 975 } 976 977 /* 978 * Now that no more errors can occur, change the protocol processing 979 * port to the current thread (which is the correct thread). 980 * 981 * Create TCP timer message now; we are on the tcpcb's owner 982 * CPU/thread. 983 */ 984 tcp_create_timermsg(tp, &curthread->td_msgport); 985 986 /* 987 * Compute window scaling to request. Use a larger scaling then 988 * needed for the initial receive buffer in case the receive buffer 989 * gets expanded. 990 */ 991 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 992 tp->request_r_scale = TCP_MIN_WINSHIFT; 993 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 994 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat 995 ) { 996 tp->request_r_scale++; 997 } 998 999 soisconnecting(so); 1000 tcpstat.tcps_connattempt++; 1001 tp->t_state = TCPS_SYN_SENT; 1002 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1003 tp->iss = tcp_new_isn(tp); 1004 tcp_sendseqinit(tp); 1005 if (m) { 1006 ssb_appendstream(&so->so_snd, m); 1007 m = NULL; 1008 if (flags & PRUS_OOB) 1009 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1010 } 1011 1012 /* 1013 * Close the send side of the connection after 1014 * the data is sent if flagged. 1015 */ 1016 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1017 socantsendmore(so); 1018 tp = tcp_usrclosed(tp); 1019 } 1020 return (tcp_output(tp)); 1021 } 1022 1023 /* 1024 * Common subroutine to open a TCP connection to remote host specified 1025 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1026 * port number if needed. Call in_pcbladdr to do the routing and to choose 1027 * a local host address (interface). 1028 * Initialize connection parameters and enter SYN-SENT state. 1029 */ 1030 static void 1031 tcp_connect(netmsg_t msg) 1032 { 1033 struct socket *so = msg->connect.base.nm_so; 1034 struct sockaddr *nam = msg->connect.nm_nam; 1035 struct thread *td = msg->connect.nm_td; 1036 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1037 struct sockaddr_in *if_sin; 1038 struct inpcb *inp; 1039 struct tcpcb *tp; 1040 int error, calc_laddr = 1; 1041 lwkt_port_t port; 1042 1043 COMMON_START(so, inp, 0); 1044 1045 /* 1046 * Reconnect our pcb if we have to 1047 */ 1048 if (msg->connect.nm_reconnect & NMSG_RECONNECT_RECONNECT) { 1049 msg->connect.nm_reconnect &= ~NMSG_RECONNECT_RECONNECT; 1050 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1051 } 1052 1053 /* 1054 * Bind if we have to 1055 */ 1056 if (inp->inp_lport == 0) { 1057 if (tcp_lport_extension) { 1058 KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY); 1059 1060 error = in_pcbladdr(inp, nam, &if_sin, td); 1061 if (error) 1062 goto out; 1063 inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr; 1064 1065 error = in_pcbconn_bind(inp, nam, td); 1066 if (error) 1067 goto out; 1068 1069 calc_laddr = 0; 1070 } else { 1071 error = in_pcbbind(inp, NULL, td); 1072 if (error) 1073 goto out; 1074 } 1075 } 1076 1077 if (calc_laddr) { 1078 /* 1079 * Calculate the correct protocol processing thread. The 1080 * connect operation must run there. Set the forwarding 1081 * port before we forward the message or it will get bounced 1082 * right back to us. 1083 */ 1084 error = in_pcbladdr(inp, nam, &if_sin, td); 1085 if (error) 1086 goto out; 1087 } 1088 KKASSERT(inp->inp_socket == so); 1089 1090 port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port, 1091 (inp->inp_laddr.s_addr ? 1092 inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr), 1093 inp->inp_lport); 1094 1095 if (port != &curthread->td_msgport) { 1096 struct route *ro = &inp->inp_route; 1097 1098 /* 1099 * in_pcbladdr() may have allocated a route entry for us 1100 * on the current CPU, but we need a route entry on the 1101 * inpcb's owner CPU, so free it here. 1102 */ 1103 if (ro->ro_rt != NULL) 1104 RTFREE(ro->ro_rt); 1105 bzero(ro, sizeof(*ro)); 1106 1107 /* 1108 * We are moving the protocol processing port the socket 1109 * is on, we have to unlink here and re-link on the 1110 * target cpu. 1111 */ 1112 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1113 sosetport(so, port); 1114 msg->connect.nm_reconnect |= NMSG_RECONNECT_RECONNECT; 1115 msg->connect.base.nm_dispatch = tcp_connect; 1116 1117 lwkt_forwardmsg(port, &msg->connect.base.lmsg); 1118 /* msg invalid now */ 1119 return; 1120 } 1121 error = tcp_connect_oncpu(tp, msg->connect.nm_flags, 1122 msg->connect.nm_m, sin, if_sin); 1123 msg->connect.nm_m = NULL; 1124 out: 1125 if (msg->connect.nm_m) { 1126 m_freem(msg->connect.nm_m); 1127 msg->connect.nm_m = NULL; 1128 } 1129 if (msg->connect.nm_reconnect & NMSG_RECONNECT_NAMALLOC) { 1130 kfree(msg->connect.nm_nam, M_LWKTMSG); 1131 msg->connect.nm_nam = NULL; 1132 } 1133 lwkt_replymsg(&msg->connect.base.lmsg, error); 1134 /* msg invalid now */ 1135 } 1136 1137 #ifdef INET6 1138 1139 static void 1140 tcp6_connect(netmsg_t msg) 1141 { 1142 struct tcpcb *tp; 1143 struct socket *so = msg->connect.base.nm_so; 1144 struct sockaddr *nam = msg->connect.nm_nam; 1145 struct thread *td = msg->connect.nm_td; 1146 struct inpcb *inp; 1147 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 1148 struct in6_addr *addr6; 1149 lwkt_port_t port; 1150 int error; 1151 1152 COMMON_START(so, inp, 0); 1153 1154 /* 1155 * Reconnect our pcb if we have to 1156 */ 1157 if (msg->connect.nm_reconnect & NMSG_RECONNECT_RECONNECT) { 1158 msg->connect.nm_reconnect &= ~NMSG_RECONNECT_RECONNECT; 1159 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1160 } 1161 1162 /* 1163 * Bind if we have to 1164 */ 1165 if (inp->inp_lport == 0) { 1166 error = in6_pcbbind(inp, NULL, td); 1167 if (error) 1168 goto out; 1169 } 1170 1171 /* 1172 * Cannot simply call in_pcbconnect, because there might be an 1173 * earlier incarnation of this same connection still in 1174 * TIME_WAIT state, creating an ADDRINUSE error. 1175 */ 1176 error = in6_pcbladdr(inp, nam, &addr6, td); 1177 if (error) 1178 goto out; 1179 1180 port = tcp6_addrport(); /* XXX hack for now, always cpu0 */ 1181 1182 if (port != &curthread->td_msgport) { 1183 struct route *ro = &inp->inp_route; 1184 1185 /* 1186 * in_pcbladdr() may have allocated a route entry for us 1187 * on the current CPU, but we need a route entry on the 1188 * inpcb's owner CPU, so free it here. 1189 */ 1190 if (ro->ro_rt != NULL) 1191 RTFREE(ro->ro_rt); 1192 bzero(ro, sizeof(*ro)); 1193 1194 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1195 sosetport(so, port); 1196 msg->connect.nm_reconnect |= NMSG_RECONNECT_RECONNECT; 1197 msg->connect.base.nm_dispatch = tcp6_connect; 1198 1199 lwkt_forwardmsg(port, &msg->connect.base.lmsg); 1200 /* msg invalid now */ 1201 return; 1202 } 1203 error = tcp6_connect_oncpu(tp, msg->connect.nm_flags, 1204 &msg->connect.nm_m, sin6, addr6); 1205 /* nm_m may still be intact */ 1206 out: 1207 if (error && (msg->connect.nm_reconnect & NMSG_RECONNECT_FALLBACK)) { 1208 tcp_connect(msg); 1209 /* msg invalid now */ 1210 } else { 1211 if (msg->connect.nm_m) { 1212 m_freem(msg->connect.nm_m); 1213 msg->connect.nm_m = NULL; 1214 } 1215 if (msg->connect.nm_reconnect & NMSG_RECONNECT_NAMALLOC) { 1216 kfree(msg->connect.nm_nam, M_LWKTMSG); 1217 msg->connect.nm_nam = NULL; 1218 } 1219 lwkt_replymsg(&msg->connect.base.lmsg, error); 1220 /* msg invalid now */ 1221 } 1222 } 1223 1224 static int 1225 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp, 1226 struct sockaddr_in6 *sin6, struct in6_addr *addr6) 1227 { 1228 struct mbuf *m = *mp; 1229 struct inpcb *inp = tp->t_inpcb; 1230 struct socket *so = inp->inp_socket; 1231 struct inpcb *oinp; 1232 1233 /* 1234 * Cannot simply call in_pcbconnect, because there might be an 1235 * earlier incarnation of this same connection still in 1236 * TIME_WAIT state, creating an ADDRINUSE error. 1237 */ 1238 oinp = in6_pcblookup_hash(inp->inp_cpcbinfo, 1239 &sin6->sin6_addr, sin6->sin6_port, 1240 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 1241 addr6 : &inp->in6p_laddr), 1242 inp->inp_lport, 0, NULL); 1243 if (oinp) 1244 return (EADDRINUSE); 1245 1246 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1247 inp->in6p_laddr = *addr6; 1248 inp->in6p_faddr = sin6->sin6_addr; 1249 inp->inp_fport = sin6->sin6_port; 1250 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 1251 inp->in6p_flowinfo = sin6->sin6_flowinfo; 1252 in_pcbinsconnhash(inp); 1253 1254 /* 1255 * Now that no more errors can occur, change the protocol processing 1256 * port to the current thread (which is the correct thread). 1257 * 1258 * Create TCP timer message now; we are on the tcpcb's owner 1259 * CPU/thread. 1260 */ 1261 tcp_create_timermsg(tp, &curthread->td_msgport); 1262 1263 /* Compute window scaling to request. */ 1264 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1265 tp->request_r_scale = TCP_MIN_WINSHIFT; 1266 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1267 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) { 1268 tp->request_r_scale++; 1269 } 1270 1271 soisconnecting(so); 1272 tcpstat.tcps_connattempt++; 1273 tp->t_state = TCPS_SYN_SENT; 1274 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1275 tp->iss = tcp_new_isn(tp); 1276 tcp_sendseqinit(tp); 1277 if (m) { 1278 ssb_appendstream(&so->so_snd, m); 1279 *mp = NULL; 1280 if (flags & PRUS_OOB) 1281 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1282 } 1283 1284 /* 1285 * Close the send side of the connection after 1286 * the data is sent if flagged. 1287 */ 1288 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1289 socantsendmore(so); 1290 tp = tcp_usrclosed(tp); 1291 } 1292 return (tcp_output(tp)); 1293 } 1294 1295 #endif /* INET6 */ 1296 1297 /* 1298 * The new sockopt interface makes it possible for us to block in the 1299 * copyin/out step (if we take a page fault). Taking a page fault while 1300 * in a critical section is probably a Bad Thing. (Since sockets and pcbs 1301 * both now use TSM, there probably isn't any need for this function to 1302 * run in a critical section any more. This needs more examination.) 1303 */ 1304 void 1305 tcp_ctloutput(netmsg_t msg) 1306 { 1307 struct socket *so = msg->base.nm_so; 1308 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1309 int error, opt, optval, opthz; 1310 struct inpcb *inp; 1311 struct tcpcb *tp; 1312 1313 error = 0; 1314 inp = so->so_pcb; 1315 if (inp == NULL) { 1316 error = ECONNRESET; 1317 goto done; 1318 } 1319 1320 if (sopt->sopt_level != IPPROTO_TCP) { 1321 #ifdef INET6 1322 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1323 ip6_ctloutput_dispatch(msg); 1324 else 1325 #endif /* INET6 */ 1326 ip_ctloutput(msg); 1327 /* msg invalid now */ 1328 return; 1329 } 1330 tp = intotcpcb(inp); 1331 1332 switch (sopt->sopt_dir) { 1333 case SOPT_SET: 1334 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1335 sizeof optval); 1336 if (error) 1337 break; 1338 switch (sopt->sopt_name) { 1339 case TCP_FASTKEEP: 1340 if (optval > 0) 1341 tp->t_keepidle = tp->t_keepintvl; 1342 else 1343 tp->t_keepidle = tcp_keepidle; 1344 tcp_timer_keep_activity(tp, 0); 1345 break; 1346 #ifdef TCP_SIGNATURE 1347 case TCP_SIGNATURE_ENABLE: 1348 if (tp->t_state == TCPS_CLOSED) { 1349 /* 1350 * This is the only safe state that this 1351 * option could be changed. Some segments 1352 * could already have been sent in other 1353 * states. 1354 */ 1355 if (optval > 0) 1356 tp->t_flags |= TF_SIGNATURE; 1357 else 1358 tp->t_flags &= ~TF_SIGNATURE; 1359 } else { 1360 error = EOPNOTSUPP; 1361 } 1362 break; 1363 #endif /* TCP_SIGNATURE */ 1364 case TCP_NODELAY: 1365 case TCP_NOOPT: 1366 switch (sopt->sopt_name) { 1367 case TCP_NODELAY: 1368 opt = TF_NODELAY; 1369 break; 1370 case TCP_NOOPT: 1371 opt = TF_NOOPT; 1372 break; 1373 default: 1374 opt = 0; /* dead code to fool gcc */ 1375 break; 1376 } 1377 1378 if (optval) 1379 tp->t_flags |= opt; 1380 else 1381 tp->t_flags &= ~opt; 1382 break; 1383 1384 case TCP_NOPUSH: 1385 if (tcp_disable_nopush) 1386 break; 1387 if (optval) 1388 tp->t_flags |= TF_NOPUSH; 1389 else { 1390 tp->t_flags &= ~TF_NOPUSH; 1391 error = tcp_output(tp); 1392 } 1393 break; 1394 1395 case TCP_MAXSEG: 1396 /* 1397 * Must be between 0 and maxseg. If the requested 1398 * maxseg is too small to satisfy the desired minmss, 1399 * pump it up (silently so sysctl modifications of 1400 * minmss do not create unexpected program failures). 1401 * Handle degenerate cases. 1402 */ 1403 if (optval > 0 && optval <= tp->t_maxseg) { 1404 if (optval + 40 < tcp_minmss) { 1405 optval = tcp_minmss - 40; 1406 if (optval < 0) 1407 optval = 1; 1408 } 1409 tp->t_maxseg = optval; 1410 } else { 1411 error = EINVAL; 1412 } 1413 break; 1414 1415 case TCP_KEEPINIT: 1416 opthz = ((int64_t)optval * hz) / 1000; 1417 if (opthz >= 1) 1418 tp->t_keepinit = opthz; 1419 else 1420 error = EINVAL; 1421 break; 1422 1423 case TCP_KEEPIDLE: 1424 opthz = ((int64_t)optval * hz) / 1000; 1425 if (opthz >= 1) { 1426 tp->t_keepidle = opthz; 1427 tcp_timer_keep_activity(tp, 0); 1428 } else { 1429 error = EINVAL; 1430 } 1431 break; 1432 1433 case TCP_KEEPINTVL: 1434 opthz = ((int64_t)optval * hz) / 1000; 1435 if (opthz >= 1) { 1436 tp->t_keepintvl = opthz; 1437 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1438 } else { 1439 error = EINVAL; 1440 } 1441 break; 1442 1443 case TCP_KEEPCNT: 1444 if (optval > 0) { 1445 tp->t_keepcnt = optval; 1446 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1447 } else { 1448 error = EINVAL; 1449 } 1450 break; 1451 1452 default: 1453 error = ENOPROTOOPT; 1454 break; 1455 } 1456 break; 1457 1458 case SOPT_GET: 1459 switch (sopt->sopt_name) { 1460 #ifdef TCP_SIGNATURE 1461 case TCP_SIGNATURE_ENABLE: 1462 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1463 break; 1464 #endif /* TCP_SIGNATURE */ 1465 case TCP_NODELAY: 1466 optval = tp->t_flags & TF_NODELAY; 1467 break; 1468 case TCP_MAXSEG: 1469 optval = tp->t_maxseg; 1470 break; 1471 case TCP_NOOPT: 1472 optval = tp->t_flags & TF_NOOPT; 1473 break; 1474 case TCP_NOPUSH: 1475 optval = tp->t_flags & TF_NOPUSH; 1476 break; 1477 case TCP_KEEPINIT: 1478 optval = ((int64_t)tp->t_keepinit * 1000) / hz; 1479 break; 1480 case TCP_KEEPIDLE: 1481 optval = ((int64_t)tp->t_keepidle * 1000) / hz; 1482 break; 1483 case TCP_KEEPINTVL: 1484 optval = ((int64_t)tp->t_keepintvl * 1000) / hz; 1485 break; 1486 case TCP_KEEPCNT: 1487 optval = tp->t_keepcnt; 1488 break; 1489 default: 1490 error = ENOPROTOOPT; 1491 break; 1492 } 1493 if (error == 0) 1494 soopt_from_kbuf(sopt, &optval, sizeof optval); 1495 break; 1496 } 1497 done: 1498 lwkt_replymsg(&msg->lmsg, error); 1499 } 1500 1501 /* 1502 * tcp_sendspace and tcp_recvspace are the default send and receive window 1503 * sizes, respectively. These are obsolescent (this information should 1504 * be set by the route). 1505 * 1506 * Use a default that does not require tcp window scaling to be turned 1507 * on. Individual programs or the administrator can increase the default. 1508 */ 1509 u_long tcp_sendspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1510 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1511 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1512 u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1513 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1514 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1515 1516 /* 1517 * Attach TCP protocol to socket, allocating internet protocol control 1518 * block, tcp control block, bufer space, and entering LISTEN state 1519 * if to accept connections. 1520 */ 1521 static int 1522 tcp_attach(struct socket *so, struct pru_attach_info *ai) 1523 { 1524 struct tcpcb *tp; 1525 struct inpcb *inp; 1526 int error; 1527 int cpu; 1528 #ifdef INET6 1529 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; 1530 #endif 1531 1532 if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) { 1533 lwkt_gettoken(&so->so_rcv.ssb_token); 1534 error = soreserve(so, tcp_sendspace, tcp_recvspace, 1535 ai->sb_rlimit); 1536 lwkt_reltoken(&so->so_rcv.ssb_token); 1537 if (error) 1538 return (error); 1539 } 1540 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE); 1541 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); 1542 cpu = mycpu->gd_cpuid; 1543 1544 /* 1545 * Set the default port for protocol processing. This will likely 1546 * change when we connect. 1547 */ 1548 error = in_pcballoc(so, &tcbinfo[cpu]); 1549 if (error) 1550 return (error); 1551 inp = so->so_pcb; 1552 #ifdef INET6 1553 if (isipv6) { 1554 inp->inp_vflag |= INP_IPV6; 1555 inp->in6p_hops = -1; /* use kernel default */ 1556 } 1557 else 1558 #endif 1559 inp->inp_vflag |= INP_IPV4; 1560 tp = tcp_newtcpcb(inp); 1561 if (tp == NULL) { 1562 /* 1563 * Make sure the socket is destroyed by the pcbdetach. 1564 */ 1565 soreference(so); 1566 #ifdef INET6 1567 if (isipv6) 1568 in6_pcbdetach(inp); 1569 else 1570 #endif 1571 in_pcbdetach(inp); 1572 sofree(so); /* from ref above */ 1573 return (ENOBUFS); 1574 } 1575 tp->t_state = TCPS_CLOSED; 1576 return (0); 1577 } 1578 1579 /* 1580 * Initiate (or continue) disconnect. 1581 * If embryonic state, just send reset (once). 1582 * If in ``let data drain'' option and linger null, just drop. 1583 * Otherwise (hard), mark socket disconnecting and drop 1584 * current input data; switch states based on user close, and 1585 * send segment to peer (with FIN). 1586 */ 1587 static struct tcpcb * 1588 tcp_disconnect(struct tcpcb *tp) 1589 { 1590 struct socket *so = tp->t_inpcb->inp_socket; 1591 1592 if (tp->t_state < TCPS_ESTABLISHED) { 1593 tp = tcp_close(tp); 1594 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1595 tp = tcp_drop(tp, 0); 1596 } else { 1597 lwkt_gettoken(&so->so_rcv.ssb_token); 1598 soisdisconnecting(so); 1599 sbflush(&so->so_rcv.sb); 1600 tp = tcp_usrclosed(tp); 1601 if (tp) 1602 tcp_output(tp); 1603 lwkt_reltoken(&so->so_rcv.ssb_token); 1604 } 1605 return (tp); 1606 } 1607 1608 /* 1609 * User issued close, and wish to trail through shutdown states: 1610 * if never received SYN, just forget it. If got a SYN from peer, 1611 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1612 * If already got a FIN from peer, then almost done; go to LAST_ACK 1613 * state. In all other cases, have already sent FIN to peer (e.g. 1614 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1615 * for peer to send FIN or not respond to keep-alives, etc. 1616 * We can let the user exit from the close as soon as the FIN is acked. 1617 */ 1618 static struct tcpcb * 1619 tcp_usrclosed(struct tcpcb *tp) 1620 { 1621 1622 switch (tp->t_state) { 1623 1624 case TCPS_CLOSED: 1625 case TCPS_LISTEN: 1626 tp->t_state = TCPS_CLOSED; 1627 tp = tcp_close(tp); 1628 break; 1629 1630 case TCPS_SYN_SENT: 1631 case TCPS_SYN_RECEIVED: 1632 tp->t_flags |= TF_NEEDFIN; 1633 break; 1634 1635 case TCPS_ESTABLISHED: 1636 tp->t_state = TCPS_FIN_WAIT_1; 1637 break; 1638 1639 case TCPS_CLOSE_WAIT: 1640 tp->t_state = TCPS_LAST_ACK; 1641 break; 1642 } 1643 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1644 soisdisconnected(tp->t_inpcb->inp_socket); 1645 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1646 if (tp->t_state == TCPS_FIN_WAIT_2) { 1647 tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle, 1648 tcp_timer_2msl); 1649 } 1650 } 1651 return (tp); 1652 } 1653