1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1993 36 * The Regents of the University of California. All rights reserved. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 
61 * 62 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 63 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $ 64 */ 65 66 #include "opt_ipsec.h" 67 #include "opt_inet.h" 68 #include "opt_inet6.h" 69 #include "opt_tcpdebug.h" 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/kernel.h> 74 #include <sys/malloc.h> 75 #include <sys/sysctl.h> 76 #include <sys/globaldata.h> 77 #include <sys/thread.h> 78 79 #include <sys/mbuf.h> 80 #ifdef INET6 81 #include <sys/domain.h> 82 #endif /* INET6 */ 83 #include <sys/socket.h> 84 #include <sys/socketvar.h> 85 #include <sys/socketops.h> 86 #include <sys/protosw.h> 87 88 #include <sys/thread2.h> 89 #include <sys/msgport2.h> 90 #include <sys/socketvar2.h> 91 92 #include <net/if.h> 93 #include <net/netisr.h> 94 #include <net/route.h> 95 96 #include <net/netmsg2.h> 97 #include <net/netisr2.h> 98 99 #include <netinet/in.h> 100 #include <netinet/in_systm.h> 101 #ifdef INET6 102 #include <netinet/ip6.h> 103 #endif 104 #include <netinet/in_pcb.h> 105 #ifdef INET6 106 #include <netinet6/in6_pcb.h> 107 #endif 108 #include <netinet/in_var.h> 109 #include <netinet/ip_var.h> 110 #ifdef INET6 111 #include <netinet6/ip6_var.h> 112 #include <netinet6/tcp6_var.h> 113 #endif 114 #include <netinet/tcp.h> 115 #include <netinet/tcp_fsm.h> 116 #include <netinet/tcp_seq.h> 117 #include <netinet/tcp_timer.h> 118 #include <netinet/tcp_timer2.h> 119 #include <netinet/tcp_var.h> 120 #include <netinet/tcpip.h> 121 #ifdef TCPDEBUG 122 #include <netinet/tcp_debug.h> 123 #endif 124 125 #ifdef IPSEC 126 #include <netinet6/ipsec.h> 127 #endif /*IPSEC*/ 128 129 /* 130 * TCP protocol interface to socket abstraction. 131 */ 132 extern char *tcpstates[]; /* XXX ??? 
*/

/*
 * Forward declarations for routines that are referenced before their
 * definition (or are defined outside the portion of the file that
 * precedes their first use).
 */
static int	tcp_attach (struct socket *, struct pru_attach_info *);
static void	tcp_connect (netmsg_t msg);
#ifdef INET6
static void	tcp6_connect (netmsg_t msg);
static int	tcp6_connect_oncpu(struct tcpcb *tp, int flags,
				struct mbuf **mp,
				struct sockaddr_in6 *sin6,
				struct in6_addr *addr6);
#endif /* INET6 */
static struct tcpcb *
		tcp_disconnect (struct tcpcb *);
static struct tcpcb *
		tcp_usrclosed (struct tcpcb *);

/*
 * User-request tracing helpers.  All three expand to nothing unless
 * TCPDEBUG is defined.
 *
 * TCPDEBUG0	 declares the local `ostate'.
 * TCPDEBUG1()	 records tp->t_state into `ostate' (0 when tp is NULL).
 * TCPDEBUG2(req) calls tcp_trace() for request `req' when tp is non-NULL
 *		 and the socket has SO_DEBUG set.
 *
 * The macros rely on locals named `tp' and `so' being in scope in the
 * function that uses them.
 */
#ifdef TCPDEBUG
#define	TCPDEBUG0	int ostate = 0
#define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
#define	TCPDEBUG2(req)	if (tp && (so->so_options & SO_DEBUG)) \
				tcp_trace(TA_USER, ostate, tp, 0, 0, req)
#else
#define	TCPDEBUG0
#define	TCPDEBUG1()
#define	TCPDEBUG2(req)
#endif

/*
 * net.inet.tcp.lportext: when non-zero, tcp_connect() handles an unbound
 * local port by choosing the local address with in_pcbladdr() and then
 * binding with in_pcbbind_remote(); when zero it uses in_pcbbind().
 */
static int	tcp_lport_extension = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, lportext, CTLFLAG_RW,
    &tcp_lport_extension, 0, "");

/*
 * Some poorly optimized programs use TCP_NOPUSH in an attempt to
 * improve performance and end up leaving a small amount of data
 * sitting in the send buffer.  That data will _not_ be pushed onto
 * the network until more data is written to the socket or the write
 * side of the socket is shut down.
 *
 * net.inet.tcp.disable_nopush: per its description string, a non-zero
 * value makes the TCP_NOPUSH socket option have no effect.
 */
static int	tcp_disable_nopush = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW,
    &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect");

/*
 * Allocate socket buffer space.
176 */ 177 static int 178 tcp_usr_preattach(struct socket *so, int proto __unused, 179 struct pru_attach_info *ai) 180 { 181 int error; 182 183 if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) { 184 error = soreserve(so, tcp_sendspace, tcp_recvspace, 185 ai->sb_rlimit); 186 if (error) 187 return (error); 188 } 189 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC); 190 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC); 191 192 return 0; 193 } 194 195 /* 196 * TCP attaches to socket via pru_attach(), reserving space, 197 * and an internet control block. This socket may move to 198 * other CPU later when we bind/connect. 199 */ 200 static void 201 tcp_usr_attach(netmsg_t msg) 202 { 203 struct socket *so = msg->base.nm_so; 204 struct pru_attach_info *ai = msg->attach.nm_ai; 205 int error; 206 struct inpcb *inp; 207 struct tcpcb *tp = NULL; 208 TCPDEBUG0; 209 210 inp = so->so_pcb; 211 KASSERT(inp == NULL, ("tcp socket attached")); 212 TCPDEBUG1(); 213 214 error = tcp_attach(so, ai); 215 if (error) 216 goto out; 217 218 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 219 so->so_linger = TCP_LINGERTIME; 220 tp = sototcpcb(so); 221 out: 222 TCPDEBUG2(PRU_ATTACH); 223 lwkt_replymsg(&msg->lmsg, error); 224 } 225 226 /* 227 * pru_detach() detaches the TCP protocol from the socket. 228 * If the protocol state is non-embryonic, then can't 229 * do this directly: have to initiate a pru_disconnect(), 230 * which may finish later; embryonic TCB's can just 231 * be discarded here. 232 */ 233 static void 234 tcp_usr_detach(netmsg_t msg) 235 { 236 struct socket *so = msg->base.nm_so; 237 int error = 0; 238 struct inpcb *inp; 239 struct tcpcb *tp; 240 TCPDEBUG0; 241 242 inp = so->so_pcb; 243 244 /* 245 * If the inp is already detached or never attached, it may have 246 * been due to an async close or async attach failure. Just return 247 * as if no error occured. 
248 */ 249 if (inp) { 250 tp = intotcpcb(inp); 251 KASSERT(tp != NULL, ("tcp_usr_detach: tp is NULL")); 252 TCPDEBUG1(); 253 tp = tcp_disconnect(tp); 254 TCPDEBUG2(PRU_DETACH); 255 } 256 lwkt_replymsg(&msg->lmsg, error); 257 } 258 259 /* 260 * NOTE: ignore_error is non-zero for certain disconnection races 261 * which we want to silently allow, otherwise close() may return 262 * an unexpected error. 263 * 264 * NOTE: The variables (msg) and (tp) are assumed. 265 */ 266 #define COMMON_START(so, inp, ignore_error) \ 267 TCPDEBUG0; \ 268 \ 269 inp = so->so_pcb; \ 270 do { \ 271 if (inp == NULL) { \ 272 error = ignore_error ? 0 : EINVAL; \ 273 tp = NULL; \ 274 goto out; \ 275 } \ 276 tp = intotcpcb(inp); \ 277 TCPDEBUG1(); \ 278 } while(0) 279 280 #define COMMON_END1(req, noreply) \ 281 out: do { \ 282 TCPDEBUG2(req); \ 283 if (!(noreply)) \ 284 lwkt_replymsg(&msg->lmsg, error); \ 285 return; \ 286 } while(0) 287 288 #define COMMON_END(req) COMMON_END1((req), 0) 289 290 /* 291 * Give the socket an address. 292 */ 293 static void 294 tcp_usr_bind(netmsg_t msg) 295 { 296 struct socket *so = msg->bind.base.nm_so; 297 struct sockaddr *nam = msg->bind.nm_nam; 298 struct thread *td = msg->bind.nm_td; 299 int error = 0; 300 struct inpcb *inp; 301 struct tcpcb *tp; 302 struct sockaddr_in *sinp; 303 304 COMMON_START(so, inp, 0); 305 306 /* 307 * Must check for multicast addresses and disallow binding 308 * to them. 
309 */ 310 sinp = (struct sockaddr_in *)nam; 311 if (sinp->sin_family == AF_INET && 312 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 313 error = EAFNOSUPPORT; 314 goto out; 315 } 316 error = in_pcbbind(inp, nam, td); 317 if (error) 318 goto out; 319 320 COMMON_END(PRU_BIND); 321 } 322 323 #ifdef INET6 324 325 static void 326 tcp6_usr_bind(netmsg_t msg) 327 { 328 struct socket *so = msg->bind.base.nm_so; 329 struct sockaddr *nam = msg->bind.nm_nam; 330 struct thread *td = msg->bind.nm_td; 331 int error = 0; 332 struct inpcb *inp; 333 struct tcpcb *tp; 334 struct sockaddr_in6 *sin6p; 335 336 COMMON_START(so, inp, 0); 337 338 /* 339 * Must check for multicast addresses and disallow binding 340 * to them. 341 */ 342 sin6p = (struct sockaddr_in6 *)nam; 343 if (sin6p->sin6_family == AF_INET6 && 344 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 345 error = EAFNOSUPPORT; 346 goto out; 347 } 348 error = in6_pcbbind(inp, nam, td); 349 if (error) 350 goto out; 351 COMMON_END(PRU_BIND); 352 } 353 #endif /* INET6 */ 354 355 struct netmsg_inswildcard { 356 struct netmsg_base base; 357 struct inpcb *nm_inp; 358 }; 359 360 static void 361 in_pcbinswildcardhash_handler(netmsg_t msg) 362 { 363 struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg; 364 int cpu = mycpuid, nextcpu; 365 366 in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]); 367 368 nextcpu = cpu + 1; 369 if (nextcpu < ncpus2) 370 lwkt_forwardmsg(netisr_cpuport(nextcpu), &nm->base.lmsg); 371 else 372 lwkt_replymsg(&nm->base.lmsg, 0); 373 } 374 375 static void 376 tcp_sosetport(struct lwkt_msg *msg, lwkt_port_t port) 377 { 378 sosetport(((struct netmsg_base *)msg)->nm_so, port); 379 } 380 381 /* 382 * Prepare to accept connections. 
 */
/*
 * pru_listen handler (IPv4).
 *
 * The listen is carried out on the netisr cpu0 thread.  When invoked on
 * any other thread, the inpcb is unlinked from the current cpu's
 * tcbinfo, PRUL_RELINK is recorded in the message and the message is
 * forwarded to netisr_cpuport(0); the handler then runs again there and
 * relinks the inpcb.
 *
 * Once on cpu0: a socket that is already listening is left alone, an
 * unbound socket is bound with in_pcbbind(), the tcpcb is switched to
 * TCPS_LISTEN/TF_LISTEN and the inpcb is entered into the wildcard
 * hash (on the other cpus first when ncpus2 > 1, then locally).
 *
 * The outcome is delivered through the message reply.
 */
static void
tcp_usr_listen(netmsg_t msg)
{
	struct socket *so = msg->listen.base.nm_so;
	struct thread *td = msg->listen.nm_td;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct netmsg_inswildcard nm;
	lwkt_port_t port0 = netisr_cpuport(0);

	COMMON_START(so, inp, 0);

	if (&curthread->td_msgport != port0) {
		lwkt_msg_t lmsg = &msg->listen.base.lmsg;

		/* A message must not be asked to relink twice. */
		KASSERT((msg->listen.nm_flags & PRUL_RELINK) == 0,
		    ("already asked to relink"));

		/* Leave this cpu's pcbinfo; cpu0 relinks on arrival. */
		in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]);
		msg->listen.nm_flags |= PRUL_RELINK;

		/* See the related comment in tcp_connect() */
		lwkt_setmsg_receipt(lmsg, tcp_sosetport);
		lwkt_forwardmsg(port0, lmsg);
		/* msg invalid now */
		return;
	}
	KASSERT(so->so_port == port0, ("so_port is not netisr0"));

	/* Second pass after a forward: link into cpu0's pcbinfo. */
	if (msg->listen.nm_flags & PRUL_RELINK) {
		msg->listen.nm_flags &= ~PRUL_RELINK;
		in_pcblink(so->so_pcb, &tcbinfo[mycpuid]);
	}
	KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0"));

	/* Already listening: nothing to do, reply with error == 0. */
	if (tp->t_flags & TF_LISTEN)
		goto out;

	/* No local port yet: let in_pcbbind() pick one. */
	if (inp->inp_lport == 0) {
		error = in_pcbbind(inp, NULL, td);
		if (error)
			goto out;
	}

	tp->t_state = TCPS_LISTEN;
	tp->t_flags |= TF_LISTEN;
	tp->tt_msg = NULL; /* Catch any invalid timer usage */

	if (ncpus2 > 1) {
		/*
		 * Put this inpcb into wildcard hash on other cpus.
		 * The message starts at cpu1 and the handler passes it
		 * from cpu to cpu before replying.
		 */
		ASSERT_INP_NOTINHASH(inp);
		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
			    MSGF_PRIORITY, in_pcbinswildcardhash_handler);
		nm.nm_inp = inp;
		lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0);
	}
	/* Finally enter the inpcb into the wildcard hash locally. */
	in_pcbinswildcardhash(inp);
	COMMON_END(PRU_LISTEN);
}

#ifdef INET6

/*
 * pru_listen handler (IPv6).
 *
 * Same state transition and wildcard-hash insertion as
 * tcp_usr_listen(), but binding goes through in6_pcbbind() and there is
 * no forwarding step: when ncpus2 > 1 the handler asserts that it is
 * already running in netisr 0 with the inpcb on tcbinfo[0].
 */
static void
tcp6_usr_listen(netmsg_t msg)
{
	struct socket *so = msg->listen.base.nm_so;
	struct thread *td = msg->listen.nm_td;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct netmsg_inswildcard nm;

	COMMON_START(so, inp, 0);

	/* Already listening: nothing to do, reply with error == 0. */
	if (tp->t_flags & TF_LISTEN)
		goto out;

	/* No local port yet: let in6_pcbbind() pick one. */
	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, NULL, td);
		if (error)
			goto out;
	}

	tp->t_state = TCPS_LISTEN;
	tp->t_flags |= TF_LISTEN;
	tp->tt_msg = NULL; /* Catch any invalid timer usage */

	if (ncpus2 > 1) {
		/*
		 * Put this inpcb into wildcard hash on other cpus.
		 */
		KKASSERT(so->so_port == netisr_cpuport(0));
		ASSERT_IN_NETISR(0);
		KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]);
		ASSERT_INP_NOTINHASH(inp);

		netmsg_init(&nm.base, NULL, &curthread->td_msgport,
			    MSGF_PRIORITY, in_pcbinswildcardhash_handler);
		nm.nm_inp = inp;
		lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0);
	}
	/* Finally enter the inpcb into the wildcard hash locally. */
	in_pcbinswildcardhash(inp);
	COMMON_END(PRU_LISTEN);
}
#endif /* INET6 */

/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
499 */ 500 static void 501 tcp_usr_connect(netmsg_t msg) 502 { 503 struct socket *so = msg->connect.base.nm_so; 504 struct sockaddr *nam = msg->connect.nm_nam; 505 struct thread *td = msg->connect.nm_td; 506 int error = 0; 507 struct inpcb *inp; 508 struct tcpcb *tp; 509 struct sockaddr_in *sinp; 510 511 COMMON_START(so, inp, 0); 512 513 /* 514 * Must disallow TCP ``connections'' to multicast addresses. 515 */ 516 sinp = (struct sockaddr_in *)nam; 517 if (sinp->sin_family == AF_INET 518 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 519 error = EAFNOSUPPORT; 520 goto out; 521 } 522 523 if (!prison_remote_ip(td, (struct sockaddr*)sinp)) { 524 error = EAFNOSUPPORT; /* IPv6 only jail */ 525 goto out; 526 } 527 528 tcp_connect(msg); 529 /* msg is invalid now */ 530 return; 531 out: 532 if (msg->connect.nm_m) { 533 m_freem(msg->connect.nm_m); 534 msg->connect.nm_m = NULL; 535 } 536 if (msg->connect.nm_flags & PRUC_HELDTD) 537 lwkt_rele(td); 538 if (error && (msg->connect.nm_flags & PRUC_ASYNC)) { 539 so->so_error = error; 540 soisdisconnected(so); 541 } 542 lwkt_replymsg(&msg->lmsg, error); 543 } 544 545 #ifdef INET6 546 547 static void 548 tcp6_usr_connect(netmsg_t msg) 549 { 550 struct socket *so = msg->connect.base.nm_so; 551 struct sockaddr *nam = msg->connect.nm_nam; 552 struct thread *td = msg->connect.nm_td; 553 int error = 0; 554 struct inpcb *inp; 555 struct tcpcb *tp; 556 struct sockaddr_in6 *sin6p; 557 558 COMMON_START(so, inp, 0); 559 560 /* 561 * Must disallow TCP ``connections'' to multicast addresses. 
562 */ 563 sin6p = (struct sockaddr_in6 *)nam; 564 if (sin6p->sin6_family == AF_INET6 565 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 566 error = EAFNOSUPPORT; 567 goto out; 568 } 569 570 if (!prison_remote_ip(td, nam)) { 571 error = EAFNOSUPPORT; /* IPv4 only jail */ 572 goto out; 573 } 574 575 /* Reject v4-mapped address */ 576 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 577 error = EADDRNOTAVAIL; 578 goto out; 579 } 580 581 inp->inp_inc.inc_isipv6 = 1; 582 tcp6_connect(msg); 583 /* msg is invalid now */ 584 return; 585 out: 586 if (msg->connect.nm_m) { 587 m_freem(msg->connect.nm_m); 588 msg->connect.nm_m = NULL; 589 } 590 lwkt_replymsg(&msg->lmsg, error); 591 } 592 593 #endif /* INET6 */ 594 595 /* 596 * Initiate disconnect from peer. 597 * If connection never passed embryonic stage, just drop; 598 * else if don't need to let data drain, then can just drop anyways, 599 * else have to begin TCP shutdown process: mark socket disconnecting, 600 * drain unread data, state switch to reflect user close, and 601 * send segment (e.g. FIN) to peer. Socket will be really disconnected 602 * when peer sends FIN and acks ours. 603 * 604 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 605 */ 606 static void 607 tcp_usr_disconnect(netmsg_t msg) 608 { 609 struct socket *so = msg->disconnect.base.nm_so; 610 int error = 0; 611 struct inpcb *inp; 612 struct tcpcb *tp; 613 614 COMMON_START(so, inp, 1); 615 tp = tcp_disconnect(tp); 616 COMMON_END(PRU_DISCONNECT); 617 } 618 619 /* 620 * Accept a connection. Essentially all the work is 621 * done at higher levels; just return the address 622 * of the peer, storing through addr. 
623 */ 624 static void 625 tcp_usr_accept(netmsg_t msg) 626 { 627 struct socket *so = msg->accept.base.nm_so; 628 struct sockaddr **nam = msg->accept.nm_nam; 629 int error = 0; 630 struct inpcb *inp; 631 struct tcpcb *tp = NULL; 632 TCPDEBUG0; 633 634 inp = so->so_pcb; 635 if (so->so_state & SS_ISDISCONNECTED) { 636 error = ECONNABORTED; 637 goto out; 638 } 639 if (inp == NULL) { 640 error = EINVAL; 641 goto out; 642 } 643 644 tp = intotcpcb(inp); 645 TCPDEBUG1(); 646 in_setpeeraddr(so, nam); 647 COMMON_END(PRU_ACCEPT); 648 } 649 650 #ifdef INET6 651 static void 652 tcp6_usr_accept(netmsg_t msg) 653 { 654 struct socket *so = msg->accept.base.nm_so; 655 struct sockaddr **nam = msg->accept.nm_nam; 656 int error = 0; 657 struct inpcb *inp; 658 struct tcpcb *tp = NULL; 659 TCPDEBUG0; 660 661 inp = so->so_pcb; 662 663 if (so->so_state & SS_ISDISCONNECTED) { 664 error = ECONNABORTED; 665 goto out; 666 } 667 if (inp == NULL) { 668 error = EINVAL; 669 goto out; 670 } 671 tp = intotcpcb(inp); 672 TCPDEBUG1(); 673 in6_setpeeraddr(so, nam); 674 COMMON_END(PRU_ACCEPT); 675 } 676 #endif /* INET6 */ 677 678 /* 679 * Mark the connection as being incapable of further output. 680 */ 681 static void 682 tcp_usr_shutdown(netmsg_t msg) 683 { 684 struct socket *so = msg->shutdown.base.nm_so; 685 int error = 0; 686 struct inpcb *inp; 687 struct tcpcb *tp; 688 689 COMMON_START(so, inp, 0); 690 socantsendmore(so); 691 tp = tcp_usrclosed(tp); 692 if (tp) 693 error = tcp_output(tp); 694 COMMON_END(PRU_SHUTDOWN); 695 } 696 697 /* 698 * After a receive, possibly send window update to peer. 
699 */ 700 static void 701 tcp_usr_rcvd(netmsg_t msg) 702 { 703 struct socket *so = msg->rcvd.base.nm_so; 704 int error = 0, noreply = 0; 705 struct inpcb *inp; 706 struct tcpcb *tp; 707 708 COMMON_START(so, inp, 0); 709 710 if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) { 711 noreply = 1; 712 so_async_rcvd_reply(so); 713 } 714 tcp_output(tp); 715 716 COMMON_END1(PRU_RCVD, noreply); 717 } 718 719 /* 720 * Do a send by putting data in output queue and updating urgent 721 * marker if URG set. Possibly send more data. Unlike the other 722 * pru_*() routines, the mbuf chains are our responsibility. We 723 * must either enqueue them or free them. The other pru_* routines 724 * generally are caller-frees. 725 */ 726 static void 727 tcp_usr_send(netmsg_t msg) 728 { 729 struct socket *so = msg->send.base.nm_so; 730 int flags = msg->send.nm_flags; 731 struct mbuf *m = msg->send.nm_m; 732 int error = 0; 733 struct inpcb *inp; 734 struct tcpcb *tp; 735 TCPDEBUG0; 736 737 KKASSERT(msg->send.nm_control == NULL); 738 KKASSERT(msg->send.nm_addr == NULL); 739 KKASSERT((flags & PRUS_FREEADDR) == 0); 740 741 inp = so->so_pcb; 742 743 if (inp == NULL) { 744 /* 745 * OOPS! we lost a race, the TCP session got reset after 746 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a 747 * network interrupt in the non-critical section of sosend(). 748 */ 749 m_freem(m); 750 error = ECONNRESET; /* XXX EPIPE? */ 751 tp = NULL; 752 TCPDEBUG1(); 753 goto out; 754 } 755 tp = intotcpcb(inp); 756 TCPDEBUG1(); 757 758 #ifdef foo 759 /* 760 * This is no longer necessary, since: 761 * - sosendtcp() has already checked it for us 762 * - It does not work with asynchronized send 763 */ 764 765 /* 766 * Don't let too much OOB data build up 767 */ 768 if (flags & PRUS_OOB) { 769 if (ssb_space(&so->so_snd) < -512) { 770 m_freem(m); 771 error = ENOBUFS; 772 goto out; 773 } 774 } 775 #endif 776 777 /* 778 * Pump the data into the socket. 
779 */ 780 if (m) { 781 ssb_appendstream(&so->so_snd, m); 782 sowwakeup(so); 783 } 784 if (flags & PRUS_OOB) { 785 /* 786 * According to RFC961 (Assigned Protocols), 787 * the urgent pointer points to the last octet 788 * of urgent data. We continue, however, 789 * to consider it to indicate the first octet 790 * of data past the urgent section. 791 * Otherwise, snd_up should be one lower. 792 */ 793 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 794 tp->t_flags |= TF_FORCE; 795 error = tcp_output(tp); 796 tp->t_flags &= ~TF_FORCE; 797 } else { 798 if (flags & PRUS_EOF) { 799 /* 800 * Close the send side of the connection after 801 * the data is sent. 802 */ 803 socantsendmore(so); 804 tp = tcp_usrclosed(tp); 805 } 806 if (tp != NULL && !tcp_output_pending(tp)) { 807 if (flags & PRUS_MORETOCOME) 808 tp->t_flags |= TF_MORETOCOME; 809 error = tcp_output_fair(tp); 810 if (flags & PRUS_MORETOCOME) 811 tp->t_flags &= ~TF_MORETOCOME; 812 } 813 } 814 COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB : 815 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), 816 (flags & PRUS_NOREPLY)); 817 } 818 819 /* 820 * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort() 821 * will sofree() it when we return. 822 */ 823 static void 824 tcp_usr_abort(netmsg_t msg) 825 { 826 struct socket *so = msg->abort.base.nm_so; 827 int error = 0; 828 struct inpcb *inp; 829 struct tcpcb *tp; 830 831 COMMON_START(so, inp, 1); 832 tp = tcp_drop(tp, ECONNABORTED); 833 COMMON_END(PRU_ABORT); 834 } 835 836 /* 837 * Receive out-of-band data. 
838 */ 839 static void 840 tcp_usr_rcvoob(netmsg_t msg) 841 { 842 struct socket *so = msg->rcvoob.base.nm_so; 843 struct mbuf *m = msg->rcvoob.nm_m; 844 int flags = msg->rcvoob.nm_flags; 845 int error = 0; 846 struct inpcb *inp; 847 struct tcpcb *tp; 848 849 COMMON_START(so, inp, 0); 850 if ((so->so_oobmark == 0 && 851 (so->so_state & SS_RCVATMARK) == 0) || 852 so->so_options & SO_OOBINLINE || 853 tp->t_oobflags & TCPOOB_HADDATA) { 854 error = EINVAL; 855 goto out; 856 } 857 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 858 error = EWOULDBLOCK; 859 goto out; 860 } 861 m->m_len = 1; 862 *mtod(m, caddr_t) = tp->t_iobc; 863 if ((flags & MSG_PEEK) == 0) 864 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 865 COMMON_END(PRU_RCVOOB); 866 } 867 868 static void 869 tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 870 { 871 in_savefaddr(so, faddr); 872 } 873 874 #ifdef INET6 875 static void 876 tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 877 { 878 in6_savefaddr(so, faddr); 879 } 880 #endif 881 882 static int 883 tcp_usr_preconnect(struct socket *so, const struct sockaddr *nam, 884 struct thread *td __unused) 885 { 886 const struct sockaddr_in *sinp; 887 888 sinp = (const struct sockaddr_in *)nam; 889 if (sinp->sin_family == AF_INET && 890 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 891 return EAFNOSUPPORT; 892 893 soisconnecting(so); 894 return 0; 895 } 896 897 /* xxx - should be const */ 898 struct pr_usrreqs tcp_usrreqs = { 899 .pru_abort = tcp_usr_abort, 900 .pru_accept = tcp_usr_accept, 901 .pru_attach = tcp_usr_attach, 902 .pru_bind = tcp_usr_bind, 903 .pru_connect = tcp_usr_connect, 904 .pru_connect2 = pr_generic_notsupp, 905 .pru_control = in_control_dispatch, 906 .pru_detach = tcp_usr_detach, 907 .pru_disconnect = tcp_usr_disconnect, 908 .pru_listen = tcp_usr_listen, 909 .pru_peeraddr = in_setpeeraddr_dispatch, 910 .pru_rcvd = tcp_usr_rcvd, 911 .pru_rcvoob = tcp_usr_rcvoob, 912 .pru_send = tcp_usr_send, 913 .pru_sense 
= pru_sense_null, 914 .pru_shutdown = tcp_usr_shutdown, 915 .pru_sockaddr = in_setsockaddr_dispatch, 916 .pru_sosend = sosendtcp, 917 .pru_soreceive = sorecvtcp, 918 .pru_savefaddr = tcp_usr_savefaddr, 919 .pru_preconnect = tcp_usr_preconnect, 920 .pru_preattach = tcp_usr_preattach 921 }; 922 923 #ifdef INET6 924 struct pr_usrreqs tcp6_usrreqs = { 925 .pru_abort = tcp_usr_abort, 926 .pru_accept = tcp6_usr_accept, 927 .pru_attach = tcp_usr_attach, 928 .pru_bind = tcp6_usr_bind, 929 .pru_connect = tcp6_usr_connect, 930 .pru_connect2 = pr_generic_notsupp, 931 .pru_control = in6_control_dispatch, 932 .pru_detach = tcp_usr_detach, 933 .pru_disconnect = tcp_usr_disconnect, 934 .pru_listen = tcp6_usr_listen, 935 .pru_peeraddr = in6_setpeeraddr_dispatch, 936 .pru_rcvd = tcp_usr_rcvd, 937 .pru_rcvoob = tcp_usr_rcvoob, 938 .pru_send = tcp_usr_send, 939 .pru_sense = pru_sense_null, 940 .pru_shutdown = tcp_usr_shutdown, 941 .pru_sockaddr = in6_setsockaddr_dispatch, 942 .pru_sosend = sosendtcp, 943 .pru_soreceive = sorecvtcp, 944 .pru_savefaddr = tcp6_usr_savefaddr 945 }; 946 #endif /* INET6 */ 947 948 static int 949 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m, 950 struct sockaddr_in *sin, struct sockaddr_in *if_sin) 951 { 952 struct inpcb *inp = tp->t_inpcb, *oinp; 953 struct socket *so = inp->inp_socket; 954 struct route *ro = &inp->inp_route; 955 956 KASSERT(inp->inp_pcbinfo == &tcbinfo[mycpu->gd_cpuid], 957 ("pcbinfo mismatch")); 958 959 oinp = in_pcblookup_hash(inp->inp_pcbinfo, 960 sin->sin_addr, sin->sin_port, 961 (inp->inp_laddr.s_addr != INADDR_ANY ? 
962 inp->inp_laddr : if_sin->sin_addr), 963 inp->inp_lport, 0, NULL); 964 if (oinp != NULL) { 965 m_freem(m); 966 return (EADDRINUSE); 967 } 968 if (inp->inp_laddr.s_addr == INADDR_ANY) 969 inp->inp_laddr = if_sin->sin_addr; 970 inp->inp_faddr = sin->sin_addr; 971 inp->inp_fport = sin->sin_port; 972 in_pcbinsconnhash(inp); 973 974 /* 975 * We are now on the inpcb's owner CPU, if the cached route was 976 * freed because the rtentry's owner CPU is not the current CPU 977 * (e.g. in tcp_connect()), then we try to reallocate it here with 978 * the hope that a rtentry may be cloned from a RTF_PRCLONING 979 * rtentry. 980 */ 981 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 982 ro->ro_rt == NULL) { 983 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 984 ro->ro_dst.sa_family = AF_INET; 985 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 986 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = 987 sin->sin_addr; 988 rtalloc(ro); 989 } 990 991 /* 992 * Now that no more errors can occur, change the protocol processing 993 * port to the current thread (which is the correct thread). 994 * 995 * Create TCP timer message now; we are on the tcpcb's owner 996 * CPU/thread. 997 */ 998 tcp_create_timermsg(tp, &curthread->td_msgport); 999 1000 /* 1001 * Compute window scaling to request. Use a larger scaling then 1002 * needed for the initial receive buffer in case the receive buffer 1003 * gets expanded. 
1004 */ 1005 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1006 tp->request_r_scale = TCP_MIN_WINSHIFT; 1007 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1008 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat 1009 ) { 1010 tp->request_r_scale++; 1011 } 1012 1013 soisconnecting(so); 1014 tcpstat.tcps_connattempt++; 1015 tp->t_state = TCPS_SYN_SENT; 1016 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1017 tp->iss = tcp_new_isn(tp); 1018 tcp_sendseqinit(tp); 1019 if (m) { 1020 ssb_appendstream(&so->so_snd, m); 1021 m = NULL; 1022 if (flags & PRUS_OOB) 1023 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1024 } 1025 1026 /* 1027 * Close the send side of the connection after 1028 * the data is sent if flagged. 1029 */ 1030 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1031 socantsendmore(so); 1032 tp = tcp_usrclosed(tp); 1033 } 1034 return (tcp_output(tp)); 1035 } 1036 1037 /* 1038 * Common subroutine to open a TCP connection to remote host specified 1039 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1040 * port number if needed. Call in_pcbladdr to do the routing and to choose 1041 * a local host address (interface). 1042 * Initialize connection parameters and enter SYN-SENT state. 
 */
/*
 * Message handler for an IPv4 connect; may run more than once for the
 * same message.
 *
 * The handler binds the pcb if it has no local port, determines which
 * protocol thread owns the resulting connection via tcp_addrport(),
 * and, if that is not the current thread, unlinks the pcb from this
 * cpu, flags the message PRUC_RECONNECT, points nm_dispatch back at
 * itself and forwards the message.  On the owning thread the pcb is
 * relinked (if PRUC_RECONNECT) and the connect is completed by
 * tcp_connect_oncpu().
 *
 * On completion or error: a leftover mbuf is freed, a held thread
 * reference (PRUC_HELDTD) is released, an error on an asynchronous
 * connect (PRUC_ASYNC) is posted on the socket, and the message is
 * replied to with the error code.
 */
static void
tcp_connect(netmsg_t msg)
{
	struct socket *so = msg->connect.base.nm_so;
	struct sockaddr *nam = msg->connect.nm_nam;
	struct thread *td = msg->connect.nm_td;
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct sockaddr_in *if_sin = NULL;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	lwkt_port_t port;

	COMMON_START(so, inp, 0);

	/*
	 * Reconnect our pcb if we have to
	 */
	if (msg->connect.nm_flags & PRUC_RECONNECT) {
		msg->connect.nm_flags &= ~PRUC_RECONNECT;
		in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
	}

	/*
	 * Bind if we have to
	 */
	if (inp->inp_lport == 0) {
		if (tcp_lport_extension) {
			KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY);

			/*
			 * Pick the local address first, then bind with
			 * knowledge of the remote address.
			 */
			error = in_pcbladdr(inp, nam, &if_sin, td);
			if (error)
				goto out;
			inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr;

			error = in_pcbbind_remote(inp, nam, td);
			if (error)
				goto out;

			/* The local address lookup below can be skipped. */
			msg->connect.nm_flags |= PRUC_HASLADDR;
		} else {
			error = in_pcbbind(inp, NULL, td);
			if (error)
				goto out;
		}
	}

	if ((msg->connect.nm_flags & PRUC_HASLADDR) == 0) {
		/*
		 * Calculate the correct protocol processing thread.  The
		 * connect operation must run there.  Set the forwarding
		 * port before we forward the message or it will get bounced
		 * right back to us.
		 */
		error = in_pcbladdr(inp, nam, &if_sin, td);
		if (error)
			goto out;
	}
	KKASSERT(inp->inp_socket == so);

	/* if_sin is only consulted while inp_laddr is still INADDR_ANY. */
	port = tcp_addrport(sin->sin_addr.s_addr, sin->sin_port,
			    (inp->inp_laddr.s_addr != INADDR_ANY ?
			     inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr),
			    inp->inp_lport);

	if (port != &curthread->td_msgport) {
		lwkt_msg_t lmsg = &msg->connect.base.lmsg;

		/*
		 * in_pcbladdr() may have allocated a route entry for us
		 * on the current CPU, but we need a route entry on the
		 * inpcb's owner CPU, so free it here.
		 */
		in_pcbresetroute(inp);

		/*
		 * We are moving the protocol processing port the socket
		 * is on, we have to unlink here and re-link on the
		 * target cpu.
		 */
		in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
		msg->connect.nm_flags |= PRUC_RECONNECT;
		msg->connect.base.nm_dispatch = tcp_connect;

		/*
		 * Use the message "put done" receipt to change this
		 * socket's so_port, i.e. _after_ this message was put
		 * onto the target netisr's msgport but _before_ the
		 * message could be pulled from the target netisr's
		 * msgport, so that:
		 * - The upper half (socket code) will not see the new
		 *   msgport before this message reaches the new msgport
		 *   and messages for this socket will be ordered.
		 * - This message will see the new msgport, when its
		 *   handler is called in the target netisr.
		 *
		 * NOTE:
		 * We MUST use the message put done receipt to change
		 * this socket's so_port:
		 * If we changed the so_port in this netisr after the
		 * lwkt_forwardmsg (so messages for this socket will be
		 * ordered) and changed the so_port in the target netisr
		 * at the very beginning of this message's handler, we
		 * would suffer a so_port overwrite race, given that this
		 * message might be forwarded again.
		 *
		 * NOTE:
		 * This mechanism depends on the netisr's msgport being
		 * a spin msgport (currently it is :).
		 *
		 * If the upper half saw the new msgport before this
		 * message reached the target netisr's msgport, the
		 * messages sent from the upper half could reach the new
		 * msgport before this message, thus there would be
		 * message reordering.  The worst case could be soclose()
		 * saw the new msgport and the detach message could reach
		 * the new msgport before this message, i.e. the inpcb
		 * could have been destroyed when this message was still
		 * pending on or on its way to the new msgport.  Other
		 * weird cases could also happen, e.g. inpcb->inp_pcbinfo,
		 * since we have unlinked this inpcb from the current
		 * pcbinfo first.
		 */
		lwkt_setmsg_receipt(lmsg, tcp_sosetport);
		lwkt_forwardmsg(port, lmsg);
		/* msg invalid now */
		return;
	} else if (msg->connect.nm_flags & PRUC_HELDTD) {
		/*
		 * The original thread is no longer needed; release it.
		 */
		lwkt_rele(td);
		msg->connect.nm_flags &= ~PRUC_HELDTD;
	}
	error = tcp_connect_oncpu(tp, msg->connect.nm_sndflags,
				  msg->connect.nm_m, sin, if_sin);
	/* tcp_connect_oncpu() consumed the mbuf; do not free it below. */
	msg->connect.nm_m = NULL;
out:
	if (msg->connect.nm_m) {
		m_freem(msg->connect.nm_m);
		msg->connect.nm_m = NULL;
	}
	if (msg->connect.nm_flags & PRUC_HELDTD)
		lwkt_rele(td);
	if (error && (msg->connect.nm_flags & PRUC_ASYNC)) {
		so->so_error = error;
		soisdisconnected(so);
	}
	lwkt_replymsg(&msg->connect.base.lmsg, error);
	/* msg invalid now */
}

#ifdef INET6

/*
 * Message handler for an IPv6 connect; the counterpart of tcp_connect().
 *
 * Binds the pcb with in6_pcbbind() if it has no local port, selects the
 * local address with in6_pcbladdr(), and forwards the message to the
 * port returned by tcp6_addrport() when that is not the current thread
 * (unlinking the pcb and flagging PRUC_RECONNECT first).  On the owning
 * thread the connect is completed by tcp6_connect_oncpu().
 *
 * A leftover mbuf is freed and the message is replied to with the error
 * code.
 */
static void
tcp6_connect(netmsg_t msg)
{
	struct tcpcb *tp;
	struct socket *so = msg->connect.base.nm_so;
	struct sockaddr *nam = msg->connect.nm_nam;
	struct thread *td = msg->connect.nm_td;
	struct inpcb *inp;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct in6_addr *addr6;
	lwkt_port_t port;
	int error;

	COMMON_START(so, inp, 0);

	/*
	 * Reconnect our pcb if we have to
	 */
	if (msg->connect.nm_flags & PRUC_RECONNECT) {
		msg->connect.nm_flags &= ~PRUC_RECONNECT;
		in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
	}

	/*
	 * Bind if we have to
	 */
	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, NULL, td);
		if (error)
			goto out;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	error = in6_pcbladdr(inp, nam, &addr6, td);
	if (error)
		goto out;

	port = tcp6_addrport();	/* XXX hack for now, always cpu0 */

	if (port != &curthread->td_msgport) {
		lwkt_msg_t lmsg = &msg->connect.base.lmsg;

		/*
		 * in_pcbladdr() may have allocated a route entry for us
		 * on the current CPU, but we need a route entry on the
		 * inpcb's owner CPU, so free it here.
		 */
		in_pcbresetroute(inp);

		/* Unlink here; the target cpu relinks on PRUC_RECONNECT. */
		in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]);
		msg->connect.nm_flags |= PRUC_RECONNECT;
		msg->connect.base.nm_dispatch = tcp6_connect;

		/* See the related comment in tcp_connect() */
		lwkt_setmsg_receipt(lmsg, tcp_sosetport);
		lwkt_forwardmsg(port, lmsg);
		/* msg invalid now */
		return;
	}
	error = tcp6_connect_oncpu(tp, msg->connect.nm_sndflags,
				   &msg->connect.nm_m, sin6, addr6);
	/* nm_m may still be intact */
out:
	if (msg->connect.nm_m) {
		m_freem(msg->connect.nm_m);
		msg->connect.nm_m = NULL;
	}
	lwkt_replymsg(&msg->connect.base.lmsg, error);
	/* msg invalid now */
}

static int
tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp,
		   struct sockaddr_in6 *sin6, struct in6_addr *addr6)
{
	struct mbuf *m = *mp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	struct inpcb *oinp;

	/*
	 * Cannot
simply call in_pcbconnect, because there might be an 1283 * earlier incarnation of this same connection still in 1284 * TIME_WAIT state, creating an ADDRINUSE error. 1285 */ 1286 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 1287 &sin6->sin6_addr, sin6->sin6_port, 1288 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 1289 addr6 : &inp->in6p_laddr), 1290 inp->inp_lport, 0, NULL); 1291 if (oinp) 1292 return (EADDRINUSE); 1293 1294 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1295 inp->in6p_laddr = *addr6; 1296 inp->in6p_faddr = sin6->sin6_addr; 1297 inp->inp_fport = sin6->sin6_port; 1298 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 1299 inp->in6p_flowinfo = sin6->sin6_flowinfo; 1300 in_pcbinsconnhash(inp); 1301 1302 /* 1303 * Now that no more errors can occur, change the protocol processing 1304 * port to the current thread (which is the correct thread). 1305 * 1306 * Create TCP timer message now; we are on the tcpcb's owner 1307 * CPU/thread. 1308 */ 1309 tcp_create_timermsg(tp, &curthread->td_msgport); 1310 1311 /* Compute window scaling to request. */ 1312 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1313 tp->request_r_scale = TCP_MIN_WINSHIFT; 1314 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1315 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) { 1316 tp->request_r_scale++; 1317 } 1318 1319 soisconnecting(so); 1320 tcpstat.tcps_connattempt++; 1321 tp->t_state = TCPS_SYN_SENT; 1322 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1323 tp->iss = tcp_new_isn(tp); 1324 tcp_sendseqinit(tp); 1325 if (m) { 1326 ssb_appendstream(&so->so_snd, m); 1327 *mp = NULL; 1328 if (flags & PRUS_OOB) 1329 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1330 } 1331 1332 /* 1333 * Close the send side of the connection after 1334 * the data is sent if flagged. 
1335 */ 1336 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1337 socantsendmore(so); 1338 tp = tcp_usrclosed(tp); 1339 } 1340 return (tcp_output(tp)); 1341 } 1342 1343 #endif /* INET6 */ 1344 1345 /* 1346 * The new sockopt interface makes it possible for us to block in the 1347 * copyin/out step (if we take a page fault). Taking a page fault while 1348 * in a critical section is probably a Bad Thing. (Since sockets and pcbs 1349 * both now use TSM, there probably isn't any need for this function to 1350 * run in a critical section any more. This needs more examination.) 1351 */ 1352 void 1353 tcp_ctloutput(netmsg_t msg) 1354 { 1355 struct socket *so = msg->base.nm_so; 1356 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1357 int error, opt, optval, opthz; 1358 struct inpcb *inp; 1359 struct tcpcb *tp; 1360 1361 error = 0; 1362 inp = so->so_pcb; 1363 if (inp == NULL) { 1364 error = ECONNRESET; 1365 goto done; 1366 } 1367 tp = intotcpcb(inp); 1368 1369 /* Get socket's owner cpuid hint */ 1370 if (sopt->sopt_level == SOL_SOCKET && 1371 sopt->sopt_dir == SOPT_GET && 1372 sopt->sopt_name == SO_CPUHINT) { 1373 if (tp->t_flags & TF_LISTEN) { 1374 /* 1375 * Listen sockets owner cpuid is always 0, 1376 * which does not make sense if SO_REUSEPORT 1377 * is not set. 1378 */ 1379 if (so->so_options & SO_REUSEPORT) 1380 optval = (inp->inp_lgrpindex & ncpus2_mask); 1381 else 1382 optval = -1; /* no hint */ 1383 } else { 1384 optval = mycpuid; 1385 } 1386 soopt_from_kbuf(sopt, &optval, sizeof(optval)); 1387 goto done; 1388 } 1389 1390 if (sopt->sopt_level != IPPROTO_TCP) { 1391 if (sopt->sopt_level == IPPROTO_IP) { 1392 switch (sopt->sopt_name) { 1393 case IP_MULTICAST_IF: 1394 case IP_MULTICAST_VIF: 1395 case IP_MULTICAST_TTL: 1396 case IP_MULTICAST_LOOP: 1397 case IP_ADD_MEMBERSHIP: 1398 case IP_DROP_MEMBERSHIP: 1399 /* 1400 * Multicast does not make sense on 1401 * TCP sockets. 
1402 */ 1403 error = EOPNOTSUPP; 1404 goto done; 1405 } 1406 } 1407 #ifdef INET6 1408 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1409 ip6_ctloutput_dispatch(msg); 1410 else 1411 #endif /* INET6 */ 1412 ip_ctloutput(msg); 1413 /* msg invalid now */ 1414 return; 1415 } 1416 1417 switch (sopt->sopt_dir) { 1418 case SOPT_SET: 1419 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1420 sizeof optval); 1421 if (error) 1422 break; 1423 switch (sopt->sopt_name) { 1424 case TCP_FASTKEEP: 1425 if (optval > 0) 1426 tp->t_keepidle = tp->t_keepintvl; 1427 else 1428 tp->t_keepidle = tcp_keepidle; 1429 tcp_timer_keep_activity(tp, 0); 1430 break; 1431 #ifdef TCP_SIGNATURE 1432 case TCP_SIGNATURE_ENABLE: 1433 if (tp->t_state == TCPS_CLOSED) { 1434 /* 1435 * This is the only safe state that this 1436 * option could be changed. Some segments 1437 * could already have been sent in other 1438 * states. 1439 */ 1440 if (optval > 0) 1441 tp->t_flags |= TF_SIGNATURE; 1442 else 1443 tp->t_flags &= ~TF_SIGNATURE; 1444 } else { 1445 error = EOPNOTSUPP; 1446 } 1447 break; 1448 #endif /* TCP_SIGNATURE */ 1449 case TCP_NODELAY: 1450 case TCP_NOOPT: 1451 switch (sopt->sopt_name) { 1452 case TCP_NODELAY: 1453 opt = TF_NODELAY; 1454 break; 1455 case TCP_NOOPT: 1456 opt = TF_NOOPT; 1457 break; 1458 default: 1459 opt = 0; /* dead code to fool gcc */ 1460 break; 1461 } 1462 1463 if (optval) 1464 tp->t_flags |= opt; 1465 else 1466 tp->t_flags &= ~opt; 1467 break; 1468 1469 case TCP_NOPUSH: 1470 if (tcp_disable_nopush) 1471 break; 1472 if (optval) 1473 tp->t_flags |= TF_NOPUSH; 1474 else { 1475 tp->t_flags &= ~TF_NOPUSH; 1476 error = tcp_output(tp); 1477 } 1478 break; 1479 1480 case TCP_MAXSEG: 1481 /* 1482 * Must be between 0 and maxseg. If the requested 1483 * maxseg is too small to satisfy the desired minmss, 1484 * pump it up (silently so sysctl modifications of 1485 * minmss do not create unexpected program failures). 1486 * Handle degenerate cases. 
1487 */ 1488 if (optval > 0 && optval <= tp->t_maxseg) { 1489 if (optval + 40 < tcp_minmss) { 1490 optval = tcp_minmss - 40; 1491 if (optval < 0) 1492 optval = 1; 1493 } 1494 tp->t_maxseg = optval; 1495 } else { 1496 error = EINVAL; 1497 } 1498 break; 1499 1500 case TCP_KEEPINIT: 1501 opthz = ((int64_t)optval * hz) / 1000; 1502 if (opthz >= 1) 1503 tp->t_keepinit = opthz; 1504 else 1505 error = EINVAL; 1506 break; 1507 1508 case TCP_KEEPIDLE: 1509 opthz = ((int64_t)optval * hz) / 1000; 1510 if (opthz >= 1) { 1511 tp->t_keepidle = opthz; 1512 tcp_timer_keep_activity(tp, 0); 1513 } else { 1514 error = EINVAL; 1515 } 1516 break; 1517 1518 case TCP_KEEPINTVL: 1519 opthz = ((int64_t)optval * hz) / 1000; 1520 if (opthz >= 1) { 1521 tp->t_keepintvl = opthz; 1522 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1523 } else { 1524 error = EINVAL; 1525 } 1526 break; 1527 1528 case TCP_KEEPCNT: 1529 if (optval > 0) { 1530 tp->t_keepcnt = optval; 1531 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1532 } else { 1533 error = EINVAL; 1534 } 1535 break; 1536 1537 default: 1538 error = ENOPROTOOPT; 1539 break; 1540 } 1541 break; 1542 1543 case SOPT_GET: 1544 switch (sopt->sopt_name) { 1545 #ifdef TCP_SIGNATURE 1546 case TCP_SIGNATURE_ENABLE: 1547 optval = (tp->t_flags & TF_SIGNATURE) ? 
1 : 0; 1548 break; 1549 #endif /* TCP_SIGNATURE */ 1550 case TCP_NODELAY: 1551 optval = tp->t_flags & TF_NODELAY; 1552 break; 1553 case TCP_MAXSEG: 1554 optval = tp->t_maxseg; 1555 break; 1556 case TCP_NOOPT: 1557 optval = tp->t_flags & TF_NOOPT; 1558 break; 1559 case TCP_NOPUSH: 1560 optval = tp->t_flags & TF_NOPUSH; 1561 break; 1562 case TCP_KEEPINIT: 1563 optval = ((int64_t)tp->t_keepinit * 1000) / hz; 1564 break; 1565 case TCP_KEEPIDLE: 1566 optval = ((int64_t)tp->t_keepidle * 1000) / hz; 1567 break; 1568 case TCP_KEEPINTVL: 1569 optval = ((int64_t)tp->t_keepintvl * 1000) / hz; 1570 break; 1571 case TCP_KEEPCNT: 1572 optval = tp->t_keepcnt; 1573 break; 1574 default: 1575 error = ENOPROTOOPT; 1576 break; 1577 } 1578 if (error == 0) 1579 soopt_from_kbuf(sopt, &optval, sizeof optval); 1580 break; 1581 } 1582 done: 1583 lwkt_replymsg(&msg->lmsg, error); 1584 } 1585 1586 /* 1587 * tcp_sendspace and tcp_recvspace are the default send and receive window 1588 * sizes, respectively. These are obsolescent (this information should 1589 * be set by the route). 1590 * 1591 * Use a default that does not require tcp window scaling to be turned 1592 * on. Individual programs or the administrator can increase the default. 1593 */ 1594 u_long tcp_sendspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1595 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1596 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1597 u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1598 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1599 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1600 1601 /* 1602 * Attach TCP protocol to socket, allocating internet protocol control 1603 * block, tcp control block, buffer space, and entering CLOSED state. 
1604 */ 1605 static int 1606 tcp_attach(struct socket *so, struct pru_attach_info *ai) 1607 { 1608 struct tcpcb *tp; 1609 struct inpcb *inp; 1610 int error; 1611 int cpu; 1612 #ifdef INET6 1613 boolean_t isipv6 = INP_CHECK_SOCKAF(so, AF_INET6); 1614 #endif 1615 1616 if (ai != NULL) { 1617 error = tcp_usr_preattach(so, 0 /* don't care */, ai); 1618 if (error) 1619 return (error); 1620 } else { 1621 /* Post attach; do nothing */ 1622 } 1623 1624 cpu = mycpu->gd_cpuid; 1625 1626 /* 1627 * Set the default pcbinfo. This will likely change when we 1628 * bind/connect. 1629 */ 1630 error = in_pcballoc(so, &tcbinfo[cpu]); 1631 if (error) 1632 return (error); 1633 inp = so->so_pcb; 1634 #ifdef INET6 1635 if (isipv6) 1636 inp->in6p_hops = -1; /* use kernel default */ 1637 #endif 1638 tp = tcp_newtcpcb(inp); 1639 KASSERT(tp != NULL, ("tcp_newtcpcb failed")); 1640 tp->t_state = TCPS_CLOSED; 1641 /* Keep a reference for asynchronized pru_rcvd */ 1642 soreference(so); 1643 return (0); 1644 } 1645 1646 /* 1647 * Initiate (or continue) disconnect. 1648 * If embryonic state, just send reset (once). 1649 * If in ``let data drain'' option and linger null, just drop. 1650 * Otherwise (hard), mark socket disconnecting and drop 1651 * current input data; switch states based on user close, and 1652 * send segment to peer (with FIN). 
1653 */ 1654 static struct tcpcb * 1655 tcp_disconnect(struct tcpcb *tp) 1656 { 1657 struct socket *so = tp->t_inpcb->inp_socket; 1658 1659 if (tp->t_state < TCPS_ESTABLISHED) { 1660 tp = tcp_close(tp); 1661 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1662 tp = tcp_drop(tp, 0); 1663 } else { 1664 lwkt_gettoken(&so->so_rcv.ssb_token); 1665 soisdisconnecting(so); 1666 sbflush(&so->so_rcv.sb); 1667 tp = tcp_usrclosed(tp); 1668 if (tp) 1669 tcp_output(tp); 1670 lwkt_reltoken(&so->so_rcv.ssb_token); 1671 } 1672 return (tp); 1673 } 1674 1675 /* 1676 * User issued close, and wish to trail through shutdown states: 1677 * if never received SYN, just forget it. If got a SYN from peer, 1678 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1679 * If already got a FIN from peer, then almost done; go to LAST_ACK 1680 * state. In all other cases, have already sent FIN to peer (e.g. 1681 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1682 * for peer to send FIN or not respond to keep-alives, etc. 1683 * We can let the user exit from the close as soon as the FIN is acked. 1684 */ 1685 static struct tcpcb * 1686 tcp_usrclosed(struct tcpcb *tp) 1687 { 1688 1689 switch (tp->t_state) { 1690 1691 case TCPS_CLOSED: 1692 case TCPS_LISTEN: 1693 tp->t_state = TCPS_CLOSED; 1694 tp = tcp_close(tp); 1695 break; 1696 1697 case TCPS_SYN_SENT: 1698 case TCPS_SYN_RECEIVED: 1699 tp->t_flags |= TF_NEEDFIN; 1700 break; 1701 1702 case TCPS_ESTABLISHED: 1703 tp->t_state = TCPS_FIN_WAIT_1; 1704 break; 1705 1706 case TCPS_CLOSE_WAIT: 1707 tp->t_state = TCPS_LAST_ACK; 1708 break; 1709 } 1710 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1711 soisdisconnected(tp->t_inpcb->inp_socket); 1712 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1713 if (tp->t_state == TCPS_FIN_WAIT_2) { 1714 tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle, 1715 tcp_timer_2msl); 1716 } 1717 } 1718 return (tp); 1719 } 1720