1 /* 2 * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1993 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 * 62 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 63 * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.51.2.17 2002/10/11 11:46:44 ume Exp $ 64 */ 65 66 #include "opt_inet.h" 67 #include "opt_inet6.h" 68 #include "opt_tcpdebug.h" 69 70 #include <sys/param.h> 71 #include <sys/systm.h> 72 #include <sys/kernel.h> 73 #include <sys/malloc.h> 74 #include <sys/sysctl.h> 75 #include <sys/globaldata.h> 76 #include <sys/thread.h> 77 78 #include <sys/mbuf.h> 79 #ifdef INET6 80 #include <sys/domain.h> 81 #endif /* INET6 */ 82 #include <sys/socket.h> 83 #include <sys/socketvar.h> 84 #include <sys/socketops.h> 85 #include <sys/protosw.h> 86 87 #include <sys/thread2.h> 88 #include <sys/msgport2.h> 89 #include <sys/socketvar2.h> 90 91 #include <net/if.h> 92 #include <net/netisr.h> 93 #include <net/route.h> 94 95 #include <net/netmsg2.h> 96 #include <net/netisr2.h> 97 98 #include <netinet/in.h> 99 #include <netinet/in_systm.h> 100 #ifdef INET6 101 #include <netinet/ip6.h> 102 #endif 103 #include <netinet/in_pcb.h> 104 #ifdef INET6 105 #include <netinet6/in6_pcb.h> 106 #endif 107 #include <netinet/in_var.h> 108 #include <netinet/ip_var.h> 109 #ifdef INET6 110 #include <netinet6/ip6_var.h> 111 #include <netinet6/tcp6_var.h> 112 #endif 113 #include <netinet/tcp.h> 114 #include <netinet/tcp_fsm.h> 115 #include <netinet/tcp_seq.h> 116 #include <netinet/tcp_timer.h> 117 #include <netinet/tcp_timer2.h> 118 #include <netinet/tcp_var.h> 119 #include <netinet/tcpip.h> 120 #ifdef TCPDEBUG 121 #include <netinet/tcp_debug.h> 122 #endif 123 124 /* 125 * TCP protocol interface to socket abstraction. 126 */ 127 extern char *tcpstates[]; /* XXX ??? */ 128 129 static int tcp_attach (struct socket *, struct pru_attach_info *); 130 static void tcp_connect (netmsg_t msg); 131 #ifdef INET6 132 static void tcp6_connect (netmsg_t msg); 133 static int tcp6_connect_oncpu(struct tcpcb *tp, int flags, 134 struct mbuf **mp, 135 struct sockaddr_in6 *sin6, 136 struct in6_addr *addr6); 137 #endif /* INET6 */ 138 static struct tcpcb * 139 tcp_disconnect (struct tcpcb *); 140 static struct tcpcb * 141 tcp_usrclosed (struct tcpcb *); 142 143 #ifdef TCPDEBUG 144 #define TCPDEBUG0 int ostate = 0 145 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 146 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 147 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 148 #else 149 #define TCPDEBUG0 150 #define TCPDEBUG1() 151 #define TCPDEBUG2(req) 152 #endif 153 154 /* 155 * For some ill optimized programs, which try to use TCP_NOPUSH 156 * to improve performance, will have small amount of data sits 157 * in the sending buffer. These small amount of data will _not_ 158 * be pushed into the network until more data are written into 159 * the socket or the socket write side is shutdown. 160 */ 161 static int tcp_disable_nopush = 1; 162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_nopush, CTLFLAG_RW, 163 &tcp_disable_nopush, 0, "TCP_NOPUSH socket option will have no effect"); 164 165 /* 166 * Allocate socket buffer space. 167 */ 168 static int 169 tcp_usr_preattach(struct socket *so, int proto __unused, 170 struct pru_attach_info *ai) 171 { 172 int error; 173 174 if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) { 175 error = soreserve(so, tcp_sendspace, tcp_recvspace, 176 ai->sb_rlimit); 177 if (error) 178 return (error); 179 } 180 atomic_set_int(&so->so_rcv.ssb_flags, SSB_AUTOSIZE); 181 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE | SSB_PREALLOC); 182 183 return 0; 184 } 185 186 /* 187 * TCP attaches to socket via pru_attach(), reserving space, 188 * and an internet control block. This socket may move to 189 * other CPU later when we bind/connect. 190 */ 191 static void 192 tcp_usr_attach(netmsg_t msg) 193 { 194 struct socket *so = msg->base.nm_so; 195 struct pru_attach_info *ai = msg->attach.nm_ai; 196 int error; 197 struct inpcb *inp; 198 struct tcpcb *tp = NULL; 199 TCPDEBUG0; 200 201 inp = so->so_pcb; 202 KASSERT(inp == NULL, ("tcp socket attached")); 203 TCPDEBUG1(); 204 205 error = tcp_attach(so, ai); 206 if (error) 207 goto out; 208 209 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 210 so->so_linger = TCP_LINGERTIME; 211 tp = sototcpcb(so); 212 out: 213 TCPDEBUG2(PRU_ATTACH); 214 lwkt_replymsg(&msg->lmsg, error); 215 } 216 217 /* 218 * pru_detach() detaches the TCP protocol from the socket. 219 * If the protocol state is non-embryonic, then can't 220 * do this directly: have to initiate a pru_disconnect(), 221 * which may finish later; embryonic TCB's can just 222 * be discarded here. 223 */ 224 static void 225 tcp_usr_detach(netmsg_t msg) 226 { 227 struct socket *so = msg->base.nm_so; 228 int error = 0; 229 struct inpcb *inp; 230 struct tcpcb *tp; 231 TCPDEBUG0; 232 233 inp = so->so_pcb; 234 235 /* 236 * If the inp is already detached or never attached, it may have 237 * been due to an async close or async attach failure. Just return 238 * as if no error occured. 239 */ 240 if (inp) { 241 tp = intotcpcb(inp); 242 KASSERT(tp != NULL, ("tcp_usr_detach: tp is NULL")); 243 TCPDEBUG1(); 244 tp = tcp_disconnect(tp); 245 TCPDEBUG2(PRU_DETACH); 246 } 247 lwkt_replymsg(&msg->lmsg, error); 248 } 249 250 /* 251 * NOTE: ignore_error is non-zero for certain disconnection races 252 * which we want to silently allow, otherwise close() may return 253 * an unexpected error. 254 * 255 * NOTE: The variables (msg) and (tp) are assumed. 256 */ 257 #define COMMON_START(so, inp, ignore_error) \ 258 TCPDEBUG0; \ 259 \ 260 inp = so->so_pcb; \ 261 do { \ 262 if (inp == NULL) { \ 263 error = ignore_error ? 0 : EINVAL; \ 264 tp = NULL; \ 265 goto out; \ 266 } \ 267 tp = intotcpcb(inp); \ 268 TCPDEBUG1(); \ 269 } while(0) 270 271 #define COMMON_END1(req, noreply) \ 272 out: do { \ 273 TCPDEBUG2(req); \ 274 if (!(noreply)) \ 275 lwkt_replymsg(&msg->lmsg, error); \ 276 return; \ 277 } while(0) 278 279 #define COMMON_END(req) COMMON_END1((req), 0) 280 281 static void 282 tcp_sosetport(struct lwkt_msg *msg, lwkt_port_t port) 283 { 284 sosetport(((struct netmsg_base *)msg)->nm_so, port); 285 } 286 287 /* 288 * Give the socket an address. 289 */ 290 static void 291 tcp_usr_bind(netmsg_t msg) 292 { 293 struct socket *so = msg->bind.base.nm_so; 294 struct sockaddr *nam = msg->bind.nm_nam; 295 struct thread *td = msg->bind.nm_td; 296 int error = 0; 297 struct inpcb *inp; 298 struct tcpcb *tp; 299 struct sockaddr_in *sinp; 300 lwkt_port_t port0 = netisr_cpuport(0); 301 302 COMMON_START(so, inp, 0); 303 304 /* 305 * Must check for multicast addresses and disallow binding 306 * to them. 307 */ 308 sinp = (struct sockaddr_in *)nam; 309 if (sinp->sin_family == AF_INET && 310 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 311 error = EAFNOSUPPORT; 312 goto out; 313 } 314 315 /* 316 * Check "already bound" here (in_pcbbind() does the same check 317 * though), so we don't forward a connected socket to netisr0, 318 * which would panic in the following in_pcbunlink(). 319 */ 320 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) { 321 error = EINVAL; /* already bound */ 322 goto out; 323 } 324 325 /* 326 * Use netisr0 to serialize in_pcbbind(), so that pru_detach and 327 * pru_bind for different sockets on the same local port could be 328 * properly ordered. The original race is illustrated here for 329 * reference. 330 * 331 * s1 = socket(); 332 * bind(s1, *.PORT); 333 * close(s1); <----- asynchronous 334 * s2 = socket(); 335 * bind(s2, *.PORT); 336 * 337 * All will expect bind(s2, *.PORT) to succeed. However, it will 338 * fail, if following sequence happens due to random socket initial 339 * msgport and asynchronous close(2): 340 * 341 * netisrN netisrM 342 * : : 343 * : pru_bind(s2) [*.PORT is used by s1] 344 * pru_detach(s1) : 345 */ 346 if (&curthread->td_msgport != port0) { 347 lwkt_msg_t lmsg = &msg->bind.base.lmsg; 348 349 KASSERT((msg->bind.nm_flags & PRUB_RELINK) == 0, 350 ("already asked to relink")); 351 352 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]); 353 msg->bind.nm_flags |= PRUB_RELINK; 354 355 TCP_STATE_MIGRATE_START(tp); 356 357 /* See the related comment in tcp_connect() */ 358 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 359 lwkt_forwardmsg(port0, lmsg); 360 /* msg invalid now */ 361 return; 362 } 363 KASSERT(so->so_port == port0, ("so_port is not netisr0")); 364 365 if (msg->bind.nm_flags & PRUB_RELINK) { 366 msg->bind.nm_flags &= ~PRUB_RELINK; 367 TCP_STATE_MIGRATE_END(tp); 368 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]); 369 } 370 KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0")); 371 372 error = in_pcbbind(inp, nam, td); 373 if (error) 374 goto out; 375 376 COMMON_END(PRU_BIND); 377 } 378 379 #ifdef INET6 380 381 static void 382 tcp6_usr_bind(netmsg_t msg) 383 { 384 struct socket *so = msg->bind.base.nm_so; 385 struct sockaddr *nam = msg->bind.nm_nam; 386 struct thread *td = msg->bind.nm_td; 387 int error = 0; 388 struct inpcb *inp; 389 struct tcpcb *tp; 390 struct sockaddr_in6 *sin6p; 391 392 COMMON_START(so, inp, 0); 393 394 /* 395 * Must check for multicast addresses and disallow binding 396 * to them. 397 */ 398 sin6p = (struct sockaddr_in6 *)nam; 399 if (sin6p->sin6_family == AF_INET6 && 400 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 401 error = EAFNOSUPPORT; 402 goto out; 403 } 404 error = in6_pcbbind(inp, nam, td); 405 if (error) 406 goto out; 407 COMMON_END(PRU_BIND); 408 } 409 #endif /* INET6 */ 410 411 struct netmsg_inswildcard { 412 struct netmsg_base base; 413 struct inpcb *nm_inp; 414 }; 415 416 static void 417 in_pcbinswildcardhash_handler(netmsg_t msg) 418 { 419 struct netmsg_inswildcard *nm = (struct netmsg_inswildcard *)msg; 420 int cpu = mycpuid, nextcpu; 421 422 in_pcbinswildcardhash_oncpu(nm->nm_inp, &tcbinfo[cpu]); 423 424 nextcpu = cpu + 1; 425 if (nextcpu < netisr_ncpus) 426 lwkt_forwardmsg(netisr_cpuport(nextcpu), &nm->base.lmsg); 427 else 428 lwkt_replymsg(&nm->base.lmsg, 0); 429 } 430 431 /* 432 * Prepare to accept connections. 433 */ 434 static void 435 tcp_usr_listen(netmsg_t msg) 436 { 437 struct socket *so = msg->listen.base.nm_so; 438 struct thread *td = msg->listen.nm_td; 439 int error = 0; 440 struct inpcb *inp; 441 struct tcpcb *tp; 442 struct netmsg_inswildcard nm; 443 lwkt_port_t port0 = netisr_cpuport(0); 444 445 COMMON_START(so, inp, 0); 446 447 if (&curthread->td_msgport != port0) { 448 lwkt_msg_t lmsg = &msg->listen.base.lmsg; 449 450 KASSERT((msg->listen.nm_flags & PRUL_RELINK) == 0, 451 ("already asked to relink")); 452 453 in_pcbunlink(so->so_pcb, &tcbinfo[mycpuid]); 454 msg->listen.nm_flags |= PRUL_RELINK; 455 456 TCP_STATE_MIGRATE_START(tp); 457 458 /* See the related comment in tcp_connect() */ 459 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 460 lwkt_forwardmsg(port0, lmsg); 461 /* msg invalid now */ 462 return; 463 } 464 KASSERT(so->so_port == port0, ("so_port is not netisr0")); 465 466 if (msg->listen.nm_flags & PRUL_RELINK) { 467 msg->listen.nm_flags &= ~PRUL_RELINK; 468 TCP_STATE_MIGRATE_END(tp); 469 in_pcblink(so->so_pcb, &tcbinfo[mycpuid]); 470 } 471 KASSERT(inp->inp_pcbinfo == &tcbinfo[0], ("pcbinfo is not tcbinfo0")); 472 473 if (tp->t_flags & TF_LISTEN) 474 goto out; 475 476 if (inp->inp_lport == 0) { 477 error = in_pcbbind(inp, NULL, td); 478 if (error) 479 goto out; 480 } 481 482 TCP_STATE_CHANGE(tp, TCPS_LISTEN); 483 tp->t_flags |= TF_LISTEN; 484 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 485 486 /* 487 * Create tcpcb per-cpu port cache 488 * 489 * NOTE: 490 * This _must_ be done before installing this inpcb into 491 * wildcard hash. 492 */ 493 tcp_pcbport_create(tp); 494 495 if (netisr_ncpus > 1) { 496 /* 497 * Put this inpcb into wildcard hash on other cpus. 498 */ 499 ASSERT_INP_NOTINHASH(inp); 500 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 501 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 502 nm.nm_inp = inp; 503 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0); 504 } 505 in_pcbinswildcardhash(inp); 506 COMMON_END(PRU_LISTEN); 507 } 508 509 #ifdef INET6 510 511 static void 512 tcp6_usr_listen(netmsg_t msg) 513 { 514 struct socket *so = msg->listen.base.nm_so; 515 struct thread *td = msg->listen.nm_td; 516 int error = 0; 517 struct inpcb *inp; 518 struct tcpcb *tp; 519 struct netmsg_inswildcard nm; 520 521 COMMON_START(so, inp, 0); 522 523 if (tp->t_flags & TF_LISTEN) 524 goto out; 525 526 if (inp->inp_lport == 0) { 527 error = in6_pcbbind(inp, NULL, td); 528 if (error) 529 goto out; 530 } 531 532 TCP_STATE_CHANGE(tp, TCPS_LISTEN); 533 tp->t_flags |= TF_LISTEN; 534 tp->tt_msg = NULL; /* Catch any invalid timer usage */ 535 536 /* 537 * Create tcpcb per-cpu port cache 538 * 539 * NOTE: 540 * This _must_ be done before installing this inpcb into 541 * wildcard hash. 542 */ 543 tcp_pcbport_create(tp); 544 545 if (netisr_ncpus > 1) { 546 /* 547 * Put this inpcb into wildcard hash on other cpus. 548 */ 549 KKASSERT(so->so_port == netisr_cpuport(0)); 550 ASSERT_NETISR0; 551 KKASSERT(inp->inp_pcbinfo == &tcbinfo[0]); 552 ASSERT_INP_NOTINHASH(inp); 553 554 netmsg_init(&nm.base, NULL, &curthread->td_msgport, 555 MSGF_PRIORITY, in_pcbinswildcardhash_handler); 556 nm.nm_inp = inp; 557 lwkt_domsg(netisr_cpuport(1), &nm.base.lmsg, 0); 558 } 559 in_pcbinswildcardhash(inp); 560 COMMON_END(PRU_LISTEN); 561 } 562 #endif /* INET6 */ 563 564 /* 565 * Initiate connection to peer. 566 * Create a template for use in transmissions on this connection. 567 * Enter SYN_SENT state, and mark socket as connecting. 568 * Start keep-alive timer, and seed output sequence space. 569 * Send initial segment on connection. 570 */ 571 static void 572 tcp_usr_connect(netmsg_t msg) 573 { 574 struct socket *so = msg->connect.base.nm_so; 575 struct sockaddr *nam = msg->connect.nm_nam; 576 struct thread *td = msg->connect.nm_td; 577 int error = 0; 578 struct inpcb *inp; 579 struct tcpcb *tp; 580 struct sockaddr_in *sinp; 581 582 ASSERT_NETISR_NCPUS(mycpuid); 583 584 COMMON_START(so, inp, 0); 585 586 /* 587 * Must disallow TCP ``connections'' to multicast addresses. 588 */ 589 sinp = (struct sockaddr_in *)nam; 590 if (sinp->sin_family == AF_INET 591 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 592 error = EAFNOSUPPORT; 593 goto out; 594 } 595 596 if (!prison_remote_ip(td, (struct sockaddr*)sinp)) { 597 error = EAFNOSUPPORT; /* IPv6 only jail */ 598 goto out; 599 } 600 601 tcp_connect(msg); 602 /* msg is invalid now */ 603 return; 604 out: 605 if (msg->connect.nm_m) { 606 m_freem(msg->connect.nm_m); 607 msg->connect.nm_m = NULL; 608 } 609 if (msg->connect.nm_flags & PRUC_HELDTD) 610 lwkt_rele(td); 611 if (error && (msg->connect.nm_flags & PRUC_ASYNC)) { 612 so->so_error = error; 613 soisdisconnected(so); 614 } 615 lwkt_replymsg(&msg->lmsg, error); 616 } 617 618 #ifdef INET6 619 620 static void 621 tcp6_usr_connect(netmsg_t msg) 622 { 623 struct socket *so = msg->connect.base.nm_so; 624 struct sockaddr *nam = msg->connect.nm_nam; 625 struct thread *td = msg->connect.nm_td; 626 int error = 0; 627 struct inpcb *inp; 628 struct tcpcb *tp; 629 struct sockaddr_in6 *sin6p; 630 631 ASSERT_NETISR_NCPUS(mycpuid); 632 633 COMMON_START(so, inp, 0); 634 635 /* 636 * Must disallow TCP ``connections'' to multicast addresses. 637 */ 638 sin6p = (struct sockaddr_in6 *)nam; 639 if (sin6p->sin6_family == AF_INET6 640 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { 641 error = EAFNOSUPPORT; 642 goto out; 643 } 644 645 if (!prison_remote_ip(td, nam)) { 646 error = EAFNOSUPPORT; /* IPv4 only jail */ 647 goto out; 648 } 649 650 /* Reject v4-mapped address */ 651 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 652 error = EADDRNOTAVAIL; 653 goto out; 654 } 655 656 inp->inp_inc.inc_isipv6 = 1; 657 tcp6_connect(msg); 658 /* msg is invalid now */ 659 return; 660 out: 661 if (msg->connect.nm_m) { 662 m_freem(msg->connect.nm_m); 663 msg->connect.nm_m = NULL; 664 } 665 lwkt_replymsg(&msg->lmsg, error); 666 } 667 668 #endif /* INET6 */ 669 670 /* 671 * Initiate disconnect from peer. 672 * If connection never passed embryonic stage, just drop; 673 * else if don't need to let data drain, then can just drop anyways, 674 * else have to begin TCP shutdown process: mark socket disconnecting, 675 * drain unread data, state switch to reflect user close, and 676 * send segment (e.g. FIN) to peer. Socket will be really disconnected 677 * when peer sends FIN and acks ours. 678 * 679 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 680 */ 681 static void 682 tcp_usr_disconnect(netmsg_t msg) 683 { 684 struct socket *so = msg->disconnect.base.nm_so; 685 int error = 0; 686 struct inpcb *inp; 687 struct tcpcb *tp; 688 689 COMMON_START(so, inp, 1); 690 tp = tcp_disconnect(tp); 691 COMMON_END(PRU_DISCONNECT); 692 } 693 694 /* 695 * Accept a connection. Essentially all the work is 696 * done at higher levels; just return the address 697 * of the peer, storing through addr. 698 */ 699 static void 700 tcp_usr_accept(netmsg_t msg) 701 { 702 struct socket *so = msg->accept.base.nm_so; 703 struct sockaddr **nam = msg->accept.nm_nam; 704 int error = 0; 705 struct inpcb *inp; 706 struct tcpcb *tp = NULL; 707 TCPDEBUG0; 708 709 inp = so->so_pcb; 710 if (so->so_state & SS_ISDISCONNECTED) { 711 error = ECONNABORTED; 712 goto out; 713 } 714 if (inp == NULL) { 715 error = EINVAL; 716 goto out; 717 } 718 719 tp = intotcpcb(inp); 720 TCPDEBUG1(); 721 in_setpeeraddr(so, nam); 722 COMMON_END(PRU_ACCEPT); 723 } 724 725 #ifdef INET6 726 static void 727 tcp6_usr_accept(netmsg_t msg) 728 { 729 struct socket *so = msg->accept.base.nm_so; 730 struct sockaddr **nam = msg->accept.nm_nam; 731 int error = 0; 732 struct inpcb *inp; 733 struct tcpcb *tp = NULL; 734 TCPDEBUG0; 735 736 inp = so->so_pcb; 737 738 if (so->so_state & SS_ISDISCONNECTED) { 739 error = ECONNABORTED; 740 goto out; 741 } 742 if (inp == NULL) { 743 error = EINVAL; 744 goto out; 745 } 746 tp = intotcpcb(inp); 747 TCPDEBUG1(); 748 in6_setpeeraddr(so, nam); 749 COMMON_END(PRU_ACCEPT); 750 } 751 #endif /* INET6 */ 752 753 /* 754 * Mark the connection as being incapable of further output. 755 */ 756 static void 757 tcp_usr_shutdown(netmsg_t msg) 758 { 759 struct socket *so = msg->shutdown.base.nm_so; 760 int error = 0; 761 struct inpcb *inp; 762 struct tcpcb *tp; 763 764 COMMON_START(so, inp, 0); 765 socantsendmore(so); 766 tp = tcp_usrclosed(tp); 767 if (tp) 768 error = tcp_output(tp); 769 COMMON_END(PRU_SHUTDOWN); 770 } 771 772 /* 773 * After a receive, possibly send window update to peer. 774 */ 775 static void 776 tcp_usr_rcvd(netmsg_t msg) 777 { 778 struct socket *so = msg->rcvd.base.nm_so; 779 int error = 0, noreply = 0; 780 struct inpcb *inp; 781 struct tcpcb *tp; 782 783 COMMON_START(so, inp, 0); 784 785 if (msg->rcvd.nm_pru_flags & PRUR_ASYNC) { 786 noreply = 1; 787 so_async_rcvd_reply(so); 788 } 789 tcp_output(tp); 790 791 COMMON_END1(PRU_RCVD, noreply); 792 } 793 794 /* 795 * Do a send by putting data in output queue and updating urgent 796 * marker if URG set. Possibly send more data. Unlike the other 797 * pru_*() routines, the mbuf chains are our responsibility. We 798 * must either enqueue them or free them. The other pru_* routines 799 * generally are caller-frees. 800 */ 801 static void 802 tcp_usr_send(netmsg_t msg) 803 { 804 struct socket *so = msg->send.base.nm_so; 805 int flags = msg->send.nm_flags; 806 struct mbuf *m = msg->send.nm_m; 807 int error = 0; 808 struct inpcb *inp; 809 struct tcpcb *tp; 810 TCPDEBUG0; 811 812 KKASSERT(msg->send.nm_control == NULL); 813 KKASSERT(msg->send.nm_addr == NULL); 814 KKASSERT((flags & PRUS_FREEADDR) == 0); 815 816 inp = so->so_pcb; 817 818 if (inp == NULL) { 819 /* 820 * OOPS! we lost a race, the TCP session got reset after 821 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a 822 * network interrupt in the non-critical section of sosend(). 823 */ 824 m_freem(m); 825 error = ECONNRESET; /* XXX EPIPE? */ 826 tp = NULL; 827 TCPDEBUG1(); 828 goto out; 829 } 830 tp = intotcpcb(inp); 831 TCPDEBUG1(); 832 833 #ifdef foo 834 /* 835 * This is no longer necessary, since: 836 * - sosendtcp() has already checked it for us 837 * - It does not work with asynchronized send 838 */ 839 840 /* 841 * Don't let too much OOB data build up 842 */ 843 if (flags & PRUS_OOB) { 844 if (ssb_space(&so->so_snd) < -512) { 845 m_freem(m); 846 error = ENOBUFS; 847 goto out; 848 } 849 } 850 #endif 851 852 /* 853 * Pump the data into the socket. 854 */ 855 if (m) { 856 ssb_appendstream(&so->so_snd, m); 857 sowwakeup(so); 858 } 859 if (flags & PRUS_OOB) { 860 /* 861 * According to RFC961 (Assigned Protocols), 862 * the urgent pointer points to the last octet 863 * of urgent data. We continue, however, 864 * to consider it to indicate the first octet 865 * of data past the urgent section. 866 * Otherwise, snd_up should be one lower. 867 */ 868 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 869 tp->t_flags |= TF_FORCE; 870 error = tcp_output(tp); 871 tp->t_flags &= ~TF_FORCE; 872 } else { 873 if (flags & PRUS_EOF) { 874 /* 875 * Close the send side of the connection after 876 * the data is sent. 877 */ 878 socantsendmore(so); 879 tp = tcp_usrclosed(tp); 880 } 881 if (tp != NULL && !tcp_output_pending(tp)) { 882 if (flags & PRUS_MORETOCOME) 883 tp->t_flags |= TF_MORETOCOME; 884 error = tcp_output_fair(tp); 885 if (flags & PRUS_MORETOCOME) 886 tp->t_flags &= ~TF_MORETOCOME; 887 } 888 } 889 COMMON_END1((flags & PRUS_OOB) ? PRU_SENDOOB : 890 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), 891 (flags & PRUS_NOREPLY)); 892 } 893 894 /* 895 * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort() 896 * will sofree() it when we return. 897 */ 898 static void 899 tcp_usr_abort(netmsg_t msg) 900 { 901 struct socket *so = msg->abort.base.nm_so; 902 int error = 0; 903 struct inpcb *inp; 904 struct tcpcb *tp; 905 906 COMMON_START(so, inp, 1); 907 tp = tcp_drop(tp, ECONNABORTED); 908 COMMON_END(PRU_ABORT); 909 } 910 911 /* 912 * Receive out-of-band data. 913 */ 914 static void 915 tcp_usr_rcvoob(netmsg_t msg) 916 { 917 struct socket *so = msg->rcvoob.base.nm_so; 918 struct mbuf *m = msg->rcvoob.nm_m; 919 int flags = msg->rcvoob.nm_flags; 920 int error = 0; 921 struct inpcb *inp; 922 struct tcpcb *tp; 923 924 COMMON_START(so, inp, 0); 925 if ((so->so_oobmark == 0 && 926 (so->so_state & SS_RCVATMARK) == 0) || 927 so->so_options & SO_OOBINLINE || 928 tp->t_oobflags & TCPOOB_HADDATA) { 929 error = EINVAL; 930 goto out; 931 } 932 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 933 error = EWOULDBLOCK; 934 goto out; 935 } 936 m->m_len = 1; 937 *mtod(m, caddr_t) = tp->t_iobc; 938 if ((flags & MSG_PEEK) == 0) 939 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 940 COMMON_END(PRU_RCVOOB); 941 } 942 943 static void 944 tcp_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 945 { 946 in_savefaddr(so, faddr); 947 } 948 949 #ifdef INET6 950 static void 951 tcp6_usr_savefaddr(struct socket *so, const struct sockaddr *faddr) 952 { 953 in6_savefaddr(so, faddr); 954 } 955 #endif 956 957 static int 958 tcp_usr_preconnect(struct socket *so, const struct sockaddr *nam, 959 struct thread *td __unused) 960 { 961 const struct sockaddr_in *sinp; 962 963 sinp = (const struct sockaddr_in *)nam; 964 if (sinp->sin_family == AF_INET && 965 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 966 return EAFNOSUPPORT; 967 968 soisconnecting(so); 969 return 0; 970 } 971 972 /* xxx - should be const */ 973 struct pr_usrreqs tcp_usrreqs = { 974 .pru_abort = tcp_usr_abort, 975 .pru_accept = tcp_usr_accept, 976 .pru_attach = tcp_usr_attach, 977 .pru_bind = tcp_usr_bind, 978 .pru_connect = tcp_usr_connect, 979 .pru_connect2 = pr_generic_notsupp, 980 .pru_control = in_control_dispatch, 981 .pru_detach = tcp_usr_detach, 982 .pru_disconnect = tcp_usr_disconnect, 983 .pru_listen = tcp_usr_listen, 984 .pru_peeraddr = in_setpeeraddr_dispatch, 985 .pru_rcvd = tcp_usr_rcvd, 986 .pru_rcvoob = tcp_usr_rcvoob, 987 .pru_send = tcp_usr_send, 988 .pru_sense = pru_sense_null, 989 .pru_shutdown = tcp_usr_shutdown, 990 .pru_sockaddr = in_setsockaddr_dispatch, 991 .pru_sosend = sosendtcp, 992 .pru_soreceive = sorecvtcp, 993 .pru_savefaddr = tcp_usr_savefaddr, 994 .pru_preconnect = tcp_usr_preconnect, 995 .pru_preattach = tcp_usr_preattach 996 }; 997 998 #ifdef INET6 999 struct pr_usrreqs tcp6_usrreqs = { 1000 .pru_abort = tcp_usr_abort, 1001 .pru_accept = tcp6_usr_accept, 1002 .pru_attach = tcp_usr_attach, 1003 .pru_bind = tcp6_usr_bind, 1004 .pru_connect = tcp6_usr_connect, 1005 .pru_connect2 = pr_generic_notsupp, 1006 .pru_control = in6_control_dispatch, 1007 .pru_detach = tcp_usr_detach, 1008 .pru_disconnect = tcp_usr_disconnect, 1009 .pru_listen = tcp6_usr_listen, 1010 .pru_peeraddr = in6_setpeeraddr_dispatch, 1011 .pru_rcvd = tcp_usr_rcvd, 1012 .pru_rcvoob = tcp_usr_rcvoob, 1013 .pru_send = tcp_usr_send, 1014 .pru_sense = pru_sense_null, 1015 .pru_shutdown = tcp_usr_shutdown, 1016 .pru_sockaddr = in6_setsockaddr_dispatch, 1017 .pru_sosend = sosendtcp, 1018 .pru_soreceive = sorecvtcp, 1019 .pru_savefaddr = tcp6_usr_savefaddr 1020 }; 1021 #endif /* INET6 */ 1022 1023 static int 1024 tcp_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf *m, 1025 const struct sockaddr_in *sin, struct sockaddr_in *if_sin, 1026 uint16_t hash) 1027 { 1028 struct inpcb *inp = tp->t_inpcb, *oinp; 1029 struct socket *so = inp->inp_socket; 1030 struct route *ro = &inp->inp_route; 1031 1032 KASSERT(inp->inp_pcbinfo == &tcbinfo[mycpu->gd_cpuid], 1033 ("pcbinfo mismatch")); 1034 1035 oinp = in_pcblookup_hash(inp->inp_pcbinfo, 1036 sin->sin_addr, sin->sin_port, 1037 (inp->inp_laddr.s_addr != INADDR_ANY ? 1038 inp->inp_laddr : if_sin->sin_addr), 1039 inp->inp_lport, 0, NULL); 1040 if (oinp != NULL) { 1041 m_freem(m); 1042 return (EADDRINUSE); 1043 } 1044 if (inp->inp_laddr.s_addr == INADDR_ANY) 1045 inp->inp_laddr = if_sin->sin_addr; 1046 KASSERT(inp->inp_faddr.s_addr == sin->sin_addr.s_addr, 1047 ("faddr mismatch for reconnect")); 1048 KASSERT(inp->inp_fport == sin->sin_port, 1049 ("fport mismatch for reconnect")); 1050 in_pcbinsconnhash(inp); 1051 1052 inp->inp_flags |= INP_HASH; 1053 inp->inp_hashval = hash; 1054 1055 /* 1056 * We are now on the inpcb's owner CPU, if the cached route was 1057 * freed because the rtentry's owner CPU is not the current CPU 1058 * (e.g. in tcp_connect()), then we try to reallocate it here with 1059 * the hope that a rtentry may be cloned from a RTF_PRCLONING 1060 * rtentry. 1061 */ 1062 if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/ 1063 ro->ro_rt == NULL) { 1064 bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); 1065 ro->ro_dst.sa_family = AF_INET; 1066 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 1067 ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = 1068 sin->sin_addr; 1069 rtalloc(ro); 1070 } 1071 1072 /* 1073 * Now that no more errors can occur, change the protocol processing 1074 * port to the current thread (which is the correct thread). 1075 * 1076 * Create TCP timer message now; we are on the tcpcb's owner 1077 * CPU/thread. 1078 */ 1079 tcp_create_timermsg(tp, &curthread->td_msgport); 1080 1081 /* 1082 * Compute window scaling to request. Use a larger scaling then 1083 * needed for the initial receive buffer in case the receive buffer 1084 * gets expanded. 1085 */ 1086 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1087 tp->request_r_scale = TCP_MIN_WINSHIFT; 1088 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1089 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat 1090 ) { 1091 tp->request_r_scale++; 1092 } 1093 1094 soisconnecting(so); 1095 tcpstat.tcps_connattempt++; 1096 TCP_STATE_CHANGE(tp, TCPS_SYN_SENT); 1097 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1098 tp->iss = tcp_new_isn(tp); 1099 tcp_sendseqinit(tp); 1100 if (m) { 1101 ssb_appendstream(&so->so_snd, m); 1102 m = NULL; 1103 if (flags & PRUS_OOB) 1104 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1105 } 1106 1107 /* 1108 * Close the send side of the connection after 1109 * the data is sent if flagged. 1110 */ 1111 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1112 socantsendmore(so); 1113 tp = tcp_usrclosed(tp); 1114 } 1115 return (tcp_output(tp)); 1116 } 1117 1118 /* 1119 * Common subroutine to open a TCP connection to remote host specified 1120 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1121 * port number if needed. Call in_pcbladdr to do the routing and to choose 1122 * a local host address (interface). 1123 * Initialize connection parameters and enter SYN-SENT state. 1124 */ 1125 static void 1126 tcp_connect(netmsg_t msg) 1127 { 1128 struct socket *so = msg->connect.base.nm_so; 1129 struct sockaddr *nam = msg->connect.nm_nam; 1130 struct thread *td = msg->connect.nm_td; 1131 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1132 struct sockaddr_in *if_sin = NULL; 1133 struct inpcb *inp; 1134 struct tcpcb *tp; 1135 int error; 1136 uint16_t hash; 1137 lwkt_port_t port; 1138 1139 COMMON_START(so, inp, 0); 1140 1141 /* 1142 * Reconnect our pcb if we have to 1143 */ 1144 if (msg->connect.nm_flags & PRUC_RECONNECT) { 1145 msg->connect.nm_flags &= ~PRUC_RECONNECT; 1146 TCP_STATE_MIGRATE_END(tp); 1147 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1148 } else { 1149 if (inp->inp_faddr.s_addr != INADDR_ANY) { 1150 kprintf("inpcb %p, double-connect race\n", inp); 1151 error = EISCONN; 1152 if (so->so_state & SS_ISCONNECTING) 1153 error = EALREADY; 1154 goto out; 1155 } 1156 KASSERT(inp->inp_fport == 0, ("invalid fport")); 1157 } 1158 1159 /* 1160 * Select local port, if it is not yet selected. 1161 */ 1162 if (inp->inp_lport == 0) { 1163 KKASSERT(inp->inp_laddr.s_addr == INADDR_ANY); 1164 1165 error = in_pcbladdr(inp, nam, &if_sin, td); 1166 if (error) 1167 goto out; 1168 inp->inp_laddr.s_addr = if_sin->sin_addr.s_addr; 1169 msg->connect.nm_flags |= PRUC_HASLADDR; 1170 1171 /* 1172 * Install faddr/fport earlier, so that when this 1173 * inpcb is installed on to the lport hash, the 1174 * 4-tuple contains correct value. 1175 * 1176 * NOTE: The faddr/fport will have to be installed 1177 * after the in_pcbladdr(), which may change them. 1178 */ 1179 inp->inp_faddr = sin->sin_addr; 1180 inp->inp_fport = sin->sin_port; 1181 1182 error = in_pcbbind_remote(inp, nam, td); 1183 if (error) 1184 goto out; 1185 } 1186 1187 if ((msg->connect.nm_flags & PRUC_HASLADDR) == 0) { 1188 /* 1189 * Rarely used path: 1190 * This inpcb was bound before this connect. 1191 */ 1192 error = in_pcbladdr(inp, nam, &if_sin, td); 1193 if (error) 1194 goto out; 1195 1196 /* 1197 * Save or refresh the faddr/fport, since they may 1198 * be changed by in_pcbladdr(). 1199 */ 1200 inp->inp_faddr = sin->sin_addr; 1201 inp->inp_fport = sin->sin_port; 1202 } 1203 #ifdef INVARIANTS 1204 else { 1205 KASSERT(inp->inp_faddr.s_addr == sin->sin_addr.s_addr, 1206 ("faddr mismatch for reconnect")); 1207 KASSERT(inp->inp_fport == sin->sin_port, 1208 ("fport mismatch for reconnect")); 1209 } 1210 #endif 1211 KKASSERT(inp->inp_socket == so); 1212 1213 hash = tcp_addrhash(sin->sin_addr.s_addr, sin->sin_port, 1214 (inp->inp_laddr.s_addr != INADDR_ANY ? 1215 inp->inp_laddr.s_addr : if_sin->sin_addr.s_addr), 1216 inp->inp_lport); 1217 port = netisr_hashport(hash); 1218 1219 if (port != &curthread->td_msgport) { 1220 lwkt_msg_t lmsg = &msg->connect.base.lmsg; 1221 1222 /* 1223 * in_pcbladdr() may have allocated a route entry for us 1224 * on the current CPU, but we need a route entry on the 1225 * inpcb's owner CPU, so free it here. 1226 */ 1227 in_pcbresetroute(inp); 1228 1229 /* 1230 * We are moving the protocol processing port the socket 1231 * is on, we have to unlink here and re-link on the 1232 * target cpu. 1233 */ 1234 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1235 msg->connect.nm_flags |= PRUC_RECONNECT; 1236 msg->connect.base.nm_dispatch = tcp_connect; 1237 1238 TCP_STATE_MIGRATE_START(tp); 1239 1240 /* 1241 * Use message put done receipt to change this socket's 1242 * so_port, i.e. _after_ this message was put onto the 1243 * target netisr's msgport but _before_ the message could 1244 * be pulled from the target netisr's msgport, so that: 1245 * - The upper half (socket code) will not see the new 1246 * msgport before this message reaches the new msgport 1247 * and messages for this socket will be ordered. 1248 * - This message will see the new msgport, when its 1249 * handler is called in the target netisr. 1250 * 1251 * NOTE: 1252 * We MUST use messege put done receipt to change this 1253 * socket's so_port: 1254 * If we changed the so_port in this netisr after the 1255 * lwkt_forwardmsg (so messages for this socket will be 1256 * ordered) and changed the so_port in the target netisr 1257 * at the very beginning of this message's handler, we 1258 * would suffer so_port overwritten race, given this 1259 * message might be forwarded again. 1260 * 1261 * NOTE: 1262 * This mechanism depends on that the netisr's msgport 1263 * is spin msgport (currently it is :). 1264 * 1265 * If the upper half saw the new msgport before this 1266 * message reached the target netisr's msgport, the 1267 * messages sent from the upper half could reach the new 1268 * msgport before this message, thus there would be 1269 * message reordering. The worst case could be soclose() 1270 * saw the new msgport and the detach message could reach 1271 * the new msgport before this message, i.e. the inpcb 1272 * could have been destroyed when this message was still 1273 * pending on or on its way to the new msgport. Other 1274 * weird cases could also happen, e.g. inpcb->inp_pcbinfo, 1275 * since we have unlinked this inpcb from the current 1276 * pcbinfo first. 1277 */ 1278 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 1279 lwkt_forwardmsg(port, lmsg); 1280 /* msg invalid now */ 1281 return; 1282 } else if (msg->connect.nm_flags & PRUC_HELDTD) { 1283 /* 1284 * The original thread is no longer needed; release it. 1285 */ 1286 lwkt_rele(td); 1287 msg->connect.nm_flags &= ~PRUC_HELDTD; 1288 } 1289 error = tcp_connect_oncpu(tp, msg->connect.nm_sndflags, 1290 msg->connect.nm_m, sin, if_sin, hash); 1291 msg->connect.nm_m = NULL; 1292 out: 1293 if (msg->connect.nm_m) { 1294 m_freem(msg->connect.nm_m); 1295 msg->connect.nm_m = NULL; 1296 } 1297 if (msg->connect.nm_flags & PRUC_HELDTD) 1298 lwkt_rele(td); 1299 if (error && (msg->connect.nm_flags & PRUC_ASYNC)) { 1300 so->so_error = error; 1301 soisdisconnected(so); 1302 } 1303 lwkt_replymsg(&msg->connect.base.lmsg, error); 1304 /* msg invalid now */ 1305 } 1306 1307 #ifdef INET6 1308 1309 static void 1310 tcp6_connect(netmsg_t msg) 1311 { 1312 struct tcpcb *tp; 1313 struct socket *so = msg->connect.base.nm_so; 1314 struct sockaddr *nam = msg->connect.nm_nam; 1315 struct thread *td = msg->connect.nm_td; 1316 struct inpcb *inp; 1317 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 1318 struct in6_addr *addr6; 1319 lwkt_port_t port; 1320 int error; 1321 1322 COMMON_START(so, inp, 0); 1323 1324 /* 1325 * Reconnect our pcb if we have to 1326 */ 1327 if (msg->connect.nm_flags & PRUC_RECONNECT) { 1328 msg->connect.nm_flags &= ~PRUC_RECONNECT; 1329 TCP_STATE_MIGRATE_END(tp); 1330 in_pcblink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1331 } 1332 1333 /* 1334 * Bind if we have to 1335 */ 1336 if (inp->inp_lport == 0) { 1337 error = in6_pcbbind(inp, NULL, td); 1338 if (error) 1339 goto out; 1340 } 1341 1342 /* 1343 * Cannot simply call in_pcbconnect, because there might be an 1344 * earlier incarnation of this same connection still in 1345 * TIME_WAIT state, creating an ADDRINUSE error. 1346 */ 1347 error = in6_pcbladdr(inp, nam, &addr6, td); 1348 if (error) 1349 goto out; 1350 1351 port = tcp6_addrport(); /* XXX hack for now, always cpu0 */ 1352 1353 if (port != &curthread->td_msgport) { 1354 lwkt_msg_t lmsg = &msg->connect.base.lmsg; 1355 1356 /* 1357 * in_pcbladdr() may have allocated a route entry for us 1358 * on the current CPU, but we need a route entry on the 1359 * inpcb's owner CPU, so free it here. 1360 */ 1361 in_pcbresetroute(inp); 1362 1363 in_pcbunlink(so->so_pcb, &tcbinfo[mycpu->gd_cpuid]); 1364 msg->connect.nm_flags |= PRUC_RECONNECT; 1365 msg->connect.base.nm_dispatch = tcp6_connect; 1366 1367 TCP_STATE_MIGRATE_START(tp); 1368 1369 /* See the related comment in tcp_connect() */ 1370 lwkt_setmsg_receipt(lmsg, tcp_sosetport); 1371 lwkt_forwardmsg(port, lmsg); 1372 /* msg invalid now */ 1373 return; 1374 } 1375 error = tcp6_connect_oncpu(tp, msg->connect.nm_sndflags, 1376 &msg->connect.nm_m, sin6, addr6); 1377 /* nm_m may still be intact */ 1378 out: 1379 if (msg->connect.nm_m) { 1380 m_freem(msg->connect.nm_m); 1381 msg->connect.nm_m = NULL; 1382 } 1383 lwkt_replymsg(&msg->connect.base.lmsg, error); 1384 /* msg invalid now */ 1385 } 1386 1387 static int 1388 tcp6_connect_oncpu(struct tcpcb *tp, int flags, struct mbuf **mp, 1389 struct sockaddr_in6 *sin6, struct in6_addr *addr6) 1390 { 1391 struct mbuf *m = *mp; 1392 struct inpcb *inp = tp->t_inpcb; 1393 struct socket *so = inp->inp_socket; 1394 struct inpcb *oinp; 1395 1396 /* 1397 * Cannot simply call in_pcbconnect, because there might be an 1398 * earlier incarnation of this same connection still in 1399 * TIME_WAIT state, creating an ADDRINUSE error. 1400 */ 1401 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 1402 &sin6->sin6_addr, sin6->sin6_port, 1403 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? 1404 addr6 : &inp->in6p_laddr), 1405 inp->inp_lport, 0, NULL); 1406 if (oinp) 1407 return (EADDRINUSE); 1408 1409 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1410 inp->in6p_laddr = *addr6; 1411 inp->in6p_faddr = sin6->sin6_addr; 1412 inp->inp_fport = sin6->sin6_port; 1413 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 1414 inp->in6p_flowinfo = sin6->sin6_flowinfo; 1415 in_pcbinsconnhash(inp); 1416 1417 /* 1418 * Now that no more errors can occur, change the protocol processing 1419 * port to the current thread (which is the correct thread). 1420 * 1421 * Create TCP timer message now; we are on the tcpcb's owner 1422 * CPU/thread. 1423 */ 1424 tcp_create_timermsg(tp, &curthread->td_msgport); 1425 1426 /* Compute window scaling to request. */ 1427 if (tp->request_r_scale < TCP_MIN_WINSHIFT) 1428 tp->request_r_scale = TCP_MIN_WINSHIFT; 1429 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1430 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.ssb_hiwat) { 1431 tp->request_r_scale++; 1432 } 1433 1434 soisconnecting(so); 1435 tcpstat.tcps_connattempt++; 1436 TCP_STATE_CHANGE(tp, TCPS_SYN_SENT); 1437 tcp_callout_reset(tp, tp->tt_keep, tp->t_keepinit, tcp_timer_keep); 1438 tp->iss = tcp_new_isn(tp); 1439 tcp_sendseqinit(tp); 1440 if (m) { 1441 ssb_appendstream(&so->so_snd, m); 1442 *mp = NULL; 1443 if (flags & PRUS_OOB) 1444 tp->snd_up = tp->snd_una + so->so_snd.ssb_cc; 1445 } 1446 1447 /* 1448 * Close the send side of the connection after 1449 * the data is sent if flagged. 1450 */ 1451 if ((flags & (PRUS_OOB|PRUS_EOF)) == PRUS_EOF) { 1452 socantsendmore(so); 1453 tp = tcp_usrclosed(tp); 1454 } 1455 return (tcp_output(tp)); 1456 } 1457 1458 #endif /* INET6 */ 1459 1460 /* 1461 * The new sockopt interface makes it possible for us to block in the 1462 * copyin/out step (if we take a page fault). Taking a page fault while 1463 * in a critical section is probably a Bad Thing. (Since sockets and pcbs 1464 * both now use TSM, there probably isn't any need for this function to 1465 * run in a critical section any more. This needs more examination.) 1466 */ 1467 void 1468 tcp_ctloutput(netmsg_t msg) 1469 { 1470 struct socket *so = msg->base.nm_so; 1471 struct sockopt *sopt = msg->ctloutput.nm_sopt; 1472 struct thread *td = NULL; 1473 int error, opt, optval, opthz; 1474 struct inpcb *inp; 1475 struct tcpcb *tp; 1476 1477 if (msg->ctloutput.nm_flags & PRCO_HELDTD) 1478 td = sopt->sopt_td; 1479 1480 error = 0; 1481 inp = so->so_pcb; 1482 if (inp == NULL) { 1483 error = ECONNRESET; 1484 goto done; 1485 } 1486 tp = intotcpcb(inp); 1487 1488 /* Get socket's owner cpuid hint */ 1489 if (sopt->sopt_level == SOL_SOCKET && 1490 sopt->sopt_dir == SOPT_GET && 1491 sopt->sopt_name == SO_CPUHINT) { 1492 if (tp->t_flags & TF_LISTEN) { 1493 /* 1494 * Listen sockets owner cpuid is always 0, 1495 * which does not make sense if SO_REUSEPORT 1496 * is not set. 1497 * 1498 * NOTE: inp_lgrpindex is _not_ assigned in jail. 1499 */ 1500 if ((so->so_options & SO_REUSEPORT) && 1501 inp->inp_lgrpindex >= 0) 1502 optval = inp->inp_lgrpindex % netisr_ncpus; 1503 else 1504 optval = -1; /* no hint */ 1505 } else { 1506 optval = mycpuid; 1507 } 1508 soopt_from_kbuf(sopt, &optval, sizeof(optval)); 1509 goto done; 1510 } 1511 1512 if (sopt->sopt_level != IPPROTO_TCP) { 1513 if (sopt->sopt_level == IPPROTO_IP) { 1514 switch (sopt->sopt_name) { 1515 case IP_MULTICAST_IF: 1516 case IP_MULTICAST_VIF: 1517 case IP_MULTICAST_TTL: 1518 case IP_MULTICAST_LOOP: 1519 case IP_ADD_MEMBERSHIP: 1520 case IP_DROP_MEMBERSHIP: 1521 /* 1522 * Multicast does not make sense on 1523 * TCP sockets. 1524 */ 1525 error = EOPNOTSUPP; 1526 goto done; 1527 } 1528 } 1529 #ifdef INET6 1530 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1531 ip6_ctloutput_dispatch(msg); 1532 else 1533 #endif /* INET6 */ 1534 ip_ctloutput(msg); 1535 /* msg invalid now */ 1536 if (td != NULL) 1537 lwkt_rele(td); 1538 return; 1539 } 1540 1541 switch (sopt->sopt_dir) { 1542 case SOPT_SET: 1543 error = soopt_to_kbuf(sopt, &optval, sizeof optval, 1544 sizeof optval); 1545 if (error) 1546 break; 1547 switch (sopt->sopt_name) { 1548 case TCP_FASTKEEP: 1549 if (optval > 0) 1550 tp->t_keepidle = tp->t_keepintvl; 1551 else 1552 tp->t_keepidle = tcp_keepidle; 1553 tcp_timer_keep_activity(tp, 0); 1554 break; 1555 #ifdef TCP_SIGNATURE 1556 case TCP_SIGNATURE_ENABLE: 1557 if (tp->t_state == TCPS_CLOSED) { 1558 /* 1559 * This is the only safe state that this 1560 * option could be changed. Some segments 1561 * could already have been sent in other 1562 * states. 1563 */ 1564 if (optval > 0) 1565 tp->t_flags |= TF_SIGNATURE; 1566 else 1567 tp->t_flags &= ~TF_SIGNATURE; 1568 } else { 1569 error = EOPNOTSUPP; 1570 } 1571 break; 1572 #endif /* TCP_SIGNATURE */ 1573 case TCP_NODELAY: 1574 case TCP_NOOPT: 1575 switch (sopt->sopt_name) { 1576 case TCP_NODELAY: 1577 opt = TF_NODELAY; 1578 break; 1579 case TCP_NOOPT: 1580 opt = TF_NOOPT; 1581 break; 1582 default: 1583 opt = 0; /* dead code to fool gcc */ 1584 break; 1585 } 1586 1587 if (optval) 1588 tp->t_flags |= opt; 1589 else 1590 tp->t_flags &= ~opt; 1591 break; 1592 1593 case TCP_NOPUSH: 1594 if (tcp_disable_nopush) 1595 break; 1596 if (optval) 1597 tp->t_flags |= TF_NOPUSH; 1598 else { 1599 tp->t_flags &= ~TF_NOPUSH; 1600 error = tcp_output(tp); 1601 } 1602 break; 1603 1604 case TCP_MAXSEG: 1605 /* 1606 * Must be between 0 and maxseg. If the requested 1607 * maxseg is too small to satisfy the desired minmss, 1608 * pump it up (silently so sysctl modifications of 1609 * minmss do not create unexpected program failures). 1610 * Handle degenerate cases. 1611 */ 1612 if (optval > 0 && optval <= tp->t_maxseg) { 1613 if (optval + 40 < tcp_minmss) { 1614 optval = tcp_minmss - 40; 1615 if (optval < 0) 1616 optval = 1; 1617 } 1618 tp->t_maxseg = optval; 1619 } else { 1620 error = EINVAL; 1621 } 1622 break; 1623 1624 case TCP_KEEPINIT: 1625 opthz = ((int64_t)optval * hz) / 1000; 1626 if (opthz >= 1) 1627 tp->t_keepinit = opthz; 1628 else 1629 error = EINVAL; 1630 break; 1631 1632 case TCP_KEEPIDLE: 1633 opthz = ((int64_t)optval * hz) / 1000; 1634 if (opthz >= 1) { 1635 tp->t_keepidle = opthz; 1636 tcp_timer_keep_activity(tp, 0); 1637 } else { 1638 error = EINVAL; 1639 } 1640 break; 1641 1642 case TCP_KEEPINTVL: 1643 opthz = ((int64_t)optval * hz) / 1000; 1644 if (opthz >= 1) { 1645 tp->t_keepintvl = opthz; 1646 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1647 } else { 1648 error = EINVAL; 1649 } 1650 break; 1651 1652 case TCP_KEEPCNT: 1653 if (optval > 0) { 1654 tp->t_keepcnt = optval; 1655 tp->t_maxidle = tp->t_keepintvl * tp->t_keepcnt; 1656 } else { 1657 error = EINVAL; 1658 } 1659 break; 1660 1661 default: 1662 error = ENOPROTOOPT; 1663 break; 1664 } 1665 break; 1666 1667 case SOPT_GET: 1668 switch (sopt->sopt_name) { 1669 #ifdef TCP_SIGNATURE 1670 case TCP_SIGNATURE_ENABLE: 1671 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1672 break; 1673 #endif /* TCP_SIGNATURE */ 1674 case TCP_NODELAY: 1675 optval = tp->t_flags & TF_NODELAY; 1676 break; 1677 case TCP_MAXSEG: 1678 optval = tp->t_maxseg; 1679 break; 1680 case TCP_NOOPT: 1681 optval = tp->t_flags & TF_NOOPT; 1682 break; 1683 case TCP_NOPUSH: 1684 optval = tp->t_flags & TF_NOPUSH; 1685 break; 1686 case TCP_KEEPINIT: 1687 optval = ((int64_t)tp->t_keepinit * 1000) / hz; 1688 break; 1689 case TCP_KEEPIDLE: 1690 optval = ((int64_t)tp->t_keepidle * 1000) / hz; 1691 break; 1692 case TCP_KEEPINTVL: 1693 optval = ((int64_t)tp->t_keepintvl * 1000) / hz; 1694 break; 1695 case TCP_KEEPCNT: 1696 optval = tp->t_keepcnt; 1697 break; 1698 default: 1699 error = ENOPROTOOPT; 1700 break; 1701 } 1702 if (error == 0) 1703 soopt_from_kbuf(sopt, &optval, sizeof optval); 1704 break; 1705 } 1706 done: 1707 if (td != NULL) 1708 lwkt_rele(td); 1709 lwkt_replymsg(&msg->lmsg, error); 1710 } 1711 1712 struct netmsg_tcp_ctloutput { 1713 struct netmsg_pr_ctloutput ctloutput; 1714 struct sockopt sopt; 1715 int sopt_val; 1716 }; 1717 1718 /* 1719 * Allocate netmsg_pr_ctloutput for asynchronous tcp_ctloutput. 1720 */ 1721 struct netmsg_pr_ctloutput * 1722 tcp_ctloutmsg(struct sockopt *sopt) 1723 { 1724 struct netmsg_tcp_ctloutput *msg; 1725 int flags = 0, error; 1726 1727 KASSERT(sopt->sopt_dir == SOPT_SET, ("not from ctloutput")); 1728 1729 /* Only small set of options allows asynchronous setting. */ 1730 if (sopt->sopt_level != IPPROTO_TCP) 1731 return NULL; 1732 switch (sopt->sopt_name) { 1733 case TCP_NODELAY: 1734 case TCP_NOOPT: 1735 case TCP_NOPUSH: 1736 case TCP_FASTKEEP: 1737 break; 1738 default: 1739 return NULL; 1740 } 1741 1742 msg = kmalloc(sizeof(*msg), M_LWKTMSG, M_WAITOK | M_NULLOK); 1743 if (msg == NULL) { 1744 /* Fallback to synchronous tcp_ctloutput */ 1745 return NULL; 1746 } 1747 1748 /* Save the sockopt */ 1749 msg->sopt = *sopt; 1750 1751 /* Fixup the sopt.sopt_val ptr */ 1752 error = sooptcopyin(sopt, &msg->sopt_val, 1753 sizeof(msg->sopt_val), sizeof(msg->sopt_val)); 1754 if (error) { 1755 kfree(msg, M_LWKTMSG); 1756 return NULL; 1757 } 1758 msg->sopt.sopt_val = &msg->sopt_val; 1759 1760 /* Hold the current thread */ 1761 if (msg->sopt.sopt_td != NULL) { 1762 flags |= PRCO_HELDTD; 1763 lwkt_hold(msg->sopt.sopt_td); 1764 } 1765 1766 msg->ctloutput.nm_flags = flags; 1767 msg->ctloutput.nm_sopt = &msg->sopt; 1768 1769 return &msg->ctloutput; 1770 } 1771 1772 /* 1773 * tcp_sendspace and tcp_recvspace are the default send and receive window 1774 * sizes, respectively. These are obsolescent (this information should 1775 * be set by the route). 1776 * 1777 * Use a default that does not require tcp window scaling to be turned 1778 * on. Individual programs or the administrator can increase the default. 1779 */ 1780 u_long tcp_sendspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1781 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1782 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1783 u_long tcp_recvspace = 57344; /* largest multiple of PAGE_SIZE < 64k */ 1784 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1785 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1786 1787 /* 1788 * Attach TCP protocol to socket, allocating internet protocol control 1789 * block, tcp control block, buffer space, and entering CLOSED state. 1790 */ 1791 static int 1792 tcp_attach(struct socket *so, struct pru_attach_info *ai) 1793 { 1794 struct inpcb *inp; 1795 int error; 1796 int cpu; 1797 #ifdef INET6 1798 boolean_t isipv6 = INP_CHECK_SOCKAF(so, AF_INET6); 1799 #endif 1800 1801 if (ai != NULL) { 1802 error = tcp_usr_preattach(so, 0 /* don't care */, ai); 1803 if (error) 1804 return (error); 1805 } else { 1806 /* Post attach; do nothing */ 1807 } 1808 1809 cpu = mycpu->gd_cpuid; 1810 1811 /* 1812 * Set the default pcbinfo. This will likely change when we 1813 * bind/connect. 1814 */ 1815 error = in_pcballoc(so, &tcbinfo[cpu]); 1816 if (error) 1817 return (error); 1818 inp = so->so_pcb; 1819 #ifdef INET6 1820 if (isipv6) 1821 inp->in6p_hops = -1; /* use kernel default */ 1822 #endif 1823 tcp_newtcpcb(inp); 1824 /* Keep a reference for asynchronized pru_rcvd */ 1825 soreference(so); 1826 return (0); 1827 } 1828 1829 /* 1830 * Initiate (or continue) disconnect. 1831 * If embryonic state, just send reset (once). 1832 * If in ``let data drain'' option and linger null, just drop. 1833 * Otherwise (hard), mark socket disconnecting and drop 1834 * current input data; switch states based on user close, and 1835 * send segment to peer (with FIN). 1836 */ 1837 static struct tcpcb * 1838 tcp_disconnect(struct tcpcb *tp) 1839 { 1840 struct socket *so = tp->t_inpcb->inp_socket; 1841 1842 if (tp->t_state < TCPS_ESTABLISHED) { 1843 tp = tcp_close(tp); 1844 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1845 tp = tcp_drop(tp, 0); 1846 } else { 1847 lwkt_gettoken(&so->so_rcv.ssb_token); 1848 soisdisconnecting(so); 1849 sbflush(&so->so_rcv.sb); 1850 tp = tcp_usrclosed(tp); 1851 if (tp) 1852 tcp_output(tp); 1853 lwkt_reltoken(&so->so_rcv.ssb_token); 1854 } 1855 return (tp); 1856 } 1857 1858 /* 1859 * User issued close, and wish to trail through shutdown states: 1860 * if never received SYN, just forget it. If got a SYN from peer, 1861 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1862 * If already got a FIN from peer, then almost done; go to LAST_ACK 1863 * state. In all other cases, have already sent FIN to peer (e.g. 1864 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1865 * for peer to send FIN or not respond to keep-alives, etc. 1866 * We can let the user exit from the close as soon as the FIN is acked. 1867 */ 1868 static struct tcpcb * 1869 tcp_usrclosed(struct tcpcb *tp) 1870 { 1871 1872 switch (tp->t_state) { 1873 1874 case TCPS_CLOSED: 1875 case TCPS_LISTEN: 1876 TCP_STATE_CHANGE(tp, TCPS_CLOSED); 1877 tp = tcp_close(tp); 1878 break; 1879 1880 case TCPS_SYN_SENT: 1881 case TCPS_SYN_RECEIVED: 1882 tp->t_flags |= TF_NEEDFIN; 1883 break; 1884 1885 case TCPS_ESTABLISHED: 1886 TCP_STATE_CHANGE(tp, TCPS_FIN_WAIT_1); 1887 break; 1888 1889 case TCPS_CLOSE_WAIT: 1890 TCP_STATE_CHANGE(tp, TCPS_LAST_ACK); 1891 break; 1892 } 1893 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1894 soisdisconnected(tp->t_inpcb->inp_socket); 1895 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1896 if (tp->t_state == TCPS_FIN_WAIT_2) { 1897 tcp_callout_reset(tp, tp->tt_2msl, tp->t_maxidle, 1898 tcp_timer_2msl); 1899 } 1900 } 1901 return (tp); 1902 } 1903