/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 */

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

#ifdef INET
extern int tcp_sosend_agglim;
extern int tcp_sosend_async;
extern int tcp_sosend_jcluster;
extern int udp_sosend_async;
extern int udp_sosend_prepend;

static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static int	soclose_sync(struct socket *so, int fflag);
static void	soclose_fast(struct socket *so);

static struct filterops solisten_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };

MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

static int use_soclose_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
    &use_soclose_fast, 0, "Fast socket close");

int use_soaccept_pred_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
    &use_soaccept_pred_fast, 0, "Fast socket accept prediction");

int use_sendfile_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
    &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");

int use_soconnect_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
    &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");
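
/*
 * Illustrative only: each knob above is exported under kern.ipc and can be
 * inspected or tuned at run time, e.g. (a sketch; the right values are
 * workload-dependent):
 *
 *	sysctl kern.ipc.somaxconn		# read the listen-queue clamp
 *	sysctl kern.ipc.somaxconn=256		# raise it
 *	sysctl kern.ipc.soclose_fast=0		# force synchronous close
 */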

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, struct protosw *pr)
{
	struct socket *so;
	unsigned waitmask;

	waitmask = waitok ? M_WAITOK : M_NOWAIT;
	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
	if (so) {
		/* XXX race condition for reentrant kernel */
		so->so_proto = pr;
		TAILQ_INIT(&so->so_aiojobq);
		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
		spin_init(&so->so_rcvd_spin, "soalloc");
		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
			    MSGF_DROPABLE | MSGF_PRIORITY,
			    so->so_proto->pr_usrreqs->pru_rcvd);
		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
		so->so_state = SS_NOFDREF;
		so->so_refs = 1;
	}
	return so;
}

int
socreate(int dom, struct socket **aso, int type,
    int proto, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct protosw *prp;
	struct socket *so;
	struct pru_attach_info ai;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_INET6 &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != NULL, prp);
	if (so == NULL)
		return (ENOBUFS);

	/*
	 * Callers of socreate() presumably will connect up a descriptor
	 * and call soclose() if they cannot.  This represents our so_refs
	 * (which should be 1) from soalloc().
	 */
	soclrstate(so, SS_NOFDREF);

	/*
	 * Set a default port for protocol processing.  No action will occur
	 * on the socket on this port until an inpcb is attached to it and
	 * is able to match incoming packets, or until the socket becomes
	 * available to userland.
	 *
	 * We normally default the socket to the protocol thread on cpu 0,
	 * if the protocol does not provide its own method to initialize the
	 * default port.
	 *
	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
	 * thread and all pr_*()/pru_*() calls are executed synchronously.
	 */
	if (prp->pr_flags & PR_SYNC_PORT)
		so->so_port = &netisr_sync_port;
	else if (prp->pr_initport != NULL)
		so->so_port = prp->pr_initport();
	else
		so->so_port = netisr_cpuport(0);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(p->p_ucred);
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
	ai.p_ucred = p->p_ucred;
	ai.fd_rdir = p->p_fd->fd_rdir;

	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = so_pru_attach(so, proto, &ai);
	if (error) {
		sosetstate(so, SS_NOFDREF);
		sofree(so);	/* from soalloc */
		return error;
	}

	/*
	 * NOTE: Returns referenced socket.
	 */
	*aso = so;
	return (0);
}
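
/*
 * Illustrative only: a minimal in-kernel consumer of socreate()/soclose(),
 * sketched under the assumption that the caller runs in a thread context
 * with a valid td_proc (socreate() dereferences it for credentials and
 * resource limits).  On failure there is no socket to clean up:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, curthread);
 *	if (error == 0) {
 *		...use the referenced socket...
 *		soclose(so, FNONBLOCK);
 *	}
 */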

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = so_pru_bind(so, nam, td);
	return (error);
}

static void
sodealloc(struct socket *so)
{
	KKASSERT((so->so_state & (SS_INCOMP | SS_COMP)) == 0);
	/* TODO: assert accept queues are empty, after unix socket is fixed */

	if (so->so_rcv.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if present */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif /* INET */
	crfree(so->so_cred);
	if (so->so_faddr != NULL)
		kfree(so->so_faddr, M_SONAME);
	kfree(so, M_SOCKET);
}

int
solisten(struct socket *so, int backlog, struct thread *td)
{
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
		return (EINVAL);

	lwkt_gettoken(&so->so_rcv.ssb_token);
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	return so_pru_listen(so, td);
}

static void
soqflush(struct socket *so)
{
	lwkt_getpooltoken(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;

		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			KKASSERT((sp->so_state & (SS_INCOMP | SS_COMP)) ==
			    SS_INCOMP);
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			soclrstate(sp, SS_INCOMP);
			soabort_async(sp, TRUE);
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			KKASSERT((sp->so_state & (SS_INCOMP | SS_COMP)) ==
			    SS_COMP);
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			soclrstate(sp, SS_COMP);
			soabort_async(sp, TRUE);
		}
	}
	lwkt_relpooltoken(so);
}

/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *	so_pcb -	The protocol stack still has a reference
 *	SS_NOFDREF -	There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
	struct socket *head;

	/*
	 * This is a bit hackish at the moment.  We need to interlock
	 * any accept queue we are on before we potentially lose the
	 * last reference to avoid races against a re-reference from
	 * someone operating on the queue.
	 */
	while ((head = so->so_head) != NULL) {
		lwkt_getpooltoken(head);
		if (so->so_head == head)
			break;
		lwkt_relpooltoken(head);
	}

	/*
	 * Arbitrate the last free.
	 */
	KKASSERT(so->so_refs > 0);
	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
		if (head)
			lwkt_relpooltoken(head);
		return;
	}

	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

	if (head != NULL) {
		/*
		 * We're done, remove ourselves from the accept queue we are
		 * on, if we are on one.
		 */
		if (so->so_state & SS_INCOMP) {
			KKASSERT((so->so_state & (SS_INCOMP | SS_COMP)) ==
			    SS_INCOMP);
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			KKASSERT((so->so_state & (SS_INCOMP | SS_COMP)) ==
			    SS_COMP);
			lwkt_relpooltoken(head);
			return;
		} else {
			panic("sofree: not queued");
		}
		soclrstate(so, SS_INCOMP);
		so->so_head = NULL;
		lwkt_relpooltoken(head);
	} else {
		/* Flush accept queues, if we are accepting. */
		soqflush(so);
	}
	ssb_release(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int fflag)
{
	int error;

	funsetown(&so->so_sigio);
	sosetstate(so, SS_ISCLOSING);
	if (!use_soclose_fast ||
	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
	    ((so->so_state & SS_ISCONNECTED) &&
	     (so->so_options & SO_LINGER))) {
		error = soclose_sync(so, fflag);
	} else {
		soclose_fast(so);
		error = 0;
	}
	return error;
}

void
sodiscard(struct socket *so)
{
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	sosetstate(so, SS_NOFDREF);	/* take ref */
}

/*
 * Append the completed queue of head to head_inh (the inheriting listen
 * socket).
 */
void
soinherit(struct socket *head, struct socket *head_inh)
{
	boolean_t do_wakeup = FALSE;

	KASSERT(head->so_options & SO_ACCEPTCONN,
	    ("head does not accept connection"));
	KASSERT(head_inh->so_options & SO_ACCEPTCONN,
	    ("head_inh does not accept connection"));

	lwkt_getpooltoken(head);
	lwkt_getpooltoken(head_inh);

	if (head->so_qlen > 0)
		do_wakeup = TRUE;

	while (!TAILQ_EMPTY(&head->so_comp)) {
		struct ucred *old_cr;
		struct socket *sp;

		sp = TAILQ_FIRST(&head->so_comp);
		KKASSERT((sp->so_state & (SS_INCOMP | SS_COMP)) == SS_COMP);

		/*
		 * Remove this socket from the current listen socket
		 * completed queue.
		 */
		TAILQ_REMOVE(&head->so_comp, sp, so_list);
		head->so_qlen--;

		/* Save the old ucred for later free. */
		old_cr = sp->so_cred;

		/*
		 * Install this socket to the inheriting listen socket
		 * completed queue.
		 */
		sp->so_cred = crhold(head_inh->so_cred); /* non-blocking */
		sp->so_head = head_inh;

		TAILQ_INSERT_TAIL(&head_inh->so_comp, sp, so_list);
		head_inh->so_qlen++;

		/*
		 * NOTE:
		 * crfree() may block and release the tokens temporarily.
		 * However, we are fine here, since the transition is done.
		 */
		crfree(old_cr);
	}

	lwkt_relpooltoken(head_inh);
	lwkt_relpooltoken(head);

	if (do_wakeup) {
		/*
		 * "New" connections have arrived
		 */
		sorwakeup(head_inh);
		wakeup(&head_inh->so_timeo);
	}
}

static int
soclose_sync(struct socket *so, int fflag)
{
	int error = 0;

	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (fflag & FNONBLOCK))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo, PCATCH,
				    "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;

		error2 = so_pru_detach(so);
		if (error2 == EJUSTRETURN) {
			/*
			 * Protocol will call sodiscard()
			 * and sofree() for us.
			 */
			return error;
		}
		if (error == 0)
			error = error2;
	}
discard:
	sodiscard(so);
	so_pru_sync(so);	/* unpend async sending */
	sofree(so);		/* dispose of ref */

	return (error);
}

static void
soclose_sofree_async_handler(netmsg_t msg)
{
	sofree(msg->base.nm_so);
}

static void
soclose_sofree_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_sofree_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_disconn_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0)
		so_pru_disconnect_direct(so);

	if (so->so_pcb) {
		int error;

		error = so_pru_detach_direct(so);
		if (error == EJUSTRETURN) {
			/*
			 * Protocol will call sodiscard()
			 * and sofree() for us.
			 */
			return;
		}
	}

	sodiscard(so);
	sofree(so);
}

static void
soclose_disconn_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_disconn_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_detach_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if (so->so_pcb) {
		int error;

		error = so_pru_detach_direct(so);
		if (error == EJUSTRETURN) {
			/*
			 * Protocol will call sodiscard()
			 * and sofree() for us.
			 */
			return;
		}
	}

	sodiscard(so);
	sofree(so);
}

static void
soclose_detach_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_detach_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_fast(struct socket *so)
{
	if (so->so_pcb == NULL)
		goto discard;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0) {
		soclose_disconn_async(so);
		return;
	}

	if (so->so_pcb) {
		soclose_detach_async(so);
		return;
	}

discard:
	sodiscard(so);
	soclose_sofree_async(so);
}
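
/*
 * Illustrative only: the soclose_*_async() helpers above all follow the
 * same netmsg dispatch pattern, sketched here.  The work is packaged into
 * the socket's embedded so_clomsg and sent to the socket's protocol
 * thread (so_port), so soclose() can return without blocking:
 *
 *	netmsg_init(&so->so_clomsg, so, &netisr_apanic_rport, 0, handler);
 *	lwkt_sendmsg(so->so_port, &so->so_clomsg.lmsg);
 *
 * The handler then runs in the protocol thread and finishes with
 * sodiscard()/sofree(), unless the protocol takes over via EJUSTRETURN.
 */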

/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 */
void
soabort_async(struct socket *so, boolean_t clr_head)
{
	/*
	 * Keep a reference before clearing the so_head
	 * to avoid racing socket close in netisr.
	 */
	soreference(so);
	if (clr_head)
		so->so_head = NULL;
	so_pru_abort_async(so);
}

void
soabort_oncpu(struct socket *so)
{
	soreference(so);
	so_pru_abort_direct(so);
}

/*
 * so is passed in ref'd, which becomes owned by
 * the cleared SS_NOFDREF flag.
 */
void
soaccept_generic(struct socket *so)
{
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	soaccept_generic(so);
	error = so_pru_accept(so, nam);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
    boolean_t sync)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If the protocol is connection-based, we can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows the user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	     (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
			error = so_pru_connect_async(so, nam, td);
		else
			error = so_pru_connect(so, nam, td);
	}
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	error = so_pru_connect2(so1, so2);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = so_pru_disconnect(so);
bad:
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
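/*
 * Illustrative only: a typical in-kernel uio-based call into sosend(),
 * sketched with hypothetical buf/buflen variables.  The iovec and uio
 * describe the data; addr is only needed for unconnected datagram sends,
 * and both top and control are NULL in the common case:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
 *	struct uio auio;
 *
 *	auio.uio_iov = &iov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = curthread;
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, curthread);
 */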
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags,
    struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int clen = 0, error, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int pru_flags;

	if (uio) {
		resid = uio->uio_resid;
	} else {
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 *	     can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					gotoerr(ENOTCONN);
			} else if (addr == NULL)
				gotoerr(so->so_proto->pr_flags &
				    PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
		    clen > so->so_snd.ssb_hiwat) {
			gotoerr(EMSGSIZE);
		}
		space = ssb_space(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid + clen) && uio &&
		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				m = m_getl((int)resid, M_WAITOK, MT_DATA,
				    top == NULL ? M_PKTHDR : 0, &mlen);
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				if (resid < MINCLSIZE) {
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == NULL &&
					    len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
			} else if ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid == 0)) {
				/*
				 * If the user set MSG_EOF, the protocol
				 * understands this flag, and there is nothing
				 * left to send, then use PRU_SEND_EOF instead
				 * of PRU_SEND.
				 */
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
			} else {
				pru_flags = 0;
			}
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = so_pru_send(so, pru_flags, top, addr, control,
			    td);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

#ifdef INET
/*
 * A specialization of sosend() for UDP based on protocol-specific knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *	sosendallatonce() returns true,
 *	the "atomic" variable is true,
 *	and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *	PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	size_t resid;
	int error, pru_flags = 0;
	int space;

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		m_freem(control);

	KASSERT((uio && !top) || (top && !uio),
	    ("bad arguments to sosendudp"));
	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	if (so->so_state & SS_CANTSENDMORE)
		gotoerr(EPIPE);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		goto release;
	}
	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
		gotoerr(EDESTADDRREQ);
	if (resid > so->so_snd.ssb_hiwat)
		gotoerr(EMSGSIZE);
	space = ssb_space(&so->so_snd);
	if (uio && (space < 0 || (size_t)space < resid)) {
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
			gotoerr(EWOULDBLOCK);
		ssb_unlock(&so->so_snd);
		error = ssb_wait(&so->so_snd);
		if (error)
			goto out;
		goto restart;
	}

	if (uio) {
		int hdrlen = max_hdr;

		/*
		 * We try to optimize out the additional mbuf
		 * allocations in M_PREPEND() on output path, e.g.
		 * - udp_output(), when it tries to prepend protocol
		 *   headers.
		 * - Link layer output function, when it tries to
		 *   prepend link layer header.
		 *
		 * This probably will not benefit any data that will
		 * be fragmented, so this optimization is only performed
		 * when the size of data and max size of protocol+link
		 * headers fit into one mbuf cluster.
		 */
		if (uio->uio_resid > MCLBYTES - hdrlen ||
		    !udp_sosend_prepend) {
			top = m_uiomove(uio);
			if (top == NULL)
				goto release;
		} else {
			int nsize;

			top = m_getl(uio->uio_resid + hdrlen, M_WAITOK,
			    MT_DATA, M_PKTHDR, &nsize);
			KASSERT(nsize >= uio->uio_resid + hdrlen,
			    ("sosendudp invalid nsize %d, "
			     "resid %zu, hdrlen %d",
			     nsize, uio->uio_resid, hdrlen));

			top->m_len = uio->uio_resid;
			top->m_pkthdr.len = uio->uio_resid;
			top->m_data += hdrlen;

			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
			if (error)
				goto out;
		}
	}

	if (flags & MSG_DONTROUTE)
		pru_flags |= PRUS_DONTROUTE;

	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
		error = 0;
	} else {
		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
	}
	top = NULL;	/* sent or freed in lower layer */

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	return (error);
}
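
/*
 * Illustrative only: a worked instance of the prepend optimization above,
 * assuming a common configuration where MCLBYTES is 2048 and max_hdr is,
 * say, 78 bytes (the exact value depends on the compiled-in protocols).
 * A 1000-byte datagram satisfies 1000 <= 2048 - 78, so the payload is
 * copied into a single cluster at offset 78:
 *
 *	[ 78 bytes reserved | 1000 bytes of payload | unused ]
 *
 * udp_output() and the link layer can then prepend their headers into the
 * reserved area via M_PREPEND() without allocating another mbuf.  A
 * 3000-byte datagram fails the test and takes the m_uiomove() path.
 */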

int
sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags,
    struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int error, mlen;
	int allatonce;
	int pru_flags;

	if (uio) {
		KKASSERT(top == NULL);
		allatonce = 0;
		resid = uio->uio_resid;
	} else {
		allatonce = 1;
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 *	     can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on TCP
	 */
	if (flags & MSG_EOR) {
		error = EINVAL;
		goto out;
	}

	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
		control = NULL;
	}

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;

#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 &&
		    (so->so_state & SS_ISCONFIRMING) == 0)
			gotoerr(ENOTCONN);
		if (allatonce && resid > so->so_snd.ssb_hiwat)
			gotoerr(EMSGSIZE);

		space = ssb_space_prealloc(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid) && !allatonce &&
		    space < so->so_snd.ssb_lowat) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		do {
			int cnt = 0, async = 0;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				if (tcp_sosend_jcluster) {
					m = m_getlj((int)resid, M_WAITOK,
					    MT_DATA,
					    top == NULL ? M_PKTHDR : 0, &mlen);
				} else {
					m = m_getl((int)resid, M_WAITOK,
					    MT_DATA,
					    top == NULL ? M_PKTHDR : 0, &mlen);
				}
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0)
					break;
				++cnt;
			} while (space > 0 && cnt < tcp_sosend_agglim);

			if (tcp_sosend_async)
				async = 1;

			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
				async = 0;
			} else if ((flags & MSG_EOF) && resid == 0) {
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
				async = 1;
			} else {
				pru_flags = 0;
			}

			if (flags & MSG_SYNC)
				async = 0;

			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			for (m = top; m; m = m->m_next)
				ssb_preallocstream(&so->so_snd, m);
			if (!async) {
				error = so_pru_send(so, pru_flags, top,
				    NULL, NULL, td);
			} else {
				so_pru_send_async(so, pru_flags, top,
				    NULL, NULL, td);
				error = 0;
			}

			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
#endif

/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the signalsockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 *
 * Although the signalsockbuf is locked, new data may still be appended.
 * A token inside the ssb_lock deals with MP issues and still allows
 * the network to access the socket if we block in a uio.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
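/*
 * Illustrative only: the record layout soreceive() expects from the
 * signalsockbuf, per the comment above.  One record, linked via m_next,
 * with further records chained via m_nextpkt:
 *
 *	MT_SONAME -> MT_CONTROL ... -> MT_DATA -> MT_DATA ...
 *	(address,    (ancillary        (zero or more data mbufs)
 *	 if PR_ADDR)  data, optional)
 *
 * The address and control mbufs are consumed first, then the data mbufs
 * are copied to the uio or handed back through the sio chain.
 */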
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAITOK, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
				    (int)szmin(resid, m->m_len),
				    uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if ((so->so_state & SS_ISCONFIRMING) && resid)
		so_pru_rcvd(so, 0);

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	     ((flags & MSG_WAITALL) &&
	      resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.ssb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out of the
	 * critical section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Skip any address mbufs prepending the record.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
		if (flags & MSG_PEEK)
			m = m->m_next;
		else
			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
	}

	/*
	 * Skip any control mbufs prepending the record.
	 */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;	/* XXX race */
		} else {
			if (controlp) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb, m,
				    &free_chain);
			}
		}
		if (controlp && *controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * flag OOB data.
	 */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else {
			KASSERT(m->m_type == MT_DATA ||
			    m->m_type == MT_HEADER, ("receive 3"));
		}
		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb,
					    m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb,
					    m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, M_WAITOK);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		    resid > 0 && !sosendallatonce(so) &&
		    so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				so_pru_rcvd(so, flags);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * If an atomic read was requested but unread data still remains
	 * in the record, set MSG_TRUNC.
	 */
	if (m && pr->pr_flags & PR_ATOMIC)
		flags |= MSG_TRUNC;

	/*
	 * Cleanup.  If an atomic read was requested drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (m && (pr->pr_flags & PR_ATOMIC))
			sbdroprecord(&so->so_rcv.sb);
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
			so_pru_rcvd(so, flags);
	}

	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}

int
sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff;
	int didoob;
	size_t resid, orig_resid, restmp;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAITOK, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
				    (int)szmin(resid, m->m_len),
				    uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 *
	 * Lock a limited number of mbufs (not all, so sbcompress() still
	 * works well).  The token is used as an interlock for sbwait() so
	 * release it afterwards.
	 */
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	lwkt_gettoken(&so->so_rcv.ssb_token);
	m = so->so_rcv.ssb_mb;

	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	     ((flags & MSG_WAITALL) &&
	      resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			lwkt_reltoken(&so->so_rcv.ssb_token);
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			lwkt_reltoken(&so->so_rcv.ssb_token);
			goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			lwkt_reltoken(&so->so_rcv.ssb_token);
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0) {
			lwkt_reltoken(&so->so_rcv.ssb_token);
			goto release;
		}
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			lwkt_reltoken(&so->so_rcv.ssb_token);
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		lwkt_reltoken(&so->so_rcv.ssb_token);
		if (error)
			goto done;
		goto restart;
	}

	/*
	 * Token still held
	 */
dontblock:
	n = m;
	restmp = 0;
	while (n && restmp < resid) {
		n->m_flags |= M_SOLOCKED;
		restmp += n->m_len;
		if (n->m_next == NULL)
			n = n->m_nextpkt;
		else
			n = n->m_next;
	}

	/*
	 * Release token for loop
	 */
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out of the
	 * critical section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 *
	 * NOTE: Token is not held for loop
	 */
	moff = 0;
	offset = 0;
	didoob = 0;

	while (m && (m->m_flags & M_SOLOCKED) && resid > 0 && error == 0) {
		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
		    ("receive 3"));

		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		offset += len;
		if (len == m->m_len - moff) {
			m = m->m_next;
			moff = 0;
		} else {
			moff += len;
		}

		/*
		 * Check oobmark
		 */
		if (so->so_oobmark && offset == so->so_oobmark) {
			didoob = 1;
			break;
		}
	}

	/*
	 * Synchronize sockbuf with data we read.
	 *
	 * NOTE: (m) is junk on entry (it could be left over from the
	 *	 previous loop).
	 */
	if ((flags & MSG_PEEK) == 0) {
		lwkt_gettoken(&so->so_rcv.ssb_token);
		m = so->so_rcv.ssb_mb;
		while (m && offset >= m->m_len) {
			if (so->so_oobmark) {
				so->so_oobmark -= m->m_len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					didoob = 1;
				}
			}
			offset -= m->m_len;
			if (sio) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				sbappend(sio, m);
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb,
				    m, &free_chain);
			}
		}
		if (offset) {
			KKASSERT(m);
			if (sio) {
				n = m_copym(m, 0, offset, M_WAITOK);
				if (n)
					sbappend(sio, n);
			}
			m->m_data += offset;
			m->m_len -= offset;
			so->so_rcv.ssb_cc -= offset;
			if (so->so_oobmark) {
				so->so_oobmark -= offset;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					didoob = 1;
				}
			}
			offset = 0;
		}
		lwkt_reltoken(&so->so_rcv.ssb_token);
	}

	/*
	 * If the MSG_WAITALL flag is set (for non-atomic socket),
	 * we must not quit until resid == 0 or an error termination.
	 *
	 * If a signal/timeout occurs, return with a short count but without
	 * error.
	 *
	 * Keep signalsockbuf locked against other readers.
	 *
	 * XXX if MSG_PEEK we currently do quit.
	 */
	if ((flags & MSG_WAITALL) && !(flags & MSG_PEEK) &&
	    didoob == 0 && resid > 0 &&
	    !sosendallatonce(so)) {
		lwkt_gettoken(&so->so_rcv.ssb_token);
		error = 0;
		while ((m = so->so_rcv.ssb_mb) == NULL) {
			if (so->so_error || (so->so_state & SS_CANTRCVMORE)) {
				error = so->so_error;
				break;
			}
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (so->so_pcb)
				so_pru_rcvd_async(so);
			if (so->so_rcv.ssb_mb == NULL)
				error = ssb_wait(&so->so_rcv);
			if (error) {
				lwkt_reltoken(&so->so_rcv.ssb_token);
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
		}
		if (m && error == 0)
			goto dontblock;
		lwkt_reltoken(&so->so_rcv.ssb_token);
	}

	/*
	 * Token not held here.
	 *
	 * Cleanup.  If an atomic read was requested drop any unread data XXX
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (so->so_pcb)
			so_pru_rcvd_async(so);
	}

	if (orig_resid == resid && orig_resid &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	if (free_chain)
		m_freem(free_chain);
	return (error);
}

/*
 * Shut a socket down.  Note that we do not get a frontend lock as we
 * want to be able to shut the socket down even if another thread is
 * blocked in a read(), thus waking it up.
 */
int
soshutdown(struct socket *so, int how)
{
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR) {
		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
		sorflush(so);
		/*ssb_unlock(&so->so_rcv);*/
	}
	if (how != SHUT_RD)
		return (so_pru_shutdown(so));
	return (0);
}

void
sorflush(struct socket *so)
{
	struct signalsockbuf *ssb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct signalsockbuf asb;

	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);

	lwkt_gettoken(&ssb->ssb_token);
	socantrcvmore(so);
	asb = *ssb;

	/*
	 * Can't just blow up the ssb structure here
	 */
	bzero(&ssb->sb, sizeof(ssb->sb));
	ssb->ssb_timeo = 0;
	ssb->ssb_lowat = 0;
	ssb->ssb_hiwat = 0;
	ssb->ssb_mbmax = 0;
	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
	ssb_release(&asb, so);

	lwkt_reltoken(&ssb->ssb_token);
}

#ifdef INET
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non-listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				kfree(af->so_accept_filter_str, M_ACCF);
			}
			kfree(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			af->so_accept_filter_str = kmalloc(len, M_ACCF,
			    M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			kfree(af->so_accept_filter_str, M_ACCF);
			kfree(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		kfree(afap, M_TEMP);
	return (error);
}
#endif /* INET */
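
/*
 * Illustrative only: how a userland listener might arm an accept filter
 * handled by do_setopt_accept_filter() above.  The filter name used here
 * ("dataready") is an assumption; whichever accf module is actually
 * loaded must be named, and the socket must already be listening:
 *
 *	struct accept_filter_arg afa;
 *
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "dataready");
 *	if (setsockopt(lsock, SOL_SOCKET, SO_ACCEPTFILTER,
 *	    &afa, sizeof(afa)) < 0)
 *		err(1, "SO_ACCEPTFILTER");
 *
 * accept(2) then does not return the connection until the filter is
 * satisfied.  Removing the filter maps to a NULL sopt in the kernel
 * routine above, which also clears SO_ACCEPTFILTER.
 */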

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
        return soopt_to_kbuf(sopt, buf, len, minlen);
}

int
soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
        size_t valsize;

        KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
        KKASSERT(kva_p(buf));

        /*
         * If the user gives us more than we wanted, we ignore it,
         * but if we don't get the minimum length the caller
         * wants, we return EINVAL.  On success, sopt->sopt_valsize
         * is set to however much we actually retrieved.
         */
        if ((valsize = sopt->sopt_valsize) < minlen)
                return EINVAL;
        if (valsize > len)
                sopt->sopt_valsize = valsize = len;

        bcopy(sopt->sopt_val, buf, valsize);
        return 0;
}
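
/*
 * Illustrative sketch: the canonical consumer pattern for the copyin
 * helpers above, as used by sosetopt() below and by protocol-level
 * pr_ctloutput() routines fetching a fixed-size (here int-sized) option:
 *
 *      int optval;
 *
 *      error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *      if (error)
 *              return (error);
 *      // A user buffer shorter than sizeof(int) yields EINVAL; a longer
 *      // one is silently clamped, and sopt->sopt_valsize reports how
 *      // many bytes were actually taken.
 */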

int
sosetopt(struct socket *so, struct sockopt *sopt)
{
        int error, optval;
        struct linger l;
        struct timeval tv;
        u_long val;
        struct signalsockbuf *sotmp;

        error = 0;
        sopt->sopt_dir = SOPT_SET;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto && so->so_proto->pr_ctloutput) {
                        return (so_pr_ctloutput(so, sopt));
                }
                error = ENOPROTOOPT;
        } else {
                switch (sopt->sopt_name) {
#ifdef INET
                case SO_ACCEPTFILTER:
                        error = do_setopt_accept_filter(so, sopt);
                        if (error)
                                goto bad;
                        break;
#endif /* INET */
                case SO_LINGER:
                        error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
                        if (error)
                                goto bad;

                        so->so_linger = l.l_linger;
                        if (l.l_onoff)
                                so->so_options |= SO_LINGER;
                        else
                                so->so_options &= ~SO_LINGER;
                        break;

                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_DONTROUTE:
                case SO_USELOOPBACK:
                case SO_BROADCAST:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_NOSIGPIPE:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                                            sizeof optval);
                        if (error)
                                goto bad;
                        if (optval)
                                so->so_options |= sopt->sopt_name;
                        else
                                so->so_options &= ~sopt->sopt_name;
                        break;

                case SO_SNDBUF:
                case SO_RCVBUF:
                case SO_SNDLOWAT:
                case SO_RCVLOWAT:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                                            sizeof optval);
                        if (error)
                                goto bad;

                        /*
                         * Values < 1 make no sense for any of these
                         * options, so disallow them.
                         */
                        if (optval < 1) {
                                error = EINVAL;
                                goto bad;
                        }

                        switch (sopt->sopt_name) {
                        case SO_SNDBUF:
                        case SO_RCVBUF:
                                if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
                                    &so->so_snd : &so->so_rcv, (u_long)optval,
                                    so,
                                    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
                                        error = ENOBUFS;
                                        goto bad;
                                }
                                sotmp = (sopt->sopt_name == SO_SNDBUF) ?
                                        &so->so_snd : &so->so_rcv;
                                atomic_clear_int(&sotmp->ssb_flags,
                                                 SSB_AUTOSIZE);
                                break;

                        /*
                         * Make sure the low-water is never greater than
                         * the high-water.
                         */
                        case SO_SNDLOWAT:
                                so->so_snd.ssb_lowat =
                                    (optval > so->so_snd.ssb_hiwat) ?
                                     so->so_snd.ssb_hiwat : optval;
                                atomic_clear_int(&so->so_snd.ssb_flags,
                                                 SSB_AUTOLOWAT);
                                break;
                        case SO_RCVLOWAT:
                                so->so_rcv.ssb_lowat =
                                    (optval > so->so_rcv.ssb_hiwat) ?
                                     so->so_rcv.ssb_hiwat : optval;
                                atomic_clear_int(&so->so_rcv.ssb_flags,
                                                 SSB_AUTOLOWAT);
                                break;
                        }
                        break;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        error = sooptcopyin(sopt, &tv, sizeof tv,
                                            sizeof tv);
                        if (error)
                                goto bad;

                        /* assert(hz > 0); */
                        if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
                            tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
                                error = EDOM;
                                goto bad;
                        }
                        /* assert(tick > 0); */
                        /* assert(ULONG_MAX - INT_MAX >= 1000000); */
                        /*
                         * Convert the timeout to a tick count; ustick is
                         * the tick length in microseconds (1000000 / hz).
                         * E.g. with hz = 100 (ustick = 10000), a timeout
                         * of {2s, 500000us} becomes
                         * 2*100 + 500000/10000 = 250 ticks.
                         */
                        val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
                        if (val > INT_MAX) {
                                error = EDOM;
                                goto bad;
                        }
                        if (val == 0 && tv.tv_usec != 0)
                                val = 1;

                        switch (sopt->sopt_name) {
                        case SO_SNDTIMEO:
                                so->so_snd.ssb_timeo = val;
                                break;
                        case SO_RCVTIMEO:
                                so->so_rcv.ssb_timeo = val;
                                break;
                        }
                        break;

                default:
                        error = ENOPROTOOPT;
                        break;
                }
                if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
                        (void) so_pr_ctloutput(so, sopt);
                }
        }
bad:
        return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
        soopt_from_kbuf(sopt, buf, len);
        return 0;
}
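
/*
 * Illustrative sketch (userland, hypothetical descriptor 's'): the
 * truncating copyout implemented by soopt_from_kbuf() below is what a
 * getsockopt(2) caller observes when its buffer is too small:
 *
 *      struct linger l;
 *      socklen_t len = 1;      // deliberately smaller than sizeof(l)
 *
 *      getsockopt(s, SOL_SOCKET, SO_LINGER, &l, &len);
 *      // Only one byte of the value is copied out, and len comes back
 *      // as 1, the number of bytes actually copied.
 */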

void
soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
{
        size_t valsize;

        if (len == 0) {
                sopt->sopt_valsize = 0;
                return;
        }

        KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
        KKASSERT(kva_p(buf));

        /*
         * Documented get behavior is that we always return a value,
         * possibly truncated to fit in the user's buffer.
         * Traditional behavior is that we always tell the user
         * precisely how much we copied, rather than something useful
         * like the total amount we had available for her.
         * Note that this interface is not idempotent; the entire answer
         * must be generated ahead of time.
         */
        valsize = szmin(len, sopt->sopt_valsize);
        sopt->sopt_valsize = valsize;
        if (sopt->sopt_val != NULL) {
                bcopy(buf, sopt->sopt_val, valsize);
        }
}

int
sogetopt(struct socket *so, struct sockopt *sopt)
{
        int error, optval;
        long optval_l;
        struct linger l;
        struct timeval tv;
#ifdef INET
        struct accept_filter_arg *afap;
#endif

        error = 0;
        sopt->sopt_dir = SOPT_GET;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto && so->so_proto->pr_ctloutput) {
                        return (so_pr_ctloutput(so, sopt));
                } else
                        return (ENOPROTOOPT);
        } else {
                switch (sopt->sopt_name) {
#ifdef INET
                case SO_ACCEPTFILTER:
                        if ((so->so_options & SO_ACCEPTCONN) == 0)
                                return (EINVAL);
                        afap = kmalloc(sizeof(*afap), M_TEMP,
                                       M_WAITOK | M_ZERO);
                        if ((so->so_options & SO_ACCEPTFILTER) != 0) {
                                strcpy(afap->af_name,
                                    so->so_accf->so_accept_filter->accf_name);
                                if (so->so_accf->so_accept_filter_str != NULL)
                                        strcpy(afap->af_arg,
                                            so->so_accf->so_accept_filter_str);
                        }
                        error = sooptcopyout(sopt, afap, sizeof(*afap));
                        kfree(afap, M_TEMP);
                        break;
#endif /* INET */

                case SO_LINGER:
                        l.l_onoff = so->so_options & SO_LINGER;
                        l.l_linger = so->so_linger;
                        error = sooptcopyout(sopt, &l, sizeof l);
                        break;

                case SO_USELOOPBACK:
                case SO_DONTROUTE:
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_NOSIGPIPE:
                        optval = so->so_options & sopt->sopt_name;
integer:
                        error = sooptcopyout(sopt, &optval, sizeof optval);
                        break;

                case SO_TYPE:
                        optval = so->so_type;
                        goto integer;

                case SO_ERROR:
                        optval = so->so_error;
                        so->so_error = 0;
                        goto integer;

                case SO_SNDBUF:
                        optval = so->so_snd.ssb_hiwat;
                        goto integer;

                case SO_RCVBUF:
                        optval = so->so_rcv.ssb_hiwat;
                        goto integer;

                case SO_SNDLOWAT:
                        optval = so->so_snd.ssb_lowat;
                        goto integer;

                case SO_RCVLOWAT:
                        optval = so->so_rcv.ssb_lowat;
                        goto integer;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        optval = (sopt->sopt_name == SO_SNDTIMEO ?
                                  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);

                        tv.tv_sec = optval / hz;
                        tv.tv_usec = (optval % hz) * ustick;
                        error = sooptcopyout(sopt, &tv, sizeof tv);
                        break;

                case SO_SNDSPACE:
                        optval_l = ssb_space(&so->so_snd);
                        error = sooptcopyout(sopt, &optval_l,
                                             sizeof(optval_l));
                        break;

                case SO_CPUHINT:
                        optval = -1; /* no hint */
                        goto integer;

                default:
                        error = ENOPROTOOPT;
                        break;
                }
                if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput)
                        so_pr_ctloutput(so, sopt);
                return (error);
        }
}
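
/*
 * Illustrative sketch (userland, hypothetical descriptor 's'): SO_ERROR
 * above is read-and-clear, which is how the result of a non-blocking
 * connect(2) is conventionally collected once the socket becomes
 * writable:
 *
 *      int err;
 *      socklen_t len = sizeof(err);
 *
 *      getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
 *      // err holds the pending so_error (0 on success); sogetopt()
 *      // clears so_error as a side effect, so asking again returns 0.
 */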

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
        struct mbuf *m, *m_prev;
        int sopt_size = sopt->sopt_valsize, msize;

        m = m_getl(sopt_size, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA,
                   0, &msize);
        if (m == NULL)
                return (ENOBUFS);
        m->m_len = min(msize, sopt_size);
        sopt_size -= m->m_len;
        *mp = m;
        m_prev = m;

        while (sopt_size > 0) {
                m = m_getl(sopt_size, sopt->sopt_td ? M_WAITOK : M_NOWAIT,
                           MT_DATA, 0, &msize);
                if (m == NULL) {
                        m_freem(*mp);
                        return (ENOBUFS);
                }
                m->m_len = min(msize, sopt_size);
                sopt_size -= m->m_len;
                m_prev->m_next = m;
                m_prev = m;
        }
        return (0);
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
        soopt_to_mbuf(sopt, m);
        return 0;
}

void
soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
{
        size_t valsize;
        void *val;

        KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
        KKASSERT(kva_p(m));
        if (sopt->sopt_val == NULL)
                return;
        val = sopt->sopt_val;
        valsize = sopt->sopt_valsize;
        while (m != NULL && valsize >= m->m_len) {
                bcopy(val, mtod(m, char *), m->m_len);
                valsize -= m->m_len;
                val = (caddr_t)val + m->m_len;
                m = m->m_next;
        }
        /* should have been allocated with enough space at ip6_sooptmcopyin() */
        if (m != NULL)
                panic("ip6_sooptmcopyin");
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
        return soopt_from_mbuf(sopt, m);
}

int
soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
{
        struct mbuf *m0 = m;
        size_t valsize = 0;
        size_t maxsize;
        void *val;

        KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
        KKASSERT(kva_p(m));
        if (sopt->sopt_val == NULL)
                return 0;
        val = sopt->sopt_val;
        maxsize = sopt->sopt_valsize;
        while (m != NULL && maxsize >= m->m_len) {
                bcopy(mtod(m, char *), val, m->m_len);
                maxsize -= m->m_len;
                val = (caddr_t)val + m->m_len;
                valsize += m->m_len;
                m = m->m_next;
        }
        if (m != NULL) {
                /* a large enough soopt buffer should have been supplied
                   from user-land */
                m_freem(m0);
                return (EINVAL);
        }
        sopt->sopt_valsize = valsize;
        return 0;
}

void
sohasoutofband(struct socket *so)
{
        if (so->so_sigio != NULL)
                pgsigio(so->so_sigio, SIGURG, 0);
        KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
}

int
sokqfilter(struct file *fp, struct knote *kn)
{
        struct socket *so = (struct socket *)kn->kn_fp->f_data;
        struct signalsockbuf *ssb;

        switch (kn->kn_filter) {
        case EVFILT_READ:
                if (so->so_options & SO_ACCEPTCONN)
                        kn->kn_fop = &solisten_filtops;
                else
                        kn->kn_fop = &soread_filtops;
                ssb = &so->so_rcv;
                break;
        case EVFILT_WRITE:
                kn->kn_fop = &sowrite_filtops;
                ssb = &so->so_snd;
                break;
        case EVFILT_EXCEPT:
                kn->kn_fop = &soexcept_filtops;
                ssb = &so->so_rcv;
                break;
        default:
                return (EOPNOTSUPP);
        }

        knote_insert(&ssb->ssb_kq.ki_note, kn);
        atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
        return (0);
}

static void
filt_sordetach(struct knote *kn)
{
        struct socket *so = (struct socket *)kn->kn_fp->f_data;

        knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
        if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
                atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
}
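
/*
 * Illustrative sketch (userland, hypothetical descriptors 'kq' and 's'):
 * sokqfilter() above is the attach point for kevent(2) on sockets.  For
 * example, waiting until at least 128 bytes are buffered for reading:
 *
 *      struct kevent kev;
 *
 *      EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *      kevent(kq, &kev, 1, NULL, 0, NULL);
 *      // filt_soread() below then reports the knote active once
 *      // kn_data (buffered bytes) >= kn_sdata (here 128).
 */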

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
        struct socket *so = (struct socket *)kn->kn_fp->f_data;

        if (kn->kn_sfflags & NOTE_OOB) {
                if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) {
                        kn->kn_fflags |= NOTE_OOB;
                        return (1);
                }
                return (0);
        }
        kn->kn_data = so->so_rcv.ssb_cc;

        if (so->so_state & SS_CANTRCVMORE) {
                /*
                 * Only set NODATA if all data has been exhausted.
                 */
                if (kn->kn_data == 0)
                        kn->kn_flags |= EV_NODATA;
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
                return (1);
        }
        if (so->so_error)       /* temporary udp error */
                return (1);
        if (kn->kn_sfflags & NOTE_LOWAT)
                return (kn->kn_data >= kn->kn_sdata);
        return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
                !TAILQ_EMPTY(&so->so_comp));
}

static void
filt_sowdetach(struct knote *kn)
{
        struct socket *so = (struct socket *)kn->kn_fp->f_data;

        knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
        if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
                atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
        struct socket *so = (struct socket *)kn->kn_fp->f_data;

        kn->kn_data = ssb_space(&so->so_snd);
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= (EV_EOF | EV_NODATA);
                kn->kn_fflags = so->so_error;
                return (1);
        }
        if (so->so_error)       /* temporary udp error */
                return (1);
        if (((so->so_state & SS_ISCONNECTED) == 0) &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED))
                return (0);
        if (kn->kn_sfflags & NOTE_LOWAT)
                return (kn->kn_data >= kn->kn_sdata);
        return (kn->kn_data >= so->so_snd.ssb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
        struct socket *so = (struct socket *)kn->kn_fp->f_data;

        kn->kn_data = so->so_qlen;
        return (!TAILQ_EMPTY(&so->so_comp));
}
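
/*
 * Illustrative sketch (userland, hypothetical descriptors 'kq' and 'ls'):
 * on a listen socket, sokqfilter() selects filt_solisten() above for
 * EVFILT_READ, so a read event signals pending connections:
 *
 *      struct kevent kev;
 *
 *      EV_SET(&kev, ls, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *      kevent(kq, &kev, 1, NULL, 0, NULL);
 *      // When the event fires, kev.data (taken from kn_data, i.e.
 *      // so_qlen) is the length of the completed connection queue,
 *      // and accept(2) will not block.
 */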