/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 */
#include "opt_inet.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

#ifdef INET
extern int tcp_sosend_agglim;
extern int tcp_sosend_async;
extern int tcp_sosend_jcluster;
extern int udp_sosend_async;
extern int udp_sosend_prepend;

static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static void	sodiscard(struct socket *so);
static int	soclose_sync(struct socket *so, int fflag);
static void	soclose_fast(struct socket *so);

static struct filterops solisten_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };

MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

static int use_soclose_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
    &use_soclose_fast, 0, "Fast socket close");

int use_soaccept_pred_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
    &use_soaccept_pred_fast, 0, "Fast socket accept predication");

int use_sendfile_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
    &use_sendfile_async, 0, "sendfile uses asynchronized pru_send");

int use_soconnect_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
    &use_soconnect_async, 0, "soconnect uses asynchronized pru_connect");

int use_rand_initport = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, rand_initport, CTLFLAG_RW,
    &use_rand_initport, 0, "socket uses random initial msgport");

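/*
 * All of the knobs above are tunable at run time with sysctl(8).
 * For example (illustrative values only):
 *
 *	sysctl kern.ipc.somaxconn=4096
 *	sysctl kern.ipc.soclose_fast=0
 */
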
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, struct protosw *pr)
{
	struct socket *so;
	unsigned waitmask;

	waitmask = waitok ? M_WAITOK : M_NOWAIT;
	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
	if (so) {
		/* XXX race condition for reentrant kernel */
		so->so_proto = pr;
		TAILQ_INIT(&so->so_aiojobq);
		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
		spin_init(&so->so_rcvd_spin);
		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
			    MSGF_DROPABLE | MSGF_PRIORITY,
			    so->so_proto->pr_usrreqs->pru_rcvd);
		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
		so->so_state = SS_NOFDREF;
		so->so_refs = 1;
	}
	return so;
}

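/*
 * Allocate and attach a new socket of the requested domain, type, and
 * protocol.  On success a referenced socket is returned via *aso; on
 * failure the reference taken in soalloc() is disposed of via sofree().
 */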
int
socreate(int dom, struct socket **aso, int type,
    int proto, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct protosw *prp;
	struct socket *so;
	struct pru_attach_info ai;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_INET6 &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != NULL, prp);
	if (so == NULL)
		return (ENOBUFS);

	/*
	 * Callers of socreate() presumably will connect up a descriptor
	 * and call soclose() if they cannot.  This represents our so_refs
	 * (which should be 1) from soalloc().
	 */
	soclrstate(so, SS_NOFDREF);

	/*
	 * Set a default port for protocol processing.  No action will occur
	 * on the socket on this port until an inpcb is attached to it and
	 * is able to match incoming packets, or until the socket becomes
	 * available to userland.
	 *
	 * We normally default the socket to the protocol thread on cpu 0.
	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
	 * thread and all pr_*()/pru_*() calls are executed synchronously.
	 */
	if (prp->pr_flags & PR_SYNC_PORT) {
		so->so_port = &netisr_sync_port;
	} else if (prp->pr_flags & PR_RAND_INITPORT) {
		if (use_rand_initport)
			so->so_port = netisr_cpuport(mycpuid & ncpus2_mask);
		else
			so->so_port = netisr_cpuport(0);
	} else {
		so->so_port = netisr_cpuport(0);
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(p->p_ucred);
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
	ai.p_ucred = p->p_ucred;
	ai.fd_rdir = p->p_fd->fd_rdir;

	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = so_pru_attach(so, proto, &ai);
	if (error) {
		sosetstate(so, SS_NOFDREF);
		sofree(so);	/* from soalloc */
		return error;
	}

	/*
	 * NOTE: Returns referenced socket.
	 */
	*aso = so;
	return (0);
}

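/*
 * Typical in-kernel usage, as driven by the socket(2)/bind(2)/listen(2)
 * paths in the syscall layer (sketch only; descriptor wiring and error
 * unwinding omitted):
 *
 *	struct socket *so;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, td);
 *	if (error == 0)
 *		error = sobind(so, nam, td);
 *	if (error == 0)
 *		error = solisten(so, backlog, td);
 */
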
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = so_pru_bind(so, nam, td);
	return (error);
}

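/*
 * Release the resources hung off a socket and free the socket itself.
 * Called from sofree() once the last reference is gone; the protocol
 * control block must already be detached.
 */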
static void
sodealloc(struct socket *so)
{
	if (so->so_rcv.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if present */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif /* INET */
	crfree(so->so_cred);
	if (so->so_faddr != NULL)
		kfree(so->so_faddr, M_SONAME);
	kfree(so, M_SOCKET);
}

int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;
#ifdef SCTP
	short oldopt, oldqlimit;
#endif /* SCTP */

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
		return (EINVAL);

#ifdef SCTP
	oldopt = so->so_options;
	oldqlimit = so->so_qlimit;
#endif /* SCTP */

	lwkt_gettoken(&so->so_rcv.ssb_token);
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	/*
	 * SCTP needs to tweak both the inbound backlog parameter AND
	 * the so_options (the UDP model both connects and accepts
	 * inbound connections .. implicitly).
	 */
	error = so_pru_listen(so, td);
	if (error) {
#ifdef SCTP
		/* Restore the params */
		so->so_options = oldopt;
		so->so_qlimit = oldqlimit;
#endif /* SCTP */
		return (error);
	}
	return (0);
}

/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *	so_pcb -	The protocol stack still has a reference
 *	SS_NOFDREF -	There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
	struct socket *head;

	/*
	 * This is a bit hackish at the moment.  We need to interlock
	 * any accept queue we are on before we potentially lose the
	 * last reference to avoid races against a re-reference from
	 * someone operating on the queue.
	 */
	while ((head = so->so_head) != NULL) {
		lwkt_getpooltoken(head);
		if (so->so_head == head)
			break;
		lwkt_relpooltoken(head);
	}

	/*
	 * Arbitrage the last free.
	 */
	KKASSERT(so->so_refs > 0);
	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
		if (head)
			lwkt_relpooltoken(head);
		return;
	}

	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

	/*
	 * We're done, remove ourselves from the accept queue we are
	 * on, if we are on one.
	 */
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			lwkt_relpooltoken(head);
			return;
		} else {
			panic("sofree: not queued");
		}
		soclrstate(so, SS_INCOMP);
		so->so_head = NULL;
		lwkt_relpooltoken(head);
	}
	ssb_release(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int fflag)
{
	int error;

	funsetown(&so->so_sigio);
	if (!use_soclose_fast ||
	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
	    ((so->so_state & SS_ISCONNECTED) &&
	     (so->so_options & SO_LINGER))) {
		error = soclose_sync(so, fflag);
	} else {
		soclose_fast(so);
		error = 0;
	}
	return error;
}

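/*
 * Abort any connections still queued on a listen socket and mark the
 * socket as having no file descriptor reference.  The socket itself is
 * not freed here; callers follow up with sofree(), possibly from the
 * protocol thread.
 */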
static void
sodiscard(struct socket *so)
{
	lwkt_getpooltoken(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;

		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			soclrstate(sp, SS_INCOMP);
			sp->so_head = NULL;
			so->so_incqlen--;
			soaborta(sp);
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			soclrstate(sp, SS_COMP);
			sp->so_head = NULL;
			so->so_qlen--;
			soaborta(sp);
		}
	}
	lwkt_relpooltoken(so);

	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	sosetstate(so, SS_NOFDREF);	/* take ref */
}

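/*
 * Move all connections queued on the listen socket 'so', both completed
 * and incomplete, over to the listen socket 'so_inh' that inherits them,
 * rewiring each child's so_head and credentials in the process.
 */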
void
soinherit(struct socket *so, struct socket *so_inh)
{
	TAILQ_HEAD(, socket) comp, incomp;
	struct socket *sp;
	int qlen, incqlen;

	KASSERT(so->so_options & SO_ACCEPTCONN,
	    ("so does not accept connection"));
	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
	    ("so_inh does not accept connection"));

	TAILQ_INIT(&comp);
	TAILQ_INIT(&incomp);

	lwkt_getpooltoken(so);
	lwkt_getpooltoken(so_inh);

	/*
	 * Save completed queue and incompleted queue
	 */
	TAILQ_CONCAT(&comp, &so->so_comp, so_list);
	qlen = so->so_qlen;
	so->so_qlen = 0;

	TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
	incqlen = so->so_incqlen;
	so->so_incqlen = 0;

	/*
	 * Append the saved completed queue and incompleted
	 * queue to the socket that inherits them.
	 *
	 * XXX
	 * This may temporarily break the inheriting socket's
	 * so_qlimit.
	 */
	TAILQ_FOREACH(sp, &comp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_FOREACH(sp, &incomp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
	so_inh->so_qlen += qlen;

	TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
	so_inh->so_incqlen += incqlen;

	lwkt_relpooltoken(so_inh);
	lwkt_relpooltoken(so);

	if (qlen) {
		/*
		 * "New" connections have arrived
		 */
		sorwakeup(so_inh);
		wakeup(&so_inh->so_timeo);
	}
}

static int
soclose_sync(struct socket *so, int fflag)
{
	int error = 0;

	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (fflag & FNONBLOCK))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo, PCATCH,
				    "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;

		error2 = so_pru_detach(so);
		if (error == 0)
			error = error2;
	}
discard:
	sodiscard(so);
	so_pru_sync(so);	/* unpend async sending */
	sofree(so);		/* dispose of ref */

	return (error);
}

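/*
 * Fast (asynchronous) close path.  Rather than blocking the closing
 * thread, the remaining disconnect/detach/free work is packaged as a
 * netmsg and executed from the socket's protocol thread.  Each handler
 * below corresponds to one stage soclose_fast() may dispatch.
 */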
static void
soclose_sofree_async_handler(netmsg_t msg)
{
	sofree(msg->base.nm_so);
}

static void
soclose_sofree_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_sofree_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_disconn_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0)
		so_pru_disconnect_direct(so);

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_disconn_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_disconn_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_detach_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_detach_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_detach_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_fast(struct socket *so)
{
	if (so->so_pcb == NULL)
		goto discard;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0) {
		soclose_disconn_async(so);
		return;
	}

	if (so->so_pcb) {
		soclose_detach_async(so);
		return;
	}

discard:
	sodiscard(so);
	soclose_sofree_async(so);
}

/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 */
void
soabort(struct socket *so)
{
	soreference(so);
	so_pru_abort(so);
}

void
soaborta(struct socket *so)
{
	soreference(so);
	so_pru_aborta(so);
}

void
soabort_oncpu(struct socket *so)
{
	soreference(so);
	so_pru_abort_oncpu(so);
}

/*
 * so is passed in ref'd, which becomes owned by
 * the cleared SS_NOFDREF flag.
 */
void
soaccept_generic(struct socket *so)
{
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	soaccept_generic(so);
	error = so_pru_accept(so, nam);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
    boolean_t sync)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	     (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
			error = so_pru_connect_async(so, nam, td);
		else
			error = so_pru_connect(so, nam, td);
	}
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	error = so_pru_connect2(so1, so2);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = so_pru_disconnect(so);
bad:
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags,
    struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int clen = 0, error, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int pru_flags;

	if (uio) {
		resid = uio->uio_resid;
	} else {
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 * can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					gotoerr(ENOTCONN);
			} else if (addr == NULL)
				gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
					ENOTCONN : EDESTADDRREQ);
		}
		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
		    clen > so->so_snd.ssb_hiwat) {
			gotoerr(EMSGSIZE);
		}
		space = ssb_space(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid + clen) && uio &&
		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				m = m_getl((int)resid, MB_WAIT, MT_DATA,
					   top == NULL ? M_PKTHDR : 0, &mlen);
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				if (resid < MINCLSIZE) {
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == NULL && len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
			} else if ((flags & MSG_EOF) &&
				   (so->so_proto->pr_flags & PR_IMPLOPCL) &&
				   (resid == 0)) {
				/*
				 * If the user set MSG_EOF, the protocol
				 * understands this flag and nothing left to
				 * send then use PRU_SEND_EOF instead of PRU_SEND.
				 */
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
			} else {
				pru_flags = 0;
			}
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = so_pru_send(so, pru_flags, top, addr, control, td);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

#ifdef INET
/*
 * A specialization of sosend() for UDP based on protocol-specific knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *	sosendallatonce() returns true,
 *	the "atomic" variable is true,
 *	and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *	PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	size_t resid;
	int error, pru_flags = 0;
	int space;

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		m_freem(control);

	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	if (so->so_state & SS_CANTSENDMORE)
		gotoerr(EPIPE);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		goto release;
	}
	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
		gotoerr(EDESTADDRREQ);
	if (resid > so->so_snd.ssb_hiwat)
		gotoerr(EMSGSIZE);
	space = ssb_space(&so->so_snd);
	if (uio && (space < 0 || (size_t)space < resid)) {
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
			gotoerr(EWOULDBLOCK);
		ssb_unlock(&so->so_snd);
		error = ssb_wait(&so->so_snd);
		if (error)
			goto out;
		goto restart;
	}

	if (uio) {
		int hdrlen = max_hdr;

		/*
		 * We try to optimize out the additional mbuf
		 * allocations in M_PREPEND() on output path, e.g.
		 * - udp_output(), when it tries to prepend protocol
		 *   headers.
		 * - Link layer output function, when it tries to
		 *   prepend link layer header.
		 *
		 * This probably will not benefit any data that will
		 * be fragmented, so this optimization is only performed
		 * when the size of data and max size of protocol+link
		 * headers fit into one mbuf cluster.
		 */
		if (uio->uio_resid > MCLBYTES - hdrlen ||
		    !udp_sosend_prepend) {
			top = m_uiomove(uio);
			if (top == NULL)
				goto release;
		} else {
			int nsize;

			top = m_getl(uio->uio_resid + hdrlen, MB_WAIT,
			    MT_DATA, M_PKTHDR, &nsize);
			KASSERT(nsize >= uio->uio_resid + hdrlen,
			    ("sosendudp invalid nsize %d, "
			     "resid %zu, hdrlen %d",
			     nsize, uio->uio_resid, hdrlen));

			top->m_len = uio->uio_resid;
			top->m_pkthdr.len = uio->uio_resid;
			top->m_data += hdrlen;

			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
			if (error)
				goto out;
		}
	}

	if (flags & MSG_DONTROUTE)
		pru_flags |= PRUS_DONTROUTE;

	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
		error = 0;
	} else {
		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
	}
	top = NULL;	/* sent or freed in lower layer */

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	return (error);
}

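/*
 * A specialization of sosend() for TCP.  TCP is a stream protocol with
 * no record boundaries, so MSG_EOR is rejected outright and control
 * mbufs (rights, creds, etc) are not supported.  When tcp_sosend_async
 * is enabled the assembled mbuf chain may be handed to the protocol
 * thread asynchronously.
 */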
int
sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags,
    struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int error, mlen;
	int allatonce;
	int pru_flags;

	if (uio) {
		KKASSERT(top == NULL);
		allatonce = 0;
		resid = uio->uio_resid;
	} else {
		allatonce = 1;
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 * can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on TCP
	 */
	if (flags & MSG_EOR) {
		error = EINVAL;
		goto out;
	}

	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
		control = NULL;
	}

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;

#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 &&
		    (so->so_state & SS_ISCONFIRMING) == 0)
			gotoerr(ENOTCONN);
		if (allatonce && resid > so->so_snd.ssb_hiwat)
			gotoerr(EMSGSIZE);

		space = ssb_space_prealloc(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid) && !allatonce &&
		    space < so->so_snd.ssb_lowat) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		do {
			int cnt = 0, async = 0;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				if (tcp_sosend_jcluster) {
					m = m_getlj((int)resid, MB_WAIT, MT_DATA,
						    top == NULL ? M_PKTHDR : 0, &mlen);
				} else {
					m = m_getl((int)resid, MB_WAIT, MT_DATA,
						   top == NULL ? M_PKTHDR : 0, &mlen);
				}
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0)
					break;
				++cnt;
			} while (space > 0 && cnt < tcp_sosend_agglim);

			if (tcp_sosend_async)
				async = 1;

			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
				async = 0;
			} else if ((flags & MSG_EOF) && resid == 0) {
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
				async = 1;
			} else {
				pru_flags = 0;
			}

			if (flags & MSG_SYNC)
				async = 0;

			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			for (m = top; m; m = m->m_next)
				ssb_preallocstream(&so->so_snd, m);
			if (!async) {
				error = so_pru_send(so, pru_flags, top,
				    NULL, NULL, td);
			} else {
				so_pru_send_async(so, pru_flags, top,
				    NULL, NULL, td);
				error = 0;
			}

			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
#endif

/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the signalsockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 *
 * Although the signalsockbuf is locked, new data may still be appended.
 * A token inside the ssb_lock deals with MP issues and still allows
 * the network to access the socket if we block in a uio.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
				    (int)szmin(resid, m->m_len),
				    uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if ((so->so_state & SS_ISCONFIRMING) && resid)
		so_pru_rcvd(so, 0);

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.ssb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out critical
	 * section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Skip any address mbufs prepending the record.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
		if (flags & MSG_PEEK)
			m = m->m_next;
		else
			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
	}

	/*
	 * Skip any control mbufs prepending the record.
	 */
#ifdef SCTP
	if (pr->pr_flags & PR_ADDR_OPT) {
		/*
		 * For SCTP we may be getting a
		 * whole message OR a partial delivery.
		 */
		if (m && m->m_type == MT_SONAME) {
			orig_resid = 0;
			if (psa)
				*psa = dup_sockaddr(mtod(m, struct sockaddr *));
			if (flags & MSG_PEEK)
				m = m->m_next;
			else
				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
		}
	}
#endif /* SCTP */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;	/* XXX race */
		} else {
			if (controlp) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
			}
		}
		if (controlp && *controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * flag OOB data.
	 */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("receive 3"));
		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
#ifdef SCTP
			if (m->m_flags & M_NOTIFICATION)
				flags |= MSG_NOTIFICATION;
#endif /* SCTP */
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		    resid > 0 && !sosendallatonce(so) &&
		    so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				so_pru_rcvd(so, flags);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * If an atomic read was requested but unread data still remains
	 * in the record, set MSG_TRUNC.
	 */
	if (m && pr->pr_flags & PR_ATOMIC)
		flags |= MSG_TRUNC;

	/*
	 * Cleanup.  If an atomic read was requested drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (m && (pr->pr_flags & PR_ATOMIC))
			sbdroprecord(&so->so_rcv.sb);
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
			so_pru_rcvd(so, flags);
	}

	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}

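/*
 * A specialization of soreceive() for TCP.  A TCP receive buffer never
 * contains address or control mbufs and has no record boundaries, so
 * the per-record bookkeeping done by soreceive() can be dropped and the
 * window update (pru_rcvd) can be issued asynchronously.
 */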
int
sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
				    (int)szmin(resid, m->m_len),
				    uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out critical
	 * section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
		    ("receive 3"));

		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		    resid > 0 && !sosendallatonce(so) &&
		    so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (so->so_pcb)
				so_pru_rcvd_async(so);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * Cleanup.  If an atomic read was requested drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (so->so_pcb)
			so_pru_rcvd_async(so);
	}

	if (orig_resid == resid && orig_resid &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}

/*
 * Shut a socket down.  Note that we do not get a frontend lock as we
 * want to be able to shut the socket down even if another thread is
 * blocked in a read(), thus waking it up.
 */
int
soshutdown(struct socket *so, int how)
{
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR) {
		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
		sorflush(so);
		/*ssb_unlock(&so->so_rcv);*/
	}
	if (how != SHUT_RD)
		return (so_pru_shutdown(so));
	return (0);
}

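/*
 * Flush and discard everything queued on the receive side of the
 * socket.  The signalsockbuf is copied aside and zeroed in place so
 * that a protocol dispose hook (dom_dispose, used for SCM_RIGHTS) can
 * run on the detached mbuf chain before it is released.
 */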
void
sorflush(struct socket *so)
{
	struct signalsockbuf *ssb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct signalsockbuf asb;

	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);

	lwkt_gettoken(&ssb->ssb_token);
	socantrcvmore(so);
	asb = *ssb;

	/*
	 * Can't just blow up the ssb structure here
	 */
	bzero(&ssb->sb, sizeof(ssb->sb));
	ssb->ssb_timeo = 0;
	ssb->ssb_lowat = 0;
	ssb->ssb_hiwat = 0;
	ssb->ssb_mbmax = 0;
	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
	ssb_release(&asb, so);

	lwkt_reltoken(&ssb->ssb_token);
}

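/*
 * Accept filters are attached and detached through setsockopt(2) on a
 * listening socket.  Userland usage sketch ("dataready" is the stock
 * accf_data filter; any registered filter name works):
 *
 *	struct accept_filter_arg afa;
 *
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "dataready");
 *	setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa));
 */
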
#ifdef INET
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				kfree(af->so_accept_filter_str, M_ACCF);
			}
			kfree(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			af->so_accept_filter_str = kmalloc(len, M_ACCF,
			    M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			kfree(af->so_accept_filter_str, M_ACCF);
			kfree(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		kfree(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	return soopt_to_kbuf(sopt, buf, len, minlen);
}

int
soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

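/*
 * The canonical consumer pattern, both in sosetopt() below and in the
 * protocol pr_ctloutput() handlers, is to pull a fixed-size value out
 * of the sockopt (sketch):
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 */
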
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
	struct signalsockbuf *sotmp;

	error = 0;
	sopt->sopt_dir = SOPT_SET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif /* INET */
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
					&so->so_snd : &so->so_rcv;
				atomic_clear_int(&sotmp->ssb_flags,
						 SSB_AUTOSIZE);
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.ssb_lowat =
				    (optval > so->so_snd.ssb_hiwat) ?
				    so->so_snd.ssb_hiwat : optval;
				atomic_clear_int(&so->so_snd.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.ssb_lowat =
				    (optval > so->so_rcv.ssb_hiwat) ?
				    so->so_rcv.ssb_hiwat : optval;
				atomic_clear_int(&so->so_rcv.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.ssb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.ssb_timeo = val;
				break;
			}
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}


int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
	struct signalsockbuf *sotmp;

	error = 0;
	sopt->sopt_dir = SOPT_SET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif /* INET */
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
					&so->so_snd : &so->so_rcv;
				atomic_clear_int(&sotmp->ssb_flags,
						 SSB_AUTOSIZE);
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.ssb_lowat =
				    (optval > so->so_snd.ssb_hiwat) ?
				    so->so_snd.ssb_hiwat : optval;
				atomic_clear_int(&so->so_snd.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.ssb_lowat =
				    (optval > so->so_rcv.ssb_hiwat) ?
				    so->so_rcv.ssb_hiwat : optval;
				atomic_clear_int(&so->so_rcv.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.ssb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.ssb_timeo = val;
				break;
			}
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}
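/*
 * Illustrative userland sketch (not part of this file): setting a
 * receive timeout, which sosetopt() above converts from a timeval into
 * scheduler ticks (tv_sec * hz + tv_usec / ustick), rejecting values
 * that would overflow an int with EDOM.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, int seconds)
{
	struct timeval tv;

	tv.tv_sec = seconds;
	tv.tv_usec = 0;
	/* A zero timeval disables the timeout entirely. */
	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)));
}
#endif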

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	soopt_from_kbuf(sopt, buf, len);
	return 0;
}

void
soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
{
	size_t	valsize;

	if (len == 0) {
		sopt->sopt_valsize = 0;
		return;
	}

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than the total amount
	 * that was available.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = szmin(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		bcopy(buf, sopt->sopt_val, valsize);
	}
}
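/*
 * Illustrative userland sketch (not part of this file): the truncation
 * behavior documented above.  When the caller's buffer is smaller than
 * the full option value, soopt_from_kbuf() copies out only what fits
 * and reports the truncated size, not the size that was available.
 */
#if 0
#include <sys/socket.h>

static int
get_sndbuf_truncated(int fd)
{
	char small[2];			/* deliberately undersized buffer */
	socklen_t len = sizeof(small);
	int rc;

	/*
	 * Succeeds rather than failing: only the first two bytes of
	 * the int-sized SO_SNDBUF value are copied out, and len comes
	 * back as 2 rather than sizeof(int).
	 */
	rc = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, small, &len);
	return (rc == 0 ? (int)len : -1);
}
#endif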

int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	long	optval_l;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif

	error = 0;
	sopt->sopt_dir = SOPT_GET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			afap = kmalloc(sizeof(*afap), M_TEMP,
				       M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name,
				    so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg,
					    so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			kfree(afap, M_TEMP);
			break;
#endif /* INET */

		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.ssb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.ssb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.ssb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.ssb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * ustick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_SNDSPACE:
			optval_l = ssb_space(&so->so_snd);
			error = sooptcopyout(sopt, &optval_l,
					     sizeof(optval_l));
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
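/*
 * Illustrative userland sketch (not part of this file): the
 * read-and-clear semantics of SO_ERROR in sogetopt() above, as used to
 * harvest the result of a non-blocking connect().  Note that the
 * kernel clears so_error as a side effect of the read.
 */
#if 0
#include <sys/socket.h>

static int
connect_result(int fd)
{
	int err;
	socklen_t len = sizeof(err);

	/* Called after poll()/kevent() reports the socket writable. */
	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
		return (-1);
	return (err);	/* 0 on success, else the errno of the failure */
}
#endif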

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize, msize;

	m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
		   0, &msize);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = min(msize, sopt_size);
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
			   MT_DATA, 0, &msize);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		m->m_len = min(msize, sopt_size);
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	soopt_to_mbuf(sopt, m);
	return 0;
}

void
soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t valsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return;
	val = sopt->sopt_val;
	valsize = sopt->sopt_valsize;
	while (m != NULL && valsize >= m->m_len) {
		bcopy(val, mtod(m, char *), m->m_len);
		valsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		m = m->m_next;
	}
	/* should have been allocated large enough at ip6_sooptmcopyin() */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	return soopt_from_mbuf(sopt, m);
}

int
soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;
	size_t maxsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return 0;
	val = sopt->sopt_val;
	maxsize = sopt->sopt_valsize;
	while (m != NULL && maxsize >= m->m_len) {
		bcopy(mtod(m, char *), val, m->m_len);
		maxsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* a large enough sockopt buffer should have been supplied
		   from userland */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

void
sohasoutofband(struct socket *so)
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
}

int
sokqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct signalsockbuf *ssb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		ssb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		ssb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		ssb = &so->so_rcv;
		break;
	default:
		return (EOPNOTSUPP);
	}

	knote_insert(&ssb->ssb_kq.ki_note, kn);
	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
}
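/*
 * Illustrative userland sketch (not part of this file): registering a
 * kqueue read filter on a socket descriptor, which reaches sokqfilter()
 * above.  On a listen socket the same EVFILT_READ registration is
 * silently routed to solisten_filtops instead of soread_filtops.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_readable(int kq, int fd)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		return (-1);
	/* Block until filt_soread() (or filt_solisten()) fires. */
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif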

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	if (kn->kn_sfflags & NOTE_OOB) {
		if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) {
			kn->kn_fflags |= NOTE_OOB;
			return (1);
		}
		return (0);
	}
	kn->kn_data = so->so_rcv.ssb_cc;

	if (so->so_state & SS_CANTRCVMORE) {
		/*
		 * Only set NODATA if all data has been exhausted.
		 */
		if (kn->kn_data == 0)
			kn->kn_flags |= EV_NODATA;
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
		!TAILQ_EMPTY(&so->so_comp));
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = ssb_space(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.ssb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}
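/*
 * Illustrative userland sketch (not part of this file): using
 * NOTE_LOWAT with EVFILT_WRITE, which filt_sowrite() above honors by
 * comparing the free send-buffer space (kn_data) against the
 * caller-supplied threshold (kn_sdata) instead of ssb_lowat.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>

static int
wait_for_sendspace(int kq, int fd, int bytes)
{
	struct kevent kev;

	/* Fire only once at least 'bytes' of send buffer are free. */
	EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD, NOTE_LOWAT, bytes, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		return (-1);
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif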