/*	$OpenBSD: uipc_socket.c,v 1.364 2025/01/23 10:44:13 bluhm Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN	80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	const char *dom_name = dp->dom_name;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);

#ifdef WITNESS
	/*
	 * XXX: Make WITNESS happy.  AF_INET and AF_INET6 sockets could be
	 * spliced together.
	 */
	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		dom_name = "inet46";
		break;
	}
#endif

	refcnt_init_trace(&so->so_refcnt, DT_REFCNT_IDX_SOCKET);
	rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
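/*
 * Illustrative sketch (not part of this file): a typical in-kernel
 * consumer creates a socket with socreate() and drops its file table
 * reference again with soclose().  Error handling trimmed.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0)
 *		error = soclose(so, MSG_DONTWAIT);
 */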
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = atomic_load_int(&somaxconn);
	int sominconn_local = atomic_load_int(&sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
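/*
 * Backlog clamping example (illustrative): with the compiled-in
 * SOMINCONN of 80 above, both listen(s, 0) and listen(s, 5) end up
 * with so_qlimit = 80, while a negative or oversized backlog is
 * clamped down to the somaxconn sysctl value.
 */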
void
sorele(struct socket *so)
{
	if (refcnt_rele(&so->so_refcnt) == 0)
		return;

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

#ifdef SOCKET_SPLICE
	if (so->so_sp)
		pool_put(&sosplice_pool, so->so_sp);
#endif
	pool_put(&socket_pool, so);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(so);
				sounlock(head);
				sorele(head);
				return;
			}
		}

		soqremque(so, 0);

		if (persocket) {
			sounlock(head);
			sorele(head);
		}
	}

	if (!keep_lock)
		sounlock(so);
	sorele(so);
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early.  There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			soref(so2);
			solock(so2);
			(void) soqremque(so2, 0);
			sounlock(so);
			soabort(so2);
			sounlock(so2);
			sorele(so2);
			solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			soref(so2);
			solock_nonet(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			sounlock_nonet(so2);
			sorele(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		struct socket *soback;

		sounlock(so);
		mtx_enter(&so->so_snd.sb_mtx);
		/*
		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
		 * both `so_snd' and `so_rcv' before unsplicing the sockets.
		 */
		if ((soback = so->so_sp->ssp_soback) == NULL) {
			mtx_leave(&so->so_snd.sb_mtx);
			goto notsplicedback;
		}
		soref(soback);
		mtx_leave(&so->so_snd.sb_mtx);

		/*
		 * `so' can only be unspliced, and never spliced again.
		 * Thus if the issplicedback(so) check is positive, the
		 * socket is still spliced and `ssp_soback' points to the
		 * same socket as `soback'.
		 */
		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		sbunlock(&soback->so_rcv);
		sorele(soback);

notsplicedback:
		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (isspliced(so)) {
			struct socket *sosp;
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, freeing);
			sorele(sosp);
		}
		sbunlock(&so->so_rcv);

		timeout_del_barrier(&so->so_sp->ssp_idleto);
		task_del(sosplice_taskq, &so->so_sp->ssp_task);
		taskq_barrier(sosplice_taskq);

		solock(so);
	}
#endif /* SOCKET_SPLICE */

	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;

	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
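/*
 * Linger example (illustrative): with SO_LINGER set and l_linger = 5,
 * a blocking close(2) sleeps in the "netcls" loop above for at most 5
 * seconds (solinger_nsec()) waiting for the disconnect to complete;
 * l_linger = 0 means wait without a timeout, and a close with
 * MSG_DONTWAIT returns as soon as the disconnect has been initiated.
 */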
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			mtx_leave(&so->so_snd.sb_mtx);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				mtx_leave(&so->so_snd.sb_mtx);
				error = m_getuio(&top, atomic, space, uio);
				mtx_enter(&so->so_snd.sb_mtx);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			mtx_leave(&so->so_snd.sb_mtx);
			solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			sounlock_shared(so);
			mtx_enter(&so->so_snd.sb_mtx);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	mtx_leave(&so->so_snd.sb_mtx);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
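/*
 * Example of the "all at once" rule enforced above (illustrative):
 * on a datagram socket, where sosendallatonce() is true, a single
 * write larger than the send buffer high-water mark fails immediately
 * with EMSGSIZE instead of blocking, because the record cannot be
 * split across multiple protocol sends.
 */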
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock_shared(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock_shared(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	mtx_enter(&so->so_rcv.sb_mtx);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		mtx_leave(&so->so_rcv.sb_mtx);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					mtx_leave(&so->so_rcv.sb_mtx);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					mtx_enter(&so->so_rcv.sb_mtx);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					mtx_leave(&so->so_rcv.sb_mtx);
					pr->pr_domain->dom_dispose(cm);
					mtx_enter(&so->so_rcv.sb_mtx);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			mtx_leave(&so->so_rcv.sb_mtx);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			mtx_enter(&so->so_rcv.sb_mtx);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(&so->so_rcv)) {
				mtx_leave(&so->so_rcv.sb_mtx);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			mtx_leave(&so->so_rcv.sb_mtx);
			solock_shared(so);
			pru_rcvd(so);
			sounlock_shared(so);
			mtx_enter(&so->so_rcv.sb_mtx);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		mtx_leave(&so->so_rcv.sb_mtx);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	mtx_leave(&so->so_rcv.sb_mtx);
	sbunlock(&so->so_rcv);
	return (error);
}
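/*
 * Illustrative consequence of the logic above: a blocking recv(2)
 * with MSG_WAITALL on a stream socket keeps looping through sbwait()
 * until uio_resid is exhausted, and returns a short count only on a
 * signal or timeout, on EOF, on a pending so_error, or when the
 * request exceeds the receive buffer and must be served in sections.
 */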
int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

void
sosplice_solock_pair(struct socket *so1, struct socket *so2)
{
	NET_LOCK_SHARED();

	if (so1 == so2)
		rw_enter_write(&so1->so_lock);
	else if (so1 < so2) {
		rw_enter_write(&so1->so_lock);
		rw_enter_write(&so2->so_lock);
	} else {
		rw_enter_write(&so2->so_lock);
		rw_enter_write(&so1->so_lock);
	}
}

void
sosplice_sounlock_pair(struct socket *so1, struct socket *so2)
{
	if (so1 == so2)
		rw_exit_write(&so1->so_lock);
	else if (so1 < so2) {
		rw_exit_write(&so2->so_lock);
		rw_exit_write(&so1->so_lock);
	} else {
		rw_exit_write(&so1->so_lock);
		rw_exit_write(&so2->so_lock);
	}

	NET_UNLOCK_SHARED();
}
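/*
 * Note on the pair-lock helpers above: the two socket locks are
 * always taken in address order (lowest pointer first), so two
 * threads splicing the same pair of sockets in opposite directions
 * cannot deadlock; the unlock helper releases them in reverse order.
 */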
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		if (so->so_sp && so->so_sp->ssp_socket) {
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, 0);
			sorele(sosp);
		} else
			error = EPROTO;
		sbunlock(&so->so_rcv);
		return (error);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	sosplice_solock_pair(so, sosp);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL) {
		struct sosplice *so_sp;

		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		timeout_set_flags(&so_sp->ssp_idleto, soidle, so,
		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
		task_set(&so_sp->ssp_task, sotask, so);

		so->so_sp = so_sp;
	}
	if (sosp->so_sp == NULL) {
		struct sosplice *so_sp;

		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		timeout_set_flags(&so_sp->ssp_idleto, soidle, sosp,
		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
		task_set(&so_sp->ssp_task, sotask, sosp);

		sosp->so_sp = so_sp;
	}
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	sosplice_sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);

	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}

	sbunlock(&so->so_rcv);
	FRELE(fp, curproc);
	return (0);

release:
	sosplice_sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);
	return (error);
}
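/*
 * Illustrative userland sketch (not part of this file): splice data
 * arriving on socket s into socket d, moving at most 1MB and giving
 * up after 5 idle seconds, using the struct splice form handled in
 * sosetopt() below.
 *
 *	struct splice sp = {
 *		.sp_fd = d,
 *		.sp_max = 1024 * 1024,
 *		.sp_idle = { .tv_sec = 5, .tv_usec = 0 },
 *	};
 *	if (setsockopt(s, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
 *		err(1, "SO_SPLICE");
 */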
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	sbassertlocked(&so->so_rcv);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0) {
		int readable;

		solock_shared(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		readable = soreadable(so);
		mtx_leave(&so->so_rcv.sb_mtx);
		if (readable)
			sorwakeup(so);
		sounlock_shared(so);
	}
	if ((freeing & SOSP_FREEING_WRITE) == 0) {
		solock_shared(sosp);
		if (sowriteable(sosp))
			sowwakeup(sosp);
		sounlock_shared(sosp);
	}
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		struct socket *sosp;

		WRITE_ONCE(so->so_error, ETIMEDOUT);
		sosp = soref(so->so_sp->ssp_socket);
		sounsplice(so, so->so_sp->ssp_socket, 0);
		sorele(sosp);
	}
	sbunlock(&so->so_rcv);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int doyield = 0;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			doyield = 1;
		somove(so, M_DONTWAIT);
	}
	sbunlock(&so->so_rcv);

	if (doyield) {
		/* Avoid user land starvation. */
		yield();
	}
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0, unsplice = 0;
	unsigned int rcvstate;

	sbassertlocked(&so->so_rcv);

	if (so->so_proto->pr_flags & PR_WANTRCVD)
		sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

nextpkt:
	if ((error = READ_ONCE(so->so_error)))
		goto release;
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}

	error = READ_ONCE(sosp->so_error);
	if (error) {
		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
			goto release;
		error = 0;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace_locked(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD) {
			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			solock_shared(so);
			pru_rcvd(so);
			sounlock_shared(so);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);
		}
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Receive buffer did shrink by len bytes, adjust oob. */
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
		solock_shared(so);
		pru_rcvd(so);
		sounlock_shared(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				mtx_leave(&sosp->so_snd.sb_mtx);
				mtx_leave(&so->so_rcv.sb_mtx);
				solock_shared(sosp);
				error = pru_send(sosp, m, NULL, NULL);
				sounlock_shared(sosp);
				mtx_enter(&so->so_rcv.sb_mtx);
				mtx_enter(&sosp->so_snd.sb_mtx);

				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);

			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			solock_shared(sosp);
			error = pru_sendoob(sosp, o, NULL, NULL);
			sounlock_shared(sosp);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);

			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	solock_shared(sosp);
	error = pru_send(sosp, m, NULL, NULL);
	sounlock_shared(sosp);
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
		    sosp->so_pcb == NULL)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (so->so_proto->pr_flags & PR_WANTRCVD)
		sbunlock(&so->so_snd);

	if (unsplice) {
		soref(sosp);
		sounsplice(so, sosp, 0);
		sorele(sosp);

		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
#endif /* SOCKET_SPLICE */
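/*
 * Scheduling note (descriptive): once a splice is established,
 * sorwakeup() below queues ssp_task on sosplice_taskq whenever new
 * data arrives, so somove() runs again from process context; the
 * idle timeout is rearmed only while somove() keeps returning 1.
 */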
void
sorwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		mtx_enter(&so->so_rcv.sb_mtx);
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq, &so->so_splicetask);
		if (isspliced(so)) {
			mtx_leave(&so->so_rcv.sb_mtx);
			return;
		}
		mtx_leave(&so->so_rcv.sb_mtx);
	}
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		mtx_enter(&so->so_snd.sb_mtx);
		if (so->so_snd.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq,
			    &so->so_sp->ssp_soback->so_splicetask);
		if (issplicedback(so)) {
			mtx_leave(&so->so_snd.sb_mtx);
			return;
		}
		mtx_leave(&so->so_snd.sb_mtx);
	}
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			mtx_enter(&sb->sb_mtx);
			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}
			mtx_leave(&sb->sb_mtx);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
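/*
 * Illustrative userland sketch: arm a 2.5 second receive timeout.
 * A zero timeval maps to INFSLP in sosetopt() above, i.e. no timeout.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */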
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

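/*
 * The SO_SPLICE case in sogetopt() above reads back, as an off_t, the
 * number of bytes moved across the splice so far.  A userland sketch
 * (fd is a placeholder descriptor):
 *
 *	off_t moved;
 *	socklen_t optlen = sizeof(moved);
 *	if (getsockopt(fd, SOL_SOCKET, SO_SPLICE, &moved, &optlen) == -1)
 *		err(1, "SO_SPLICE");
 */
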
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);

	if (so->so_options & SO_ACCEPTCONN) {
		short qlen = READ_ONCE(so->so_qlen);

		soassertlocked_readonly(so);

		kn->kn_data = qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = qlen || soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

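/*
 * In filt_soread() above, NOTE_LOWAT makes the filter compare the
 * buffered byte count against the caller-supplied kn_sdata instead of
 * the socket buffer's sb_lowat.  A userland sketch (kq and fd are
 * placeholders):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 1024, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */
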
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (((state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		u_int state = READ_ONCE(so->so_state);

		if (state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

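/*
 * filt_soexcept() above raises NOTE_OOB while out-of-band data is
 * pending, so the corresponding exception filter is registered with
 * that flag.  A userland sketch (kq and fd are placeholders):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_EXCEPT, EV_ADD, NOTE_OOB, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */
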
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
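/*
 * sobuf_print() and so_print() above are reached from the in-kernel
 * debugger when a socket is dumped by address (command syntax per
 * ddb(4); the address is a placeholder):
 *
 *	ddb> show socket <addr>
 */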