/*	$OpenBSD: uipc_socket.c,v 1.356 2025/01/04 15:57:02 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	const char *dom_name = dp->dom_name;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);

#ifdef WITNESS
	/*
	 * XXX: Make WITNESS happy. AF_INET and AF_INET6 sockets could be
	 * spliced together.
	 */
	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		dom_name = "inet46";
		break;
	}
#endif

	refcnt_init(&so->so_refcnt);
	rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	so->so_snd.sb_flags |= SB_MTXLOCK;
	so->so_rcv.sb_flags |= SB_MTXLOCK;

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
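
/*
 * Illustrative sketch (an editor's addition, not part of the original
 * file): how an in-kernel consumer is expected to pair socreate() with
 * soclose().  The AF_INET/IPPROTO_TCP values and the error handling are
 * assumptions chosen for the example, not requirements of the API.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0)
 *		error = soclose(so, 0);
 */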

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = atomic_load_int(&somaxconn);
	int sominconn_local = atomic_load_int(&sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
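
/*
 * Worked example of the backlog clamping above (editor's addition),
 * assuming the default tunables somaxconn = 128 (SOMAXCONN) and
 * sominconn = 80 (SOMINCONN): listen(s, 5) yields so_qlimit = 80,
 * listen(s, 1000) yields 128, and listen(s, -1) also yields 128.
 */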

void
sorele(struct socket *so)
{
	if (refcnt_rele(&so->so_refcnt) == 0)
		return;

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

#ifdef SOCKET_SPLICE
	if (so->so_sp)
		pool_put(&sosplice_pool, so->so_sp);
#endif
	pool_put(&socket_pool, so);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(so);
				sounlock(head);
				sorele(head);
				return;
			}
		}

		soqremque(so, 0);

		if (persocket) {
			sounlock(head);
			sorele(head);
		}
	}

	if (!keep_lock)
		sounlock(so);
	sorele(so);
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		struct socket *soback;

		sounlock(so);
		mtx_enter(&so->so_snd.sb_mtx);
		/*
		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
		 * both `so_snd' and `so_rcv' before unsplicing sockets.
		 */
		if ((soback = so->so_sp->ssp_soback) == NULL) {
			mtx_leave(&so->so_snd.sb_mtx);
			goto notsplicedback;
		}
		soref(soback);
		mtx_leave(&so->so_snd.sb_mtx);

		/*
		 * `so' can only be unspliced, and never spliced again.
		 * Thus if the issplicedback(so) check is positive, the
		 * socket is still spliced and `ssp_soback' points to the
		 * same socket as `soback'.
		 */
		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		sbunlock(&soback->so_rcv);
		sorele(soback);

notsplicedback:
		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (isspliced(so)) {
			struct socket *sosp;
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, freeing);
			sorele(sosp);
		}
		sbunlock(&so->so_rcv);

		timeout_del_barrier(&so->so_sp->ssp_idleto);
		task_del(sosplice_taskq, &so->so_sp->ssp_task);
		taskq_barrier(sosplice_taskq);

		solock(so);
	}
#endif /* SOCKET_SPLICE */

	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;

	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
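
/*
 * Illustrative sketch (editor's addition, assumed caller): per the
 * header comment above, a caller that passes a uio must check for short
 * counts itself when EINTR/ERESTART comes back, along the lines of the
 * write path in sys_generic.c:
 *
 *	size_t before = auio.uio_resid;
 *
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0);
 *	if ((error == EINTR || error == ERESTART) &&
 *	    auio.uio_resid != before)
 *		error = 0;	(report the partial byte count instead)
 */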

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
684 */ 685 resid = 0; 686 if (flags & MSG_EOR) 687 top->m_flags |= M_EOR; 688 } else { 689 sb_mtx_unlock(&so->so_snd); 690 if (dosolock) 691 sounlock_shared(so); 692 error = m_getuio(&top, atomic, space, uio); 693 if (dosolock) 694 solock_shared(so); 695 sb_mtx_lock(&so->so_snd); 696 if (error) 697 goto release; 698 space -= top->m_pkthdr.len; 699 resid = uio->uio_resid; 700 if (flags & MSG_EOR) 701 top->m_flags |= M_EOR; 702 } 703 if (resid == 0) 704 so->so_snd.sb_state &= ~SS_ISSENDING; 705 if (top && so->so_options & SO_ZEROIZE) 706 top->m_flags |= M_ZEROIZE; 707 sb_mtx_unlock(&so->so_snd); 708 if (!dosolock) 709 solock_shared(so); 710 if (flags & MSG_OOB) 711 error = pru_sendoob(so, top, addr, control); 712 else 713 error = pru_send(so, top, addr, control); 714 if (!dosolock) 715 sounlock_shared(so); 716 sb_mtx_lock(&so->so_snd); 717 clen = 0; 718 control = NULL; 719 top = NULL; 720 if (error) 721 goto release; 722 } while (resid && space > 0); 723 } while (resid); 724 725 release: 726 so->so_snd.sb_state &= ~SS_ISSENDING; 727 sb_mtx_unlock(&so->so_snd); 728 if (dosolock) 729 sounlock_shared(so); 730 sbunlock(&so->so_snd); 731 out: 732 m_freem(top); 733 m_freem(control); 734 return (error); 735 } 736 737 int 738 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio) 739 { 740 struct mbuf *m, *top = NULL; 741 struct mbuf **nextp = ⊤ 742 u_long len, mlen; 743 size_t resid = uio->uio_resid; 744 int error; 745 746 do { 747 if (top == NULL) { 748 MGETHDR(m, M_WAIT, MT_DATA); 749 mlen = MHLEN; 750 } else { 751 MGET(m, M_WAIT, MT_DATA); 752 mlen = MLEN; 753 } 754 /* chain mbuf together */ 755 *nextp = m; 756 nextp = &m->m_next; 757 758 resid = ulmin(resid, space); 759 if (resid >= MINCLSIZE) { 760 MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES)); 761 if ((m->m_flags & M_EXT) == 0) 762 MCLGETL(m, M_NOWAIT, MCLBYTES); 763 if ((m->m_flags & M_EXT) == 0) 764 goto nopages; 765 mlen = m->m_ext.ext_size; 766 len = ulmin(mlen, resid); 767 /* 768 * For datagram protocols, leave room 769 * for protocol headers in first mbuf. 770 */ 771 if (atomic && m == top && len < mlen - max_hdr) 772 m->m_data += max_hdr; 773 } else { 774 nopages: 775 len = ulmin(mlen, resid); 776 /* 777 * For datagram protocols, leave room 778 * for protocol headers in first mbuf. 779 */ 780 if (atomic && m == top && len < mlen - max_hdr) 781 m_align(m, len); 782 } 783 784 error = uiomove(mtod(m, caddr_t), len, uio); 785 if (error) { 786 m_freem(top); 787 return (error); 788 } 789 790 /* adjust counters */ 791 resid = uio->uio_resid; 792 space -= len; 793 m->m_len = len; 794 top->m_pkthdr.len += len; 795 796 /* Is there more space and more data? */ 797 } while (space > 0 && resid > 0); 798 799 *mp = top; 800 return 0; 801 } 802 803 /* 804 * Following replacement or removal of the first mbuf on the first 805 * mbuf chain of a socket buffer, push necessary state changes back 806 * into the socket buffer so that other consumers see the values 807 * consistently. 'nextrecord' is the callers locally stored value of 808 * the original value of sb->sb_mb->m_nextpkt which must be restored 809 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 810 */ 811 void 812 sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 813 { 814 815 /* 816 * First, update for the new value of nextrecord. If necessary, 817 * make it the first record. 
818 */ 819 if (sb->sb_mb != NULL) 820 sb->sb_mb->m_nextpkt = nextrecord; 821 else 822 sb->sb_mb = nextrecord; 823 824 /* 825 * Now update any dependent socket buffer fields to reflect 826 * the new state. This is an inline of SB_EMPTY_FIXUP, with 827 * the addition of a second clause that takes care of the 828 * case where sb_mb has been updated, but remains the last 829 * record. 830 */ 831 if (sb->sb_mb == NULL) { 832 sb->sb_mbtail = NULL; 833 sb->sb_lastrecord = NULL; 834 } else if (sb->sb_mb->m_nextpkt == NULL) 835 sb->sb_lastrecord = sb->sb_mb; 836 } 837 838 /* 839 * Implement receive operations on a socket. 840 * We depend on the way that records are added to the sockbuf 841 * by sbappend*. In particular, each record (mbufs linked through m_next) 842 * must begin with an address if the protocol so specifies, 843 * followed by an optional mbuf or mbufs containing ancillary data, 844 * and then zero or more mbufs of data. 845 * In order to avoid blocking network for the entire time here, we release 846 * the solock() while doing the actual copy to user space. 847 * Although the sockbuf is locked, new data may still be appended, 848 * and thus we must maintain consistency of the sockbuf during that time. 849 * 850 * The caller may receive the data as a single mbuf chain by supplying 851 * an mbuf **mp0 for use in returning the chain. The uio is then used 852 * only for the count in uio_resid. 853 */ 854 int 855 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 856 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 857 socklen_t controllen) 858 { 859 struct mbuf *m, **mp; 860 struct mbuf *cm; 861 u_long len, offset, moff; 862 int flags, error, error2, type, uio_error = 0; 863 const struct protosw *pr = so->so_proto; 864 struct mbuf *nextrecord; 865 size_t resid, orig_resid = uio->uio_resid; 866 int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0); 867 868 mp = mp0; 869 if (paddr) 870 *paddr = NULL; 871 if (controlp) 872 *controlp = NULL; 873 if (flagsp) 874 flags = *flagsp &~ MSG_EOR; 875 else 876 flags = 0; 877 if (flags & MSG_OOB) { 878 m = m_get(M_WAIT, MT_DATA); 879 solock_shared(so); 880 error = pru_rcvoob(so, m, flags & MSG_PEEK); 881 sounlock_shared(so); 882 if (error) 883 goto bad; 884 do { 885 error = uiomove(mtod(m, caddr_t), 886 ulmin(uio->uio_resid, m->m_len), uio); 887 m = m_free(m); 888 } while (uio->uio_resid && error == 0 && m); 889 bad: 890 m_freem(m); 891 return (error); 892 } 893 if (mp) 894 *mp = NULL; 895 896 restart: 897 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) 898 return (error); 899 if (dosolock) 900 solock_shared(so); 901 sb_mtx_lock(&so->so_rcv); 902 903 m = so->so_rcv.sb_mb; 904 #ifdef SOCKET_SPLICE 905 if (isspliced(so)) 906 m = NULL; 907 #endif /* SOCKET_SPLICE */ 908 /* 909 * If we have less data than requested, block awaiting more 910 * (subject to any timeout) if: 911 * 1. the current count is less than the low water mark, 912 * 2. MSG_WAITALL is set, and it is possible to do the entire 913 * receive operation at once if we block (resid <= hiwat), or 914 * 3. MSG_DONTWAIT is not set. 915 * If MSG_WAITALL is set but resid is larger than the receive buffer, 916 * we have to do the receive in sections, and thus risk returning 917 * a short count if a timeout or signal occurs after we start. 
918 */ 919 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 920 so->so_rcv.sb_cc < uio->uio_resid) && 921 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 922 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 923 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 924 #ifdef DIAGNOSTIC 925 if (m == NULL && so->so_rcv.sb_cc) 926 #ifdef SOCKET_SPLICE 927 if (!isspliced(so)) 928 #endif /* SOCKET_SPLICE */ 929 panic("receive 1: so %p, so_type %d, sb_cc %lu", 930 so, so->so_type, so->so_rcv.sb_cc); 931 #endif 932 if ((error2 = READ_ONCE(so->so_error))) { 933 if (m) 934 goto dontblock; 935 error = error2; 936 if ((flags & MSG_PEEK) == 0) 937 so->so_error = 0; 938 goto release; 939 } 940 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 941 if (m) 942 goto dontblock; 943 else if (so->so_rcv.sb_cc == 0) 944 goto release; 945 } 946 for (; m; m = m->m_next) 947 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 948 m = so->so_rcv.sb_mb; 949 goto dontblock; 950 } 951 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 952 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 953 error = ENOTCONN; 954 goto release; 955 } 956 if (uio->uio_resid == 0 && controlp == NULL) 957 goto release; 958 if (flags & MSG_DONTWAIT) { 959 error = EWOULDBLOCK; 960 goto release; 961 } 962 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 963 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 964 965 sbunlock(&so->so_rcv); 966 error = sbwait(so, &so->so_rcv); 967 sb_mtx_unlock(&so->so_rcv); 968 if (dosolock) 969 sounlock_shared(so); 970 if (error) 971 return (error); 972 goto restart; 973 } 974 dontblock: 975 /* 976 * On entry here, m points to the first record of the socket buffer. 977 * From this point onward, we maintain 'nextrecord' as a cache of the 978 * pointer to the next record in the socket buffer. We must keep the 979 * various socket buffer pointers and local stack versions of the 980 * pointers in sync, pushing out modifications before operations that 981 * may sleep, and re-reading them afterwards. 982 * 983 * Otherwise, we will race with the network stack appending new data 984 * or records onto the socket buffer by using inconsistent/stale 985 * versions of the field, possibly resulting in socket buffer 986 * corruption. 
987 */ 988 if (uio->uio_procp) 989 uio->uio_procp->p_ru.ru_msgrcv++; 990 KASSERT(m == so->so_rcv.sb_mb); 991 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 992 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 993 nextrecord = m->m_nextpkt; 994 if (pr->pr_flags & PR_ADDR) { 995 #ifdef DIAGNOSTIC 996 if (m->m_type != MT_SONAME) 997 panic("receive 1a: so %p, so_type %d, m %p, m_type %d", 998 so, so->so_type, m, m->m_type); 999 #endif 1000 orig_resid = 0; 1001 if (flags & MSG_PEEK) { 1002 if (paddr) 1003 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT); 1004 m = m->m_next; 1005 } else { 1006 sbfree(so, &so->so_rcv, m); 1007 if (paddr) { 1008 *paddr = m; 1009 so->so_rcv.sb_mb = m->m_next; 1010 m->m_next = NULL; 1011 m = so->so_rcv.sb_mb; 1012 } else { 1013 so->so_rcv.sb_mb = m_free(m); 1014 m = so->so_rcv.sb_mb; 1015 } 1016 sbsync(&so->so_rcv, nextrecord); 1017 } 1018 } 1019 while (m && m->m_type == MT_CONTROL && error == 0) { 1020 int skip = 0; 1021 if (flags & MSG_PEEK) { 1022 if (mtod(m, struct cmsghdr *)->cmsg_type == 1023 SCM_RIGHTS) { 1024 /* don't leak internalized SCM_RIGHTS msgs */ 1025 skip = 1; 1026 } else if (controlp) 1027 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); 1028 m = m->m_next; 1029 } else { 1030 sbfree(so, &so->so_rcv, m); 1031 so->so_rcv.sb_mb = m->m_next; 1032 m->m_nextpkt = m->m_next = NULL; 1033 cm = m; 1034 m = so->so_rcv.sb_mb; 1035 sbsync(&so->so_rcv, nextrecord); 1036 if (controlp) { 1037 if (pr->pr_domain->dom_externalize) { 1038 sb_mtx_unlock(&so->so_rcv); 1039 if (dosolock) 1040 sounlock_shared(so); 1041 error = 1042 (*pr->pr_domain->dom_externalize) 1043 (cm, controllen, flags); 1044 if (dosolock) 1045 solock_shared(so); 1046 sb_mtx_lock(&so->so_rcv); 1047 } 1048 *controlp = cm; 1049 } else { 1050 /* 1051 * Dispose of any SCM_RIGHTS message that went 1052 * through the read path rather than recv. 1053 */ 1054 if (pr->pr_domain->dom_dispose) { 1055 sb_mtx_unlock(&so->so_rcv); 1056 pr->pr_domain->dom_dispose(cm); 1057 sb_mtx_lock(&so->so_rcv); 1058 } 1059 m_free(cm); 1060 } 1061 } 1062 if (m != NULL) 1063 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1064 else 1065 nextrecord = so->so_rcv.sb_mb; 1066 if (controlp && !skip) 1067 controlp = &(*controlp)->m_next; 1068 orig_resid = 0; 1069 } 1070 1071 /* If m is non-NULL, we have some data to read. */ 1072 if (m) { 1073 type = m->m_type; 1074 if (type == MT_OOBDATA) 1075 flags |= MSG_OOB; 1076 if (m->m_flags & M_BCAST) 1077 flags |= MSG_BCAST; 1078 if (m->m_flags & M_MCAST) 1079 flags |= MSG_MCAST; 1080 } 1081 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1082 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1083 1084 moff = 0; 1085 offset = 0; 1086 while (m && uio->uio_resid > 0 && error == 0) { 1087 if (m->m_type == MT_OOBDATA) { 1088 if (type != MT_OOBDATA) 1089 break; 1090 } else if (type == MT_OOBDATA) { 1091 break; 1092 } else if (m->m_type == MT_CONTROL) { 1093 /* 1094 * If there is more than one control message in the 1095 * stream, we do a short read. Next can be received 1096 * or disposed by another system call. 1097 */ 1098 break; 1099 #ifdef DIAGNOSTIC 1100 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { 1101 panic("receive 3: so %p, so_type %d, m %p, m_type %d", 1102 so, so->so_type, m, m->m_type); 1103 #endif 1104 } 1105 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1106 len = uio->uio_resid; 1107 if (so->so_oobmark && len > so->so_oobmark - offset) 1108 len = so->so_oobmark - offset; 1109 if (len > m->m_len - moff) 1110 len = m->m_len - moff; 1111 /* 1112 * If mp is set, just pass back the mbufs. 
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
1234 */ 1235 so->so_rcv.sb_mb = nextrecord; 1236 if (so->so_rcv.sb_mb == NULL) { 1237 so->so_rcv.sb_mbtail = NULL; 1238 so->so_rcv.sb_lastrecord = NULL; 1239 } else if (nextrecord->m_nextpkt == NULL) 1240 so->so_rcv.sb_lastrecord = nextrecord; 1241 } 1242 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 1243 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 1244 if (pr->pr_flags & PR_WANTRCVD) { 1245 sb_mtx_unlock(&so->so_rcv); 1246 if (!dosolock) 1247 solock_shared(so); 1248 pru_rcvd(so); 1249 if (!dosolock) 1250 sounlock_shared(so); 1251 sb_mtx_lock(&so->so_rcv); 1252 } 1253 } 1254 if (orig_resid == uio->uio_resid && orig_resid && 1255 (flags & MSG_EOR) == 0 && 1256 (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { 1257 sb_mtx_unlock(&so->so_rcv); 1258 sbunlock(&so->so_rcv); 1259 goto restart; 1260 } 1261 1262 if (uio_error) 1263 error = uio_error; 1264 1265 if (flagsp) 1266 *flagsp |= flags; 1267 release: 1268 sb_mtx_unlock(&so->so_rcv); 1269 if (dosolock) 1270 sounlock_shared(so); 1271 sbunlock(&so->so_rcv); 1272 return (error); 1273 } 1274 1275 int 1276 soshutdown(struct socket *so, int how) 1277 { 1278 int error = 0; 1279 1280 switch (how) { 1281 case SHUT_RD: 1282 sorflush(so); 1283 break; 1284 case SHUT_RDWR: 1285 sorflush(so); 1286 /* FALLTHROUGH */ 1287 case SHUT_WR: 1288 solock(so); 1289 error = pru_shutdown(so); 1290 sounlock(so); 1291 break; 1292 default: 1293 error = EINVAL; 1294 break; 1295 } 1296 1297 return (error); 1298 } 1299 1300 void 1301 sorflush(struct socket *so) 1302 { 1303 struct sockbuf *sb = &so->so_rcv; 1304 struct mbuf *m; 1305 const struct protosw *pr = so->so_proto; 1306 int error; 1307 1308 error = sblock(sb, SBL_WAIT | SBL_NOINTR); 1309 /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ 1310 KASSERT(error == 0); 1311 1312 solock_shared(so); 1313 socantrcvmore(so); 1314 mtx_enter(&sb->sb_mtx); 1315 m = sb->sb_mb; 1316 memset(&sb->sb_startzero, 0, 1317 (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); 1318 sb->sb_timeo_nsecs = INFSLP; 1319 mtx_leave(&sb->sb_mtx); 1320 sounlock_shared(so); 1321 sbunlock(sb); 1322 1323 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1324 (*pr->pr_domain->dom_dispose)(m); 1325 m_purge(m); 1326 } 1327 1328 #ifdef SOCKET_SPLICE 1329 1330 #define so_splicelen so_sp->ssp_len 1331 #define so_splicemax so_sp->ssp_max 1332 #define so_idletv so_sp->ssp_idletv 1333 #define so_idleto so_sp->ssp_idleto 1334 #define so_splicetask so_sp->ssp_task 1335 1336 void 1337 sosplice_solock_pair(struct socket *so1, struct socket *so2) 1338 { 1339 NET_LOCK_SHARED(); 1340 1341 if (so1 == so2) 1342 rw_enter_write(&so1->so_lock); 1343 else if (so1 < so2) { 1344 rw_enter_write(&so1->so_lock); 1345 rw_enter_write(&so2->so_lock); 1346 } else { 1347 rw_enter_write(&so2->so_lock); 1348 rw_enter_write(&so1->so_lock); 1349 } 1350 } 1351 1352 void 1353 sosplice_sounlock_pair(struct socket *so1, struct socket *so2) 1354 { 1355 if (so1 == so2) 1356 rw_exit_write(&so1->so_lock); 1357 else if (so1 < so2) { 1358 rw_exit_write(&so2->so_lock); 1359 rw_exit_write(&so1->so_lock); 1360 } else { 1361 rw_exit_write(&so1->so_lock); 1362 rw_exit_write(&so2->so_lock); 1363 } 1364 1365 NET_UNLOCK_SHARED(); 1366 } 1367 1368 int 1369 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) 1370 { 1371 struct file *fp; 1372 struct socket *sosp; 1373 struct taskq *tq; 1374 int error = 0; 1375 1376 if ((so->so_proto->pr_flags & PR_SPLICE) == 0) 1377 return (EPROTONOSUPPORT); 1378 if (max && max < 0) 1379 return (EINVAL); 1380 if (tv && (tv->tv_sec < 0 || 
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		if (so->so_sp && so->so_sp->ssp_socket) {
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, 0);
			sorele(sosp);
		}
		sbunlock(&so->so_rcv);
		return (0);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	sosplice_solock_pair(so, sosp);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_flags(&so->so_idleto, soidle, so,
	    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	sosplice_sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);

	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}

	sbunlock(&so->so_rcv);
	FRELE(fp, curproc);
	return (0);

release:
	sosplice_sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);
	return (error);
}
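
/*
 * Illustrative userland sketch for the splicing set up above (editor's
 * addition; source_fd and drain_fd are assumed descriptors): relay data
 * from one TCP socket into another, with an optional byte limit and
 * idle timeout, then read back how many bytes were moved.
 *
 *	struct splice sp = { .sp_fd = drain_fd };
 *	off_t moved;
 *	socklen_t len = sizeof(moved);
 *
 *	sp.sp_max = 0;			(0: no byte limit)
 *	timerclear(&sp.sp_idle);	(0: no idle timeout)
 *	setsockopt(source_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *	...
 *	getsockopt(source_fd, SOL_SOCKET, SO_SPLICE, &moved, &len);
 *
 * Passing a plain int fd works too, and passing no option value (or
 * fd -1) dissolves an existing splice, matching the fd < 0 branch above.
 */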

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	sbassertlocked(&so->so_rcv);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0) {
		int readable;

		solock_shared(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		readable = soreadable(so);
		mtx_leave(&so->so_rcv.sb_mtx);
		if (readable)
			sorwakeup(so);
		sounlock_shared(so);
	}
	if ((freeing & SOSP_FREEING_WRITE) == 0) {
		solock_shared(sosp);
		if (sowriteable(sosp))
			sowwakeup(sosp);
		sounlock_shared(sosp);
	}
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		struct socket *sosp;

		WRITE_ONCE(so->so_error, ETIMEDOUT);
		sosp = soref(so->so_sp->ssp_socket);
		sounsplice(so, so->so_sp->ssp_socket, 0);
		sorele(sosp);
	}
	sbunlock(&so->so_rcv);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int doyield = 0;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			doyield = 1;
		somove(so, M_DONTWAIT);
	}
	sbunlock(&so->so_rcv);

	if (doyield) {
		/* Avoid user land starvation. */
		yield();
	}
}

/*
 * Move data from the receive buffer of the spliced source socket to
 * the send buffer of the drain socket.  Try to move as much as possible
 * in one big chunk.  This is a TCP-only implementation.
 * A return value of 0 means splicing has finished, 1 means continue.
 */
1583 */ 1584 int 1585 somove(struct socket *so, int wait) 1586 { 1587 struct socket *sosp = so->so_sp->ssp_socket; 1588 struct mbuf *m, **mp, *nextrecord; 1589 u_long len, off, oobmark; 1590 long space; 1591 int error = 0, maxreached = 0, unsplice = 0; 1592 unsigned int rcvstate; 1593 int sockdgram = ((so->so_proto->pr_flags & 1594 PR_WANTRCVD) == 0); 1595 1596 sbassertlocked(&so->so_rcv); 1597 1598 if (!sockdgram) { 1599 sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 1600 solock(so); 1601 } 1602 1603 mtx_enter(&so->so_rcv.sb_mtx); 1604 mtx_enter(&sosp->so_snd.sb_mtx); 1605 1606 nextpkt: 1607 if ((error = READ_ONCE(so->so_error))) 1608 goto release; 1609 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) { 1610 error = EPIPE; 1611 goto release; 1612 } 1613 1614 error = READ_ONCE(sosp->so_error); 1615 if (error) { 1616 if (error != ETIMEDOUT && error != EFBIG && error != ELOOP) 1617 goto release; 1618 error = 0; 1619 } 1620 if ((sosp->so_state & SS_ISCONNECTED) == 0) 1621 goto release; 1622 1623 /* Calculate how many bytes can be copied now. */ 1624 len = so->so_rcv.sb_datacc; 1625 if (so->so_splicemax) { 1626 KASSERT(so->so_splicelen < so->so_splicemax); 1627 if (so->so_splicemax <= so->so_splicelen + len) { 1628 len = so->so_splicemax - so->so_splicelen; 1629 maxreached = 1; 1630 } 1631 } 1632 space = sbspace_locked(sosp, &sosp->so_snd); 1633 if (so->so_oobmark && so->so_oobmark < len && 1634 so->so_oobmark < space + 1024) 1635 space += 1024; 1636 if (space <= 0) { 1637 maxreached = 0; 1638 goto release; 1639 } 1640 if (space < len) { 1641 maxreached = 0; 1642 if (space < sosp->so_snd.sb_lowat) 1643 goto release; 1644 len = space; 1645 } 1646 sosp->so_snd.sb_state |= SS_ISSENDING; 1647 1648 SBLASTRECORDCHK(&so->so_rcv, "somove 1"); 1649 SBLASTMBUFCHK(&so->so_rcv, "somove 1"); 1650 m = so->so_rcv.sb_mb; 1651 if (m == NULL) 1652 goto release; 1653 nextrecord = m->m_nextpkt; 1654 1655 /* Drop address and control information not used with splicing. */ 1656 if (so->so_proto->pr_flags & PR_ADDR) { 1657 #ifdef DIAGNOSTIC 1658 if (m->m_type != MT_SONAME) 1659 panic("somove soname: so %p, so_type %d, m %p, " 1660 "m_type %d", so, so->so_type, m, m->m_type); 1661 #endif 1662 m = m->m_next; 1663 } 1664 while (m && m->m_type == MT_CONTROL) 1665 m = m->m_next; 1666 if (m == NULL) { 1667 sbdroprecord(so, &so->so_rcv); 1668 if (so->so_proto->pr_flags & PR_WANTRCVD) { 1669 mtx_leave(&sosp->so_snd.sb_mtx); 1670 mtx_leave(&so->so_rcv.sb_mtx); 1671 pru_rcvd(so); 1672 mtx_enter(&so->so_rcv.sb_mtx); 1673 mtx_enter(&sosp->so_snd.sb_mtx); 1674 } 1675 goto nextpkt; 1676 } 1677 1678 /* 1679 * By splicing sockets connected to localhost, userland might create a 1680 * loop. Dissolve splicing with error if loop is detected by counter. 1681 * 1682 * If we deal with looped broadcast/multicast packet we bail out with 1683 * no error to suppress splice termination. 
1684 */ 1685 if ((m->m_flags & M_PKTHDR) && 1686 ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || 1687 ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { 1688 error = ELOOP; 1689 goto release; 1690 } 1691 1692 if (so->so_proto->pr_flags & PR_ATOMIC) { 1693 if ((m->m_flags & M_PKTHDR) == 0) 1694 panic("somove !PKTHDR: so %p, so_type %d, m %p, " 1695 "m_type %d", so, so->so_type, m, m->m_type); 1696 if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { 1697 error = EMSGSIZE; 1698 goto release; 1699 } 1700 if (len < m->m_pkthdr.len) 1701 goto release; 1702 if (m->m_pkthdr.len < len) { 1703 maxreached = 0; 1704 len = m->m_pkthdr.len; 1705 } 1706 /* 1707 * Throw away the name mbuf after it has been assured 1708 * that the whole first record can be processed. 1709 */ 1710 m = so->so_rcv.sb_mb; 1711 sbfree(so, &so->so_rcv, m); 1712 so->so_rcv.sb_mb = m_free(m); 1713 sbsync(&so->so_rcv, nextrecord); 1714 } 1715 /* 1716 * Throw away the control mbufs after it has been assured 1717 * that the whole first record can be processed. 1718 */ 1719 m = so->so_rcv.sb_mb; 1720 while (m && m->m_type == MT_CONTROL) { 1721 sbfree(so, &so->so_rcv, m); 1722 so->so_rcv.sb_mb = m_free(m); 1723 m = so->so_rcv.sb_mb; 1724 sbsync(&so->so_rcv, nextrecord); 1725 } 1726 1727 SBLASTRECORDCHK(&so->so_rcv, "somove 2"); 1728 SBLASTMBUFCHK(&so->so_rcv, "somove 2"); 1729 1730 /* Take at most len mbufs out of receive buffer. */ 1731 for (off = 0, mp = &m; off <= len && *mp; 1732 off += (*mp)->m_len, mp = &(*mp)->m_next) { 1733 u_long size = len - off; 1734 1735 #ifdef DIAGNOSTIC 1736 if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) 1737 panic("somove type: so %p, so_type %d, m %p, " 1738 "m_type %d", so, so->so_type, *mp, (*mp)->m_type); 1739 #endif 1740 if ((*mp)->m_len > size) { 1741 /* 1742 * Move only a partial mbuf at maximum splice length or 1743 * if the drain buffer is too small for this large mbuf. 1744 */ 1745 if (!maxreached && sosp->so_snd.sb_datacc > 0) { 1746 len -= size; 1747 break; 1748 } 1749 *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); 1750 if (*mp == NULL) { 1751 len -= size; 1752 break; 1753 } 1754 so->so_rcv.sb_mb->m_data += size; 1755 so->so_rcv.sb_mb->m_len -= size; 1756 so->so_rcv.sb_cc -= size; 1757 so->so_rcv.sb_datacc -= size; 1758 } else { 1759 *mp = so->so_rcv.sb_mb; 1760 sbfree(so, &so->so_rcv, *mp); 1761 so->so_rcv.sb_mb = (*mp)->m_next; 1762 sbsync(&so->so_rcv, nextrecord); 1763 } 1764 } 1765 *mp = NULL; 1766 1767 SBLASTRECORDCHK(&so->so_rcv, "somove 3"); 1768 SBLASTMBUFCHK(&so->so_rcv, "somove 3"); 1769 SBCHECK(so, &so->so_rcv); 1770 if (m == NULL) 1771 goto release; 1772 m->m_nextpkt = NULL; 1773 if (m->m_flags & M_PKTHDR) { 1774 m_resethdr(m); 1775 m->m_pkthdr.len = len; 1776 } 1777 1778 /* Send window update to source peer as receive buffer has changed. */ 1779 if (so->so_proto->pr_flags & PR_WANTRCVD) { 1780 mtx_leave(&sosp->so_snd.sb_mtx); 1781 mtx_leave(&so->so_rcv.sb_mtx); 1782 pru_rcvd(so); 1783 mtx_enter(&so->so_rcv.sb_mtx); 1784 mtx_enter(&sosp->so_snd.sb_mtx); 1785 } 1786 1787 /* Receive buffer did shrink by len bytes, adjust oob. */ 1788 rcvstate = so->so_rcv.sb_state; 1789 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1790 oobmark = so->so_oobmark; 1791 so->so_oobmark = oobmark > len ? oobmark - len : 0; 1792 if (oobmark) { 1793 if (oobmark == len) 1794 so->so_rcv.sb_state |= SS_RCVATMARK; 1795 if (oobmark >= len) 1796 oobmark = 0; 1797 } 1798 1799 /* 1800 * Handle oob data. If any malloc fails, ignore error. 1801 * TCP urgent data is not very reliable anyway. 
1802 */ 1803 while (((rcvstate & SS_RCVATMARK) || oobmark) && 1804 (so->so_options & SO_OOBINLINE)) { 1805 struct mbuf *o = NULL; 1806 1807 if (rcvstate & SS_RCVATMARK) { 1808 o = m_get(wait, MT_DATA); 1809 rcvstate &= ~SS_RCVATMARK; 1810 } else if (oobmark) { 1811 o = m_split(m, oobmark, wait); 1812 if (o) { 1813 mtx_leave(&sosp->so_snd.sb_mtx); 1814 mtx_leave(&so->so_rcv.sb_mtx); 1815 error = pru_send(sosp, m, NULL, NULL); 1816 mtx_enter(&so->so_rcv.sb_mtx); 1817 mtx_enter(&sosp->so_snd.sb_mtx); 1818 1819 if (error) { 1820 if (sosp->so_snd.sb_state & 1821 SS_CANTSENDMORE) 1822 error = EPIPE; 1823 m_freem(o); 1824 goto release; 1825 } 1826 len -= oobmark; 1827 so->so_splicelen += oobmark; 1828 m = o; 1829 o = m_get(wait, MT_DATA); 1830 } 1831 oobmark = 0; 1832 } 1833 if (o) { 1834 o->m_len = 1; 1835 *mtod(o, caddr_t) = *mtod(m, caddr_t); 1836 1837 mtx_leave(&sosp->so_snd.sb_mtx); 1838 mtx_leave(&so->so_rcv.sb_mtx); 1839 error = pru_sendoob(sosp, o, NULL, NULL); 1840 mtx_enter(&so->so_rcv.sb_mtx); 1841 mtx_enter(&sosp->so_snd.sb_mtx); 1842 1843 if (error) { 1844 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1845 error = EPIPE; 1846 m_freem(m); 1847 goto release; 1848 } 1849 len -= 1; 1850 so->so_splicelen += 1; 1851 if (oobmark) { 1852 oobmark -= 1; 1853 if (oobmark == 0) 1854 rcvstate |= SS_RCVATMARK; 1855 } 1856 m_adj(m, 1); 1857 } 1858 } 1859 1860 /* Append all remaining data to drain socket. */ 1861 if (so->so_rcv.sb_cc == 0 || maxreached) 1862 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1863 1864 mtx_leave(&sosp->so_snd.sb_mtx); 1865 mtx_leave(&so->so_rcv.sb_mtx); 1866 1867 if (sockdgram) 1868 solock_shared(sosp); 1869 error = pru_send(sosp, m, NULL, NULL); 1870 if (sockdgram) 1871 sounlock_shared(sosp); 1872 1873 mtx_enter(&so->so_rcv.sb_mtx); 1874 mtx_enter(&sosp->so_snd.sb_mtx); 1875 1876 if (error) { 1877 if (sosp->so_snd.sb_state & SS_CANTSENDMORE || 1878 sosp->so_pcb == NULL) 1879 error = EPIPE; 1880 goto release; 1881 } 1882 so->so_splicelen += len; 1883 1884 /* Move several packets if possible. 
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (!sockdgram) {
		sbunlock(&so->so_snd);
		sounlock(so);
	}

	if (unsplice) {
		soref(sosp);
		sounsplice(so, sosp, 0);
		sorele(sosp);

		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_rcv);
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq, &so->so_splicetask);
		if (isspliced(so)) {
			sb_mtx_unlock(&so->so_rcv);
			return;
		}
		sb_mtx_unlock(&so->so_rcv);
	}
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_snd);
		if (so->so_snd.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq,
			    &so->so_sp->ssp_soback->so_splicetask);
		if (issplicedback(so)) {
			sb_mtx_unlock(&so->so_snd);
			return;
		}
		sb_mtx_unlock(&so->so_snd);
	}
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
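
/*
 * Illustrative sketch of the SO_RCVTIMEO conversion handled above
 * (editor's addition; assumed userland caller): a 5 second receive
 * timeout ends up in sb_timeo_nsecs, a zeroed timeval means "no
 * timeout" (INFSLP), and an out-of-range value is rejected with EDOM.
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */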
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);
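
		/*
		 * For SO_SPLICE, sogetopt() reports how many bytes have
		 * been moved over the splice so far.  Illustrative
		 * userland sketch ("from" is an assumed spliced socket):
		 *
		 *	off_t moved;
		 *	socklen_t optlen = sizeof(moved);
		 *
		 *	if (getsockopt(from, SOL_SOCKET, SO_SPLICE, &moved,
		 *	    &optlen) == 0)
		 *		printf("%lld bytes spliced\n", (long long)moved);
		 */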

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}
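
/*
 * The filterops above are wired up by kevent(2).  Illustrative userland
 * registration ("kq" and "s" are assumed descriptors; error handling
 * omitted):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * EVFILT_READ and EVFILT_EXCEPT knotes hang off so_rcv.sb_klist,
 * EVFILT_WRITE knotes off so_snd.sb_klist, matching soo_kqfilter().
 */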

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		short qlen = READ_ONCE(so->so_qlen);

		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = qlen || soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (((state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		u_int state = READ_ONCE(so->so_state);

		if (state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}
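
/*
 * filt_soread() and filt_sowrite() honor NOTE_LOWAT: if the filter was
 * registered with it, kn_sdata replaces the socket buffer's sb_lowat as
 * the activation threshold.  Illustrative registration that fires only
 * once at least 512 bytes are queued ("kev" and "s" as above):
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 */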

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif