/*	$OpenBSD: uipc_socket.c,v 1.337 2024/07/12 17:20:18 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}
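
/*
 * Allocate and minimally initialize a new socket: set up its locks,
 * sockbuf klists and accept queues.  Protocols that can run with
 * per-sockbuf locking get SB_MTXLOCK set on both buffers here.
 */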
struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_RAW:
		case SOCK_DGRAM:
			so->so_snd.sb_flags |= SB_MTXLOCK;
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_KEY:
	case AF_ROUTE:
	case AF_UNIX:
		so->so_snd.sb_flags |= SB_MTXLOCK;
		so->so_rcv.sb_flags |= SB_MTXLOCK;
		break;
	}

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
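
/*
 * Bind a name to a socket.  The protocol layer does the actual work;
 * the socket must be locked by the caller.
 */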
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = READ_ONCE(somaxconn);
	int sominconn_local = READ_ONCE(sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
#ifdef SOCKET_SPLICE
	if (issplicedback(so)) {
		int freeing = SOSP_FREEING_WRITE;

		if (so->so_sp->ssp_soback == so)
			freeing |= SOSP_FREEING_READ;
		sounsplice(so->so_sp->ssp_soback, so, freeing);
	}
	if (isspliced(so)) {
		int freeing = SOSP_FREEING_READ;

		if (so == so->so_sp->ssp_socket)
			freeing |= SOSP_FREEING_WRITE;
		sounsplice(so, so->so_sp->ssp_socket, freeing);
	}
#endif /* SOCKET_SPLICE */

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	/*
	 * Unlocked dispose and cleanup is safe.  Socket is unlinked
	 * from everywhere.  Even concurrent sotask() thread will not
	 * call somove().
	 */
	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

	if (!keep_lock)
		sounlock(so);

#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}
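
/*
 * Convert the SO_LINGER interval to nanoseconds for sosleep_nsec().
 * A linger time of 0 means wait forever (INFSLP).
 */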
static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}
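
/*
 * Initiate a connection to the address in `nam'.  Listening sockets
 * cannot connect; connection-based protocols may connect only once.
 */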
int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
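
/*
 * Copy up to `space' bytes of user data from `uio' into a fresh mbuf
 * chain, using clusters for larger amounts, and hand the chain back
 * via `mp'.  For atomic (datagram) protocols room for protocol
 * headers is left in front of the first mbuf.
 */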
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * To avoid blocking the network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_rcv);
	return (error);
}
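
/*
 * Shut down part of a full-duplex connection: SHUT_RD flushes and
 * disables the receive side, SHUT_WR passes the shutdown to the
 * protocol, SHUT_RDWR does both.
 */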
int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task
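
/*
 * Splice the receive buffer of socket `so' onto the send buffer of the
 * socket behind file descriptor `fd'; somove() then shovels the data
 * in the kernel without a detour through userland.  A negative `fd'
 * dissolves an existing splice.  `max' limits the number of spliced
 * bytes, 0 means no limit; `tv' is an optional idle timeout.  From
 * userland this is reached via setsockopt(2), e.g. (sketch):
 *
 *	struct splice sp = { .sp_fd = drain_fd, .sp_max = 0 };
 *	timerclear(&sp.sp_idle);
 *	setsockopt(source_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 */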
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
		if (so->so_options & SO_ACCEPTCONN) {
			error = EOPNOTSUPP;
			goto out;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto out;
		}

		if (so->so_sp && so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
out:
		sounlock(so);
		sbunlock(&so->so_rcv);
		return (error);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket into which data will be spliced. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock(so);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}

release:
	sounlock(so);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);

	return (error);
}
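
/*
 * Dissolve a splice: stop the task and idle timeout, clear the
 * SB_SPLICE flags and break the ssp_socket/ssp_soback link.  `freeing'
 * suppresses wakeups on the side that is about to be freed.
 */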
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int rcvstate;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	mtx_enter(&sosp->so_snd.sb_mtx);
	space = sbspace_locked(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat) {
			mtx_leave(&sosp->so_snd.sb_mtx);
			goto release;
		}
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;
	mtx_leave(&sosp->so_snd.sb_mtx);

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if a loop is detected by the
	 * counter.
	 *
	 * If we deal with a looped broadcast/multicast packet we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	mtx_enter(&so->so_rcv.sb_mtx);
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}
	mtx_leave(&so->so_rcv.sb_mtx);

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	mtx_enter(&sosp->so_snd.sb_mtx);
	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;
	mtx_leave(&sosp->so_snd.sb_mtx);

	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	mtx_enter(&sosp->so_snd.sb_mtx);
	sosp->so_snd.sb_state &= ~SS_ISSENDING;
	mtx_leave(&sosp->so_snd.sb_mtx);

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */
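
/*
 * Called after data has been appended to the receive buffer: queue the
 * splice task or move the data directly for spliced sockets, otherwise
 * wake up readers and run the socket upcall.
 */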
void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
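
/*
 * Get a socket option.  Options at SOL_SOCKET level are handled here,
 * everything else is passed down to the protocol's ctloutput handler.
 */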
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}
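
/*
 * Notify a process group that urgent data has arrived: raise SIGURG
 * and post an event on the receive klist.
 */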

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}
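
/*
 * Usage sketch (userland, illustrative only; not part of the kernel
 * sources): sohasoutofband() above raises SIGURG through the socket's
 * sigio structure and wakes kevent(2)/poll(2) waiters.  A process only
 * receives the signal after claiming ownership of the socket; the
 * handler name here is hypothetical:
 *
 *	signal(SIGURG, handle_urg);
 *	fcntl(s, F_SETOWN, getpid());
 *	...
 *	char c;
 *	recv(s, &c, 1, MSG_OOB);	(fetch the out-of-band byte)
 */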

/*
 * Lock the socket for a kevent filter: take the shared net lock for
 * inet sockets or the per-socket rwlock for the rest, then the sockbuf
 * mutex on top of it.  sofilt_unlock() releases in reverse order.
 */
void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = so->so_qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (so->so_state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}
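
/*
 * Usage sketch (userland, illustrative only; not part of the kernel
 * sources): the filters above back kevent(2) on sockets.  Registering
 * a read event that fires only once at least 512 bytes are buffered
 * exercises the NOTE_LOWAT branch of filt_soread(); "kq" and "s" are
 * assumed to exist:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */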

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
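
/*
 * Debugging sketch (illustrative only; not part of the kernel
 * sources): in a DDB kernel the so_print() routine above can be
 * driven from the debugger, presumably via the ddb(4) "show socket"
 * command, to inspect a socket by its address:
 *
 *	ddb> show socket 0xffffff0012345678
 */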