/*	$OpenBSD: uipc_socket.c,v 1.334 2024/05/17 19:02:04 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);
void	sorflush_locked(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}
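
/*
 * Allocate a new socket and initialize its locks, buffers and queues.
 * Buffers of sockets whose protocol can run socket buffer operations
 * under the per-buffer sb_mtx mutex alone are marked SB_MTXLOCK here.
 */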
struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_RAW:
			so->so_snd.sb_flags |= SB_MTXLOCK;
			/* FALLTHROUGH */
		case SOCK_DGRAM:
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_KEY:
	case AF_UNIX:
		so->so_snd.sb_flags |= SB_MTXLOCK;
		so->so_rcv.sb_flags |= SB_MTXLOCK;
		break;
	}

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = READ_ONCE(somaxconn);
	int sominconn_local = READ_ONCE(sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
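
/*
 * Example of the backlog clamping above, assuming the default tunables
 * somaxconn = SOMAXCONN (128) and sominconn = SOMINCONN (80):
 * listen(s, 5) ends up with so_qlimit = 80, listen(s, 1000) with
 * so_qlimit = 128.
 */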

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
#ifdef SOCKET_SPLICE
	if (issplicedback(so)) {
		int freeing = SOSP_FREEING_WRITE;

		if (so->so_sp->ssp_soback == so)
			freeing |= SOSP_FREEING_READ;
		sounsplice(so->so_sp->ssp_soback, so, freeing);
	}
	if (isspliced(so)) {
		int freeing = SOSP_FREEING_READ;

		if (so == so->so_sp->ssp_socket)
			freeing |= SOSP_FREEING_WRITE;
		sounsplice(so, so->so_sp->ssp_socket, freeing);
	}
#endif /* SOCKET_SPLICE */

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	/*
	 * Unlocked dispose and cleanup is safe.  Socket is unlinked
	 * from everywhere.  Even concurrent sotask() thread will not
	 * call somove().
	 */
	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

	if (!keep_lock)
		sounlock(so);

#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}
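
/*
 * Connect two sockets directly to each other, as done, e.g., by
 * socketpair(2).
 */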
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	if (dosolock)
		solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);

			if (so->so_snd.sb_flags & SB_MTXLOCK)
				error = sbwait_locked(so, &so->so_snd);
			else
				error = sbwait(so, &so->so_snd);

			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	sbunlock(so, &so->so_snd);
out:
	if (dosolock)
		sounlock_shared(so);
	m_freem(top);
	m_freem(control);
	return (error);
}
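
/*
 * Copy up to `space' bytes from the uio into a newly allocated mbuf
 * chain, using clusters for larger amounts of data.  For atomic
 * protocols, room for protocol headers is left in front of the first
 * mbuf.
 */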
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the callers locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	if (dosolock)
		solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0)
		goto out;
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		if (so->so_rcv.sb_flags & SB_MTXLOCK) {
			sbunlock_locked(so, &so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			error = sbwait_locked(so, &so->so_rcv);
			sb_mtx_unlock(&so->so_rcv);
			if (error)
				return (error);
			if (dosolock)
				solock_shared(so);
		} else {
			sb_mtx_unlock(&so->so_rcv);
			sbunlock(so, &so->so_rcv);
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sounlock_shared(so);
				return (error);
			}
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (dosolock) {
				sb_mtx_unlock(&so->so_rcv);
				error = sbwait(so, &so->so_rcv);
				if (error) {
					sbunlock(so, &so->so_rcv);
					sounlock_shared(so);
					return (0);
				}
				sb_mtx_lock(&so->so_rcv);
			} else {
				if (sbwait_locked(so, &so->so_rcv)) {
					sb_mtx_unlock(&so->so_rcv);
					sbunlock(so, &so->so_rcv);
					return (0);
				}
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	sbunlock(so, &so->so_rcv);
out:
	if (dosolock)
		sounlock_shared(so);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush_locked(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	if ((sb->sb_flags & SB_MTXLOCK) == 0)
		soassertlocked(so);

	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	if (sb->sb_flags & SB_MTXLOCK)
		solock(so);
	socantrcvmore(so);
	if (sb->sb_flags & SB_MTXLOCK)
		sounlock(so);

	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

void
sorflush(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		solock_shared(so);
	sorflush_locked(so);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		sounlock_shared(so);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task
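
/*
 * sosplice() implements the SO_SPLICE socket option.  A userland
 * sketch, splicing everything arriving on sd into sdto until the
 * splice has been idle for one second (the fd/max/idle handling
 * matches the SO_SPLICE case in sosetopt() below):
 *
 *	struct splice sp = { .sp_fd = sdto, .sp_max = 0,
 *	    .sp_idle = { 1, 0 } };
 *	setsockopt(sd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 */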
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp = NULL;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
	} else {
		solock(so);
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
			sounlock(so);
			return (error);
		}
	}

	if (so->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto out;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto out;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		goto out;
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		goto out;
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);

	if ((error = sblock(sosp, &sosp->so_snd, SBL_WAIT)) != 0) {
		goto out;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&so->so_rcv.sb_mtx);
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		mtx_leave(&so->so_rcv.sb_mtx);
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
 out:
	if (so->so_rcv.sb_flags & SB_MTXLOCK) {
		sounlock(so);
		sbunlock(so, &so->so_rcv);
	} else {
		sbunlock(so, &so->so_rcv);
		sounlock(so);
	}

	if (fp)
		FRELE(fp, curproc);

	return (error);
}
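
/*
 * Dissolve the splice between source and drain socket.  `freeing'
 * tells which side is about to be freed, so that side is not woken
 * up.
 */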
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int rcvstate;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	mtx_enter(&so->so_rcv.sb_mtx);
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}
	mtx_leave(&so->so_rcv.sb_mtx);

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;
	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}
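
/*
 * Set socket options.  Only SOL_SOCKET level options are handled here;
 * everything else is passed down to the protocol via pr_ctloutput().
 * Example (sketch): a zero timeval clears a timeout, i.e. the buffer's
 * sb_timeo_nsecs becomes INFSLP:
 *
 *	struct timeval tv = { 0, 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */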
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);
			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}
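
/*
 * Out-of-band data has arrived: post SIGURG through the socket's sigio
 * registration and wake any kevent(2)/poll(2) waiters on the receive
 * buffer.
 */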
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

/*
 * Lock a socket for event filter handling: take the shared netlock for
 * inet sockets, the per-socket rwlock otherwise, then the buffer mutex.
 * sofilt_unlock() releases them in reverse order.
 */
void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

/*
 * Attach a knote to the socket buffer matching the requested filter.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		/* Listening sockets hold the socket lock in any case. */
		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = so->so_qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (so->so_state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}
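
/*
 * EVFILT_WRITE: detach and filter routines for the send buffer.  The
 * filter reports the socket writable once sbspace() reaches the send
 * low water mark (or the caller's NOTE_LOWAT threshold), and fires
 * immediately on a pending error or when the send side is shut down;
 * connection-oriented sockets are never writable before they are
 * connected.
 */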
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

/*
 * The modify/process wrappers take the socket and buffer locks so the
 * f_event handlers above run with their locking assertions satisfied.
 */
int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}
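
/*
 * ddb(4) helpers: pretty-print a struct sockbuf and a struct socket
 * from the kernel debugger.
 */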
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif