1 /* $OpenBSD: uipc_socket.c,v 1.330 2024/04/15 21:31:29 mvs Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

/* Forward declarations of routines used before their definition. */
void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);
void	sorflush_locked(struct socket *);

/* Callbacks referenced by the filterops tables below. */
void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

/*
 * Filter table built from the read-side callbacks: filt_soread() tests
 * the event, filt_sordetach() detaches, and filt_sormodify() /
 * filt_sorprocess() handle modify and process.  No attach hook is set.
 */
const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

/*
 * Filter table built from the write-side callbacks: filt_sowrite() tests
 * the event, filt_sowdetach() detaches, and filt_sowmodify() /
 * filt_sowprocess() handle modify and process.  No attach hook is set.
 */
const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};
100 101 const struct filterops soexcept_filtops = { 102 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 103 .f_attach = NULL, 104 .f_detach = filt_sordetach, 105 .f_event = filt_soexcept, 106 .f_modify = filt_sormodify, 107 .f_process = filt_sorprocess, 108 }; 109 110 #ifndef SOMINCONN 111 #define SOMINCONN 80 112 #endif /* SOMINCONN */ 113 114 int somaxconn = SOMAXCONN; 115 int sominconn = SOMINCONN; 116 117 struct pool socket_pool; 118 #ifdef SOCKET_SPLICE 119 struct pool sosplice_pool; 120 struct taskq *sosplice_taskq; 121 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk"); 122 #endif 123 124 void 125 soinit(void) 126 { 127 pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0, 128 "sockpl", NULL); 129 #ifdef SOCKET_SPLICE 130 pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0, 131 "sosppl", NULL); 132 #endif 133 } 134 135 struct socket * 136 soalloc(const struct protosw *prp, int wait) 137 { 138 const struct domain *dp = prp->pr_domain; 139 struct socket *so; 140 141 so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) | 142 PR_ZERO); 143 if (so == NULL) 144 return (NULL); 145 rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK); 146 refcnt_init(&so->so_refcnt); 147 rw_init(&so->so_rcv.sb_lock, "sbufrcv"); 148 rw_init(&so->so_snd.sb_lock, "sbufsnd"); 149 mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR); 150 mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR); 151 klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx); 152 klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx); 153 sigio_init(&so->so_sigio); 154 TAILQ_INIT(&so->so_q0); 155 TAILQ_INIT(&so->so_q); 156 157 switch (dp->dom_family) { 158 case AF_INET: 159 case AF_INET6: 160 switch (prp->pr_type) { 161 case SOCK_DGRAM: 162 case SOCK_RAW: 163 so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; 164 break; 165 } 166 break; 167 case AF_UNIX: 168 so->so_rcv.sb_flags |= SB_MTXLOCK; 169 break; 170 } 171 172 return (so); 173 } 174 175 /* 176 * Socket operation routines. 
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Create a socket and attach it to its protocol.
 *
 * dom, type, proto select the protocol switch entry: pffindproto() is
 * used when proto is non-zero, pffindtype() otherwise.  On success the
 * new, unlocked socket is stored in *aso and 0 is returned.
 *
 * Errors: EPROTONOSUPPORT if no entry is found or it has no pr_usrreqs,
 * EPROTOTYPE if the entry's type differs from `type', or whatever
 * pru_attach() returns (the socket is released via sofree() then).
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	/* Record the creating process' credentials and pid. */
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

/*
 * Bind `so' to the address in `nam' by calling pru_bind().
 * The socket lock must be held by the caller.
 */
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

/*
 * Put a SOCK_STREAM or SOCK_SEQPACKET socket into the listening state.
 * The socket lock must be held by the caller.
 *
 * `backlog' is clamped: negative values and values above somaxconn
 * become somaxconn, and the result is raised to at least sominconn.
 *
 * Returns 0, EOPNOTSUPP (wrong socket type or spliced socket), EINVAL
 * (socket is connected, connecting or disconnecting) or the error from
 * pru_listen().
 */
int
solisten(struct socket *so, int backlog)
{
	/* Snapshot the tunables once so both clamps see consistent values. */
	int somaxconn_local = READ_ONCE(somaxconn);
	int sominconn_local = READ_ONCE(sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	/* Only flag as accepting when the completed queue is empty. */
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}

/* Flags passed to sounsplice() to say which side is being freed. */
#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2

/*
 * Release a socket once it has neither a protocol control block nor a
 * file descriptor reference.  Must be called with the socket locked.
 *
 * If the socket still has a pcb, still has a file reference
 * (SS_NOFDREF clear) or sits on its head's completed accept queue,
 * nothing is freed.  Otherwise it is removed from the head's incomplete
 * queue, unspliced, its buffers are released and the memory is returned
 * to socket_pool (or handed to soreaper() when splice state exists).
 *
 * keep_lock: when non-zero the early-return paths that test it leave
 * the socket locked, and the final sounlock() is skipped.
 */
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			/*
			 * `so' left the incomplete queue while unlocked.
			 * Both locks are dropped here without consulting
			 * keep_lock.
			 */
			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		/* Drop the lock while waiting for the last reference. */
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);
#ifdef SOCKET_SPLICE
	if (issplicedback(so)) {
		int freeing = SOSP_FREEING_WRITE;

		if (so->so_sp->ssp_soback == so)
			freeing |= SOSP_FREEING_READ;
		sounsplice(so->so_sp->ssp_soback, so, freeing);
	}
	if (isspliced(so)) {
		int freeing = SOSP_FREEING_READ;

		if (so == so->so_sp->ssp_socket)
			freeing |= SOSP_FREEING_WRITE;
		sounsplice(so, so->so_sp->ssp_socket, freeing);
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);

	/*
	 * Despite the '_locked' suffix, solock() must be released before
	 * calling sorflush_locked() on a socket marked SB_OWNLOCK.  We
	 * can't simply release solock() and call sorflush(), because
	 * releasing solock() is unwanted for tcp(4) sockets.
	 */

	if (so->so_rcv.sb_flags & SB_OWNLOCK)
		sounlock(so);

	sorflush_locked(so);

	/* SB_OWNLOCK sockets were already unlocked above. */
	if (!((so->so_rcv.sb_flags & SB_OWNLOCK) || keep_lock))
		sounlock(so);

#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

/*
 * Convert so_linger (seconds) into a sleep timeout in nanoseconds.
 * A linger value of 0 maps to INFSLP.
 */
static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * `flags' is only examined for MSG_DONTWAIT, which skips the SO_LINGER
 * wait when a disconnect is already in progress.  Returns the first
 * error seen from sodisconnect(), the linger sleep or pru_detach().
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			/* Sleep until disconnected, interrupted or timed out. */
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		/* Keep the earlier error if there was one. */
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		/*
		 * Abort every queued connection, incomplete queue first.
		 * With per-socket locks the listener is unlocked around
		 * soabort() and re-locked afterwards.
		 */
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

/*
 * Abort the connection by calling pru_abort().
 * The socket lock must be held by the caller.
 */
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

/*
 * Accept a queued connection: clear SS_NOFDREF on `so' and call
 * pru_accept() with `nam'.  The socket lock must be held by the caller.
 *
 * Panics if SS_NOFDREF is not set on entry.  Returns ECONNABORTED,
 * without calling pru_accept(), when the socket is already disconnected
 * and its protocol has PR_ABRTACPTDIS set.
 */
int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

/*
 * Connect `so' to the address in `nam'.
 * The socket lock must be held by the caller.
 *
 * Returns EOPNOTSUPP for a listening socket, EISCONN when an existing
 * connection cannot be replaced, otherwise the result of pru_connect().
 */
int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

/*
 * Connect two sockets to each other via pru_connect2().
 * Takes the locks itself: both sockets when so1 uses per-socket
 * locking, otherwise only so1.  All locks taken are released on return.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

/*
 * Start disconnecting `so' via pru_disconnect().
 * The socket lock must be held by the caller.
 *
 * Returns ENOTCONN if the socket is not connected, EALREADY if a
 * disconnect is already in progress, else the pru_disconnect() result.
 */
int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

/* Map MSG_DONTWAIT in `f' to the sblock() wait argument. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
563 */ 564 int 565 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 566 struct mbuf *control, int flags) 567 { 568 long space, clen = 0; 569 size_t resid; 570 int error; 571 int atomic = sosendallatonce(so) || top; 572 573 if (uio) 574 resid = uio->uio_resid; 575 else 576 resid = top->m_pkthdr.len; 577 /* MSG_EOR on a SOCK_STREAM socket is invalid. */ 578 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 579 m_freem(top); 580 m_freem(control); 581 return (EINVAL); 582 } 583 if (uio && uio->uio_procp) 584 uio->uio_procp->p_ru.ru_msgsnd++; 585 if (control) { 586 /* 587 * In theory clen should be unsigned (since control->m_len is). 588 * However, space must be signed, as it might be less than 0 589 * if we over-committed, and we must use a signed comparison 590 * of space and clen. 591 */ 592 clen = control->m_len; 593 /* reserve extra space for AF_UNIX's internalize */ 594 if (so->so_proto->pr_domain->dom_family == AF_UNIX && 595 clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) && 596 mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) 597 clen = CMSG_SPACE( 598 (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) * 599 (sizeof(struct fdpass) / sizeof(int))); 600 } 601 602 #define snderr(errno) { error = errno; goto release; } 603 604 solock_shared(so); 605 restart: 606 if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) 607 goto out; 608 so->so_snd.sb_state |= SS_ISSENDING; 609 do { 610 if (so->so_snd.sb_state & SS_CANTSENDMORE) 611 snderr(EPIPE); 612 if (so->so_error) { 613 error = so->so_error; 614 so->so_error = 0; 615 snderr(error); 616 } 617 if ((so->so_state & SS_ISCONNECTED) == 0) { 618 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 619 if (!(resid == 0 && clen != 0)) 620 snderr(ENOTCONN); 621 } else if (addr == NULL) 622 snderr(EDESTADDRREQ); 623 } 624 space = sbspace(so, &so->so_snd); 625 if (flags & MSG_OOB) 626 space += 1024; 627 if (so->so_proto->pr_domain->dom_family == AF_UNIX) { 628 if (atomic && resid > 
so->so_snd.sb_hiwat) 629 snderr(EMSGSIZE); 630 } else { 631 if (clen > so->so_snd.sb_hiwat || 632 (atomic && resid > so->so_snd.sb_hiwat - clen)) 633 snderr(EMSGSIZE); 634 } 635 if (space < clen || 636 (space - clen < resid && 637 (atomic || space < so->so_snd.sb_lowat))) { 638 if (flags & MSG_DONTWAIT) 639 snderr(EWOULDBLOCK); 640 sbunlock(so, &so->so_snd); 641 error = sbwait(so, &so->so_snd); 642 so->so_snd.sb_state &= ~SS_ISSENDING; 643 if (error) 644 goto out; 645 goto restart; 646 } 647 space -= clen; 648 do { 649 if (uio == NULL) { 650 /* 651 * Data is prepackaged in "top". 652 */ 653 resid = 0; 654 if (flags & MSG_EOR) 655 top->m_flags |= M_EOR; 656 } else { 657 sounlock_shared(so); 658 error = m_getuio(&top, atomic, space, uio); 659 solock_shared(so); 660 if (error) 661 goto release; 662 space -= top->m_pkthdr.len; 663 resid = uio->uio_resid; 664 if (flags & MSG_EOR) 665 top->m_flags |= M_EOR; 666 } 667 if (resid == 0) 668 so->so_snd.sb_state &= ~SS_ISSENDING; 669 if (top && so->so_options & SO_ZEROIZE) 670 top->m_flags |= M_ZEROIZE; 671 if (flags & MSG_OOB) 672 error = pru_sendoob(so, top, addr, control); 673 else 674 error = pru_send(so, top, addr, control); 675 clen = 0; 676 control = NULL; 677 top = NULL; 678 if (error) 679 goto release; 680 } while (resid && space > 0); 681 } while (resid); 682 683 release: 684 so->so_snd.sb_state &= ~SS_ISSENDING; 685 sbunlock(so, &so->so_snd); 686 out: 687 sounlock_shared(so); 688 m_freem(top); 689 m_freem(control); 690 return (error); 691 } 692 693 int 694 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio) 695 { 696 struct mbuf *m, *top = NULL; 697 struct mbuf **nextp = ⊤ 698 u_long len, mlen; 699 size_t resid = uio->uio_resid; 700 int error; 701 702 do { 703 if (top == NULL) { 704 MGETHDR(m, M_WAIT, MT_DATA); 705 mlen = MHLEN; 706 m->m_pkthdr.len = 0; 707 m->m_pkthdr.ph_ifidx = 0; 708 } else { 709 MGET(m, M_WAIT, MT_DATA); 710 mlen = MLEN; 711 } 712 /* chain mbuf together */ 713 *nextp = m; 714 
nextp = &m->m_next; 715 716 resid = ulmin(resid, space); 717 if (resid >= MINCLSIZE) { 718 MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES)); 719 if ((m->m_flags & M_EXT) == 0) 720 MCLGETL(m, M_NOWAIT, MCLBYTES); 721 if ((m->m_flags & M_EXT) == 0) 722 goto nopages; 723 mlen = m->m_ext.ext_size; 724 len = ulmin(mlen, resid); 725 /* 726 * For datagram protocols, leave room 727 * for protocol headers in first mbuf. 728 */ 729 if (atomic && m == top && len < mlen - max_hdr) 730 m->m_data += max_hdr; 731 } else { 732 nopages: 733 len = ulmin(mlen, resid); 734 /* 735 * For datagram protocols, leave room 736 * for protocol headers in first mbuf. 737 */ 738 if (atomic && m == top && len < mlen - max_hdr) 739 m_align(m, len); 740 } 741 742 error = uiomove(mtod(m, caddr_t), len, uio); 743 if (error) { 744 m_freem(top); 745 return (error); 746 } 747 748 /* adjust counters */ 749 resid = uio->uio_resid; 750 space -= len; 751 m->m_len = len; 752 top->m_pkthdr.len += len; 753 754 /* Is there more space and more data? */ 755 } while (space > 0 && resid > 0); 756 757 *mp = top; 758 return 0; 759 } 760 761 /* 762 * Following replacement or removal of the first mbuf on the first 763 * mbuf chain of a socket buffer, push necessary state changes back 764 * into the socket buffer so that other consumers see the values 765 * consistently. 'nextrecord' is the callers locally stored value of 766 * the original value of sb->sb_mb->m_nextpkt which must be restored 767 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 768 */ 769 void 770 sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 771 { 772 773 /* 774 * First, update for the new value of nextrecord. If necessary, 775 * make it the first record. 776 */ 777 if (sb->sb_mb != NULL) 778 sb->sb_mb->m_nextpkt = nextrecord; 779 else 780 sb->sb_mb = nextrecord; 781 782 /* 783 * Now update any dependent socket buffer fields to reflect 784 * the new state. 
This is an inline of SB_EMPTY_FIXUP, with 785 * the addition of a second clause that takes care of the 786 * case where sb_mb has been updated, but remains the last 787 * record. 788 */ 789 if (sb->sb_mb == NULL) { 790 sb->sb_mbtail = NULL; 791 sb->sb_lastrecord = NULL; 792 } else if (sb->sb_mb->m_nextpkt == NULL) 793 sb->sb_lastrecord = sb->sb_mb; 794 } 795 796 /* 797 * Implement receive operations on a socket. 798 * We depend on the way that records are added to the sockbuf 799 * by sbappend*. In particular, each record (mbufs linked through m_next) 800 * must begin with an address if the protocol so specifies, 801 * followed by an optional mbuf or mbufs containing ancillary data, 802 * and then zero or more mbufs of data. 803 * In order to avoid blocking network for the entire time here, we release 804 * the solock() while doing the actual copy to user space. 805 * Although the sockbuf is locked, new data may still be appended, 806 * and thus we must maintain consistency of the sockbuf during that time. 807 * 808 * The caller may receive the data as a single mbuf chain by supplying 809 * an mbuf **mp0 for use in returning the chain. The uio is then used 810 * only for the count in uio_resid. 
811 */ 812 int 813 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 814 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 815 socklen_t controllen) 816 { 817 struct mbuf *m, **mp; 818 struct mbuf *cm; 819 u_long len, offset, moff; 820 int flags, error, error2, type, uio_error = 0; 821 const struct protosw *pr = so->so_proto; 822 struct mbuf *nextrecord; 823 size_t resid, orig_resid = uio->uio_resid; 824 int dosolock = ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0); 825 826 mp = mp0; 827 if (paddr) 828 *paddr = NULL; 829 if (controlp) 830 *controlp = NULL; 831 if (flagsp) 832 flags = *flagsp &~ MSG_EOR; 833 else 834 flags = 0; 835 if (flags & MSG_OOB) { 836 m = m_get(M_WAIT, MT_DATA); 837 solock(so); 838 error = pru_rcvoob(so, m, flags & MSG_PEEK); 839 sounlock(so); 840 if (error) 841 goto bad; 842 do { 843 error = uiomove(mtod(m, caddr_t), 844 ulmin(uio->uio_resid, m->m_len), uio); 845 m = m_free(m); 846 } while (uio->uio_resid && error == 0 && m); 847 bad: 848 m_freem(m); 849 return (error); 850 } 851 if (mp) 852 *mp = NULL; 853 854 if (dosolock) 855 solock_shared(so); 856 restart: 857 if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) 858 goto out; 859 sb_mtx_lock(&so->so_rcv); 860 861 m = so->so_rcv.sb_mb; 862 #ifdef SOCKET_SPLICE 863 if (isspliced(so)) 864 m = NULL; 865 #endif /* SOCKET_SPLICE */ 866 /* 867 * If we have less data than requested, block awaiting more 868 * (subject to any timeout) if: 869 * 1. the current count is less than the low water mark, 870 * 2. MSG_WAITALL is set, and it is possible to do the entire 871 * receive operation at once if we block (resid <= hiwat), or 872 * 3. MSG_DONTWAIT is not set. 873 * If MSG_WAITALL is set but resid is larger than the receive buffer, 874 * we have to do the receive in sections, and thus risk returning 875 * a short count if a timeout or signal occurs after we start. 
876 */ 877 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 878 so->so_rcv.sb_cc < uio->uio_resid) && 879 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 880 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 881 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 882 #ifdef DIAGNOSTIC 883 if (m == NULL && so->so_rcv.sb_cc) 884 #ifdef SOCKET_SPLICE 885 if (!isspliced(so)) 886 #endif /* SOCKET_SPLICE */ 887 panic("receive 1: so %p, so_type %d, sb_cc %lu", 888 so, so->so_type, so->so_rcv.sb_cc); 889 #endif 890 if ((error2 = READ_ONCE(so->so_error))) { 891 if (m) 892 goto dontblock; 893 error = error2; 894 if ((flags & MSG_PEEK) == 0) 895 so->so_error = 0; 896 goto release; 897 } 898 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 899 if (m) 900 goto dontblock; 901 else if (so->so_rcv.sb_cc == 0) 902 goto release; 903 } 904 for (; m; m = m->m_next) 905 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 906 m = so->so_rcv.sb_mb; 907 goto dontblock; 908 } 909 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 910 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 911 error = ENOTCONN; 912 goto release; 913 } 914 if (uio->uio_resid == 0 && controlp == NULL) 915 goto release; 916 if (flags & MSG_DONTWAIT) { 917 error = EWOULDBLOCK; 918 goto release; 919 } 920 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 921 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 922 923 if (so->so_rcv.sb_flags & (SB_MTXLOCK | SB_OWNLOCK)) { 924 sbunlock_locked(so, &so->so_rcv); 925 if (dosolock) 926 sounlock_shared(so); 927 error = sbwait_locked(so, &so->so_rcv); 928 sb_mtx_unlock(&so->so_rcv); 929 if (error) 930 return (error); 931 if (dosolock) 932 solock_shared(so); 933 } else { 934 sb_mtx_unlock(&so->so_rcv); 935 sbunlock(so, &so->so_rcv); 936 error = sbwait(so, &so->so_rcv); 937 if (error) { 938 sounlock_shared(so); 939 return (error); 940 } 941 } 942 goto restart; 943 } 944 dontblock: 945 /* 946 * On entry here, m points to the first record of the socket 
buffer. 947 * From this point onward, we maintain 'nextrecord' as a cache of the 948 * pointer to the next record in the socket buffer. We must keep the 949 * various socket buffer pointers and local stack versions of the 950 * pointers in sync, pushing out modifications before operations that 951 * may sleep, and re-reading them afterwards. 952 * 953 * Otherwise, we will race with the network stack appending new data 954 * or records onto the socket buffer by using inconsistent/stale 955 * versions of the field, possibly resulting in socket buffer 956 * corruption. 957 */ 958 if (uio->uio_procp) 959 uio->uio_procp->p_ru.ru_msgrcv++; 960 KASSERT(m == so->so_rcv.sb_mb); 961 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 962 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 963 nextrecord = m->m_nextpkt; 964 if (pr->pr_flags & PR_ADDR) { 965 #ifdef DIAGNOSTIC 966 if (m->m_type != MT_SONAME) 967 panic("receive 1a: so %p, so_type %d, m %p, m_type %d", 968 so, so->so_type, m, m->m_type); 969 #endif 970 orig_resid = 0; 971 if (flags & MSG_PEEK) { 972 if (paddr) 973 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT); 974 m = m->m_next; 975 } else { 976 sbfree(so, &so->so_rcv, m); 977 if (paddr) { 978 *paddr = m; 979 so->so_rcv.sb_mb = m->m_next; 980 m->m_next = NULL; 981 m = so->so_rcv.sb_mb; 982 } else { 983 so->so_rcv.sb_mb = m_free(m); 984 m = so->so_rcv.sb_mb; 985 } 986 sbsync(&so->so_rcv, nextrecord); 987 } 988 } 989 while (m && m->m_type == MT_CONTROL && error == 0) { 990 int skip = 0; 991 if (flags & MSG_PEEK) { 992 if (mtod(m, struct cmsghdr *)->cmsg_type == 993 SCM_RIGHTS) { 994 /* don't leak internalized SCM_RIGHTS msgs */ 995 skip = 1; 996 } else if (controlp) 997 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); 998 m = m->m_next; 999 } else { 1000 sbfree(so, &so->so_rcv, m); 1001 so->so_rcv.sb_mb = m->m_next; 1002 m->m_nextpkt = m->m_next = NULL; 1003 cm = m; 1004 m = so->so_rcv.sb_mb; 1005 sbsync(&so->so_rcv, nextrecord); 1006 if (controlp) { 1007 if 
(pr->pr_domain->dom_externalize) { 1008 sb_mtx_unlock(&so->so_rcv); 1009 if (dosolock) 1010 sounlock_shared(so); 1011 error = 1012 (*pr->pr_domain->dom_externalize) 1013 (cm, controllen, flags); 1014 if (dosolock) 1015 solock_shared(so); 1016 sb_mtx_lock(&so->so_rcv); 1017 } 1018 *controlp = cm; 1019 } else { 1020 /* 1021 * Dispose of any SCM_RIGHTS message that went 1022 * through the read path rather than recv. 1023 */ 1024 if (pr->pr_domain->dom_dispose) { 1025 sb_mtx_unlock(&so->so_rcv); 1026 pr->pr_domain->dom_dispose(cm); 1027 sb_mtx_lock(&so->so_rcv); 1028 } 1029 m_free(cm); 1030 } 1031 } 1032 if (m != NULL) 1033 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1034 else 1035 nextrecord = so->so_rcv.sb_mb; 1036 if (controlp && !skip) 1037 controlp = &(*controlp)->m_next; 1038 orig_resid = 0; 1039 } 1040 1041 /* If m is non-NULL, we have some data to read. */ 1042 if (m) { 1043 type = m->m_type; 1044 if (type == MT_OOBDATA) 1045 flags |= MSG_OOB; 1046 if (m->m_flags & M_BCAST) 1047 flags |= MSG_BCAST; 1048 if (m->m_flags & M_MCAST) 1049 flags |= MSG_MCAST; 1050 } 1051 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1052 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1053 1054 moff = 0; 1055 offset = 0; 1056 while (m && uio->uio_resid > 0 && error == 0) { 1057 if (m->m_type == MT_OOBDATA) { 1058 if (type != MT_OOBDATA) 1059 break; 1060 } else if (type == MT_OOBDATA) { 1061 break; 1062 } else if (m->m_type == MT_CONTROL) { 1063 /* 1064 * If there is more than one control message in the 1065 * stream, we do a short read. Next can be received 1066 * or disposed by another system call. 
1067 */ 1068 break; 1069 #ifdef DIAGNOSTIC 1070 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { 1071 panic("receive 3: so %p, so_type %d, m %p, m_type %d", 1072 so, so->so_type, m, m->m_type); 1073 #endif 1074 } 1075 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1076 len = uio->uio_resid; 1077 if (so->so_oobmark && len > so->so_oobmark - offset) 1078 len = so->so_oobmark - offset; 1079 if (len > m->m_len - moff) 1080 len = m->m_len - moff; 1081 /* 1082 * If mp is set, just pass back the mbufs. 1083 * Otherwise copy them out via the uio, then free. 1084 * Sockbuf must be consistent here (points to current mbuf, 1085 * it points to next record) when we drop priority; 1086 * we must note any additions to the sockbuf when we 1087 * block interrupts again. 1088 */ 1089 if (mp == NULL && uio_error == 0) { 1090 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 1091 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 1092 resid = uio->uio_resid; 1093 sb_mtx_unlock(&so->so_rcv); 1094 if (dosolock) 1095 sounlock_shared(so); 1096 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1097 if (dosolock) 1098 solock_shared(so); 1099 sb_mtx_lock(&so->so_rcv); 1100 if (uio_error) 1101 uio->uio_resid = resid - len; 1102 } else 1103 uio->uio_resid -= len; 1104 if (len == m->m_len - moff) { 1105 if (m->m_flags & M_EOR) 1106 flags |= MSG_EOR; 1107 if (flags & MSG_PEEK) { 1108 m = m->m_next; 1109 moff = 0; 1110 orig_resid = 0; 1111 } else { 1112 nextrecord = m->m_nextpkt; 1113 sbfree(so, &so->so_rcv, m); 1114 if (mp) { 1115 *mp = m; 1116 mp = &m->m_next; 1117 so->so_rcv.sb_mb = m = m->m_next; 1118 *mp = NULL; 1119 } else { 1120 so->so_rcv.sb_mb = m_free(m); 1121 m = so->so_rcv.sb_mb; 1122 } 1123 /* 1124 * If m != NULL, we also know that 1125 * so->so_rcv.sb_mb != NULL. 
1126 */ 1127 KASSERT(so->so_rcv.sb_mb == m); 1128 if (m) { 1129 m->m_nextpkt = nextrecord; 1130 if (nextrecord == NULL) 1131 so->so_rcv.sb_lastrecord = m; 1132 } else { 1133 so->so_rcv.sb_mb = nextrecord; 1134 SB_EMPTY_FIXUP(&so->so_rcv); 1135 } 1136 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 1137 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 1138 } 1139 } else { 1140 if (flags & MSG_PEEK) { 1141 moff += len; 1142 orig_resid = 0; 1143 } else { 1144 if (mp) 1145 *mp = m_copym(m, 0, len, M_WAIT); 1146 m->m_data += len; 1147 m->m_len -= len; 1148 so->so_rcv.sb_cc -= len; 1149 so->so_rcv.sb_datacc -= len; 1150 } 1151 } 1152 if (so->so_oobmark) { 1153 if ((flags & MSG_PEEK) == 0) { 1154 so->so_oobmark -= len; 1155 if (so->so_oobmark == 0) { 1156 so->so_rcv.sb_state |= SS_RCVATMARK; 1157 break; 1158 } 1159 } else { 1160 offset += len; 1161 if (offset == so->so_oobmark) 1162 break; 1163 } 1164 } 1165 if (flags & MSG_EOR) 1166 break; 1167 /* 1168 * If the MSG_WAITALL flag is set (for non-atomic socket), 1169 * we must not quit until "uio->uio_resid == 0" or an error 1170 * termination. If a signal/timeout occurs, return 1171 * with a short count but without error. 1172 * Keep sockbuf locked against other readers. 
1173 */ 1174 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1175 !sosendallatonce(so) && !nextrecord) { 1176 if (so->so_rcv.sb_state & SS_CANTRCVMORE || 1177 so->so_error) 1178 break; 1179 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 1180 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 1181 if (dosolock) { 1182 sb_mtx_unlock(&so->so_rcv); 1183 error = sbwait(so, &so->so_rcv); 1184 if (error) { 1185 sbunlock(so, &so->so_rcv); 1186 sounlock_shared(so); 1187 return (0); 1188 } 1189 sb_mtx_lock(&so->so_rcv); 1190 } else { 1191 if (sbwait_locked(so, &so->so_rcv)) { 1192 sb_mtx_unlock(&so->so_rcv); 1193 sbunlock(so, &so->so_rcv); 1194 return (0); 1195 } 1196 } 1197 if ((m = so->so_rcv.sb_mb) != NULL) 1198 nextrecord = m->m_nextpkt; 1199 } 1200 } 1201 1202 if (m && pr->pr_flags & PR_ATOMIC) { 1203 flags |= MSG_TRUNC; 1204 if ((flags & MSG_PEEK) == 0) 1205 (void) sbdroprecord(so, &so->so_rcv); 1206 } 1207 if ((flags & MSG_PEEK) == 0) { 1208 if (m == NULL) { 1209 /* 1210 * First part is an inline SB_EMPTY_FIXUP(). Second 1211 * part makes sure sb_lastrecord is up-to-date if 1212 * there is still data in the socket buffer. 
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			pru_rcvd(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	sbunlock(so, &so->so_rcv);
out:
	if (dosolock)
		sounlock_shared(so);
	return (error);
}

/*
 * Shut down one or both directions of socket so.
 *
 * how selects the direction:
 *   SHUT_RD	flush the receive side with sorflush().
 *   SHUT_WR	call pru_shutdown() while holding the socket lock.
 *   SHUT_RDWR	do both, receive side first.
 *
 * Returns EINVAL for any other value of how, otherwise the result of
 * pru_shutdown() (0 for SHUT_RD, which does not call it).
 */
int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

/*
 * Flush the receive buffer of so.
 *
 * The socket is marked as unable to receive more data, the mbuf chain is
 * detached from so_rcv, the sb_startzero..sb_endzero region of the sockbuf
 * is zeroed and the receive timeout is reset to INFSLP.  The detached
 * chain is then handed to the domain's dom_dispose hook (only for PR_RIGHTS
 * protocols that provide one) and finally released with m_purge().
 *
 * When the receive buffer does not have SB_OWNLOCK set, the caller must
 * already hold the socket lock; this is asserted.  When SB_OWNLOCK is set,
 * the socket lock is taken here only around socantrcvmore().
 */
void
sorflush_locked(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	if ((sb->sb_flags & SB_OWNLOCK) == 0)
		soassertlocked(so);

	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	if (sb->sb_flags & SB_OWNLOCK)
		solock(so);
	socantrcvmore(so);
	if (sb->sb_flags & SB_OWNLOCK)
		sounlock(so);

	/* Detach the chain and reset the buffer under the sockbuf mutex. */
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sbunlock(so, sb);
	/* The chain is private now; dispose and free it without locks. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

/*
 * Flush the receive buffer of so, taking the locks sorflush_locked()
 * expects: the shared socket lock is held around the call unless the
 * receive buffer has SB_OWNLOCK set, in which case no socket lock is
 * taken here.
 */
void
sorflush(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
		solock_shared(so);
	sorflush_locked(so);
	if ((so->so_rcv.sb_flags & SB_OWNLOCK) == 0)
		sounlock_shared(so);
}

#ifdef SOCKET_SPLICE

/* Shorthands for the per-socket splice state reached through so_sp. */
#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

/*
 * Splice source socket so to the drain socket referred to by fd, or
 * dissolve an existing splice.
 *
 *   so		source socket; its protocol must have PR_SPLICE set.
 *   fd		file descriptor of the drain socket; a negative value
 *		removes an existing splice from so instead.
 *   max	stored as the splice maximum; somove() treats 0 as
 *		"no limit".  Negative values are rejected.
 *   tv		idle timeout; NULL clears the timeout.
 *
 * The splice task queue is created on first use.  Both the receive buffer
 * of so and the send buffer of the drain are sblock()ed while the link is
 * established.  somove() is called once before SB_SPLICE is set on the
 * buffers; the flag is only set when somove() reports that splicing
 * should continue.
 *
 * Returns 0 on success, or EPROTONOSUPPORT, EINVAL, ENOMEM, EOPNOTSUPP,
 * ENOTCONN, EBUSY, or an error from sblock() or getsock().
 */
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp = NULL;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/*
	 * Create the task queue on first use.  The pointer is tested
	 * without the lock, then tested again under sosplice_lock before
	 * it is assigned.
	 */
	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/*
	 * The order of sblock() and solock() depends on SB_OWNLOCK; the
	 * exit path at "out" releases them in the reverse order.
	 */
	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
	} else {
		solock(so);
		if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) {
			sounlock(so);
			return (error);
		}
	}

	if (so->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto out;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto out;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		goto out;
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		goto out;
	sosp = fp->f_data;
	/* Source and drain must use the same pru_send implementation. */
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);

	/*
	 * NOTE(review): the send buffer of sosp is locked with so as the
	 * socket argument but unlocked below with sosp -- confirm that
	 * this asymmetry is intended.
	 */
	if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) {
		goto out;
	}

	/* Each socket may have at most one drain and one source. */
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&so->so_rcv.sb_mtx);
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		mtx_leave(&so->so_rcv.sb_mtx);
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
 out:
	if (so->so_rcv.sb_flags & SB_OWNLOCK) {
		sounlock(so);
		sbunlock(so, &so->so_rcv);
	} else {
		sbunlock(so, &so->so_rcv);
		sounlock(so);
	}

	if (fp)
		FRELE(fp, curproc);

	return (error);
}

/*
 * Dissolve the splice between source so and drain sosp.
 *
 * The splice task and idle timeout of so are removed, SB_SPLICE is cleared
 * on both buffers and the link pointers in both so_sp structures are reset
 * to NULL.  freeing is a mask of SOSP_FREEING_READ / SOSP_FREEING_WRITE;
 * the corresponding socket is not woken up when its bit is set.
 *
 * The socket lock of so must be held (asserted).
 */
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;

	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

/*
 * Idle timeout handler registered by sosplice(); arg is the source socket.
 * If the socket is still spliced, set so_error to ETIMEDOUT and dissolve
 * the splice.  Runs with the socket lock held for the duration.
 */
void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

/*
 * Splice task registered by sosplice(); arg is the source socket.
 * If the socket is still spliced, move data with somove() without
 * sleeping for mbufs, then yield the CPU.
 */
void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

/*
 * Task callback scheduled by soreaper(); arg is the socket.  Returns the
 * splice state and then the socket itself to their pools.
 */
void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int rcvstate;

	/* The socket lock of the source must be held by the caller. */
	soassertlocked(so);

 nextpkt:
	/* Stop on errors of either side; some drain errors are tolerated. */
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	/* Allow 1024 bytes of extra space when an oob mark is in reach. */
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		/* The record carries no data; drop it and try the next one. */
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		/* Atomic records are moved as a whole or not at all. */
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 *
			 * NOTE(review): the test below reads so->so_snd, the
			 * send buffer of the source socket, not sosp->so_snd
			 * -- confirm that this is intended.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			/* Trim the copied bytes off the mbuf left behind. */
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			/* Unlink the whole mbuf from the receive buffer. */
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	mtx_enter(&so->so_rcv.sb_mtx);
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}
	mtx_leave(&so->so_rcv.sb_mtx);

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			/* Send the data in front of the mark as normal data. */
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			/* Copy the first byte of m into o and send it as oob. */
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;
	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;
	/* Reaching the splice maximum is reported as EFBIG. */
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	/* Dissolve the splice when either side is done or on any error. */
	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	/* Splicing continues; rearm the idle timeout if one is set. */
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

/*
 * Notify readers that the receive buffer of so has changed.
 *
 * With SOCKET_SPLICE, a receive buffer marked SB_SPLICE is handed to the
 * splice machinery first, and nothing else is woken up while isspliced()
 * is true.  Otherwise sowakeup() is called on so_rcv and the so_upcall
 * hook is invoked if one is set.
 *
 * Unless so_rcv has SB_MTXLOCK set, the socket lock must be held at least
 * read-only (asserted).
 */
void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

/*
 * Notify writers that the send buffer of so has changed.
 *
 * With SOCKET_SPLICE, a send buffer marked SB_SPLICE schedules the splice
 * task of the source socket (ssp_soback), and nothing else is woken up
 * while issplicedback() is true.  Otherwise sowakeup() is called on so_snd.
 *
 * The socket lock must be held at least read-only (asserted).
 */
void
sowwakeup(struct socket *so)
{
	soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

/*
 * Set a socket option.
 *
 *   so		socket to modify.
 *   level	option level; anything but SOL_SOCKET is passed to the
 *		protocol's pr_ctloutput (ENOPROTOOPT if it has none).
 *   optname	option name.
 *   m		mbuf holding the option value; may be NULL, which is
 *		rejected with EINVAL by most options.
 *
 * Returns 0 on success or an errno value.  This function does not free m.
 */
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/*
			 * The first int of the value decides on/off;
			 * presumably that is l_onoff -- verify against
			 * the layout of struct linger.
			 */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			/* Requires suser() to succeed. */
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Boolean options: optname is the bit in so_options. */
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			/* Only a zero value is accepted. */
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			/* Values that are zero or negative are raised to 1. */
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_OWNLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				/* The low water mark is capped at sb_hiwat. */
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_OWNLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			/* A zero timeout is stored as INFSLP. */
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			/* Forwarded to pr_ctloutput with a rewritten level. */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/*
			 * No value unsplices; an int is taken as the drain
			 * fd; a full struct splice also supplies the
			 * maximum length and idle timeout.
			 */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				   &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}

/*
 * Get a socket option.
 *
 *   so		socket to query.
 *   level	option level; anything but SOL_SOCKET is passed to the
 *		protocol's pr_ctloutput (ENOPROTOOPT if it has none).
 *   optname	option name.
 *   m		mbuf receiving the value; m->m_len is set to the size of
 *		the value written.  m is dereferenced unconditionally.
 *
 * Returns 0 on success or an errno value.
 */
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		/* Default size; options with other sizes override it. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Returns the option bit itself, not 0 or 1. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error also clears it. */
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			/* INFSLP is reported as an all-zero timeval. */
			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			/* Forwarded to pr_ctloutput with a rewritten level. */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			/* Reports ssp_len, or 0 without splice state. */
			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			/*
			 * NOTE(review): pr_protocol is compared with
			 * AF_UNIX, an address family constant -- confirm
			 * that this is the intended test.
			 */
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

/*
 * Announce out-of-band data on so: post SIGURG through so_sigio and
 * notify the knotes attached to the receive buffer.
 */
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

/*
 * Take the locks used by the socket kqueue filter callbacks: the shared
 * net lock for PF_INET and PF_INET6 sockets, the socket's own so_lock
 * (write) for every other family, followed by the mutex of sockbuf sb.
 */
void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

/*
 * Release the locks taken by sofilt_lock(), in reverse order.
 */
void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

/*
 * Attach knote kn to a socket.  The filter operations and the sockbuf
 * whose klist receives the knote are chosen by kn->kn_filter:
 * EVFILT_READ and EVFILT_EXCEPT use so_rcv, EVFILT_WRITE uses so_snd.
 * The socket is taken from kn->kn_fp; the fp argument is not used.
 *
 * Returns 0 on success, EINVAL for any other filter.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

/*
 * Detach knote kn from the receive buffer klist of its socket.
 */
void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

/*
 * Read filter event callback.  Stores the event data in kn->kn_data and
 * returns nonzero when the knote should fire.  hint is not used.
 *
 * Listening sockets (SO_ACCEPTCONN) report so_qlen.  Other sockets report
 * so_rcv.sb_cc and fire on SS_CANTRCVMORE (with EV_EOF set), on a pending
 * so_error, or when the byte count reaches the threshold: kn_sdata with
 * NOTE_LOWAT, sb_lowat otherwise.  A spliced socket never fires.
 *
 * The mutex of so_rcv must be held (asserted).
 */
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = so->so_qlen;
		rv = (kn->kn_data != 0);

		/* poll/select requests replace the result computed above. */
		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (so->so_state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

/*
 * Detach knote kn from the send buffer klist of its socket.
 */
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

/*
 * Write filter event callback.  Stores the free space of the send buffer
 * in kn->kn_data and returns nonzero when the knote should fire.  hint is
 * not used.
 *
 * Fires on SS_CANTSENDMORE (with EV_EOF set) and on a pending so_error.
 * A socket that is not connected never fires if its protocol requires a
 * connection.  Otherwise it fires when the free space reaches the
 * threshold: kn_sdata with NOTE_LOWAT, sb_lowat otherwise.
 *
 * The mutex of so_snd and the socket lock (at least read-only) must be
 * held (asserted).
 */
int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	soassertlocked_readonly(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

/*
 * Exceptional-condition filter event callback.  Returns nonzero when the
 * knote should fire.  hint is not used.
 *
 * With NOTE_OOB requested, fires when an oob mark is set or the receive
 * buffer is at the mark; so_oobmark is subtracted from kn->kn_data in
 * that case.  A spliced socket skips this check.  For poll requests a
 * disconnected socket additionally sets __EV_HUP and fires.
 *
 * The mutex of so_rcv must be held (asserted).
 */
int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

/*
 * f_modify callback of the write filter: run knote_modify() with the
 * filter locks held for the send buffer.
 */
int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

/*
 * f_process callback of the write filter: run knote_process() with the
 * filter locks held for the send buffer.
 */
int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

/*
 * f_modify callback of the read filter: run knote_modify() with the
 * filter locks held for the receive buffer.
 */
int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

/*
 * f_process callback of the read filter: run knote_process() with the
 * filter locks held for the receive buffer.
 */
int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

/*
 * Debugger helper: print the fields of sockbuf sb, one per line, using
 * the printf-like function pr.
 */
void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

/*
 * Debugger helper: print the socket that v points to using the
 * printf-like function pr.  Prints the socket fields, the splice state
 * when so_sp is set, and both socket buffers via sobuf_print().
 */
void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_obmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif