1 /* $OpenBSD: uipc_socket.c,v 1.326 2024/04/02 12:21:39 mvs Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/file.h> 39 #include <sys/filedesc.h> 40 #include <sys/malloc.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/event.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/unpcb.h> 47 #include <sys/socketvar.h> 48 #include <sys/signalvar.h> 49 #include <sys/pool.h> 50 #include <sys/atomic.h> 51 #include <sys/rwlock.h> 52 #include <sys/time.h> 53 #include <sys/refcnt.h> 54 55 #ifdef DDB 56 #include <machine/db_machdep.h> 57 #endif 58 59 void sbsync(struct sockbuf *, struct mbuf *); 60 61 int sosplice(struct socket *, int, off_t, struct timeval *); 62 void sounsplice(struct socket *, struct socket *, int); 63 void soidle(void *); 64 void sotask(void *); 65 void soreaper(void *); 66 void soput(void *); 67 int somove(struct socket *, int); 68 69 void filt_sordetach(struct knote *kn); 70 int filt_soread(struct knote *kn, long hint); 71 void filt_sowdetach(struct knote *kn); 72 int filt_sowrite(struct knote *kn, long hint); 73 int filt_soexcept(struct knote *kn, long hint); 74 75 int filt_sowmodify(struct kevent *kev, struct knote *kn); 76 int filt_sowprocess(struct knote *kn, struct kevent *kev); 77 78 int filt_sormodify(struct kevent *kev, struct knote *kn); 79 int filt_sorprocess(struct knote *kn, struct kevent *kev); 80 81 const struct filterops soread_filtops = { 82 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 83 .f_attach = NULL, 84 .f_detach = filt_sordetach, 85 .f_event = filt_soread, 86 .f_modify = filt_sormodify, 87 .f_process = filt_sorprocess, 88 }; 89 90 const struct filterops sowrite_filtops = { 91 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 92 .f_attach = NULL, 93 .f_detach = filt_sowdetach, 94 .f_event = filt_sowrite, 95 .f_modify = filt_sowmodify, 96 .f_process = filt_sowprocess, 97 }; 98 99 const struct filterops soexcept_filtops = { 100 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 101 .f_attach = NULL, 102 .f_detach = filt_sordetach, 103 .f_event = filt_soexcept, 104 .f_modify = filt_sormodify, 105 .f_process = filt_sorprocess, 106 }; 107 108 #ifndef SOMINCONN 109 #define SOMINCONN 80 110 #endif /* SOMINCONN */ 111 112 int somaxconn = SOMAXCONN; 113 int sominconn = SOMINCONN; 114 115 struct pool socket_pool; 116 #ifdef SOCKET_SPLICE 117 struct pool sosplice_pool; 118 struct taskq *sosplice_taskq; 119 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk"); 120 #endif 121 122 void 123 soinit(void) 124 { 125 pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0, 126 "sockpl", NULL); 127 #ifdef SOCKET_SPLICE 128 pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0, 129 "sosppl", NULL); 130 #endif 131 } 132 133 struct socket * 134 soalloc(const struct protosw *prp, int wait) 135 { 136 const struct domain *dp = prp->pr_domain; 137 struct socket *so; 138 139 so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) | 140 PR_ZERO); 141 if (so == NULL) 142 return (NULL); 143 rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK); 144 refcnt_init(&so->so_refcnt); 145 mtx_init(&so->so_rcv.sb_mtx, IPL_MPFLOOR); 146 mtx_init(&so->so_snd.sb_mtx, IPL_MPFLOOR); 147 klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx); 148 klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx); 149 sigio_init(&so->so_sigio); 150 TAILQ_INIT(&so->so_q0); 151 TAILQ_INIT(&so->so_q); 152 153 switch (dp->dom_family) { 154 case AF_INET: 155 case AF_INET6: 156 switch (prp->pr_type) { 157 case SOCK_DGRAM: 158 so->so_rcv.sb_flags |= SB_OWNLOCK; 159 /* FALLTHROUGH */ 160 case SOCK_RAW: 161 so->so_rcv.sb_flags |= SB_MTXLOCK; 162 break; 163 } 164 break; 165 case AF_UNIX: 166 so->so_rcv.sb_flags |= SB_MTXLOCK | SB_OWNLOCK; 167 break; 168 } 169 170 return (so); 171 } 172 173 /* 174 * Socket operation routines. 175 * These routines are called by the routines in 176 * sys_socket.c or from a system process, and 177 * implement the semantics of socket operations by 178 * switching out to the protocol specific routines. 179 */ 180 int 181 socreate(int dom, struct socket **aso, int type, int proto) 182 { 183 struct proc *p = curproc; /* XXX */ 184 const struct protosw *prp; 185 struct socket *so; 186 int error; 187 188 if (proto) 189 prp = pffindproto(dom, proto, type); 190 else 191 prp = pffindtype(dom, type); 192 if (prp == NULL || prp->pr_usrreqs == NULL) 193 return (EPROTONOSUPPORT); 194 if (prp->pr_type != type) 195 return (EPROTOTYPE); 196 so = soalloc(prp, M_WAIT); 197 so->so_type = type; 198 if (suser(p) == 0) 199 so->so_state = SS_PRIV; 200 so->so_ruid = p->p_ucred->cr_ruid; 201 so->so_euid = p->p_ucred->cr_uid; 202 so->so_rgid = p->p_ucred->cr_rgid; 203 so->so_egid = p->p_ucred->cr_gid; 204 so->so_cpid = p->p_p->ps_pid; 205 so->so_proto = prp; 206 so->so_snd.sb_timeo_nsecs = INFSLP; 207 so->so_rcv.sb_timeo_nsecs = INFSLP; 208 209 solock(so); 210 error = pru_attach(so, proto, M_WAIT); 211 if (error) { 212 so->so_state |= SS_NOFDREF; 213 /* sofree() calls sounlock(). */ 214 sofree(so, 0); 215 return (error); 216 } 217 sounlock(so); 218 *aso = so; 219 return (0); 220 } 221 222 int 223 sobind(struct socket *so, struct mbuf *nam, struct proc *p) 224 { 225 soassertlocked(so); 226 return pru_bind(so, nam, p); 227 } 228 229 int 230 solisten(struct socket *so, int backlog) 231 { 232 int somaxconn_local = READ_ONCE(somaxconn); 233 int sominconn_local = READ_ONCE(sominconn); 234 int error; 235 236 switch (so->so_type) { 237 case SOCK_STREAM: 238 case SOCK_SEQPACKET: 239 break; 240 default: 241 return (EOPNOTSUPP); 242 } 243 244 soassertlocked(so); 245 246 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 247 return (EINVAL); 248 #ifdef SOCKET_SPLICE 249 if (isspliced(so) || issplicedback(so)) 250 return (EOPNOTSUPP); 251 #endif /* SOCKET_SPLICE */ 252 error = pru_listen(so); 253 if (error) 254 return (error); 255 if (TAILQ_FIRST(&so->so_q) == NULL) 256 so->so_options |= SO_ACCEPTCONN; 257 if (backlog < 0 || backlog > somaxconn_local) 258 backlog = somaxconn_local; 259 if (backlog < sominconn_local) 260 backlog = sominconn_local; 261 so->so_qlimit = backlog; 262 return (0); 263 } 264 265 #define SOSP_FREEING_READ 1 266 #define SOSP_FREEING_WRITE 2 267 void 268 sofree(struct socket *so, int keep_lock) 269 { 270 int persocket = solock_persocket(so); 271 272 soassertlocked(so); 273 274 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { 275 if (!keep_lock) 276 sounlock(so); 277 return; 278 } 279 if (so->so_head) { 280 struct socket *head = so->so_head; 281 282 /* 283 * We must not decommission a socket that's on the accept(2) 284 * queue. If we do, then accept(2) may hang after select(2) 285 * indicated that the listening socket was ready. 286 */ 287 if (so->so_onq == &head->so_q) { 288 if (!keep_lock) 289 sounlock(so); 290 return; 291 } 292 293 if (persocket) { 294 /* 295 * Concurrent close of `head' could 296 * abort `so' due to re-lock. 297 */ 298 soref(so); 299 soref(head); 300 sounlock(so); 301 solock(head); 302 solock(so); 303 304 if (so->so_onq != &head->so_q0) { 305 sounlock(head); 306 sounlock(so); 307 sorele(head); 308 sorele(so); 309 return; 310 } 311 312 sorele(head); 313 sorele(so); 314 } 315 316 soqremque(so, 0); 317 318 if (persocket) 319 sounlock(head); 320 } 321 322 if (persocket) { 323 sounlock(so); 324 refcnt_finalize(&so->so_refcnt, "sofinal"); 325 solock(so); 326 } 327 328 sigio_free(&so->so_sigio); 329 klist_free(&so->so_rcv.sb_klist); 330 klist_free(&so->so_snd.sb_klist); 331 #ifdef SOCKET_SPLICE 332 if (issplicedback(so)) { 333 int freeing = SOSP_FREEING_WRITE; 334 335 if (so->so_sp->ssp_soback == so) 336 freeing |= SOSP_FREEING_READ; 337 sounsplice(so->so_sp->ssp_soback, so, freeing); 338 } 339 if (isspliced(so)) { 340 int freeing = SOSP_FREEING_READ; 341 342 if (so == so->so_sp->ssp_socket) 343 freeing |= SOSP_FREEING_WRITE; 344 sounsplice(so, so->so_sp->ssp_socket, freeing); 345 } 346 #endif /* SOCKET_SPLICE */ 347 sbrelease(so, &so->so_snd); 348 sorflush(so); 349 if (!keep_lock) 350 sounlock(so); 351 #ifdef SOCKET_SPLICE 352 if (so->so_sp) { 353 /* Reuse splice idle, sounsplice() has been called before. */ 354 timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so); 355 timeout_add(&so->so_sp->ssp_idleto, 0); 356 } else 357 #endif /* SOCKET_SPLICE */ 358 { 359 pool_put(&socket_pool, so); 360 } 361 } 362 363 static inline uint64_t 364 solinger_nsec(struct socket *so) 365 { 366 if (so->so_linger == 0) 367 return INFSLP; 368 369 return SEC_TO_NSEC(so->so_linger); 370 } 371 372 /* 373 * Close a socket on last file table reference removal. 374 * Initiate disconnect if connected. 375 * Free socket when disconnect complete. 376 */ 377 int 378 soclose(struct socket *so, int flags) 379 { 380 struct socket *so2; 381 int error = 0; 382 383 solock(so); 384 /* Revoke async IO early. There is a final revocation in sofree(). */ 385 sigio_free(&so->so_sigio); 386 if (so->so_state & SS_ISCONNECTED) { 387 if (so->so_pcb == NULL) 388 goto discard; 389 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 390 error = sodisconnect(so); 391 if (error) 392 goto drop; 393 } 394 if (so->so_options & SO_LINGER) { 395 if ((so->so_state & SS_ISDISCONNECTING) && 396 (flags & MSG_DONTWAIT)) 397 goto drop; 398 while (so->so_state & SS_ISCONNECTED) { 399 error = sosleep_nsec(so, &so->so_timeo, 400 PSOCK | PCATCH, "netcls", 401 solinger_nsec(so)); 402 if (error) 403 break; 404 } 405 } 406 } 407 drop: 408 if (so->so_pcb) { 409 int error2; 410 error2 = pru_detach(so); 411 if (error == 0) 412 error = error2; 413 } 414 if (so->so_options & SO_ACCEPTCONN) { 415 int persocket = solock_persocket(so); 416 417 if (persocket) { 418 /* Wait concurrent sonewconn() threads. */ 419 while (so->so_newconn > 0) { 420 so->so_state |= SS_NEWCONN_WAIT; 421 sosleep_nsec(so, &so->so_newconn, PSOCK, 422 "newcon", INFSLP); 423 } 424 } 425 426 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { 427 if (persocket) 428 solock(so2); 429 (void) soqremque(so2, 0); 430 if (persocket) 431 sounlock(so); 432 soabort(so2); 433 if (persocket) 434 solock(so); 435 } 436 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { 437 if (persocket) 438 solock(so2); 439 (void) soqremque(so2, 1); 440 if (persocket) 441 sounlock(so); 442 soabort(so2); 443 if (persocket) 444 solock(so); 445 } 446 } 447 discard: 448 if (so->so_state & SS_NOFDREF) 449 panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); 450 so->so_state |= SS_NOFDREF; 451 /* sofree() calls sounlock(). */ 452 sofree(so, 0); 453 return (error); 454 } 455 456 void 457 soabort(struct socket *so) 458 { 459 soassertlocked(so); 460 pru_abort(so); 461 } 462 463 int 464 soaccept(struct socket *so, struct mbuf *nam) 465 { 466 int error = 0; 467 468 soassertlocked(so); 469 470 if ((so->so_state & SS_NOFDREF) == 0) 471 panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type); 472 so->so_state &= ~SS_NOFDREF; 473 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 474 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 475 error = pru_accept(so, nam); 476 else 477 error = ECONNABORTED; 478 return (error); 479 } 480 481 int 482 soconnect(struct socket *so, struct mbuf *nam) 483 { 484 int error; 485 486 soassertlocked(so); 487 488 if (so->so_options & SO_ACCEPTCONN) 489 return (EOPNOTSUPP); 490 /* 491 * If protocol is connection-based, can only connect once. 492 * Otherwise, if connected, try to disconnect first. 493 * This allows user to disconnect by connecting to, e.g., 494 * a null address. 495 */ 496 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 497 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 498 (error = sodisconnect(so)))) 499 error = EISCONN; 500 else 501 error = pru_connect(so, nam); 502 return (error); 503 } 504 505 int 506 soconnect2(struct socket *so1, struct socket *so2) 507 { 508 int persocket, error; 509 510 if ((persocket = solock_persocket(so1))) 511 solock_pair(so1, so2); 512 else 513 solock(so1); 514 515 error = pru_connect2(so1, so2); 516 517 if (persocket) 518 sounlock(so2); 519 sounlock(so1); 520 return (error); 521 } 522 523 int 524 sodisconnect(struct socket *so) 525 { 526 int error; 527 528 soassertlocked(so); 529 530 if ((so->so_state & SS_ISCONNECTED) == 0) 531 return (ENOTCONN); 532 if (so->so_state & SS_ISDISCONNECTING) 533 return (EALREADY); 534 error = pru_disconnect(so); 535 return (error); 536 } 537 538 int m_getuio(struct mbuf **, int, long, struct uio *); 539 540 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 541 /* 542 * Send on a socket. 543 * If send must go all at once and message is larger than 544 * send buffering, then hard error. 545 * Lock against other senders. 546 * If must go all at once and not enough room now, then 547 * inform user that this would block and do nothing. 548 * Otherwise, if nonblocking, send as much as possible. 549 * The data to be sent is described by "uio" if nonzero, 550 * otherwise by the mbuf chain "top" (which must be null 551 * if uio is not). Data provided in mbuf chain must be small 552 * enough to send all at once. 553 * 554 * Returns nonzero on error, timeout or signal; callers 555 * must check for short counts if EINTR/ERESTART are returned. 556 * Data and control buffers are freed on return. 557 */ 558 int 559 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 560 struct mbuf *control, int flags) 561 { 562 long space, clen = 0; 563 size_t resid; 564 int error; 565 int atomic = sosendallatonce(so) || top; 566 567 if (uio) 568 resid = uio->uio_resid; 569 else 570 resid = top->m_pkthdr.len; 571 /* MSG_EOR on a SOCK_STREAM socket is invalid. */ 572 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 573 m_freem(top); 574 m_freem(control); 575 return (EINVAL); 576 } 577 if (uio && uio->uio_procp) 578 uio->uio_procp->p_ru.ru_msgsnd++; 579 if (control) { 580 /* 581 * In theory clen should be unsigned (since control->m_len is). 582 * However, space must be signed, as it might be less than 0 583 * if we over-committed, and we must use a signed comparison 584 * of space and clen. 585 */ 586 clen = control->m_len; 587 /* reserve extra space for AF_UNIX's internalize */ 588 if (so->so_proto->pr_domain->dom_family == AF_UNIX && 589 clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) && 590 mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) 591 clen = CMSG_SPACE( 592 (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) * 593 (sizeof(struct fdpass) / sizeof(int))); 594 } 595 596 #define snderr(errno) { error = errno; goto release; } 597 598 solock_shared(so); 599 restart: 600 if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) 601 goto out; 602 so->so_snd.sb_state |= SS_ISSENDING; 603 do { 604 if (so->so_snd.sb_state & SS_CANTSENDMORE) 605 snderr(EPIPE); 606 if (so->so_error) { 607 error = so->so_error; 608 so->so_error = 0; 609 snderr(error); 610 } 611 if ((so->so_state & SS_ISCONNECTED) == 0) { 612 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 613 if (!(resid == 0 && clen != 0)) 614 snderr(ENOTCONN); 615 } else if (addr == NULL) 616 snderr(EDESTADDRREQ); 617 } 618 space = sbspace(so, &so->so_snd); 619 if (flags & MSG_OOB) 620 space += 1024; 621 if (so->so_proto->pr_domain->dom_family == AF_UNIX) { 622 if (atomic && resid > so->so_snd.sb_hiwat) 623 snderr(EMSGSIZE); 624 } else { 625 if (clen > so->so_snd.sb_hiwat || 626 (atomic && resid > so->so_snd.sb_hiwat - clen)) 627 snderr(EMSGSIZE); 628 } 629 if (space < clen || 630 (space - clen < resid && 631 (atomic || space < so->so_snd.sb_lowat))) { 632 if (flags & MSG_DONTWAIT) 633 snderr(EWOULDBLOCK); 634 sbunlock(so, &so->so_snd); 635 error = sbwait(so, &so->so_snd); 636 so->so_snd.sb_state &= ~SS_ISSENDING; 637 if (error) 638 goto out; 639 goto restart; 640 } 641 space -= clen; 642 do { 643 if (uio == NULL) { 644 /* 645 * Data is prepackaged in "top". 646 */ 647 resid = 0; 648 if (flags & MSG_EOR) 649 top->m_flags |= M_EOR; 650 } else { 651 sounlock_shared(so); 652 error = m_getuio(&top, atomic, space, uio); 653 solock_shared(so); 654 if (error) 655 goto release; 656 space -= top->m_pkthdr.len; 657 resid = uio->uio_resid; 658 if (flags & MSG_EOR) 659 top->m_flags |= M_EOR; 660 } 661 if (resid == 0) 662 so->so_snd.sb_state &= ~SS_ISSENDING; 663 if (top && so->so_options & SO_ZEROIZE) 664 top->m_flags |= M_ZEROIZE; 665 if (flags & MSG_OOB) 666 error = pru_sendoob(so, top, addr, control); 667 else 668 error = pru_send(so, top, addr, control); 669 clen = 0; 670 control = NULL; 671 top = NULL; 672 if (error) 673 goto release; 674 } while (resid && space > 0); 675 } while (resid); 676 677 release: 678 so->so_snd.sb_state &= ~SS_ISSENDING; 679 sbunlock(so, &so->so_snd); 680 out: 681 sounlock_shared(so); 682 m_freem(top); 683 m_freem(control); 684 return (error); 685 } 686 687 int 688 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio) 689 { 690 struct mbuf *m, *top = NULL; 691 struct mbuf **nextp = ⊤ 692 u_long len, mlen; 693 size_t resid = uio->uio_resid; 694 int error; 695 696 do { 697 if (top == NULL) { 698 MGETHDR(m, M_WAIT, MT_DATA); 699 mlen = MHLEN; 700 m->m_pkthdr.len = 0; 701 m->m_pkthdr.ph_ifidx = 0; 702 } else { 703 MGET(m, M_WAIT, MT_DATA); 704 mlen = MLEN; 705 } 706 /* chain mbuf together */ 707 *nextp = m; 708 nextp = &m->m_next; 709 710 resid = ulmin(resid, space); 711 if (resid >= MINCLSIZE) { 712 MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES)); 713 if ((m->m_flags & M_EXT) == 0) 714 MCLGETL(m, M_NOWAIT, MCLBYTES); 715 if ((m->m_flags & M_EXT) == 0) 716 goto nopages; 717 mlen = m->m_ext.ext_size; 718 len = ulmin(mlen, resid); 719 /* 720 * For datagram protocols, leave room 721 * for protocol headers in first mbuf. 722 */ 723 if (atomic && m == top && len < mlen - max_hdr) 724 m->m_data += max_hdr; 725 } else { 726 nopages: 727 len = ulmin(mlen, resid); 728 /* 729 * For datagram protocols, leave room 730 * for protocol headers in first mbuf. 731 */ 732 if (atomic && m == top && len < mlen - max_hdr) 733 m_align(m, len); 734 } 735 736 error = uiomove(mtod(m, caddr_t), len, uio); 737 if (error) { 738 m_freem(top); 739 return (error); 740 } 741 742 /* adjust counters */ 743 resid = uio->uio_resid; 744 space -= len; 745 m->m_len = len; 746 top->m_pkthdr.len += len; 747 748 /* Is there more space and more data? */ 749 } while (space > 0 && resid > 0); 750 751 *mp = top; 752 return 0; 753 } 754 755 /* 756 * Following replacement or removal of the first mbuf on the first 757 * mbuf chain of a socket buffer, push necessary state changes back 758 * into the socket buffer so that other consumers see the values 759 * consistently. 'nextrecord' is the callers locally stored value of 760 * the original value of sb->sb_mb->m_nextpkt which must be restored 761 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 762 */ 763 void 764 sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 765 { 766 767 /* 768 * First, update for the new value of nextrecord. If necessary, 769 * make it the first record. 770 */ 771 if (sb->sb_mb != NULL) 772 sb->sb_mb->m_nextpkt = nextrecord; 773 else 774 sb->sb_mb = nextrecord; 775 776 /* 777 * Now update any dependent socket buffer fields to reflect 778 * the new state. This is an inline of SB_EMPTY_FIXUP, with 779 * the addition of a second clause that takes care of the 780 * case where sb_mb has been updated, but remains the last 781 * record. 782 */ 783 if (sb->sb_mb == NULL) { 784 sb->sb_mbtail = NULL; 785 sb->sb_lastrecord = NULL; 786 } else if (sb->sb_mb->m_nextpkt == NULL) 787 sb->sb_lastrecord = sb->sb_mb; 788 } 789 790 /* 791 * Implement receive operations on a socket. 792 * We depend on the way that records are added to the sockbuf 793 * by sbappend*. In particular, each record (mbufs linked through m_next) 794 * must begin with an address if the protocol so specifies, 795 * followed by an optional mbuf or mbufs containing ancillary data, 796 * and then zero or more mbufs of data. 797 * In order to avoid blocking network for the entire time here, we release 798 * the solock() while doing the actual copy to user space. 799 * Although the sockbuf is locked, new data may still be appended, 800 * and thus we must maintain consistency of the sockbuf during that time. 801 * 802 * The caller may receive the data as a single mbuf chain by supplying 803 * an mbuf **mp0 for use in returning the chain. The uio is then used 804 * only for the count in uio_resid. 805 */ 806 int 807 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 808 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 809 socklen_t controllen) 810 { 811 struct mbuf *m, **mp; 812 struct mbuf *cm; 813 u_long len, offset, moff; 814 int flags, error, type, uio_error = 0; 815 const struct protosw *pr = so->so_proto; 816 struct mbuf *nextrecord; 817 size_t resid, orig_resid = uio->uio_resid; 818 819 mp = mp0; 820 if (paddr) 821 *paddr = NULL; 822 if (controlp) 823 *controlp = NULL; 824 if (flagsp) 825 flags = *flagsp &~ MSG_EOR; 826 else 827 flags = 0; 828 if (flags & MSG_OOB) { 829 m = m_get(M_WAIT, MT_DATA); 830 solock(so); 831 error = pru_rcvoob(so, m, flags & MSG_PEEK); 832 sounlock(so); 833 if (error) 834 goto bad; 835 do { 836 error = uiomove(mtod(m, caddr_t), 837 ulmin(uio->uio_resid, m->m_len), uio); 838 m = m_free(m); 839 } while (uio->uio_resid && error == 0 && m); 840 bad: 841 m_freem(m); 842 return (error); 843 } 844 if (mp) 845 *mp = NULL; 846 847 solock_shared(so); 848 restart: 849 if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { 850 sounlock_shared(so); 851 return (error); 852 } 853 sb_mtx_lock(&so->so_rcv); 854 855 m = so->so_rcv.sb_mb; 856 #ifdef SOCKET_SPLICE 857 if (isspliced(so)) 858 m = NULL; 859 #endif /* SOCKET_SPLICE */ 860 /* 861 * If we have less data than requested, block awaiting more 862 * (subject to any timeout) if: 863 * 1. the current count is less than the low water mark, 864 * 2. MSG_WAITALL is set, and it is possible to do the entire 865 * receive operation at once if we block (resid <= hiwat), or 866 * 3. MSG_DONTWAIT is not set. 867 * If MSG_WAITALL is set but resid is larger than the receive buffer, 868 * we have to do the receive in sections, and thus risk returning 869 * a short count if a timeout or signal occurs after we start. 870 */ 871 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 872 so->so_rcv.sb_cc < uio->uio_resid) && 873 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 874 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 875 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 876 #ifdef DIAGNOSTIC 877 if (m == NULL && so->so_rcv.sb_cc) 878 #ifdef SOCKET_SPLICE 879 if (!isspliced(so)) 880 #endif /* SOCKET_SPLICE */ 881 panic("receive 1: so %p, so_type %d, sb_cc %lu", 882 so, so->so_type, so->so_rcv.sb_cc); 883 #endif 884 if (so->so_error) { 885 if (m) 886 goto dontblock; 887 error = so->so_error; 888 if ((flags & MSG_PEEK) == 0) 889 so->so_error = 0; 890 goto release; 891 } 892 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 893 if (m) 894 goto dontblock; 895 else if (so->so_rcv.sb_cc == 0) 896 goto release; 897 } 898 for (; m; m = m->m_next) 899 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 900 m = so->so_rcv.sb_mb; 901 goto dontblock; 902 } 903 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 904 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 905 error = ENOTCONN; 906 goto release; 907 } 908 if (uio->uio_resid == 0 && controlp == NULL) 909 goto release; 910 if (flags & MSG_DONTWAIT) { 911 error = EWOULDBLOCK; 912 goto release; 913 } 914 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 915 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 916 917 if (so->so_rcv.sb_flags & SB_OWNLOCK) { 918 sbunlock_locked(so, &so->so_rcv); 919 sounlock_shared(so); 920 error = sbwait_locked(so, &so->so_rcv); 921 sb_mtx_unlock(&so->so_rcv); 922 if (error) 923 return (error); 924 solock_shared(so); 925 } else { 926 sb_mtx_unlock(&so->so_rcv); 927 sbunlock(so, &so->so_rcv); 928 error = sbwait(so, &so->so_rcv); 929 if (error) { 930 sounlock_shared(so); 931 return (error); 932 } 933 } 934 goto restart; 935 } 936 dontblock: 937 /* 938 * On entry here, m points to the first record of the socket buffer. 939 * From this point onward, we maintain 'nextrecord' as a cache of the 940 * pointer to the next record in the socket buffer. We must keep the 941 * various socket buffer pointers and local stack versions of the 942 * pointers in sync, pushing out modifications before operations that 943 * may sleep, and re-reading them afterwards. 944 * 945 * Otherwise, we will race with the network stack appending new data 946 * or records onto the socket buffer by using inconsistent/stale 947 * versions of the field, possibly resulting in socket buffer 948 * corruption. 949 */ 950 if (uio->uio_procp) 951 uio->uio_procp->p_ru.ru_msgrcv++; 952 KASSERT(m == so->so_rcv.sb_mb); 953 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 954 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 955 nextrecord = m->m_nextpkt; 956 if (pr->pr_flags & PR_ADDR) { 957 #ifdef DIAGNOSTIC 958 if (m->m_type != MT_SONAME) 959 panic("receive 1a: so %p, so_type %d, m %p, m_type %d", 960 so, so->so_type, m, m->m_type); 961 #endif 962 orig_resid = 0; 963 if (flags & MSG_PEEK) { 964 if (paddr) 965 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT); 966 m = m->m_next; 967 } else { 968 sbfree(so, &so->so_rcv, m); 969 if (paddr) { 970 *paddr = m; 971 so->so_rcv.sb_mb = m->m_next; 972 m->m_next = NULL; 973 m = so->so_rcv.sb_mb; 974 } else { 975 so->so_rcv.sb_mb = m_free(m); 976 m = so->so_rcv.sb_mb; 977 } 978 sbsync(&so->so_rcv, nextrecord); 979 } 980 } 981 while (m && m->m_type == MT_CONTROL && error == 0) { 982 int skip = 0; 983 if (flags & MSG_PEEK) { 984 if (mtod(m, struct cmsghdr *)->cmsg_type == 985 SCM_RIGHTS) { 986 /* don't leak internalized SCM_RIGHTS msgs */ 987 skip = 1; 988 } else if (controlp) 989 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); 990 m = m->m_next; 991 } else { 992 sbfree(so, &so->so_rcv, m); 993 so->so_rcv.sb_mb = m->m_next; 994 m->m_nextpkt = m->m_next = NULL; 995 cm = m; 996 m = so->so_rcv.sb_mb; 997 sbsync(&so->so_rcv, nextrecord); 998 if (controlp) { 999 if (pr->pr_domain->dom_externalize) { 1000 sb_mtx_unlock(&so->so_rcv); 1001 sounlock_shared(so); 1002 error = 1003 (*pr->pr_domain->dom_externalize) 1004 (cm, controllen, flags); 1005 solock_shared(so); 1006 sb_mtx_lock(&so->so_rcv); 1007 } 1008 *controlp = cm; 1009 } else { 1010 /* 1011 * Dispose of any SCM_RIGHTS message that went 1012 * through the read path rather than recv. 1013 */ 1014 if (pr->pr_domain->dom_dispose) { 1015 sb_mtx_unlock(&so->so_rcv); 1016 pr->pr_domain->dom_dispose(cm); 1017 sb_mtx_lock(&so->so_rcv); 1018 } 1019 m_free(cm); 1020 } 1021 } 1022 if (m != NULL) 1023 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1024 else 1025 nextrecord = so->so_rcv.sb_mb; 1026 if (controlp && !skip) 1027 controlp = &(*controlp)->m_next; 1028 orig_resid = 0; 1029 } 1030 1031 /* If m is non-NULL, we have some data to read. */ 1032 if (m) { 1033 type = m->m_type; 1034 if (type == MT_OOBDATA) 1035 flags |= MSG_OOB; 1036 if (m->m_flags & M_BCAST) 1037 flags |= MSG_BCAST; 1038 if (m->m_flags & M_MCAST) 1039 flags |= MSG_MCAST; 1040 } 1041 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1042 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1043 1044 moff = 0; 1045 offset = 0; 1046 while (m && uio->uio_resid > 0 && error == 0) { 1047 if (m->m_type == MT_OOBDATA) { 1048 if (type != MT_OOBDATA) 1049 break; 1050 } else if (type == MT_OOBDATA) { 1051 break; 1052 } else if (m->m_type == MT_CONTROL) { 1053 /* 1054 * If there is more than one control message in the 1055 * stream, we do a short read. Next can be received 1056 * or disposed by another system call. 1057 */ 1058 break; 1059 #ifdef DIAGNOSTIC 1060 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { 1061 panic("receive 3: so %p, so_type %d, m %p, m_type %d", 1062 so, so->so_type, m, m->m_type); 1063 #endif 1064 } 1065 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1066 len = uio->uio_resid; 1067 if (so->so_oobmark && len > so->so_oobmark - offset) 1068 len = so->so_oobmark - offset; 1069 if (len > m->m_len - moff) 1070 len = m->m_len - moff; 1071 /* 1072 * If mp is set, just pass back the mbufs. 1073 * Otherwise copy them out via the uio, then free. 1074 * Sockbuf must be consistent here (points to current mbuf, 1075 * it points to next record) when we drop priority; 1076 * we must note any additions to the sockbuf when we 1077 * block interrupts again. 1078 */ 1079 if (mp == NULL && uio_error == 0) { 1080 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 1081 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 1082 resid = uio->uio_resid; 1083 sb_mtx_unlock(&so->so_rcv); 1084 sounlock_shared(so); 1085 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1086 solock_shared(so); 1087 sb_mtx_lock(&so->so_rcv); 1088 if (uio_error) 1089 uio->uio_resid = resid - len; 1090 } else 1091 uio->uio_resid -= len; 1092 if (len == m->m_len - moff) { 1093 if (m->m_flags & M_EOR) 1094 flags |= MSG_EOR; 1095 if (flags & MSG_PEEK) { 1096 m = m->m_next; 1097 moff = 0; 1098 orig_resid = 0; 1099 } else { 1100 nextrecord = m->m_nextpkt; 1101 sbfree(so, &so->so_rcv, m); 1102 if (mp) { 1103 *mp = m; 1104 mp = &m->m_next; 1105 so->so_rcv.sb_mb = m = m->m_next; 1106 *mp = NULL; 1107 } else { 1108 so->so_rcv.sb_mb = m_free(m); 1109 m = so->so_rcv.sb_mb; 1110 } 1111 /* 1112 * If m != NULL, we also know that 1113 * so->so_rcv.sb_mb != NULL. 1114 */ 1115 KASSERT(so->so_rcv.sb_mb == m); 1116 if (m) { 1117 m->m_nextpkt = nextrecord; 1118 if (nextrecord == NULL) 1119 so->so_rcv.sb_lastrecord = m; 1120 } else { 1121 so->so_rcv.sb_mb = nextrecord; 1122 SB_EMPTY_FIXUP(&so->so_rcv); 1123 } 1124 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 1125 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 1126 } 1127 } else { 1128 if (flags & MSG_PEEK) { 1129 moff += len; 1130 orig_resid = 0; 1131 } else { 1132 if (mp) 1133 *mp = m_copym(m, 0, len, M_WAIT); 1134 m->m_data += len; 1135 m->m_len -= len; 1136 so->so_rcv.sb_cc -= len; 1137 so->so_rcv.sb_datacc -= len; 1138 } 1139 } 1140 if (so->so_oobmark) { 1141 if ((flags & MSG_PEEK) == 0) { 1142 so->so_oobmark -= len; 1143 if (so->so_oobmark == 0) { 1144 so->so_rcv.sb_state |= SS_RCVATMARK; 1145 break; 1146 } 1147 } else { 1148 offset += len; 1149 if (offset == so->so_oobmark) 1150 break; 1151 } 1152 } 1153 if (flags & MSG_EOR) 1154 break; 1155 /* 1156 * If the MSG_WAITALL flag is set (for non-atomic socket), 1157 * we must not quit until "uio->uio_resid == 0" or an error 1158 * termination. If a signal/timeout occurs, return 1159 * with a short count but without error. 1160 * Keep sockbuf locked against other readers. 1161 */ 1162 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1163 !sosendallatonce(so) && !nextrecord) { 1164 if (so->so_rcv.sb_state & SS_CANTRCVMORE || 1165 so->so_error) 1166 break; 1167 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 1168 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 1169 sb_mtx_unlock(&so->so_rcv); 1170 error = sbwait(so, &so->so_rcv); 1171 if (error) { 1172 sbunlock(so, &so->so_rcv); 1173 sounlock_shared(so); 1174 return (0); 1175 } 1176 sb_mtx_lock(&so->so_rcv); 1177 if ((m = so->so_rcv.sb_mb) != NULL) 1178 nextrecord = m->m_nextpkt; 1179 } 1180 } 1181 1182 if (m && pr->pr_flags & PR_ATOMIC) { 1183 flags |= MSG_TRUNC; 1184 if ((flags & MSG_PEEK) == 0) 1185 (void) sbdroprecord(so, &so->so_rcv); 1186 } 1187 if ((flags & MSG_PEEK) == 0) { 1188 if (m == NULL) { 1189 /* 1190 * First part is an inline SB_EMPTY_FIXUP(). Second 1191 * part makes sure sb_lastrecord is up-to-date if 1192 * there is still data in the socket buffer. 1193 */ 1194 so->so_rcv.sb_mb = nextrecord; 1195 if (so->so_rcv.sb_mb == NULL) { 1196 so->so_rcv.sb_mbtail = NULL; 1197 so->so_rcv.sb_lastrecord = NULL; 1198 } else if (nextrecord->m_nextpkt == NULL) 1199 so->so_rcv.sb_lastrecord = nextrecord; 1200 } 1201 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 1202 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 1203 if (pr->pr_flags & PR_WANTRCVD) { 1204 sb_mtx_unlock(&so->so_rcv); 1205 pru_rcvd(so); 1206 sb_mtx_lock(&so->so_rcv); 1207 } 1208 } 1209 if (orig_resid == uio->uio_resid && orig_resid && 1210 (flags & MSG_EOR) == 0 && 1211 (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { 1212 sb_mtx_unlock(&so->so_rcv); 1213 sbunlock(so, &so->so_rcv); 1214 goto restart; 1215 } 1216 1217 if (uio_error) 1218 error = uio_error; 1219 1220 if (flagsp) 1221 *flagsp |= flags; 1222 release: 1223 sb_mtx_unlock(&so->so_rcv); 1224 sbunlock(so, &so->so_rcv); 1225 sounlock_shared(so); 1226 return (error); 1227 } 1228 1229 int 1230 soshutdown(struct socket *so, int how) 1231 { 1232 int error = 0; 1233 1234 solock(so); 1235 switch (how) { 1236 case SHUT_RD: 1237 sorflush(so); 1238 break; 1239 case SHUT_RDWR: 1240 sorflush(so); 1241 /* FALLTHROUGH */ 1242 case SHUT_WR: 1243 error = pru_shutdown(so); 1244 break; 1245 default: 1246 error = EINVAL; 1247 break; 1248 } 1249 sounlock(so); 1250 1251 return (error); 1252 } 1253 1254 void 1255 sorflush(struct socket *so) 1256 { 1257 struct sockbuf *sb = &so->so_rcv; 1258 struct mbuf *m; 1259 const struct protosw *pr = so->so_proto; 1260 int error; 1261 1262 error = sblock(so, sb, SBL_WAIT | SBL_NOINTR); 1263 /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */ 1264 KASSERT(error == 0); 1265 socantrcvmore(so); 1266 mtx_enter(&sb->sb_mtx); 1267 m = sb->sb_mb; 1268 memset(&sb->sb_startzero, 0, 1269 (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); 1270 sb->sb_timeo_nsecs = INFSLP; 1271 mtx_leave(&sb->sb_mtx); 1272 sbunlock(so, sb); 1273 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1274 (*pr->pr_domain->dom_dispose)(m); 1275 m_purge(m); 1276 } 1277 1278 #ifdef SOCKET_SPLICE 1279 1280 #define so_splicelen so_sp->ssp_len 1281 #define so_splicemax so_sp->ssp_max 1282 #define so_idletv so_sp->ssp_idletv 1283 #define so_idleto so_sp->ssp_idleto 1284 #define so_splicetask so_sp->ssp_task 1285 1286 int 1287 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) 1288 { 1289 struct file *fp; 1290 struct socket *sosp; 1291 struct sosplice *sp; 1292 struct taskq *tq; 1293 int error = 0; 1294 1295 soassertlocked(so); 1296 1297 if (sosplice_taskq == NULL) { 1298 rw_enter_write(&sosplice_lock); 1299 if (sosplice_taskq == NULL) { 1300 tq = taskq_create("sosplice", 1, IPL_SOFTNET, 1301 TASKQ_MPSAFE); 1302 if (tq == NULL) { 1303 rw_exit_write(&sosplice_lock); 1304 return (ENOMEM); 1305 } 1306 /* Ensure the taskq is fully visible to other CPUs. */ 1307 membar_producer(); 1308 sosplice_taskq = tq; 1309 } 1310 rw_exit_write(&sosplice_lock); 1311 } else { 1312 /* Ensure the taskq is fully visible on this CPU. */ 1313 membar_consumer(); 1314 } 1315 1316 if ((so->so_proto->pr_flags & PR_SPLICE) == 0) 1317 return (EPROTONOSUPPORT); 1318 if (so->so_options & SO_ACCEPTCONN) 1319 return (EOPNOTSUPP); 1320 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1321 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1322 return (ENOTCONN); 1323 if (so->so_sp == NULL) { 1324 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1325 if (so->so_sp == NULL) 1326 so->so_sp = sp; 1327 else 1328 pool_put(&sosplice_pool, sp); 1329 } 1330 1331 /* If no fd is given, unsplice by removing existing link. */ 1332 if (fd < 0) { 1333 /* Lock receive buffer. */ 1334 if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { 1335 return (error); 1336 } 1337 if (so->so_sp->ssp_socket) 1338 sounsplice(so, so->so_sp->ssp_socket, 0); 1339 sbunlock(so, &so->so_rcv); 1340 return (0); 1341 } 1342 1343 if (max && max < 0) 1344 return (EINVAL); 1345 1346 if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) 1347 return (EINVAL); 1348 1349 /* Find sosp, the drain socket where data will be spliced into. */ 1350 if ((error = getsock(curproc, fd, &fp)) != 0) 1351 return (error); 1352 sosp = fp->f_data; 1353 if (sosp->so_proto->pr_usrreqs->pru_send != 1354 so->so_proto->pr_usrreqs->pru_send) { 1355 error = EPROTONOSUPPORT; 1356 goto frele; 1357 } 1358 if (sosp->so_sp == NULL) { 1359 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1360 if (sosp->so_sp == NULL) 1361 sosp->so_sp = sp; 1362 else 1363 pool_put(&sosplice_pool, sp); 1364 } 1365 1366 /* Lock both receive and send buffer. */ 1367 if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { 1368 goto frele; 1369 } 1370 if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) { 1371 sbunlock(so, &so->so_rcv); 1372 goto frele; 1373 } 1374 1375 if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { 1376 error = EBUSY; 1377 goto release; 1378 } 1379 if (sosp->so_options & SO_ACCEPTCONN) { 1380 error = EOPNOTSUPP; 1381 goto release; 1382 } 1383 if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) { 1384 error = ENOTCONN; 1385 goto release; 1386 } 1387 1388 /* Splice so and sosp together. */ 1389 so->so_sp->ssp_socket = sosp; 1390 sosp->so_sp->ssp_soback = so; 1391 so->so_splicelen = 0; 1392 so->so_splicemax = max; 1393 if (tv) 1394 so->so_idletv = *tv; 1395 else 1396 timerclear(&so->so_idletv); 1397 timeout_set_proc(&so->so_idleto, soidle, so); 1398 task_set(&so->so_splicetask, sotask, so); 1399 1400 /* 1401 * To prevent softnet interrupt from calling somove() while 1402 * we sleep, the socket buffers are not marked as spliced yet. 1403 */ 1404 if (somove(so, M_WAIT)) { 1405 mtx_enter(&so->so_rcv.sb_mtx); 1406 so->so_rcv.sb_flags |= SB_SPLICE; 1407 mtx_leave(&so->so_rcv.sb_mtx); 1408 sosp->so_snd.sb_flags |= SB_SPLICE; 1409 } 1410 1411 release: 1412 sbunlock(sosp, &sosp->so_snd); 1413 sbunlock(so, &so->so_rcv); 1414 frele: 1415 /* 1416 * FRELE() must not be called with the socket lock held. It is safe to 1417 * release the lock here as long as no other operation happen on the 1418 * socket when sosplice() returns. The dance could be avoided by 1419 * grabbing the socket lock inside this function. 1420 */ 1421 sounlock(so); 1422 FRELE(fp, curproc); 1423 solock(so); 1424 return (error); 1425 } 1426 1427 void 1428 sounsplice(struct socket *so, struct socket *sosp, int freeing) 1429 { 1430 soassertlocked(so); 1431 1432 task_del(sosplice_taskq, &so->so_splicetask); 1433 timeout_del(&so->so_idleto); 1434 sosp->so_snd.sb_flags &= ~SB_SPLICE; 1435 mtx_enter(&so->so_rcv.sb_mtx); 1436 so->so_rcv.sb_flags &= ~SB_SPLICE; 1437 mtx_leave(&so->so_rcv.sb_mtx); 1438 so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; 1439 /* Do not wakeup a socket that is about to be freed. */ 1440 if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so)) 1441 sorwakeup(so); 1442 if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp)) 1443 sowwakeup(sosp); 1444 } 1445 1446 void 1447 soidle(void *arg) 1448 { 1449 struct socket *so = arg; 1450 1451 solock(so); 1452 if (so->so_rcv.sb_flags & SB_SPLICE) { 1453 so->so_error = ETIMEDOUT; 1454 sounsplice(so, so->so_sp->ssp_socket, 0); 1455 } 1456 sounlock(so); 1457 } 1458 1459 void 1460 sotask(void *arg) 1461 { 1462 struct socket *so = arg; 1463 1464 solock(so); 1465 if (so->so_rcv.sb_flags & SB_SPLICE) { 1466 /* 1467 * We may not sleep here as sofree() and unsplice() may be 1468 * called from softnet interrupt context. This would remove 1469 * the socket during somove(). 1470 */ 1471 somove(so, M_DONTWAIT); 1472 } 1473 sounlock(so); 1474 1475 /* Avoid user land starvation. */ 1476 yield(); 1477 } 1478 1479 /* 1480 * The socket splicing task or idle timeout may sleep while grabbing the net 1481 * lock. As sofree() can be called anytime, sotask() or soidle() could access 1482 * the socket memory of a freed socket after wakeup. So delay the pool_put() 1483 * after all pending socket splicing tasks or timeouts have finished. Do this 1484 * by scheduling it on the same threads. 1485 */ 1486 void 1487 soreaper(void *arg) 1488 { 1489 struct socket *so = arg; 1490 1491 /* Reuse splice task, sounsplice() has been called before. */ 1492 task_set(&so->so_sp->ssp_task, soput, so); 1493 task_add(sosplice_taskq, &so->so_sp->ssp_task); 1494 } 1495 1496 void 1497 soput(void *arg) 1498 { 1499 struct socket *so = arg; 1500 1501 pool_put(&sosplice_pool, so->so_sp); 1502 pool_put(&socket_pool, so); 1503 } 1504 1505 /* 1506 * Move data from receive buffer of spliced source socket to send 1507 * buffer of drain socket. Try to move as much as possible in one 1508 * big chunk. It is a TCP only implementation. 1509 * Return value 0 means splicing has been finished, 1 continue. 1510 */ 1511 int 1512 somove(struct socket *so, int wait) 1513 { 1514 struct socket *sosp = so->so_sp->ssp_socket; 1515 struct mbuf *m, **mp, *nextrecord; 1516 u_long len, off, oobmark; 1517 long space; 1518 int error = 0, maxreached = 0; 1519 unsigned int rcvstate; 1520 1521 soassertlocked(so); 1522 1523 nextpkt: 1524 if (so->so_error) { 1525 error = so->so_error; 1526 goto release; 1527 } 1528 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) { 1529 error = EPIPE; 1530 goto release; 1531 } 1532 if (sosp->so_error && sosp->so_error != ETIMEDOUT && 1533 sosp->so_error != EFBIG && sosp->so_error != ELOOP) { 1534 error = sosp->so_error; 1535 goto release; 1536 } 1537 if ((sosp->so_state & SS_ISCONNECTED) == 0) 1538 goto release; 1539 1540 /* Calculate how many bytes can be copied now. */ 1541 len = so->so_rcv.sb_datacc; 1542 if (so->so_splicemax) { 1543 KASSERT(so->so_splicelen < so->so_splicemax); 1544 if (so->so_splicemax <= so->so_splicelen + len) { 1545 len = so->so_splicemax - so->so_splicelen; 1546 maxreached = 1; 1547 } 1548 } 1549 space = sbspace(sosp, &sosp->so_snd); 1550 if (so->so_oobmark && so->so_oobmark < len && 1551 so->so_oobmark < space + 1024) 1552 space += 1024; 1553 if (space <= 0) { 1554 maxreached = 0; 1555 goto release; 1556 } 1557 if (space < len) { 1558 maxreached = 0; 1559 if (space < sosp->so_snd.sb_lowat) 1560 goto release; 1561 len = space; 1562 } 1563 sosp->so_snd.sb_state |= SS_ISSENDING; 1564 1565 SBLASTRECORDCHK(&so->so_rcv, "somove 1"); 1566 SBLASTMBUFCHK(&so->so_rcv, "somove 1"); 1567 m = so->so_rcv.sb_mb; 1568 if (m == NULL) 1569 goto release; 1570 nextrecord = m->m_nextpkt; 1571 1572 /* Drop address and control information not used with splicing. */ 1573 if (so->so_proto->pr_flags & PR_ADDR) { 1574 #ifdef DIAGNOSTIC 1575 if (m->m_type != MT_SONAME) 1576 panic("somove soname: so %p, so_type %d, m %p, " 1577 "m_type %d", so, so->so_type, m, m->m_type); 1578 #endif 1579 m = m->m_next; 1580 } 1581 while (m && m->m_type == MT_CONTROL) 1582 m = m->m_next; 1583 if (m == NULL) { 1584 sbdroprecord(so, &so->so_rcv); 1585 if (so->so_proto->pr_flags & PR_WANTRCVD) 1586 pru_rcvd(so); 1587 goto nextpkt; 1588 } 1589 1590 /* 1591 * By splicing sockets connected to localhost, userland might create a 1592 * loop. Dissolve splicing with error if loop is detected by counter. 1593 * 1594 * If we deal with looped broadcast/multicast packet we bail out with 1595 * no error to suppress splice termination. 1596 */ 1597 if ((m->m_flags & M_PKTHDR) && 1598 ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || 1599 ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { 1600 error = ELOOP; 1601 goto release; 1602 } 1603 1604 if (so->so_proto->pr_flags & PR_ATOMIC) { 1605 if ((m->m_flags & M_PKTHDR) == 0) 1606 panic("somove !PKTHDR: so %p, so_type %d, m %p, " 1607 "m_type %d", so, so->so_type, m, m->m_type); 1608 if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { 1609 error = EMSGSIZE; 1610 goto release; 1611 } 1612 if (len < m->m_pkthdr.len) 1613 goto release; 1614 if (m->m_pkthdr.len < len) { 1615 maxreached = 0; 1616 len = m->m_pkthdr.len; 1617 } 1618 /* 1619 * Throw away the name mbuf after it has been assured 1620 * that the whole first record can be processed. 1621 */ 1622 m = so->so_rcv.sb_mb; 1623 sbfree(so, &so->so_rcv, m); 1624 so->so_rcv.sb_mb = m_free(m); 1625 sbsync(&so->so_rcv, nextrecord); 1626 } 1627 /* 1628 * Throw away the control mbufs after it has been assured 1629 * that the whole first record can be processed. 1630 */ 1631 m = so->so_rcv.sb_mb; 1632 while (m && m->m_type == MT_CONTROL) { 1633 sbfree(so, &so->so_rcv, m); 1634 so->so_rcv.sb_mb = m_free(m); 1635 m = so->so_rcv.sb_mb; 1636 sbsync(&so->so_rcv, nextrecord); 1637 } 1638 1639 SBLASTRECORDCHK(&so->so_rcv, "somove 2"); 1640 SBLASTMBUFCHK(&so->so_rcv, "somove 2"); 1641 1642 /* Take at most len mbufs out of receive buffer. */ 1643 for (off = 0, mp = &m; off <= len && *mp; 1644 off += (*mp)->m_len, mp = &(*mp)->m_next) { 1645 u_long size = len - off; 1646 1647 #ifdef DIAGNOSTIC 1648 if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) 1649 panic("somove type: so %p, so_type %d, m %p, " 1650 "m_type %d", so, so->so_type, *mp, (*mp)->m_type); 1651 #endif 1652 if ((*mp)->m_len > size) { 1653 /* 1654 * Move only a partial mbuf at maximum splice length or 1655 * if the drain buffer is too small for this large mbuf. 1656 */ 1657 if (!maxreached && so->so_snd.sb_datacc > 0) { 1658 len -= size; 1659 break; 1660 } 1661 *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); 1662 if (*mp == NULL) { 1663 len -= size; 1664 break; 1665 } 1666 so->so_rcv.sb_mb->m_data += size; 1667 so->so_rcv.sb_mb->m_len -= size; 1668 so->so_rcv.sb_cc -= size; 1669 so->so_rcv.sb_datacc -= size; 1670 } else { 1671 *mp = so->so_rcv.sb_mb; 1672 sbfree(so, &so->so_rcv, *mp); 1673 so->so_rcv.sb_mb = (*mp)->m_next; 1674 sbsync(&so->so_rcv, nextrecord); 1675 } 1676 } 1677 *mp = NULL; 1678 1679 SBLASTRECORDCHK(&so->so_rcv, "somove 3"); 1680 SBLASTMBUFCHK(&so->so_rcv, "somove 3"); 1681 SBCHECK(so, &so->so_rcv); 1682 if (m == NULL) 1683 goto release; 1684 m->m_nextpkt = NULL; 1685 if (m->m_flags & M_PKTHDR) { 1686 m_resethdr(m); 1687 m->m_pkthdr.len = len; 1688 } 1689 1690 /* Send window update to source peer as receive buffer has changed. */ 1691 if (so->so_proto->pr_flags & PR_WANTRCVD) 1692 pru_rcvd(so); 1693 1694 /* Receive buffer did shrink by len bytes, adjust oob. */ 1695 mtx_enter(&so->so_rcv.sb_mtx); 1696 rcvstate = so->so_rcv.sb_state; 1697 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1698 oobmark = so->so_oobmark; 1699 so->so_oobmark = oobmark > len ? oobmark - len : 0; 1700 if (oobmark) { 1701 if (oobmark == len) 1702 so->so_rcv.sb_state |= SS_RCVATMARK; 1703 if (oobmark >= len) 1704 oobmark = 0; 1705 } 1706 mtx_leave(&so->so_rcv.sb_mtx); 1707 1708 /* 1709 * Handle oob data. If any malloc fails, ignore error. 1710 * TCP urgent data is not very reliable anyway. 1711 */ 1712 while (((rcvstate & SS_RCVATMARK) || oobmark) && 1713 (so->so_options & SO_OOBINLINE)) { 1714 struct mbuf *o = NULL; 1715 1716 if (rcvstate & SS_RCVATMARK) { 1717 o = m_get(wait, MT_DATA); 1718 rcvstate &= ~SS_RCVATMARK; 1719 } else if (oobmark) { 1720 o = m_split(m, oobmark, wait); 1721 if (o) { 1722 error = pru_send(sosp, m, NULL, NULL); 1723 if (error) { 1724 if (sosp->so_snd.sb_state & 1725 SS_CANTSENDMORE) 1726 error = EPIPE; 1727 m_freem(o); 1728 goto release; 1729 } 1730 len -= oobmark; 1731 so->so_splicelen += oobmark; 1732 m = o; 1733 o = m_get(wait, MT_DATA); 1734 } 1735 oobmark = 0; 1736 } 1737 if (o) { 1738 o->m_len = 1; 1739 *mtod(o, caddr_t) = *mtod(m, caddr_t); 1740 error = pru_sendoob(sosp, o, NULL, NULL); 1741 if (error) { 1742 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1743 error = EPIPE; 1744 m_freem(m); 1745 goto release; 1746 } 1747 len -= 1; 1748 so->so_splicelen += 1; 1749 if (oobmark) { 1750 oobmark -= 1; 1751 if (oobmark == 0) 1752 rcvstate |= SS_RCVATMARK; 1753 } 1754 m_adj(m, 1); 1755 } 1756 } 1757 1758 /* Append all remaining data to drain socket. */ 1759 if (so->so_rcv.sb_cc == 0 || maxreached) 1760 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1761 error = pru_send(sosp, m, NULL, NULL); 1762 if (error) { 1763 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1764 error = EPIPE; 1765 goto release; 1766 } 1767 so->so_splicelen += len; 1768 1769 /* Move several packets if possible. */ 1770 if (!maxreached && nextrecord) 1771 goto nextpkt; 1772 1773 release: 1774 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1775 if (!error && maxreached && so->so_splicemax == so->so_splicelen) 1776 error = EFBIG; 1777 if (error) 1778 so->so_error = error; 1779 if (((so->so_rcv.sb_state & SS_CANTRCVMORE) && 1780 so->so_rcv.sb_cc == 0) || 1781 (sosp->so_snd.sb_state & SS_CANTSENDMORE) || 1782 maxreached || error) { 1783 sounsplice(so, sosp, 0); 1784 return (0); 1785 } 1786 if (timerisset(&so->so_idletv)) 1787 timeout_add_tv(&so->so_idleto, &so->so_idletv); 1788 return (1); 1789 } 1790 1791 #endif /* SOCKET_SPLICE */ 1792 1793 void 1794 sorwakeup(struct socket *so) 1795 { 1796 if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) 1797 soassertlocked_readonly(so); 1798 1799 #ifdef SOCKET_SPLICE 1800 if (so->so_rcv.sb_flags & SB_SPLICE) { 1801 /* 1802 * TCP has a sendbuffer that can handle multiple packets 1803 * at once. So queue the stream a bit to accumulate data. 1804 * The sosplice thread will call somove() later and send 1805 * the packets calling tcp_output() only once. 1806 * In the UDP case, send out the packets immediately. 1807 * Using a thread would make things slower. 1808 */ 1809 if (so->so_proto->pr_flags & PR_WANTRCVD) 1810 task_add(sosplice_taskq, &so->so_splicetask); 1811 else 1812 somove(so, M_DONTWAIT); 1813 } 1814 if (isspliced(so)) 1815 return; 1816 #endif 1817 sowakeup(so, &so->so_rcv); 1818 if (so->so_upcall) 1819 (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); 1820 } 1821 1822 void 1823 sowwakeup(struct socket *so) 1824 { 1825 soassertlocked_readonly(so); 1826 1827 #ifdef SOCKET_SPLICE 1828 if (so->so_snd.sb_flags & SB_SPLICE) 1829 task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask); 1830 if (issplicedback(so)) 1831 return; 1832 #endif 1833 sowakeup(so, &so->so_snd); 1834 } 1835 1836 int 1837 sosetopt(struct socket *so, int level, int optname, struct mbuf *m) 1838 { 1839 int error = 0; 1840 1841 if (level != SOL_SOCKET) { 1842 if (so->so_proto->pr_ctloutput) { 1843 solock(so); 1844 error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, 1845 level, optname, m); 1846 sounlock(so); 1847 return (error); 1848 } 1849 error = ENOPROTOOPT; 1850 } else { 1851 switch (optname) { 1852 1853 case SO_LINGER: 1854 if (m == NULL || m->m_len != sizeof (struct linger) || 1855 mtod(m, struct linger *)->l_linger < 0 || 1856 mtod(m, struct linger *)->l_linger > SHRT_MAX) 1857 return (EINVAL); 1858 1859 solock(so); 1860 so->so_linger = mtod(m, struct linger *)->l_linger; 1861 if (*mtod(m, int *)) 1862 so->so_options |= optname; 1863 else 1864 so->so_options &= ~optname; 1865 sounlock(so); 1866 1867 break; 1868 case SO_BINDANY: 1869 if ((error = suser(curproc)) != 0) /* XXX */ 1870 return (error); 1871 /* FALLTHROUGH */ 1872 1873 case SO_DEBUG: 1874 case SO_KEEPALIVE: 1875 case SO_USELOOPBACK: 1876 case SO_BROADCAST: 1877 case SO_REUSEADDR: 1878 case SO_REUSEPORT: 1879 case SO_OOBINLINE: 1880 case SO_TIMESTAMP: 1881 case SO_ZEROIZE: 1882 if (m == NULL || m->m_len < sizeof (int)) 1883 return (EINVAL); 1884 1885 solock(so); 1886 if (*mtod(m, int *)) 1887 so->so_options |= optname; 1888 else 1889 so->so_options &= ~optname; 1890 sounlock(so); 1891 1892 break; 1893 case SO_DONTROUTE: 1894 if (m == NULL || m->m_len < sizeof (int)) 1895 return (EINVAL); 1896 if (*mtod(m, int *)) 1897 error = EOPNOTSUPP; 1898 break; 1899 1900 case SO_SNDBUF: 1901 case SO_RCVBUF: 1902 case SO_SNDLOWAT: 1903 case SO_RCVLOWAT: 1904 { 1905 struct sockbuf *sb = (optname == SO_SNDBUF || 1906 optname == SO_SNDLOWAT ? 1907 &so->so_snd : &so->so_rcv); 1908 u_long cnt; 1909 1910 if (m == NULL || m->m_len < sizeof (int)) 1911 return (EINVAL); 1912 cnt = *mtod(m, int *); 1913 if ((long)cnt <= 0) 1914 cnt = 1; 1915 1916 solock(so); 1917 mtx_enter(&sb->sb_mtx); 1918 1919 switch (optname) { 1920 case SO_SNDBUF: 1921 case SO_RCVBUF: 1922 if (sb->sb_state & 1923 (SS_CANTSENDMORE | SS_CANTRCVMORE)) { 1924 error = EINVAL; 1925 break; 1926 } 1927 if (sbcheckreserve(cnt, sb->sb_wat) || 1928 sbreserve(so, sb, cnt)) { 1929 error = ENOBUFS; 1930 break; 1931 } 1932 sb->sb_wat = cnt; 1933 break; 1934 case SO_SNDLOWAT: 1935 case SO_RCVLOWAT: 1936 sb->sb_lowat = (cnt > sb->sb_hiwat) ? 1937 sb->sb_hiwat : cnt; 1938 break; 1939 } 1940 1941 mtx_leave(&sb->sb_mtx); 1942 sounlock(so); 1943 1944 break; 1945 } 1946 1947 case SO_SNDTIMEO: 1948 case SO_RCVTIMEO: 1949 { 1950 struct sockbuf *sb = (optname == SO_SNDTIMEO ? 1951 &so->so_snd : &so->so_rcv); 1952 struct timeval tv; 1953 uint64_t nsecs; 1954 1955 if (m == NULL || m->m_len < sizeof (tv)) 1956 return (EINVAL); 1957 memcpy(&tv, mtod(m, struct timeval *), sizeof tv); 1958 if (!timerisvalid(&tv)) 1959 return (EINVAL); 1960 nsecs = TIMEVAL_TO_NSEC(&tv); 1961 if (nsecs == UINT64_MAX) 1962 return (EDOM); 1963 if (nsecs == 0) 1964 nsecs = INFSLP; 1965 1966 mtx_enter(&sb->sb_mtx); 1967 sb->sb_timeo_nsecs = nsecs; 1968 mtx_leave(&sb->sb_mtx); 1969 break; 1970 } 1971 1972 case SO_RTABLE: 1973 if (so->so_proto->pr_domain && 1974 so->so_proto->pr_domain->dom_protosw && 1975 so->so_proto->pr_ctloutput) { 1976 const struct domain *dom = 1977 so->so_proto->pr_domain; 1978 1979 level = dom->dom_protosw->pr_protocol; 1980 solock(so); 1981 error = (*so->so_proto->pr_ctloutput) 1982 (PRCO_SETOPT, so, level, optname, m); 1983 sounlock(so); 1984 } else 1985 error = ENOPROTOOPT; 1986 break; 1987 #ifdef SOCKET_SPLICE 1988 case SO_SPLICE: 1989 solock(so); 1990 if (m == NULL) { 1991 error = sosplice(so, -1, 0, NULL); 1992 } else if (m->m_len < sizeof(int)) { 1993 error = EINVAL; 1994 } else if (m->m_len < sizeof(struct splice)) { 1995 error = sosplice(so, *mtod(m, int *), 0, NULL); 1996 } else { 1997 error = sosplice(so, 1998 mtod(m, struct splice *)->sp_fd, 1999 mtod(m, struct splice *)->sp_max, 2000 &mtod(m, struct splice *)->sp_idle); 2001 } 2002 sounlock(so); 2003 break; 2004 #endif /* SOCKET_SPLICE */ 2005 2006 default: 2007 error = ENOPROTOOPT; 2008 break; 2009 } 2010 } 2011 2012 return (error); 2013 } 2014 2015 int 2016 sogetopt(struct socket *so, int level, int optname, struct mbuf *m) 2017 { 2018 int error = 0; 2019 2020 if (level != SOL_SOCKET) { 2021 if (so->so_proto->pr_ctloutput) { 2022 m->m_len = 0; 2023 2024 solock(so); 2025 error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so, 2026 level, optname, m); 2027 sounlock(so); 2028 return (error); 2029 } else 2030 return (ENOPROTOOPT); 2031 } else { 2032 m->m_len = sizeof (int); 2033 2034 switch (optname) { 2035 2036 case SO_LINGER: 2037 m->m_len = sizeof (struct linger); 2038 solock_shared(so); 2039 mtod(m, struct linger *)->l_onoff = 2040 so->so_options & SO_LINGER; 2041 mtod(m, struct linger *)->l_linger = so->so_linger; 2042 sounlock_shared(so); 2043 break; 2044 2045 case SO_BINDANY: 2046 case SO_USELOOPBACK: 2047 case SO_DEBUG: 2048 case SO_KEEPALIVE: 2049 case SO_REUSEADDR: 2050 case SO_REUSEPORT: 2051 case SO_BROADCAST: 2052 case SO_OOBINLINE: 2053 case SO_TIMESTAMP: 2054 case SO_ZEROIZE: 2055 *mtod(m, int *) = so->so_options & optname; 2056 break; 2057 2058 case SO_DONTROUTE: 2059 *mtod(m, int *) = 0; 2060 break; 2061 2062 case SO_TYPE: 2063 *mtod(m, int *) = so->so_type; 2064 break; 2065 2066 case SO_ERROR: 2067 solock(so); 2068 *mtod(m, int *) = so->so_error; 2069 so->so_error = 0; 2070 sounlock(so); 2071 2072 break; 2073 2074 case SO_DOMAIN: 2075 *mtod(m, int *) = so->so_proto->pr_domain->dom_family; 2076 break; 2077 2078 case SO_PROTOCOL: 2079 *mtod(m, int *) = so->so_proto->pr_protocol; 2080 break; 2081 2082 case SO_SNDBUF: 2083 *mtod(m, int *) = so->so_snd.sb_hiwat; 2084 break; 2085 2086 case SO_RCVBUF: 2087 *mtod(m, int *) = so->so_rcv.sb_hiwat; 2088 break; 2089 2090 case SO_SNDLOWAT: 2091 *mtod(m, int *) = so->so_snd.sb_lowat; 2092 break; 2093 2094 case SO_RCVLOWAT: 2095 *mtod(m, int *) = so->so_rcv.sb_lowat; 2096 break; 2097 2098 case SO_SNDTIMEO: 2099 case SO_RCVTIMEO: 2100 { 2101 struct sockbuf *sb = (optname == SO_SNDTIMEO ? 2102 &so->so_snd : &so->so_rcv); 2103 struct timeval tv; 2104 uint64_t nsecs; 2105 2106 mtx_enter(&sb->sb_mtx); 2107 nsecs = sb->sb_timeo_nsecs; 2108 mtx_leave(&sb->sb_mtx); 2109 2110 m->m_len = sizeof(struct timeval); 2111 memset(&tv, 0, sizeof(tv)); 2112 if (nsecs != INFSLP) 2113 NSEC_TO_TIMEVAL(nsecs, &tv); 2114 memcpy(mtod(m, struct timeval *), &tv, sizeof tv); 2115 break; 2116 } 2117 2118 case SO_RTABLE: 2119 if (so->so_proto->pr_domain && 2120 so->so_proto->pr_domain->dom_protosw && 2121 so->so_proto->pr_ctloutput) { 2122 const struct domain *dom = 2123 so->so_proto->pr_domain; 2124 2125 level = dom->dom_protosw->pr_protocol; 2126 solock(so); 2127 error = (*so->so_proto->pr_ctloutput) 2128 (PRCO_GETOPT, so, level, optname, m); 2129 sounlock(so); 2130 if (error) 2131 return (error); 2132 break; 2133 } 2134 return (ENOPROTOOPT); 2135 2136 #ifdef SOCKET_SPLICE 2137 case SO_SPLICE: 2138 { 2139 off_t len; 2140 2141 m->m_len = sizeof(off_t); 2142 solock_shared(so); 2143 len = so->so_sp ? so->so_sp->ssp_len : 0; 2144 sounlock_shared(so); 2145 memcpy(mtod(m, off_t *), &len, sizeof(off_t)); 2146 break; 2147 } 2148 #endif /* SOCKET_SPLICE */ 2149 2150 case SO_PEERCRED: 2151 if (so->so_proto->pr_protocol == AF_UNIX) { 2152 struct unpcb *unp = sotounpcb(so); 2153 2154 solock(so); 2155 if (unp->unp_flags & UNP_FEIDS) { 2156 m->m_len = sizeof(unp->unp_connid); 2157 memcpy(mtod(m, caddr_t), 2158 &(unp->unp_connid), m->m_len); 2159 sounlock(so); 2160 break; 2161 } 2162 sounlock(so); 2163 2164 return (ENOTCONN); 2165 } 2166 return (EOPNOTSUPP); 2167 2168 default: 2169 return (ENOPROTOOPT); 2170 } 2171 return (0); 2172 } 2173 } 2174 2175 void 2176 sohasoutofband(struct socket *so) 2177 { 2178 pgsigio(&so->so_sigio, SIGURG, 0); 2179 knote(&so->so_rcv.sb_klist, 0); 2180 } 2181 2182 void 2183 sofilt_lock(struct socket *so, struct sockbuf *sb) 2184 { 2185 switch (so->so_proto->pr_domain->dom_family) { 2186 case PF_INET: 2187 case PF_INET6: 2188 NET_LOCK_SHARED(); 2189 break; 2190 default: 2191 rw_enter_write(&so->so_lock); 2192 break; 2193 } 2194 2195 mtx_enter(&sb->sb_mtx); 2196 } 2197 2198 void 2199 sofilt_unlock(struct socket *so, struct sockbuf *sb) 2200 { 2201 mtx_leave(&sb->sb_mtx); 2202 2203 switch (so->so_proto->pr_domain->dom_family) { 2204 case PF_INET: 2205 case PF_INET6: 2206 NET_UNLOCK_SHARED(); 2207 break; 2208 default: 2209 rw_exit_write(&so->so_lock); 2210 break; 2211 } 2212 } 2213 2214 int 2215 soo_kqfilter(struct file *fp, struct knote *kn) 2216 { 2217 struct socket *so = kn->kn_fp->f_data; 2218 struct sockbuf *sb; 2219 2220 switch (kn->kn_filter) { 2221 case EVFILT_READ: 2222 kn->kn_fop = &soread_filtops; 2223 sb = &so->so_rcv; 2224 break; 2225 case EVFILT_WRITE: 2226 kn->kn_fop = &sowrite_filtops; 2227 sb = &so->so_snd; 2228 break; 2229 case EVFILT_EXCEPT: 2230 kn->kn_fop = &soexcept_filtops; 2231 sb = &so->so_rcv; 2232 break; 2233 default: 2234 return (EINVAL); 2235 } 2236 2237 klist_insert(&sb->sb_klist, kn); 2238 2239 return (0); 2240 } 2241 2242 void 2243 filt_sordetach(struct knote *kn) 2244 { 2245 struct socket *so = kn->kn_fp->f_data; 2246 2247 klist_remove(&so->so_rcv.sb_klist, kn); 2248 } 2249 2250 int 2251 filt_soread(struct knote *kn, long hint) 2252 { 2253 struct socket *so = kn->kn_fp->f_data; 2254 int rv = 0; 2255 2256 MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); 2257 if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) 2258 soassertlocked_readonly(so); 2259 2260 if (so->so_options & SO_ACCEPTCONN) { 2261 if (so->so_rcv.sb_flags & SB_MTXLOCK) 2262 soassertlocked_readonly(so); 2263 2264 kn->kn_data = so->so_qlen; 2265 rv = (kn->kn_data != 0); 2266 2267 if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) { 2268 if (so->so_state & SS_ISDISCONNECTED) { 2269 kn->kn_flags |= __EV_HUP; 2270 rv = 1; 2271 } else { 2272 rv = soreadable(so); 2273 } 2274 } 2275 2276 return rv; 2277 } 2278 2279 kn->kn_data = so->so_rcv.sb_cc; 2280 #ifdef SOCKET_SPLICE 2281 if (isspliced(so)) { 2282 rv = 0; 2283 } else 2284 #endif /* SOCKET_SPLICE */ 2285 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 2286 kn->kn_flags |= EV_EOF; 2287 if (kn->kn_flags & __EV_POLL) { 2288 if (so->so_state & SS_ISDISCONNECTED) 2289 kn->kn_flags |= __EV_HUP; 2290 } 2291 kn->kn_fflags = so->so_error; 2292 rv = 1; 2293 } else if (so->so_error) { 2294 rv = 1; 2295 } else if (kn->kn_sfflags & NOTE_LOWAT) { 2296 rv = (kn->kn_data >= kn->kn_sdata); 2297 } else { 2298 rv = (kn->kn_data >= so->so_rcv.sb_lowat); 2299 } 2300 2301 return rv; 2302 } 2303 2304 void 2305 filt_sowdetach(struct knote *kn) 2306 { 2307 struct socket *so = kn->kn_fp->f_data; 2308 2309 klist_remove(&so->so_snd.sb_klist, kn); 2310 } 2311 2312 int 2313 filt_sowrite(struct knote *kn, long hint) 2314 { 2315 struct socket *so = kn->kn_fp->f_data; 2316 int rv; 2317 2318 MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx); 2319 soassertlocked_readonly(so); 2320 2321 kn->kn_data = sbspace(so, &so->so_snd); 2322 if (so->so_snd.sb_state & SS_CANTSENDMORE) { 2323 kn->kn_flags |= EV_EOF; 2324 if (kn->kn_flags & __EV_POLL) { 2325 if (so->so_state & SS_ISDISCONNECTED) 2326 kn->kn_flags |= __EV_HUP; 2327 } 2328 kn->kn_fflags = so->so_error; 2329 rv = 1; 2330 } else if (so->so_error) { 2331 rv = 1; 2332 } else if (((so->so_state & SS_ISCONNECTED) == 0) && 2333 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 2334 rv = 0; 2335 } else if (kn->kn_sfflags & NOTE_LOWAT) { 2336 rv = (kn->kn_data >= kn->kn_sdata); 2337 } else { 2338 rv = (kn->kn_data >= so->so_snd.sb_lowat); 2339 } 2340 2341 return (rv); 2342 } 2343 2344 int 2345 filt_soexcept(struct knote *kn, long hint) 2346 { 2347 struct socket *so = kn->kn_fp->f_data; 2348 int rv = 0; 2349 2350 MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx); 2351 if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0) 2352 soassertlocked_readonly(so); 2353 2354 #ifdef SOCKET_SPLICE 2355 if (isspliced(so)) { 2356 rv = 0; 2357 } else 2358 #endif /* SOCKET_SPLICE */ 2359 if (kn->kn_sfflags & NOTE_OOB) { 2360 if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) { 2361 kn->kn_fflags |= NOTE_OOB; 2362 kn->kn_data -= so->so_oobmark; 2363 rv = 1; 2364 } 2365 } 2366 2367 if (kn->kn_flags & __EV_POLL) { 2368 if (so->so_state & SS_ISDISCONNECTED) { 2369 kn->kn_flags |= __EV_HUP; 2370 rv = 1; 2371 } 2372 } 2373 2374 return rv; 2375 } 2376 2377 int 2378 filt_sowmodify(struct kevent *kev, struct knote *kn) 2379 { 2380 struct socket *so = kn->kn_fp->f_data; 2381 int rv; 2382 2383 sofilt_lock(so, &so->so_snd); 2384 rv = knote_modify(kev, kn); 2385 sofilt_unlock(so, &so->so_snd); 2386 2387 return (rv); 2388 } 2389 2390 int 2391 filt_sowprocess(struct knote *kn, struct kevent *kev) 2392 { 2393 struct socket *so = kn->kn_fp->f_data; 2394 int rv; 2395 2396 sofilt_lock(so, &so->so_snd); 2397 rv = knote_process(kn, kev); 2398 sofilt_unlock(so, &so->so_snd); 2399 2400 return (rv); 2401 } 2402 2403 int 2404 filt_sormodify(struct kevent *kev, struct knote *kn) 2405 { 2406 struct socket *so = kn->kn_fp->f_data; 2407 int rv; 2408 2409 sofilt_lock(so, &so->so_rcv); 2410 rv = knote_modify(kev, kn); 2411 sofilt_unlock(so, &so->so_rcv); 2412 2413 return (rv); 2414 } 2415 2416 int 2417 filt_sorprocess(struct knote *kn, struct kevent *kev) 2418 { 2419 struct socket *so = kn->kn_fp->f_data; 2420 int rv; 2421 2422 sofilt_lock(so, &so->so_rcv); 2423 rv = knote_process(kn, kev); 2424 sofilt_unlock(so, &so->so_rcv); 2425 2426 return (rv); 2427 } 2428 2429 #ifdef DDB 2430 void 2431 sobuf_print(struct sockbuf *, 2432 int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); 2433 2434 void 2435 sobuf_print(struct sockbuf *sb, 2436 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) 2437 { 2438 (*pr)("\tsb_cc: %lu\n", sb->sb_cc); 2439 (*pr)("\tsb_datacc: %lu\n", sb->sb_datacc); 2440 (*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat); 2441 (*pr)("\tsb_wat: %lu\n", sb->sb_wat); 2442 (*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt); 2443 (*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax); 2444 (*pr)("\tsb_lowat: %ld\n", sb->sb_lowat); 2445 (*pr)("\tsb_mb: %p\n", sb->sb_mb); 2446 (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail); 2447 (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord); 2448 (*pr)("\tsb_sel: ...\n"); 2449 (*pr)("\tsb_flags: %04x\n", sb->sb_flags); 2450 (*pr)("\tsb_state: %04x\n", sb->sb_state); 2451 (*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs); 2452 } 2453 2454 void 2455 so_print(void *v, 2456 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) 2457 { 2458 struct socket *so = v; 2459 2460 (*pr)("socket %p\n", so); 2461 (*pr)("so_type: %i\n", so->so_type); 2462 (*pr)("so_options: 0x%04x\n", so->so_options); /* %b */ 2463 (*pr)("so_linger: %i\n", so->so_linger); 2464 (*pr)("so_state: 0x%04x\n", so->so_state); 2465 (*pr)("so_pcb: %p\n", so->so_pcb); 2466 (*pr)("so_proto: %p\n", so->so_proto); 2467 (*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio); 2468 2469 (*pr)("so_head: %p\n", so->so_head); 2470 (*pr)("so_onq: %p\n", so->so_onq); 2471 (*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0)); 2472 (*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q)); 2473 (*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe)); 2474 (*pr)("so_q0len: %i\n", so->so_q0len); 2475 (*pr)("so_qlen: %i\n", so->so_qlen); 2476 (*pr)("so_qlimit: %i\n", so->so_qlimit); 2477 (*pr)("so_timeo: %i\n", so->so_timeo); 2478 (*pr)("so_obmark: %lu\n", so->so_oobmark); 2479 2480 (*pr)("so_sp: %p\n", so->so_sp); 2481 if (so->so_sp != NULL) { 2482 (*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket); 2483 (*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback); 2484 (*pr)("\tssp_len: %lld\n", 2485 (unsigned long long)so->so_sp->ssp_len); 2486 (*pr)("\tssp_max: %lld\n", 2487 (unsigned long long)so->so_sp->ssp_max); 2488 (*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec, 2489 so->so_sp->ssp_idletv.tv_usec); 2490 (*pr)("\tssp_idleto: %spending (@%i)\n", 2491 timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ", 2492 so->so_sp->ssp_idleto.to_time); 2493 } 2494 2495 (*pr)("so_rcv:\n"); 2496 sobuf_print(&so->so_rcv, pr); 2497 (*pr)("so_snd:\n"); 2498 sobuf_print(&so->so_snd, pr); 2499 2500 (*pr)("so_upcall: %p so_upcallarg: %p\n", 2501 so->so_upcall, so->so_upcallarg); 2502 2503 (*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid); 2504 (*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid); 2505 (*pr)("so_cpid: %d\n", so->so_cpid); 2506 } 2507 #endif 2508