/*	$OpenBSD: uipc_socket.c,v 1.270 2021/12/13 14:56:55 visa Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <net/if.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
int	filt_soreadmodify(struct kevent *kev, struct knote *kn);
int	filt_soreadprocess(struct knote *kn, struct kevent *kev);
int	filt_soread_common(struct knote *kn, struct socket *so);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_sowritemodify(struct kevent *kev, struct knote *kn);
int	filt_sowriteprocess(struct knote *kn, struct kevent *kev);
int	filt_sowrite_common(struct knote *kn, struct socket *so);
int	filt_soexcept(struct knote *kn, long hint);
int	filt_soexceptmodify(struct kevent *kev, struct knote *kn);
int	filt_soexceptprocess(struct knote *kn, struct kevent *kev);
int	filt_soexcept_common(struct knote *kn, struct socket *so);
int	filt_solisten(struct knote *kn, long hint);
int	filt_solistenmodify(struct kevent *kev, struct knote *kn);
int	filt_solistenprocess(struct knote *kn, struct kevent *kev);
int	filt_solisten_common(struct knote *kn, struct socket *so);

const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
	.f_modify	= filt_solistenmodify,
	.f_process	= filt_solistenprocess,
};

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_soreadmodify,
	.f_process	= filt_soreadprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowritemodify,
	.f_process	= filt_sowriteprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_soexceptmodify,
	.f_process	= filt_soexceptprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif
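
/*
 * somaxconn and sominconn bound the listen(2) backlog passed to
 * solisten() below: negative or oversized values are clamped to
 * somaxconn, small values are raised to sominconn.  For example
 * (a sketch using the defaults above), listen(s, 5) yields
 * so_qlimit == 80 and listen(s, -1) yields so_qlimit == somaxconn.
 */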

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(int prflags)
{
	struct socket *so;

	so = pool_get(&socket_pool, prflags);
	if (so == NULL)
		return (NULL);
	rw_init(&so->so_lock, "solock");
	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_attach == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(PR_WAITOK | PR_ZERO);
	klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
	klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	s = solock(so);
	error = (*prp->pr_attach)(so, proto);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, s);
		return (error);
	}
	sounlock(so, s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int error;

	soassertlocked(so);

	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
	    curproc);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int s)
{
	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so, s);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
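		 * (Hence the soqremque() check below: it fails for a socket
		 * that is still on the accept queue, and sofree() backs off.)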
		 */
		if (!soqremque(so, 0)) {
			sounlock(so, s);
			return;
		}
	}
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_sel.si_note);
	klist_free(&so->so_snd.sb_sel.si_note);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	sounlock(so, s);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int s, error = 0;

	s = solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		KASSERT(so->so_proto->pr_detach);
		error2 = (*so->so_proto->pr_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, s);
	return (error);
}

int
soabort(struct socket *so)
{
	soassertlocked(so);

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
	    curproc);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
		    nam, NULL, curproc);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, curproc);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s, error;

	s = solock(so1);
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
	    (struct mbuf *)so2, NULL, curproc);
	sounlock(so1, s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
	    NULL, curproc);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error, s;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock(so, s);
				error = m_getuio(&top, atomic, space, uio);
				s = solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so, s);
	m_freem(top);
	m_freem(control);
	return (error);
}

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbufs together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
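 *
 * For example, one record of a datagram socket's receive buffer may be
 * laid out as follows (a sketch, not a dump of live data):
 *
 *	MT_SONAME -> MT_CONTROL -> MT_DATA -> MT_DATA	(linked via m_next)
 *	    |
 *	m_nextpkt points to the next record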
 * In order to avoid blocking the network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, s, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		s = solock(so);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		sounlock(so, s);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so, s);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock(so, s);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
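	 *
	 * (Concretely: nextrecord is read from m->m_nextpkt before the
	 * solock() is released around uiomove(), and sbsync() pushes it
	 * back into the sockbuf once the lock has been reacquired.)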
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sounlock(so, s);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					s = solock(so);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock(so, s);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			s = solock(so);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock(so, s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
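			 * (The invariant restored here: sb_mb == NULL
			 * implies sb_mbtail == sb_lastrecord == NULL.)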
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock(so, s);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	const struct protosw *pr = so->so_proto;
	int s, error = 0;

	s = solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
		    curproc);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so, s);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct sosplice *sp;
	struct taskq *tq;
	int error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
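		/*
		 * (This is the path userland reaches by setting SO_SPLICE
		 * with a drain fd of -1, e.g.
		 *	int off = -1;
		 *	setsockopt(s, SOL_SOCKET, SO_SPLICE, &off, sizeof(off));
		 * see the SO_SPLICE case in sosetopt() below.)
		 */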
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
frele:
	/*
	 * FRELE() must not be called with the socket lock held.  It is safe
	 * to release the lock here as long as no other operation happens on
	 * the socket when sosplice() returns.  The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so, SL_LOCKED);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wake up a socket that is about to be freed. */
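	/*
	 * (sofree() passes SOSP_FREEING_READ/SOSP_FREEING_WRITE here for
	 * the side that is being torn down.)
	 */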
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so, s);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so, s);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has finished, 1 means continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int state;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
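	/*
	 * (A sketch of the arithmetic below: with sb_datacc == 5000,
	 * so_splicemax == 4096 and so_splicelen == 1000, len is clamped
	 * to 3096 and maxreached is set.)
	 */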
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    NULL, NULL, NULL);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with a looped broadcast/multicast packet we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
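	/*
	 * (Sketch: with len == 3000 and two 2048-byte data mbufs, the first
	 * mbuf is unlinked whole; the remaining 952 bytes are duplicated
	 * via m_copym() only when the partial-move condition below allows,
	 * otherwise len shrinks to 2048 and the loop stops early.)
	 */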
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* The receive buffer has shrunk by len bytes, adjust the oob mark. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a send buffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			if (error)
				return (error);
			return (0);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo_nsecs :
			    so->so_rcv.sb_timeo_nsecs);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					break;
				}
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;
	int s;

	s = solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		sounlock(so, s);
		return (EINVAL);
	}

	klist_insert_locked(&sb->sb_sel.si_note, kn);
	sounlock(so, s);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
}

int
filt_soread_common(struct knote *kn, struct socket *so)
{
	int rv = 0;

	soassertlocked(so);

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
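		/*
		 * (poll(2) compatibility: a fully disconnected socket also
		 * reports hangup below.)
		 */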
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_soread_common(kn, so));
}

int
filt_soreadmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_soread_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_soreadprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_soread_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_sel.si_note, kn);
}

int
filt_sowrite_common(struct knote *kn, struct socket *so)
{
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_sowrite_common(kn, so));
}

int
filt_sowritemodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_sowrite_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_sowriteprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_sowrite_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}

int
filt_soexcept_common(struct knote *kn, struct socket *so)
{
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

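	/*
	 * Independently of the out-of-band state above, a poll(2)-style
	 * caller must still be told about a hangup once the socket has
	 * fully disconnected.
	 */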
	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_soexcept_common(kn, so));
}

int
filt_soexceptmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_soexcept_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_soexceptprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_soexcept_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}

int
filt_solisten_common(struct knote *kn, struct socket *so)
{
	soassertlocked(so);

	kn->kn_data = so->so_qlen;

	return (kn->kn_data != 0);
}

int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_solisten_common(kn, so));
}

int
filt_solistenmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_solisten_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_solistenprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_solisten_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}

void
klist_soassertlk(void *arg)
{
	struct socket *so = arg;

	soassertlocked(so);
}

int
klist_solock(void *arg)
{
	struct socket *so = arg;

	return (solock(so));
}

void
klist_sounlock(void *arg, int ls)
{
	struct socket *so = arg;

	sounlock(so, ls);
}

const struct klistops socket_klistops = {
	.klo_assertlk = klist_soassertlk,
	.klo_lock = klist_solock,
	.klo_unlock = klist_sounlock,
};

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

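/*
 * Dump the state of a socket, including both of its buffers, for use
 * from the kernel debugger.  `v' is the address of the socket.
 */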
void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
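
/*
 * Illustrative userland sketch (not compiled here): the filterops in
 * this file are reached through kqueue(2).  EVFILT_READ on a listening
 * socket selects solisten_filtops in soo_kqfilter(); on other sockets
 * it selects soread_filtops, and EVFILT_EXCEPT with NOTE_OOB set is
 * answered by filt_soexcept_common().  Assuming `fd' is an already
 * connected socket:
 *
 *	struct kevent kev[2];
 *	int kq = kqueue();
 *
 *	EV_SET(&kev[0], fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	EV_SET(&kev[1], fd, EVFILT_EXCEPT, EV_ADD, NOTE_OOB, 0, NULL);
 *	kevent(kq, kev, 2, NULL, 0, NULL);
 */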