1 /* $OpenBSD: uipc_socket.c,v 1.83 2010/07/03 04:44:51 guenther Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/file.h> 39 #include <sys/malloc.h> 40 #include <sys/mbuf.h> 41 #include <sys/domain.h> 42 #include <sys/kernel.h> 43 #include <sys/event.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/unpcb.h> 47 #include <sys/socketvar.h> 48 #include <sys/signalvar.h> 49 #include <sys/resourcevar.h> 50 #include <net/route.h> 51 #include <sys/pool.h> 52 53 void filt_sordetach(struct knote *kn); 54 int filt_soread(struct knote *kn, long hint); 55 void filt_sowdetach(struct knote *kn); 56 int filt_sowrite(struct knote *kn, long hint); 57 int filt_solisten(struct knote *kn, long hint); 58 59 struct filterops solisten_filtops = 60 { 1, NULL, filt_sordetach, filt_solisten }; 61 struct filterops soread_filtops = 62 { 1, NULL, filt_sordetach, filt_soread }; 63 struct filterops sowrite_filtops = 64 { 1, NULL, filt_sowdetach, filt_sowrite }; 65 66 67 #ifndef SOMINCONN 68 #define SOMINCONN 80 69 #endif /* SOMINCONN */ 70 71 int somaxconn = SOMAXCONN; 72 int sominconn = SOMINCONN; 73 74 struct pool socket_pool; 75 76 void 77 soinit(void) 78 { 79 80 pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL); 81 } 82 83 /* 84 * Socket operation routines. 85 * These routines are called by the routines in 86 * sys_socket.c or from a system process, and 87 * implement the semantics of socket operations by 88 * switching out to the protocol specific routines. 89 */ 90 /*ARGSUSED*/ 91 int 92 socreate(int dom, struct socket **aso, int type, int proto) 93 { 94 struct proc *p = curproc; /* XXX */ 95 struct protosw *prp; 96 struct socket *so; 97 int error, s; 98 99 if (proto) 100 prp = pffindproto(dom, proto, type); 101 else 102 prp = pffindtype(dom, type); 103 if (prp == NULL || prp->pr_usrreq == 0) 104 return (EPROTONOSUPPORT); 105 if (prp->pr_type != type) 106 return (EPROTOTYPE); 107 s = splsoftnet(); 108 so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO); 109 TAILQ_INIT(&so->so_q0); 110 TAILQ_INIT(&so->so_q); 111 so->so_type = type; 112 if (suser(p, 0) == 0) 113 so->so_state = SS_PRIV; 114 so->so_ruid = p->p_cred->p_ruid; 115 so->so_euid = p->p_ucred->cr_uid; 116 so->so_rgid = p->p_cred->p_rgid; 117 so->so_egid = p->p_ucred->cr_gid; 118 so->so_cpid = p->p_pid; 119 so->so_proto = prp; 120 error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, 121 (struct mbuf *)(long)proto, NULL, p); 122 if (error) { 123 so->so_state |= SS_NOFDREF; 124 sofree(so); 125 splx(s); 126 return (error); 127 } 128 splx(s); 129 *aso = so; 130 return (0); 131 } 132 133 int 134 sobind(struct socket *so, struct mbuf *nam, struct proc *p) 135 { 136 int s = splsoftnet(); 137 int error; 138 139 error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p); 140 splx(s); 141 return (error); 142 } 143 144 int 145 solisten(struct socket *so, int backlog) 146 { 147 int s = splsoftnet(), error; 148 149 error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL, 150 curproc); 151 if (error) { 152 splx(s); 153 return (error); 154 } 155 if (TAILQ_FIRST(&so->so_q) == NULL) 156 so->so_options |= SO_ACCEPTCONN; 157 if (backlog < 0 || backlog > somaxconn) 158 backlog = somaxconn; 159 if (backlog < sominconn) 160 backlog = sominconn; 161 so->so_qlimit = backlog; 162 splx(s); 163 return (0); 164 } 165 166 /* 167 * Must be called at splsoftnet() 168 */ 169 170 void 171 sofree(struct socket *so) 172 { 173 splsoftassert(IPL_SOFTNET); 174 175 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) 176 return; 177 if (so->so_head) { 178 /* 179 * We must not decommission a socket that's on the accept(2) 180 * queue. If we do, then accept(2) may hang after select(2) 181 * indicated that the listening socket was ready. 182 */ 183 if (!soqremque(so, 0)) 184 return; 185 } 186 sbrelease(&so->so_snd); 187 sorflush(so); 188 pool_put(&socket_pool, so); 189 } 190 191 /* 192 * Close a socket on last file table reference removal. 193 * Initiate disconnect if connected. 194 * Free socket when disconnect complete. 195 */ 196 int 197 soclose(struct socket *so) 198 { 199 struct socket *so2; 200 int s = splsoftnet(); /* conservative */ 201 int error = 0; 202 203 if (so->so_options & SO_ACCEPTCONN) { 204 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { 205 (void) soqremque(so2, 0); 206 (void) soabort(so2); 207 } 208 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { 209 (void) soqremque(so2, 1); 210 (void) soabort(so2); 211 } 212 } 213 if (so->so_pcb == 0) 214 goto discard; 215 if (so->so_state & SS_ISCONNECTED) { 216 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 217 error = sodisconnect(so); 218 if (error) 219 goto drop; 220 } 221 if (so->so_options & SO_LINGER) { 222 if ((so->so_state & SS_ISDISCONNECTING) && 223 (so->so_state & SS_NBIO)) 224 goto drop; 225 while (so->so_state & SS_ISCONNECTED) { 226 error = tsleep(&so->so_timeo, 227 PSOCK | PCATCH, "netcls", 228 so->so_linger * hz); 229 if (error) 230 break; 231 } 232 } 233 } 234 drop: 235 if (so->so_pcb) { 236 int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL, 237 NULL, NULL, curproc); 238 if (error == 0) 239 error = error2; 240 } 241 discard: 242 if (so->so_state & SS_NOFDREF) 243 panic("soclose: NOFDREF"); 244 so->so_state |= SS_NOFDREF; 245 sofree(so); 246 splx(s); 247 return (error); 248 } 249 250 /* 251 * Must be called at splsoftnet. 252 */ 253 int 254 soabort(struct socket *so) 255 { 256 splsoftassert(IPL_SOFTNET); 257 258 return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL, 259 curproc); 260 } 261 262 int 263 soaccept(struct socket *so, struct mbuf *nam) 264 { 265 int s = splsoftnet(); 266 int error = 0; 267 268 if ((so->so_state & SS_NOFDREF) == 0) 269 panic("soaccept: !NOFDREF"); 270 so->so_state &= ~SS_NOFDREF; 271 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 272 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 273 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL, 274 nam, NULL, curproc); 275 else 276 error = ECONNABORTED; 277 splx(s); 278 return (error); 279 } 280 281 int 282 soconnect(struct socket *so, struct mbuf *nam) 283 { 284 int s; 285 int error; 286 287 if (so->so_options & SO_ACCEPTCONN) 288 return (EOPNOTSUPP); 289 s = splsoftnet(); 290 /* 291 * If protocol is connection-based, can only connect once. 292 * Otherwise, if connected, try to disconnect first. 293 * This allows user to disconnect by connecting to, e.g., 294 * a null address. 295 */ 296 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 297 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 298 (error = sodisconnect(so)))) 299 error = EISCONN; 300 else 301 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, 302 NULL, nam, NULL, curproc); 303 splx(s); 304 return (error); 305 } 306 307 int 308 soconnect2(struct socket *so1, struct socket *so2) 309 { 310 int s = splsoftnet(); 311 int error; 312 313 error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, 314 (struct mbuf *)so2, NULL, curproc); 315 splx(s); 316 return (error); 317 } 318 319 int 320 sodisconnect(struct socket *so) 321 { 322 int s = splsoftnet(); 323 int error; 324 325 if ((so->so_state & SS_ISCONNECTED) == 0) { 326 error = ENOTCONN; 327 goto bad; 328 } 329 if (so->so_state & SS_ISDISCONNECTING) { 330 error = EALREADY; 331 goto bad; 332 } 333 error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL, 334 NULL, curproc); 335 bad: 336 splx(s); 337 return (error); 338 } 339 340 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 341 /* 342 * Send on a socket. 343 * If send must go all at once and message is larger than 344 * send buffering, then hard error. 345 * Lock against other senders. 346 * If must go all at once and not enough room now, then 347 * inform user that this would block and do nothing. 348 * Otherwise, if nonblocking, send as much as possible. 349 * The data to be sent is described by "uio" if nonzero, 350 * otherwise by the mbuf chain "top" (which must be null 351 * if uio is not). Data provided in mbuf chain must be small 352 * enough to send all at once. 353 * 354 * Returns nonzero on error, timeout or signal; callers 355 * must check for short counts if EINTR/ERESTART are returned. 356 * Data and control buffers are freed on return. 357 */ 358 int 359 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 360 struct mbuf *control, int flags) 361 { 362 struct mbuf **mp; 363 struct mbuf *m; 364 long space, len, mlen, clen = 0; 365 quad_t resid; 366 int error, s, dontroute; 367 int atomic = sosendallatonce(so) || top; 368 369 if (uio) 370 resid = uio->uio_resid; 371 else 372 resid = top->m_pkthdr.len; 373 /* 374 * In theory resid should be unsigned (since uio->uio_resid is). 375 * However, space must be signed, as it might be less than 0 376 * if we over-committed, and we must use a signed comparison 377 * of space and resid. On the other hand, a negative resid 378 * causes us to loop sending 0-length segments to the protocol. 379 * MSG_EOR on a SOCK_STREAM socket is also invalid. 380 */ 381 if (resid < 0 || 382 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 383 error = EINVAL; 384 goto out; 385 } 386 dontroute = 387 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 388 (so->so_proto->pr_flags & PR_ATOMIC); 389 if (uio && uio->uio_procp) 390 uio->uio_procp->p_stats->p_ru.ru_msgsnd++; 391 if (control) 392 clen = control->m_len; 393 #define snderr(errno) { error = errno; splx(s); goto release; } 394 395 restart: 396 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) 397 goto out; 398 so->so_state |= SS_ISSENDING; 399 do { 400 s = splsoftnet(); 401 if (so->so_state & SS_CANTSENDMORE) 402 snderr(EPIPE); 403 if (so->so_error) { 404 error = so->so_error; 405 so->so_error = 0; 406 splx(s); 407 goto release; 408 } 409 if ((so->so_state & SS_ISCONNECTED) == 0) { 410 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 411 if ((so->so_state & SS_ISCONFIRMING) == 0 && 412 !(resid == 0 && clen != 0)) 413 snderr(ENOTCONN); 414 } else if (addr == 0) 415 snderr(EDESTADDRREQ); 416 } 417 space = sbspace(&so->so_snd); 418 if (flags & MSG_OOB) 419 space += 1024; 420 if ((atomic && resid > so->so_snd.sb_hiwat) || 421 clen > so->so_snd.sb_hiwat) 422 snderr(EMSGSIZE); 423 if (space < resid + clen && 424 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 425 if (so->so_state & SS_NBIO) 426 snderr(EWOULDBLOCK); 427 sbunlock(&so->so_snd); 428 error = sbwait(&so->so_snd); 429 so->so_state &= ~SS_ISSENDING; 430 splx(s); 431 if (error) 432 goto out; 433 goto restart; 434 } 435 splx(s); 436 mp = ⊤ 437 space -= clen; 438 do { 439 if (uio == NULL) { 440 /* 441 * Data is prepackaged in "top". 442 */ 443 resid = 0; 444 if (flags & MSG_EOR) 445 top->m_flags |= M_EOR; 446 } else do { 447 if (top == 0) { 448 MGETHDR(m, M_WAIT, MT_DATA); 449 mlen = MHLEN; 450 m->m_pkthdr.len = 0; 451 m->m_pkthdr.rcvif = (struct ifnet *)0; 452 } else { 453 MGET(m, M_WAIT, MT_DATA); 454 mlen = MLEN; 455 } 456 if (resid >= MINCLSIZE && space >= MCLBYTES) { 457 MCLGET(m, M_NOWAIT); 458 if ((m->m_flags & M_EXT) == 0) 459 goto nopages; 460 mlen = MCLBYTES; 461 if (atomic && top == 0) { 462 len = lmin(MCLBYTES - max_hdr, resid); 463 m->m_data += max_hdr; 464 } else 465 len = lmin(MCLBYTES, resid); 466 space -= len; 467 } else { 468 nopages: 469 len = lmin(lmin(mlen, resid), space); 470 space -= len; 471 /* 472 * For datagram protocols, leave room 473 * for protocol headers in first mbuf. 474 */ 475 if (atomic && top == 0 && len < mlen) 476 MH_ALIGN(m, len); 477 } 478 error = uiomove(mtod(m, caddr_t), (int)len, 479 uio); 480 resid = uio->uio_resid; 481 m->m_len = len; 482 *mp = m; 483 top->m_pkthdr.len += len; 484 if (error) 485 goto release; 486 mp = &m->m_next; 487 if (resid <= 0) { 488 if (flags & MSG_EOR) 489 top->m_flags |= M_EOR; 490 break; 491 } 492 } while (space > 0 && atomic); 493 if (dontroute) 494 so->so_options |= SO_DONTROUTE; 495 s = splsoftnet(); /* XXX */ 496 if (resid <= 0) 497 so->so_state &= ~SS_ISSENDING; 498 error = (*so->so_proto->pr_usrreq)(so, 499 (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, 500 top, addr, control, curproc); 501 splx(s); 502 if (dontroute) 503 so->so_options &= ~SO_DONTROUTE; 504 clen = 0; 505 control = 0; 506 top = 0; 507 mp = ⊤ 508 if (error) 509 goto release; 510 } while (resid && space > 0); 511 } while (resid); 512 513 release: 514 so->so_state &= ~SS_ISSENDING; 515 sbunlock(&so->so_snd); 516 out: 517 if (top) 518 m_freem(top); 519 if (control) 520 m_freem(control); 521 return (error); 522 } 523 524 /* 525 * Implement receive operations on a socket. 526 * We depend on the way that records are added to the sockbuf 527 * by sbappend*. In particular, each record (mbufs linked through m_next) 528 * must begin with an address if the protocol so specifies, 529 * followed by an optional mbuf or mbufs containing ancillary data, 530 * and then zero or more mbufs of data. 531 * In order to avoid blocking network interrupts for the entire time here, 532 * we splx() while doing the actual copy to user space. 533 * Although the sockbuf is locked, new data may still be appended, 534 * and thus we must maintain consistency of the sockbuf during that time. 535 * 536 * The caller may receive the data as a single mbuf chain by supplying 537 * an mbuf **mp0 for use in returning the chain. The uio is then used 538 * only for the count in uio_resid. 539 */ 540 int 541 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 542 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 543 socklen_t controllen) 544 { 545 struct mbuf *m, **mp; 546 int flags, len, error, s, offset; 547 struct protosw *pr = so->so_proto; 548 struct mbuf *nextrecord; 549 int moff, type = 0; 550 size_t orig_resid = uio->uio_resid; 551 int uio_error = 0; 552 int resid; 553 554 mp = mp0; 555 if (paddr) 556 *paddr = 0; 557 if (controlp) 558 *controlp = 0; 559 if (flagsp) 560 flags = *flagsp &~ MSG_EOR; 561 else 562 flags = 0; 563 if (so->so_state & SS_NBIO) 564 flags |= MSG_DONTWAIT; 565 if (flags & MSG_OOB) { 566 m = m_get(M_WAIT, MT_DATA); 567 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, 568 (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc); 569 if (error) 570 goto bad; 571 do { 572 error = uiomove(mtod(m, caddr_t), 573 (int) min(uio->uio_resid, m->m_len), uio); 574 m = m_free(m); 575 } while (uio->uio_resid && error == 0 && m); 576 bad: 577 if (m) 578 m_freem(m); 579 return (error); 580 } 581 if (mp) 582 *mp = NULL; 583 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 584 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, curproc); 585 586 restart: 587 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) 588 return (error); 589 s = splsoftnet(); 590 591 m = so->so_rcv.sb_mb; 592 /* 593 * If we have less data than requested, block awaiting more 594 * (subject to any timeout) if: 595 * 1. the current count is less than the low water mark, 596 * 2. MSG_WAITALL is set, and it is possible to do the entire 597 * receive operation at once if we block (resid <= hiwat), or 598 * 3. MSG_DONTWAIT is not set. 599 * If MSG_WAITALL is set but resid is larger than the receive buffer, 600 * we have to do the receive in sections, and thus risk returning 601 * a short count if a timeout or signal occurs after we start. 602 */ 603 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 604 so->so_rcv.sb_cc < uio->uio_resid) && 605 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 606 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 607 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 608 #ifdef DIAGNOSTIC 609 if (m == NULL && so->so_rcv.sb_cc) 610 panic("receive 1"); 611 #endif 612 if (so->so_error) { 613 if (m) 614 goto dontblock; 615 error = so->so_error; 616 if ((flags & MSG_PEEK) == 0) 617 so->so_error = 0; 618 goto release; 619 } 620 if (so->so_state & SS_CANTRCVMORE) { 621 if (m) 622 goto dontblock; 623 else 624 goto release; 625 } 626 for (; m; m = m->m_next) 627 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 628 m = so->so_rcv.sb_mb; 629 goto dontblock; 630 } 631 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 632 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 633 error = ENOTCONN; 634 goto release; 635 } 636 if (uio->uio_resid == 0 && controlp == NULL) 637 goto release; 638 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { 639 error = EWOULDBLOCK; 640 goto release; 641 } 642 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 643 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 644 sbunlock(&so->so_rcv); 645 error = sbwait(&so->so_rcv); 646 splx(s); 647 if (error) 648 return (error); 649 goto restart; 650 } 651 dontblock: 652 /* 653 * On entry here, m points to the first record of the socket buffer. 654 * While we process the initial mbufs containing address and control 655 * info, we save a copy of m->m_nextpkt into nextrecord. 656 */ 657 if (uio->uio_procp) 658 uio->uio_procp->p_stats->p_ru.ru_msgrcv++; 659 KASSERT(m == so->so_rcv.sb_mb); 660 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 661 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 662 nextrecord = m->m_nextpkt; 663 if (pr->pr_flags & PR_ADDR) { 664 #ifdef DIAGNOSTIC 665 if (m->m_type != MT_SONAME) 666 panic("receive 1a"); 667 #endif 668 orig_resid = 0; 669 if (flags & MSG_PEEK) { 670 if (paddr) 671 *paddr = m_copy(m, 0, m->m_len); 672 m = m->m_next; 673 } else { 674 sbfree(&so->so_rcv, m); 675 if (paddr) { 676 *paddr = m; 677 so->so_rcv.sb_mb = m->m_next; 678 m->m_next = 0; 679 m = so->so_rcv.sb_mb; 680 } else { 681 MFREE(m, so->so_rcv.sb_mb); 682 m = so->so_rcv.sb_mb; 683 } 684 } 685 } 686 while (m && m->m_type == MT_CONTROL && error == 0) { 687 if (flags & MSG_PEEK) { 688 if (controlp) 689 *controlp = m_copy(m, 0, m->m_len); 690 m = m->m_next; 691 } else { 692 sbfree(&so->so_rcv, m); 693 if (controlp) { 694 if (pr->pr_domain->dom_externalize && 695 mtod(m, struct cmsghdr *)->cmsg_type == 696 SCM_RIGHTS) 697 error = (*pr->pr_domain->dom_externalize)(m, 698 controllen); 699 *controlp = m; 700 so->so_rcv.sb_mb = m->m_next; 701 m->m_next = 0; 702 m = so->so_rcv.sb_mb; 703 } else { 704 /* 705 * Dispose of any SCM_RIGHTS message that went 706 * through the read path rather than recv. 707 */ 708 if (pr->pr_domain->dom_dispose && 709 mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) 710 pr->pr_domain->dom_dispose(m); 711 MFREE(m, so->so_rcv.sb_mb); 712 m = so->so_rcv.sb_mb; 713 } 714 } 715 if (controlp) { 716 orig_resid = 0; 717 controlp = &(*controlp)->m_next; 718 } 719 } 720 721 /* 722 * If m is non-NULL, we have some data to read. From now on, 723 * make sure to keep sb_lastrecord consistent when working on 724 * the last packet on the chain (nextrecord == NULL) and we 725 * change m->m_nextpkt. 726 */ 727 if (m) { 728 if ((flags & MSG_PEEK) == 0) { 729 m->m_nextpkt = nextrecord; 730 /* 731 * If nextrecord == NULL (this is a single chain), 732 * then sb_lastrecord may not be valid here if m 733 * was changed earlier. 734 */ 735 if (nextrecord == NULL) { 736 KASSERT(so->so_rcv.sb_mb == m); 737 so->so_rcv.sb_lastrecord = m; 738 } 739 } 740 type = m->m_type; 741 if (type == MT_OOBDATA) 742 flags |= MSG_OOB; 743 if (m->m_flags & M_BCAST) 744 flags |= MSG_BCAST; 745 if (m->m_flags & M_MCAST) 746 flags |= MSG_MCAST; 747 } else { 748 if ((flags & MSG_PEEK) == 0) { 749 KASSERT(so->so_rcv.sb_mb == m); 750 so->so_rcv.sb_mb = nextrecord; 751 SB_EMPTY_FIXUP(&so->so_rcv); 752 } 753 } 754 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 755 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 756 757 moff = 0; 758 offset = 0; 759 while (m && uio->uio_resid > 0 && error == 0) { 760 if (m->m_type == MT_OOBDATA) { 761 if (type != MT_OOBDATA) 762 break; 763 } else if (type == MT_OOBDATA) 764 break; 765 #ifdef DIAGNOSTIC 766 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 767 panic("receive 3"); 768 #endif 769 so->so_state &= ~SS_RCVATMARK; 770 len = uio->uio_resid; 771 if (so->so_oobmark && len > so->so_oobmark - offset) 772 len = so->so_oobmark - offset; 773 if (len > m->m_len - moff) 774 len = m->m_len - moff; 775 /* 776 * If mp is set, just pass back the mbufs. 777 * Otherwise copy them out via the uio, then free. 778 * Sockbuf must be consistent here (points to current mbuf, 779 * it points to next record) when we drop priority; 780 * we must note any additions to the sockbuf when we 781 * block interrupts again. 782 */ 783 if (mp == NULL && uio_error == 0) { 784 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 785 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 786 resid = uio->uio_resid; 787 splx(s); 788 uio_error = 789 uiomove(mtod(m, caddr_t) + moff, (int)len, 790 uio); 791 s = splsoftnet(); 792 if (uio_error) 793 uio->uio_resid = resid - len; 794 } else 795 uio->uio_resid -= len; 796 if (len == m->m_len - moff) { 797 if (m->m_flags & M_EOR) 798 flags |= MSG_EOR; 799 if (flags & MSG_PEEK) { 800 m = m->m_next; 801 moff = 0; 802 } else { 803 nextrecord = m->m_nextpkt; 804 sbfree(&so->so_rcv, m); 805 if (mp) { 806 *mp = m; 807 mp = &m->m_next; 808 so->so_rcv.sb_mb = m = m->m_next; 809 *mp = NULL; 810 } else { 811 MFREE(m, so->so_rcv.sb_mb); 812 m = so->so_rcv.sb_mb; 813 } 814 /* 815 * If m != NULL, we also know that 816 * so->so_rcv.sb_mb != NULL. 817 */ 818 KASSERT(so->so_rcv.sb_mb == m); 819 if (m) { 820 m->m_nextpkt = nextrecord; 821 if (nextrecord == NULL) 822 so->so_rcv.sb_lastrecord = m; 823 } else { 824 so->so_rcv.sb_mb = nextrecord; 825 SB_EMPTY_FIXUP(&so->so_rcv); 826 } 827 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 828 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 829 } 830 } else { 831 if (flags & MSG_PEEK) 832 moff += len; 833 else { 834 if (mp) 835 *mp = m_copym(m, 0, len, M_WAIT); 836 m->m_data += len; 837 m->m_len -= len; 838 so->so_rcv.sb_cc -= len; 839 so->so_rcv.sb_datacc -= len; 840 } 841 } 842 if (so->so_oobmark) { 843 if ((flags & MSG_PEEK) == 0) { 844 so->so_oobmark -= len; 845 if (so->so_oobmark == 0) { 846 so->so_state |= SS_RCVATMARK; 847 break; 848 } 849 } else { 850 offset += len; 851 if (offset == so->so_oobmark) 852 break; 853 } 854 } 855 if (flags & MSG_EOR) 856 break; 857 /* 858 * If the MSG_WAITALL flag is set (for non-atomic socket), 859 * we must not quit until "uio->uio_resid == 0" or an error 860 * termination. If a signal/timeout occurs, return 861 * with a short count but without error. 862 * Keep sockbuf locked against other readers. 863 */ 864 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 865 !sosendallatonce(so) && !nextrecord) { 866 if (so->so_error || so->so_state & SS_CANTRCVMORE) 867 break; 868 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 869 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 870 error = sbwait(&so->so_rcv); 871 if (error) { 872 sbunlock(&so->so_rcv); 873 splx(s); 874 return (0); 875 } 876 if ((m = so->so_rcv.sb_mb) != NULL) 877 nextrecord = m->m_nextpkt; 878 } 879 } 880 881 if (m && pr->pr_flags & PR_ATOMIC) { 882 flags |= MSG_TRUNC; 883 if ((flags & MSG_PEEK) == 0) 884 (void) sbdroprecord(&so->so_rcv); 885 } 886 if ((flags & MSG_PEEK) == 0) { 887 if (m == NULL) { 888 /* 889 * First part is an inline SB_EMPTY_FIXUP(). Second 890 * part makes sure sb_lastrecord is up-to-date if 891 * there is still data in the socket buffer. 892 */ 893 so->so_rcv.sb_mb = nextrecord; 894 if (so->so_rcv.sb_mb == NULL) { 895 so->so_rcv.sb_mbtail = NULL; 896 so->so_rcv.sb_lastrecord = NULL; 897 } else if (nextrecord->m_nextpkt == NULL) 898 so->so_rcv.sb_lastrecord = nextrecord; 899 } 900 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 901 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 902 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 903 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, 904 (struct mbuf *)(long)flags, NULL, curproc); 905 } 906 if (orig_resid == uio->uio_resid && orig_resid && 907 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 908 sbunlock(&so->so_rcv); 909 splx(s); 910 goto restart; 911 } 912 913 if (uio_error) 914 error = uio_error; 915 916 if (flagsp) 917 *flagsp |= flags; 918 release: 919 sbunlock(&so->so_rcv); 920 splx(s); 921 return (error); 922 } 923 924 int 925 soshutdown(struct socket *so, int how) 926 { 927 struct protosw *pr = so->so_proto; 928 929 switch (how) { 930 case SHUT_RD: 931 case SHUT_RDWR: 932 sorflush(so); 933 if (how == SHUT_RD) 934 return (0); 935 /* FALLTHROUGH */ 936 case SHUT_WR: 937 return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL, 938 curproc); 939 default: 940 return (EINVAL); 941 } 942 } 943 944 void 945 sorflush(struct socket *so) 946 { 947 struct sockbuf *sb = &so->so_rcv; 948 struct protosw *pr = so->so_proto; 949 int s; 950 struct sockbuf asb; 951 952 sb->sb_flags |= SB_NOINTR; 953 (void) sblock(sb, M_WAITOK); 954 s = splnet(); 955 socantrcvmore(so); 956 sbunlock(sb); 957 asb = *sb; 958 bzero(sb, sizeof (*sb)); 959 /* XXX - the bzero stumps all over so_rcv */ 960 if (asb.sb_flags & SB_KNOTE) { 961 sb->sb_sel.si_note = asb.sb_sel.si_note; 962 sb->sb_flags = SB_KNOTE; 963 } 964 splx(s); 965 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 966 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 967 sbrelease(&asb); 968 } 969 970 int 971 sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) 972 { 973 int error = 0; 974 struct mbuf *m = m0; 975 976 if (level != SOL_SOCKET) { 977 if (so->so_proto && so->so_proto->pr_ctloutput) 978 return ((*so->so_proto->pr_ctloutput) 979 (PRCO_SETOPT, so, level, optname, &m0)); 980 error = ENOPROTOOPT; 981 } else { 982 switch (optname) { 983 case SO_BINDANY: 984 if ((error = suser(curproc, 0)) != 0) /* XXX */ 985 goto bad; 986 break; 987 } 988 989 switch (optname) { 990 991 case SO_LINGER: 992 if (m == NULL || m->m_len != sizeof (struct linger) || 993 mtod(m, struct linger *)->l_linger < 0 || 994 mtod(m, struct linger *)->l_linger > SHRT_MAX) { 995 error = EINVAL; 996 goto bad; 997 } 998 so->so_linger = mtod(m, struct linger *)->l_linger; 999 /* FALLTHROUGH */ 1000 1001 case SO_BINDANY: 1002 case SO_DEBUG: 1003 case SO_KEEPALIVE: 1004 case SO_DONTROUTE: 1005 case SO_USELOOPBACK: 1006 case SO_BROADCAST: 1007 case SO_REUSEADDR: 1008 case SO_REUSEPORT: 1009 case SO_OOBINLINE: 1010 case SO_JUMBO: 1011 case SO_TIMESTAMP: 1012 if (m == NULL || m->m_len < sizeof (int)) { 1013 error = EINVAL; 1014 goto bad; 1015 } 1016 if (*mtod(m, int *)) 1017 so->so_options |= optname; 1018 else 1019 so->so_options &= ~optname; 1020 break; 1021 1022 case SO_SNDBUF: 1023 case SO_RCVBUF: 1024 case SO_SNDLOWAT: 1025 case SO_RCVLOWAT: 1026 { 1027 u_long cnt; 1028 1029 if (m == NULL || m->m_len < sizeof (int)) { 1030 error = EINVAL; 1031 goto bad; 1032 } 1033 cnt = *mtod(m, int *); 1034 if ((long)cnt <= 0) 1035 cnt = 1; 1036 switch (optname) { 1037 1038 case SO_SNDBUF: 1039 if (sbcheckreserve(cnt, so->so_snd.sb_hiwat) || 1040 sbreserve(&so->so_snd, cnt)) { 1041 error = ENOBUFS; 1042 goto bad; 1043 } 1044 break; 1045 1046 case SO_RCVBUF: 1047 if (sbcheckreserve(cnt, so->so_rcv.sb_hiwat) || 1048 sbreserve(&so->so_rcv, cnt)) { 1049 error = ENOBUFS; 1050 goto bad; 1051 } 1052 break; 1053 1054 case SO_SNDLOWAT: 1055 so->so_snd.sb_lowat = (cnt > so->so_snd.sb_hiwat) ? 1056 so->so_snd.sb_hiwat : cnt; 1057 break; 1058 case SO_RCVLOWAT: 1059 so->so_rcv.sb_lowat = (cnt > so->so_rcv.sb_hiwat) ? 1060 so->so_rcv.sb_hiwat : cnt; 1061 break; 1062 } 1063 break; 1064 } 1065 1066 case SO_SNDTIMEO: 1067 case SO_RCVTIMEO: 1068 { 1069 struct timeval *tv; 1070 u_short val; 1071 1072 if (m == NULL || m->m_len < sizeof (*tv)) { 1073 error = EINVAL; 1074 goto bad; 1075 } 1076 tv = mtod(m, struct timeval *); 1077 if (tv->tv_sec > (USHRT_MAX - tv->tv_usec / tick) / hz) { 1078 error = EDOM; 1079 goto bad; 1080 } 1081 val = tv->tv_sec * hz + tv->tv_usec / tick; 1082 if (val == 0 && tv->tv_usec != 0) 1083 val = 1; 1084 1085 switch (optname) { 1086 1087 case SO_SNDTIMEO: 1088 so->so_snd.sb_timeo = val; 1089 break; 1090 case SO_RCVTIMEO: 1091 so->so_rcv.sb_timeo = val; 1092 break; 1093 } 1094 break; 1095 } 1096 1097 default: 1098 error = ENOPROTOOPT; 1099 break; 1100 } 1101 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1102 (void) ((*so->so_proto->pr_ctloutput) 1103 (PRCO_SETOPT, so, level, optname, &m0)); 1104 m = NULL; /* freed by protocol */ 1105 } 1106 } 1107 bad: 1108 if (m) 1109 (void) m_free(m); 1110 return (error); 1111 } 1112 1113 int 1114 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) 1115 { 1116 struct mbuf *m; 1117 1118 if (level != SOL_SOCKET) { 1119 if (so->so_proto && so->so_proto->pr_ctloutput) { 1120 return ((*so->so_proto->pr_ctloutput) 1121 (PRCO_GETOPT, so, level, optname, mp)); 1122 } else 1123 return (ENOPROTOOPT); 1124 } else { 1125 m = m_get(M_WAIT, MT_SOOPTS); 1126 m->m_len = sizeof (int); 1127 1128 switch (optname) { 1129 1130 case SO_LINGER: 1131 m->m_len = sizeof (struct linger); 1132 mtod(m, struct linger *)->l_onoff = 1133 so->so_options & SO_LINGER; 1134 mtod(m, struct linger *)->l_linger = so->so_linger; 1135 break; 1136 1137 case SO_BINDANY: 1138 case SO_USELOOPBACK: 1139 case SO_DONTROUTE: 1140 case SO_DEBUG: 1141 case SO_KEEPALIVE: 1142 case SO_REUSEADDR: 1143 case SO_REUSEPORT: 1144 case SO_BROADCAST: 1145 case SO_OOBINLINE: 1146 case SO_JUMBO: 1147 case SO_TIMESTAMP: 1148 *mtod(m, int *) = so->so_options & optname; 1149 break; 1150 1151 case SO_TYPE: 1152 *mtod(m, int *) = so->so_type; 1153 break; 1154 1155 case SO_ERROR: 1156 *mtod(m, int *) = so->so_error; 1157 so->so_error = 0; 1158 break; 1159 1160 case SO_SNDBUF: 1161 *mtod(m, int *) = so->so_snd.sb_hiwat; 1162 break; 1163 1164 case SO_RCVBUF: 1165 *mtod(m, int *) = so->so_rcv.sb_hiwat; 1166 break; 1167 1168 case SO_SNDLOWAT: 1169 *mtod(m, int *) = so->so_snd.sb_lowat; 1170 break; 1171 1172 case SO_RCVLOWAT: 1173 *mtod(m, int *) = so->so_rcv.sb_lowat; 1174 break; 1175 1176 case SO_SNDTIMEO: 1177 case SO_RCVTIMEO: 1178 { 1179 int val = (optname == SO_SNDTIMEO ? 1180 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1181 1182 m->m_len = sizeof(struct timeval); 1183 mtod(m, struct timeval *)->tv_sec = val / hz; 1184 mtod(m, struct timeval *)->tv_usec = 1185 (val % hz) * tick; 1186 break; 1187 } 1188 1189 case SO_PEERCRED: 1190 if (so->so_proto->pr_protocol == AF_UNIX) { 1191 struct unpcb *unp = sotounpcb(so); 1192 1193 if (unp->unp_flags & UNP_FEIDS) { 1194 *mp = m = m_get(M_WAIT, MT_SOOPTS); 1195 m->m_len = sizeof(unp->unp_connid); 1196 bcopy((caddr_t)(&(unp->unp_connid)), 1197 mtod(m, caddr_t), 1198 (unsigned)m->m_len); 1199 } else 1200 return (ENOTCONN); 1201 } else 1202 return (EOPNOTSUPP); 1203 break; 1204 1205 default: 1206 (void)m_free(m); 1207 return (ENOPROTOOPT); 1208 } 1209 *mp = m; 1210 return (0); 1211 } 1212 } 1213 1214 void 1215 sohasoutofband(struct socket *so) 1216 { 1217 csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid); 1218 selwakeup(&so->so_rcv.sb_sel); 1219 } 1220 1221 int 1222 soo_kqfilter(struct file *fp, struct knote *kn) 1223 { 1224 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1225 struct sockbuf *sb; 1226 int s; 1227 1228 switch (kn->kn_filter) { 1229 case EVFILT_READ: 1230 if (so->so_options & SO_ACCEPTCONN) 1231 kn->kn_fop = &solisten_filtops; 1232 else 1233 kn->kn_fop = &soread_filtops; 1234 sb = &so->so_rcv; 1235 break; 1236 case EVFILT_WRITE: 1237 kn->kn_fop = &sowrite_filtops; 1238 sb = &so->so_snd; 1239 break; 1240 default: 1241 return (1); 1242 } 1243 1244 s = splnet(); 1245 SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); 1246 sb->sb_flags |= SB_KNOTE; 1247 splx(s); 1248 return (0); 1249 } 1250 1251 void 1252 filt_sordetach(struct knote *kn) 1253 { 1254 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1255 int s = splnet(); 1256 1257 SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); 1258 if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) 1259 so->so_rcv.sb_flags &= ~SB_KNOTE; 1260 splx(s); 1261 } 1262 1263 /*ARGSUSED*/ 1264 int 1265 filt_soread(struct knote *kn, long hint) 1266 { 1267 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1268 1269 kn->kn_data = so->so_rcv.sb_cc; 1270 if (so->so_state & SS_CANTRCVMORE) { 1271 kn->kn_flags |= EV_EOF; 1272 kn->kn_fflags = so->so_error; 1273 return (1); 1274 } 1275 if (so->so_error) /* temporary udp error */ 1276 return (1); 1277 if (kn->kn_sfflags & NOTE_LOWAT) 1278 return (kn->kn_data >= kn->kn_sdata); 1279 return (kn->kn_data >= so->so_rcv.sb_lowat); 1280 } 1281 1282 void 1283 filt_sowdetach(struct knote *kn) 1284 { 1285 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1286 int s = splnet(); 1287 1288 SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); 1289 if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) 1290 so->so_snd.sb_flags &= ~SB_KNOTE; 1291 splx(s); 1292 } 1293 1294 /*ARGSUSED*/ 1295 int 1296 filt_sowrite(struct knote *kn, long hint) 1297 { 1298 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1299 1300 kn->kn_data = sbspace(&so->so_snd); 1301 if (so->so_state & SS_CANTSENDMORE) { 1302 kn->kn_flags |= EV_EOF; 1303 kn->kn_fflags = so->so_error; 1304 return (1); 1305 } 1306 if (so->so_error) /* temporary udp error */ 1307 return (1); 1308 if (((so->so_state & SS_ISCONNECTED) == 0) && 1309 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1310 return (0); 1311 if (kn->kn_sfflags & NOTE_LOWAT) 1312 return (kn->kn_data >= kn->kn_sdata); 1313 return (kn->kn_data >= so->so_snd.sb_lowat); 1314 } 1315 1316 /*ARGSUSED*/ 1317 int 1318 filt_solisten(struct knote *kn, long hint) 1319 { 1320 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1321 1322 kn->kn_data = so->so_qlen; 1323 return (so->so_qlen != 0); 1324 } 1325