/*	$NetBSD: uipc_socket.c,v 1.66 2002/05/07 08:06:35 enami Exp $	*/

/*-
 * Copyright (c) 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 72 */ 73 74 #include <sys/cdefs.h> 75 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.66 2002/05/07 08:06:35 enami Exp $"); 76 77 #include "opt_sock_counters.h" 78 #include "opt_sosend_loan.h" 79 80 #include <sys/param.h> 81 #include <sys/systm.h> 82 #include <sys/proc.h> 83 #include <sys/file.h> 84 #include <sys/malloc.h> 85 #include <sys/mbuf.h> 86 #include <sys/domain.h> 87 #include <sys/kernel.h> 88 #include <sys/protosw.h> 89 #include <sys/socket.h> 90 #include <sys/socketvar.h> 91 #include <sys/signalvar.h> 92 #include <sys/resourcevar.h> 93 #include <sys/pool.h> 94 95 #include <uvm/uvm.h> 96 97 struct pool socket_pool; 98 99 extern int somaxconn; /* patchable (XXX sysctl) */ 100 int somaxconn = SOMAXCONN; 101 102 #ifdef SOSEND_COUNTERS 103 #include <sys/device.h> 104 105 struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 106 NULL, "sosend", "loan big"); 107 struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 108 NULL, "sosend", "copy big"); 109 struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 110 NULL, "sosend", "copy small"); 111 struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 112 NULL, "sosend", "kva limit"); 113 114 #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ 115 116 #else 117 118 #define SOSEND_COUNTER_INCR(ev) /* nothing */ 119 120 #endif /* SOSEND_COUNTERS */ 121 122 void 123 soinit(void) 124 { 125 126 
pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, 127 "sockpl", NULL); 128 129 #ifdef SOSEND_COUNTERS 130 evcnt_attach_static(&sosend_loan_big); 131 evcnt_attach_static(&sosend_copy_big); 132 evcnt_attach_static(&sosend_copy_small); 133 evcnt_attach_static(&sosend_kvalimit); 134 #endif /* SOSEND_COUNTERS */ 135 } 136 137 #ifdef SOSEND_LOAN 138 int use_sosend_loan = 1; 139 #else 140 int use_sosend_loan = 0; 141 #endif 142 143 struct mbuf *so_pendfree; 144 145 int somaxkva = 16 * 1024 * 1024; 146 int socurkva; 147 int sokvawaiters; 148 149 #define SOCK_LOAN_THRESH 4096 150 #define SOCK_LOAN_CHUNK 65536 151 152 static void 153 sodoloanfree(caddr_t buf, u_int size) 154 { 155 struct vm_page **pgs; 156 vaddr_t va, sva, eva; 157 vsize_t len; 158 paddr_t pa; 159 int i, npgs; 160 161 eva = round_page((vaddr_t) buf + size); 162 sva = trunc_page((vaddr_t) buf); 163 len = eva - sva; 164 npgs = len >> PAGE_SHIFT; 165 166 pgs = alloca(npgs * sizeof(*pgs)); 167 168 for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) { 169 if (pmap_extract(pmap_kernel(), va, &pa) == FALSE) 170 panic("sodoloanfree: va 0x%lx not mapped", va); 171 pgs[i] = PHYS_TO_VM_PAGE(pa); 172 } 173 174 pmap_kremove(sva, len); 175 pmap_update(pmap_kernel()); 176 uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); 177 uvm_km_free(kernel_map, sva, len); 178 socurkva -= len; 179 if (sokvawaiters) 180 wakeup(&socurkva); 181 } 182 183 static size_t 184 sodopendfree(struct socket *so) 185 { 186 struct mbuf *m; 187 size_t rv = 0; 188 int s; 189 190 s = splvm(); 191 192 for (;;) { 193 m = so_pendfree; 194 if (m == NULL) 195 break; 196 so_pendfree = m->m_next; 197 splx(s); 198 199 rv += m->m_ext.ext_size; 200 sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size); 201 s = splvm(); 202 pool_cache_put(&mbpool_cache, m); 203 } 204 205 for (;;) { 206 m = so->so_pendfree; 207 if (m == NULL) 208 break; 209 so->so_pendfree = m->m_next; 210 splx(s); 211 212 rv += m->m_ext.ext_size; 213 sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size); 
214 s = splvm(); 215 pool_cache_put(&mbpool_cache, m); 216 } 217 218 splx(s); 219 return (rv); 220 } 221 222 static void 223 soloanfree(struct mbuf *m, caddr_t buf, u_int size, void *arg) 224 { 225 struct socket *so = arg; 226 int s; 227 228 if (m == NULL) { 229 sodoloanfree(buf, size); 230 return; 231 } 232 233 s = splvm(); 234 m->m_next = so->so_pendfree; 235 so->so_pendfree = m; 236 splx(s); 237 if (sokvawaiters) 238 wakeup(&socurkva); 239 } 240 241 static long 242 sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) 243 { 244 struct iovec *iov = uio->uio_iov; 245 vaddr_t sva, eva; 246 vsize_t len; 247 struct vm_page **pgs; 248 vaddr_t lva, va; 249 int npgs, s, i, error; 250 251 if (uio->uio_segflg != UIO_USERSPACE) 252 return (0); 253 254 if (iov->iov_len < (size_t) space) 255 space = iov->iov_len; 256 if (space > SOCK_LOAN_CHUNK) 257 space = SOCK_LOAN_CHUNK; 258 259 eva = round_page((vaddr_t) iov->iov_base + space); 260 sva = trunc_page((vaddr_t) iov->iov_base); 261 len = eva - sva; 262 npgs = len >> PAGE_SHIFT; 263 264 while (socurkva + len > somaxkva) { 265 if (sodopendfree(so)) 266 continue; 267 SOSEND_COUNTER_INCR(&sosend_kvalimit); 268 s = splvm(); 269 sokvawaiters++; 270 (void) tsleep(&socurkva, PVM, "sokva", 0); 271 sokvawaiters--; 272 splx(s); 273 } 274 275 lva = uvm_km_valloc_wait(kernel_map, len); 276 if (lva == 0) 277 return (0); 278 socurkva += len; 279 280 pgs = alloca(npgs * sizeof(*pgs)); 281 282 error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len, 283 pgs, UVM_LOAN_TOPAGE); 284 if (error) { 285 uvm_km_free(kernel_map, lva, len); 286 socurkva -= len; 287 return (0); 288 } 289 290 for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) 291 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ); 292 pmap_update(pmap_kernel()); 293 294 lva += (vaddr_t) iov->iov_base & PAGE_MASK; 295 296 MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so); 297 298 uio->uio_resid -= space; 299 /* uio_offset not updated, not 
set/used for write(2) */ 300 uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space; 301 uio->uio_iov->iov_len -= space; 302 if (uio->uio_iov->iov_len == 0) { 303 uio->uio_iov++; 304 uio->uio_iovcnt--; 305 } 306 307 return (space); 308 } 309 310 /* 311 * Socket operation routines. 312 * These routines are called by the routines in 313 * sys_socket.c or from a system process, and 314 * implement the semantics of socket operations by 315 * switching out to the protocol specific routines. 316 */ 317 /*ARGSUSED*/ 318 int 319 socreate(int dom, struct socket **aso, int type, int proto) 320 { 321 struct proc *p; 322 struct protosw *prp; 323 struct socket *so; 324 int error, s; 325 326 p = curproc; /* XXX */ 327 if (proto) 328 prp = pffindproto(dom, proto, type); 329 else 330 prp = pffindtype(dom, type); 331 if (prp == 0 || prp->pr_usrreq == 0) 332 return (EPROTONOSUPPORT); 333 if (prp->pr_type != type) 334 return (EPROTOTYPE); 335 s = splsoftnet(); 336 so = pool_get(&socket_pool, PR_WAITOK); 337 memset((caddr_t)so, 0, sizeof(*so)); 338 TAILQ_INIT(&so->so_q0); 339 TAILQ_INIT(&so->so_q); 340 so->so_type = type; 341 so->so_proto = prp; 342 so->so_send = sosend; 343 so->so_receive = soreceive; 344 if (p != 0) 345 so->so_uid = p->p_ucred->cr_uid; 346 error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, 347 (struct mbuf *)(long)proto, (struct mbuf *)0, p); 348 if (error) { 349 so->so_state |= SS_NOFDREF; 350 sofree(so); 351 splx(s); 352 return (error); 353 } 354 splx(s); 355 *aso = so; 356 return (0); 357 } 358 359 int 360 sobind(struct socket *so, struct mbuf *nam, struct proc *p) 361 { 362 int s, error; 363 364 s = splsoftnet(); 365 error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0, 366 nam, (struct mbuf *)0, p); 367 splx(s); 368 return (error); 369 } 370 371 int 372 solisten(struct socket *so, int backlog) 373 { 374 int s, error; 375 376 s = splsoftnet(); 377 error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0, 378 (struct 
mbuf *)0, (struct mbuf *)0, (struct proc *)0); 379 if (error) { 380 splx(s); 381 return (error); 382 } 383 if (TAILQ_EMPTY(&so->so_q)) 384 so->so_options |= SO_ACCEPTCONN; 385 if (backlog < 0) 386 backlog = 0; 387 so->so_qlimit = min(backlog, somaxconn); 388 splx(s); 389 return (0); 390 } 391 392 void 393 sofree(struct socket *so) 394 { 395 struct mbuf *m; 396 397 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) 398 return; 399 if (so->so_head) { 400 /* 401 * We must not decommission a socket that's on the accept(2) 402 * queue. If we do, then accept(2) may hang after select(2) 403 * indicated that the listening socket was ready. 404 */ 405 if (!soqremque(so, 0)) 406 return; 407 } 408 sbrelease(&so->so_snd); 409 sorflush(so); 410 while ((m = so->so_pendfree) != NULL) { 411 so->so_pendfree = m->m_next; 412 m->m_next = so_pendfree; 413 so_pendfree = m; 414 } 415 pool_put(&socket_pool, so); 416 } 417 418 /* 419 * Close a socket on last file table reference removal. 420 * Initiate disconnect if connected. 421 * Free socket when disconnect complete. 
422 */ 423 int 424 soclose(struct socket *so) 425 { 426 struct socket *so2; 427 int s, error; 428 429 error = 0; 430 s = splsoftnet(); /* conservative */ 431 if (so->so_options & SO_ACCEPTCONN) { 432 while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { 433 (void) soqremque(so2, 0); 434 (void) soabort(so2); 435 } 436 while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { 437 (void) soqremque(so2, 1); 438 (void) soabort(so2); 439 } 440 } 441 if (so->so_pcb == 0) 442 goto discard; 443 if (so->so_state & SS_ISCONNECTED) { 444 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 445 error = sodisconnect(so); 446 if (error) 447 goto drop; 448 } 449 if (so->so_options & SO_LINGER) { 450 if ((so->so_state & SS_ISDISCONNECTING) && 451 (so->so_state & SS_NBIO)) 452 goto drop; 453 while (so->so_state & SS_ISCONNECTED) { 454 error = tsleep((caddr_t)&so->so_timeo, 455 PSOCK | PCATCH, netcls, 456 so->so_linger * hz); 457 if (error) 458 break; 459 } 460 } 461 } 462 drop: 463 if (so->so_pcb) { 464 int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, 465 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, 466 (struct proc *)0); 467 if (error == 0) 468 error = error2; 469 } 470 discard: 471 if (so->so_state & SS_NOFDREF) 472 panic("soclose: NOFDREF"); 473 so->so_state |= SS_NOFDREF; 474 sofree(so); 475 splx(s); 476 return (error); 477 } 478 479 /* 480 * Must be called at splsoftnet... 
481 */ 482 int 483 soabort(struct socket *so) 484 { 485 486 return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0, 487 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); 488 } 489 490 int 491 soaccept(struct socket *so, struct mbuf *nam) 492 { 493 int s, error; 494 495 error = 0; 496 s = splsoftnet(); 497 if ((so->so_state & SS_NOFDREF) == 0) 498 panic("soaccept: !NOFDREF"); 499 so->so_state &= ~SS_NOFDREF; 500 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 501 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 502 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, 503 (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0); 504 else 505 error = ECONNABORTED; 506 507 splx(s); 508 return (error); 509 } 510 511 int 512 soconnect(struct socket *so, struct mbuf *nam) 513 { 514 struct proc *p; 515 int s, error; 516 517 p = curproc; /* XXX */ 518 if (so->so_options & SO_ACCEPTCONN) 519 return (EOPNOTSUPP); 520 s = splsoftnet(); 521 /* 522 * If protocol is connection-based, can only connect once. 523 * Otherwise, if connected, try to disconnect first. 524 * This allows user to disconnect by connecting to, e.g., 525 * a null address. 
526 */ 527 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 528 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 529 (error = sodisconnect(so)))) 530 error = EISCONN; 531 else 532 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, 533 (struct mbuf *)0, nam, (struct mbuf *)0, p); 534 splx(s); 535 return (error); 536 } 537 538 int 539 soconnect2(struct socket *so1, struct socket *so2) 540 { 541 int s, error; 542 543 s = splsoftnet(); 544 error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, 545 (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0, 546 (struct proc *)0); 547 splx(s); 548 return (error); 549 } 550 551 int 552 sodisconnect(struct socket *so) 553 { 554 int s, error; 555 556 s = splsoftnet(); 557 if ((so->so_state & SS_ISCONNECTED) == 0) { 558 error = ENOTCONN; 559 goto bad; 560 } 561 if (so->so_state & SS_ISDISCONNECTING) { 562 error = EALREADY; 563 goto bad; 564 } 565 error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, 566 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0, 567 (struct proc *)0); 568 bad: 569 splx(s); 570 sodopendfree(so); 571 return (error); 572 } 573 574 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 575 /* 576 * Send on a socket. 577 * If send must go all at once and message is larger than 578 * send buffering, then hard error. 579 * Lock against other senders. 580 * If must go all at once and not enough room now, then 581 * inform user that this would block and do nothing. 582 * Otherwise, if nonblocking, send as much as possible. 583 * The data to be sent is described by "uio" if nonzero, 584 * otherwise by the mbuf chain "top" (which must be null 585 * if uio is not). Data provided in mbuf chain must be small 586 * enough to send all at once. 587 * 588 * Returns nonzero on error, timeout or signal; callers 589 * must check for short counts if EINTR/ERESTART are returned. 590 * Data and control buffers are freed on return. 
591 */ 592 int 593 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 594 struct mbuf *control, int flags) 595 { 596 struct proc *p; 597 struct mbuf **mp, *m; 598 long space, len, resid, clen, mlen; 599 int error, s, dontroute, atomic; 600 601 sodopendfree(so); 602 603 p = curproc; /* XXX */ 604 clen = 0; 605 atomic = sosendallatonce(so) || top; 606 if (uio) 607 resid = uio->uio_resid; 608 else 609 resid = top->m_pkthdr.len; 610 /* 611 * In theory resid should be unsigned. 612 * However, space must be signed, as it might be less than 0 613 * if we over-committed, and we must use a signed comparison 614 * of space and resid. On the other hand, a negative resid 615 * causes us to loop sending 0-length segments to the protocol. 616 */ 617 if (resid < 0) { 618 error = EINVAL; 619 goto out; 620 } 621 dontroute = 622 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 623 (so->so_proto->pr_flags & PR_ATOMIC); 624 p->p_stats->p_ru.ru_msgsnd++; 625 if (control) 626 clen = control->m_len; 627 #define snderr(errno) { error = errno; splx(s); goto release; } 628 629 restart: 630 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) 631 goto out; 632 do { 633 s = splsoftnet(); 634 if (so->so_state & SS_CANTSENDMORE) 635 snderr(EPIPE); 636 if (so->so_error) { 637 error = so->so_error; 638 so->so_error = 0; 639 splx(s); 640 goto release; 641 } 642 if ((so->so_state & SS_ISCONNECTED) == 0) { 643 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 644 if ((so->so_state & SS_ISCONFIRMING) == 0 && 645 !(resid == 0 && clen != 0)) 646 snderr(ENOTCONN); 647 } else if (addr == 0) 648 snderr(EDESTADDRREQ); 649 } 650 space = sbspace(&so->so_snd); 651 if (flags & MSG_OOB) 652 space += 1024; 653 if ((atomic && resid > so->so_snd.sb_hiwat) || 654 clen > so->so_snd.sb_hiwat) 655 snderr(EMSGSIZE); 656 if (space < resid + clen && uio && 657 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 658 if (so->so_state & SS_NBIO) 659 
snderr(EWOULDBLOCK); 660 sbunlock(&so->so_snd); 661 error = sbwait(&so->so_snd); 662 splx(s); 663 if (error) 664 goto out; 665 goto restart; 666 } 667 splx(s); 668 mp = ⊤ 669 space -= clen; 670 do { 671 if (uio == NULL) { 672 /* 673 * Data is prepackaged in "top". 674 */ 675 resid = 0; 676 if (flags & MSG_EOR) 677 top->m_flags |= M_EOR; 678 } else do { 679 if (top == 0) { 680 MGETHDR(m, M_WAIT, MT_DATA); 681 mlen = MHLEN; 682 m->m_pkthdr.len = 0; 683 m->m_pkthdr.rcvif = (struct ifnet *)0; 684 } else { 685 MGET(m, M_WAIT, MT_DATA); 686 mlen = MLEN; 687 } 688 if (use_sosend_loan && 689 uio->uio_iov->iov_len >= SOCK_LOAN_THRESH && 690 space >= SOCK_LOAN_THRESH && 691 (len = sosend_loan(so, uio, m, 692 space)) != 0) { 693 SOSEND_COUNTER_INCR(&sosend_loan_big); 694 space -= len; 695 goto have_data; 696 } 697 if (resid >= MINCLSIZE && space >= MCLBYTES) { 698 SOSEND_COUNTER_INCR(&sosend_copy_big); 699 MCLGET(m, M_WAIT); 700 if ((m->m_flags & M_EXT) == 0) 701 goto nopages; 702 mlen = MCLBYTES; 703 if (atomic && top == 0) { 704 len = lmin(MCLBYTES - max_hdr, 705 resid); 706 m->m_data += max_hdr; 707 } else 708 len = lmin(MCLBYTES, resid); 709 space -= len; 710 } else { 711 nopages: 712 SOSEND_COUNTER_INCR(&sosend_copy_small); 713 len = lmin(lmin(mlen, resid), space); 714 space -= len; 715 /* 716 * For datagram protocols, leave room 717 * for protocol headers in first mbuf. 
718 */ 719 if (atomic && top == 0 && len < mlen) 720 MH_ALIGN(m, len); 721 } 722 error = uiomove(mtod(m, caddr_t), (int)len, 723 uio); 724 have_data: 725 resid = uio->uio_resid; 726 m->m_len = len; 727 *mp = m; 728 top->m_pkthdr.len += len; 729 if (error) 730 goto release; 731 mp = &m->m_next; 732 if (resid <= 0) { 733 if (flags & MSG_EOR) 734 top->m_flags |= M_EOR; 735 break; 736 } 737 } while (space > 0 && atomic); 738 739 s = splsoftnet(); 740 741 if (so->so_state & SS_CANTSENDMORE) 742 snderr(EPIPE); 743 744 if (dontroute) 745 so->so_options |= SO_DONTROUTE; 746 if (resid > 0) 747 so->so_state |= SS_MORETOCOME; 748 error = (*so->so_proto->pr_usrreq)(so, 749 (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, 750 top, addr, control, p); 751 if (dontroute) 752 so->so_options &= ~SO_DONTROUTE; 753 if (resid > 0) 754 so->so_state &= ~SS_MORETOCOME; 755 splx(s); 756 757 clen = 0; 758 control = 0; 759 top = 0; 760 mp = ⊤ 761 if (error) 762 goto release; 763 } while (resid && space > 0); 764 } while (resid); 765 766 release: 767 sbunlock(&so->so_snd); 768 out: 769 if (top) 770 m_freem(top); 771 if (control) 772 m_freem(control); 773 return (error); 774 } 775 776 /* 777 * Implement receive operations on a socket. 778 * We depend on the way that records are added to the sockbuf 779 * by sbappend*. In particular, each record (mbufs linked through m_next) 780 * must begin with an address if the protocol so specifies, 781 * followed by an optional mbuf or mbufs containing ancillary data, 782 * and then zero or more mbufs of data. 783 * In order to avoid blocking network interrupts for the entire time here, 784 * we splx() while doing the actual copy to user space. 785 * Although the sockbuf is locked, new data may still be appended, 786 * and thus we must maintain consistency of the sockbuf during that time. 787 * 788 * The caller may receive the data as a single mbuf chain by supplying 789 * an mbuf **mp0 for use in returning the chain. 
The uio is then used 790 * only for the count in uio_resid. 791 */ 792 int 793 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 794 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 795 { 796 struct mbuf *m, **mp; 797 int flags, len, error, s, offset, moff, type, orig_resid; 798 struct protosw *pr; 799 struct mbuf *nextrecord; 800 801 pr = so->so_proto; 802 mp = mp0; 803 type = 0; 804 orig_resid = uio->uio_resid; 805 if (paddr) 806 *paddr = 0; 807 if (controlp) 808 *controlp = 0; 809 if (flagsp) 810 flags = *flagsp &~ MSG_EOR; 811 else 812 flags = 0; 813 814 if ((flags & MSG_DONTWAIT) == 0) 815 sodopendfree(so); 816 817 if (flags & MSG_OOB) { 818 m = m_get(M_WAIT, MT_DATA); 819 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, 820 (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0, 821 (struct proc *)0); 822 if (error) 823 goto bad; 824 do { 825 error = uiomove(mtod(m, caddr_t), 826 (int) min(uio->uio_resid, m->m_len), uio); 827 m = m_free(m); 828 } while (uio->uio_resid && error == 0 && m); 829 bad: 830 if (m) 831 m_freem(m); 832 return (error); 833 } 834 if (mp) 835 *mp = (struct mbuf *)0; 836 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 837 (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, 838 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); 839 840 restart: 841 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) 842 return (error); 843 s = splsoftnet(); 844 845 m = so->so_rcv.sb_mb; 846 /* 847 * If we have less data than requested, block awaiting more 848 * (subject to any timeout) if: 849 * 1. the current count is less than the low water mark, 850 * 2. MSG_WAITALL is set, and it is possible to do the entire 851 * receive operation at once if we block (resid <= hiwat), or 852 * 3. MSG_DONTWAIT is not set. 853 * If MSG_WAITALL is set but resid is larger than the receive buffer, 854 * we have to do the receive in sections, and thus risk returning 855 * a short count if a timeout or signal occurs after we start. 
856 */ 857 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && 858 so->so_rcv.sb_cc < uio->uio_resid) && 859 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 860 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 861 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { 862 #ifdef DIAGNOSTIC 863 if (m == 0 && so->so_rcv.sb_cc) 864 panic("receive 1"); 865 #endif 866 if (so->so_error) { 867 if (m) 868 goto dontblock; 869 error = so->so_error; 870 if ((flags & MSG_PEEK) == 0) 871 so->so_error = 0; 872 goto release; 873 } 874 if (so->so_state & SS_CANTRCVMORE) { 875 if (m) 876 goto dontblock; 877 else 878 goto release; 879 } 880 for (; m; m = m->m_next) 881 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 882 m = so->so_rcv.sb_mb; 883 goto dontblock; 884 } 885 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 886 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 887 error = ENOTCONN; 888 goto release; 889 } 890 if (uio->uio_resid == 0) 891 goto release; 892 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { 893 error = EWOULDBLOCK; 894 goto release; 895 } 896 sbunlock(&so->so_rcv); 897 error = sbwait(&so->so_rcv); 898 splx(s); 899 if (error) 900 return (error); 901 goto restart; 902 } 903 dontblock: 904 #ifdef notyet /* XXXX */ 905 if (uio->uio_procp) 906 uio->uio_procp->p_stats->p_ru.ru_msgrcv++; 907 #endif 908 nextrecord = m->m_nextpkt; 909 if (pr->pr_flags & PR_ADDR) { 910 #ifdef DIAGNOSTIC 911 if (m->m_type != MT_SONAME) 912 panic("receive 1a"); 913 #endif 914 orig_resid = 0; 915 if (flags & MSG_PEEK) { 916 if (paddr) 917 *paddr = m_copy(m, 0, m->m_len); 918 m = m->m_next; 919 } else { 920 sbfree(&so->so_rcv, m); 921 if (paddr) { 922 *paddr = m; 923 so->so_rcv.sb_mb = m->m_next; 924 m->m_next = 0; 925 m = so->so_rcv.sb_mb; 926 } else { 927 MFREE(m, so->so_rcv.sb_mb); 928 m = so->so_rcv.sb_mb; 929 } 930 } 931 } 932 while (m && m->m_type == MT_CONTROL && error == 0) { 933 if (flags & MSG_PEEK) { 934 if (controlp) 935 *controlp = 
m_copy(m, 0, m->m_len); 936 m = m->m_next; 937 } else { 938 sbfree(&so->so_rcv, m); 939 if (controlp) { 940 if (pr->pr_domain->dom_externalize && 941 mtod(m, struct cmsghdr *)->cmsg_type == 942 SCM_RIGHTS) 943 error = (*pr->pr_domain->dom_externalize)(m); 944 *controlp = m; 945 so->so_rcv.sb_mb = m->m_next; 946 m->m_next = 0; 947 m = so->so_rcv.sb_mb; 948 } else { 949 MFREE(m, so->so_rcv.sb_mb); 950 m = so->so_rcv.sb_mb; 951 } 952 } 953 if (controlp) { 954 orig_resid = 0; 955 controlp = &(*controlp)->m_next; 956 } 957 } 958 if (m) { 959 if ((flags & MSG_PEEK) == 0) 960 m->m_nextpkt = nextrecord; 961 type = m->m_type; 962 if (type == MT_OOBDATA) 963 flags |= MSG_OOB; 964 } 965 moff = 0; 966 offset = 0; 967 while (m && uio->uio_resid > 0 && error == 0) { 968 if (m->m_type == MT_OOBDATA) { 969 if (type != MT_OOBDATA) 970 break; 971 } else if (type == MT_OOBDATA) 972 break; 973 #ifdef DIAGNOSTIC 974 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 975 panic("receive 3"); 976 #endif 977 so->so_state &= ~SS_RCVATMARK; 978 len = uio->uio_resid; 979 if (so->so_oobmark && len > so->so_oobmark - offset) 980 len = so->so_oobmark - offset; 981 if (len > m->m_len - moff) 982 len = m->m_len - moff; 983 /* 984 * If mp is set, just pass back the mbufs. 985 * Otherwise copy them out via the uio, then free. 986 * Sockbuf must be consistent here (points to current mbuf, 987 * it points to next record) when we drop priority; 988 * we must note any additions to the sockbuf when we 989 * block interrupts again. 
990 */ 991 if (mp == 0) { 992 splx(s); 993 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); 994 s = splsoftnet(); 995 if (error) 996 goto release; 997 } else 998 uio->uio_resid -= len; 999 if (len == m->m_len - moff) { 1000 if (m->m_flags & M_EOR) 1001 flags |= MSG_EOR; 1002 if (flags & MSG_PEEK) { 1003 m = m->m_next; 1004 moff = 0; 1005 } else { 1006 nextrecord = m->m_nextpkt; 1007 sbfree(&so->so_rcv, m); 1008 if (mp) { 1009 *mp = m; 1010 mp = &m->m_next; 1011 so->so_rcv.sb_mb = m = m->m_next; 1012 *mp = (struct mbuf *)0; 1013 } else { 1014 MFREE(m, so->so_rcv.sb_mb); 1015 m = so->so_rcv.sb_mb; 1016 } 1017 if (m) 1018 m->m_nextpkt = nextrecord; 1019 } 1020 } else { 1021 if (flags & MSG_PEEK) 1022 moff += len; 1023 else { 1024 if (mp) 1025 *mp = m_copym(m, 0, len, M_WAIT); 1026 m->m_data += len; 1027 m->m_len -= len; 1028 so->so_rcv.sb_cc -= len; 1029 } 1030 } 1031 if (so->so_oobmark) { 1032 if ((flags & MSG_PEEK) == 0) { 1033 so->so_oobmark -= len; 1034 if (so->so_oobmark == 0) { 1035 so->so_state |= SS_RCVATMARK; 1036 break; 1037 } 1038 } else { 1039 offset += len; 1040 if (offset == so->so_oobmark) 1041 break; 1042 } 1043 } 1044 if (flags & MSG_EOR) 1045 break; 1046 /* 1047 * If the MSG_WAITALL flag is set (for non-atomic socket), 1048 * we must not quit until "uio->uio_resid == 0" or an error 1049 * termination. If a signal/timeout occurs, return 1050 * with a short count but without error. 1051 * Keep sockbuf locked against other readers. 
1052 */ 1053 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && 1054 !sosendallatonce(so) && !nextrecord) { 1055 if (so->so_error || so->so_state & SS_CANTRCVMORE) 1056 break; 1057 error = sbwait(&so->so_rcv); 1058 if (error) { 1059 sbunlock(&so->so_rcv); 1060 splx(s); 1061 return (0); 1062 } 1063 if ((m = so->so_rcv.sb_mb) != NULL) 1064 nextrecord = m->m_nextpkt; 1065 } 1066 } 1067 1068 if (m && pr->pr_flags & PR_ATOMIC) { 1069 flags |= MSG_TRUNC; 1070 if ((flags & MSG_PEEK) == 0) 1071 (void) sbdroprecord(&so->so_rcv); 1072 } 1073 if ((flags & MSG_PEEK) == 0) { 1074 if (m == 0) 1075 so->so_rcv.sb_mb = nextrecord; 1076 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 1077 (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, 1078 (struct mbuf *)(long)flags, (struct mbuf *)0, 1079 (struct proc *)0); 1080 } 1081 if (orig_resid == uio->uio_resid && orig_resid && 1082 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 1083 sbunlock(&so->so_rcv); 1084 splx(s); 1085 goto restart; 1086 } 1087 1088 if (flagsp) 1089 *flagsp |= flags; 1090 release: 1091 sbunlock(&so->so_rcv); 1092 splx(s); 1093 return (error); 1094 } 1095 1096 int 1097 soshutdown(struct socket *so, int how) 1098 { 1099 struct protosw *pr; 1100 1101 pr = so->so_proto; 1102 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1103 return (EINVAL); 1104 1105 if (how == SHUT_RD || how == SHUT_RDWR) 1106 sorflush(so); 1107 if (how == SHUT_WR || how == SHUT_RDWR) 1108 return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0, 1109 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0); 1110 return (0); 1111 } 1112 1113 void 1114 sorflush(struct socket *so) 1115 { 1116 struct sockbuf *sb, asb; 1117 struct protosw *pr; 1118 int s; 1119 1120 sb = &so->so_rcv; 1121 pr = so->so_proto; 1122 sb->sb_flags |= SB_NOINTR; 1123 (void) sblock(sb, M_WAITOK); 1124 s = splnet(); 1125 socantrcvmore(so); 1126 sbunlock(sb); 1127 asb = *sb; 1128 memset((caddr_t)sb, 0, sizeof(*sb)); 1129 splx(s); 1130 if 
(pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1131 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1132 sbrelease(&asb); 1133 } 1134 1135 int 1136 sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) 1137 { 1138 int error; 1139 struct mbuf *m; 1140 1141 error = 0; 1142 m = m0; 1143 if (level != SOL_SOCKET) { 1144 if (so->so_proto && so->so_proto->pr_ctloutput) 1145 return ((*so->so_proto->pr_ctloutput) 1146 (PRCO_SETOPT, so, level, optname, &m0)); 1147 error = ENOPROTOOPT; 1148 } else { 1149 switch (optname) { 1150 1151 case SO_LINGER: 1152 if (m == NULL || m->m_len != sizeof(struct linger)) { 1153 error = EINVAL; 1154 goto bad; 1155 } 1156 so->so_linger = mtod(m, struct linger *)->l_linger; 1157 /* fall thru... */ 1158 1159 case SO_DEBUG: 1160 case SO_KEEPALIVE: 1161 case SO_DONTROUTE: 1162 case SO_USELOOPBACK: 1163 case SO_BROADCAST: 1164 case SO_REUSEADDR: 1165 case SO_REUSEPORT: 1166 case SO_OOBINLINE: 1167 case SO_TIMESTAMP: 1168 if (m == NULL || m->m_len < sizeof(int)) { 1169 error = EINVAL; 1170 goto bad; 1171 } 1172 if (*mtod(m, int *)) 1173 so->so_options |= optname; 1174 else 1175 so->so_options &= ~optname; 1176 break; 1177 1178 case SO_SNDBUF: 1179 case SO_RCVBUF: 1180 case SO_SNDLOWAT: 1181 case SO_RCVLOWAT: 1182 { 1183 int optval; 1184 1185 if (m == NULL || m->m_len < sizeof(int)) { 1186 error = EINVAL; 1187 goto bad; 1188 } 1189 1190 /* 1191 * Values < 1 make no sense for any of these 1192 * options, so disallow them. 1193 */ 1194 optval = *mtod(m, int *); 1195 if (optval < 1) { 1196 error = EINVAL; 1197 goto bad; 1198 } 1199 1200 switch (optname) { 1201 1202 case SO_SNDBUF: 1203 case SO_RCVBUF: 1204 if (sbreserve(optname == SO_SNDBUF ? 1205 &so->so_snd : &so->so_rcv, 1206 (u_long) optval) == 0) { 1207 error = ENOBUFS; 1208 goto bad; 1209 } 1210 break; 1211 1212 /* 1213 * Make sure the low-water is never greater than 1214 * the high-water. 
1215 */ 1216 case SO_SNDLOWAT: 1217 so->so_snd.sb_lowat = 1218 (optval > so->so_snd.sb_hiwat) ? 1219 so->so_snd.sb_hiwat : optval; 1220 break; 1221 case SO_RCVLOWAT: 1222 so->so_rcv.sb_lowat = 1223 (optval > so->so_rcv.sb_hiwat) ? 1224 so->so_rcv.sb_hiwat : optval; 1225 break; 1226 } 1227 break; 1228 } 1229 1230 case SO_SNDTIMEO: 1231 case SO_RCVTIMEO: 1232 { 1233 struct timeval *tv; 1234 short val; 1235 1236 if (m == NULL || m->m_len < sizeof(*tv)) { 1237 error = EINVAL; 1238 goto bad; 1239 } 1240 tv = mtod(m, struct timeval *); 1241 if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { 1242 error = EDOM; 1243 goto bad; 1244 } 1245 val = tv->tv_sec * hz + tv->tv_usec / tick; 1246 1247 switch (optname) { 1248 1249 case SO_SNDTIMEO: 1250 so->so_snd.sb_timeo = val; 1251 break; 1252 case SO_RCVTIMEO: 1253 so->so_rcv.sb_timeo = val; 1254 break; 1255 } 1256 break; 1257 } 1258 1259 default: 1260 error = ENOPROTOOPT; 1261 break; 1262 } 1263 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1264 (void) ((*so->so_proto->pr_ctloutput) 1265 (PRCO_SETOPT, so, level, optname, &m0)); 1266 m = NULL; /* freed by protocol */ 1267 } 1268 } 1269 bad: 1270 if (m) 1271 (void) m_free(m); 1272 return (error); 1273 } 1274 1275 int 1276 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) 1277 { 1278 struct mbuf *m; 1279 1280 if (level != SOL_SOCKET) { 1281 if (so->so_proto && so->so_proto->pr_ctloutput) { 1282 return ((*so->so_proto->pr_ctloutput) 1283 (PRCO_GETOPT, so, level, optname, mp)); 1284 } else 1285 return (ENOPROTOOPT); 1286 } else { 1287 m = m_get(M_WAIT, MT_SOOPTS); 1288 m->m_len = sizeof(int); 1289 1290 switch (optname) { 1291 1292 case SO_LINGER: 1293 m->m_len = sizeof(struct linger); 1294 mtod(m, struct linger *)->l_onoff = 1295 so->so_options & SO_LINGER; 1296 mtod(m, struct linger *)->l_linger = so->so_linger; 1297 break; 1298 1299 case SO_USELOOPBACK: 1300 case SO_DONTROUTE: 1301 case SO_DEBUG: 1302 case SO_KEEPALIVE: 1303 case 
SO_REUSEADDR: 1304 case SO_REUSEPORT: 1305 case SO_BROADCAST: 1306 case SO_OOBINLINE: 1307 case SO_TIMESTAMP: 1308 *mtod(m, int *) = so->so_options & optname; 1309 break; 1310 1311 case SO_TYPE: 1312 *mtod(m, int *) = so->so_type; 1313 break; 1314 1315 case SO_ERROR: 1316 *mtod(m, int *) = so->so_error; 1317 so->so_error = 0; 1318 break; 1319 1320 case SO_SNDBUF: 1321 *mtod(m, int *) = so->so_snd.sb_hiwat; 1322 break; 1323 1324 case SO_RCVBUF: 1325 *mtod(m, int *) = so->so_rcv.sb_hiwat; 1326 break; 1327 1328 case SO_SNDLOWAT: 1329 *mtod(m, int *) = so->so_snd.sb_lowat; 1330 break; 1331 1332 case SO_RCVLOWAT: 1333 *mtod(m, int *) = so->so_rcv.sb_lowat; 1334 break; 1335 1336 case SO_SNDTIMEO: 1337 case SO_RCVTIMEO: 1338 { 1339 int val = (optname == SO_SNDTIMEO ? 1340 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1341 1342 m->m_len = sizeof(struct timeval); 1343 mtod(m, struct timeval *)->tv_sec = val / hz; 1344 mtod(m, struct timeval *)->tv_usec = 1345 (val % hz) * tick; 1346 break; 1347 } 1348 1349 default: 1350 (void)m_free(m); 1351 return (ENOPROTOOPT); 1352 } 1353 *mp = m; 1354 return (0); 1355 } 1356 } 1357 1358 void 1359 sohasoutofband(struct socket *so) 1360 { 1361 struct proc *p; 1362 1363 if (so->so_pgid < 0) 1364 gsignal(-so->so_pgid, SIGURG); 1365 else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) 1366 psignal(p, SIGURG); 1367 selwakeup(&so->so_rcv.sb_sel); 1368 } 1369