1 /* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 37 * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $ 38 * $DragonFly: src/sys/kern/uipc_syscalls.c,v 1.59 2005/12/01 18:40:56 dillon Exp $ 39 */ 40 41 #include "opt_ktrace.h" 42 #include "opt_sctp.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/kernel.h> 47 #include <sys/sysproto.h> 48 #include <sys/malloc.h> 49 #include <sys/filedesc.h> 50 #include <sys/event.h> 51 #include <sys/proc.h> 52 #include <sys/fcntl.h> 53 #include <sys/file.h> 54 #include <sys/filio.h> 55 #include <sys/kern_syscall.h> 56 #include <sys/mbuf.h> 57 #include <sys/protosw.h> 58 #include <sys/sfbuf.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/socketops.h> 62 #include <sys/uio.h> 63 #include <sys/vnode.h> 64 #include <sys/lock.h> 65 #include <sys/mount.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_object.h> 71 #include <vm/vm_page.h> 72 #include <vm/vm_pageout.h> 73 #include <vm/vm_kern.h> 74 #include <vm/vm_extern.h> 75 #include <sys/file2.h> 76 #include <sys/signalvar.h> 77 #include <sys/serialize.h> 78 79 #include <sys/thread2.h> 80 #include <sys/msgport2.h> 81 82 #ifdef SCTP 83 #include <netinet/sctp_peeloff.h> 84 #endif /* SCTP */ 85 86 struct sfbuf_mref { 87 struct sf_buf *sf; 88 int mref_count; 89 struct lwkt_serialize serializer; 90 }; 91 92 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile sfbuf ref structures"); 93 94 /* 95 * System call interface to the socket abstraction. 96 */ 97 98 extern struct fileops socketops; 99 100 /* 101 * socket_args(int domain, int type, int protocol) 102 */ 103 int 104 kern_socket(int domain, int type, int protocol, int *res) 105 { 106 struct thread *td = curthread; 107 struct proc *p = td->td_proc; 108 struct filedesc *fdp; 109 struct socket *so; 110 struct file *fp; 111 int fd, error; 112 113 KKASSERT(p); 114 fdp = p->p_fd; 115 116 error = falloc(p, &fp, &fd); 117 if (error) 118 return (error); 119 error = socreate(domain, &so, type, protocol, td); 120 if (error) { 121 if (fdp->fd_files[fd].fp == fp) { 122 funsetfd(fdp, fd); 123 fdrop(fp, td); 124 } 125 } else { 126 fp->f_type = DTYPE_SOCKET; 127 fp->f_flag = FREAD | FWRITE; 128 fp->f_ops = &socketops; 129 fp->f_data = so; 130 *res = fd; 131 } 132 fdrop(fp, td); 133 return (error); 134 } 135 136 int 137 socket(struct socket_args *uap) 138 { 139 int error; 140 141 error = kern_socket(uap->domain, uap->type, uap->protocol, 142 &uap->sysmsg_result); 143 144 return (error); 145 } 146 147 int 148 kern_bind(int s, struct sockaddr *sa) 149 { 150 struct thread *td = curthread; 151 struct proc *p = td->td_proc; 152 struct file *fp; 153 int error; 154 155 KKASSERT(p); 156 error = holdsock(p->p_fd, s, &fp); 157 if (error) 158 return (error); 159 error = sobind((struct socket *)fp->f_data, sa, td); 160 fdrop(fp, td); 161 return (error); 162 } 163 164 /* 165 * bind_args(int s, caddr_t name, int namelen) 166 */ 167 int 168 bind(struct bind_args *uap) 169 { 170 struct sockaddr *sa; 171 int error; 172 173 error = getsockaddr(&sa, uap->name, uap->namelen); 174 if (error) 175 return (error); 176 error = kern_bind(uap->s, sa); 177 FREE(sa, M_SONAME); 178 179 return (error); 180 } 181 182 int 183 kern_listen(int s, int backlog) 184 { 185 struct thread *td = curthread; 186 struct proc *p = td->td_proc; 187 struct file *fp; 188 int error; 189 190 KKASSERT(p); 191 error = holdsock(p->p_fd, s, &fp); 192 if (error) 193 return (error); 194 error = solisten((struct socket *)fp->f_data, backlog, td); 195 fdrop(fp, td); 196 return(error); 197 } 198 199 /* 200 * listen_args(int s, int backlog) 201 */ 202 int 203 listen(struct listen_args *uap) 204 { 205 int error; 206 207 error = kern_listen(uap->s, uap->backlog); 208 return (error); 209 } 210 211 /* 212 * Returns the accepted socket as well. 213 */ 214 static boolean_t 215 soaccept_predicate(struct netmsg *msg0) 216 { 217 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 218 struct socket *head = msg->nm_so; 219 220 if (head->so_error != 0) { 221 msg->nm_lmsg.ms_error = head->so_error; 222 return (TRUE); 223 } 224 if (!TAILQ_EMPTY(&head->so_comp)) { 225 /* Abuse nm_so field as copy in/copy out parameter. XXX JH */ 226 msg->nm_so = TAILQ_FIRST(&head->so_comp); 227 TAILQ_REMOVE(&head->so_comp, msg->nm_so, so_list); 228 head->so_qlen--; 229 230 msg->nm_lmsg.ms_error = 0; 231 return (TRUE); 232 } 233 if (head->so_state & SS_CANTRCVMORE) { 234 msg->nm_lmsg.ms_error = ECONNABORTED; 235 return (TRUE); 236 } 237 if (head->so_state & SS_NBIO) { 238 msg->nm_lmsg.ms_error = EWOULDBLOCK; 239 return (TRUE); 240 } 241 242 return (FALSE); 243 } 244 245 /* 246 * The second argument to kern_accept() is a handle to a struct sockaddr. 247 * This allows kern_accept() to return a pointer to an allocated struct 248 * sockaddr which must be freed later with FREE(). The caller must 249 * initialize *name to NULL. 250 */ 251 int 252 kern_accept(int s, struct sockaddr **name, int *namelen, int *res) 253 { 254 struct thread *td = curthread; 255 struct proc *p = td->td_proc; 256 struct filedesc *fdp = p->p_fd; 257 struct file *lfp = NULL; 258 struct file *nfp = NULL; 259 struct sockaddr *sa; 260 struct socket *head, *so; 261 struct netmsg_so_notify msg; 262 lwkt_port_t port; 263 int fd; 264 u_int fflag; /* type must match fp->f_flag */ 265 int error, tmp; 266 267 if (name && namelen && *namelen < 0) 268 return (EINVAL); 269 270 error = holdsock(fdp, s, &lfp); 271 if (error) 272 return (error); 273 274 error = falloc(p, &nfp, &fd); 275 if (error) { /* Probably ran out of file descriptors. */ 276 *res = -1; 277 fdrop(lfp, td); 278 return (error); 279 } 280 *res = fd; 281 282 head = (struct socket *)lfp->f_data; 283 if ((head->so_options & SO_ACCEPTCONN) == 0) { 284 error = EINVAL; 285 goto done; 286 } 287 288 /* optimize for uniprocessor case later XXX JH */ 289 port = head->so_proto->pr_mport(head, NULL, PRU_PRED); 290 lwkt_initmsg(&msg.nm_lmsg, &curthread->td_msgport, 291 MSGF_PCATCH | MSGF_ABORTABLE, 292 lwkt_cmd_func(netmsg_so_notify), 293 lwkt_cmd_func(netmsg_so_notify_abort)); 294 msg.nm_predicate = soaccept_predicate; 295 msg.nm_so = head; 296 msg.nm_etype = NM_REVENT; 297 error = lwkt_domsg(port, &msg.nm_lmsg); 298 if (error) 299 goto done; 300 301 /* 302 * At this point we have the connection that's ready to be accepted. 303 */ 304 so = msg.nm_so; 305 306 fflag = lfp->f_flag; 307 308 /* connection has been removed from the listen queue */ 309 KNOTE(&head->so_rcv.sb_sel.si_note, 0); 310 311 so->so_state &= ~SS_COMP; 312 so->so_head = NULL; 313 if (head->so_sigio != NULL) 314 fsetown(fgetown(head->so_sigio), &so->so_sigio); 315 316 nfp->f_type = DTYPE_SOCKET; 317 nfp->f_flag = fflag; 318 nfp->f_ops = &socketops; 319 nfp->f_data = so; 320 /* Sync socket nonblocking/async state with file flags */ 321 tmp = fflag & FNONBLOCK; 322 (void) fo_ioctl(nfp, FIONBIO, (caddr_t)&tmp, td); 323 tmp = fflag & FASYNC; 324 (void) fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, td); 325 326 sa = NULL; 327 error = soaccept(so, &sa); 328 329 /* 330 * Set the returned name and namelen as applicable. Set the returned 331 * namelen to 0 for older code which might ignore the return value 332 * from accept. 333 */ 334 if (error == 0) { 335 if (sa && name && namelen) { 336 if (*namelen > sa->sa_len) 337 *namelen = sa->sa_len; 338 *name = sa; 339 } else { 340 if (sa) 341 FREE(sa, M_SONAME); 342 } 343 } 344 345 done: 346 /* 347 * close the new descriptor, assuming someone hasn't ripped it 348 * out from under us. Note that *res is normally ignored if an 349 * error is returned but a syscall message will still have access 350 * to the result code. 351 */ 352 if (error) { 353 *res = -1; 354 if (fdp->fd_files[fd].fp == nfp) { 355 funsetfd(fdp, fd); 356 fdrop(nfp, td); 357 } 358 } 359 360 /* 361 * Release explicitly held references before returning. 362 */ 363 if (nfp) 364 fdrop(nfp, td); 365 fdrop(lfp, td); 366 return (error); 367 } 368 369 /* 370 * accept_args(int s, caddr_t name, int *anamelen) 371 */ 372 int 373 accept(struct accept_args *uap) 374 { 375 struct sockaddr *sa = NULL; 376 int sa_len; 377 int error; 378 379 if (uap->name) { 380 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 381 if (error) 382 return (error); 383 384 error = kern_accept(uap->s, &sa, &sa_len, &uap->sysmsg_result); 385 386 if (error == 0) 387 error = copyout(sa, uap->name, sa_len); 388 if (error == 0) { 389 error = copyout(&sa_len, uap->anamelen, 390 sizeof(*uap->anamelen)); 391 } 392 if (sa) 393 FREE(sa, M_SONAME); 394 } else { 395 error = kern_accept(uap->s, NULL, 0, &uap->sysmsg_result); 396 } 397 return (error); 398 } 399 400 /* 401 * Returns TRUE if predicate satisfied. 402 */ 403 static boolean_t 404 soconnected_predicate(struct netmsg *msg0) 405 { 406 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 407 struct socket *so = msg->nm_so; 408 409 /* check predicate */ 410 if (!(so->so_state & SS_ISCONNECTING) || so->so_error != 0) { 411 msg->nm_lmsg.ms_error = so->so_error; 412 return (TRUE); 413 } 414 415 return (FALSE); 416 } 417 418 int 419 kern_connect(int s, struct sockaddr *sa) 420 { 421 struct thread *td = curthread; 422 struct proc *p = td->td_proc; 423 struct file *fp; 424 struct socket *so; 425 int error; 426 427 error = holdsock(p->p_fd, s, &fp); 428 if (error) 429 return (error); 430 so = (struct socket *)fp->f_data; 431 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 432 error = EALREADY; 433 goto done; 434 } 435 error = soconnect(so, sa, td); 436 if (error) 437 goto bad; 438 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 439 error = EINPROGRESS; 440 goto done; 441 } 442 if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 443 struct netmsg_so_notify msg; 444 lwkt_port_t port; 445 446 port = so->so_proto->pr_mport(so, sa, PRU_PRED); 447 lwkt_initmsg(&msg.nm_lmsg, 448 &curthread->td_msgport, 449 MSGF_PCATCH | MSGF_ABORTABLE, 450 lwkt_cmd_func(netmsg_so_notify), 451 lwkt_cmd_func(netmsg_so_notify_abort)); 452 msg.nm_predicate = soconnected_predicate; 453 msg.nm_so = so; 454 msg.nm_etype = NM_REVENT; 455 error = lwkt_domsg(port, &msg.nm_lmsg); 456 } 457 if (error == 0) { 458 error = so->so_error; 459 so->so_error = 0; 460 } 461 bad: 462 so->so_state &= ~SS_ISCONNECTING; 463 if (error == ERESTART) 464 error = EINTR; 465 done: 466 fdrop(fp, td); 467 return (error); 468 } 469 470 /* 471 * connect_args(int s, caddr_t name, int namelen) 472 */ 473 int 474 connect(struct connect_args *uap) 475 { 476 struct sockaddr *sa; 477 int error; 478 479 error = getsockaddr(&sa, uap->name, uap->namelen); 480 if (error) 481 return (error); 482 error = kern_connect(uap->s, sa); 483 FREE(sa, M_SONAME); 484 485 return (error); 486 } 487 488 int 489 kern_socketpair(int domain, int type, int protocol, int *sv) 490 { 491 struct thread *td = curthread; 492 struct proc *p = td->td_proc; 493 struct filedesc *fdp; 494 struct file *fp1, *fp2; 495 struct socket *so1, *so2; 496 int fd, error; 497 498 KKASSERT(p); 499 fdp = p->p_fd; 500 error = socreate(domain, &so1, type, protocol, td); 501 if (error) 502 return (error); 503 error = socreate(domain, &so2, type, protocol, td); 504 if (error) 505 goto free1; 506 error = falloc(p, &fp1, &fd); 507 if (error) 508 goto free2; 509 sv[0] = fd; 510 fp1->f_data = so1; 511 error = falloc(p, &fp2, &fd); 512 if (error) 513 goto free3; 514 fp2->f_data = so2; 515 sv[1] = fd; 516 error = soconnect2(so1, so2); 517 if (error) 518 goto free4; 519 if (type == SOCK_DGRAM) { 520 /* 521 * Datagram socket connection is asymmetric. 522 */ 523 error = soconnect2(so2, so1); 524 if (error) 525 goto free4; 526 } 527 fp1->f_type = fp2->f_type = DTYPE_SOCKET; 528 fp1->f_flag = fp2->f_flag = FREAD|FWRITE; 529 fp1->f_ops = fp2->f_ops = &socketops; 530 fdrop(fp1, td); 531 fdrop(fp2, td); 532 return (error); 533 free4: 534 if (fdp->fd_files[sv[1]].fp == fp2) { 535 funsetfd(fdp, sv[1]); 536 fdrop(fp2, td); 537 } 538 fdrop(fp2, td); 539 free3: 540 if (fdp->fd_files[sv[0]].fp == fp1) { 541 funsetfd(fdp, sv[0]); 542 fdrop(fp1, td); 543 } 544 fdrop(fp1, td); 545 free2: 546 (void)soclose(so2); 547 free1: 548 (void)soclose(so1); 549 return (error); 550 } 551 552 /* 553 * socketpair(int domain, int type, int protocol, int *rsv) 554 */ 555 int 556 socketpair(struct socketpair_args *uap) 557 { 558 int error, sockv[2]; 559 560 error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv); 561 562 if (error == 0) 563 error = copyout(sockv, uap->rsv, sizeof(sockv)); 564 return (error); 565 } 566 567 int 568 kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio, 569 struct mbuf *control, int flags, int *res) 570 { 571 struct thread *td = curthread; 572 struct proc *p = td->td_proc; 573 struct file *fp; 574 int len, error; 575 struct socket *so; 576 #ifdef KTRACE 577 struct iovec *ktriov = NULL; 578 struct uio ktruio; 579 #endif 580 581 error = holdsock(p->p_fd, s, &fp); 582 if (error) 583 return (error); 584 if (auio->uio_resid < 0) { 585 error = EINVAL; 586 goto done; 587 } 588 #ifdef KTRACE 589 if (KTRPOINT(td, KTR_GENIO)) { 590 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 591 592 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 593 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 594 ktruio = *auio; 595 } 596 #endif 597 len = auio->uio_resid; 598 so = (struct socket *)fp->f_data; 599 error = so_pru_sosend(so, sa, auio, NULL, control, flags, td); 600 if (error) { 601 if (auio->uio_resid != len && (error == ERESTART || 602 error == EINTR || error == EWOULDBLOCK)) 603 error = 0; 604 if (error == EPIPE) 605 psignal(p, SIGPIPE); 606 } 607 #ifdef KTRACE 608 if (ktriov != NULL) { 609 if (error == 0) { 610 ktruio.uio_iov = ktriov; 611 ktruio.uio_resid = len - auio->uio_resid; 612 ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error); 613 } 614 FREE(ktriov, M_TEMP); 615 } 616 #endif 617 if (error == 0) 618 *res = len - auio->uio_resid; 619 done: 620 fdrop(fp, td); 621 return (error); 622 } 623 624 /* 625 * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen) 626 */ 627 int 628 sendto(struct sendto_args *uap) 629 { 630 struct thread *td = curthread; 631 struct uio auio; 632 struct iovec aiov; 633 struct sockaddr *sa = NULL; 634 int error; 635 636 if (uap->to) { 637 error = getsockaddr(&sa, uap->to, uap->tolen); 638 if (error) 639 return (error); 640 } 641 aiov.iov_base = uap->buf; 642 aiov.iov_len = uap->len; 643 auio.uio_iov = &aiov; 644 auio.uio_iovcnt = 1; 645 auio.uio_offset = 0; 646 auio.uio_resid = uap->len; 647 auio.uio_segflg = UIO_USERSPACE; 648 auio.uio_rw = UIO_WRITE; 649 auio.uio_td = td; 650 651 error = kern_sendmsg(uap->s, sa, &auio, NULL, uap->flags, 652 &uap->sysmsg_result); 653 654 if (sa) 655 FREE(sa, M_SONAME); 656 return (error); 657 } 658 659 /* 660 * sendmsg_args(int s, caddr_t msg, int flags) 661 */ 662 int 663 sendmsg(struct sendmsg_args *uap) 664 { 665 struct thread *td = curthread; 666 struct msghdr msg; 667 struct uio auio; 668 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 669 struct sockaddr *sa = NULL; 670 struct mbuf *control = NULL; 671 int error; 672 673 error = copyin(uap->msg, (caddr_t)&msg, sizeof(msg)); 674 if (error) 675 return (error); 676 677 /* 678 * Conditionally copyin msg.msg_name. 679 */ 680 if (msg.msg_name) { 681 error = getsockaddr(&sa, msg.msg_name, msg.msg_namelen); 682 if (error) 683 return (error); 684 } 685 686 /* 687 * Populate auio. 688 */ 689 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 690 &auio.uio_resid); 691 if (error) 692 goto cleanup; 693 auio.uio_iov = iov; 694 auio.uio_iovcnt = msg.msg_iovlen; 695 auio.uio_offset = 0; 696 auio.uio_segflg = UIO_USERSPACE; 697 auio.uio_rw = UIO_WRITE; 698 auio.uio_td = td; 699 700 /* 701 * Conditionally copyin msg.msg_control. 702 */ 703 if (msg.msg_control) { 704 if (msg.msg_controllen < sizeof(struct cmsghdr) || 705 msg.msg_controllen > MLEN) { 706 error = EINVAL; 707 goto cleanup; 708 } 709 control = m_get(MB_WAIT, MT_CONTROL); 710 if (control == NULL) { 711 error = ENOBUFS; 712 goto cleanup; 713 } 714 control->m_len = msg.msg_controllen; 715 error = copyin(msg.msg_control, mtod(control, caddr_t), 716 msg.msg_controllen); 717 if (error) { 718 m_free(control); 719 goto cleanup; 720 } 721 } 722 723 error = kern_sendmsg(uap->s, sa, &auio, control, uap->flags, 724 &uap->sysmsg_result); 725 726 cleanup: 727 if (sa) 728 FREE(sa, M_SONAME); 729 iovec_free(&iov, aiov); 730 return (error); 731 } 732 733 /* 734 * kern_recvmsg() takes a handle to sa and control. If the handle is non- 735 * null, it returns a dynamically allocated struct sockaddr and an mbuf. 736 * Don't forget to FREE() and m_free() these if they are returned. 737 */ 738 int 739 kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, 740 struct mbuf **control, int *flags, int *res) 741 { 742 struct thread *td = curthread; 743 struct proc *p = td->td_proc; 744 struct file *fp; 745 int len, error; 746 struct socket *so; 747 #ifdef KTRACE 748 struct iovec *ktriov = NULL; 749 struct uio ktruio; 750 #endif 751 752 error = holdsock(p->p_fd, s, &fp); 753 if (error) 754 return (error); 755 if (auio->uio_resid < 0) { 756 error = EINVAL; 757 goto done; 758 } 759 #ifdef KTRACE 760 if (KTRPOINT(td, KTR_GENIO)) { 761 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 762 763 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 764 bcopy(auio->uio_iov, ktriov, iovlen); 765 ktruio = *auio; 766 } 767 #endif 768 len = auio->uio_resid; 769 so = (struct socket *)fp->f_data; 770 error = so_pru_soreceive(so, sa, auio, NULL, control, flags); 771 if (error) { 772 if (auio->uio_resid != len && (error == ERESTART || 773 error == EINTR || error == EWOULDBLOCK)) 774 error = 0; 775 } 776 #ifdef KTRACE 777 if (ktriov != NULL) { 778 if (error == 0) { 779 ktruio.uio_iov = ktriov; 780 ktruio.uio_resid = len - auio->uio_resid; 781 ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error); 782 } 783 FREE(ktriov, M_TEMP); 784 } 785 #endif 786 if (error == 0) 787 *res = len - auio->uio_resid; 788 done: 789 fdrop(fp, td); 790 return (error); 791 } 792 793 /* 794 * recvfrom_args(int s, caddr_t buf, size_t len, int flags, 795 * caddr_t from, int *fromlenaddr) 796 */ 797 int 798 recvfrom(struct recvfrom_args *uap) 799 { 800 struct thread *td = curthread; 801 struct uio auio; 802 struct iovec aiov; 803 struct sockaddr *sa = NULL; 804 int error, fromlen; 805 806 if (uap->from && uap->fromlenaddr) { 807 error = copyin(uap->fromlenaddr, &fromlen, sizeof(fromlen)); 808 if (error) 809 return (error); 810 if (fromlen < 0) 811 return (EINVAL); 812 } else { 813 fromlen = 0; 814 } 815 aiov.iov_base = uap->buf; 816 aiov.iov_len = uap->len; 817 auio.uio_iov = &aiov; 818 auio.uio_iovcnt = 1; 819 auio.uio_offset = 0; 820 auio.uio_resid = uap->len; 821 auio.uio_segflg = UIO_USERSPACE; 822 auio.uio_rw = UIO_READ; 823 auio.uio_td = td; 824 825 error = kern_recvmsg(uap->s, uap->from ? &sa : NULL, &auio, NULL, 826 &uap->flags, &uap->sysmsg_result); 827 828 if (error == 0 && uap->from) { 829 /* note: sa may still be NULL */ 830 if (sa) { 831 fromlen = MIN(fromlen, sa->sa_len); 832 error = copyout(sa, uap->from, fromlen); 833 } else { 834 fromlen = 0; 835 } 836 if (error == 0) { 837 error = copyout(&fromlen, uap->fromlenaddr, 838 sizeof(fromlen)); 839 } 840 } 841 if (sa) 842 FREE(sa, M_SONAME); 843 844 return (error); 845 } 846 847 /* 848 * recvmsg_args(int s, struct msghdr *msg, int flags) 849 */ 850 int 851 recvmsg(struct recvmsg_args *uap) 852 { 853 struct thread *td = curthread; 854 struct msghdr msg; 855 struct uio auio; 856 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 857 struct mbuf *m, *control = NULL; 858 struct sockaddr *sa = NULL; 859 caddr_t ctlbuf; 860 socklen_t *ufromlenp, *ucontrollenp; 861 int error, fromlen, controllen, len, flags, *uflagsp; 862 863 /* 864 * This copyin handles everything except the iovec. 865 */ 866 error = copyin(uap->msg, &msg, sizeof(msg)); 867 if (error) 868 return (error); 869 870 if (msg.msg_name && msg.msg_namelen < 0) 871 return (EINVAL); 872 if (msg.msg_control && msg.msg_controllen < 0) 873 return (EINVAL); 874 875 ufromlenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 876 msg_namelen)); 877 ucontrollenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 878 msg_controllen)); 879 uflagsp = (int *)((caddr_t)uap->msg + offsetof(struct msghdr, 880 msg_flags)); 881 882 /* 883 * Populate auio. 884 */ 885 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 886 &auio.uio_resid); 887 if (error) 888 return (error); 889 auio.uio_iov = iov; 890 auio.uio_iovcnt = msg.msg_iovlen; 891 auio.uio_offset = 0; 892 auio.uio_segflg = UIO_USERSPACE; 893 auio.uio_rw = UIO_READ; 894 auio.uio_td = td; 895 896 flags = uap->flags; 897 898 error = kern_recvmsg(uap->s, msg.msg_name ? &sa : NULL, &auio, 899 msg.msg_control ? &control : NULL, &flags, &uap->sysmsg_result); 900 901 /* 902 * Conditionally copyout the name and populate the namelen field. 903 */ 904 if (error == 0 && msg.msg_name) { 905 fromlen = MIN(msg.msg_namelen, sa->sa_len); 906 error = copyout(sa, msg.msg_name, fromlen); 907 if (error == 0) 908 error = copyout(&fromlen, ufromlenp, 909 sizeof(*ufromlenp)); 910 } 911 912 /* 913 * Copyout msg.msg_control and msg.msg_controllen. 914 */ 915 if (error == 0 && msg.msg_control) { 916 len = msg.msg_controllen; 917 m = control; 918 ctlbuf = (caddr_t)msg.msg_control; 919 920 while(m && len > 0) { 921 unsigned int tocopy; 922 923 if (len >= m->m_len) { 924 tocopy = m->m_len; 925 } else { 926 msg.msg_flags |= MSG_CTRUNC; 927 tocopy = len; 928 } 929 930 error = copyout(mtod(m, caddr_t), ctlbuf, tocopy); 931 if (error) 932 goto cleanup; 933 934 ctlbuf += tocopy; 935 len -= tocopy; 936 m = m->m_next; 937 } 938 controllen = ctlbuf - (caddr_t)msg.msg_control; 939 error = copyout(&controllen, ucontrollenp, 940 sizeof(*ucontrollenp)); 941 } 942 943 if (error == 0) 944 error = copyout(&flags, uflagsp, sizeof(*uflagsp)); 945 946 cleanup: 947 if (sa) 948 FREE(sa, M_SONAME); 949 iovec_free(&iov, aiov); 950 if (control) 951 m_freem(control); 952 return (error); 953 } 954 955 /* 956 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 957 * in kernel pointer instead of a userland pointer. This allows us 958 * to manipulate socket options in the emulation code. 959 */ 960 int 961 kern_setsockopt(int s, struct sockopt *sopt) 962 { 963 struct thread *td = curthread; 964 struct proc *p = td->td_proc; 965 struct file *fp; 966 int error; 967 968 if (sopt->sopt_val == 0 && sopt->sopt_valsize != 0) 969 return (EFAULT); 970 if (sopt->sopt_valsize < 0) 971 return (EINVAL); 972 973 error = holdsock(p->p_fd, s, &fp); 974 if (error) 975 return (error); 976 977 error = sosetopt((struct socket *)fp->f_data, sopt); 978 fdrop(fp, td); 979 return (error); 980 } 981 982 /* 983 * setsockopt_args(int s, int level, int name, caddr_t val, int valsize) 984 */ 985 int 986 setsockopt(struct setsockopt_args *uap) 987 { 988 struct thread *td = curthread; 989 struct sockopt sopt; 990 int error; 991 992 sopt.sopt_level = uap->level; 993 sopt.sopt_name = uap->name; 994 sopt.sopt_val = uap->val; 995 sopt.sopt_valsize = uap->valsize; 996 sopt.sopt_td = td; 997 998 error = kern_setsockopt(uap->s, &sopt); 999 return(error); 1000 } 1001 1002 /* 1003 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1004 * in kernel pointer instead of a userland pointer. This allows us 1005 * to manipulate socket options in the emulation code. 1006 */ 1007 int 1008 kern_getsockopt(int s, struct sockopt *sopt) 1009 { 1010 struct thread *td = curthread; 1011 struct proc *p = td->td_proc; 1012 struct file *fp; 1013 int error; 1014 1015 if (sopt->sopt_val == 0 && sopt->sopt_valsize != 0) 1016 return (EFAULT); 1017 if (sopt->sopt_valsize < 0) 1018 return (EINVAL); 1019 1020 error = holdsock(p->p_fd, s, &fp); 1021 if (error) 1022 return (error); 1023 1024 error = sogetopt((struct socket *)fp->f_data, sopt); 1025 fdrop(fp, td); 1026 return (error); 1027 } 1028 1029 /* 1030 * getsockopt_Args(int s, int level, int name, caddr_t val, int *avalsize) 1031 */ 1032 int 1033 getsockopt(struct getsockopt_args *uap) 1034 { 1035 struct thread *td = curthread; 1036 struct sockopt sopt; 1037 int error, valsize; 1038 1039 if (uap->val) { 1040 error = copyin(uap->avalsize, &valsize, sizeof(valsize)); 1041 if (error) 1042 return (error); 1043 if (valsize < 0) 1044 return (EINVAL); 1045 } else { 1046 valsize = 0; 1047 } 1048 1049 sopt.sopt_level = uap->level; 1050 sopt.sopt_name = uap->name; 1051 sopt.sopt_val = uap->val; 1052 sopt.sopt_valsize = valsize; 1053 sopt.sopt_td = td; 1054 1055 error = kern_getsockopt(uap->s, &sopt); 1056 if (error == 0) { 1057 valsize = sopt.sopt_valsize; 1058 error = copyout(&valsize, uap->avalsize, sizeof(valsize)); 1059 } 1060 return (error); 1061 } 1062 1063 /* 1064 * The second argument to kern_getsockname() is a handle to a struct sockaddr. 1065 * This allows kern_getsockname() to return a pointer to an allocated struct 1066 * sockaddr which must be freed later with FREE(). The caller must 1067 * initialize *name to NULL. 1068 */ 1069 int 1070 kern_getsockname(int s, struct sockaddr **name, int *namelen) 1071 { 1072 struct thread *td = curthread; 1073 struct proc *p = td->td_proc; 1074 struct file *fp; 1075 struct socket *so; 1076 struct sockaddr *sa = NULL; 1077 int error; 1078 1079 error = holdsock(p->p_fd, s, &fp); 1080 if (error) 1081 return (error); 1082 if (*namelen < 0) { 1083 fdrop(fp, td); 1084 return (EINVAL); 1085 } 1086 so = (struct socket *)fp->f_data; 1087 error = so_pru_sockaddr(so, &sa); 1088 if (error == 0) { 1089 if (sa == 0) { 1090 *namelen = 0; 1091 } else { 1092 *namelen = MIN(*namelen, sa->sa_len); 1093 *name = sa; 1094 } 1095 } 1096 1097 fdrop(fp, td); 1098 return (error); 1099 } 1100 1101 /* 1102 * getsockname_args(int fdes, caddr_t asa, int *alen) 1103 * 1104 * Get socket name. 1105 */ 1106 int 1107 getsockname(struct getsockname_args *uap) 1108 { 1109 struct sockaddr *sa = NULL; 1110 int error, sa_len; 1111 1112 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1113 if (error) 1114 return (error); 1115 1116 error = kern_getsockname(uap->fdes, &sa, &sa_len); 1117 1118 if (error == 0) 1119 error = copyout(sa, uap->asa, sa_len); 1120 if (error == 0) 1121 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1122 if (sa) 1123 FREE(sa, M_SONAME); 1124 return (error); 1125 } 1126 1127 /* 1128 * The second argument to kern_getpeername() is a handle to a struct sockaddr. 1129 * This allows kern_getpeername() to return a pointer to an allocated struct 1130 * sockaddr which must be freed later with FREE(). The caller must 1131 * initialize *name to NULL. 1132 */ 1133 int 1134 kern_getpeername(int s, struct sockaddr **name, int *namelen) 1135 { 1136 struct thread *td = curthread; 1137 struct proc *p = td->td_proc; 1138 struct file *fp; 1139 struct socket *so; 1140 struct sockaddr *sa = NULL; 1141 int error; 1142 1143 error = holdsock(p->p_fd, s, &fp); 1144 if (error) 1145 return (error); 1146 if (*namelen < 0) { 1147 fdrop(fp, td); 1148 return (EINVAL); 1149 } 1150 so = (struct socket *)fp->f_data; 1151 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1152 fdrop(fp, td); 1153 return (ENOTCONN); 1154 } 1155 error = so_pru_peeraddr(so, &sa); 1156 if (error == 0) { 1157 if (sa == 0) { 1158 *namelen = 0; 1159 } else { 1160 *namelen = MIN(*namelen, sa->sa_len); 1161 *name = sa; 1162 } 1163 } 1164 1165 fdrop(fp, td); 1166 return (error); 1167 } 1168 1169 /* 1170 * getpeername_args(int fdes, caddr_t asa, int *alen) 1171 * 1172 * Get name of peer for connected socket. 1173 */ 1174 int 1175 getpeername(struct getpeername_args *uap) 1176 { 1177 struct sockaddr *sa = NULL; 1178 int error, sa_len; 1179 1180 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1181 if (error) 1182 return (error); 1183 1184 error = kern_getpeername(uap->fdes, &sa, &sa_len); 1185 1186 if (error == 0) 1187 error = copyout(sa, uap->asa, sa_len); 1188 if (error == 0) 1189 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1190 if (sa) 1191 FREE(sa, M_SONAME); 1192 return (error); 1193 } 1194 1195 int 1196 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) 1197 { 1198 struct sockaddr *sa; 1199 int error; 1200 1201 *namp = NULL; 1202 if (len > SOCK_MAXADDRLEN) 1203 return ENAMETOOLONG; 1204 if (len < offsetof(struct sockaddr, sa_data[0])) 1205 return EDOM; 1206 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); 1207 error = copyin(uaddr, sa, len); 1208 if (error) { 1209 FREE(sa, M_SONAME); 1210 } else { 1211 #if BYTE_ORDER != BIG_ENDIAN 1212 /* 1213 * The bind(), connect(), and sendto() syscalls were not 1214 * versioned for COMPAT_43. Thus, this check must stay. 1215 */ 1216 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1217 sa->sa_family = sa->sa_len; 1218 #endif 1219 sa->sa_len = len; 1220 *namp = sa; 1221 } 1222 return error; 1223 } 1224 1225 /* 1226 * holdsock() - load the struct file pointer associated 1227 * with a socket into *fpp. If an error occurs, non-zero 1228 * will be returned and *fpp will be set to NULL. 1229 */ 1230 int 1231 holdsock(struct filedesc *fdp, int fdes, struct file **fpp) 1232 { 1233 struct file *fp; 1234 int error = 0; 1235 1236 *fpp = NULL; 1237 if ((unsigned)fdes >= fdp->fd_nfiles) 1238 return EBADF; 1239 if ((fp = fdp->fd_files[fdes].fp) == NULL) 1240 return EBADF; 1241 if (fp->f_type != DTYPE_SOCKET) 1242 return ENOTSOCK; 1243 fhold(fp); 1244 *fpp = fp; 1245 return (error); 1246 } 1247 1248 /* 1249 * Detach a mapped page and release resources back to the system. 1250 * We must release our wiring and if the object is ripped out 1251 * from under the vm_page we become responsible for freeing the 1252 * page. These routines must be MPSAFE. 1253 * 1254 * XXX HACK XXX TEMPORARY UNTIL WE IMPLEMENT EXT MBUF REFERENCE COUNTING 1255 * 1256 * XXX vm_page_*() routines are not MPSAFE yet, the MP lock is required. 1257 */ 1258 static void 1259 sf_buf_mref(void *arg) 1260 { 1261 struct sfbuf_mref *sfm = arg; 1262 1263 /* 1264 * We must already hold a ref so there is no race to 0, just 1265 * atomically increment the count. 1266 */ 1267 atomic_add_int(&sfm->mref_count, 1); 1268 } 1269 1270 static void 1271 sf_buf_mfree(void *arg) 1272 { 1273 struct sfbuf_mref *sfm = arg; 1274 vm_page_t m; 1275 1276 KKASSERT(sfm->mref_count > 0); 1277 if (sfm->mref_count == 1) { 1278 /* 1279 * We are the only holder so no further locking is required, 1280 * the sfbuf can simply be freed. 1281 */ 1282 sfm->mref_count = 0; 1283 goto freeit; 1284 } else { 1285 /* 1286 * There may be other holders, we must obtain the serializer 1287 * to protect against a sf_buf_mfree() race to 0. An atomic 1288 * operation is still required for races against 1289 * sf_buf_mref(). 1290 * 1291 * XXX vm_page_*() and SFBUF routines not MPSAFE yet. 1292 */ 1293 lwkt_serialize_enter(&sfm->serializer); 1294 atomic_subtract_int(&sfm->mref_count, 1); 1295 if (sfm->mref_count == 0) { 1296 lwkt_serialize_exit(&sfm->serializer); 1297 freeit: 1298 get_mplock(); 1299 crit_enter(); 1300 m = sf_buf_page(sfm->sf); 1301 sf_buf_free(sfm->sf); 1302 vm_page_unwire(m, 0); 1303 if (m->wire_count == 0 && m->object == NULL) 1304 vm_page_try_to_free(m); 1305 crit_exit(); 1306 rel_mplock(); 1307 free(sfm, M_SENDFILE); 1308 } else { 1309 lwkt_serialize_exit(&sfm->serializer); 1310 } 1311 } 1312 } 1313 1314 /* 1315 * sendfile(2). 1316 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1317 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1318 * 1319 * Send a file specified by 'fd' and starting at 'offset' to a socket 1320 * specified by 's'. Send only 'nbytes' of the file or until EOF if 1321 * nbytes == 0. Optionally add a header and/or trailer to the socket 1322 * output. If specified, write the total number of bytes sent into *sbytes. 1323 * 1324 * In FreeBSD kern/uipc_syscalls.c,v 1.103, a bug was fixed that caused 1325 * the headers to count against the remaining bytes to be sent from 1326 * the file descriptor. We may wish to implement a compatibility syscall 1327 * in the future. 1328 */ 1329 int 1330 sendfile(struct sendfile_args *uap) 1331 { 1332 struct thread *td = curthread; 1333 struct proc *p = td->td_proc; 1334 struct file *fp; 1335 struct filedesc *fdp; 1336 struct vnode *vp = NULL; 1337 struct sf_hdtr hdtr; 1338 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 1339 struct uio auio; 1340 struct mbuf *mheader = NULL; 1341 off_t hdtr_size = 0, sbytes; 1342 int error, hbytes = 0, tbytes; 1343 1344 KKASSERT(p); 1345 fdp = p->p_fd; 1346 1347 /* 1348 * Do argument checking. Must be a regular file in, stream 1349 * type and connected socket out, positive offset. 1350 */ 1351 fp = holdfp(fdp, uap->fd, FREAD); 1352 if (fp == NULL) { 1353 return (EBADF); 1354 } 1355 if (fp->f_type != DTYPE_VNODE) { 1356 fdrop(fp, td); 1357 return (EINVAL); 1358 } 1359 vp = (struct vnode *)fp->f_data; 1360 vref(vp); 1361 fdrop(fp, td); 1362 1363 /* 1364 * If specified, get the pointer to the sf_hdtr struct for 1365 * any headers/trailers. 1366 */ 1367 if (uap->hdtr) { 1368 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1369 if (error) 1370 goto done; 1371 /* 1372 * Send any headers. 1373 */ 1374 if (hdtr.headers) { 1375 error = iovec_copyin(hdtr.headers, &iov, aiov, 1376 hdtr.hdr_cnt, &hbytes); 1377 if (error) 1378 goto done; 1379 auio.uio_iov = iov; 1380 auio.uio_iovcnt = hdtr.hdr_cnt; 1381 auio.uio_offset = 0; 1382 auio.uio_segflg = UIO_USERSPACE; 1383 auio.uio_rw = UIO_WRITE; 1384 auio.uio_td = td; 1385 auio.uio_resid = hbytes; 1386 1387 mheader = m_uiomove(&auio); 1388 1389 iovec_free(&iov, aiov); 1390 if (mheader == NULL) 1391 goto done; 1392 } 1393 } 1394 1395 error = kern_sendfile(vp, uap->s, uap->offset, uap->nbytes, mheader, 1396 &sbytes, uap->flags); 1397 if (error) 1398 goto done; 1399 1400 /* 1401 * Send trailers. Wimp out and use writev(2). 1402 */ 1403 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1404 error = iovec_copyin(hdtr.trailers, &iov, aiov, 1405 hdtr.trl_cnt, &auio.uio_resid); 1406 if (error) 1407 goto done; 1408 auio.uio_iov = iov; 1409 auio.uio_iovcnt = hdtr.trl_cnt; 1410 auio.uio_offset = 0; 1411 auio.uio_segflg = UIO_USERSPACE; 1412 auio.uio_rw = UIO_WRITE; 1413 auio.uio_td = td; 1414 1415 error = kern_sendmsg(uap->s, NULL, &auio, NULL, 0, &tbytes); 1416 1417 iovec_free(&iov, aiov); 1418 if (error) 1419 goto done; 1420 hdtr_size += tbytes; /* trailer bytes successfully sent */ 1421 } 1422 1423 done: 1424 if (uap->sbytes != NULL) { 1425 sbytes += hdtr_size; 1426 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1427 } 1428 if (vp) 1429 vrele(vp); 1430 return (error); 1431 } 1432 1433 int 1434 kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, 1435 struct mbuf *mheader, off_t *sbytes, int flags) 1436 { 1437 struct thread *td = curthread; 1438 struct proc *p = td->td_proc; 1439 struct vm_object *obj; 1440 struct socket *so; 1441 struct file *fp; 1442 struct mbuf *m; 1443 struct sf_buf *sf; 1444 struct sfbuf_mref *sfm; 1445 struct vm_page *pg; 1446 off_t off, xfsize; 1447 off_t hbytes = 0; 1448 int error = 0; 1449 1450 if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { 1451 error = EINVAL; 1452 goto done0; 1453 } 1454 error = holdsock(p->p_fd, sfd, &fp); 1455 if (error) 1456 goto done0; 1457 so = (struct socket *)fp->f_data; 1458 if (so->so_type != SOCK_STREAM) { 1459 error = EINVAL; 1460 goto done; 1461 } 1462 if ((so->so_state & SS_ISCONNECTED) == 0) { 1463 error = ENOTCONN; 1464 goto done; 1465 } 1466 if (offset < 0) { 1467 error = EINVAL; 1468 goto done; 1469 } 1470 1471 *sbytes = 0; 1472 /* 1473 * Protect against multiple writers to the socket. 1474 */ 1475 (void) sblock(&so->so_snd, M_WAITOK); 1476 1477 /* 1478 * Loop through the pages in the file, starting with the requested 1479 * offset. Get a file page (do I/O if necessary), map the file page 1480 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1481 * it on the socket. 1482 */ 1483 for (off = offset; ; off += xfsize, *sbytes += xfsize + hbytes) { 1484 vm_pindex_t pindex; 1485 vm_offset_t pgoff; 1486 1487 pindex = OFF_TO_IDX(off); 1488 retry_lookup: 1489 /* 1490 * Calculate the amount to transfer. Not to exceed a page, 1491 * the EOF, or the passed in nbytes. 1492 */ 1493 xfsize = obj->un_pager.vnp.vnp_size - off; 1494 if (xfsize > PAGE_SIZE) 1495 xfsize = PAGE_SIZE; 1496 pgoff = (vm_offset_t)(off & PAGE_MASK); 1497 if (PAGE_SIZE - pgoff < xfsize) 1498 xfsize = PAGE_SIZE - pgoff; 1499 if (nbytes && xfsize > (nbytes - *sbytes)) 1500 xfsize = nbytes - *sbytes; 1501 if (xfsize <= 0) 1502 break; 1503 /* 1504 * Optimize the non-blocking case by looking at the socket space 1505 * before going to the extra work of constituting the sf_buf. 1506 */ 1507 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { 1508 if (so->so_state & SS_CANTSENDMORE) 1509 error = EPIPE; 1510 else 1511 error = EAGAIN; 1512 sbunlock(&so->so_snd); 1513 goto done; 1514 } 1515 /* 1516 * Attempt to look up the page. 1517 * 1518 * Allocate if not found, wait and loop if busy, then 1519 * wire the page. critical section protection is 1520 * required to maintain the object association (an 1521 * interrupt can free the page) through to the 1522 * vm_page_wire() call. 1523 */ 1524 crit_enter(); 1525 pg = vm_page_lookup(obj, pindex); 1526 if (pg == NULL) { 1527 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); 1528 if (pg == NULL) { 1529 vm_wait(); 1530 crit_exit(); 1531 goto retry_lookup; 1532 } 1533 vm_page_wakeup(pg); 1534 } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { 1535 crit_exit(); 1536 goto retry_lookup; 1537 } 1538 vm_page_wire(pg); 1539 crit_exit(); 1540 1541 /* 1542 * If page is not valid for what we need, initiate I/O 1543 */ 1544 1545 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { 1546 struct uio auio; 1547 struct iovec aiov; 1548 int bsize; 1549 1550 /* 1551 * Ensure that our page is still around when the I/O 1552 * completes. 1553 */ 1554 vm_page_io_start(pg); 1555 1556 /* 1557 * Get the page from backing store. 1558 */ 1559 bsize = vp->v_mount->mnt_stat.f_iosize; 1560 auio.uio_iov = &aiov; 1561 auio.uio_iovcnt = 1; 1562 aiov.iov_base = 0; 1563 aiov.iov_len = MAXBSIZE; 1564 auio.uio_resid = MAXBSIZE; 1565 auio.uio_offset = trunc_page(off); 1566 auio.uio_segflg = UIO_NOCOPY; 1567 auio.uio_rw = UIO_READ; 1568 auio.uio_td = td; 1569 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); 1570 error = VOP_READ(vp, &auio, 1571 IO_VMIO | ((MAXBSIZE / bsize) << 16), 1572 p->p_ucred); 1573 VOP_UNLOCK(vp, 0, td); 1574 vm_page_flag_clear(pg, PG_ZERO); 1575 vm_page_io_finish(pg); 1576 if (error) { 1577 crit_enter(); 1578 vm_page_unwire(pg, 0); 1579 vm_page_try_to_free(pg); 1580 crit_exit(); 1581 sbunlock(&so->so_snd); 1582 goto done; 1583 } 1584 } 1585 1586 1587 /* 1588 * Get a sendfile buf. We usually wait as long as necessary, 1589 * but this wait can be interrupted. 1590 */ 1591 if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { 1592 crit_enter(); 1593 vm_page_unwire(pg, 0); 1594 vm_page_try_to_free(pg); 1595 crit_exit(); 1596 sbunlock(&so->so_snd); 1597 error = EINTR; 1598 goto done; 1599 } 1600 1601 /* 1602 * Get an mbuf header and set it up as having external storage. 1603 */ 1604 MGETHDR(m, MB_WAIT, MT_DATA); 1605 if (m == NULL) { 1606 error = ENOBUFS; 1607 sf_buf_free(sf); 1608 sbunlock(&so->so_snd); 1609 goto done; 1610 } 1611 1612 /* 1613 * sfm is a temporary hack, use a per-cpu cache for this. 1614 */ 1615 sfm = malloc(sizeof(struct sfbuf_mref), M_SENDFILE, M_WAITOK); 1616 sfm->sf = sf; 1617 sfm->mref_count = 1; 1618 lwkt_serialize_init(&sfm->serializer); 1619 1620 m->m_ext.ext_free = sf_buf_mfree; 1621 m->m_ext.ext_ref = sf_buf_mref; 1622 m->m_ext.ext_arg = sfm; 1623 m->m_ext.ext_buf = (void *)sf->kva; 1624 m->m_ext.ext_size = PAGE_SIZE; 1625 m->m_data = (char *) sf->kva + pgoff; 1626 m->m_flags |= M_EXT; 1627 m->m_pkthdr.len = m->m_len = xfsize; 1628 KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0); 1629 1630 if (mheader != NULL) { 1631 hbytes = mheader->m_pkthdr.len; 1632 mheader->m_pkthdr.len += m->m_pkthdr.len; 1633 m_cat(mheader, m); 1634 m = mheader; 1635 mheader = NULL; 1636 } else 1637 hbytes = 0; 1638 1639 /* 1640 * Add the buffer to the socket buffer chain. 1641 */ 1642 crit_enter(); 1643 retry_space: 1644 /* 1645 * Make sure that the socket is still able to take more data. 1646 * CANTSENDMORE being true usually means that the connection 1647 * was closed. so_error is true when an error was sensed after 1648 * a previous send. 1649 * The state is checked after the page mapping and buffer 1650 * allocation above since those operations may block and make 1651 * any socket checks stale. From this point forward, nothing 1652 * blocks before the pru_send (or more accurately, any blocking 1653 * results in a loop back to here to re-check). 1654 */ 1655 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1656 if (so->so_state & SS_CANTSENDMORE) { 1657 error = EPIPE; 1658 } else { 1659 error = so->so_error; 1660 so->so_error = 0; 1661 } 1662 m_freem(m); 1663 sbunlock(&so->so_snd); 1664 crit_exit(); 1665 goto done; 1666 } 1667 /* 1668 * Wait for socket space to become available. We do this just 1669 * after checking the connection state above in order to avoid 1670 * a race condition with sbwait(). 1671 */ 1672 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { 1673 if (so->so_state & SS_NBIO) { 1674 m_freem(m); 1675 sbunlock(&so->so_snd); 1676 crit_exit(); 1677 error = EAGAIN; 1678 goto done; 1679 } 1680 error = sbwait(&so->so_snd); 1681 /* 1682 * An error from sbwait usually indicates that we've 1683 * been interrupted by a signal. If we've sent anything 1684 * then return bytes sent, otherwise return the error. 1685 */ 1686 if (error) { 1687 m_freem(m); 1688 sbunlock(&so->so_snd); 1689 crit_exit(); 1690 goto done; 1691 } 1692 goto retry_space; 1693 } 1694 error = so_pru_send(so, 0, m, NULL, NULL, td); 1695 crit_exit(); 1696 if (error) { 1697 sbunlock(&so->so_snd); 1698 goto done; 1699 } 1700 } 1701 if (mheader != NULL) { 1702 *sbytes += mheader->m_pkthdr.len; 1703 error = so_pru_send(so, 0, mheader, NULL, NULL, td); 1704 mheader = NULL; 1705 } 1706 sbunlock(&so->so_snd); 1707 1708 done: 1709 fdrop(fp, td); 1710 done0: 1711 if (mheader != NULL) 1712 m_freem(mheader); 1713 return (error); 1714 } 1715 1716 int 1717 sctp_peeloff(struct sctp_peeloff_args *uap) 1718 { 1719 #ifdef SCTP 1720 struct thread *td = curthread; 1721 struct proc *p = td->td_proc; 1722 struct filedesc *fdp = p->p_fd; 1723 struct file *lfp = NULL; 1724 struct file *nfp = NULL; 1725 int error; 1726 struct socket *head, *so; 1727 caddr_t assoc_id; 1728 int fd; 1729 short fflag; /* type must match fp->f_flag */ 1730 1731 assoc_id = uap->name; 1732 error = holdsock(fdp, uap->sd, &lfp); 1733 if (error) { 1734 return (error); 1735 } 1736 crit_enter(); 1737 head = (struct socket *)lfp->f_data; 1738 error = sctp_can_peel_off(head, assoc_id); 1739 if (error) { 1740 crit_exit(); 1741 goto done; 1742 } 1743 /* 1744 * At this point we know we do have a assoc to pull 1745 * we proceed to get the fd setup. This may block 1746 * but that is ok. 1747 */ 1748 1749 fflag = lfp->f_flag; 1750 error = falloc(p, &nfp, &fd); 1751 if (error) { 1752 /* 1753 * Probably ran out of file descriptors. Put the 1754 * unaccepted connection back onto the queue and 1755 * do another wakeup so some other process might 1756 * have a chance at it. 1757 */ 1758 crit_exit(); 1759 goto done; 1760 } 1761 fhold(nfp); 1762 uap->sysmsg_result = fd; 1763 1764 so = sctp_get_peeloff(head, assoc_id, &error); 1765 if (so == NULL) { 1766 /* 1767 * Either someone else peeled it off OR 1768 * we can't get a socket. 1769 */ 1770 goto noconnection; 1771 } 1772 so->so_state &= ~SS_COMP; 1773 so->so_state &= ~SS_NOFDREF; 1774 so->so_head = NULL; 1775 if (head->so_sigio != NULL) 1776 fsetown(fgetown(head->so_sigio), &so->so_sigio); 1777 1778 nfp->f_type = DTYPE_SOCKET; 1779 nfp->f_flag = fflag; 1780 nfp->f_ops = &socketops; 1781 nfp->f_data = so; 1782 1783 noconnection: 1784 /* 1785 * close the new descriptor, assuming someone hasn't ripped it 1786 * out from under us. 1787 */ 1788 if (error) { 1789 if (fdp->fd_files[fd].fp == nfp) { 1790 funsetfd(fdp, fd); 1791 fdrop(nfp, td); 1792 } 1793 } 1794 crit_exit(); 1795 /* 1796 * Release explicitly held references before returning. 1797 */ 1798 done: 1799 if (nfp != NULL) 1800 fdrop(nfp, td); 1801 fdrop(lfp, td); 1802 return (error); 1803 #else /* SCTP */ 1804 return(EOPNOTSUPP); 1805 #endif /* SCTP */ 1806 } 1807