1 /* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 37 * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $ 38 * $DragonFly: src/sys/kern/uipc_syscalls.c,v 1.92 2008/11/26 13:10:56 sephe Exp $ 39 */ 40 41 #include "opt_ktrace.h" 42 #include "opt_sctp.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/kernel.h> 47 #include <sys/sysproto.h> 48 #include <sys/malloc.h> 49 #include <sys/filedesc.h> 50 #include <sys/event.h> 51 #include <sys/proc.h> 52 #include <sys/fcntl.h> 53 #include <sys/file.h> 54 #include <sys/filio.h> 55 #include <sys/kern_syscall.h> 56 #include <sys/mbuf.h> 57 #include <sys/protosw.h> 58 #include <sys/sfbuf.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/socketops.h> 62 #include <sys/uio.h> 63 #include <sys/vnode.h> 64 #include <sys/lock.h> 65 #include <sys/mount.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_object.h> 71 #include <vm/vm_page.h> 72 #include <vm/vm_pageout.h> 73 #include <vm/vm_kern.h> 74 #include <vm/vm_extern.h> 75 #include <sys/file2.h> 76 #include <sys/signalvar.h> 77 #include <sys/serialize.h> 78 79 #include <sys/thread2.h> 80 #include <sys/msgport2.h> 81 #include <sys/socketvar2.h> 82 #include <net/netmsg2.h> 83 84 #ifdef SCTP 85 #include <netinet/sctp_peeloff.h> 86 #endif /* SCTP */ 87 88 struct sfbuf_mref { 89 struct sf_buf *sf; 90 int mref_count; 91 }; 92 93 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile sfbuf ref structures"); 94 95 /* 96 * System call interface to the socket abstraction. 97 */ 98 99 extern struct fileops socketops; 100 101 /* 102 * socket_args(int domain, int type, int protocol) 103 */ 104 int 105 kern_socket(int domain, int type, int protocol, int *res) 106 { 107 struct thread *td = curthread; 108 struct proc *p = td->td_proc; 109 struct socket *so; 110 struct file *fp; 111 int fd, error; 112 113 KKASSERT(p); 114 115 error = falloc(p, &fp, &fd); 116 if (error) 117 return (error); 118 error = socreate(domain, &so, type, protocol, td); 119 if (error) { 120 fsetfd(p, NULL, fd); 121 } else { 122 fp->f_type = DTYPE_SOCKET; 123 fp->f_flag = FREAD | FWRITE; 124 fp->f_ops = &socketops; 125 fp->f_data = so; 126 *res = fd; 127 fsetfd(p, fp, fd); 128 } 129 fdrop(fp); 130 return (error); 131 } 132 133 int 134 sys_socket(struct socket_args *uap) 135 { 136 int error; 137 138 error = kern_socket(uap->domain, uap->type, uap->protocol, 139 &uap->sysmsg_iresult); 140 141 return (error); 142 } 143 144 int 145 kern_bind(int s, struct sockaddr *sa) 146 { 147 struct thread *td = curthread; 148 struct proc *p = td->td_proc; 149 struct file *fp; 150 int error; 151 152 KKASSERT(p); 153 error = holdsock(p->p_fd, s, &fp); 154 if (error) 155 return (error); 156 error = sobind((struct socket *)fp->f_data, sa, td); 157 fdrop(fp); 158 return (error); 159 } 160 161 /* 162 * bind_args(int s, caddr_t name, int namelen) 163 */ 164 int 165 sys_bind(struct bind_args *uap) 166 { 167 struct sockaddr *sa; 168 int error; 169 170 error = getsockaddr(&sa, uap->name, uap->namelen); 171 if (error) 172 return (error); 173 error = kern_bind(uap->s, sa); 174 FREE(sa, M_SONAME); 175 176 return (error); 177 } 178 179 int 180 kern_listen(int s, int backlog) 181 { 182 struct thread *td = curthread; 183 struct proc *p = td->td_proc; 184 struct file *fp; 185 int error; 186 187 KKASSERT(p); 188 error = holdsock(p->p_fd, s, &fp); 189 if (error) 190 return (error); 191 error = solisten((struct socket *)fp->f_data, backlog, td); 192 fdrop(fp); 193 return(error); 194 } 195 196 /* 197 * listen_args(int s, int backlog) 198 */ 199 int 200 sys_listen(struct listen_args *uap) 201 { 202 int error; 203 204 error = kern_listen(uap->s, uap->backlog); 205 return (error); 206 } 207 208 /* 209 * Returns the accepted socket as well. 210 */ 211 static boolean_t 212 soaccept_predicate(struct netmsg *msg0) 213 { 214 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 215 struct socket *head = msg->nm_so; 216 217 if (head->so_error != 0) { 218 msg->nm_netmsg.nm_lmsg.ms_error = head->so_error; 219 return (TRUE); 220 } 221 if (!TAILQ_EMPTY(&head->so_comp)) { 222 /* Abuse nm_so field as copy in/copy out parameter. XXX JH */ 223 msg->nm_so = TAILQ_FIRST(&head->so_comp); 224 TAILQ_REMOVE(&head->so_comp, msg->nm_so, so_list); 225 head->so_qlen--; 226 227 msg->nm_netmsg.nm_lmsg.ms_error = 0; 228 return (TRUE); 229 } 230 if (head->so_state & SS_CANTRCVMORE) { 231 msg->nm_netmsg.nm_lmsg.ms_error = ECONNABORTED; 232 return (TRUE); 233 } 234 if (msg->nm_fflags & FNONBLOCK) { 235 msg->nm_netmsg.nm_lmsg.ms_error = EWOULDBLOCK; 236 return (TRUE); 237 } 238 239 return (FALSE); 240 } 241 242 /* 243 * The second argument to kern_accept() is a handle to a struct sockaddr. 244 * This allows kern_accept() to return a pointer to an allocated struct 245 * sockaddr which must be freed later with FREE(). The caller must 246 * initialize *name to NULL. 247 */ 248 int 249 kern_accept(int s, int fflags, struct sockaddr **name, int *namelen, int *res) 250 { 251 struct thread *td = curthread; 252 struct proc *p = td->td_proc; 253 struct file *lfp = NULL; 254 struct file *nfp = NULL; 255 struct sockaddr *sa; 256 struct socket *head, *so; 257 struct netmsg_so_notify msg; 258 lwkt_port_t port; 259 int fd; 260 u_int fflag; /* type must match fp->f_flag */ 261 int error, tmp; 262 263 *res = -1; 264 if (name && namelen && *namelen < 0) 265 return (EINVAL); 266 267 error = holdsock(p->p_fd, s, &lfp); 268 if (error) 269 return (error); 270 271 error = falloc(p, &nfp, &fd); 272 if (error) { /* Probably ran out of file descriptors. */ 273 fdrop(lfp); 274 return (error); 275 } 276 head = (struct socket *)lfp->f_data; 277 if ((head->so_options & SO_ACCEPTCONN) == 0) { 278 error = EINVAL; 279 goto done; 280 } 281 282 if (fflags & O_FBLOCKING) 283 fflags |= lfp->f_flag & ~FNONBLOCK; 284 else if (fflags & O_FNONBLOCKING) 285 fflags |= lfp->f_flag | FNONBLOCK; 286 else 287 fflags = lfp->f_flag; 288 289 /* optimize for uniprocessor case later XXX JH */ 290 port = head->so_proto->pr_mport(head, NULL, NULL, PRU_PRED); 291 netmsg_init_abortable(&msg.nm_netmsg, &curthread->td_msgport, 292 0, 293 netmsg_so_notify, 294 netmsg_so_notify_doabort); 295 msg.nm_predicate = soaccept_predicate; 296 msg.nm_fflags = fflags; 297 msg.nm_so = head; 298 msg.nm_etype = NM_REVENT; 299 error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, PCATCH); 300 if (error) 301 goto done; 302 303 /* 304 * At this point we have the connection that's ready to be accepted. 305 */ 306 so = msg.nm_so; 307 308 fflag = lfp->f_flag; 309 310 /* connection has been removed from the listen queue */ 311 KNOTE(&head->so_rcv.ssb_sel.si_note, 0); 312 313 so->so_state &= ~SS_COMP; 314 so->so_head = NULL; 315 if (head->so_sigio != NULL) 316 fsetown(fgetown(head->so_sigio), &so->so_sigio); 317 318 nfp->f_type = DTYPE_SOCKET; 319 nfp->f_flag = fflag; 320 nfp->f_ops = &socketops; 321 nfp->f_data = so; 322 /* Sync socket nonblocking/async state with file flags */ 323 tmp = fflag & FNONBLOCK; 324 fo_ioctl(nfp, FIONBIO, (caddr_t)&tmp, p->p_ucred, NULL); 325 tmp = fflag & FASYNC; 326 fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, p->p_ucred, NULL); 327 328 sa = NULL; 329 error = soaccept(so, &sa); 330 331 /* 332 * Set the returned name and namelen as applicable. Set the returned 333 * namelen to 0 for older code which might ignore the return value 334 * from accept. 335 */ 336 if (error == 0) { 337 if (sa && name && namelen) { 338 if (*namelen > sa->sa_len) 339 *namelen = sa->sa_len; 340 *name = sa; 341 } else { 342 if (sa) 343 FREE(sa, M_SONAME); 344 } 345 } 346 347 done: 348 /* 349 * If an error occured clear the reserved descriptor, else associate 350 * nfp with it. 351 * 352 * Note that *res is normally ignored if an error is returned but 353 * a syscall message will still have access to the result code. 354 */ 355 if (error) { 356 fsetfd(p, NULL, fd); 357 } else { 358 *res = fd; 359 fsetfd(p, nfp, fd); 360 } 361 fdrop(nfp); 362 fdrop(lfp); 363 return (error); 364 } 365 366 /* 367 * accept(int s, caddr_t name, int *anamelen) 368 */ 369 int 370 sys_accept(struct accept_args *uap) 371 { 372 struct sockaddr *sa = NULL; 373 int sa_len; 374 int error; 375 376 if (uap->name) { 377 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 378 if (error) 379 return (error); 380 381 error = kern_accept(uap->s, 0, &sa, &sa_len, 382 &uap->sysmsg_iresult); 383 384 if (error == 0) 385 error = copyout(sa, uap->name, sa_len); 386 if (error == 0) { 387 error = copyout(&sa_len, uap->anamelen, 388 sizeof(*uap->anamelen)); 389 } 390 if (sa) 391 FREE(sa, M_SONAME); 392 } else { 393 error = kern_accept(uap->s, 0, NULL, 0, 394 &uap->sysmsg_iresult); 395 } 396 return (error); 397 } 398 399 /* 400 * extaccept(int s, int fflags, caddr_t name, int *anamelen) 401 */ 402 int 403 sys_extaccept(struct extaccept_args *uap) 404 { 405 struct sockaddr *sa = NULL; 406 int sa_len; 407 int error; 408 int fflags = uap->flags & O_FMASK; 409 410 if (uap->name) { 411 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 412 if (error) 413 return (error); 414 415 error = kern_accept(uap->s, fflags, &sa, &sa_len, 416 &uap->sysmsg_iresult); 417 418 if (error == 0) 419 error = copyout(sa, uap->name, sa_len); 420 if (error == 0) { 421 error = copyout(&sa_len, uap->anamelen, 422 sizeof(*uap->anamelen)); 423 } 424 if (sa) 425 FREE(sa, M_SONAME); 426 } else { 427 error = kern_accept(uap->s, fflags, NULL, 0, 428 &uap->sysmsg_iresult); 429 } 430 return (error); 431 } 432 433 434 /* 435 * Returns TRUE if predicate satisfied. 436 */ 437 static boolean_t 438 soconnected_predicate(struct netmsg *msg0) 439 { 440 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 441 struct socket *so = msg->nm_so; 442 443 /* check predicate */ 444 if (!(so->so_state & SS_ISCONNECTING) || so->so_error != 0) { 445 msg->nm_netmsg.nm_lmsg.ms_error = so->so_error; 446 return (TRUE); 447 } 448 449 return (FALSE); 450 } 451 452 int 453 kern_connect(int s, int fflags, struct sockaddr *sa) 454 { 455 struct thread *td = curthread; 456 struct proc *p = td->td_proc; 457 struct file *fp; 458 struct socket *so; 459 int error, interrupted = 0; 460 461 error = holdsock(p->p_fd, s, &fp); 462 if (error) 463 return (error); 464 so = (struct socket *)fp->f_data; 465 466 if (fflags & O_FBLOCKING) 467 /* fflags &= ~FNONBLOCK; */; 468 else if (fflags & O_FNONBLOCKING) 469 fflags |= FNONBLOCK; 470 else 471 fflags = fp->f_flag; 472 473 if (so->so_state & SS_ISCONNECTING) { 474 error = EALREADY; 475 goto done; 476 } 477 error = soconnect(so, sa, td); 478 if (error) 479 goto bad; 480 if ((fflags & FNONBLOCK) && (so->so_state & SS_ISCONNECTING)) { 481 error = EINPROGRESS; 482 goto done; 483 } 484 if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 485 struct netmsg_so_notify msg; 486 lwkt_port_t port; 487 488 port = so->so_proto->pr_mport(so, sa, NULL, PRU_PRED); 489 netmsg_init_abortable(&msg.nm_netmsg, 490 &curthread->td_msgport, 491 0, 492 netmsg_so_notify, 493 netmsg_so_notify_doabort); 494 msg.nm_predicate = soconnected_predicate; 495 msg.nm_so = so; 496 msg.nm_etype = NM_REVENT; 497 error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, PCATCH); 498 if (error == EINTR || error == ERESTART) 499 interrupted = 1; 500 } 501 if (error == 0) { 502 error = so->so_error; 503 so->so_error = 0; 504 } 505 bad: 506 if (!interrupted) 507 so->so_state &= ~SS_ISCONNECTING; 508 if (error == ERESTART) 509 error = EINTR; 510 done: 511 fdrop(fp); 512 return (error); 513 } 514 515 /* 516 * connect_args(int s, caddr_t name, int namelen) 517 */ 518 int 519 sys_connect(struct connect_args *uap) 520 { 521 struct sockaddr *sa; 522 int error; 523 524 error = getsockaddr(&sa, uap->name, uap->namelen); 525 if (error) 526 return (error); 527 error = kern_connect(uap->s, 0, sa); 528 FREE(sa, M_SONAME); 529 530 return (error); 531 } 532 533 /* 534 * connect_args(int s, int fflags, caddr_t name, int namelen) 535 */ 536 int 537 sys_extconnect(struct extconnect_args *uap) 538 { 539 struct sockaddr *sa; 540 int error; 541 int fflags = uap->flags & O_FMASK; 542 543 error = getsockaddr(&sa, uap->name, uap->namelen); 544 if (error) 545 return (error); 546 error = kern_connect(uap->s, fflags, sa); 547 FREE(sa, M_SONAME); 548 549 return (error); 550 } 551 552 int 553 kern_socketpair(int domain, int type, int protocol, int *sv) 554 { 555 struct thread *td = curthread; 556 struct proc *p = td->td_proc; 557 struct file *fp1, *fp2; 558 struct socket *so1, *so2; 559 int fd1, fd2, error; 560 561 KKASSERT(p); 562 error = socreate(domain, &so1, type, protocol, td); 563 if (error) 564 return (error); 565 error = socreate(domain, &so2, type, protocol, td); 566 if (error) 567 goto free1; 568 error = falloc(p, &fp1, &fd1); 569 if (error) 570 goto free2; 571 sv[0] = fd1; 572 fp1->f_data = so1; 573 error = falloc(p, &fp2, &fd2); 574 if (error) 575 goto free3; 576 fp2->f_data = so2; 577 sv[1] = fd2; 578 error = soconnect2(so1, so2); 579 if (error) 580 goto free4; 581 if (type == SOCK_DGRAM) { 582 /* 583 * Datagram socket connection is asymmetric. 584 */ 585 error = soconnect2(so2, so1); 586 if (error) 587 goto free4; 588 } 589 fp1->f_type = fp2->f_type = DTYPE_SOCKET; 590 fp1->f_flag = fp2->f_flag = FREAD|FWRITE; 591 fp1->f_ops = fp2->f_ops = &socketops; 592 fsetfd(p, fp1, fd1); 593 fsetfd(p, fp2, fd2); 594 fdrop(fp1); 595 fdrop(fp2); 596 return (error); 597 free4: 598 fsetfd(p, NULL, fd2); 599 fdrop(fp2); 600 free3: 601 fsetfd(p, NULL, fd1); 602 fdrop(fp1); 603 free2: 604 (void)soclose(so2, 0); 605 free1: 606 (void)soclose(so1, 0); 607 return (error); 608 } 609 610 /* 611 * socketpair(int domain, int type, int protocol, int *rsv) 612 */ 613 int 614 sys_socketpair(struct socketpair_args *uap) 615 { 616 int error, sockv[2]; 617 618 error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv); 619 620 if (error == 0) 621 error = copyout(sockv, uap->rsv, sizeof(sockv)); 622 return (error); 623 } 624 625 int 626 kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio, 627 struct mbuf *control, int flags, size_t *res) 628 { 629 struct thread *td = curthread; 630 struct lwp *lp = td->td_lwp; 631 struct proc *p = td->td_proc; 632 struct file *fp; 633 size_t len; 634 int error; 635 struct socket *so; 636 #ifdef KTRACE 637 struct iovec *ktriov = NULL; 638 struct uio ktruio; 639 #endif 640 641 error = holdsock(p->p_fd, s, &fp); 642 if (error) 643 return (error); 644 #ifdef KTRACE 645 if (KTRPOINT(td, KTR_GENIO)) { 646 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 647 648 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 649 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 650 ktruio = *auio; 651 } 652 #endif 653 len = auio->uio_resid; 654 so = (struct socket *)fp->f_data; 655 if ((flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) { 656 if (fp->f_flag & FNONBLOCK) 657 flags |= MSG_FNONBLOCKING; 658 } 659 error = so_pru_sosend(so, sa, auio, NULL, control, flags, td); 660 if (error) { 661 if (auio->uio_resid != len && (error == ERESTART || 662 error == EINTR || error == EWOULDBLOCK)) 663 error = 0; 664 if (error == EPIPE) 665 lwpsignal(p, lp, SIGPIPE); 666 } 667 #ifdef KTRACE 668 if (ktriov != NULL) { 669 if (error == 0) { 670 ktruio.uio_iov = ktriov; 671 ktruio.uio_resid = len - auio->uio_resid; 672 ktrgenio(lp, s, UIO_WRITE, &ktruio, error); 673 } 674 FREE(ktriov, M_TEMP); 675 } 676 #endif 677 if (error == 0) 678 *res = len - auio->uio_resid; 679 fdrop(fp); 680 return (error); 681 } 682 683 /* 684 * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen) 685 */ 686 int 687 sys_sendto(struct sendto_args *uap) 688 { 689 struct thread *td = curthread; 690 struct uio auio; 691 struct iovec aiov; 692 struct sockaddr *sa = NULL; 693 int error; 694 695 if (uap->to) { 696 error = getsockaddr(&sa, uap->to, uap->tolen); 697 if (error) 698 return (error); 699 } 700 aiov.iov_base = uap->buf; 701 aiov.iov_len = uap->len; 702 auio.uio_iov = &aiov; 703 auio.uio_iovcnt = 1; 704 auio.uio_offset = 0; 705 auio.uio_resid = uap->len; 706 auio.uio_segflg = UIO_USERSPACE; 707 auio.uio_rw = UIO_WRITE; 708 auio.uio_td = td; 709 710 error = kern_sendmsg(uap->s, sa, &auio, NULL, uap->flags, 711 &uap->sysmsg_szresult); 712 713 if (sa) 714 FREE(sa, M_SONAME); 715 return (error); 716 } 717 718 /* 719 * sendmsg_args(int s, caddr_t msg, int flags) 720 */ 721 int 722 sys_sendmsg(struct sendmsg_args *uap) 723 { 724 struct thread *td = curthread; 725 struct msghdr msg; 726 struct uio auio; 727 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 728 struct sockaddr *sa = NULL; 729 struct mbuf *control = NULL; 730 int error; 731 732 error = copyin(uap->msg, (caddr_t)&msg, sizeof(msg)); 733 if (error) 734 return (error); 735 736 /* 737 * Conditionally copyin msg.msg_name. 738 */ 739 if (msg.msg_name) { 740 error = getsockaddr(&sa, msg.msg_name, msg.msg_namelen); 741 if (error) 742 return (error); 743 } 744 745 /* 746 * Populate auio. 747 */ 748 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 749 &auio.uio_resid); 750 if (error) 751 goto cleanup2; 752 auio.uio_iov = iov; 753 auio.uio_iovcnt = msg.msg_iovlen; 754 auio.uio_offset = 0; 755 auio.uio_segflg = UIO_USERSPACE; 756 auio.uio_rw = UIO_WRITE; 757 auio.uio_td = td; 758 759 /* 760 * Conditionally copyin msg.msg_control. 761 */ 762 if (msg.msg_control) { 763 if (msg.msg_controllen < sizeof(struct cmsghdr) || 764 msg.msg_controllen > MLEN) { 765 error = EINVAL; 766 goto cleanup; 767 } 768 control = m_get(MB_WAIT, MT_CONTROL); 769 if (control == NULL) { 770 error = ENOBUFS; 771 goto cleanup; 772 } 773 control->m_len = msg.msg_controllen; 774 error = copyin(msg.msg_control, mtod(control, caddr_t), 775 msg.msg_controllen); 776 if (error) { 777 m_free(control); 778 goto cleanup; 779 } 780 } 781 782 error = kern_sendmsg(uap->s, sa, &auio, control, uap->flags, 783 &uap->sysmsg_szresult); 784 785 cleanup: 786 iovec_free(&iov, aiov); 787 cleanup2: 788 if (sa) 789 FREE(sa, M_SONAME); 790 return (error); 791 } 792 793 /* 794 * kern_recvmsg() takes a handle to sa and control. If the handle is non- 795 * null, it returns a dynamically allocated struct sockaddr and an mbuf. 796 * Don't forget to FREE() and m_free() these if they are returned. 797 */ 798 int 799 kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, 800 struct mbuf **control, int *flags, size_t *res) 801 { 802 struct thread *td = curthread; 803 struct proc *p = td->td_proc; 804 struct file *fp; 805 size_t len; 806 int error; 807 int lflags; 808 struct socket *so; 809 #ifdef KTRACE 810 struct iovec *ktriov = NULL; 811 struct uio ktruio; 812 #endif 813 814 error = holdsock(p->p_fd, s, &fp); 815 if (error) 816 return (error); 817 #ifdef KTRACE 818 if (KTRPOINT(td, KTR_GENIO)) { 819 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 820 821 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 822 bcopy(auio->uio_iov, ktriov, iovlen); 823 ktruio = *auio; 824 } 825 #endif 826 len = auio->uio_resid; 827 so = (struct socket *)fp->f_data; 828 829 if (flags == NULL || (*flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) { 830 if (fp->f_flag & FNONBLOCK) { 831 if (flags) { 832 *flags |= MSG_FNONBLOCKING; 833 } else { 834 lflags = MSG_FNONBLOCKING; 835 flags = &lflags; 836 } 837 } 838 } 839 840 error = so_pru_soreceive(so, sa, auio, NULL, control, flags); 841 if (error) { 842 if (auio->uio_resid != len && (error == ERESTART || 843 error == EINTR || error == EWOULDBLOCK)) 844 error = 0; 845 } 846 #ifdef KTRACE 847 if (ktriov != NULL) { 848 if (error == 0) { 849 ktruio.uio_iov = ktriov; 850 ktruio.uio_resid = len - auio->uio_resid; 851 ktrgenio(td->td_lwp, s, UIO_READ, &ktruio, error); 852 } 853 FREE(ktriov, M_TEMP); 854 } 855 #endif 856 if (error == 0) 857 *res = len - auio->uio_resid; 858 fdrop(fp); 859 return (error); 860 } 861 862 /* 863 * recvfrom_args(int s, caddr_t buf, size_t len, int flags, 864 * caddr_t from, int *fromlenaddr) 865 */ 866 int 867 sys_recvfrom(struct recvfrom_args *uap) 868 { 869 struct thread *td = curthread; 870 struct uio auio; 871 struct iovec aiov; 872 struct sockaddr *sa = NULL; 873 int error, fromlen; 874 875 if (uap->from && uap->fromlenaddr) { 876 error = copyin(uap->fromlenaddr, &fromlen, sizeof(fromlen)); 877 if (error) 878 return (error); 879 if (fromlen < 0) 880 return (EINVAL); 881 } else { 882 fromlen = 0; 883 } 884 aiov.iov_base = uap->buf; 885 aiov.iov_len = uap->len; 886 auio.uio_iov = &aiov; 887 auio.uio_iovcnt = 1; 888 auio.uio_offset = 0; 889 auio.uio_resid = uap->len; 890 auio.uio_segflg = UIO_USERSPACE; 891 auio.uio_rw = UIO_READ; 892 auio.uio_td = td; 893 894 error = kern_recvmsg(uap->s, uap->from ? &sa : NULL, &auio, NULL, 895 &uap->flags, &uap->sysmsg_szresult); 896 897 if (error == 0 && uap->from) { 898 /* note: sa may still be NULL */ 899 if (sa) { 900 fromlen = MIN(fromlen, sa->sa_len); 901 error = copyout(sa, uap->from, fromlen); 902 } else { 903 fromlen = 0; 904 } 905 if (error == 0) { 906 error = copyout(&fromlen, uap->fromlenaddr, 907 sizeof(fromlen)); 908 } 909 } 910 if (sa) 911 FREE(sa, M_SONAME); 912 913 return (error); 914 } 915 916 /* 917 * recvmsg_args(int s, struct msghdr *msg, int flags) 918 */ 919 int 920 sys_recvmsg(struct recvmsg_args *uap) 921 { 922 struct thread *td = curthread; 923 struct msghdr msg; 924 struct uio auio; 925 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 926 struct mbuf *m, *control = NULL; 927 struct sockaddr *sa = NULL; 928 caddr_t ctlbuf; 929 socklen_t *ufromlenp, *ucontrollenp; 930 int error, fromlen, controllen, len, flags, *uflagsp; 931 932 /* 933 * This copyin handles everything except the iovec. 934 */ 935 error = copyin(uap->msg, &msg, sizeof(msg)); 936 if (error) 937 return (error); 938 939 if (msg.msg_name && msg.msg_namelen < 0) 940 return (EINVAL); 941 if (msg.msg_control && msg.msg_controllen < 0) 942 return (EINVAL); 943 944 ufromlenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 945 msg_namelen)); 946 ucontrollenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 947 msg_controllen)); 948 uflagsp = (int *)((caddr_t)uap->msg + offsetof(struct msghdr, 949 msg_flags)); 950 951 /* 952 * Populate auio. 953 */ 954 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 955 &auio.uio_resid); 956 if (error) 957 return (error); 958 auio.uio_iov = iov; 959 auio.uio_iovcnt = msg.msg_iovlen; 960 auio.uio_offset = 0; 961 auio.uio_segflg = UIO_USERSPACE; 962 auio.uio_rw = UIO_READ; 963 auio.uio_td = td; 964 965 flags = uap->flags; 966 967 error = kern_recvmsg(uap->s, 968 (msg.msg_name ? &sa : NULL), &auio, 969 (msg.msg_control ? &control : NULL), &flags, 970 &uap->sysmsg_szresult); 971 972 /* 973 * Conditionally copyout the name and populate the namelen field. 974 */ 975 if (error == 0 && msg.msg_name) { 976 /* note: sa may still be NULL */ 977 if (sa != NULL) { 978 fromlen = MIN(msg.msg_namelen, sa->sa_len); 979 error = copyout(sa, msg.msg_name, fromlen); 980 } else { 981 fromlen = 0; 982 } 983 if (error == 0) 984 error = copyout(&fromlen, ufromlenp, 985 sizeof(*ufromlenp)); 986 } 987 988 /* 989 * Copyout msg.msg_control and msg.msg_controllen. 990 */ 991 if (error == 0 && msg.msg_control) { 992 len = msg.msg_controllen; 993 m = control; 994 ctlbuf = (caddr_t)msg.msg_control; 995 996 while(m && len > 0) { 997 unsigned int tocopy; 998 999 if (len >= m->m_len) { 1000 tocopy = m->m_len; 1001 } else { 1002 msg.msg_flags |= MSG_CTRUNC; 1003 tocopy = len; 1004 } 1005 1006 error = copyout(mtod(m, caddr_t), ctlbuf, tocopy); 1007 if (error) 1008 goto cleanup; 1009 1010 ctlbuf += tocopy; 1011 len -= tocopy; 1012 m = m->m_next; 1013 } 1014 controllen = ctlbuf - (caddr_t)msg.msg_control; 1015 error = copyout(&controllen, ucontrollenp, 1016 sizeof(*ucontrollenp)); 1017 } 1018 1019 if (error == 0) 1020 error = copyout(&flags, uflagsp, sizeof(*uflagsp)); 1021 1022 cleanup: 1023 if (sa) 1024 FREE(sa, M_SONAME); 1025 iovec_free(&iov, aiov); 1026 if (control) 1027 m_freem(control); 1028 return (error); 1029 } 1030 1031 /* 1032 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1033 * in kernel pointer instead of a userland pointer. This allows us 1034 * to manipulate socket options in the emulation code. 1035 */ 1036 int 1037 kern_setsockopt(int s, struct sockopt *sopt) 1038 { 1039 struct thread *td = curthread; 1040 struct proc *p = td->td_proc; 1041 struct file *fp; 1042 int error; 1043 1044 if (sopt->sopt_val == NULL && sopt->sopt_valsize != 0) 1045 return (EFAULT); 1046 if (sopt->sopt_valsize < 0) 1047 return (EINVAL); 1048 1049 error = holdsock(p->p_fd, s, &fp); 1050 if (error) 1051 return (error); 1052 1053 error = sosetopt((struct socket *)fp->f_data, sopt); 1054 fdrop(fp); 1055 return (error); 1056 } 1057 1058 /* 1059 * setsockopt_args(int s, int level, int name, caddr_t val, int valsize) 1060 */ 1061 int 1062 sys_setsockopt(struct setsockopt_args *uap) 1063 { 1064 struct thread *td = curthread; 1065 struct sockopt sopt; 1066 int error; 1067 1068 sopt.sopt_level = uap->level; 1069 sopt.sopt_name = uap->name; 1070 sopt.sopt_valsize = uap->valsize; 1071 sopt.sopt_td = td; 1072 sopt.sopt_val = NULL; 1073 1074 if (sopt.sopt_valsize < 0 || sopt.sopt_valsize > SOMAXOPT_SIZE) 1075 return (EINVAL); 1076 if (uap->val) { 1077 sopt.sopt_val = kmalloc(sopt.sopt_valsize, M_TEMP, M_WAITOK); 1078 error = copyin(uap->val, sopt.sopt_val, sopt.sopt_valsize); 1079 if (error) 1080 goto out; 1081 } 1082 1083 error = kern_setsockopt(uap->s, &sopt); 1084 out: 1085 if (uap->val) 1086 kfree(sopt.sopt_val, M_TEMP); 1087 return(error); 1088 } 1089 1090 /* 1091 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1092 * in kernel pointer instead of a userland pointer. This allows us 1093 * to manipulate socket options in the emulation code. 1094 */ 1095 int 1096 kern_getsockopt(int s, struct sockopt *sopt) 1097 { 1098 struct thread *td = curthread; 1099 struct proc *p = td->td_proc; 1100 struct file *fp; 1101 int error; 1102 1103 if (sopt->sopt_val == NULL && sopt->sopt_valsize != 0) 1104 return (EFAULT); 1105 if (sopt->sopt_valsize < 0 || sopt->sopt_valsize > SOMAXOPT_SIZE) 1106 return (EINVAL); 1107 1108 error = holdsock(p->p_fd, s, &fp); 1109 if (error) 1110 return (error); 1111 1112 error = sogetopt((struct socket *)fp->f_data, sopt); 1113 fdrop(fp); 1114 return (error); 1115 } 1116 1117 /* 1118 * getsockopt_Args(int s, int level, int name, caddr_t val, int *avalsize) 1119 */ 1120 int 1121 sys_getsockopt(struct getsockopt_args *uap) 1122 { 1123 struct thread *td = curthread; 1124 struct sockopt sopt; 1125 int error, valsize; 1126 1127 if (uap->val) { 1128 error = copyin(uap->avalsize, &valsize, sizeof(valsize)); 1129 if (error) 1130 return (error); 1131 } else { 1132 valsize = 0; 1133 } 1134 1135 sopt.sopt_level = uap->level; 1136 sopt.sopt_name = uap->name; 1137 sopt.sopt_valsize = valsize; 1138 sopt.sopt_td = td; 1139 sopt.sopt_val = NULL; 1140 1141 if (sopt.sopt_valsize < 0 || sopt.sopt_valsize > SOMAXOPT_SIZE) 1142 return (EINVAL); 1143 if (uap->val) { 1144 sopt.sopt_val = kmalloc(sopt.sopt_valsize, M_TEMP, M_WAITOK); 1145 error = copyin(uap->val, sopt.sopt_val, sopt.sopt_valsize); 1146 if (error) 1147 goto out; 1148 } 1149 1150 error = kern_getsockopt(uap->s, &sopt); 1151 if (error) 1152 goto out; 1153 valsize = sopt.sopt_valsize; 1154 error = copyout(&valsize, uap->avalsize, sizeof(valsize)); 1155 if (error) 1156 goto out; 1157 if (uap->val) 1158 error = copyout(sopt.sopt_val, uap->val, sopt.sopt_valsize); 1159 out: 1160 if (uap->val) 1161 kfree(sopt.sopt_val, M_TEMP); 1162 return (error); 1163 } 1164 1165 /* 1166 * The second argument to kern_getsockname() is a handle to a struct sockaddr. 1167 * This allows kern_getsockname() to return a pointer to an allocated struct 1168 * sockaddr which must be freed later with FREE(). The caller must 1169 * initialize *name to NULL. 1170 */ 1171 int 1172 kern_getsockname(int s, struct sockaddr **name, int *namelen) 1173 { 1174 struct thread *td = curthread; 1175 struct proc *p = td->td_proc; 1176 struct file *fp; 1177 struct socket *so; 1178 struct sockaddr *sa = NULL; 1179 int error; 1180 1181 error = holdsock(p->p_fd, s, &fp); 1182 if (error) 1183 return (error); 1184 if (*namelen < 0) { 1185 fdrop(fp); 1186 return (EINVAL); 1187 } 1188 so = (struct socket *)fp->f_data; 1189 error = so_pru_sockaddr(so, &sa); 1190 if (error == 0) { 1191 if (sa == NULL) { 1192 *namelen = 0; 1193 } else { 1194 *namelen = MIN(*namelen, sa->sa_len); 1195 *name = sa; 1196 } 1197 } 1198 1199 fdrop(fp); 1200 return (error); 1201 } 1202 1203 /* 1204 * getsockname_args(int fdes, caddr_t asa, int *alen) 1205 * 1206 * Get socket name. 1207 */ 1208 int 1209 sys_getsockname(struct getsockname_args *uap) 1210 { 1211 struct sockaddr *sa = NULL; 1212 int error, sa_len; 1213 1214 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1215 if (error) 1216 return (error); 1217 1218 error = kern_getsockname(uap->fdes, &sa, &sa_len); 1219 1220 if (error == 0) 1221 error = copyout(sa, uap->asa, sa_len); 1222 if (error == 0) 1223 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1224 if (sa) 1225 FREE(sa, M_SONAME); 1226 return (error); 1227 } 1228 1229 /* 1230 * The second argument to kern_getpeername() is a handle to a struct sockaddr. 1231 * This allows kern_getpeername() to return a pointer to an allocated struct 1232 * sockaddr which must be freed later with FREE(). The caller must 1233 * initialize *name to NULL. 1234 */ 1235 int 1236 kern_getpeername(int s, struct sockaddr **name, int *namelen) 1237 { 1238 struct thread *td = curthread; 1239 struct proc *p = td->td_proc; 1240 struct file *fp; 1241 struct socket *so; 1242 struct sockaddr *sa = NULL; 1243 int error; 1244 1245 error = holdsock(p->p_fd, s, &fp); 1246 if (error) 1247 return (error); 1248 if (*namelen < 0) { 1249 fdrop(fp); 1250 return (EINVAL); 1251 } 1252 so = (struct socket *)fp->f_data; 1253 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1254 fdrop(fp); 1255 return (ENOTCONN); 1256 } 1257 error = so_pru_peeraddr(so, &sa); 1258 if (error == 0) { 1259 if (sa == NULL) { 1260 *namelen = 0; 1261 } else { 1262 *namelen = MIN(*namelen, sa->sa_len); 1263 *name = sa; 1264 } 1265 } 1266 1267 fdrop(fp); 1268 return (error); 1269 } 1270 1271 /* 1272 * getpeername_args(int fdes, caddr_t asa, int *alen) 1273 * 1274 * Get name of peer for connected socket. 1275 */ 1276 int 1277 sys_getpeername(struct getpeername_args *uap) 1278 { 1279 struct sockaddr *sa = NULL; 1280 int error, sa_len; 1281 1282 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1283 if (error) 1284 return (error); 1285 1286 error = kern_getpeername(uap->fdes, &sa, &sa_len); 1287 1288 if (error == 0) 1289 error = copyout(sa, uap->asa, sa_len); 1290 if (error == 0) 1291 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1292 if (sa) 1293 FREE(sa, M_SONAME); 1294 return (error); 1295 } 1296 1297 int 1298 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) 1299 { 1300 struct sockaddr *sa; 1301 int error; 1302 1303 *namp = NULL; 1304 if (len > SOCK_MAXADDRLEN) 1305 return ENAMETOOLONG; 1306 if (len < offsetof(struct sockaddr, sa_data[0])) 1307 return EDOM; 1308 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); 1309 error = copyin(uaddr, sa, len); 1310 if (error) { 1311 FREE(sa, M_SONAME); 1312 } else { 1313 #if BYTE_ORDER != BIG_ENDIAN 1314 /* 1315 * The bind(), connect(), and sendto() syscalls were not 1316 * versioned for COMPAT_43. Thus, this check must stay. 1317 */ 1318 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1319 sa->sa_family = sa->sa_len; 1320 #endif 1321 sa->sa_len = len; 1322 *namp = sa; 1323 } 1324 return error; 1325 } 1326 1327 /* 1328 * Detach a mapped page and release resources back to the system. 1329 * We must release our wiring and if the object is ripped out 1330 * from under the vm_page we become responsible for freeing the 1331 * page. These routines must be MPSAFE. 1332 * 1333 * XXX HACK XXX TEMPORARY UNTIL WE IMPLEMENT EXT MBUF REFERENCE COUNTING 1334 * 1335 * XXX vm_page_*() routines are not MPSAFE yet, the MP lock is required. 1336 */ 1337 static void 1338 sf_buf_mref(void *arg) 1339 { 1340 struct sfbuf_mref *sfm = arg; 1341 1342 /* 1343 * We must already hold a ref so there is no race to 0, just 1344 * atomically increment the count. 1345 */ 1346 atomic_add_int(&sfm->mref_count, 1); 1347 } 1348 1349 static void 1350 sf_buf_mfree(void *arg) 1351 { 1352 struct sfbuf_mref *sfm = arg; 1353 vm_page_t m; 1354 1355 KKASSERT(sfm->mref_count > 0); 1356 if (atomic_fetchadd_int(&sfm->mref_count, -1) == 1) { 1357 /* 1358 * XXX vm_page_*() and SFBUF routines not MPSAFE yet. 1359 */ 1360 get_mplock(); 1361 crit_enter(); 1362 m = sf_buf_page(sfm->sf); 1363 sf_buf_free(sfm->sf); 1364 vm_page_unwire(m, 0); 1365 if (m->wire_count == 0 && m->object == NULL) 1366 vm_page_try_to_free(m); 1367 crit_exit(); 1368 rel_mplock(); 1369 kfree(sfm, M_SENDFILE); 1370 } 1371 } 1372 1373 /* 1374 * sendfile(2). 1375 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1376 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1377 * 1378 * Send a file specified by 'fd' and starting at 'offset' to a socket 1379 * specified by 's'. Send only 'nbytes' of the file or until EOF if 1380 * nbytes == 0. Optionally add a header and/or trailer to the socket 1381 * output. If specified, write the total number of bytes sent into *sbytes. 1382 * 1383 * In FreeBSD kern/uipc_syscalls.c,v 1.103, a bug was fixed that caused 1384 * the headers to count against the remaining bytes to be sent from 1385 * the file descriptor. We may wish to implement a compatibility syscall 1386 * in the future. 1387 */ 1388 int 1389 sys_sendfile(struct sendfile_args *uap) 1390 { 1391 struct thread *td = curthread; 1392 struct proc *p = td->td_proc; 1393 struct file *fp; 1394 struct vnode *vp = NULL; 1395 struct sf_hdtr hdtr; 1396 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 1397 struct uio auio; 1398 struct mbuf *mheader = NULL; 1399 size_t hbytes = 0; 1400 size_t tbytes; 1401 off_t hdtr_size = 0; 1402 off_t sbytes; 1403 int error; 1404 1405 KKASSERT(p); 1406 1407 /* 1408 * Do argument checking. Must be a regular file in, stream 1409 * type and connected socket out, positive offset. 1410 */ 1411 fp = holdfp(p->p_fd, uap->fd, FREAD); 1412 if (fp == NULL) { 1413 return (EBADF); 1414 } 1415 if (fp->f_type != DTYPE_VNODE) { 1416 fdrop(fp); 1417 return (EINVAL); 1418 } 1419 vp = (struct vnode *)fp->f_data; 1420 vref(vp); 1421 fdrop(fp); 1422 1423 /* 1424 * If specified, get the pointer to the sf_hdtr struct for 1425 * any headers/trailers. 1426 */ 1427 if (uap->hdtr) { 1428 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1429 if (error) 1430 goto done; 1431 /* 1432 * Send any headers. 1433 */ 1434 if (hdtr.headers) { 1435 error = iovec_copyin(hdtr.headers, &iov, aiov, 1436 hdtr.hdr_cnt, &hbytes); 1437 if (error) 1438 goto done; 1439 auio.uio_iov = iov; 1440 auio.uio_iovcnt = hdtr.hdr_cnt; 1441 auio.uio_offset = 0; 1442 auio.uio_segflg = UIO_USERSPACE; 1443 auio.uio_rw = UIO_WRITE; 1444 auio.uio_td = td; 1445 auio.uio_resid = hbytes; 1446 1447 mheader = m_uiomove(&auio); 1448 1449 iovec_free(&iov, aiov); 1450 if (mheader == NULL) 1451 goto done; 1452 } 1453 } 1454 1455 error = kern_sendfile(vp, uap->s, uap->offset, uap->nbytes, mheader, 1456 &sbytes, uap->flags); 1457 if (error) 1458 goto done; 1459 1460 /* 1461 * Send trailers. Wimp out and use writev(2). 1462 */ 1463 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1464 error = iovec_copyin(hdtr.trailers, &iov, aiov, 1465 hdtr.trl_cnt, &auio.uio_resid); 1466 if (error) 1467 goto done; 1468 auio.uio_iov = iov; 1469 auio.uio_iovcnt = hdtr.trl_cnt; 1470 auio.uio_offset = 0; 1471 auio.uio_segflg = UIO_USERSPACE; 1472 auio.uio_rw = UIO_WRITE; 1473 auio.uio_td = td; 1474 1475 error = kern_sendmsg(uap->s, NULL, &auio, NULL, 0, &tbytes); 1476 1477 iovec_free(&iov, aiov); 1478 if (error) 1479 goto done; 1480 hdtr_size += tbytes; /* trailer bytes successfully sent */ 1481 } 1482 1483 done: 1484 if (uap->sbytes != NULL) { 1485 sbytes += hdtr_size; 1486 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1487 } 1488 if (vp) 1489 vrele(vp); 1490 return (error); 1491 } 1492 1493 int 1494 kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, 1495 struct mbuf *mheader, off_t *sbytes, int flags) 1496 { 1497 struct thread *td = curthread; 1498 struct proc *p = td->td_proc; 1499 struct vm_object *obj; 1500 struct socket *so; 1501 struct file *fp; 1502 struct mbuf *m; 1503 struct sf_buf *sf; 1504 struct sfbuf_mref *sfm; 1505 struct vm_page *pg; 1506 off_t off, xfsize; 1507 off_t hbytes = 0; 1508 int error = 0; 1509 1510 if (vp->v_type != VREG) { 1511 error = EINVAL; 1512 goto done0; 1513 } 1514 if ((obj = vp->v_object) == NULL) { 1515 error = EINVAL; 1516 goto done0; 1517 } 1518 error = holdsock(p->p_fd, sfd, &fp); 1519 if (error) 1520 goto done0; 1521 so = (struct socket *)fp->f_data; 1522 if (so->so_type != SOCK_STREAM) { 1523 error = EINVAL; 1524 goto done; 1525 } 1526 if ((so->so_state & SS_ISCONNECTED) == 0) { 1527 error = ENOTCONN; 1528 goto done; 1529 } 1530 if (offset < 0) { 1531 error = EINVAL; 1532 goto done; 1533 } 1534 1535 *sbytes = 0; 1536 /* 1537 * Protect against multiple writers to the socket. 1538 */ 1539 ssb_lock(&so->so_snd, M_WAITOK); 1540 1541 /* 1542 * Loop through the pages in the file, starting with the requested 1543 * offset. Get a file page (do I/O if necessary), map the file page 1544 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1545 * it on the socket. 1546 */ 1547 for (off = offset; ; off += xfsize, *sbytes += xfsize + hbytes) { 1548 vm_pindex_t pindex; 1549 vm_offset_t pgoff; 1550 1551 pindex = OFF_TO_IDX(off); 1552 retry_lookup: 1553 /* 1554 * Calculate the amount to transfer. Not to exceed a page, 1555 * the EOF, or the passed in nbytes. 1556 */ 1557 xfsize = vp->v_filesize - off; 1558 if (xfsize > PAGE_SIZE) 1559 xfsize = PAGE_SIZE; 1560 pgoff = (vm_offset_t)(off & PAGE_MASK); 1561 if (PAGE_SIZE - pgoff < xfsize) 1562 xfsize = PAGE_SIZE - pgoff; 1563 if (nbytes && xfsize > (nbytes - *sbytes)) 1564 xfsize = nbytes - *sbytes; 1565 if (xfsize <= 0) 1566 break; 1567 /* 1568 * Optimize the non-blocking case by looking at the socket space 1569 * before going to the extra work of constituting the sf_buf. 1570 */ 1571 if ((fp->f_flag & FNONBLOCK) && ssb_space(&so->so_snd) <= 0) { 1572 if (so->so_state & SS_CANTSENDMORE) 1573 error = EPIPE; 1574 else 1575 error = EAGAIN; 1576 ssb_unlock(&so->so_snd); 1577 goto done; 1578 } 1579 /* 1580 * Attempt to look up the page. 1581 * 1582 * Allocate if not found, wait and loop if busy, then 1583 * wire the page. critical section protection is 1584 * required to maintain the object association (an 1585 * interrupt can free the page) through to the 1586 * vm_page_wire() call. 1587 */ 1588 crit_enter(); 1589 pg = vm_page_lookup(obj, pindex); 1590 if (pg == NULL) { 1591 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); 1592 if (pg == NULL) { 1593 vm_wait(0); 1594 crit_exit(); 1595 goto retry_lookup; 1596 } 1597 vm_page_wakeup(pg); 1598 } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { 1599 crit_exit(); 1600 goto retry_lookup; 1601 } 1602 vm_page_wire(pg); 1603 crit_exit(); 1604 1605 /* 1606 * If page is not valid for what we need, initiate I/O 1607 */ 1608 1609 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { 1610 struct uio auio; 1611 struct iovec aiov; 1612 int bsize; 1613 1614 /* 1615 * Ensure that our page is still around when the I/O 1616 * completes. 1617 */ 1618 vm_page_io_start(pg); 1619 1620 /* 1621 * Get the page from backing store. 1622 */ 1623 bsize = vp->v_mount->mnt_stat.f_iosize; 1624 auio.uio_iov = &aiov; 1625 auio.uio_iovcnt = 1; 1626 aiov.iov_base = 0; 1627 aiov.iov_len = MAXBSIZE; 1628 auio.uio_resid = MAXBSIZE; 1629 auio.uio_offset = trunc_page(off); 1630 auio.uio_segflg = UIO_NOCOPY; 1631 auio.uio_rw = UIO_READ; 1632 auio.uio_td = td; 1633 vn_lock(vp, LK_SHARED | LK_RETRY); 1634 error = VOP_READ(vp, &auio, 1635 IO_VMIO | ((MAXBSIZE / bsize) << 16), 1636 p->p_ucred); 1637 vn_unlock(vp); 1638 vm_page_flag_clear(pg, PG_ZERO); 1639 vm_page_io_finish(pg); 1640 if (error) { 1641 crit_enter(); 1642 vm_page_unwire(pg, 0); 1643 vm_page_try_to_free(pg); 1644 crit_exit(); 1645 ssb_unlock(&so->so_snd); 1646 goto done; 1647 } 1648 } 1649 1650 1651 /* 1652 * Get a sendfile buf. We usually wait as long as necessary, 1653 * but this wait can be interrupted. 1654 */ 1655 if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { 1656 crit_enter(); 1657 vm_page_unwire(pg, 0); 1658 vm_page_try_to_free(pg); 1659 crit_exit(); 1660 ssb_unlock(&so->so_snd); 1661 error = EINTR; 1662 goto done; 1663 } 1664 1665 /* 1666 * Get an mbuf header and set it up as having external storage. 1667 */ 1668 MGETHDR(m, MB_WAIT, MT_DATA); 1669 if (m == NULL) { 1670 error = ENOBUFS; 1671 sf_buf_free(sf); 1672 ssb_unlock(&so->so_snd); 1673 goto done; 1674 } 1675 1676 /* 1677 * sfm is a temporary hack, use a per-cpu cache for this. 1678 */ 1679 sfm = kmalloc(sizeof(struct sfbuf_mref), M_SENDFILE, M_WAITOK); 1680 sfm->sf = sf; 1681 sfm->mref_count = 1; 1682 1683 m->m_ext.ext_free = sf_buf_mfree; 1684 m->m_ext.ext_ref = sf_buf_mref; 1685 m->m_ext.ext_arg = sfm; 1686 m->m_ext.ext_buf = (void *)sf->kva; 1687 m->m_ext.ext_size = PAGE_SIZE; 1688 m->m_data = (char *) sf->kva + pgoff; 1689 m->m_flags |= M_EXT; 1690 m->m_pkthdr.len = m->m_len = xfsize; 1691 KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0); 1692 1693 if (mheader != NULL) { 1694 hbytes = mheader->m_pkthdr.len; 1695 mheader->m_pkthdr.len += m->m_pkthdr.len; 1696 m_cat(mheader, m); 1697 m = mheader; 1698 mheader = NULL; 1699 } else 1700 hbytes = 0; 1701 1702 /* 1703 * Add the buffer to the socket buffer chain. 1704 */ 1705 crit_enter(); 1706 retry_space: 1707 /* 1708 * Make sure that the socket is still able to take more data. 1709 * CANTSENDMORE being true usually means that the connection 1710 * was closed. so_error is true when an error was sensed after 1711 * a previous send. 1712 * The state is checked after the page mapping and buffer 1713 * allocation above since those operations may block and make 1714 * any socket checks stale. From this point forward, nothing 1715 * blocks before the pru_send (or more accurately, any blocking 1716 * results in a loop back to here to re-check). 1717 */ 1718 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1719 if (so->so_state & SS_CANTSENDMORE) { 1720 error = EPIPE; 1721 } else { 1722 error = so->so_error; 1723 so->so_error = 0; 1724 } 1725 m_freem(m); 1726 ssb_unlock(&so->so_snd); 1727 crit_exit(); 1728 goto done; 1729 } 1730 /* 1731 * Wait for socket space to become available. We do this just 1732 * after checking the connection state above in order to avoid 1733 * a race condition with ssb_wait(). 1734 */ 1735 if (ssb_space(&so->so_snd) < so->so_snd.ssb_lowat) { 1736 if (fp->f_flag & FNONBLOCK) { 1737 m_freem(m); 1738 ssb_unlock(&so->so_snd); 1739 crit_exit(); 1740 error = EAGAIN; 1741 goto done; 1742 } 1743 error = ssb_wait(&so->so_snd); 1744 /* 1745 * An error from ssb_wait usually indicates that we've 1746 * been interrupted by a signal. If we've sent anything 1747 * then return bytes sent, otherwise return the error. 1748 */ 1749 if (error) { 1750 m_freem(m); 1751 ssb_unlock(&so->so_snd); 1752 crit_exit(); 1753 goto done; 1754 } 1755 goto retry_space; 1756 } 1757 error = so_pru_send(so, 0, m, NULL, NULL, td); 1758 crit_exit(); 1759 if (error) { 1760 ssb_unlock(&so->so_snd); 1761 goto done; 1762 } 1763 } 1764 if (mheader != NULL) { 1765 *sbytes += mheader->m_pkthdr.len; 1766 error = so_pru_send(so, 0, mheader, NULL, NULL, td); 1767 mheader = NULL; 1768 } 1769 ssb_unlock(&so->so_snd); 1770 1771 done: 1772 fdrop(fp); 1773 done0: 1774 if (mheader != NULL) 1775 m_freem(mheader); 1776 return (error); 1777 } 1778 1779 int 1780 sys_sctp_peeloff(struct sctp_peeloff_args *uap) 1781 { 1782 #ifdef SCTP 1783 struct thread *td = curthread; 1784 struct proc *p = td->td_proc; 1785 struct file *lfp = NULL; 1786 struct file *nfp = NULL; 1787 int error; 1788 struct socket *head, *so; 1789 caddr_t assoc_id; 1790 int fd; 1791 short fflag; /* type must match fp->f_flag */ 1792 1793 assoc_id = uap->name; 1794 error = holdsock(p->p_fd, uap->sd, &lfp); 1795 if (error) { 1796 return (error); 1797 } 1798 crit_enter(); 1799 head = (struct socket *)lfp->f_data; 1800 error = sctp_can_peel_off(head, assoc_id); 1801 if (error) { 1802 crit_exit(); 1803 goto done; 1804 } 1805 /* 1806 * At this point we know we do have a assoc to pull 1807 * we proceed to get the fd setup. This may block 1808 * but that is ok. 1809 */ 1810 1811 fflag = lfp->f_flag; 1812 error = falloc(p, &nfp, &fd); 1813 if (error) { 1814 /* 1815 * Probably ran out of file descriptors. Put the 1816 * unaccepted connection back onto the queue and 1817 * do another wakeup so some other process might 1818 * have a chance at it. 1819 */ 1820 crit_exit(); 1821 goto done; 1822 } 1823 uap->sysmsg_iresult = fd; 1824 1825 so = sctp_get_peeloff(head, assoc_id, &error); 1826 if (so == NULL) { 1827 /* 1828 * Either someone else peeled it off OR 1829 * we can't get a socket. 1830 */ 1831 goto noconnection; 1832 } 1833 so->so_state &= ~SS_COMP; 1834 so->so_state &= ~SS_NOFDREF; 1835 so->so_head = NULL; 1836 if (head->so_sigio != NULL) 1837 fsetown(fgetown(head->so_sigio), &so->so_sigio); 1838 1839 nfp->f_type = DTYPE_SOCKET; 1840 nfp->f_flag = fflag; 1841 nfp->f_ops = &socketops; 1842 nfp->f_data = so; 1843 1844 noconnection: 1845 /* 1846 * Assign the file pointer to the reserved descriptor, or clear 1847 * the reserved descriptor if an error occured. 1848 */ 1849 if (error) 1850 fsetfd(p, NULL, fd); 1851 else 1852 fsetfd(p, nfp, fd); 1853 crit_exit(); 1854 /* 1855 * Release explicitly held references before returning. 1856 */ 1857 done: 1858 if (nfp != NULL) 1859 fdrop(nfp); 1860 fdrop(lfp); 1861 return (error); 1862 #else /* SCTP */ 1863 return(EOPNOTSUPP); 1864 #endif /* SCTP */ 1865 } 1866