1 /* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 37 * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $ 38 * $DragonFly: src/sys/kern/uipc_syscalls.c,v 1.92 2008/11/26 13:10:56 sephe Exp $ 39 */ 40 41 #include "opt_ktrace.h" 42 #include "opt_sctp.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/kernel.h> 47 #include <sys/sysproto.h> 48 #include <sys/malloc.h> 49 #include <sys/filedesc.h> 50 #include <sys/event.h> 51 #include <sys/proc.h> 52 #include <sys/fcntl.h> 53 #include <sys/file.h> 54 #include <sys/filio.h> 55 #include <sys/kern_syscall.h> 56 #include <sys/mbuf.h> 57 #include <sys/protosw.h> 58 #include <sys/sfbuf.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/socketops.h> 62 #include <sys/uio.h> 63 #include <sys/vnode.h> 64 #include <sys/lock.h> 65 #include <sys/mount.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_object.h> 71 #include <vm/vm_page.h> 72 #include <vm/vm_pageout.h> 73 #include <vm/vm_kern.h> 74 #include <vm/vm_extern.h> 75 #include <sys/file2.h> 76 #include <sys/signalvar.h> 77 #include <sys/serialize.h> 78 79 #include <sys/thread2.h> 80 #include <sys/msgport2.h> 81 #include <sys/socketvar2.h> 82 #include <net/netmsg2.h> 83 84 #ifdef SCTP 85 #include <netinet/sctp_peeloff.h> 86 #endif /* SCTP */ 87 88 struct sfbuf_mref { 89 struct sf_buf *sf; 90 int mref_count; 91 }; 92 93 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile sfbuf ref structures"); 94 95 /* 96 * System call interface to the socket abstraction. 97 */ 98 99 extern struct fileops socketops; 100 101 /* 102 * socket_args(int domain, int type, int protocol) 103 */ 104 int 105 kern_socket(int domain, int type, int protocol, int *res) 106 { 107 struct thread *td = curthread; 108 struct proc *p = td->td_proc; 109 struct socket *so; 110 struct file *fp; 111 int fd, error; 112 113 KKASSERT(p); 114 115 error = falloc(p, &fp, &fd); 116 if (error) 117 return (error); 118 error = socreate(domain, &so, type, protocol, td); 119 if (error) { 120 fsetfd(p, NULL, fd); 121 } else { 122 fp->f_type = DTYPE_SOCKET; 123 fp->f_flag = FREAD | FWRITE; 124 fp->f_ops = &socketops; 125 fp->f_data = so; 126 *res = fd; 127 fsetfd(p, fp, fd); 128 } 129 fdrop(fp); 130 return (error); 131 } 132 133 int 134 sys_socket(struct socket_args *uap) 135 { 136 int error; 137 138 error = kern_socket(uap->domain, uap->type, uap->protocol, 139 &uap->sysmsg_result); 140 141 return (error); 142 } 143 144 int 145 kern_bind(int s, struct sockaddr *sa) 146 { 147 struct thread *td = curthread; 148 struct proc *p = td->td_proc; 149 struct file *fp; 150 int error; 151 152 KKASSERT(p); 153 error = holdsock(p->p_fd, s, &fp); 154 if (error) 155 return (error); 156 error = sobind((struct socket *)fp->f_data, sa, td); 157 fdrop(fp); 158 return (error); 159 } 160 161 /* 162 * bind_args(int s, caddr_t name, int namelen) 163 */ 164 int 165 sys_bind(struct bind_args *uap) 166 { 167 struct sockaddr *sa; 168 int error; 169 170 error = getsockaddr(&sa, uap->name, uap->namelen); 171 if (error) 172 return (error); 173 error = kern_bind(uap->s, sa); 174 FREE(sa, M_SONAME); 175 176 return (error); 177 } 178 179 int 180 kern_listen(int s, int backlog) 181 { 182 struct thread *td = curthread; 183 struct proc *p = td->td_proc; 184 struct file *fp; 185 int error; 186 187 KKASSERT(p); 188 error = holdsock(p->p_fd, s, &fp); 189 if (error) 190 return (error); 191 error = solisten((struct socket *)fp->f_data, backlog, td); 192 fdrop(fp); 193 return(error); 194 } 195 196 /* 197 * listen_args(int s, int backlog) 198 */ 199 int 200 sys_listen(struct listen_args *uap) 201 { 202 int error; 203 204 error = kern_listen(uap->s, uap->backlog); 205 return (error); 206 } 207 208 /* 209 * Returns the accepted socket as well. 210 */ 211 static boolean_t 212 soaccept_predicate(struct netmsg *msg0) 213 { 214 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 215 struct socket *head = msg->nm_so; 216 217 if (head->so_error != 0) { 218 msg->nm_netmsg.nm_lmsg.ms_error = head->so_error; 219 return (TRUE); 220 } 221 if (!TAILQ_EMPTY(&head->so_comp)) { 222 /* Abuse nm_so field as copy in/copy out parameter. XXX JH */ 223 msg->nm_so = TAILQ_FIRST(&head->so_comp); 224 TAILQ_REMOVE(&head->so_comp, msg->nm_so, so_list); 225 head->so_qlen--; 226 227 msg->nm_netmsg.nm_lmsg.ms_error = 0; 228 return (TRUE); 229 } 230 if (head->so_state & SS_CANTRCVMORE) { 231 msg->nm_netmsg.nm_lmsg.ms_error = ECONNABORTED; 232 return (TRUE); 233 } 234 if (msg->nm_fflags & FNONBLOCK) { 235 msg->nm_netmsg.nm_lmsg.ms_error = EWOULDBLOCK; 236 return (TRUE); 237 } 238 239 return (FALSE); 240 } 241 242 /* 243 * The second argument to kern_accept() is a handle to a struct sockaddr. 244 * This allows kern_accept() to return a pointer to an allocated struct 245 * sockaddr which must be freed later with FREE(). The caller must 246 * initialize *name to NULL. 247 */ 248 int 249 kern_accept(int s, int fflags, struct sockaddr **name, int *namelen, int *res) 250 { 251 struct thread *td = curthread; 252 struct proc *p = td->td_proc; 253 struct file *lfp = NULL; 254 struct file *nfp = NULL; 255 struct sockaddr *sa; 256 struct socket *head, *so; 257 struct netmsg_so_notify msg; 258 lwkt_port_t port; 259 int fd; 260 u_int fflag; /* type must match fp->f_flag */ 261 int error, tmp; 262 263 *res = -1; 264 if (name && namelen && *namelen < 0) 265 return (EINVAL); 266 267 error = holdsock(p->p_fd, s, &lfp); 268 if (error) 269 return (error); 270 271 error = falloc(p, &nfp, &fd); 272 if (error) { /* Probably ran out of file descriptors. */ 273 fdrop(lfp); 274 return (error); 275 } 276 head = (struct socket *)lfp->f_data; 277 if ((head->so_options & SO_ACCEPTCONN) == 0) { 278 error = EINVAL; 279 goto done; 280 } 281 282 if (fflags & O_FBLOCKING) 283 fflags |= lfp->f_flag & ~FNONBLOCK; 284 else if (fflags & O_FNONBLOCKING) 285 fflags |= lfp->f_flag | FNONBLOCK; 286 else 287 fflags = lfp->f_flag; 288 289 /* optimize for uniprocessor case later XXX JH */ 290 port = head->so_proto->pr_mport(head, NULL, NULL, PRU_PRED); 291 netmsg_init_abortable(&msg.nm_netmsg, &curthread->td_msgport, 292 0, 293 netmsg_so_notify, 294 netmsg_so_notify_doabort); 295 msg.nm_predicate = soaccept_predicate; 296 msg.nm_fflags = fflags; 297 msg.nm_so = head; 298 msg.nm_etype = NM_REVENT; 299 error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, PCATCH); 300 if (error) 301 goto done; 302 303 /* 304 * At this point we have the connection that's ready to be accepted. 305 */ 306 so = msg.nm_so; 307 308 fflag = lfp->f_flag; 309 310 /* connection has been removed from the listen queue */ 311 KNOTE(&head->so_rcv.ssb_sel.si_note, 0); 312 313 so->so_state &= ~SS_COMP; 314 so->so_head = NULL; 315 if (head->so_sigio != NULL) 316 fsetown(fgetown(head->so_sigio), &so->so_sigio); 317 318 nfp->f_type = DTYPE_SOCKET; 319 nfp->f_flag = fflag; 320 nfp->f_ops = &socketops; 321 nfp->f_data = so; 322 /* Sync socket nonblocking/async state with file flags */ 323 tmp = fflag & FNONBLOCK; 324 (void) fo_ioctl(nfp, FIONBIO, (caddr_t)&tmp, p->p_ucred); 325 tmp = fflag & FASYNC; 326 (void) fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, p->p_ucred); 327 328 sa = NULL; 329 error = soaccept(so, &sa); 330 331 /* 332 * Set the returned name and namelen as applicable. Set the returned 333 * namelen to 0 for older code which might ignore the return value 334 * from accept. 335 */ 336 if (error == 0) { 337 if (sa && name && namelen) { 338 if (*namelen > sa->sa_len) 339 *namelen = sa->sa_len; 340 *name = sa; 341 } else { 342 if (sa) 343 FREE(sa, M_SONAME); 344 } 345 } 346 347 done: 348 /* 349 * If an error occured clear the reserved descriptor, else associate 350 * nfp with it. 351 * 352 * Note that *res is normally ignored if an error is returned but 353 * a syscall message will still have access to the result code. 354 */ 355 if (error) { 356 fsetfd(p, NULL, fd); 357 } else { 358 *res = fd; 359 fsetfd(p, nfp, fd); 360 } 361 fdrop(nfp); 362 fdrop(lfp); 363 return (error); 364 } 365 366 /* 367 * accept(int s, caddr_t name, int *anamelen) 368 */ 369 int 370 sys_accept(struct accept_args *uap) 371 { 372 struct sockaddr *sa = NULL; 373 int sa_len; 374 int error; 375 376 if (uap->name) { 377 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 378 if (error) 379 return (error); 380 381 error = kern_accept(uap->s, 0, &sa, &sa_len, &uap->sysmsg_result); 382 383 if (error == 0) 384 error = copyout(sa, uap->name, sa_len); 385 if (error == 0) { 386 error = copyout(&sa_len, uap->anamelen, 387 sizeof(*uap->anamelen)); 388 } 389 if (sa) 390 FREE(sa, M_SONAME); 391 } else { 392 error = kern_accept(uap->s, 0, NULL, 0, &uap->sysmsg_result); 393 } 394 return (error); 395 } 396 397 /* 398 * extaccept(int s, int fflags, caddr_t name, int *anamelen) 399 */ 400 int 401 sys_extaccept(struct extaccept_args *uap) 402 { 403 struct sockaddr *sa = NULL; 404 int sa_len; 405 int error; 406 int fflags = uap->flags & O_FMASK; 407 408 if (uap->name) { 409 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 410 if (error) 411 return (error); 412 413 error = kern_accept(uap->s, fflags, &sa, &sa_len, &uap->sysmsg_result); 414 415 if (error == 0) 416 error = copyout(sa, uap->name, sa_len); 417 if (error == 0) { 418 error = copyout(&sa_len, uap->anamelen, 419 sizeof(*uap->anamelen)); 420 } 421 if (sa) 422 FREE(sa, M_SONAME); 423 } else { 424 error = kern_accept(uap->s, fflags, NULL, 0, &uap->sysmsg_result); 425 } 426 return (error); 427 } 428 429 430 /* 431 * Returns TRUE if predicate satisfied. 432 */ 433 static boolean_t 434 soconnected_predicate(struct netmsg *msg0) 435 { 436 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 437 struct socket *so = msg->nm_so; 438 439 /* check predicate */ 440 if (!(so->so_state & SS_ISCONNECTING) || so->so_error != 0) { 441 msg->nm_netmsg.nm_lmsg.ms_error = so->so_error; 442 return (TRUE); 443 } 444 445 return (FALSE); 446 } 447 448 int 449 kern_connect(int s, int fflags, struct sockaddr *sa) 450 { 451 struct thread *td = curthread; 452 struct proc *p = td->td_proc; 453 struct file *fp; 454 struct socket *so; 455 int error, interrupted = 0; 456 457 error = holdsock(p->p_fd, s, &fp); 458 if (error) 459 return (error); 460 so = (struct socket *)fp->f_data; 461 462 if (fflags & O_FBLOCKING) 463 /* fflags &= ~FNONBLOCK; */; 464 else if (fflags & O_FNONBLOCKING) 465 fflags |= FNONBLOCK; 466 else 467 fflags = fp->f_flag; 468 469 if (so->so_state & SS_ISCONNECTING) { 470 error = EALREADY; 471 goto done; 472 } 473 error = soconnect(so, sa, td); 474 if (error) 475 goto bad; 476 if ((fflags & FNONBLOCK) && (so->so_state & SS_ISCONNECTING)) { 477 error = EINPROGRESS; 478 goto done; 479 } 480 if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 481 struct netmsg_so_notify msg; 482 lwkt_port_t port; 483 484 port = so->so_proto->pr_mport(so, sa, NULL, PRU_PRED); 485 netmsg_init_abortable(&msg.nm_netmsg, 486 &curthread->td_msgport, 487 0, 488 netmsg_so_notify, 489 netmsg_so_notify_doabort); 490 msg.nm_predicate = soconnected_predicate; 491 msg.nm_so = so; 492 msg.nm_etype = NM_REVENT; 493 error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, PCATCH); 494 if (error == EINTR || error == ERESTART) 495 interrupted = 1; 496 } 497 if (error == 0) { 498 error = so->so_error; 499 so->so_error = 0; 500 } 501 bad: 502 if (!interrupted) 503 so->so_state &= ~SS_ISCONNECTING; 504 if (error == ERESTART) 505 error = EINTR; 506 done: 507 fdrop(fp); 508 return (error); 509 } 510 511 /* 512 * connect_args(int s, caddr_t name, int namelen) 513 */ 514 int 515 sys_connect(struct connect_args *uap) 516 { 517 struct sockaddr *sa; 518 int error; 519 520 error = getsockaddr(&sa, uap->name, uap->namelen); 521 if (error) 522 return (error); 523 error = kern_connect(uap->s, 0, sa); 524 FREE(sa, M_SONAME); 525 526 return (error); 527 } 528 529 /* 530 * connect_args(int s, int fflags, caddr_t name, int namelen) 531 */ 532 int 533 sys_extconnect(struct extconnect_args *uap) 534 { 535 struct sockaddr *sa; 536 int error; 537 int fflags = uap->flags & O_FMASK; 538 539 error = getsockaddr(&sa, uap->name, uap->namelen); 540 if (error) 541 return (error); 542 error = kern_connect(uap->s, fflags, sa); 543 FREE(sa, M_SONAME); 544 545 return (error); 546 } 547 548 int 549 kern_socketpair(int domain, int type, int protocol, int *sv) 550 { 551 struct thread *td = curthread; 552 struct proc *p = td->td_proc; 553 struct file *fp1, *fp2; 554 struct socket *so1, *so2; 555 int fd1, fd2, error; 556 557 KKASSERT(p); 558 error = socreate(domain, &so1, type, protocol, td); 559 if (error) 560 return (error); 561 error = socreate(domain, &so2, type, protocol, td); 562 if (error) 563 goto free1; 564 error = falloc(p, &fp1, &fd1); 565 if (error) 566 goto free2; 567 sv[0] = fd1; 568 fp1->f_data = so1; 569 error = falloc(p, &fp2, &fd2); 570 if (error) 571 goto free3; 572 fp2->f_data = so2; 573 sv[1] = fd2; 574 error = soconnect2(so1, so2); 575 if (error) 576 goto free4; 577 if (type == SOCK_DGRAM) { 578 /* 579 * Datagram socket connection is asymmetric. 580 */ 581 error = soconnect2(so2, so1); 582 if (error) 583 goto free4; 584 } 585 fp1->f_type = fp2->f_type = DTYPE_SOCKET; 586 fp1->f_flag = fp2->f_flag = FREAD|FWRITE; 587 fp1->f_ops = fp2->f_ops = &socketops; 588 fsetfd(p, fp1, fd1); 589 fsetfd(p, fp2, fd2); 590 fdrop(fp1); 591 fdrop(fp2); 592 return (error); 593 free4: 594 fsetfd(p, NULL, fd2); 595 fdrop(fp2); 596 free3: 597 fsetfd(p, NULL, fd1); 598 fdrop(fp1); 599 free2: 600 (void)soclose(so2, 0); 601 free1: 602 (void)soclose(so1, 0); 603 return (error); 604 } 605 606 /* 607 * socketpair(int domain, int type, int protocol, int *rsv) 608 */ 609 int 610 sys_socketpair(struct socketpair_args *uap) 611 { 612 int error, sockv[2]; 613 614 error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv); 615 616 if (error == 0) 617 error = copyout(sockv, uap->rsv, sizeof(sockv)); 618 return (error); 619 } 620 621 int 622 kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio, 623 struct mbuf *control, int flags, int *res) 624 { 625 struct thread *td = curthread; 626 struct lwp *lp = td->td_lwp; 627 struct proc *p = td->td_proc; 628 struct file *fp; 629 int len, error; 630 struct socket *so; 631 #ifdef KTRACE 632 struct iovec *ktriov = NULL; 633 struct uio ktruio; 634 #endif 635 636 error = holdsock(p->p_fd, s, &fp); 637 if (error) 638 return (error); 639 if (auio->uio_resid < 0) { 640 error = EINVAL; 641 goto done; 642 } 643 #ifdef KTRACE 644 if (KTRPOINT(td, KTR_GENIO)) { 645 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 646 647 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 648 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 649 ktruio = *auio; 650 } 651 #endif 652 len = auio->uio_resid; 653 so = (struct socket *)fp->f_data; 654 if ((flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) { 655 if (fp->f_flag & FNONBLOCK) 656 flags |= MSG_FNONBLOCKING; 657 } 658 error = so_pru_sosend(so, sa, auio, NULL, control, flags, td); 659 if (error) { 660 if (auio->uio_resid != len && (error == ERESTART || 661 error == EINTR || error == EWOULDBLOCK)) 662 error = 0; 663 if (error == EPIPE) 664 lwpsignal(p, lp, SIGPIPE); 665 } 666 #ifdef KTRACE 667 if (ktriov != NULL) { 668 if (error == 0) { 669 ktruio.uio_iov = ktriov; 670 ktruio.uio_resid = len - auio->uio_resid; 671 ktrgenio(lp, s, UIO_WRITE, &ktruio, error); 672 } 673 FREE(ktriov, M_TEMP); 674 } 675 #endif 676 if (error == 0) 677 *res = len - auio->uio_resid; 678 done: 679 fdrop(fp); 680 return (error); 681 } 682 683 /* 684 * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen) 685 */ 686 int 687 sys_sendto(struct sendto_args *uap) 688 { 689 struct thread *td = curthread; 690 struct uio auio; 691 struct iovec aiov; 692 struct sockaddr *sa = NULL; 693 int error; 694 695 if (uap->to) { 696 error = getsockaddr(&sa, uap->to, uap->tolen); 697 if (error) 698 return (error); 699 } 700 aiov.iov_base = uap->buf; 701 aiov.iov_len = uap->len; 702 auio.uio_iov = &aiov; 703 auio.uio_iovcnt = 1; 704 auio.uio_offset = 0; 705 auio.uio_resid = uap->len; 706 auio.uio_segflg = UIO_USERSPACE; 707 auio.uio_rw = UIO_WRITE; 708 auio.uio_td = td; 709 710 error = kern_sendmsg(uap->s, sa, &auio, NULL, uap->flags, 711 &uap->sysmsg_result); 712 713 if (sa) 714 FREE(sa, M_SONAME); 715 return (error); 716 } 717 718 /* 719 * sendmsg_args(int s, caddr_t msg, int flags) 720 */ 721 int 722 sys_sendmsg(struct sendmsg_args *uap) 723 { 724 struct thread *td = curthread; 725 struct msghdr msg; 726 struct uio auio; 727 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 728 struct sockaddr *sa = NULL; 729 struct mbuf *control = NULL; 730 int error; 731 732 error = copyin(uap->msg, (caddr_t)&msg, sizeof(msg)); 733 if (error) 734 return (error); 735 736 /* 737 * Conditionally copyin msg.msg_name. 738 */ 739 if (msg.msg_name) { 740 error = getsockaddr(&sa, msg.msg_name, msg.msg_namelen); 741 if (error) 742 return (error); 743 } 744 745 /* 746 * Populate auio. 747 */ 748 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 749 &auio.uio_resid); 750 if (error) 751 goto cleanup2; 752 auio.uio_iov = iov; 753 auio.uio_iovcnt = msg.msg_iovlen; 754 auio.uio_offset = 0; 755 auio.uio_segflg = UIO_USERSPACE; 756 auio.uio_rw = UIO_WRITE; 757 auio.uio_td = td; 758 759 /* 760 * Conditionally copyin msg.msg_control. 761 */ 762 if (msg.msg_control) { 763 if (msg.msg_controllen < sizeof(struct cmsghdr) || 764 msg.msg_controllen > MLEN) { 765 error = EINVAL; 766 goto cleanup; 767 } 768 control = m_get(MB_WAIT, MT_CONTROL); 769 if (control == NULL) { 770 error = ENOBUFS; 771 goto cleanup; 772 } 773 control->m_len = msg.msg_controllen; 774 error = copyin(msg.msg_control, mtod(control, caddr_t), 775 msg.msg_controllen); 776 if (error) { 777 m_free(control); 778 goto cleanup; 779 } 780 } 781 782 error = kern_sendmsg(uap->s, sa, &auio, control, uap->flags, 783 &uap->sysmsg_result); 784 785 cleanup: 786 iovec_free(&iov, aiov); 787 cleanup2: 788 if (sa) 789 FREE(sa, M_SONAME); 790 return (error); 791 } 792 793 /* 794 * kern_recvmsg() takes a handle to sa and control. If the handle is non- 795 * null, it returns a dynamically allocated struct sockaddr and an mbuf. 796 * Don't forget to FREE() and m_free() these if they are returned. 797 */ 798 int 799 kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, 800 struct mbuf **control, int *flags, int *res) 801 { 802 struct thread *td = curthread; 803 struct proc *p = td->td_proc; 804 struct file *fp; 805 int len, error; 806 int lflags; 807 struct socket *so; 808 #ifdef KTRACE 809 struct iovec *ktriov = NULL; 810 struct uio ktruio; 811 #endif 812 813 error = holdsock(p->p_fd, s, &fp); 814 if (error) 815 return (error); 816 if (auio->uio_resid < 0) { 817 error = EINVAL; 818 goto done; 819 } 820 #ifdef KTRACE 821 if (KTRPOINT(td, KTR_GENIO)) { 822 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 823 824 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 825 bcopy(auio->uio_iov, ktriov, iovlen); 826 ktruio = *auio; 827 } 828 #endif 829 len = auio->uio_resid; 830 so = (struct socket *)fp->f_data; 831 832 if (flags == NULL || (*flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) { 833 if (fp->f_flag & FNONBLOCK) { 834 if (flags) { 835 *flags |= MSG_FNONBLOCKING; 836 } else { 837 lflags = MSG_FNONBLOCKING; 838 flags = &lflags; 839 } 840 } 841 } 842 843 error = so_pru_soreceive(so, sa, auio, NULL, control, flags); 844 if (error) { 845 if (auio->uio_resid != len && (error == ERESTART || 846 error == EINTR || error == EWOULDBLOCK)) 847 error = 0; 848 } 849 #ifdef KTRACE 850 if (ktriov != NULL) { 851 if (error == 0) { 852 ktruio.uio_iov = ktriov; 853 ktruio.uio_resid = len - auio->uio_resid; 854 ktrgenio(td->td_lwp, s, UIO_READ, &ktruio, error); 855 } 856 FREE(ktriov, M_TEMP); 857 } 858 #endif 859 if (error == 0) 860 *res = len - auio->uio_resid; 861 done: 862 fdrop(fp); 863 return (error); 864 } 865 866 /* 867 * recvfrom_args(int s, caddr_t buf, size_t len, int flags, 868 * caddr_t from, int *fromlenaddr) 869 */ 870 int 871 sys_recvfrom(struct recvfrom_args *uap) 872 { 873 struct thread *td = curthread; 874 struct uio auio; 875 struct iovec aiov; 876 struct sockaddr *sa = NULL; 877 int error, fromlen; 878 879 if (uap->from && uap->fromlenaddr) { 880 error = copyin(uap->fromlenaddr, &fromlen, sizeof(fromlen)); 881 if (error) 882 return (error); 883 if (fromlen < 0) 884 return (EINVAL); 885 } else { 886 fromlen = 0; 887 } 888 aiov.iov_base = uap->buf; 889 aiov.iov_len = uap->len; 890 auio.uio_iov = &aiov; 891 auio.uio_iovcnt = 1; 892 auio.uio_offset = 0; 893 auio.uio_resid = uap->len; 894 auio.uio_segflg = UIO_USERSPACE; 895 auio.uio_rw = UIO_READ; 896 auio.uio_td = td; 897 898 error = kern_recvmsg(uap->s, uap->from ? &sa : NULL, &auio, NULL, 899 &uap->flags, &uap->sysmsg_result); 900 901 if (error == 0 && uap->from) { 902 /* note: sa may still be NULL */ 903 if (sa) { 904 fromlen = MIN(fromlen, sa->sa_len); 905 error = copyout(sa, uap->from, fromlen); 906 } else { 907 fromlen = 0; 908 } 909 if (error == 0) { 910 error = copyout(&fromlen, uap->fromlenaddr, 911 sizeof(fromlen)); 912 } 913 } 914 if (sa) 915 FREE(sa, M_SONAME); 916 917 return (error); 918 } 919 920 /* 921 * recvmsg_args(int s, struct msghdr *msg, int flags) 922 */ 923 int 924 sys_recvmsg(struct recvmsg_args *uap) 925 { 926 struct thread *td = curthread; 927 struct msghdr msg; 928 struct uio auio; 929 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 930 struct mbuf *m, *control = NULL; 931 struct sockaddr *sa = NULL; 932 caddr_t ctlbuf; 933 socklen_t *ufromlenp, *ucontrollenp; 934 int error, fromlen, controllen, len, flags, *uflagsp; 935 936 /* 937 * This copyin handles everything except the iovec. 938 */ 939 error = copyin(uap->msg, &msg, sizeof(msg)); 940 if (error) 941 return (error); 942 943 if (msg.msg_name && msg.msg_namelen < 0) 944 return (EINVAL); 945 if (msg.msg_control && msg.msg_controllen < 0) 946 return (EINVAL); 947 948 ufromlenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 949 msg_namelen)); 950 ucontrollenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 951 msg_controllen)); 952 uflagsp = (int *)((caddr_t)uap->msg + offsetof(struct msghdr, 953 msg_flags)); 954 955 /* 956 * Populate auio. 957 */ 958 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 959 &auio.uio_resid); 960 if (error) 961 return (error); 962 auio.uio_iov = iov; 963 auio.uio_iovcnt = msg.msg_iovlen; 964 auio.uio_offset = 0; 965 auio.uio_segflg = UIO_USERSPACE; 966 auio.uio_rw = UIO_READ; 967 auio.uio_td = td; 968 969 flags = uap->flags; 970 971 error = kern_recvmsg(uap->s, msg.msg_name ? &sa : NULL, &auio, 972 msg.msg_control ? &control : NULL, &flags, &uap->sysmsg_result); 973 974 /* 975 * Conditionally copyout the name and populate the namelen field. 976 */ 977 if (error == 0 && msg.msg_name) { 978 /* note: sa may still be NULL */ 979 if (sa != NULL) { 980 fromlen = MIN(msg.msg_namelen, sa->sa_len); 981 error = copyout(sa, msg.msg_name, fromlen); 982 } else { 983 fromlen = 0; 984 } 985 if (error == 0) 986 error = copyout(&fromlen, ufromlenp, 987 sizeof(*ufromlenp)); 988 } 989 990 /* 991 * Copyout msg.msg_control and msg.msg_controllen. 992 */ 993 if (error == 0 && msg.msg_control) { 994 len = msg.msg_controllen; 995 m = control; 996 ctlbuf = (caddr_t)msg.msg_control; 997 998 while(m && len > 0) { 999 unsigned int tocopy; 1000 1001 if (len >= m->m_len) { 1002 tocopy = m->m_len; 1003 } else { 1004 msg.msg_flags |= MSG_CTRUNC; 1005 tocopy = len; 1006 } 1007 1008 error = copyout(mtod(m, caddr_t), ctlbuf, tocopy); 1009 if (error) 1010 goto cleanup; 1011 1012 ctlbuf += tocopy; 1013 len -= tocopy; 1014 m = m->m_next; 1015 } 1016 controllen = ctlbuf - (caddr_t)msg.msg_control; 1017 error = copyout(&controllen, ucontrollenp, 1018 sizeof(*ucontrollenp)); 1019 } 1020 1021 if (error == 0) 1022 error = copyout(&flags, uflagsp, sizeof(*uflagsp)); 1023 1024 cleanup: 1025 if (sa) 1026 FREE(sa, M_SONAME); 1027 iovec_free(&iov, aiov); 1028 if (control) 1029 m_freem(control); 1030 return (error); 1031 } 1032 1033 /* 1034 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1035 * in kernel pointer instead of a userland pointer. This allows us 1036 * to manipulate socket options in the emulation code. 1037 */ 1038 int 1039 kern_setsockopt(int s, struct sockopt *sopt) 1040 { 1041 struct thread *td = curthread; 1042 struct proc *p = td->td_proc; 1043 struct file *fp; 1044 int error; 1045 1046 if (sopt->sopt_val == NULL && sopt->sopt_valsize != 0) 1047 return (EFAULT); 1048 if (sopt->sopt_valsize < 0) 1049 return (EINVAL); 1050 1051 error = holdsock(p->p_fd, s, &fp); 1052 if (error) 1053 return (error); 1054 1055 error = sosetopt((struct socket *)fp->f_data, sopt); 1056 fdrop(fp); 1057 return (error); 1058 } 1059 1060 /* 1061 * setsockopt_args(int s, int level, int name, caddr_t val, int valsize) 1062 */ 1063 int 1064 sys_setsockopt(struct setsockopt_args *uap) 1065 { 1066 struct thread *td = curthread; 1067 struct sockopt sopt; 1068 int error; 1069 1070 sopt.sopt_level = uap->level; 1071 sopt.sopt_name = uap->name; 1072 sopt.sopt_valsize = uap->valsize; 1073 sopt.sopt_td = td; 1074 sopt.sopt_val = NULL; 1075 1076 if (sopt.sopt_valsize < 0 || sopt.sopt_valsize > SOMAXOPT_SIZE) 1077 return (EINVAL); 1078 if (uap->val) { 1079 sopt.sopt_val = kmalloc(sopt.sopt_valsize, M_TEMP, M_WAITOK); 1080 error = copyin(uap->val, sopt.sopt_val, sopt.sopt_valsize); 1081 if (error) 1082 goto out; 1083 } 1084 1085 error = kern_setsockopt(uap->s, &sopt); 1086 out: 1087 if (uap->val) 1088 kfree(sopt.sopt_val, M_TEMP); 1089 return(error); 1090 } 1091 1092 /* 1093 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1094 * in kernel pointer instead of a userland pointer. This allows us 1095 * to manipulate socket options in the emulation code. 1096 */ 1097 int 1098 kern_getsockopt(int s, struct sockopt *sopt) 1099 { 1100 struct thread *td = curthread; 1101 struct proc *p = td->td_proc; 1102 struct file *fp; 1103 int error; 1104 1105 if (sopt->sopt_val == NULL && sopt->sopt_valsize != 0) 1106 return (EFAULT); 1107 if (sopt->sopt_valsize < 0 || sopt->sopt_valsize > SOMAXOPT_SIZE) 1108 return (EINVAL); 1109 1110 error = holdsock(p->p_fd, s, &fp); 1111 if (error) 1112 return (error); 1113 1114 error = sogetopt((struct socket *)fp->f_data, sopt); 1115 fdrop(fp); 1116 return (error); 1117 } 1118 1119 /* 1120 * getsockopt_Args(int s, int level, int name, caddr_t val, int *avalsize) 1121 */ 1122 int 1123 sys_getsockopt(struct getsockopt_args *uap) 1124 { 1125 struct thread *td = curthread; 1126 struct sockopt sopt; 1127 int error, valsize; 1128 1129 if (uap->val) { 1130 error = copyin(uap->avalsize, &valsize, sizeof(valsize)); 1131 if (error) 1132 return (error); 1133 } else { 1134 valsize = 0; 1135 } 1136 1137 sopt.sopt_level = uap->level; 1138 sopt.sopt_name = uap->name; 1139 sopt.sopt_valsize = valsize; 1140 sopt.sopt_td = td; 1141 sopt.sopt_val = NULL; 1142 1143 if (sopt.sopt_valsize < 0 || sopt.sopt_valsize > SOMAXOPT_SIZE) 1144 return (EINVAL); 1145 if (uap->val) { 1146 sopt.sopt_val = kmalloc(sopt.sopt_valsize, M_TEMP, M_WAITOK); 1147 error = copyin(uap->val, sopt.sopt_val, sopt.sopt_valsize); 1148 if (error) 1149 goto out; 1150 } 1151 1152 error = kern_getsockopt(uap->s, &sopt); 1153 if (error) 1154 goto out; 1155 valsize = sopt.sopt_valsize; 1156 error = copyout(&valsize, uap->avalsize, sizeof(valsize)); 1157 if (error) 1158 goto out; 1159 if (uap->val) 1160 error = copyout(sopt.sopt_val, uap->val, sopt.sopt_valsize); 1161 out: 1162 if (uap->val) 1163 kfree(sopt.sopt_val, M_TEMP); 1164 return (error); 1165 } 1166 1167 /* 1168 * The second argument to kern_getsockname() is a handle to a struct sockaddr. 1169 * This allows kern_getsockname() to return a pointer to an allocated struct 1170 * sockaddr which must be freed later with FREE(). The caller must 1171 * initialize *name to NULL. 1172 */ 1173 int 1174 kern_getsockname(int s, struct sockaddr **name, int *namelen) 1175 { 1176 struct thread *td = curthread; 1177 struct proc *p = td->td_proc; 1178 struct file *fp; 1179 struct socket *so; 1180 struct sockaddr *sa = NULL; 1181 int error; 1182 1183 error = holdsock(p->p_fd, s, &fp); 1184 if (error) 1185 return (error); 1186 if (*namelen < 0) { 1187 fdrop(fp); 1188 return (EINVAL); 1189 } 1190 so = (struct socket *)fp->f_data; 1191 error = so_pru_sockaddr(so, &sa); 1192 if (error == 0) { 1193 if (sa == NULL) { 1194 *namelen = 0; 1195 } else { 1196 *namelen = MIN(*namelen, sa->sa_len); 1197 *name = sa; 1198 } 1199 } 1200 1201 fdrop(fp); 1202 return (error); 1203 } 1204 1205 /* 1206 * getsockname_args(int fdes, caddr_t asa, int *alen) 1207 * 1208 * Get socket name. 1209 */ 1210 int 1211 sys_getsockname(struct getsockname_args *uap) 1212 { 1213 struct sockaddr *sa = NULL; 1214 int error, sa_len; 1215 1216 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1217 if (error) 1218 return (error); 1219 1220 error = kern_getsockname(uap->fdes, &sa, &sa_len); 1221 1222 if (error == 0) 1223 error = copyout(sa, uap->asa, sa_len); 1224 if (error == 0) 1225 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1226 if (sa) 1227 FREE(sa, M_SONAME); 1228 return (error); 1229 } 1230 1231 /* 1232 * The second argument to kern_getpeername() is a handle to a struct sockaddr. 1233 * This allows kern_getpeername() to return a pointer to an allocated struct 1234 * sockaddr which must be freed later with FREE(). The caller must 1235 * initialize *name to NULL. 1236 */ 1237 int 1238 kern_getpeername(int s, struct sockaddr **name, int *namelen) 1239 { 1240 struct thread *td = curthread; 1241 struct proc *p = td->td_proc; 1242 struct file *fp; 1243 struct socket *so; 1244 struct sockaddr *sa = NULL; 1245 int error; 1246 1247 error = holdsock(p->p_fd, s, &fp); 1248 if (error) 1249 return (error); 1250 if (*namelen < 0) { 1251 fdrop(fp); 1252 return (EINVAL); 1253 } 1254 so = (struct socket *)fp->f_data; 1255 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1256 fdrop(fp); 1257 return (ENOTCONN); 1258 } 1259 error = so_pru_peeraddr(so, &sa); 1260 if (error == 0) { 1261 if (sa == NULL) { 1262 *namelen = 0; 1263 } else { 1264 *namelen = MIN(*namelen, sa->sa_len); 1265 *name = sa; 1266 } 1267 } 1268 1269 fdrop(fp); 1270 return (error); 1271 } 1272 1273 /* 1274 * getpeername_args(int fdes, caddr_t asa, int *alen) 1275 * 1276 * Get name of peer for connected socket. 1277 */ 1278 int 1279 sys_getpeername(struct getpeername_args *uap) 1280 { 1281 struct sockaddr *sa = NULL; 1282 int error, sa_len; 1283 1284 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1285 if (error) 1286 return (error); 1287 1288 error = kern_getpeername(uap->fdes, &sa, &sa_len); 1289 1290 if (error == 0) 1291 error = copyout(sa, uap->asa, sa_len); 1292 if (error == 0) 1293 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1294 if (sa) 1295 FREE(sa, M_SONAME); 1296 return (error); 1297 } 1298 1299 int 1300 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) 1301 { 1302 struct sockaddr *sa; 1303 int error; 1304 1305 *namp = NULL; 1306 if (len > SOCK_MAXADDRLEN) 1307 return ENAMETOOLONG; 1308 if (len < offsetof(struct sockaddr, sa_data[0])) 1309 return EDOM; 1310 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); 1311 error = copyin(uaddr, sa, len); 1312 if (error) { 1313 FREE(sa, M_SONAME); 1314 } else { 1315 #if BYTE_ORDER != BIG_ENDIAN 1316 /* 1317 * The bind(), connect(), and sendto() syscalls were not 1318 * versioned for COMPAT_43. Thus, this check must stay. 1319 */ 1320 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1321 sa->sa_family = sa->sa_len; 1322 #endif 1323 sa->sa_len = len; 1324 *namp = sa; 1325 } 1326 return error; 1327 } 1328 1329 /* 1330 * Detach a mapped page and release resources back to the system. 1331 * We must release our wiring and if the object is ripped out 1332 * from under the vm_page we become responsible for freeing the 1333 * page. These routines must be MPSAFE. 1334 * 1335 * XXX HACK XXX TEMPORARY UNTIL WE IMPLEMENT EXT MBUF REFERENCE COUNTING 1336 * 1337 * XXX vm_page_*() routines are not MPSAFE yet, the MP lock is required. 1338 */ 1339 static void 1340 sf_buf_mref(void *arg) 1341 { 1342 struct sfbuf_mref *sfm = arg; 1343 1344 /* 1345 * We must already hold a ref so there is no race to 0, just 1346 * atomically increment the count. 1347 */ 1348 atomic_add_int(&sfm->mref_count, 1); 1349 } 1350 1351 static void 1352 sf_buf_mfree(void *arg) 1353 { 1354 struct sfbuf_mref *sfm = arg; 1355 vm_page_t m; 1356 1357 KKASSERT(sfm->mref_count > 0); 1358 if (atomic_fetchadd_int(&sfm->mref_count, -1) == 1) { 1359 /* 1360 * XXX vm_page_*() and SFBUF routines not MPSAFE yet. 1361 */ 1362 get_mplock(); 1363 crit_enter(); 1364 m = sf_buf_page(sfm->sf); 1365 sf_buf_free(sfm->sf); 1366 vm_page_unwire(m, 0); 1367 if (m->wire_count == 0 && m->object == NULL) 1368 vm_page_try_to_free(m); 1369 crit_exit(); 1370 rel_mplock(); 1371 kfree(sfm, M_SENDFILE); 1372 } 1373 } 1374 1375 /* 1376 * sendfile(2). 1377 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1378 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1379 * 1380 * Send a file specified by 'fd' and starting at 'offset' to a socket 1381 * specified by 's'. Send only 'nbytes' of the file or until EOF if 1382 * nbytes == 0. Optionally add a header and/or trailer to the socket 1383 * output. If specified, write the total number of bytes sent into *sbytes. 1384 * 1385 * In FreeBSD kern/uipc_syscalls.c,v 1.103, a bug was fixed that caused 1386 * the headers to count against the remaining bytes to be sent from 1387 * the file descriptor. We may wish to implement a compatibility syscall 1388 * in the future. 1389 */ 1390 int 1391 sys_sendfile(struct sendfile_args *uap) 1392 { 1393 struct thread *td = curthread; 1394 struct proc *p = td->td_proc; 1395 struct file *fp; 1396 struct vnode *vp = NULL; 1397 struct sf_hdtr hdtr; 1398 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 1399 struct uio auio; 1400 struct mbuf *mheader = NULL; 1401 off_t hdtr_size = 0, sbytes; 1402 int error, hbytes = 0, tbytes; 1403 1404 KKASSERT(p); 1405 1406 /* 1407 * Do argument checking. Must be a regular file in, stream 1408 * type and connected socket out, positive offset. 1409 */ 1410 fp = holdfp(p->p_fd, uap->fd, FREAD); 1411 if (fp == NULL) { 1412 return (EBADF); 1413 } 1414 if (fp->f_type != DTYPE_VNODE) { 1415 fdrop(fp); 1416 return (EINVAL); 1417 } 1418 vp = (struct vnode *)fp->f_data; 1419 vref(vp); 1420 fdrop(fp); 1421 1422 /* 1423 * If specified, get the pointer to the sf_hdtr struct for 1424 * any headers/trailers. 1425 */ 1426 if (uap->hdtr) { 1427 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1428 if (error) 1429 goto done; 1430 /* 1431 * Send any headers. 1432 */ 1433 if (hdtr.headers) { 1434 error = iovec_copyin(hdtr.headers, &iov, aiov, 1435 hdtr.hdr_cnt, &hbytes); 1436 if (error) 1437 goto done; 1438 auio.uio_iov = iov; 1439 auio.uio_iovcnt = hdtr.hdr_cnt; 1440 auio.uio_offset = 0; 1441 auio.uio_segflg = UIO_USERSPACE; 1442 auio.uio_rw = UIO_WRITE; 1443 auio.uio_td = td; 1444 auio.uio_resid = hbytes; 1445 1446 mheader = m_uiomove(&auio); 1447 1448 iovec_free(&iov, aiov); 1449 if (mheader == NULL) 1450 goto done; 1451 } 1452 } 1453 1454 error = kern_sendfile(vp, uap->s, uap->offset, uap->nbytes, mheader, 1455 &sbytes, uap->flags); 1456 if (error) 1457 goto done; 1458 1459 /* 1460 * Send trailers. Wimp out and use writev(2). 1461 */ 1462 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1463 error = iovec_copyin(hdtr.trailers, &iov, aiov, 1464 hdtr.trl_cnt, &auio.uio_resid); 1465 if (error) 1466 goto done; 1467 auio.uio_iov = iov; 1468 auio.uio_iovcnt = hdtr.trl_cnt; 1469 auio.uio_offset = 0; 1470 auio.uio_segflg = UIO_USERSPACE; 1471 auio.uio_rw = UIO_WRITE; 1472 auio.uio_td = td; 1473 1474 error = kern_sendmsg(uap->s, NULL, &auio, NULL, 0, &tbytes); 1475 1476 iovec_free(&iov, aiov); 1477 if (error) 1478 goto done; 1479 hdtr_size += tbytes; /* trailer bytes successfully sent */ 1480 } 1481 1482 done: 1483 if (uap->sbytes != NULL) { 1484 sbytes += hdtr_size; 1485 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1486 } 1487 if (vp) 1488 vrele(vp); 1489 return (error); 1490 } 1491 1492 int 1493 kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, 1494 struct mbuf *mheader, off_t *sbytes, int flags) 1495 { 1496 struct thread *td = curthread; 1497 struct proc *p = td->td_proc; 1498 struct vm_object *obj; 1499 struct socket *so; 1500 struct file *fp; 1501 struct mbuf *m; 1502 struct sf_buf *sf; 1503 struct sfbuf_mref *sfm; 1504 struct vm_page *pg; 1505 off_t off, xfsize; 1506 off_t hbytes = 0; 1507 int error = 0; 1508 1509 if (vp->v_type != VREG) { 1510 error = EINVAL; 1511 goto done0; 1512 } 1513 if ((obj = vp->v_object) == NULL) { 1514 error = EINVAL; 1515 goto done0; 1516 } 1517 error = holdsock(p->p_fd, sfd, &fp); 1518 if (error) 1519 goto done0; 1520 so = (struct socket *)fp->f_data; 1521 if (so->so_type != SOCK_STREAM) { 1522 error = EINVAL; 1523 goto done; 1524 } 1525 if ((so->so_state & SS_ISCONNECTED) == 0) { 1526 error = ENOTCONN; 1527 goto done; 1528 } 1529 if (offset < 0) { 1530 error = EINVAL; 1531 goto done; 1532 } 1533 1534 *sbytes = 0; 1535 /* 1536 * Protect against multiple writers to the socket. 1537 */ 1538 ssb_lock(&so->so_snd, M_WAITOK); 1539 1540 /* 1541 * Loop through the pages in the file, starting with the requested 1542 * offset. Get a file page (do I/O if necessary), map the file page 1543 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1544 * it on the socket. 1545 */ 1546 for (off = offset; ; off += xfsize, *sbytes += xfsize + hbytes) { 1547 vm_pindex_t pindex; 1548 vm_offset_t pgoff; 1549 1550 pindex = OFF_TO_IDX(off); 1551 retry_lookup: 1552 /* 1553 * Calculate the amount to transfer. Not to exceed a page, 1554 * the EOF, or the passed in nbytes. 1555 */ 1556 xfsize = vp->v_filesize - off; 1557 if (xfsize > PAGE_SIZE) 1558 xfsize = PAGE_SIZE; 1559 pgoff = (vm_offset_t)(off & PAGE_MASK); 1560 if (PAGE_SIZE - pgoff < xfsize) 1561 xfsize = PAGE_SIZE - pgoff; 1562 if (nbytes && xfsize > (nbytes - *sbytes)) 1563 xfsize = nbytes - *sbytes; 1564 if (xfsize <= 0) 1565 break; 1566 /* 1567 * Optimize the non-blocking case by looking at the socket space 1568 * before going to the extra work of constituting the sf_buf. 1569 */ 1570 if ((fp->f_flag & FNONBLOCK) && ssb_space(&so->so_snd) <= 0) { 1571 if (so->so_state & SS_CANTSENDMORE) 1572 error = EPIPE; 1573 else 1574 error = EAGAIN; 1575 ssb_unlock(&so->so_snd); 1576 goto done; 1577 } 1578 /* 1579 * Attempt to look up the page. 1580 * 1581 * Allocate if not found, wait and loop if busy, then 1582 * wire the page. critical section protection is 1583 * required to maintain the object association (an 1584 * interrupt can free the page) through to the 1585 * vm_page_wire() call. 1586 */ 1587 crit_enter(); 1588 pg = vm_page_lookup(obj, pindex); 1589 if (pg == NULL) { 1590 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); 1591 if (pg == NULL) { 1592 vm_wait(0); 1593 crit_exit(); 1594 goto retry_lookup; 1595 } 1596 vm_page_wakeup(pg); 1597 } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { 1598 crit_exit(); 1599 goto retry_lookup; 1600 } 1601 vm_page_wire(pg); 1602 crit_exit(); 1603 1604 /* 1605 * If page is not valid for what we need, initiate I/O 1606 */ 1607 1608 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { 1609 struct uio auio; 1610 struct iovec aiov; 1611 int bsize; 1612 1613 /* 1614 * Ensure that our page is still around when the I/O 1615 * completes. 1616 */ 1617 vm_page_io_start(pg); 1618 1619 /* 1620 * Get the page from backing store. 1621 */ 1622 bsize = vp->v_mount->mnt_stat.f_iosize; 1623 auio.uio_iov = &aiov; 1624 auio.uio_iovcnt = 1; 1625 aiov.iov_base = 0; 1626 aiov.iov_len = MAXBSIZE; 1627 auio.uio_resid = MAXBSIZE; 1628 auio.uio_offset = trunc_page(off); 1629 auio.uio_segflg = UIO_NOCOPY; 1630 auio.uio_rw = UIO_READ; 1631 auio.uio_td = td; 1632 vn_lock(vp, LK_SHARED | LK_RETRY); 1633 error = VOP_READ(vp, &auio, 1634 IO_VMIO | ((MAXBSIZE / bsize) << 16), 1635 p->p_ucred); 1636 vn_unlock(vp); 1637 vm_page_flag_clear(pg, PG_ZERO); 1638 vm_page_io_finish(pg); 1639 if (error) { 1640 crit_enter(); 1641 vm_page_unwire(pg, 0); 1642 vm_page_try_to_free(pg); 1643 crit_exit(); 1644 ssb_unlock(&so->so_snd); 1645 goto done; 1646 } 1647 } 1648 1649 1650 /* 1651 * Get a sendfile buf. We usually wait as long as necessary, 1652 * but this wait can be interrupted. 1653 */ 1654 if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { 1655 crit_enter(); 1656 vm_page_unwire(pg, 0); 1657 vm_page_try_to_free(pg); 1658 crit_exit(); 1659 ssb_unlock(&so->so_snd); 1660 error = EINTR; 1661 goto done; 1662 } 1663 1664 /* 1665 * Get an mbuf header and set it up as having external storage. 1666 */ 1667 MGETHDR(m, MB_WAIT, MT_DATA); 1668 if (m == NULL) { 1669 error = ENOBUFS; 1670 sf_buf_free(sf); 1671 ssb_unlock(&so->so_snd); 1672 goto done; 1673 } 1674 1675 /* 1676 * sfm is a temporary hack, use a per-cpu cache for this. 1677 */ 1678 sfm = kmalloc(sizeof(struct sfbuf_mref), M_SENDFILE, M_WAITOK); 1679 sfm->sf = sf; 1680 sfm->mref_count = 1; 1681 1682 m->m_ext.ext_free = sf_buf_mfree; 1683 m->m_ext.ext_ref = sf_buf_mref; 1684 m->m_ext.ext_arg = sfm; 1685 m->m_ext.ext_buf = (void *)sf->kva; 1686 m->m_ext.ext_size = PAGE_SIZE; 1687 m->m_data = (char *) sf->kva + pgoff; 1688 m->m_flags |= M_EXT; 1689 m->m_pkthdr.len = m->m_len = xfsize; 1690 KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0); 1691 1692 if (mheader != NULL) { 1693 hbytes = mheader->m_pkthdr.len; 1694 mheader->m_pkthdr.len += m->m_pkthdr.len; 1695 m_cat(mheader, m); 1696 m = mheader; 1697 mheader = NULL; 1698 } else 1699 hbytes = 0; 1700 1701 /* 1702 * Add the buffer to the socket buffer chain. 1703 */ 1704 crit_enter(); 1705 retry_space: 1706 /* 1707 * Make sure that the socket is still able to take more data. 1708 * CANTSENDMORE being true usually means that the connection 1709 * was closed. so_error is true when an error was sensed after 1710 * a previous send. 1711 * The state is checked after the page mapping and buffer 1712 * allocation above since those operations may block and make 1713 * any socket checks stale. From this point forward, nothing 1714 * blocks before the pru_send (or more accurately, any blocking 1715 * results in a loop back to here to re-check). 1716 */ 1717 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1718 if (so->so_state & SS_CANTSENDMORE) { 1719 error = EPIPE; 1720 } else { 1721 error = so->so_error; 1722 so->so_error = 0; 1723 } 1724 m_freem(m); 1725 ssb_unlock(&so->so_snd); 1726 crit_exit(); 1727 goto done; 1728 } 1729 /* 1730 * Wait for socket space to become available. We do this just 1731 * after checking the connection state above in order to avoid 1732 * a race condition with ssb_wait(). 1733 */ 1734 if (ssb_space(&so->so_snd) < so->so_snd.ssb_lowat) { 1735 if (fp->f_flag & FNONBLOCK) { 1736 m_freem(m); 1737 ssb_unlock(&so->so_snd); 1738 crit_exit(); 1739 error = EAGAIN; 1740 goto done; 1741 } 1742 error = ssb_wait(&so->so_snd); 1743 /* 1744 * An error from ssb_wait usually indicates that we've 1745 * been interrupted by a signal. If we've sent anything 1746 * then return bytes sent, otherwise return the error. 1747 */ 1748 if (error) { 1749 m_freem(m); 1750 ssb_unlock(&so->so_snd); 1751 crit_exit(); 1752 goto done; 1753 } 1754 goto retry_space; 1755 } 1756 error = so_pru_send(so, 0, m, NULL, NULL, td); 1757 crit_exit(); 1758 if (error) { 1759 ssb_unlock(&so->so_snd); 1760 goto done; 1761 } 1762 } 1763 if (mheader != NULL) { 1764 *sbytes += mheader->m_pkthdr.len; 1765 error = so_pru_send(so, 0, mheader, NULL, NULL, td); 1766 mheader = NULL; 1767 } 1768 ssb_unlock(&so->so_snd); 1769 1770 done: 1771 fdrop(fp); 1772 done0: 1773 if (mheader != NULL) 1774 m_freem(mheader); 1775 return (error); 1776 } 1777 1778 int 1779 sys_sctp_peeloff(struct sctp_peeloff_args *uap) 1780 { 1781 #ifdef SCTP 1782 struct thread *td = curthread; 1783 struct proc *p = td->td_proc; 1784 struct file *lfp = NULL; 1785 struct file *nfp = NULL; 1786 int error; 1787 struct socket *head, *so; 1788 caddr_t assoc_id; 1789 int fd; 1790 short fflag; /* type must match fp->f_flag */ 1791 1792 assoc_id = uap->name; 1793 error = holdsock(p->p_fd, uap->sd, &lfp); 1794 if (error) { 1795 return (error); 1796 } 1797 crit_enter(); 1798 head = (struct socket *)lfp->f_data; 1799 error = sctp_can_peel_off(head, assoc_id); 1800 if (error) { 1801 crit_exit(); 1802 goto done; 1803 } 1804 /* 1805 * At this point we know we do have a assoc to pull 1806 * we proceed to get the fd setup. This may block 1807 * but that is ok. 1808 */ 1809 1810 fflag = lfp->f_flag; 1811 error = falloc(p, &nfp, &fd); 1812 if (error) { 1813 /* 1814 * Probably ran out of file descriptors. Put the 1815 * unaccepted connection back onto the queue and 1816 * do another wakeup so some other process might 1817 * have a chance at it. 1818 */ 1819 crit_exit(); 1820 goto done; 1821 } 1822 uap->sysmsg_result = fd; 1823 1824 so = sctp_get_peeloff(head, assoc_id, &error); 1825 if (so == NULL) { 1826 /* 1827 * Either someone else peeled it off OR 1828 * we can't get a socket. 1829 */ 1830 goto noconnection; 1831 } 1832 so->so_state &= ~SS_COMP; 1833 so->so_state &= ~SS_NOFDREF; 1834 so->so_head = NULL; 1835 if (head->so_sigio != NULL) 1836 fsetown(fgetown(head->so_sigio), &so->so_sigio); 1837 1838 nfp->f_type = DTYPE_SOCKET; 1839 nfp->f_flag = fflag; 1840 nfp->f_ops = &socketops; 1841 nfp->f_data = so; 1842 1843 noconnection: 1844 /* 1845 * Assign the file pointer to the reserved descriptor, or clear 1846 * the reserved descriptor if an error occured. 1847 */ 1848 if (error) 1849 fsetfd(p, NULL, fd); 1850 else 1851 fsetfd(p, nfp, fd); 1852 crit_exit(); 1853 /* 1854 * Release explicitly held references before returning. 1855 */ 1856 done: 1857 if (nfp != NULL) 1858 fdrop(nfp); 1859 fdrop(lfp); 1860 return (error); 1861 #else /* SCTP */ 1862 return(EOPNOTSUPP); 1863 #endif /* SCTP */ 1864 } 1865