1 /* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
*
 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
 * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $
 * $DragonFly: src/sys/kern/uipc_syscalls.c,v 1.82 2007/05/24 20:51:16 dillon Exp $
 */

#include "opt_ktrace.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/kern_syscall.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/sfbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/mount.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <sys/file2.h>
#include <sys/signalvar.h>
#include <sys/serialize.h>

#include <sys/thread2.h>
#include <sys/msgport2.h>
#include <net/netmsg2.h>

#ifdef SCTP
#include <netinet/sctp_peeloff.h>
#endif /* SCTP */

/*
 * Reference-counted wrapper around an sf_buf, used by sf_buf_mref() and
 * sf_buf_mfree() in this file.
 */
struct sfbuf_mref {
	struct sf_buf	*sf;		/* sf_buf freed when the count hits 0 */
	int		mref_count;	/* bumped by sf_buf_mref(), dropped by
					 * sf_buf_mfree() */
	struct lwkt_serialize serializer; /* entered by sf_buf_mfree() around
					 * the decrement when the count is
					 * not 1 */
};

static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile sfbuf ref structures");

/*
 * System call interface to the socket abstraction.
97 */ 98 99 extern struct fileops socketops; 100 101 /* 102 * socket_args(int domain, int type, int protocol) 103 */ 104 int 105 kern_socket(int domain, int type, int protocol, int *res) 106 { 107 struct thread *td = curthread; 108 struct proc *p = td->td_proc; 109 struct socket *so; 110 struct file *fp; 111 int fd, error; 112 113 KKASSERT(p); 114 115 error = falloc(p, &fp, &fd); 116 if (error) 117 return (error); 118 error = socreate(domain, &so, type, protocol, td); 119 if (error) { 120 fsetfd(p, NULL, fd); 121 } else { 122 fp->f_type = DTYPE_SOCKET; 123 fp->f_flag = FREAD | FWRITE; 124 fp->f_ops = &socketops; 125 fp->f_data = so; 126 *res = fd; 127 fsetfd(p, fp, fd); 128 } 129 fdrop(fp); 130 return (error); 131 } 132 133 int 134 sys_socket(struct socket_args *uap) 135 { 136 int error; 137 138 error = kern_socket(uap->domain, uap->type, uap->protocol, 139 &uap->sysmsg_result); 140 141 return (error); 142 } 143 144 int 145 kern_bind(int s, struct sockaddr *sa) 146 { 147 struct thread *td = curthread; 148 struct proc *p = td->td_proc; 149 struct file *fp; 150 int error; 151 152 KKASSERT(p); 153 error = holdsock(p->p_fd, s, &fp); 154 if (error) 155 return (error); 156 error = sobind((struct socket *)fp->f_data, sa, td); 157 fdrop(fp); 158 return (error); 159 } 160 161 /* 162 * bind_args(int s, caddr_t name, int namelen) 163 */ 164 int 165 sys_bind(struct bind_args *uap) 166 { 167 struct sockaddr *sa; 168 int error; 169 170 error = getsockaddr(&sa, uap->name, uap->namelen); 171 if (error) 172 return (error); 173 error = kern_bind(uap->s, sa); 174 FREE(sa, M_SONAME); 175 176 return (error); 177 } 178 179 int 180 kern_listen(int s, int backlog) 181 { 182 struct thread *td = curthread; 183 struct proc *p = td->td_proc; 184 struct file *fp; 185 int error; 186 187 KKASSERT(p); 188 error = holdsock(p->p_fd, s, &fp); 189 if (error) 190 return (error); 191 error = solisten((struct socket *)fp->f_data, backlog, td); 192 fdrop(fp); 193 return(error); 194 } 195 196 /* 197 * 
listen_args(int s, int backlog) 198 */ 199 int 200 sys_listen(struct listen_args *uap) 201 { 202 int error; 203 204 error = kern_listen(uap->s, uap->backlog); 205 return (error); 206 } 207 208 /* 209 * Returns the accepted socket as well. 210 */ 211 static boolean_t 212 soaccept_predicate(struct netmsg *msg0) 213 { 214 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 215 struct socket *head = msg->nm_so; 216 217 if (head->so_error != 0) { 218 msg->nm_netmsg.nm_lmsg.ms_error = head->so_error; 219 return (TRUE); 220 } 221 if (!TAILQ_EMPTY(&head->so_comp)) { 222 /* Abuse nm_so field as copy in/copy out parameter. XXX JH */ 223 msg->nm_so = TAILQ_FIRST(&head->so_comp); 224 TAILQ_REMOVE(&head->so_comp, msg->nm_so, so_list); 225 head->so_qlen--; 226 227 msg->nm_netmsg.nm_lmsg.ms_error = 0; 228 return (TRUE); 229 } 230 if (head->so_state & SS_CANTRCVMORE) { 231 msg->nm_netmsg.nm_lmsg.ms_error = ECONNABORTED; 232 return (TRUE); 233 } 234 if (msg->nm_fflags & FNONBLOCK) { 235 msg->nm_netmsg.nm_lmsg.ms_error = EWOULDBLOCK; 236 return (TRUE); 237 } 238 239 return (FALSE); 240 } 241 242 /* 243 * The second argument to kern_accept() is a handle to a struct sockaddr. 244 * This allows kern_accept() to return a pointer to an allocated struct 245 * sockaddr which must be freed later with FREE(). The caller must 246 * initialize *name to NULL. 
247 */ 248 int 249 kern_accept(int s, int fflags, struct sockaddr **name, int *namelen, int *res) 250 { 251 struct thread *td = curthread; 252 struct proc *p = td->td_proc; 253 struct file *lfp = NULL; 254 struct file *nfp = NULL; 255 struct sockaddr *sa; 256 struct socket *head, *so; 257 struct netmsg_so_notify msg; 258 lwkt_port_t port; 259 int fd; 260 u_int fflag; /* type must match fp->f_flag */ 261 int error, tmp; 262 263 *res = -1; 264 if (name && namelen && *namelen < 0) 265 return (EINVAL); 266 267 error = holdsock(p->p_fd, s, &lfp); 268 if (error) 269 return (error); 270 271 error = falloc(p, &nfp, &fd); 272 if (error) { /* Probably ran out of file descriptors. */ 273 fdrop(lfp); 274 return (error); 275 } 276 head = (struct socket *)lfp->f_data; 277 if ((head->so_options & SO_ACCEPTCONN) == 0) { 278 error = EINVAL; 279 goto done; 280 } 281 282 if (fflags & O_FBLOCKING) 283 fflags |= lfp->f_flag & ~FNONBLOCK; 284 else if (fflags & O_FNONBLOCKING) 285 fflags |= lfp->f_flag | FNONBLOCK; 286 else 287 fflags = lfp->f_flag; 288 289 /* optimize for uniprocessor case later XXX JH */ 290 port = head->so_proto->pr_mport(head, NULL, PRU_PRED); 291 netmsg_init_abortable(&msg.nm_netmsg, &curthread->td_msgport, 292 0, 293 netmsg_so_notify, 294 netmsg_so_notify_doabort); 295 msg.nm_predicate = soaccept_predicate; 296 msg.nm_fflags = fflags; 297 msg.nm_so = head; 298 msg.nm_etype = NM_REVENT; 299 error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, PCATCH); 300 if (error) 301 goto done; 302 303 /* 304 * At this point we have the connection that's ready to be accepted. 
305 */ 306 so = msg.nm_so; 307 308 fflag = lfp->f_flag; 309 310 /* connection has been removed from the listen queue */ 311 KNOTE(&head->so_rcv.ssb_sel.si_note, 0); 312 313 so->so_state &= ~SS_COMP; 314 so->so_head = NULL; 315 if (head->so_sigio != NULL) 316 fsetown(fgetown(head->so_sigio), &so->so_sigio); 317 318 nfp->f_type = DTYPE_SOCKET; 319 nfp->f_flag = fflag; 320 nfp->f_ops = &socketops; 321 nfp->f_data = so; 322 /* Sync socket nonblocking/async state with file flags */ 323 tmp = fflag & FNONBLOCK; 324 (void) fo_ioctl(nfp, FIONBIO, (caddr_t)&tmp, p->p_ucred); 325 tmp = fflag & FASYNC; 326 (void) fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, p->p_ucred); 327 328 sa = NULL; 329 error = soaccept(so, &sa); 330 331 /* 332 * Set the returned name and namelen as applicable. Set the returned 333 * namelen to 0 for older code which might ignore the return value 334 * from accept. 335 */ 336 if (error == 0) { 337 if (sa && name && namelen) { 338 if (*namelen > sa->sa_len) 339 *namelen = sa->sa_len; 340 *name = sa; 341 } else { 342 if (sa) 343 FREE(sa, M_SONAME); 344 } 345 } 346 347 done: 348 /* 349 * If an error occured clear the reserved descriptor, else associate 350 * nfp with it. 351 * 352 * Note that *res is normally ignored if an error is returned but 353 * a syscall message will still have access to the result code. 
354 */ 355 if (error) { 356 fsetfd(p, NULL, fd); 357 } else { 358 *res = fd; 359 fsetfd(p, nfp, fd); 360 } 361 fdrop(nfp); 362 fdrop(lfp); 363 return (error); 364 } 365 366 /* 367 * accept(int s, caddr_t name, int *anamelen) 368 */ 369 int 370 sys_accept(struct accept_args *uap) 371 { 372 struct sockaddr *sa = NULL; 373 int sa_len; 374 int error; 375 376 if (uap->name) { 377 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 378 if (error) 379 return (error); 380 381 error = kern_accept(uap->s, 0, &sa, &sa_len, &uap->sysmsg_result); 382 383 if (error == 0) 384 error = copyout(sa, uap->name, sa_len); 385 if (error == 0) { 386 error = copyout(&sa_len, uap->anamelen, 387 sizeof(*uap->anamelen)); 388 } 389 if (sa) 390 FREE(sa, M_SONAME); 391 } else { 392 error = kern_accept(uap->s, 0, NULL, 0, &uap->sysmsg_result); 393 } 394 return (error); 395 } 396 397 /* 398 * extaccept(int s, int fflags, caddr_t name, int *anamelen) 399 */ 400 int 401 sys_extaccept(struct extaccept_args *uap) 402 { 403 struct sockaddr *sa = NULL; 404 int sa_len; 405 int error; 406 int fflags = uap->flags & O_FMASK; 407 408 if (uap->name) { 409 error = copyin(uap->anamelen, &sa_len, sizeof(sa_len)); 410 if (error) 411 return (error); 412 413 error = kern_accept(uap->s, fflags, &sa, &sa_len, &uap->sysmsg_result); 414 415 if (error == 0) 416 error = copyout(sa, uap->name, sa_len); 417 if (error == 0) { 418 error = copyout(&sa_len, uap->anamelen, 419 sizeof(*uap->anamelen)); 420 } 421 if (sa) 422 FREE(sa, M_SONAME); 423 } else { 424 error = kern_accept(uap->s, fflags, NULL, 0, &uap->sysmsg_result); 425 } 426 return (error); 427 } 428 429 430 /* 431 * Returns TRUE if predicate satisfied. 
432 */ 433 static boolean_t 434 soconnected_predicate(struct netmsg *msg0) 435 { 436 struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0; 437 struct socket *so = msg->nm_so; 438 439 /* check predicate */ 440 if (!(so->so_state & SS_ISCONNECTING) || so->so_error != 0) { 441 msg->nm_netmsg.nm_lmsg.ms_error = so->so_error; 442 return (TRUE); 443 } 444 445 return (FALSE); 446 } 447 448 int 449 kern_connect(int s, int fflags, struct sockaddr *sa) 450 { 451 struct thread *td = curthread; 452 struct proc *p = td->td_proc; 453 struct file *fp; 454 struct socket *so; 455 int error; 456 457 error = holdsock(p->p_fd, s, &fp); 458 if (error) 459 return (error); 460 so = (struct socket *)fp->f_data; 461 462 if (fflags & O_FBLOCKING) 463 /* fflags &= ~FNONBLOCK; */; 464 else if (fflags & O_FNONBLOCKING) 465 fflags |= FNONBLOCK; 466 else 467 fflags = fp->f_flag; 468 469 if ((fflags & FNONBLOCK) && (so->so_state & SS_ISCONNECTING)) { 470 error = EALREADY; 471 goto done; 472 } 473 error = soconnect(so, sa, td); 474 if (error) 475 goto bad; 476 if ((fflags & FNONBLOCK) && (so->so_state & SS_ISCONNECTING)) { 477 error = EINPROGRESS; 478 goto done; 479 } 480 if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 481 struct netmsg_so_notify msg; 482 lwkt_port_t port; 483 484 port = so->so_proto->pr_mport(so, sa, PRU_PRED); 485 netmsg_init_abortable(&msg.nm_netmsg, 486 &curthread->td_msgport, 487 0, 488 netmsg_so_notify, 489 netmsg_so_notify_doabort); 490 msg.nm_predicate = soconnected_predicate; 491 msg.nm_so = so; 492 msg.nm_etype = NM_REVENT; 493 error = lwkt_domsg(port, &msg.nm_netmsg.nm_lmsg, PCATCH); 494 } 495 if (error == 0) { 496 error = so->so_error; 497 so->so_error = 0; 498 } 499 bad: 500 so->so_state &= ~SS_ISCONNECTING; 501 if (error == ERESTART) 502 error = EINTR; 503 done: 504 fdrop(fp); 505 return (error); 506 } 507 508 /* 509 * connect_args(int s, caddr_t name, int namelen) 510 */ 511 int 512 sys_connect(struct connect_args *uap) 513 { 514 struct 
sockaddr *sa; 515 int error; 516 517 error = getsockaddr(&sa, uap->name, uap->namelen); 518 if (error) 519 return (error); 520 error = kern_connect(uap->s, 0, sa); 521 FREE(sa, M_SONAME); 522 523 return (error); 524 } 525 526 /* 527 * connect_args(int s, int fflags, caddr_t name, int namelen) 528 */ 529 int 530 sys_extconnect(struct extconnect_args *uap) 531 { 532 struct sockaddr *sa; 533 int error; 534 int fflags = uap->flags & O_FMASK; 535 536 error = getsockaddr(&sa, uap->name, uap->namelen); 537 if (error) 538 return (error); 539 error = kern_connect(uap->s, fflags, sa); 540 FREE(sa, M_SONAME); 541 542 return (error); 543 } 544 545 int 546 kern_socketpair(int domain, int type, int protocol, int *sv) 547 { 548 struct thread *td = curthread; 549 struct proc *p = td->td_proc; 550 struct file *fp1, *fp2; 551 struct socket *so1, *so2; 552 int fd1, fd2, error; 553 554 KKASSERT(p); 555 error = socreate(domain, &so1, type, protocol, td); 556 if (error) 557 return (error); 558 error = socreate(domain, &so2, type, protocol, td); 559 if (error) 560 goto free1; 561 error = falloc(p, &fp1, &fd1); 562 if (error) 563 goto free2; 564 sv[0] = fd1; 565 fp1->f_data = so1; 566 error = falloc(p, &fp2, &fd2); 567 if (error) 568 goto free3; 569 fp2->f_data = so2; 570 sv[1] = fd2; 571 error = soconnect2(so1, so2); 572 if (error) 573 goto free4; 574 if (type == SOCK_DGRAM) { 575 /* 576 * Datagram socket connection is asymmetric. 
577 */ 578 error = soconnect2(so2, so1); 579 if (error) 580 goto free4; 581 } 582 fp1->f_type = fp2->f_type = DTYPE_SOCKET; 583 fp1->f_flag = fp2->f_flag = FREAD|FWRITE; 584 fp1->f_ops = fp2->f_ops = &socketops; 585 fsetfd(p, fp1, fd1); 586 fsetfd(p, fp2, fd2); 587 fdrop(fp1); 588 fdrop(fp2); 589 return (error); 590 free4: 591 fsetfd(p, NULL, fd2); 592 fdrop(fp2); 593 free3: 594 fsetfd(p, NULL, fd1); 595 fdrop(fp1); 596 free2: 597 (void)soclose(so2, 0); 598 free1: 599 (void)soclose(so1, 0); 600 return (error); 601 } 602 603 /* 604 * socketpair(int domain, int type, int protocol, int *rsv) 605 */ 606 int 607 sys_socketpair(struct socketpair_args *uap) 608 { 609 int error, sockv[2]; 610 611 error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv); 612 613 if (error == 0) 614 error = copyout(sockv, uap->rsv, sizeof(sockv)); 615 return (error); 616 } 617 618 int 619 kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio, 620 struct mbuf *control, int flags, int *res) 621 { 622 struct thread *td = curthread; 623 struct lwp *lp = td->td_lwp; 624 struct proc *p = td->td_proc; 625 struct file *fp; 626 int len, error; 627 struct socket *so; 628 #ifdef KTRACE 629 struct iovec *ktriov = NULL; 630 struct uio ktruio; 631 #endif 632 633 error = holdsock(p->p_fd, s, &fp); 634 if (error) 635 return (error); 636 if (auio->uio_resid < 0) { 637 error = EINVAL; 638 goto done; 639 } 640 #ifdef KTRACE 641 if (KTRPOINT(td, KTR_GENIO)) { 642 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 643 644 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 645 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 646 ktruio = *auio; 647 } 648 #endif 649 len = auio->uio_resid; 650 so = (struct socket *)fp->f_data; 651 if ((flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) { 652 if (fp->f_flag & FNONBLOCK) 653 flags |= MSG_FNONBLOCKING; 654 } 655 error = so_pru_sosend(so, sa, auio, NULL, control, flags, td); 656 if (error) { 657 if (auio->uio_resid != len && (error == 
ERESTART || 658 error == EINTR || error == EWOULDBLOCK)) 659 error = 0; 660 if (error == EPIPE) 661 lwpsignal(p, lp, SIGPIPE); 662 } 663 #ifdef KTRACE 664 if (ktriov != NULL) { 665 if (error == 0) { 666 ktruio.uio_iov = ktriov; 667 ktruio.uio_resid = len - auio->uio_resid; 668 ktrgenio(p, s, UIO_WRITE, &ktruio, error); 669 } 670 FREE(ktriov, M_TEMP); 671 } 672 #endif 673 if (error == 0) 674 *res = len - auio->uio_resid; 675 done: 676 fdrop(fp); 677 return (error); 678 } 679 680 /* 681 * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen) 682 */ 683 int 684 sys_sendto(struct sendto_args *uap) 685 { 686 struct thread *td = curthread; 687 struct uio auio; 688 struct iovec aiov; 689 struct sockaddr *sa = NULL; 690 int error; 691 692 if (uap->to) { 693 error = getsockaddr(&sa, uap->to, uap->tolen); 694 if (error) 695 return (error); 696 } 697 aiov.iov_base = uap->buf; 698 aiov.iov_len = uap->len; 699 auio.uio_iov = &aiov; 700 auio.uio_iovcnt = 1; 701 auio.uio_offset = 0; 702 auio.uio_resid = uap->len; 703 auio.uio_segflg = UIO_USERSPACE; 704 auio.uio_rw = UIO_WRITE; 705 auio.uio_td = td; 706 707 error = kern_sendmsg(uap->s, sa, &auio, NULL, uap->flags, 708 &uap->sysmsg_result); 709 710 if (sa) 711 FREE(sa, M_SONAME); 712 return (error); 713 } 714 715 /* 716 * sendmsg_args(int s, caddr_t msg, int flags) 717 */ 718 int 719 sys_sendmsg(struct sendmsg_args *uap) 720 { 721 struct thread *td = curthread; 722 struct msghdr msg; 723 struct uio auio; 724 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 725 struct sockaddr *sa = NULL; 726 struct mbuf *control = NULL; 727 int error; 728 729 error = copyin(uap->msg, (caddr_t)&msg, sizeof(msg)); 730 if (error) 731 return (error); 732 733 /* 734 * Conditionally copyin msg.msg_name. 735 */ 736 if (msg.msg_name) { 737 error = getsockaddr(&sa, msg.msg_name, msg.msg_namelen); 738 if (error) 739 return (error); 740 } 741 742 /* 743 * Populate auio. 
744 */ 745 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 746 &auio.uio_resid); 747 if (error) 748 goto cleanup2; 749 auio.uio_iov = iov; 750 auio.uio_iovcnt = msg.msg_iovlen; 751 auio.uio_offset = 0; 752 auio.uio_segflg = UIO_USERSPACE; 753 auio.uio_rw = UIO_WRITE; 754 auio.uio_td = td; 755 756 /* 757 * Conditionally copyin msg.msg_control. 758 */ 759 if (msg.msg_control) { 760 if (msg.msg_controllen < sizeof(struct cmsghdr) || 761 msg.msg_controllen > MLEN) { 762 error = EINVAL; 763 goto cleanup; 764 } 765 control = m_get(MB_WAIT, MT_CONTROL); 766 if (control == NULL) { 767 error = ENOBUFS; 768 goto cleanup; 769 } 770 control->m_len = msg.msg_controllen; 771 error = copyin(msg.msg_control, mtod(control, caddr_t), 772 msg.msg_controllen); 773 if (error) { 774 m_free(control); 775 goto cleanup; 776 } 777 } 778 779 error = kern_sendmsg(uap->s, sa, &auio, control, uap->flags, 780 &uap->sysmsg_result); 781 782 cleanup: 783 iovec_free(&iov, aiov); 784 cleanup2: 785 if (sa) 786 FREE(sa, M_SONAME); 787 return (error); 788 } 789 790 /* 791 * kern_recvmsg() takes a handle to sa and control. If the handle is non- 792 * null, it returns a dynamically allocated struct sockaddr and an mbuf. 793 * Don't forget to FREE() and m_free() these if they are returned. 
794 */ 795 int 796 kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio, 797 struct mbuf **control, int *flags, int *res) 798 { 799 struct thread *td = curthread; 800 struct proc *p = td->td_proc; 801 struct file *fp; 802 int len, error; 803 int lflags; 804 struct socket *so; 805 #ifdef KTRACE 806 struct iovec *ktriov = NULL; 807 struct uio ktruio; 808 #endif 809 810 error = holdsock(p->p_fd, s, &fp); 811 if (error) 812 return (error); 813 if (auio->uio_resid < 0) { 814 error = EINVAL; 815 goto done; 816 } 817 #ifdef KTRACE 818 if (KTRPOINT(td, KTR_GENIO)) { 819 int iovlen = auio->uio_iovcnt * sizeof (struct iovec); 820 821 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 822 bcopy(auio->uio_iov, ktriov, iovlen); 823 ktruio = *auio; 824 } 825 #endif 826 len = auio->uio_resid; 827 so = (struct socket *)fp->f_data; 828 829 if (flags == NULL || (*flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) { 830 if (fp->f_flag & FNONBLOCK) { 831 if (flags) { 832 *flags |= MSG_FNONBLOCKING; 833 } else { 834 lflags = MSG_FNONBLOCKING; 835 flags = &lflags; 836 } 837 } 838 } 839 840 error = so_pru_soreceive(so, sa, auio, NULL, control, flags); 841 if (error) { 842 if (auio->uio_resid != len && (error == ERESTART || 843 error == EINTR || error == EWOULDBLOCK)) 844 error = 0; 845 } 846 #ifdef KTRACE 847 if (ktriov != NULL) { 848 if (error == 0) { 849 ktruio.uio_iov = ktriov; 850 ktruio.uio_resid = len - auio->uio_resid; 851 ktrgenio(p, s, UIO_READ, &ktruio, error); 852 } 853 FREE(ktriov, M_TEMP); 854 } 855 #endif 856 if (error == 0) 857 *res = len - auio->uio_resid; 858 done: 859 fdrop(fp); 860 return (error); 861 } 862 863 /* 864 * recvfrom_args(int s, caddr_t buf, size_t len, int flags, 865 * caddr_t from, int *fromlenaddr) 866 */ 867 int 868 sys_recvfrom(struct recvfrom_args *uap) 869 { 870 struct thread *td = curthread; 871 struct uio auio; 872 struct iovec aiov; 873 struct sockaddr *sa = NULL; 874 int error, fromlen; 875 876 if (uap->from && uap->fromlenaddr) { 877 
error = copyin(uap->fromlenaddr, &fromlen, sizeof(fromlen)); 878 if (error) 879 return (error); 880 if (fromlen < 0) 881 return (EINVAL); 882 } else { 883 fromlen = 0; 884 } 885 aiov.iov_base = uap->buf; 886 aiov.iov_len = uap->len; 887 auio.uio_iov = &aiov; 888 auio.uio_iovcnt = 1; 889 auio.uio_offset = 0; 890 auio.uio_resid = uap->len; 891 auio.uio_segflg = UIO_USERSPACE; 892 auio.uio_rw = UIO_READ; 893 auio.uio_td = td; 894 895 error = kern_recvmsg(uap->s, uap->from ? &sa : NULL, &auio, NULL, 896 &uap->flags, &uap->sysmsg_result); 897 898 if (error == 0 && uap->from) { 899 /* note: sa may still be NULL */ 900 if (sa) { 901 fromlen = MIN(fromlen, sa->sa_len); 902 error = copyout(sa, uap->from, fromlen); 903 } else { 904 fromlen = 0; 905 } 906 if (error == 0) { 907 error = copyout(&fromlen, uap->fromlenaddr, 908 sizeof(fromlen)); 909 } 910 } 911 if (sa) 912 FREE(sa, M_SONAME); 913 914 return (error); 915 } 916 917 /* 918 * recvmsg_args(int s, struct msghdr *msg, int flags) 919 */ 920 int 921 sys_recvmsg(struct recvmsg_args *uap) 922 { 923 struct thread *td = curthread; 924 struct msghdr msg; 925 struct uio auio; 926 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 927 struct mbuf *m, *control = NULL; 928 struct sockaddr *sa = NULL; 929 caddr_t ctlbuf; 930 socklen_t *ufromlenp, *ucontrollenp; 931 int error, fromlen, controllen, len, flags, *uflagsp; 932 933 /* 934 * This copyin handles everything except the iovec. 935 */ 936 error = copyin(uap->msg, &msg, sizeof(msg)); 937 if (error) 938 return (error); 939 940 if (msg.msg_name && msg.msg_namelen < 0) 941 return (EINVAL); 942 if (msg.msg_control && msg.msg_controllen < 0) 943 return (EINVAL); 944 945 ufromlenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 946 msg_namelen)); 947 ucontrollenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr, 948 msg_controllen)); 949 uflagsp = (int *)((caddr_t)uap->msg + offsetof(struct msghdr, 950 msg_flags)); 951 952 /* 953 * Populate auio. 
954 */ 955 error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen, 956 &auio.uio_resid); 957 if (error) 958 return (error); 959 auio.uio_iov = iov; 960 auio.uio_iovcnt = msg.msg_iovlen; 961 auio.uio_offset = 0; 962 auio.uio_segflg = UIO_USERSPACE; 963 auio.uio_rw = UIO_READ; 964 auio.uio_td = td; 965 966 flags = uap->flags; 967 968 error = kern_recvmsg(uap->s, msg.msg_name ? &sa : NULL, &auio, 969 msg.msg_control ? &control : NULL, &flags, &uap->sysmsg_result); 970 971 /* 972 * Conditionally copyout the name and populate the namelen field. 973 */ 974 if (error == 0 && msg.msg_name) { 975 /* note: sa may still be NULL */ 976 if (sa != NULL) { 977 fromlen = MIN(msg.msg_namelen, sa->sa_len); 978 error = copyout(sa, msg.msg_name, fromlen); 979 } else 980 fromlen = 0; 981 if (error == 0) 982 error = copyout(&fromlen, ufromlenp, 983 sizeof(*ufromlenp)); 984 } 985 986 /* 987 * Copyout msg.msg_control and msg.msg_controllen. 988 */ 989 if (error == 0 && msg.msg_control) { 990 len = msg.msg_controllen; 991 m = control; 992 ctlbuf = (caddr_t)msg.msg_control; 993 994 while(m && len > 0) { 995 unsigned int tocopy; 996 997 if (len >= m->m_len) { 998 tocopy = m->m_len; 999 } else { 1000 msg.msg_flags |= MSG_CTRUNC; 1001 tocopy = len; 1002 } 1003 1004 error = copyout(mtod(m, caddr_t), ctlbuf, tocopy); 1005 if (error) 1006 goto cleanup; 1007 1008 ctlbuf += tocopy; 1009 len -= tocopy; 1010 m = m->m_next; 1011 } 1012 controllen = ctlbuf - (caddr_t)msg.msg_control; 1013 error = copyout(&controllen, ucontrollenp, 1014 sizeof(*ucontrollenp)); 1015 } 1016 1017 if (error == 0) 1018 error = copyout(&flags, uflagsp, sizeof(*uflagsp)); 1019 1020 cleanup: 1021 if (sa) 1022 FREE(sa, M_SONAME); 1023 iovec_free(&iov, aiov); 1024 if (control) 1025 m_freem(control); 1026 return (error); 1027 } 1028 1029 /* 1030 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1031 * in kernel pointer instead of a userland pointer. 
This allows us 1032 * to manipulate socket options in the emulation code. 1033 */ 1034 int 1035 kern_setsockopt(int s, struct sockopt *sopt) 1036 { 1037 struct thread *td = curthread; 1038 struct proc *p = td->td_proc; 1039 struct file *fp; 1040 int error; 1041 1042 if (sopt->sopt_val == 0 && sopt->sopt_valsize != 0) 1043 return (EFAULT); 1044 if (sopt->sopt_valsize < 0) 1045 return (EINVAL); 1046 1047 error = holdsock(p->p_fd, s, &fp); 1048 if (error) 1049 return (error); 1050 1051 error = sosetopt((struct socket *)fp->f_data, sopt); 1052 fdrop(fp); 1053 return (error); 1054 } 1055 1056 /* 1057 * setsockopt_args(int s, int level, int name, caddr_t val, int valsize) 1058 */ 1059 int 1060 sys_setsockopt(struct setsockopt_args *uap) 1061 { 1062 struct thread *td = curthread; 1063 struct sockopt sopt; 1064 int error; 1065 1066 sopt.sopt_level = uap->level; 1067 sopt.sopt_name = uap->name; 1068 sopt.sopt_val = uap->val; 1069 sopt.sopt_valsize = uap->valsize; 1070 sopt.sopt_td = td; 1071 1072 error = kern_setsockopt(uap->s, &sopt); 1073 return(error); 1074 } 1075 1076 /* 1077 * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an 1078 * in kernel pointer instead of a userland pointer. This allows us 1079 * to manipulate socket options in the emulation code. 
1080 */ 1081 int 1082 kern_getsockopt(int s, struct sockopt *sopt) 1083 { 1084 struct thread *td = curthread; 1085 struct proc *p = td->td_proc; 1086 struct file *fp; 1087 int error; 1088 1089 if (sopt->sopt_val == 0 && sopt->sopt_valsize != 0) 1090 return (EFAULT); 1091 if (sopt->sopt_valsize < 0) 1092 return (EINVAL); 1093 1094 error = holdsock(p->p_fd, s, &fp); 1095 if (error) 1096 return (error); 1097 1098 error = sogetopt((struct socket *)fp->f_data, sopt); 1099 fdrop(fp); 1100 return (error); 1101 } 1102 1103 /* 1104 * getsockopt_Args(int s, int level, int name, caddr_t val, int *avalsize) 1105 */ 1106 int 1107 sys_getsockopt(struct getsockopt_args *uap) 1108 { 1109 struct thread *td = curthread; 1110 struct sockopt sopt; 1111 int error, valsize; 1112 1113 if (uap->val) { 1114 error = copyin(uap->avalsize, &valsize, sizeof(valsize)); 1115 if (error) 1116 return (error); 1117 if (valsize < 0) 1118 return (EINVAL); 1119 } else { 1120 valsize = 0; 1121 } 1122 1123 sopt.sopt_level = uap->level; 1124 sopt.sopt_name = uap->name; 1125 sopt.sopt_val = uap->val; 1126 sopt.sopt_valsize = valsize; 1127 sopt.sopt_td = td; 1128 1129 error = kern_getsockopt(uap->s, &sopt); 1130 if (error == 0) { 1131 valsize = sopt.sopt_valsize; 1132 error = copyout(&valsize, uap->avalsize, sizeof(valsize)); 1133 } 1134 return (error); 1135 } 1136 1137 /* 1138 * The second argument to kern_getsockname() is a handle to a struct sockaddr. 1139 * This allows kern_getsockname() to return a pointer to an allocated struct 1140 * sockaddr which must be freed later with FREE(). The caller must 1141 * initialize *name to NULL. 
1142 */ 1143 int 1144 kern_getsockname(int s, struct sockaddr **name, int *namelen) 1145 { 1146 struct thread *td = curthread; 1147 struct proc *p = td->td_proc; 1148 struct file *fp; 1149 struct socket *so; 1150 struct sockaddr *sa = NULL; 1151 int error; 1152 1153 error = holdsock(p->p_fd, s, &fp); 1154 if (error) 1155 return (error); 1156 if (*namelen < 0) { 1157 fdrop(fp); 1158 return (EINVAL); 1159 } 1160 so = (struct socket *)fp->f_data; 1161 error = so_pru_sockaddr(so, &sa); 1162 if (error == 0) { 1163 if (sa == 0) { 1164 *namelen = 0; 1165 } else { 1166 *namelen = MIN(*namelen, sa->sa_len); 1167 *name = sa; 1168 } 1169 } 1170 1171 fdrop(fp); 1172 return (error); 1173 } 1174 1175 /* 1176 * getsockname_args(int fdes, caddr_t asa, int *alen) 1177 * 1178 * Get socket name. 1179 */ 1180 int 1181 sys_getsockname(struct getsockname_args *uap) 1182 { 1183 struct sockaddr *sa = NULL; 1184 int error, sa_len; 1185 1186 error = copyin(uap->alen, &sa_len, sizeof(sa_len)); 1187 if (error) 1188 return (error); 1189 1190 error = kern_getsockname(uap->fdes, &sa, &sa_len); 1191 1192 if (error == 0) 1193 error = copyout(sa, uap->asa, sa_len); 1194 if (error == 0) 1195 error = copyout(&sa_len, uap->alen, sizeof(*uap->alen)); 1196 if (sa) 1197 FREE(sa, M_SONAME); 1198 return (error); 1199 } 1200 1201 /* 1202 * The second argument to kern_getpeername() is a handle to a struct sockaddr. 1203 * This allows kern_getpeername() to return a pointer to an allocated struct 1204 * sockaddr which must be freed later with FREE(). The caller must 1205 * initialize *name to NULL. 
*/
int
kern_getpeername(int s, struct sockaddr **name, int *namelen)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct socket *so;
	struct sockaddr *sa = NULL;
	int error;

	error = holdsock(p->p_fd, s, &fp);
	if (error)
		return (error);
	if (*namelen < 0) {
		fdrop(fp);
		return (EINVAL);
	}
	so = (struct socket *)fp->f_data;
	/* Only connected (or confirming) sockets have a peer to report. */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		fdrop(fp);
		return (ENOTCONN);
	}
	error = so_pru_peeraddr(so, &sa);
	if (error == 0) {
		if (sa == 0) {
			/* No address returned: report a zero length. */
			*namelen = 0;
		} else {
			/* Clamp the length to the real address length. */
			*namelen = MIN(*namelen, sa->sa_len);
			*name = sa;
		}
	}

	fdrop(fp);
	return (error);
}

/*
 * getpeername_args(int fdes, caddr_t asa, int *alen)
 *
 * Get name of peer for connected socket.
 */
int
sys_getpeername(struct getpeername_args *uap)
{
	struct sockaddr *sa = NULL;
	int error, sa_len;

	error = copyin(uap->alen, &sa_len, sizeof(sa_len));
	if (error)
		return (error);

	error = kern_getpeername(uap->fdes, &sa, &sa_len);

	/* Address first, then the (possibly clamped) length. */
	if (error == 0)
		error = copyout(sa, uap->asa, sa_len);
	if (error == 0)
		error = copyout(&sa_len, uap->alen, sizeof(*uap->alen));
	if (sa)
		FREE(sa, M_SONAME);
	return (error);
}

/*
 * Copy a sockaddr of len bytes in from the user address uaddr into a
 * freshly allocated M_SONAME buffer and return it through *namp.
 *
 * Returns ENAMETOOLONG if len exceeds SOCK_MAXADDRLEN, EDOM if len is
 * smaller than the offset of sa_data, or the copyin() error.  *namp is
 * NULL on any failure.  On success sa_len is overwritten with len and
 * the caller must FREE() the buffer with M_SONAME.
 */
int
getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len)
{
	struct sockaddr *sa;
	int error;

	*namp = NULL;
	if (len > SOCK_MAXADDRLEN)
		return ENAMETOOLONG;
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return EDOM;
	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error) {
		FREE(sa, M_SONAME);
	} else {
#if BYTE_ORDER != BIG_ENDIAN
		/*
		 * The bind(), connect(), and sendto() syscalls were not
		 * versioned for COMPAT_43.  Thus, this check must stay.
		 */
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return error;
}

/*
 * Detach a mapped page and release resources back to the system.
 * We must release our wiring and if the object is ripped out
 * from under the vm_page we become responsible for freeing the
 * page.  These routines must be MPSAFE.
 *
 * XXX HACK XXX TEMPORARY UNTIL WE IMPLEMENT EXT MBUF REFERENCE COUNTING
 *
 * XXX vm_page_*() routines are not MPSAFE yet, the MP lock is required.
 */
/*
 * Add one reference to the sfbuf_mref passed as arg.
 */
static void
sf_buf_mref(void *arg)
{
	struct sfbuf_mref *sfm = arg;

	/*
	 * We must already hold a ref so there is no race to 0, just
	 * atomically increment the count.
	 */
	atomic_add_int(&sfm->mref_count, 1);
}

/*
 * Drop one reference on the sfbuf_mref passed as arg.  When the count
 * reaches zero the sf_buf is freed, the page is unwired (and freed if it
 * is no longer wired and has no object), and the structure itself is
 * released.
 */
static void
sf_buf_mfree(void *arg)
{
	struct sfbuf_mref *sfm = arg;
	vm_page_t m;

	KKASSERT(sfm->mref_count > 0);
	if (sfm->mref_count == 1) {
		/*
		 * We are the only holder so no further locking is required,
		 * the sfbuf can simply be freed.
		 */
		sfm->mref_count = 0;
		goto freeit;
	} else {
		/*
		 * There may be other holders, we must obtain the serializer
		 * to protect against a sf_buf_mfree() race to 0.  An atomic
		 * operation is still required for races against
		 * sf_buf_mref().
		 *
		 * XXX vm_page_*() and SFBUF routines not MPSAFE yet.
		 */
		lwkt_serialize_enter(&sfm->serializer);
		atomic_subtract_int(&sfm->mref_count, 1);
		if (sfm->mref_count == 0) {
			lwkt_serialize_exit(&sfm->serializer);
freeit:
			/*
			 * Also reached by the goto above, in which case the
			 * serializer was never entered.
			 */
			get_mplock();
			crit_enter();
			m = sf_buf_page(sfm->sf);
			sf_buf_free(sfm->sf);
			vm_page_unwire(m, 0);
			if (m->wire_count == 0 && m->object == NULL)
				vm_page_try_to_free(m);
			crit_exit();
			rel_mplock();
			kfree(sfm, M_SENDFILE);
		} else {
			lwkt_serialize_exit(&sfm->serializer);
		}
	}
}

/*
 * sendfile(2).
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'. Send only 'nbytes' of the file or until EOF if
 * nbytes == 0. Optionally add a header and/or trailer to the socket
 * output. If specified, write the total number of bytes sent into *sbytes.
 *
 * In FreeBSD kern/uipc_syscalls.c,v 1.103, a bug was fixed that caused
 * the headers to count against the remaining bytes to be sent from
 * the file descriptor.  We may wish to implement a compatibility syscall
 * in the future.
 */
int
sys_sendfile(struct sendfile_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct vnode *vp = NULL;
	struct sf_hdtr hdtr;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	struct uio auio;
	struct mbuf *mheader = NULL;
	off_t hdtr_size = 0, sbytes;
	int error, hbytes = 0, tbytes;

	KKASSERT(p);

	/*
	 * Do argument checking. Must be a regular file in, stream
	 * type and connected socket out, positive offset.
1399 */ 1400 fp = holdfp(p->p_fd, uap->fd, FREAD); 1401 if (fp == NULL) { 1402 return (EBADF); 1403 } 1404 if (fp->f_type != DTYPE_VNODE) { 1405 fdrop(fp); 1406 return (EINVAL); 1407 } 1408 vp = (struct vnode *)fp->f_data; 1409 vref(vp); 1410 fdrop(fp); 1411 1412 /* 1413 * If specified, get the pointer to the sf_hdtr struct for 1414 * any headers/trailers. 1415 */ 1416 if (uap->hdtr) { 1417 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1418 if (error) 1419 goto done; 1420 /* 1421 * Send any headers. 1422 */ 1423 if (hdtr.headers) { 1424 error = iovec_copyin(hdtr.headers, &iov, aiov, 1425 hdtr.hdr_cnt, &hbytes); 1426 if (error) 1427 goto done; 1428 auio.uio_iov = iov; 1429 auio.uio_iovcnt = hdtr.hdr_cnt; 1430 auio.uio_offset = 0; 1431 auio.uio_segflg = UIO_USERSPACE; 1432 auio.uio_rw = UIO_WRITE; 1433 auio.uio_td = td; 1434 auio.uio_resid = hbytes; 1435 1436 mheader = m_uiomove(&auio); 1437 1438 iovec_free(&iov, aiov); 1439 if (mheader == NULL) 1440 goto done; 1441 } 1442 } 1443 1444 error = kern_sendfile(vp, uap->s, uap->offset, uap->nbytes, mheader, 1445 &sbytes, uap->flags); 1446 if (error) 1447 goto done; 1448 1449 /* 1450 * Send trailers. Wimp out and use writev(2). 
1451 */ 1452 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1453 error = iovec_copyin(hdtr.trailers, &iov, aiov, 1454 hdtr.trl_cnt, &auio.uio_resid); 1455 if (error) 1456 goto done; 1457 auio.uio_iov = iov; 1458 auio.uio_iovcnt = hdtr.trl_cnt; 1459 auio.uio_offset = 0; 1460 auio.uio_segflg = UIO_USERSPACE; 1461 auio.uio_rw = UIO_WRITE; 1462 auio.uio_td = td; 1463 1464 error = kern_sendmsg(uap->s, NULL, &auio, NULL, 0, &tbytes); 1465 1466 iovec_free(&iov, aiov); 1467 if (error) 1468 goto done; 1469 hdtr_size += tbytes; /* trailer bytes successfully sent */ 1470 } 1471 1472 done: 1473 if (uap->sbytes != NULL) { 1474 sbytes += hdtr_size; 1475 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1476 } 1477 if (vp) 1478 vrele(vp); 1479 return (error); 1480 } 1481 1482 int 1483 kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes, 1484 struct mbuf *mheader, off_t *sbytes, int flags) 1485 { 1486 struct thread *td = curthread; 1487 struct proc *p = td->td_proc; 1488 struct vm_object *obj; 1489 struct socket *so; 1490 struct file *fp; 1491 struct mbuf *m; 1492 struct sf_buf *sf; 1493 struct sfbuf_mref *sfm; 1494 struct vm_page *pg; 1495 off_t off, xfsize; 1496 off_t hbytes = 0; 1497 int error = 0; 1498 1499 if (vp->v_type != VREG) { 1500 error = EINVAL; 1501 goto done0; 1502 } 1503 if ((obj = vp->v_object) == NULL) { 1504 error = EINVAL; 1505 goto done0; 1506 } 1507 error = holdsock(p->p_fd, sfd, &fp); 1508 if (error) 1509 goto done0; 1510 so = (struct socket *)fp->f_data; 1511 if (so->so_type != SOCK_STREAM) { 1512 error = EINVAL; 1513 goto done; 1514 } 1515 if ((so->so_state & SS_ISCONNECTED) == 0) { 1516 error = ENOTCONN; 1517 goto done; 1518 } 1519 if (offset < 0) { 1520 error = EINVAL; 1521 goto done; 1522 } 1523 1524 *sbytes = 0; 1525 /* 1526 * Protect against multiple writers to the socket. 1527 */ 1528 ssb_lock(&so->so_snd, M_WAITOK); 1529 1530 /* 1531 * Loop through the pages in the file, starting with the requested 1532 * offset. 
Get a file page (do I/O if necessary), map the file page 1533 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1534 * it on the socket. 1535 */ 1536 for (off = offset; ; off += xfsize, *sbytes += xfsize + hbytes) { 1537 vm_pindex_t pindex; 1538 vm_offset_t pgoff; 1539 1540 pindex = OFF_TO_IDX(off); 1541 retry_lookup: 1542 /* 1543 * Calculate the amount to transfer. Not to exceed a page, 1544 * the EOF, or the passed in nbytes. 1545 */ 1546 xfsize = vp->v_filesize - off; 1547 if (xfsize > PAGE_SIZE) 1548 xfsize = PAGE_SIZE; 1549 pgoff = (vm_offset_t)(off & PAGE_MASK); 1550 if (PAGE_SIZE - pgoff < xfsize) 1551 xfsize = PAGE_SIZE - pgoff; 1552 if (nbytes && xfsize > (nbytes - *sbytes)) 1553 xfsize = nbytes - *sbytes; 1554 if (xfsize <= 0) 1555 break; 1556 /* 1557 * Optimize the non-blocking case by looking at the socket space 1558 * before going to the extra work of constituting the sf_buf. 1559 */ 1560 if ((fp->f_flag & FNONBLOCK) && ssb_space(&so->so_snd) <= 0) { 1561 if (so->so_state & SS_CANTSENDMORE) 1562 error = EPIPE; 1563 else 1564 error = EAGAIN; 1565 ssb_unlock(&so->so_snd); 1566 goto done; 1567 } 1568 /* 1569 * Attempt to look up the page. 1570 * 1571 * Allocate if not found, wait and loop if busy, then 1572 * wire the page. critical section protection is 1573 * required to maintain the object association (an 1574 * interrupt can free the page) through to the 1575 * vm_page_wire() call. 
1576 */ 1577 crit_enter(); 1578 pg = vm_page_lookup(obj, pindex); 1579 if (pg == NULL) { 1580 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); 1581 if (pg == NULL) { 1582 vm_wait(); 1583 crit_exit(); 1584 goto retry_lookup; 1585 } 1586 vm_page_wakeup(pg); 1587 } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { 1588 crit_exit(); 1589 goto retry_lookup; 1590 } 1591 vm_page_wire(pg); 1592 crit_exit(); 1593 1594 /* 1595 * If page is not valid for what we need, initiate I/O 1596 */ 1597 1598 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { 1599 struct uio auio; 1600 struct iovec aiov; 1601 int bsize; 1602 1603 /* 1604 * Ensure that our page is still around when the I/O 1605 * completes. 1606 */ 1607 vm_page_io_start(pg); 1608 1609 /* 1610 * Get the page from backing store. 1611 */ 1612 bsize = vp->v_mount->mnt_stat.f_iosize; 1613 auio.uio_iov = &aiov; 1614 auio.uio_iovcnt = 1; 1615 aiov.iov_base = 0; 1616 aiov.iov_len = MAXBSIZE; 1617 auio.uio_resid = MAXBSIZE; 1618 auio.uio_offset = trunc_page(off); 1619 auio.uio_segflg = UIO_NOCOPY; 1620 auio.uio_rw = UIO_READ; 1621 auio.uio_td = td; 1622 vn_lock(vp, LK_SHARED | LK_RETRY); 1623 error = VOP_READ(vp, &auio, 1624 IO_VMIO | ((MAXBSIZE / bsize) << 16), 1625 p->p_ucred); 1626 vn_unlock(vp); 1627 vm_page_flag_clear(pg, PG_ZERO); 1628 vm_page_io_finish(pg); 1629 if (error) { 1630 crit_enter(); 1631 vm_page_unwire(pg, 0); 1632 vm_page_try_to_free(pg); 1633 crit_exit(); 1634 ssb_unlock(&so->so_snd); 1635 goto done; 1636 } 1637 } 1638 1639 1640 /* 1641 * Get a sendfile buf. We usually wait as long as necessary, 1642 * but this wait can be interrupted. 1643 */ 1644 if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { 1645 crit_enter(); 1646 vm_page_unwire(pg, 0); 1647 vm_page_try_to_free(pg); 1648 crit_exit(); 1649 ssb_unlock(&so->so_snd); 1650 error = EINTR; 1651 goto done; 1652 } 1653 1654 /* 1655 * Get an mbuf header and set it up as having external storage. 
1656 */ 1657 MGETHDR(m, MB_WAIT, MT_DATA); 1658 if (m == NULL) { 1659 error = ENOBUFS; 1660 sf_buf_free(sf); 1661 ssb_unlock(&so->so_snd); 1662 goto done; 1663 } 1664 1665 /* 1666 * sfm is a temporary hack, use a per-cpu cache for this. 1667 */ 1668 sfm = kmalloc(sizeof(struct sfbuf_mref), M_SENDFILE, M_WAITOK); 1669 sfm->sf = sf; 1670 sfm->mref_count = 1; 1671 lwkt_serialize_init(&sfm->serializer); 1672 1673 m->m_ext.ext_free = sf_buf_mfree; 1674 m->m_ext.ext_ref = sf_buf_mref; 1675 m->m_ext.ext_arg = sfm; 1676 m->m_ext.ext_buf = (void *)sf->kva; 1677 m->m_ext.ext_size = PAGE_SIZE; 1678 m->m_data = (char *) sf->kva + pgoff; 1679 m->m_flags |= M_EXT; 1680 m->m_pkthdr.len = m->m_len = xfsize; 1681 KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0); 1682 1683 if (mheader != NULL) { 1684 hbytes = mheader->m_pkthdr.len; 1685 mheader->m_pkthdr.len += m->m_pkthdr.len; 1686 m_cat(mheader, m); 1687 m = mheader; 1688 mheader = NULL; 1689 } else 1690 hbytes = 0; 1691 1692 /* 1693 * Add the buffer to the socket buffer chain. 1694 */ 1695 crit_enter(); 1696 retry_space: 1697 /* 1698 * Make sure that the socket is still able to take more data. 1699 * CANTSENDMORE being true usually means that the connection 1700 * was closed. so_error is true when an error was sensed after 1701 * a previous send. 1702 * The state is checked after the page mapping and buffer 1703 * allocation above since those operations may block and make 1704 * any socket checks stale. From this point forward, nothing 1705 * blocks before the pru_send (or more accurately, any blocking 1706 * results in a loop back to here to re-check). 1707 */ 1708 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1709 if (so->so_state & SS_CANTSENDMORE) { 1710 error = EPIPE; 1711 } else { 1712 error = so->so_error; 1713 so->so_error = 0; 1714 } 1715 m_freem(m); 1716 ssb_unlock(&so->so_snd); 1717 crit_exit(); 1718 goto done; 1719 } 1720 /* 1721 * Wait for socket space to become available. 
We do this just 1722 * after checking the connection state above in order to avoid 1723 * a race condition with ssb_wait(). 1724 */ 1725 if (ssb_space(&so->so_snd) < so->so_snd.ssb_lowat) { 1726 if (fp->f_flag & FNONBLOCK) { 1727 m_freem(m); 1728 ssb_unlock(&so->so_snd); 1729 crit_exit(); 1730 error = EAGAIN; 1731 goto done; 1732 } 1733 error = ssb_wait(&so->so_snd); 1734 /* 1735 * An error from ssb_wait usually indicates that we've 1736 * been interrupted by a signal. If we've sent anything 1737 * then return bytes sent, otherwise return the error. 1738 */ 1739 if (error) { 1740 m_freem(m); 1741 ssb_unlock(&so->so_snd); 1742 crit_exit(); 1743 goto done; 1744 } 1745 goto retry_space; 1746 } 1747 error = so_pru_send(so, 0, m, NULL, NULL, td); 1748 crit_exit(); 1749 if (error) { 1750 ssb_unlock(&so->so_snd); 1751 goto done; 1752 } 1753 } 1754 if (mheader != NULL) { 1755 *sbytes += mheader->m_pkthdr.len; 1756 error = so_pru_send(so, 0, mheader, NULL, NULL, td); 1757 mheader = NULL; 1758 } 1759 ssb_unlock(&so->so_snd); 1760 1761 done: 1762 fdrop(fp); 1763 done0: 1764 if (mheader != NULL) 1765 m_freem(mheader); 1766 return (error); 1767 } 1768 1769 int 1770 sys_sctp_peeloff(struct sctp_peeloff_args *uap) 1771 { 1772 #ifdef SCTP 1773 struct thread *td = curthread; 1774 struct proc *p = td->td_proc; 1775 struct file *lfp = NULL; 1776 struct file *nfp = NULL; 1777 int error; 1778 struct socket *head, *so; 1779 caddr_t assoc_id; 1780 int fd; 1781 short fflag; /* type must match fp->f_flag */ 1782 1783 assoc_id = uap->name; 1784 error = holdsock(p->p_fd, uap->sd, &lfp); 1785 if (error) { 1786 return (error); 1787 } 1788 crit_enter(); 1789 head = (struct socket *)lfp->f_data; 1790 error = sctp_can_peel_off(head, assoc_id); 1791 if (error) { 1792 crit_exit(); 1793 goto done; 1794 } 1795 /* 1796 * At this point we know we do have a assoc to pull 1797 * we proceed to get the fd setup. This may block 1798 * but that is ok. 
1799 */ 1800 1801 fflag = lfp->f_flag; 1802 error = falloc(p, &nfp, &fd); 1803 if (error) { 1804 /* 1805 * Probably ran out of file descriptors. Put the 1806 * unaccepted connection back onto the queue and 1807 * do another wakeup so some other process might 1808 * have a chance at it. 1809 */ 1810 crit_exit(); 1811 goto done; 1812 } 1813 uap->sysmsg_result = fd; 1814 1815 so = sctp_get_peeloff(head, assoc_id, &error); 1816 if (so == NULL) { 1817 /* 1818 * Either someone else peeled it off OR 1819 * we can't get a socket. 1820 */ 1821 goto noconnection; 1822 } 1823 so->so_state &= ~SS_COMP; 1824 so->so_state &= ~SS_NOFDREF; 1825 so->so_head = NULL; 1826 if (head->so_sigio != NULL) 1827 fsetown(fgetown(head->so_sigio), &so->so_sigio); 1828 1829 nfp->f_type = DTYPE_SOCKET; 1830 nfp->f_flag = fflag; 1831 nfp->f_ops = &socketops; 1832 nfp->f_data = so; 1833 1834 noconnection: 1835 /* 1836 * Assign the file pointer to the reserved descriptor, or clear 1837 * the reserved descriptor if an error occured. 1838 */ 1839 if (error) 1840 fsetfd(p, NULL, fd); 1841 else 1842 fsetfd(p, nfp, fd); 1843 crit_exit(); 1844 /* 1845 * Release explicitly held references before returning. 1846 */ 1847 done: 1848 if (nfp != NULL) 1849 fdrop(nfp); 1850 fdrop(lfp); 1851 return (error); 1852 #else /* SCTP */ 1853 return(EOPNOTSUPP); 1854 #endif /* SCTP */ 1855 } 1856