1 /* 2 * Copyright (c) 2005 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Jeffrey Hsu. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * 35 * Copyright (c) 1982, 1986, 1989, 1991, 1993 36 * The Regents of the University of California. All rights reserved. 37 * (c) UNIX System Laboratories, Inc. 
38 * All or some portions of this file are derived from material licensed 39 * to the University of California by American Telephone and Telegraph 40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 41 * the permission of UNIX System Laboratories, Inc. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 72 * $FreeBSD: src/sys/kern/kern_descrip.c,v 1.81.2.19 2004/02/28 00:43:31 tegge Exp $ 73 * $DragonFly: src/sys/kern/kern_descrip.c,v 1.69 2006/06/14 16:58:04 dillon Exp $ 74 */ 75 76 #include "opt_compat.h" 77 #include <sys/param.h> 78 #include <sys/systm.h> 79 #include <sys/malloc.h> 80 #include <sys/sysproto.h> 81 #include <sys/conf.h> 82 #include <sys/filedesc.h> 83 #include <sys/kernel.h> 84 #include <sys/sysctl.h> 85 #include <sys/vnode.h> 86 #include <sys/proc.h> 87 #include <sys/nlookup.h> 88 #include <sys/file.h> 89 #include <sys/stat.h> 90 #include <sys/filio.h> 91 #include <sys/fcntl.h> 92 #include <sys/unistd.h> 93 #include <sys/resourcevar.h> 94 #include <sys/event.h> 95 #include <sys/kern_syscall.h> 96 #include <sys/kcore.h> 97 #include <sys/kinfo.h> 98 99 #include <vm/vm.h> 100 #include <vm/vm_extern.h> 101 102 #include <sys/thread2.h> 103 #include <sys/file2.h> 104 #include <sys/spinlock2.h> 105 106 static void fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd); 107 static void fdreserve_locked (struct filedesc *fdp, int fd0, int incr); 108 static struct file *funsetfd_locked (struct filedesc *fdp, int fd); 109 static int checkfpclosed(struct filedesc *fdp, int fd, struct file *fp); 110 static void ffree(struct file *fp); 111 112 static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); 113 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file 
desc to leader", 114 "file desc to leader structures"); 115 MALLOC_DEFINE(M_FILE, "file", "Open file structure"); 116 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); 117 118 static d_open_t fdopen; 119 #define NUMFDESC 64 120 121 #define CDEV_MAJOR 22 122 static struct cdevsw fildesc_cdevsw = { 123 /* name */ "FD", 124 /* maj */ CDEV_MAJOR, 125 /* flags */ 0, 126 /* port */ NULL, 127 /* clone */ NULL, 128 129 /* open */ fdopen, 130 /* close */ noclose, 131 /* read */ noread, 132 /* write */ nowrite, 133 /* ioctl */ noioctl, 134 /* poll */ nopoll, 135 /* mmap */ nommap, 136 /* strategy */ nostrategy, 137 /* dump */ nodump, 138 /* psize */ nopsize 139 }; 140 141 static int badfo_readwrite (struct file *fp, struct uio *uio, 142 struct ucred *cred, int flags); 143 static int badfo_ioctl (struct file *fp, u_long com, caddr_t data, 144 struct ucred *cred); 145 static int badfo_poll (struct file *fp, int events, struct ucred *cred); 146 static int badfo_kqfilter (struct file *fp, struct knote *kn); 147 static int badfo_stat (struct file *fp, struct stat *sb, struct ucred *cred); 148 static int badfo_close (struct file *fp); 149 static int badfo_shutdown (struct file *fp, int how); 150 151 /* 152 * Descriptor management. 153 */ 154 static struct filelist filehead = LIST_HEAD_INITIALIZER(&filehead); 155 static struct spinlock filehead_spin = SPINLOCK_INITIALIZER(&filehead_spin); 156 static int nfiles; /* actual number of open files */ 157 extern int cmask; 158 159 /* 160 * Fixup fd_freefile and fd_lastfile after a descriptor has been cleared. 
 *
 * MPSAFE - must be called with fdp->fd_spin exclusively held
 */
static __inline
void
fdfixup_locked(struct filedesc *fdp, int fd)
{
	/* fd just became free; it may now be the lowest free descriptor */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}
	/*
	 * Pull fd_lastfile back down over any trailing slots that are
	 * neither assigned a file pointer nor reserved.
	 */
	while (fdp->fd_lastfile >= 0 &&
	       fdp->fd_files[fdp->fd_lastfile].fp == NULL &&
	       fdp->fd_files[fdp->fd_lastfile].reserved == 0
	) {
		--fdp->fd_lastfile;
	}
}

/*
 * System calls on descriptors.
 *
 * MPSAFE
 */
int
sys_getdtablesize(struct getdtablesize_args *uap)
{
	struct proc *p = curproc;
	struct plimit *limit = p->p_limit;

	/* RLIMIT_NOFILE is clipped by the global maxfilesperproc */
	spin_lock_rd(&limit->p_spin);
	uap->sysmsg_result =
	    min((int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	spin_unlock_rd(&limit->p_spin);
	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * note: keep in mind that a potential race condition exists when closing
 * descriptors from a shared descriptor table (via rfork).
 *
 * MPSAFE
 */
int
sys_dup2(struct dup2_args *uap)
{
	int error;

	error = kern_dup(DUP_FIXED, uap->from, uap->to, uap->sysmsg_fds);

	return (error);
}

/*
 * Duplicate a file descriptor.
 *
 * MPSAFE
 */
int
sys_dup(struct dup_args *uap)
{
	int error;

	error = kern_dup(DUP_VARIABLE, uap->fd, 0, uap->sysmsg_fds);

	return (error);
}

/*
 * Backend for fcntl(2).  The caller has already copied the user argument
 * into/out of *dat as appropriate for the command.
 *
 * MPALMOSTSAFE - acquires mplock for fp operations
 */
int
kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct vnode *vp;
	u_int newmin;
	u_int oflags;
	int tmp, error, flg = F_POSIX;

	KKASSERT(p);

	/*
	 * Operations on file descriptors that do not require a file pointer.
	 */
	switch (cmd) {
	case F_GETFD:
		error = fgetfdflags(p->p_fd, fd, &tmp);
		if (error == 0)
			dat->fc_cloexec = (tmp & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		return (error);

	case F_SETFD:
		if (dat->fc_cloexec & FD_CLOEXEC)
			error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		else
			error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		return (error);
	case F_DUPFD:
		newmin = dat->fc_fd;
		error = kern_dup(DUP_VARIABLE, fd, newmin, &dat->fc_fd);
		return (error);
	default:
		break;
	}

	/*
	 * Operations on file pointers
	 */
	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
		return (EBADF);

	get_mplock();
	switch (cmd) {
	case F_GETFL:
		dat->fc_flags = OFLAGS(fp->f_flag);
		error = 0;
		break;

	case F_SETFL:
		oflags = fp->f_flag & FCNTLFLAGS;
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS(dat->fc_flags & ~O_ACCMODE) & FCNTLFLAGS;
		error = 0;
		/* propagate an FASYNC transition to the object via ioctl */
		if ((fp->f_flag ^ oflags) & FASYNC) {
			tmp = fp->f_flag & FASYNC;
			error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred);
		}
		/* roll the FCNTLFLAGS back if the ioctl failed */
		if (error)
			fp->f_flag = (fp->f_flag & ~FCNTLFLAGS) | oflags;
		break;

	case F_GETOWN:
		error = fo_ioctl(fp, FIOGETOWN, (caddr_t)&dat->fc_owner, cred);
		break;

	case F_SETOWN:
		error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&dat->fc_owner, cred);
		break;

	case F_SETLKW:
		flg |= F_WAIT;
		/* Fall into F_SETLK */

	case F_SETLK:
		if (fp->f_type != DTYPE_VNODE) {
			error = EBADF;
			break;
		}
		vp = (struct vnode *)fp->f_data;

		/*
		 * copyin/lockop may block
		 */
		if (dat->fc_flock.l_whence == SEEK_CUR)
			dat->fc_flock.l_start += fp->f_offset;

		switch (dat->fc_flock.l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			/* locks are tracked against the process leader */
			p->p_leader->p_flag |= P_ADVLOCK;
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    &dat->fc_flock, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			p->p_leader->p_flag |= P_ADVLOCK;
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
			    &dat->fc_flock, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
				&dat->fc_flock, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}

		/*
		 * It is possible to race a close() on the descriptor while
		 * we were blocked getting the lock.  If this occurs the
		 * close might not have caught the lock.  Undo our lock in
		 * that case so it does not leak.
		 */
		if (checkfpclosed(p->p_fd, fd, fp)) {
			dat->fc_flock.l_whence = SEEK_SET;
			dat->fc_flock.l_start = 0;
			dat->fc_flock.l_len = 0;
			dat->fc_flock.l_type = F_UNLCK;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					   F_UNLCK, &dat->fc_flock, F_POSIX);
		}
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			error = EBADF;
			break;
		}
		vp = (struct vnode *)fp->f_data;
		/*
		 * copyin/lockop may block
		 */
		if (dat->fc_flock.l_type != F_RDLCK &&
		    dat->fc_flock.l_type != F_WRLCK &&
		    dat->fc_flock.l_type != F_UNLCK) {
			error = EINVAL;
			break;
		}
		if (dat->fc_flock.l_whence == SEEK_CUR)
			dat->fc_flock.l_start += fp->f_offset;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK,
			    &dat->fc_flock, F_POSIX);
		break;
	default:
		error = EINVAL;
		break;
	}
	rel_mplock();

	fdrop(fp);
	return (error);
}

/*
 * The file control system call.
396 * 397 * MPSAFE 398 */ 399 int 400 sys_fcntl(struct fcntl_args *uap) 401 { 402 union fcntl_dat dat; 403 int error; 404 405 switch (uap->cmd) { 406 case F_DUPFD: 407 dat.fc_fd = uap->arg; 408 break; 409 case F_SETFD: 410 dat.fc_cloexec = uap->arg; 411 break; 412 case F_SETFL: 413 dat.fc_flags = uap->arg; 414 break; 415 case F_SETOWN: 416 dat.fc_owner = uap->arg; 417 break; 418 case F_SETLKW: 419 case F_SETLK: 420 case F_GETLK: 421 error = copyin((caddr_t)uap->arg, &dat.fc_flock, 422 sizeof(struct flock)); 423 if (error) 424 return (error); 425 break; 426 } 427 428 error = kern_fcntl(uap->fd, uap->cmd, &dat, curproc->p_ucred); 429 430 if (error == 0) { 431 switch (uap->cmd) { 432 case F_DUPFD: 433 uap->sysmsg_result = dat.fc_fd; 434 break; 435 case F_GETFD: 436 uap->sysmsg_result = dat.fc_cloexec; 437 break; 438 case F_GETFL: 439 uap->sysmsg_result = dat.fc_flags; 440 break; 441 case F_GETOWN: 442 uap->sysmsg_result = dat.fc_owner; 443 case F_GETLK: 444 error = copyout(&dat.fc_flock, (caddr_t)uap->arg, 445 sizeof(struct flock)); 446 break; 447 } 448 } 449 450 return (error); 451 } 452 453 /* 454 * Common code for dup, dup2, and fcntl(F_DUPFD). 455 * 456 * The type flag can be either DUP_FIXED or DUP_VARIABLE. DUP_FIXED tells 457 * kern_dup() to destructively dup over an existing file descriptor if new 458 * is already open. DUP_VARIABLE tells kern_dup() to find the lowest 459 * unused file descriptor that is greater than or equal to new. 460 * 461 * MPSAFE 462 */ 463 int 464 kern_dup(enum dup_type type, int old, int new, int *res) 465 { 466 struct thread *td = curthread; 467 struct proc *p = td->td_proc; 468 struct filedesc *fdp = p->p_fd; 469 struct file *fp; 470 struct file *delfp; 471 int oldflags; 472 int holdleaders; 473 int error, newfd; 474 475 /* 476 * Verify that we have a valid descriptor to dup from and 477 * possibly to dup to. 
478 */ 479 retry: 480 spin_lock_wr(&fdp->fd_spin); 481 if (new < 0 || new > p->p_rlimit[RLIMIT_NOFILE].rlim_cur || 482 new >= maxfilesperproc) { 483 spin_unlock_wr(&fdp->fd_spin); 484 return (EINVAL); 485 } 486 if ((unsigned)old >= fdp->fd_nfiles || fdp->fd_files[old].fp == NULL) { 487 spin_unlock_wr(&fdp->fd_spin); 488 return (EBADF); 489 } 490 if (type == DUP_FIXED && old == new) { 491 *res = new; 492 spin_unlock_wr(&fdp->fd_spin); 493 return (0); 494 } 495 fp = fdp->fd_files[old].fp; 496 oldflags = fdp->fd_files[old].fileflags; 497 fhold(fp); /* MPSAFE - can be called with a spinlock held */ 498 499 /* 500 * Allocate a new descriptor if DUP_VARIABLE, or expand the table 501 * if the requested descriptor is beyond the current table size. 502 * 503 * This can block. Retry if the source descriptor no longer matches 504 * or if our expectation in the expansion case races. 505 * 506 * If we are not expanding or allocating a new decriptor, then reset 507 * the target descriptor to a reserved state so we have a uniform 508 * setup for the next code block. 509 */ 510 if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) { 511 spin_unlock_wr(&fdp->fd_spin); 512 error = fdalloc(p, new, &newfd); 513 spin_lock_wr(&fdp->fd_spin); 514 if (error) { 515 spin_unlock_wr(&fdp->fd_spin); 516 fdrop(fp); 517 return (error); 518 } 519 /* 520 * Check for ripout 521 */ 522 if (old >= fdp->fd_nfiles || fdp->fd_files[old].fp != fp) { 523 fsetfd_locked(fdp, NULL, newfd); 524 spin_unlock_wr(&fdp->fd_spin); 525 fdrop(fp); 526 goto retry; 527 } 528 /* 529 * Check for expansion race 530 */ 531 if (type != DUP_VARIABLE && new != newfd) { 532 fsetfd_locked(fdp, NULL, newfd); 533 spin_unlock_wr(&fdp->fd_spin); 534 fdrop(fp); 535 goto retry; 536 } 537 /* 538 * Check for ripout, newfd reused old (this case probably 539 * can't occur). 
540 */ 541 if (old == newfd) { 542 fsetfd_locked(fdp, NULL, newfd); 543 spin_unlock_wr(&fdp->fd_spin); 544 fdrop(fp); 545 goto retry; 546 } 547 new = newfd; 548 delfp = NULL; 549 } else { 550 if (fdp->fd_files[new].reserved) { 551 spin_unlock_wr(&fdp->fd_spin); 552 fdrop(fp); 553 printf("Warning: dup(): target descriptor %d is reserved, waiting for it to be resolved\n", new); 554 tsleep(fdp, 0, "fdres", hz); 555 goto retry; 556 } 557 558 /* 559 * If the target descriptor was never allocated we have 560 * to allocate it. If it was we have to clean out the 561 * old descriptor. delfp inherits the ref from the 562 * descriptor table. 563 */ 564 delfp = fdp->fd_files[new].fp; 565 fdp->fd_files[new].fp = NULL; 566 fdp->fd_files[new].reserved = 1; 567 if (delfp == NULL) { 568 fdreserve_locked(fdp, new, 1); 569 if (new > fdp->fd_lastfile) 570 fdp->fd_lastfile = new; 571 } 572 573 } 574 575 /* 576 * NOTE: still holding an exclusive spinlock 577 */ 578 579 /* 580 * If a descriptor is being overwritten we may hve to tell 581 * fdfree() to sleep to ensure that all relevant process 582 * leaders can be traversed in closef(). 583 */ 584 if (delfp != NULL && p->p_fdtol != NULL) { 585 fdp->fd_holdleaderscount++; 586 holdleaders = 1; 587 } else { 588 holdleaders = 0; 589 } 590 KASSERT(delfp == NULL || type == DUP_FIXED, 591 ("dup() picked an open file")); 592 593 /* 594 * Duplicate the source descriptor, update lastfile. If the new 595 * descriptor was not allocated and we aren't replacing an existing 596 * descriptor we have to mark the descriptor as being in use. 597 * 598 * The fd_files[] array inherits fp's hold reference. 599 */ 600 fsetfd_locked(fdp, fp, new); 601 fdp->fd_files[new].fileflags = oldflags & ~UF_EXCLOSE; 602 spin_unlock_wr(&fdp->fd_spin); 603 fdrop(fp); 604 *res = new; 605 606 /* 607 * If we dup'd over a valid file, we now own the reference to it 608 * and must dispose of it using closef() semantics (as if a 609 * close() were performed on it). 
610 */ 611 if (delfp) { 612 (void)closef(delfp, td); 613 if (holdleaders) { 614 spin_lock_wr(&fdp->fd_spin); 615 fdp->fd_holdleaderscount--; 616 if (fdp->fd_holdleaderscount == 0 && 617 fdp->fd_holdleaderswakeup != 0) { 618 fdp->fd_holdleaderswakeup = 0; 619 spin_unlock_wr(&fdp->fd_spin); 620 wakeup(&fdp->fd_holdleaderscount); 621 } else { 622 spin_unlock_wr(&fdp->fd_spin); 623 } 624 } 625 } 626 return (0); 627 } 628 629 /* 630 * If sigio is on the list associated with a process or process group, 631 * disable signalling from the device, remove sigio from the list and 632 * free sigio. 633 */ 634 void 635 funsetown(struct sigio *sigio) 636 { 637 if (sigio == NULL) 638 return; 639 crit_enter(); 640 *(sigio->sio_myref) = NULL; 641 crit_exit(); 642 if (sigio->sio_pgid < 0) { 643 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, 644 sigio, sio_pgsigio); 645 } else /* if ((*sigiop)->sio_pgid > 0) */ { 646 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, 647 sigio, sio_pgsigio); 648 } 649 crfree(sigio->sio_ucred); 650 free(sigio, M_SIGIO); 651 } 652 653 /* Free a list of sigio structures. */ 654 void 655 funsetownlst(struct sigiolst *sigiolst) 656 { 657 struct sigio *sigio; 658 659 while ((sigio = SLIST_FIRST(sigiolst)) != NULL) 660 funsetown(sigio); 661 } 662 663 /* 664 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 665 * 666 * After permission checking, add a sigio structure to the sigio list for 667 * the process or process group. 668 */ 669 int 670 fsetown(pid_t pgid, struct sigio **sigiop) 671 { 672 struct proc *proc; 673 struct pgrp *pgrp; 674 struct sigio *sigio; 675 676 if (pgid == 0) { 677 funsetown(*sigiop); 678 return (0); 679 } 680 if (pgid > 0) { 681 proc = pfind(pgid); 682 if (proc == NULL) 683 return (ESRCH); 684 685 /* 686 * Policy - Don't allow a process to FSETOWN a process 687 * in another session. 
688 * 689 * Remove this test to allow maximum flexibility or 690 * restrict FSETOWN to the current process or process 691 * group for maximum safety. 692 */ 693 if (proc->p_session != curproc->p_session) 694 return (EPERM); 695 696 pgrp = NULL; 697 } else /* if (pgid < 0) */ { 698 pgrp = pgfind(-pgid); 699 if (pgrp == NULL) 700 return (ESRCH); 701 702 /* 703 * Policy - Don't allow a process to FSETOWN a process 704 * in another session. 705 * 706 * Remove this test to allow maximum flexibility or 707 * restrict FSETOWN to the current process or process 708 * group for maximum safety. 709 */ 710 if (pgrp->pg_session != curproc->p_session) 711 return (EPERM); 712 713 proc = NULL; 714 } 715 funsetown(*sigiop); 716 sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); 717 if (pgid > 0) { 718 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); 719 sigio->sio_proc = proc; 720 } else { 721 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); 722 sigio->sio_pgrp = pgrp; 723 } 724 sigio->sio_pgid = pgid; 725 sigio->sio_ucred = crhold(curproc->p_ucred); 726 /* It would be convenient if p_ruid was in ucred. */ 727 sigio->sio_ruid = curproc->p_ucred->cr_ruid; 728 sigio->sio_myref = sigiop; 729 crit_enter(); 730 *sigiop = sigio; 731 crit_exit(); 732 return (0); 733 } 734 735 /* 736 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). 737 */ 738 pid_t 739 fgetown(struct sigio *sigio) 740 { 741 return (sigio != NULL ? sigio->sio_pgid : 0); 742 } 743 744 /* 745 * Close many file descriptors. 
 *
 * MPSAFE
 */
int
sys_closefrom(struct closefrom_args *uap)
{
	return(kern_closefrom(uap->fd));
}

/*
 * Close all file descriptors greater than or equal to fd
 *
 * MPSAFE
 */
int
kern_closefrom(int fd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp;

	KKASSERT(p);
	fdp = p->p_fd;

	if (fd < 0)
		return (EINVAL);

	/*
	 * NOTE: This function will skip unassociated descriptors and
	 * reserved descriptors that have not yet been assigned.
	 * fd_lastfile can change as a side effect of kern_close().
	 */
	spin_lock_wr(&fdp->fd_spin);
	while (fd <= fdp->fd_lastfile) {
		if (fdp->fd_files[fd].fp != NULL) {
			/* kern_close() takes its own locks */
			spin_unlock_wr(&fdp->fd_spin);
			/* ok if this races another close */
			if (kern_close(fd) == EINTR)
				return (EINTR);
			spin_lock_wr(&fdp->fd_spin);
		}
		++fd;
	}
	spin_unlock_wr(&fdp->fd_spin);
	return (0);
}

/*
 * Close a file descriptor.
 *
 * MPSAFE
 */
int
sys_close(struct close_args *uap)
{
	return(kern_close(uap->fd));
}

/*
 * MPALMOSTSAFE - acquires mplock around knote_fdclose() calls
 */
int
kern_close(int fd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp;
	struct file *fp;
	int error;
	int holdleaders;

	KKASSERT(p);
	fdp = p->p_fd;

	spin_lock_wr(&fdp->fd_spin);
	if ((fp = funsetfd_locked(fdp, fd)) == NULL) {
		spin_unlock_wr(&fdp->fd_spin);
		return (EBADF);
	}
	holdleaders = 0;
	if (p->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	}

	/*
	 * we now hold the fp reference that used to be owned by the
	 * descriptor array.
	 */
	spin_unlock_wr(&fdp->fd_spin);
	/*
	 * Unlocked pre-check of fd_knlistsize, then rechecked under
	 * the mplock before tearing down any attached knotes.
	 */
	if (fd < fdp->fd_knlistsize) {
		get_mplock();
		if (fd < fdp->fd_knlistsize)
			knote_fdclose(p, fd);
		rel_mplock();
	}
	error = closef(fp, td);
	if (holdleaders) {
		spin_lock_wr(&fdp->fd_spin);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			spin_unlock_wr(&fdp->fd_spin);
			wakeup(&fdp->fd_holdleaderscount);
		} else {
			spin_unlock_wr(&fdp->fd_spin);
		}
	}
	return (error);
}

/*
 * shutdown_args(int fd, int how)
 */
int
kern_shutdown(int fd, int how)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
		return (EBADF);
	error = fo_shutdown(fp, how);
	fdrop(fp);

	return (error);
}

int
sys_shutdown(struct shutdown_args *uap)
{
	int error;

	error = kern_shutdown(uap->s, uap->how);

	return (error);
}

int
kern_fstat(int fd, struct stat *ub)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
		return (EBADF);
	error = fo_stat(fp, ub, p->p_ucred);
	fdrop(fp);

	return (error);
}

/*
 * Return status information about a file descriptor.
 */
int
sys_fstat(struct fstat_args *uap)
{
	struct stat st;
	int error;

	error = kern_fstat(uap->fd, &st);

	if (error == 0)
		error = copyout(&st, uap->sb, sizeof(st));
	return (error);
}

/*
 * Return pathconf information about a file descriptor.
 */
/* ARGSUSED */
int
sys_fpathconf(struct fpathconf_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct vnode *vp;
	int error = 0;

	KKASSERT(p);

	if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL)
		return (EBADF);

	switch (fp->f_type) {
	case DTYPE_PIPE:
	case DTYPE_SOCKET:
		/* only _PC_PIPE_BUF is meaningful for pipes/sockets */
		if (uap->name != _PC_PIPE_BUF) {
			error = EINVAL;
		} else {
			uap->sysmsg_result = PIPE_BUF;
			error = 0;
		}
		break;
	case DTYPE_FIFO:
	case DTYPE_VNODE:
		vp = (struct vnode *)fp->f_data;
		error = VOP_PATHCONF(vp, uap->name, uap->sysmsg_fds);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	fdrop(fp);
	return(error);
}

/* debug counter: number of times any process's fd table was expanded */
static int fdexpand;
SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");

/*
 * Grow the file table so it can hold through descriptor (want).
 *
 * The fdp's spinlock must be held exclusively on entry and may be held
 * exclusively on return.  The spinlock may be cycled by the routine.
 *
 * MPSAFE
 */
static void
fdgrow_locked(struct filedesc *fdp, int want)
{
	struct fdnode *newfiles;
	struct fdnode *oldfiles;
	int nf, extra;

	nf = fdp->fd_nfiles;
	do {
		/* nf has to be of the form 2^n - 1 */
		nf = 2 * nf + 1;
	} while (nf <= want);

	/* M_WAITOK allocation may block; drop the spinlock across it */
	spin_unlock_wr(&fdp->fd_spin);
	newfiles = malloc(nf * sizeof(struct fdnode), M_FILEDESC, M_WAITOK);
	spin_lock_wr(&fdp->fd_spin);

	/*
	 * We could have raced another extend while we were not holding
	 * the spinlock.
	 */
	if (fdp->fd_nfiles >= nf) {
		spin_unlock_wr(&fdp->fd_spin);
		free(newfiles, M_FILEDESC);
		spin_lock_wr(&fdp->fd_spin);
		return;
	}
	/*
	 * Copy the existing ofile and ofileflags arrays
	 * and zero the new portion of each array.
	 */
	extra = nf - fdp->fd_nfiles;
	bcopy(fdp->fd_files, newfiles, fdp->fd_nfiles * sizeof(struct fdnode));
	bzero(&newfiles[fdp->fd_nfiles], extra * sizeof(struct fdnode));

	oldfiles = fdp->fd_files;
	fdp->fd_files = newfiles;
	fdp->fd_nfiles = nf;

	/* the statically embedded initial array is never freed */
	if (oldfiles != fdp->fd_builtin_files) {
		spin_unlock_wr(&fdp->fd_spin);
		free(oldfiles, M_FILEDESC);
		spin_lock_wr(&fdp->fd_spin);
	}
	fdexpand++;
}

/*
 * Number of nodes in right subtree, including the root.
 */
static __inline int
right_subtree_size(int n)
{
	return (n ^ (n | (n + 1)));
}

/*
 * Bigger ancestor.
 */
static __inline int
right_ancestor(int n)
{
	return (n | (n + 1));
}

/*
 * Smaller ancestor.
 */
static __inline int
left_ancestor(int n)
{
	return ((n & (n + 1)) - 1);
}

/*
 * Traverse the in-place binary tree bottom-up adjusting the allocation
 * count so scans can determine where free descriptors are located.
 *
 * MPSAFE - caller must be holding an exclusive spinlock on fdp
 */
static
void
fdreserve_locked(struct filedesc *fdp, int fd, int incr)
{
	while (fd >= 0) {
		fdp->fd_files[fd].allocated += incr;
		KKASSERT(fdp->fd_files[fd].allocated >= 0);
		fd = left_ancestor(fd);
	}
}

/*
 * Reserve a file descriptor for the process.  If no error occurs, the
 * caller MUST at some point call fsetfd() or assign a file pointer
 * or dispose of the reservation.
 *
 * MPSAFE
 */
int
fdalloc(struct proc *p, int want, int *result)
{
	struct filedesc *fdp = p->p_fd;
	int fd, rsize, rsum, node, lim;

	spin_lock_rd(&p->p_limit->p_spin);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	spin_unlock_rd(&p->p_limit->p_spin);
	if (want >= lim)
		return (EMFILE);
	spin_lock_wr(&fdp->fd_spin);
	if (want >= fdp->fd_nfiles)
		fdgrow_locked(fdp, want);	/* may cycle the spinlock */

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 *
	 * NOTE! the 'allocated' field is a cumulative recursive allocation
	 * count.  If we happen to see a value of 0 then we can shortcut
	 * our search.  Otherwise we run through the tree going
	 * down branches we know have free descriptor(s) until we hit a
	 * leaf node.  The leaf node will be free but will not necessarily
	 * have an allocated field of 0.
	 */
retry:
	/* move up the tree looking for a subtree with a free node */
	for (fd = max(want, fdp->fd_freefile); fd < min(fdp->fd_nfiles, lim);
	     fd = right_ancestor(fd)) {
		if (fdp->fd_files[fd].allocated == 0)
			goto found;

		rsize = right_subtree_size(fd);
		if (fdp->fd_files[fd].allocated == rsize)
			continue;	/* right subtree full */

		/*
		 * Free fd is in the right subtree of the tree rooted at fd.
		 * Call that subtree R.  Look for the smallest (leftmost)
		 * subtree of R with an unallocated fd: continue moving
		 * down the left branch until encountering a full left
		 * subtree, then move to the right.
		 */
		for (rsum = 0, rsize /= 2; rsize > 0; rsize /= 2) {
			node = fd + rsize;
			rsum += fdp->fd_files[node].allocated;
			if (fdp->fd_files[fd].allocated == rsum + rsize) {
				fd = node;	/* move to the right */
				if (fdp->fd_files[node].allocated == 0)
					goto found;
				rsum = 0;
			}
		}
		goto found;
	}

	/*
	 * No space in current array.  Expand?
	 */
	if (fdp->fd_nfiles >= lim) {
		spin_unlock_wr(&fdp->fd_spin);
		return (EMFILE);
	}
	fdgrow_locked(fdp, want);
	goto retry;

found:
	KKASSERT(fd < fdp->fd_nfiles);
	if (fd > fdp->fd_lastfile)
		fdp->fd_lastfile = fd;
	if (want <= fdp->fd_freefile)
		fdp->fd_freefile = fd;
	*result = fd;
	KKASSERT(fdp->fd_files[fd].fp == NULL);
	KKASSERT(fdp->fd_files[fd].reserved == 0);
	/* mark reserved; fsetfd() must later assign or release the slot */
	fdp->fd_files[fd].fileflags = 0;
	fdp->fd_files[fd].reserved = 1;
	fdreserve_locked(fdp, fd, 1);
	spin_unlock_wr(&fdp->fd_spin);
	return (0);
}

/*
 * Check to see whether n user file descriptors
 * are available to the process p.
 *
 * MPSAFE
 */
int
fdavail(struct proc *p, int n)
{
	struct filedesc *fdp = p->p_fd;
	struct fdnode *fdnode;
	int i, lim, last;

	spin_lock_rd(&p->p_limit->p_spin);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	spin_unlock_rd(&p->p_limit->p_spin);

	spin_lock_rd(&fdp->fd_spin);
	/* slots beyond the current table size are all free */
	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) {
		spin_unlock_rd(&fdp->fd_spin);
		return (1);
	}
	last = min(fdp->fd_nfiles, lim);
	fdnode = &fdp->fd_files[fdp->fd_freefile];
	for (i = last - fdp->fd_freefile; --i >= 0; ++fdnode) {
		if (fdnode->fp == NULL && --n <= 0) {
			spin_unlock_rd(&fdp->fd_spin);
			return (1);
		}
	}
	spin_unlock_rd(&fdp->fd_spin);
	return (0);
}

/*
 * falloc:
 *	Create a new open file structure and reserve a file descriptor
 *	for the process that refers to it.
 *
 *	Root creds are checked using p, or assumed if p is NULL.  If
 *	resultfd is non-NULL then p must also be non-NULL.  No file
 *	descriptor is reserved if resultfd is NULL.
 *
 *	A file pointer with a refcount of 1 is returned.  Note that the
 *	file pointer is NOT associated with the descriptor.  If falloc
 *	returns success, fsetfd() MUST be called to either associate the
 *	file pointer or clear the reservation.
 *
 * MPSAFE
 */
int
falloc(struct proc *p, struct file **resultfp, int *resultfd)
{
	static struct timeval lastfail;
	static int curfail;
	struct file *fp;
	int error;

	fp = NULL;

	/*
	 * Handle filetable full issues and root overfill.
	 */
	if (nfiles >= maxfiles - maxfilesrootres &&
	    ((p && p->p_ucred->cr_ruid != 0) || nfiles >= maxfiles)) {
		/* rate-limited complaint, at most one per second */
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("kern.maxfiles limit exceeded by uid %d, please see tuning(7).\n",
				(p ? p->p_ucred->cr_ruid : -1));
		}
		error = ENFILE;
		goto done;
	}

	/*
	 * Allocate a new file descriptor.
	 */
	fp = malloc(sizeof(struct file), M_FILE, M_WAITOK | M_ZERO);
	spin_init(&fp->f_spin);
	fp->f_count = 1;
	fp->f_ops = &badfileops;
	fp->f_seqcount = 1;
	if (p)
		fp->f_cred = crhold(p->p_ucred);
	else
		fp->f_cred = crhold(proc0.p_ucred);
	spin_lock_wr(&filehead_spin);
	nfiles++;
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	spin_unlock_wr(&filehead_spin);
	if (resultfd) {
		if ((error = fdalloc(p, 0, resultfd)) != 0) {
			fdrop(fp);
			fp = NULL;
		}
	} else {
		error = 0;
	}
done:
	*resultfp = fp;
	return (error);
}

/*
 * Return 0 if fd is still associated with fp in fdp, EBADF if not.
 *
 * MPSAFE
 */
static
int
checkfpclosed(struct filedesc *fdp, int fd, struct file *fp)
{
	int error;

	spin_lock_rd(&fdp->fd_spin);
	if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_files[fd].fp)
		error = EBADF;
	else
		error = 0;
	spin_unlock_rd(&fdp->fd_spin);
	return (error);
}

/*
 * Associate a file pointer with a previously reserved file descriptor.
 * This function always succeeds.
 *
 * If fp is NULL, the file descriptor is returned to the pool.
 */

/*
 * MPSAFE (exclusive spinlock must be held on call)
 */
static void
fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd)
{
	KKASSERT((unsigned)fd < fdp->fd_nfiles);
	KKASSERT(fdp->fd_files[fd].reserved != 0);
	if (fp) {
		fhold(fp);
		fdp->fd_files[fd].fp = fp;
		fdp->fd_files[fd].reserved = 0;
		if (fp->f_type == DTYPE_KQUEUE) {
			/* activate the knote list on first kqueue install */
			if (fdp->fd_knlistsize < 0)
				fdp->fd_knlistsize = 0;
		}
	} else {
		/* caller abandoned the reservation: release the slot */
		fdp->fd_files[fd].reserved = 0;
		fdreserve_locked(fdp, fd, -1);
		fdfixup_locked(fdp, fd);
	}
}

/*
 * MPSAFE
 */
void
fsetfd(struct proc *p, struct file *fp, int fd)
{
	struct filedesc *fdp = p->p_fd;

	spin_lock_wr(&fdp->fd_spin);
	fsetfd_locked(fdp, fp, fd);
	spin_unlock_wr(&fdp->fd_spin);
}

/*
 * Disassociate fd from its file pointer and return the (still-referenced)
 * file pointer, or NULL if fd was out of range or unused.  The caller
 * becomes responsible for the returned reference.
 *
 * MPSAFE (exclusive spinlock must be held on call)
 */
static
struct file *
funsetfd_locked(struct filedesc *fdp, int fd)
{
	struct file *fp;

	if ((unsigned)fd >= fdp->fd_nfiles)
		return (NULL);
	if ((fp = fdp->fd_files[fd].fp) == NULL)
		return (NULL);
	fdp->fd_files[fd].fp = NULL;
	fdp->fd_files[fd].fileflags = 0;

	fdreserve_locked(fdp, fd, -1);
	fdfixup_locked(fdp, fd);
	return(fp);
}

/*
 * Retrieve the per-descriptor flags for fd into *flagsp.
 * Returns EBADF if fd is not open.
 *
 * MPSAFE
 */
int
fgetfdflags(struct filedesc *fdp, int fd, int *flagsp)
{
	int error;

	spin_lock_rd(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		error = EBADF;
	} else if (fdp->fd_files[fd].fp == NULL) {
		error = EBADF;
	} else {
		*flagsp = fdp->fd_files[fd].fileflags;
		error = 0;
	}
	spin_unlock_rd(&fdp->fd_spin);
	return (error);
}

/*
 * OR add_flags into the per-descriptor flags for fd.
 * Returns EBADF if fd is not open.
 *
 * MPSAFE
 */
int
fsetfdflags(struct filedesc *fdp, int fd, int add_flags)
{
	int error;

	spin_lock_wr(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		error = EBADF;
	} else if (fdp->fd_files[fd].fp == NULL) {
		error = EBADF;
	} else {
		fdp->fd_files[fd].fileflags |= add_flags;
		error = 0;
	}
	spin_unlock_wr(&fdp->fd_spin);
	return (error);
}

/*
 * Clear rem_flags from the per-descriptor flags for fd.
 * Returns EBADF if fd is not open.
 *
 * MPSAFE
 */
int
fclrfdflags(struct filedesc *fdp, int fd, int rem_flags)
{
	int error;

	spin_lock_wr(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		error = EBADF;
	} else if (fdp->fd_files[fd].fp == NULL) {
		error = EBADF;
	} else {
		fdp->fd_files[fd].fileflags &= ~rem_flags;
		error = 0;
	}
	spin_unlock_wr(&fdp->fd_spin);
	return (error);
}

/*
 * Replace the credentials associated with a file pointer.
 * The new cred is referenced before the old one is released.
 */
void
fsetcred(struct file *fp, struct ucred *cr)
{
	crhold(cr);
	crfree(fp->f_cred);
	fp->f_cred = cr;
}

/*
 * Free a file descriptor.  Called only from fdrop() once f_count
 * has reached zero; removes fp from the global file list.
 */
static
void
ffree(struct file *fp)
{
	KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!"));
	spin_lock_wr(&filehead_spin);
	LIST_REMOVE(fp, f_list);
	nfiles--;
	spin_unlock_wr(&filehead_spin);
	crfree(fp->f_cred);
	if (fp->f_ncp) {
		cache_drop(fp->f_ncp);
		fp->f_ncp = NULL;
	}
	free(fp, M_FILE);
}

/*
 * called from init_main, initialize filedesc0 for proc0.
 * Uses the statically embedded fd_builtin_files array; no allocation.
 */
void
fdinit_bootstrap(struct proc *p0, struct filedesc *fdp0, int cmask)
{
	p0->p_fd = fdp0;
	p0->p_fdtol = NULL;
	fdp0->fd_refcnt = 1;
	fdp0->fd_cmask = cmask;
	fdp0->fd_files = fdp0->fd_builtin_files;
	fdp0->fd_nfiles = NDFILE;
	fdp0->fd_lastfile = -1;
	spin_init(&fdp0->fd_spin);
}

/*
 * Build a new filedesc structure.
 *
 * Inherits cdir/rdir/jdir (and their namecache entries) from the
 * current process but starts with an empty descriptor table.
 *
 * NOT MPSAFE (vref)
 */
struct filedesc *
fdinit(struct proc *p)
{
	struct filedesc *newfdp;
	struct filedesc *fdp = p->p_fd;

	newfdp = malloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK|M_ZERO);
	spin_lock_rd(&fdp->fd_spin);
	if (fdp->fd_cdir) {
		newfdp->fd_cdir = fdp->fd_cdir;
		vref(newfdp->fd_cdir);
		newfdp->fd_ncdir = cache_hold(fdp->fd_ncdir);
	}

	/*
	 * rdir may not be set in e.g. proc0 or anything vm_fork'd off of
	 * proc0, but should unconditionally exist in other processes.
	 */
	if (fdp->fd_rdir) {
		newfdp->fd_rdir = fdp->fd_rdir;
		vref(newfdp->fd_rdir);
		newfdp->fd_nrdir = cache_hold(fdp->fd_nrdir);
	}
	if (fdp->fd_jdir) {
		newfdp->fd_jdir = fdp->fd_jdir;
		vref(newfdp->fd_jdir);
		newfdp->fd_njdir = cache_hold(fdp->fd_njdir);
	}
	spin_unlock_rd(&fdp->fd_spin);

	/* Create the file descriptor table. */
	newfdp->fd_refcnt = 1;
	/*
	 * NOTE(review): 'cmask' is a file-scope/global here, not a copy of
	 * fdp->fd_cmask -- confirm that inheriting the global default umask
	 * rather than the parent's is intended.
	 */
	newfdp->fd_cmask = cmask;
	newfdp->fd_files = newfdp->fd_builtin_files;
	newfdp->fd_nfiles = NDFILE;
	newfdp->fd_knlistsize = -1;	/* knote list not yet active */
	newfdp->fd_lastfile = -1;
	spin_init(&newfdp->fd_spin);

	return (newfdp);
}

/*
 * Share a filedesc structure.
 *
 * MPSAFE
 */
struct filedesc *
fdshare(struct proc *p)
{
	struct filedesc *fdp;

	fdp = p->p_fd;
	spin_lock_wr(&fdp->fd_spin);
	fdp->fd_refcnt++;
	spin_unlock_wr(&fdp->fd_spin);
	return (fdp);
}

/*
 * Copy a filedesc structure.
 *
 * MPSAFE
 */
struct filedesc *
fdcopy(struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct filedesc *newfdp;
	struct fdnode *fdnode;
	int i;
	int ni;

	/*
	 * Certain daemons might not have file descriptors.
	 */
	if (fdp == NULL)
		return (NULL);

	/*
	 * Allocate the new filedesc and fd_files[] array.  This can race
	 * with operations by other threads on the fdp so we have to be
	 * careful.
	 */
	newfdp = malloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK | M_ZERO);
again:
	spin_lock_rd(&fdp->fd_spin);
	if (fdp->fd_lastfile < NDFILE) {
		newfdp->fd_files = newfdp->fd_builtin_files;
		i = NDFILE;
	} else {
		/*
		 * We have to allocate (N^2-1) entries for our in-place
		 * binary tree.  Allow the table to shrink.
		 */
		i = fdp->fd_nfiles;
		ni = (i - 1) / 2;
		while (ni > fdp->fd_lastfile && ni > NDFILE) {
			i = ni;
			ni = (i - 1) / 2;
		}
		/* drop the lock across the blocking allocation */
		spin_unlock_rd(&fdp->fd_spin);
		newfdp->fd_files = malloc(i * sizeof(struct fdnode),
					  M_FILEDESC, M_WAITOK | M_ZERO);

		/*
		 * Check for race, retry
		 */
		spin_lock_rd(&fdp->fd_spin);
		if (i <= fdp->fd_lastfile) {
			spin_unlock_rd(&fdp->fd_spin);
			free(newfdp->fd_files, M_FILEDESC);
			goto again;
		}
	}

	/*
	 * Dup the remaining fields.  vref() and cache_hold() can be
	 * safely called while holding the read spinlock on fdp.
	 *
	 * The read spinlock on fdp is still being held.
	 *
	 * NOTE: vref and cache_hold calls for the case where the vnode
	 * or cache entry already has at least one ref may be called
	 * while holding spin locks.
	 */
	if ((newfdp->fd_cdir = fdp->fd_cdir) != NULL) {
		vref(newfdp->fd_cdir);
		newfdp->fd_ncdir = cache_hold(fdp->fd_ncdir);
	}
	/*
	 * We must check for fd_rdir here, at least for now because
	 * the init process is created before we have access to the
	 * rootvode to take a reference to it.
	 */
	if ((newfdp->fd_rdir = fdp->fd_rdir) != NULL) {
		vref(newfdp->fd_rdir);
		newfdp->fd_nrdir = cache_hold(fdp->fd_nrdir);
	}
	if ((newfdp->fd_jdir = fdp->fd_jdir) != NULL) {
		vref(newfdp->fd_jdir);
		newfdp->fd_njdir = cache_hold(fdp->fd_njdir);
	}
	newfdp->fd_refcnt = 1;
	newfdp->fd_nfiles = i;
	newfdp->fd_lastfile = fdp->fd_lastfile;
	newfdp->fd_freefile = fdp->fd_freefile;
	newfdp->fd_cmask = fdp->fd_cmask;
	newfdp->fd_knlist = NULL;
	newfdp->fd_knlistsize = -1;
	newfdp->fd_knhash = NULL;
	newfdp->fd_knhashmask = 0;
	spin_init(&newfdp->fd_spin);

	/*
	 * Copy the descriptor table through (i).  This also copies the
	 * allocation state.   Then go through and ref the file pointers
	 * and clean up any KQ descriptors.
	 *
	 * kq descriptors cannot be copied.  Since we haven't ref'd the
	 * copied files yet we can ignore the return value from funsetfd().
	 *
	 * The read spinlock on fdp is still being held.
	 */
	bcopy(fdp->fd_files, newfdp->fd_files, i * sizeof(struct fdnode));
	for (i = 0 ; i < newfdp->fd_nfiles; ++i) {
		fdnode = &newfdp->fd_files[i];
		if (fdnode->reserved) {
			/* in-flight reservations do not carry over */
			fdreserve_locked(newfdp, i, -1);
			fdnode->reserved = 0;
			fdfixup_locked(newfdp, i);
		} else if (fdnode->fp) {
			if (fdnode->fp->f_type == DTYPE_KQUEUE) {
				(void)funsetfd_locked(newfdp, i);
			} else {
				fhold(fdnode->fp);
			}
		}
	}
	spin_unlock_rd(&fdp->fd_spin);
	return (newfdp);
}

/*
 * Release a filedesc structure.
 *
 * Drops one reference on the process's filedesc; on the last reference
 * all open files are closed and the structure is freed.  Also tears
 * down the filedesc_to_leader tracking used for POSIX advisory locks.
 *
 * NOT MPSAFE (MPSAFE for refs > 1, but the final cleanup code is not MPSAFE)
 */
void
fdfree(struct proc *p)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	struct fdnode *fdnode;
	int i;
	struct filedesc_to_leader *fdtol;
	struct file *fp;
	struct vnode *vp;
	struct flock lf;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return;

	/*
	 * Severe messing around to follow
	 */
	spin_lock_wr(&fdp->fd_spin);

	/* Check for special need to clear POSIX style locks */
	fdtol = p->p_fdtol;
	if (fdtol != NULL) {
		KASSERT(fdtol->fdl_refcount > 0,
			("filedesc_to_refcount botch: fdl_refcount=%d",
			 fdtol->fdl_refcount));
		if (fdtol->fdl_refcount == 1 &&
		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
			/*
			 * Release all POSIX locks held via vnode-backed
			 * descriptors.  The spinlock is dropped around the
			 * blocking VOP_ADVLOCK; fhold keeps fp alive.
			 */
			for (i = 0; i <= fdp->fd_lastfile; ++i) {
				fdnode = &fdp->fd_files[i];
				if (fdnode->fp == NULL ||
				    fdnode->fp->f_type != DTYPE_VNODE) {
					continue;
				}
				fp = fdnode->fp;
				fhold(fp);
				spin_unlock_wr(&fdp->fd_spin);

				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = (struct vnode *)fp->f_data;
				(void) VOP_ADVLOCK(vp,
						   (caddr_t)p->p_leader,
						   F_UNLCK,
						   &lf,
						   F_POSIX);
				fdrop(fp);
				spin_lock_wr(&fdp->fd_spin);
			}
		}
	retry:
		if (fdtol->fdl_refcount == 1) {
			if (fdp->fd_holdleaderscount > 0 &&
			    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
				/*
				 * close() or do_dup() has cleared a reference
				 * in a shared file descriptor table.
				 */
				fdp->fd_holdleaderswakeup = 1;
				msleep(&fdp->fd_holdleaderscount,
				       &fdp->fd_spin, 0, "fdlhold", 0);
				goto retry;
			}
			if (fdtol->fdl_holdcount > 0) {
				/*
				 * Ensure that fdtol->fdl_leader
				 * remains valid in closef().
				 */
				fdtol->fdl_wakeup = 1;
				msleep(fdtol, &fdp->fd_spin, 0, "fdlhold", 0);
				goto retry;
			}
		}
		fdtol->fdl_refcount--;
		if (fdtol->fdl_refcount == 0 &&
		    fdtol->fdl_holdcount == 0) {
			/* unlink from the circular leader list for freeing */
			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
		} else {
			fdtol = NULL;	/* still referenced; do not free */
		}
		p->p_fdtol = NULL;
		if (fdtol != NULL) {
			spin_unlock_wr(&fdp->fd_spin);
			free(fdtol, M_FILEDESC_TO_LEADER);
			spin_lock_wr(&fdp->fd_spin);
		}
	}
	if (--fdp->fd_refcnt > 0) {
		spin_unlock_wr(&fdp->fd_spin);
		return;
	}
	spin_unlock_wr(&fdp->fd_spin);

	/*
	 * we are the last reference to the structure, we can
	 * safely assume it will not change out from under us.
	 */
	for (i = 0; i <= fdp->fd_lastfile; ++i) {
		if (fdp->fd_files[i].fp)
			closef(fdp->fd_files[i].fp, td);
	}
	if (fdp->fd_files != fdp->fd_builtin_files)
		free(fdp->fd_files, M_FILEDESC);
	if (fdp->fd_cdir) {
		cache_drop(fdp->fd_ncdir);
		vrele(fdp->fd_cdir);
	}
	if (fdp->fd_rdir) {
		cache_drop(fdp->fd_nrdir);
		vrele(fdp->fd_rdir);
	}
	if (fdp->fd_jdir) {
		cache_drop(fdp->fd_njdir);
		vrele(fdp->fd_jdir);
	}
	if (fdp->fd_knlist)
		free(fdp->fd_knlist, M_KQUEUE);
	if (fdp->fd_knhash)
		free(fdp->fd_knhash, M_KQUEUE);
	free(fdp, M_FILEDESC);
}

/*
 * Retrieve and reference the file pointer associated with a descriptor.
 *
 * Returns the referenced file pointer, or NULL if fd is invalid or the
 * file's f_flag does not intersect 'flag' (pass -1 to skip the check).
 *
 * MPSAFE
 */
struct file *
holdfp(struct filedesc *fdp, int fd, int flag)
{
	struct file* fp;

	spin_lock_rd(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		fp = NULL;
		goto done;
	}
	if ((fp = fdp->fd_files[fd].fp) == NULL)
		goto done;
	if ((fp->f_flag & flag) == 0 && flag != -1) {
		fp = NULL;
		goto done;
	}
	fhold(fp);
done:
	spin_unlock_rd(&fdp->fd_spin);
	return (fp);
}

/*
 * holdsock() - load the struct file pointer associated
 * with a socket into *fpp.  If an error occurs, non-zero
 * will be returned and *fpp will be set to NULL.
 *
 * MPSAFE
 */
int
holdsock(struct filedesc *fdp, int fd, struct file **fpp)
{
	struct file *fp;
	int error;

	spin_lock_rd(&fdp->fd_spin);
	if ((unsigned)fd >= fdp->fd_nfiles) {
		error = EBADF;
		fp = NULL;
		goto done;
	}
	if ((fp = fdp->fd_files[fd].fp) == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_type != DTYPE_SOCKET) {
		error = ENOTSOCK;
		goto done;
	}
	fhold(fp);
	error = 0;
done:
	spin_unlock_rd(&fdp->fd_spin);
	*fpp = fp;
	return (error);
}

/*
 * Convert a user file descriptor to a held file pointer.
 *
 * Only vnode- and fifo-backed descriptors qualify; others return EINVAL.
 *
 * MPSAFE
 */
int
holdvnode(struct filedesc *fdp, int fd, struct file **fpp)
{
	struct file *fp;
	int error;

	spin_lock_rd(&fdp->fd_spin);
	if ((unsigned)fd >= fdp->fd_nfiles) {
		error = EBADF;
		fp = NULL;
		goto done;
	}
	if ((fp = fdp->fd_files[fd].fp) == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
		error = EINVAL;
		goto done;
	}
	fhold(fp);
	error = 0;
done:
	spin_unlock_rd(&fdp->fd_spin);
	*fpp = fp;
	return (error);
}

/*
 * For setugid programs, we don't want people to use that setugidness
 * to generate error messages which write to a file which otherwise would
 * otherwise be off-limits to the process.
 *
 * This is a gross hack to plug the hole.  A better solution would involve
 * a special vop or other form of generalized access control mechanism.  We
 * go ahead and just reject all procfs file systems accesses as dangerous.
 *
 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
 * sufficient.  We also don't check for setugidness since we know we are.
 */
static int
is_unsafe(struct file *fp)
{
	if (fp->f_type == DTYPE_VNODE && 
	    ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS)
		return (1);
	return (0);
}

/*
 * Make this setguid thing safe, if at all possible.
 *
 * Closes any of descriptors 0..2 that refer to an "unsafe" file.
 *
 * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
 */
void
setugidsafety(struct proc *p)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int i;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return;

	/*
	 * note: fdp->fd_files may be reallocated out from under us while
	 * we are blocked in a close.  Be careful!
	 */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		if (i > 2)
			break;
		if (fdp->fd_files[i].fp && is_unsafe(fdp->fd_files[i].fp)) {
			struct file *fp;

			if (i < fdp->fd_knlistsize)
				knote_fdclose(p, i);
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			if ((fp = funsetfd_locked(fdp, i)) != NULL)
				closef(fp, td);
		}
	}
}

/*
 * Close any files on exec?
 *
 * Closes every open descriptor marked UF_EXCLOSE.
 *
 * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
 */
void
fdcloseexec(struct proc *p)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int i;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return;

	/*
	 * We cannot cache fd_files since operations may block and rip
	 * them out from under us.
	 */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		if (fdp->fd_files[i].fp != NULL &&
		    (fdp->fd_files[i].fileflags & UF_EXCLOSE)) {
			struct file *fp;

			if (i < fdp->fd_knlistsize)
				knote_fdclose(p, i);
			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			if ((fp = funsetfd_locked(fdp, i)) != NULL)
				closef(fp, td);
		}
	}
}

/*
 * It is unsafe for set[ug]id processes to be started with file
 * descriptors 0..2 closed, as these descriptors are given implicit
 * significance in the Standard C library.  fdcheckstd() will create a
 * descriptor referencing /dev/null for each of stdin, stdout, and
 * stderr that is not already open.
 *
 * NOT MPSAFE - calls falloc, vn_open, etc
 */
int
fdcheckstd(struct proc *p)
{
	struct nlookupdata nd;
	struct filedesc *fdp;
	struct file *fp;
	register_t retval;
	int i, error, flags, devnull;

	fdp = p->p_fd;
	if (fdp == NULL)
		return (0);
	devnull = -1;
	error = 0;
	for (i = 0; i < 3; i++) {
		if (fdp->fd_files[i].fp != NULL)
			continue;
		if (devnull < 0) {
			/* first hole: open /dev/null and remember its fd */
			if ((error = falloc(p, &fp, &devnull)) != 0)
				break;

			error = nlookup_init(&nd, "/dev/null", UIO_SYSSPACE,
						NLC_FOLLOW|NLC_LOCKVP);
			flags = FREAD | FWRITE;
			if (error == 0)
				error = vn_open(&nd, fp, flags, 0);
			if (error == 0)
				fsetfd(p, fp, devnull);
			else
				fsetfd(p, NULL, devnull); /* clear reservation */
			fdrop(fp);
			nlookup_done(&nd);
			if (error)
				break;
			KKASSERT(i == devnull);
		} else {
			/* later holes: dup the already-open /dev/null fd */
			error = kern_dup(DUP_FIXED, devnull, i, &retval);
			if (error != 0)
				break;
		}
	}
	return (error);
}

/*
 * Internal form of close.
 * Decrement reference count on file structure.
 * Note: td and/or p may be NULL when closing a file
 * that was being passed in a message.
 *
 * MPALMOSTSAFE - acquires mplock for VOP operations
 */
int
closef(struct file *fp, struct thread *td)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;
	struct proc *p;

	if (fp == NULL)
		return (0);
	if (td == NULL) {
		td = curthread;
		p = NULL;	/* allow no proc association */
	} else {
		p = td->td_proc;	/* can also be NULL */
	}
	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (p != NULL && fp->f_type == DTYPE_VNODE &&
	    (((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)
	) {
		get_mplock();
		if ((p->p_leader->p_flag & P_ADVLOCK) != 0) {
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			vp = (struct vnode *)fp->f_data;
			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
					   &lf, F_POSIX);
		}
		fdtol = p->p_fdtol;
		if (fdtol != NULL) {
			/*
			 * Handle special case where file descriptor table
			 * is shared between multiple process leaders.
			 */
			for (fdtol = fdtol->fdl_next;
			     fdtol != p->p_fdtol;
			     fdtol = fdtol->fdl_next) {
				if ((fdtol->fdl_leader->p_flag &
				     P_ADVLOCK) == 0)
					continue;
				/* hold keeps fdl_leader valid across the VOP */
				fdtol->fdl_holdcount++;
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = (struct vnode *)fp->f_data;
				(void) VOP_ADVLOCK(vp,
						   (caddr_t)fdtol->fdl_leader,
						   F_UNLCK, &lf, F_POSIX);
				fdtol->fdl_holdcount--;
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					/* fdfree() is waiting on us */
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
		}
		rel_mplock();
	}
	return (fdrop(fp));
}

/*
 * MPSAFE
 *
 * fhold() can only be called if f_count is already at least 1 (i.e. the
 * caller of fhold() already has a reference to the file pointer in some
 * manner or other).
 *
 * This is a rare case where callers are allowed to hold spinlocks, so
 * we can't ourselves.  Since we are not obtaining the fp spinlock,
 * we have to use an atomic lock to interlock against fdrop().
 */
void
fhold(struct file *fp)
{
	atomic_add_int(&fp->f_count, 1);
}

/*
 * A spinlock is required to handle 1->0 transitions on f_count.
We have 2113 * to use atomic_sub_int so as not to race the atomic_add_int in fhold(). 2114 * 2115 * MPALMOSTSAFE - acquires mplock for final close sequence 2116 */ 2117 int 2118 fdrop(struct file *fp) 2119 { 2120 struct flock lf; 2121 struct vnode *vp; 2122 int error; 2123 2124 spin_lock_wr(&fp->f_spin); 2125 atomic_subtract_int(&fp->f_count, 1); 2126 if (fp->f_count > 0) { 2127 spin_unlock_wr(&fp->f_spin); 2128 return (0); 2129 } 2130 spin_unlock_wr(&fp->f_spin); 2131 2132 get_mplock(); 2133 2134 /* 2135 * The last reference has gone away, we own the fp structure free 2136 * and clear. 2137 */ 2138 if (fp->f_count < 0) 2139 panic("fdrop: count < 0"); 2140 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE && 2141 (((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS) 2142 ) { 2143 lf.l_whence = SEEK_SET; 2144 lf.l_start = 0; 2145 lf.l_len = 0; 2146 lf.l_type = F_UNLCK; 2147 vp = (struct vnode *)fp->f_data; 2148 (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0); 2149 } 2150 if (fp->f_ops != &badfileops) 2151 error = fo_close(fp); 2152 else 2153 error = 0; 2154 ffree(fp); 2155 rel_mplock(); 2156 return (error); 2157 } 2158 2159 /* 2160 * Apply an advisory lock on a file descriptor. 2161 * 2162 * Just attempt to get a record lock of the requested type on 2163 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 
2164 */ 2165 int 2166 sys_flock(struct flock_args *uap) 2167 { 2168 struct proc *p = curproc; 2169 struct file *fp; 2170 struct vnode *vp; 2171 struct flock lf; 2172 int error; 2173 2174 if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL) 2175 return (EBADF); 2176 if (fp->f_type != DTYPE_VNODE) { 2177 error = EOPNOTSUPP; 2178 goto done; 2179 } 2180 vp = (struct vnode *)fp->f_data; 2181 lf.l_whence = SEEK_SET; 2182 lf.l_start = 0; 2183 lf.l_len = 0; 2184 if (uap->how & LOCK_UN) { 2185 lf.l_type = F_UNLCK; 2186 fp->f_flag &= ~FHASLOCK; 2187 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0); 2188 goto done; 2189 } 2190 if (uap->how & LOCK_EX) 2191 lf.l_type = F_WRLCK; 2192 else if (uap->how & LOCK_SH) 2193 lf.l_type = F_RDLCK; 2194 else { 2195 error = EBADF; 2196 goto done; 2197 } 2198 fp->f_flag |= FHASLOCK; 2199 if (uap->how & LOCK_NB) 2200 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 0); 2201 else 2202 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_WAIT); 2203 done: 2204 fdrop(fp); 2205 return (error); 2206 } 2207 2208 /* 2209 * File Descriptor pseudo-device driver (/dev/fd/). 2210 * 2211 * Opening minor device N dup()s the file (if any) connected to file 2212 * descriptor N belonging to the calling process. Note that this driver 2213 * consists of only the ``open()'' routine, because all subsequent 2214 * references to this file will be direct to the other driver. 2215 */ 2216 /* ARGSUSED */ 2217 static int 2218 fdopen(dev_t dev, int mode, int type, struct thread *td) 2219 { 2220 KKASSERT(td->td_lwp != NULL); 2221 2222 /* 2223 * XXX Kludge: set curlwp->lwp_dupfd to contain the value of the 2224 * the file descriptor being sought for duplication. The error 2225 * return ensures that the vnode for this device will be released 2226 * by vn_open. Open will detect this special error and take the 2227 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 2228 * will simply report the error. 
2229 */ 2230 td->td_lwp->lwp_dupfd = minor(dev); 2231 return (ENODEV); 2232 } 2233 2234 /* 2235 * The caller has reserved the file descriptor dfd for us. On success we 2236 * must fsetfd() it. On failure the caller will clean it up. 2237 * 2238 * NOT MPSAFE - isn't getting spinlocks, possibly other things 2239 */ 2240 int 2241 dupfdopen(struct proc *p, int dfd, int sfd, int mode, int error) 2242 { 2243 struct filedesc *fdp = p->p_fd; 2244 struct file *wfp; 2245 struct file *xfp; 2246 2247 if ((wfp = holdfp(fdp, sfd, -1)) == NULL) 2248 return (EBADF); 2249 2250 /* 2251 * There are two cases of interest here. 2252 * 2253 * For ENODEV simply dup sfd to file descriptor dfd and return. 2254 * 2255 * For ENXIO steal away the file structure from sfd and store it 2256 * dfd. sfd is effectively closed by this operation. 2257 * 2258 * Any other error code is just returned. 2259 */ 2260 switch (error) { 2261 case ENODEV: 2262 /* 2263 * Check that the mode the file is being opened for is a 2264 * subset of the mode of the existing descriptor. 2265 */ 2266 if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) 2267 return (EACCES); 2268 fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags; 2269 fsetfd(p, wfp, dfd); 2270 error = 0; 2271 break; 2272 case ENXIO: 2273 /* 2274 * Steal away the file pointer from dfd, and stuff it into indx. 2275 */ 2276 fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags; 2277 fsetfd(p, wfp, dfd); 2278 if ((xfp = funsetfd_locked(fdp, sfd)) != NULL) 2279 fdrop(xfp); 2280 KKASSERT(xfp == wfp); /* XXX MP RACE */ 2281 error = 0; 2282 break; 2283 default: 2284 break; 2285 } 2286 fdrop(wfp); 2287 return (error); 2288 } 2289 2290 /* 2291 * NOT MPSAFE - I think these refer to a common file descriptor table 2292 * and we need to spinlock that to link fdtol in. 
2293 */ 2294 struct filedesc_to_leader * 2295 filedesc_to_leader_alloc(struct filedesc_to_leader *old, 2296 struct proc *leader) 2297 { 2298 struct filedesc_to_leader *fdtol; 2299 2300 fdtol = malloc(sizeof(struct filedesc_to_leader), 2301 M_FILEDESC_TO_LEADER, M_WAITOK); 2302 fdtol->fdl_refcount = 1; 2303 fdtol->fdl_holdcount = 0; 2304 fdtol->fdl_wakeup = 0; 2305 fdtol->fdl_leader = leader; 2306 if (old != NULL) { 2307 fdtol->fdl_next = old->fdl_next; 2308 fdtol->fdl_prev = old; 2309 old->fdl_next = fdtol; 2310 fdtol->fdl_next->fdl_prev = fdtol; 2311 } else { 2312 fdtol->fdl_next = fdtol; 2313 fdtol->fdl_prev = fdtol; 2314 } 2315 return fdtol; 2316 } 2317 2318 /* 2319 * Scan all file pointers in the system. The callback is made with 2320 * both the master list spinlock held and the fp spinlock held, 2321 * both exclusively. 2322 * 2323 * MPSAFE 2324 * 2325 * WARNING: both the filehead spinlock and the file pointer spinlock are 2326 * held exclusively when the callback is made. The file pointer is not 2327 * referenced. 2328 */ 2329 void 2330 allfiles_scan_exclusive(int (*callback)(struct file *, void *), void *data) 2331 { 2332 struct file *fp; 2333 int res; 2334 2335 spin_lock_wr(&filehead_spin); 2336 LIST_FOREACH(fp, &filehead, f_list) { 2337 spin_lock_wr(&fp->f_spin); 2338 res = callback(fp, data); 2339 spin_unlock_wr(&fp->f_spin); 2340 if (res < 0) 2341 break; 2342 } 2343 spin_unlock_wr(&filehead_spin); 2344 } 2345 2346 /* 2347 * Get file structures. 
2348 * 2349 * NOT MPSAFE - process list scan, SYSCTL_OUT (probably not mpsafe) 2350 */ 2351 2352 struct sysctl_kern_file_info { 2353 int count; 2354 int error; 2355 struct sysctl_req *req; 2356 }; 2357 2358 static int sysctl_kern_file_callback(struct proc *p, void *data); 2359 2360 static int 2361 sysctl_kern_file(SYSCTL_HANDLER_ARGS) 2362 { 2363 struct sysctl_kern_file_info info; 2364 2365 /* 2366 * Note: because the number of file descriptors is calculated 2367 * in different ways for sizing vs returning the data, 2368 * there is information leakage from the first loop. However, 2369 * it is of a similar order of magnitude to the leakage from 2370 * global system statistics such as kern.openfiles. 2371 * 2372 * When just doing a count, note that we cannot just count 2373 * the elements and add f_count via the filehead list because 2374 * threaded processes share their descriptor table and f_count might 2375 * still be '1' in that case. 2376 * 2377 * Since the SYSCTL op can block, we must hold the process to 2378 * prevent it being ripped out from under us either in the 2379 * file descriptor loop or in the greater LIST_FOREACH. The 2380 * process may be in varying states of disrepair. If the process 2381 * is in SZOMB we may have caught it just as it is being removed 2382 * from the allproc list, we must skip it in that case to maintain 2383 * an unbroken chain through the allproc list. 2384 */ 2385 info.count = 0; 2386 info.error = 0; 2387 info.req = req; 2388 allproc_scan(sysctl_kern_file_callback, &info); 2389 2390 /* 2391 * When just calculating the size, overestimate a bit to try to 2392 * prevent system activity from causing the buffer-fill call 2393 * to fail later on. 
2394 */ 2395 if (req->oldptr == NULL) { 2396 info.count = (info.count + 16) + (info.count / 10); 2397 info.error = SYSCTL_OUT(req, NULL, 2398 info.count * sizeof(struct kinfo_file)); 2399 } 2400 return (info.error); 2401 } 2402 2403 static int 2404 sysctl_kern_file_callback(struct proc *p, void *data) 2405 { 2406 struct sysctl_kern_file_info *info = data; 2407 struct kinfo_file kf; 2408 struct filedesc *fdp; 2409 struct file *fp; 2410 uid_t uid; 2411 int n; 2412 2413 if (p->p_stat == SIDL || (p->p_flag & P_ZOMBIE)) 2414 return(0); 2415 if (!PRISON_CHECK(info->req->td->td_proc->p_ucred, p->p_ucred) != 0) 2416 return(0); 2417 if ((fdp = p->p_fd) == NULL) 2418 return(0); 2419 spin_lock_rd(&fdp->fd_spin); 2420 for (n = 0; n < fdp->fd_nfiles; ++n) { 2421 if ((fp = fdp->fd_files[n].fp) == NULL) 2422 continue; 2423 if (info->req->oldptr == NULL) { 2424 ++info->count; 2425 } else { 2426 uid = p->p_ucred ? p->p_ucred->cr_uid : -1; 2427 kcore_make_file(&kf, fp, p->p_pid, uid, n); 2428 spin_unlock_rd(&fdp->fd_spin); 2429 info->error = SYSCTL_OUT(info->req, &kf, sizeof(kf)); 2430 spin_lock_rd(&fdp->fd_spin); 2431 if (info->error) 2432 break; 2433 } 2434 } 2435 spin_unlock_rd(&fdp->fd_spin); 2436 if (info->error) 2437 return(-1); 2438 return(0); 2439 } 2440 2441 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 2442 0, 0, sysctl_kern_file, "S,file", "Entire file table"); 2443 2444 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, 2445 &maxfilesperproc, 0, "Maximum files allowed open per process"); 2446 2447 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, 2448 &maxfiles, 0, "Maximum number of files"); 2449 2450 SYSCTL_INT(_kern, OID_AUTO, maxfilesrootres, CTLFLAG_RW, 2451 &maxfilesrootres, 0, "Descriptors reserved for root use"); 2452 2453 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, 2454 &nfiles, 0, "System-wide number of open files"); 2455 2456 static void 2457 fildesc_drvinit(void *unused) 2458 { 2459 int fd; 2460 2461 
cdevsw_add(&fildesc_cdevsw, 0, 0); 2462 for (fd = 0; fd < NUMFDESC; fd++) { 2463 make_dev(&fildesc_cdevsw, fd, 2464 UID_BIN, GID_BIN, 0666, "fd/%d", fd); 2465 } 2466 make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "stdin"); 2467 make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "stdout"); 2468 make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "stderr"); 2469 } 2470 2471 /* 2472 * MPSAFE 2473 */ 2474 struct fileops badfileops = { 2475 NULL, /* port */ 2476 NULL, /* clone */ 2477 badfo_readwrite, 2478 badfo_readwrite, 2479 badfo_ioctl, 2480 badfo_poll, 2481 badfo_kqfilter, 2482 badfo_stat, 2483 badfo_close, 2484 badfo_shutdown 2485 }; 2486 2487 /* 2488 * MPSAFE 2489 */ 2490 static int 2491 badfo_readwrite( 2492 struct file *fp, 2493 struct uio *uio, 2494 struct ucred *cred, 2495 int flags 2496 ) { 2497 return (EBADF); 2498 } 2499 2500 /* 2501 * MPSAFE 2502 */ 2503 static int 2504 badfo_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *cred) 2505 { 2506 return (EBADF); 2507 } 2508 2509 /* 2510 * MPSAFE 2511 */ 2512 static int 2513 badfo_poll(struct file *fp, int events, struct ucred *cred) 2514 { 2515 return (0); 2516 } 2517 2518 /* 2519 * MPSAFE 2520 */ 2521 static int 2522 badfo_kqfilter(struct file *fp, struct knote *kn) 2523 { 2524 return (0); 2525 } 2526 2527 static int 2528 badfo_stat(struct file *fp, struct stat *sb, struct ucred *cred) 2529 { 2530 return (EBADF); 2531 } 2532 2533 /* 2534 * MPSAFE 2535 */ 2536 static int 2537 badfo_close(struct file *fp) 2538 { 2539 return (EBADF); 2540 } 2541 2542 /* 2543 * MPSAFE 2544 */ 2545 static int 2546 badfo_shutdown(struct file *fp, int how) 2547 { 2548 return (EBADF); 2549 } 2550 2551 /* 2552 * MPSAFE 2553 */ 2554 int 2555 nofo_shutdown(struct file *fp, int how) 2556 { 2557 return (EOPNOTSUPP); 2558 } 2559 2560 SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, 2561 fildesc_drvinit,NULL) 2562