/*
 * Copyright (c) 2005-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey Hsu and Matthew Dillon.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
 * $FreeBSD: src/sys/kern/kern_descrip.c,v 1.81.2.19 2004/02/28 00:43:31 tegge Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/stat.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/event.h>
#include <sys/kern_syscall.h>
#include <sys/kcore.h>
#include <sys/kinfo.h>
#include <sys/un.h>
#include <sys/objcache.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>

#include <sys/file2.h>
#include <sys/spinlock2.h>

static int fdalloc_locked(struct proc *p, struct filedesc *fdp,
			int want, int *result);
static void fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd);
static void fdreserve_locked(struct filedesc *fdp, int fd0, int incr);
static struct file *funsetfd_locked(struct filedesc *fdp, int fd);
static void ffree(struct file *fp);

static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
		     "file desc to leader structures");
MALLOC_DEFINE(M_FILE, "file", "Open file structure");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

static struct krate krate_uidinfo = { .freq = 1 };

static d_open_t fdopen;
#define NUMFDESC 64

#define CDEV_MAJOR 22
static struct dev_ops fildesc_ops = {
	{ "FD", 0, 0 },
	.d_open =	fdopen,
};

/*
 * Descriptor management.
 */
#ifndef NFILELIST_HEADS
#define NFILELIST_HEADS		257	/* prime number */
#endif

struct filelist_head {
	struct spinlock		spin;
	struct filelist		list;
} __cachealign;

static struct filelist_head	filelist_heads[NFILELIST_HEADS];

static int nfiles;		/* actual number of open files */
extern int cmask;

struct lwkt_token revoke_token = LWKT_TOKEN_INITIALIZER(revoke_token);

static struct objcache		*file_objcache;

static struct objcache_malloc_args file_malloc_args = {
	.objsize	= sizeof(struct file),
	.mtype		= M_FILE
};
/*
 * Fixup fd_freefile and fd_lastfile after a descriptor has been cleared.
 *
 * Must be called with fdp->fd_spin exclusively held.
 */
static __inline
void
fdfixup_locked(struct filedesc *fdp, int fd)
{
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}
	while (fdp->fd_lastfile >= 0 &&
	       fdp->fd_files[fdp->fd_lastfile].fp == NULL &&
	       fdp->fd_files[fdp->fd_lastfile].reserved == 0
	) {
		--fdp->fd_lastfile;
	}
}

/*
 * Clear the fd thread caches for this fdnode.
 *
 * If match_fdc is NULL, all thread caches of fdn will be cleared.
 * The caller must hold fdp->fd_spin exclusively.  The threads caching
 * the descriptor do not have to be the current thread.  The (status)
 * argument is ignored.
 *
 * If match_fdc is not NULL, only the match_fdc's cache will be cleared.
 * The caller must hold fdp->fd_spin shared and match_fdc must match a
 * fdcache entry in curthread.  match_fdc has been locked by the caller
 * and had the specified (status).
 *
 * Since we are matching against a fp in the fdp (which must still be present
 * at this time), fp will have at least two refs on any match and we can
 * decrement the count trivially.
 */
static
void
fclearcache(struct fdnode *fdn, struct fdcache *match_fdc, int status)
{
	struct fdcache *fdc;
	struct file *fp;
	int i;

	/*
	 * match_fdc == NULL	We are cleaning out all tdcache entries
	 *			for the fdn and hold fdp->fd_spin exclusively.
	 *			This can race against the target threads
	 *			cleaning out specific entries.
	 *
	 * match_fdc != NULL	We are cleaning out a specific tdcache
	 *			entry on behalf of the owning thread
	 *			and hold fdp->fd_spin shared.  The thread
	 *			has already locked the entry.  This cannot
	 *			race.
	 */
	fp = fdn->fp;
	for (i = 0; i < NTDCACHEFD; ++i) {
		if ((fdc = fdn->tdcache[i]) == NULL)
			continue;

		/*
		 * If match_fdc is non-NULL we are being asked to
		 * clear a specific fdc owned by curthread.  There must
		 * be exactly one match.  The caller has already locked
		 * the cache entry and will dispose of the lock after
		 * we return.
		 *
		 * Since we also have a shared lock on fdp, we
		 * can do this without atomic ops.
		 */
		if (match_fdc) {
			if (fdc != match_fdc)
				continue;
			fdn->tdcache[i] = NULL;
			KASSERT(fp == fdc->fp,
				("fclearcache(1): fp mismatch %p/%p\n",
				fp, fdc->fp));
			fdc->fp = NULL;
			fdc->fd = -1;

			/*
			 * status can be 0 or 2.  If 2 the ref is borrowed,
			 * if 0 the ref is not borrowed and we have to drop
			 * it.
			 */
			if (status == 0)
				atomic_add_int(&fp->f_count, -1);
			fdn->isfull = 0;	/* heuristic */
			return;
		}

		/*
		 * Otherwise we hold an exclusive spin-lock and can only
		 * race thread consumers borrowing cache entries.
		 *
		 * Acquire the lock and dispose of the entry.  We have to
		 * spin until we get the lock.
		 */
		for (;;) {
			status = atomic_swap_int(&fdc->locked, 1);
			if (status == 1) {	/* foreign lock, retry */
				cpu_pause();
				continue;
			}
			fdn->tdcache[i] = NULL;
			KASSERT(fp == fdc->fp,
				("fclearcache(2): fp mismatch %p/%p\n",
				fp, fdc->fp));
			fdc->fp = NULL;
			fdc->fd = -1;
			if (status == 0)
				atomic_add_int(&fp->f_count, -1);
			fdn->isfull = 0;	/* heuristic */
			atomic_swap_int(&fdc->locked, 0);
			break;
		}
	}
	KKASSERT(match_fdc == NULL);
}
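
/*
 * Example (sketch, not part of the original source): the fdcache
 * 'locked' field acts as a tiny three-state lock.  0 means unlocked
 * with no borrowed ref, 2 means unlocked with the ref borrowed by the
 * owning thread, and 1 means transiently locked.  A hypothetical
 * consumer takes and restores the state like this:
 *
 *	int status = atomic_swap_int(&fdc->locked, 1);
 *	if (status == 1)
 *		return;				// foreign lock, skip entry
 *	// ... inspect or update fdc->fd / fdc->fp ...
 *	atomic_swap_int(&fdc->locked, status);	// restore 0 or 2
 */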
/*
 * Retrieve the fp for the specified fd given the specified file descriptor
 * table.  The fdp does not have to be owned by the current process.
 * If flags != -1, fp->f_flag must contain at least one of the flags.
 *
 * This function is not able to cache the fp.
 */
struct file *
holdfp_fdp(struct filedesc *fdp, int fd, int flag)
{
	struct file *fp;

	spin_lock_shared(&fdp->fd_spin);
	if (((u_int)fd) < fdp->fd_nfiles) {
		fp = fdp->fd_files[fd].fp;	/* can be NULL */
		if (fp) {
			if ((fp->f_flag & flag) == 0 && flag != -1) {
				fp = NULL;
			} else {
				fhold(fp);
			}
		}
	} else {
		fp = NULL;
	}
	spin_unlock_shared(&fdp->fd_spin);

	return fp;
}

struct file *
holdfp_fdp_locked(struct filedesc *fdp, int fd, int flag)
{
	struct file *fp;

	if (((u_int)fd) < fdp->fd_nfiles) {
		fp = fdp->fd_files[fd].fp;	/* can be NULL */
		if (fp) {
			if ((fp->f_flag & flag) == 0 && flag != -1) {
				fp = NULL;
			} else {
				fhold(fp);
			}
		}
	} else {
		fp = NULL;
	}
	return fp;
}

/*
 * Acquire the fp for the specified file descriptor, using the thread
 * cache if possible and caching it if possible.
 *
 * td must be the current thread.
 */
static
struct file *
_holdfp_cache(thread_t td, int fd)
{
	struct filedesc *fdp;
	struct fdcache *fdc;
	struct fdcache *best;
	struct fdnode *fdn;
	struct file *fp;
	int status;
	int delta;
	int i;

	/*
	 * Fast path
	 */
	for (fdc = &td->td_fdcache[0]; fdc < &td->td_fdcache[NFDCACHE]; ++fdc) {
		if (fdc->fd != fd || fdc->fp == NULL)
			continue;
		status = atomic_swap_int(&fdc->locked, 1);

		/*
		 * If someone else has locked our cache entry they are in
		 * the middle of clearing it, skip the entry.
		 */
		if (status == 1)
			continue;

		/*
		 * We have locked the entry, but if it no longer matches
		 * restore the previous state (0 or 2) and skip the entry.
		 */
		if (fdc->fd != fd || fdc->fp == NULL) {
			atomic_swap_int(&fdc->locked, status);
			continue;
		}

		/*
		 * We have locked a valid entry.  We can borrow the ref
		 * for a mode 0 entry.  We can get a valid fp for a mode
		 * 2 entry but not borrow the ref.
		 */
		if (status == 0) {
			fp = fdc->fp;
			fdc->lru = ++td->td_fdcache_lru;
			atomic_swap_int(&fdc->locked, 2);

			return fp;
		}
		if (status == 2) {
			fp = fdc->fp;
			fhold(fp);
			fdc->lru = ++td->td_fdcache_lru;
			atomic_swap_int(&fdc->locked, 2);

			return fp;
		}
		KKASSERT(0);
	}

	/*
	 * Lookup the descriptor the slow way.  This can contend against
	 * modifying operations in a multi-threaded environment and cause
	 * cache line ping ponging otherwise.
	 */
	fdp = td->td_proc->p_fd;
	spin_lock_shared(&fdp->fd_spin);

	if (((u_int)fd) < fdp->fd_nfiles) {
		fp = fdp->fd_files[fd].fp;	/* can be NULL */
		if (fp) {
			fhold(fp);
			if (fdp->fd_files[fd].isfull == 0)
				goto enter;
		}
	} else {
		fp = NULL;
	}
	spin_unlock_shared(&fdp->fd_spin);

	return fp;

	/*
	 * We found a valid fp and held it, fdp is still shared locked.
	 * Enter the fp into the per-thread cache.  Find the oldest entry
	 * via lru, or an empty entry.
	 *
	 * Because fdp's spinlock is held (shared is fine), no other
	 * thread should be in the middle of clearing our selected entry.
	 */
enter:
	best = &td->td_fdcache[0];
	for (fdc = &td->td_fdcache[0]; fdc < &td->td_fdcache[NFDCACHE]; ++fdc) {
		if (fdc->fp == NULL) {
			best = fdc;
			break;
		}
		delta = fdc->lru - best->lru;
		if (delta < 0)
			best = fdc;
	}

	/*
	 * Replace best
	 *
	 * Don't enter into the cache if we cannot get the lock.
	 */
	status = atomic_swap_int(&best->locked, 1);
	if (status == 1)
		goto done;

	/*
	 * Clear the previous cache entry if present
	 */
	if (best->fp) {
		KKASSERT(best->fd >= 0);
		fclearcache(&fdp->fd_files[best->fd], best, status);
	}

	/*
	 * Create our new cache entry.  This entry is 'safe' until we tie
	 * into the fdnode.  If we cannot tie in, we will clear the entry.
	 */
	best->fd = fd;
	best->fp = fp;
	best->lru = ++td->td_fdcache_lru;
	best->locked = 2;			/* borrowed ref */

	fdn = &fdp->fd_files[fd];
	for (i = 0; i < NTDCACHEFD; ++i) {
		if (fdn->tdcache[i] == NULL &&
		    atomic_cmpset_ptr((void **)&fdn->tdcache[i], NULL, best)) {
			goto done;
		}
	}
	fdn->isfull = 1;			/* no space */
	best->fd = -1;
	best->fp = NULL;
	best->locked = 0;
done:
	spin_unlock_shared(&fdp->fd_spin);

	return fp;
}

/*
 * Drop the file pointer and return to the thread cache if possible.
 *
 * Caller must not hold fdp's spin lock.
 * td must be the current thread.
 */
void
dropfp(thread_t td, int fd, struct file *fp)
{
	struct filedesc *fdp;
	struct fdcache *fdc;
	int status;

	fdp = td->td_proc->p_fd;

	/*
	 * If our placeholder is still present we can re-cache the ref.
	 *
	 * Note that we can race an fclearcache().
	 */
	for (fdc = &td->td_fdcache[0]; fdc < &td->td_fdcache[NFDCACHE]; ++fdc) {
		if (fdc->fp != fp || fdc->fd != fd)
			continue;
		status = atomic_swap_int(&fdc->locked, 1);
		switch(status) {
		case 0:
			/*
			 * Not in mode 2, fdrop fp without caching.
			 */
			atomic_swap_int(&fdc->locked, 0);
			break;
		case 1:
			/*
			 * Not in mode 2, locked by someone else.
			 * fdrop fp without caching.
			 */
			break;
		case 2:
			/*
			 * Intact borrowed ref, return to mode 0
			 * indicating that we have returned the ref.
			 *
			 * Return the borrowed ref (2->1->0)
			 */
			if (fdc->fp == fp && fdc->fd == fd) {
				atomic_swap_int(&fdc->locked, 0);
				return;
			}
			atomic_swap_int(&fdc->locked, 2);
			break;
		}
	}

	/*
	 * Failed to re-cache, drop the fp without caching.
	 */
	fdrop(fp);
}
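
/*
 * Example (sketch, not in the original file): a typical consumer pairs
 * the cached hold with dropfp() so a borrowed reference can be returned
 * to the per-thread cache instead of bouncing fp->f_count.  Hypothetical
 * kernel-context usage, assuming holdfp() resolves through
 * _holdfp_cache() as elsewhere in this file:
 *
 *	struct file *fp;
 *
 *	if ((fp = holdfp(curthread, fd, FREAD)) == NULL)
 *		return (EBADF);
 *	// ... operate on fp ...
 *	dropfp(curthread, fd, fp);
 */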
/*
 * Clear all descriptors cached in the per-thread fd cache for
 * the specified thread.
 *
 * Caller must not hold p_fd->spin.  This function will temporarily
 * obtain a shared spin lock.
 */
void
fexitcache(thread_t td)
{
	struct filedesc *fdp;
	struct fdcache *fdc;
	int status;
	int i;

	if (td->td_proc == NULL)
		return;
	fdp = td->td_proc->p_fd;
	if (fdp == NULL)
		return;

	/*
	 * A shared lock is sufficient as the caller controls td and we
	 * are only clearing td's cache.
	 */
	spin_lock_shared(&fdp->fd_spin);
	for (i = 0; i < NFDCACHE; ++i) {
		fdc = &td->td_fdcache[i];
		if (fdc->fp) {
			status = atomic_swap_int(&fdc->locked, 1);
			if (status == 1) {
				cpu_pause();
				--i;
				continue;
			}
			if (fdc->fp) {
				KKASSERT(fdc->fd >= 0);
				fclearcache(&fdp->fd_files[fdc->fd], fdc,
					    status);
			}
			atomic_swap_int(&fdc->locked, 0);
		}
	}
	spin_unlock_shared(&fdp->fd_spin);
}

static __inline struct filelist_head *
fp2filelist(const struct file *fp)
{
	u_int i;

	i = (u_int)(uintptr_t)fp % NFILELIST_HEADS;
	return &filelist_heads[i];
}

static __inline
struct plimit *
readplimits(struct proc *p)
{
	thread_t td = curthread;
	struct plimit *limit;

	limit = td->td_limit;
	if (limit != p->p_limit) {
		spin_lock_shared(&p->p_spin);
		limit = p->p_limit;
		atomic_add_int(&limit->p_refcnt, 1);
		spin_unlock_shared(&p->p_spin);
		if (td->td_limit)
			plimit_free(td->td_limit);
		td->td_limit = limit;
	}
	return limit;
}

/*
 * System calls on descriptors.
 */
int
sys_getdtablesize(struct getdtablesize_args *uap)
{
	struct proc *p = curproc;
	struct plimit *limit = readplimits(p);
	int dtsize;

	if (limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
		dtsize = INT_MAX;
	else
		dtsize = (int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur;

	if (dtsize > maxfilesperproc)
		dtsize = maxfilesperproc;
	if (dtsize < minfilesperproc)
		dtsize = minfilesperproc;
	if (p->p_ucred->cr_uid && dtsize > maxfilesperuser)
		dtsize = maxfilesperuser;
	uap->sysmsg_result = dtsize;

	return (0);
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * NOTE: keep in mind that a potential race condition exists when closing
 *	 descriptors from a shared descriptor table (via rfork).
 */
int
sys_dup2(struct dup2_args *uap)
{
	int error;
	int fd = 0;

	error = kern_dup(DUP_FIXED, uap->from, uap->to, &fd);
	uap->sysmsg_fds[0] = fd;

	return (error);
}

/*
 * Duplicate a file descriptor.
 */
int
sys_dup(struct dup_args *uap)
{
	int error;
	int fd = 0;

	error = kern_dup(DUP_VARIABLE, uap->fd, 0, &fd);
	uap->sysmsg_fds[0] = fd;

	return (error);
}
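
/*
 * Example (sketch, not in the original file): both syscalls are thin
 * wrappers over kern_dup().  A hypothetical in-kernel caller wanting
 * dup2(5, 10) semantics plus close-on-exec would write:
 *
 *	int newfd;
 *	int error;
 *
 *	error = kern_dup(DUP_FIXED | DUP_CLOEXEC, 5, 10, &newfd);
 *	// on success newfd == 10 and UF_EXCLOSE is set on it
 */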
/*
 * MPALMOSTSAFE - acquires mplock for fp operations
 */
int
kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct vnode *vp;
	u_int newmin;
	u_int oflags;
	u_int nflags;
	int closedcounter;
	int tmp, error, flg = F_POSIX;

	KKASSERT(p);

	/*
	 * Operations on file descriptors that do not require a file pointer.
	 */
	switch (cmd) {
	case F_GETFD:
		error = fgetfdflags(p->p_fd, fd, &tmp);
		if (error == 0)
			dat->fc_cloexec = (tmp & UF_EXCLOSE) ? FD_CLOEXEC : 0;
		return (error);

	case F_SETFD:
		if (dat->fc_cloexec & FD_CLOEXEC)
			error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		else
			error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		return (error);
	case F_DUPFD:
		newmin = dat->fc_fd;
		error = kern_dup(DUP_VARIABLE | DUP_FCNTL, fd, newmin,
				 &dat->fc_fd);
		return (error);
	case F_DUPFD_CLOEXEC:
		newmin = dat->fc_fd;
		error = kern_dup(DUP_VARIABLE | DUP_CLOEXEC | DUP_FCNTL,
				 fd, newmin, &dat->fc_fd);
		return (error);
	case F_DUP2FD:
		newmin = dat->fc_fd;
		error = kern_dup(DUP_FIXED, fd, newmin, &dat->fc_fd);
		return (error);
	case F_DUP2FD_CLOEXEC:
		newmin = dat->fc_fd;
		error = kern_dup(DUP_FIXED | DUP_CLOEXEC, fd, newmin,
				 &dat->fc_fd);
		return (error);
	default:
		break;
	}

	/*
	 * Operations on file pointers
	 */
	closedcounter = p->p_fd->fd_closedcounter;
	if ((fp = holdfp(td, fd, -1)) == NULL)
		return (EBADF);

	switch (cmd) {
	case F_GETFL:
		dat->fc_flags = OFLAGS(fp->f_flag);
		error = 0;
		break;

	case F_SETFL:
		oflags = fp->f_flag;
		nflags = FFLAGS(dat->fc_flags & ~O_ACCMODE) & FCNTLFLAGS;
		nflags |= oflags & ~FCNTLFLAGS;

		error = 0;
		if (((nflags ^ oflags) & O_APPEND) && (oflags & FAPPENDONLY))
			error = EINVAL;
		if (error == 0 && ((nflags ^ oflags) & FASYNC)) {
			tmp = nflags & FASYNC;
			error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp,
					 cred, NULL);
		}

		/*
		 * If no error, must be atomically set.
		 */
		while (error == 0) {
			oflags = fp->f_flag;
			cpu_ccfence();
			nflags = (oflags & ~FCNTLFLAGS) | (nflags & FCNTLFLAGS);
			if (atomic_cmpset_int(&fp->f_flag, oflags, nflags))
				break;
			cpu_pause();
		}
		break;

	case F_GETOWN:
		error = fo_ioctl(fp, FIOGETOWN, (caddr_t)&dat->fc_owner,
				 cred, NULL);
		break;

	case F_SETOWN:
		error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&dat->fc_owner,
				 cred, NULL);
		break;

	case F_SETLKW:
		flg |= F_WAIT;
		/* Fall into F_SETLK */

	case F_SETLK:
		if (fp->f_type != DTYPE_VNODE) {
			error = EBADF;
			break;
		}
		vp = (struct vnode *)fp->f_data;

		/*
		 * copyin/lockop may block
		 */
		if (dat->fc_flock.l_whence == SEEK_CUR)
			dat->fc_flock.l_start += fp->f_offset;

		switch (dat->fc_flock.l_type) {
		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0) {
				error = EBADF;
				break;
			}
			if (p->p_leader->p_advlock_flag == 0)
				p->p_leader->p_advlock_flag = 1;
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
					    &dat->fc_flock, flg);
			break;
		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0) {
				error = EBADF;
				break;
			}
			if (p->p_leader->p_advlock_flag == 0)
				p->p_leader->p_advlock_flag = 1;
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
					    &dat->fc_flock, flg);
			break;
		case F_UNLCK:
			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
					    &dat->fc_flock, F_POSIX);
			break;
		default:
			error = EINVAL;
			break;
		}

		/*
		 * It is possible to race a close() on the descriptor while
		 * we were blocked getting the lock.  If this occurs the
		 * close might not have caught the lock.
		 */
		if (checkfdclosed(td, p->p_fd, fd, fp, closedcounter)) {
			dat->fc_flock.l_whence = SEEK_SET;
			dat->fc_flock.l_start = 0;
			dat->fc_flock.l_len = 0;
			dat->fc_flock.l_type = F_UNLCK;
			VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
				    F_UNLCK, &dat->fc_flock, F_POSIX);
		}
		break;

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE) {
			error = EBADF;
			break;
		}
		vp = (struct vnode *)fp->f_data;
		/*
		 * copyin/lockop may block
		 */
		if (dat->fc_flock.l_type != F_RDLCK &&
		    dat->fc_flock.l_type != F_WRLCK &&
		    dat->fc_flock.l_type != F_UNLCK) {
			error = EINVAL;
			break;
		}
		if (dat->fc_flock.l_whence == SEEK_CUR)
			dat->fc_flock.l_start += fp->f_offset;
		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK,
				    &dat->fc_flock, F_POSIX);
		break;
	default:
		error = EINVAL;
		break;
	}

	fdrop(fp);
	return (error);
}
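
/*
 * Example (sketch, not in the original file): the F_SETFL path above
 * updates fp->f_flag without holding a lock by looping on a
 * compare-and-swap, a pattern reusable for any shared flag word.
 * 'wanted' is a hypothetical pre-masked set of FCNTLFLAGS bits:
 *
 *	u_int oflags, nflags;
 *
 *	for (;;) {
 *		oflags = fp->f_flag;
 *		cpu_ccfence();			// prevent refetch of oflags
 *		nflags = (oflags & ~FCNTLFLAGS) | wanted;
 *		if (atomic_cmpset_int(&fp->f_flag, oflags, nflags))
 *			break;			// no intervening writer
 *		cpu_pause();			// lost the race, retry
 *	}
 */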
/*
 * The file control system call.
 */
int
sys_fcntl(struct fcntl_args *uap)
{
	union fcntl_dat dat;
	int error;

	switch (uap->cmd) {
	case F_DUPFD:
	case F_DUP2FD:
	case F_DUPFD_CLOEXEC:
	case F_DUP2FD_CLOEXEC:
		dat.fc_fd = uap->arg;
		break;
	case F_SETFD:
		dat.fc_cloexec = uap->arg;
		break;
	case F_SETFL:
		dat.fc_flags = uap->arg;
		break;
	case F_SETOWN:
		dat.fc_owner = uap->arg;
		break;
	case F_SETLKW:
	case F_SETLK:
	case F_GETLK:
		error = copyin((caddr_t)uap->arg, &dat.fc_flock,
			       sizeof(struct flock));
		if (error)
			return (error);
		break;
	}

	error = kern_fcntl(uap->fd, uap->cmd, &dat, curthread->td_ucred);

	if (error == 0) {
		switch (uap->cmd) {
		case F_DUPFD:
		case F_DUP2FD:
		case F_DUPFD_CLOEXEC:
		case F_DUP2FD_CLOEXEC:
			uap->sysmsg_result = dat.fc_fd;
			break;
		case F_GETFD:
			uap->sysmsg_result = dat.fc_cloexec;
			break;
		case F_GETFL:
			uap->sysmsg_result = dat.fc_flags;
			break;
		case F_GETOWN:
			uap->sysmsg_result = dat.fc_owner;
			break;
		case F_GETLK:
			error = copyout(&dat.fc_flock, (caddr_t)uap->arg,
					sizeof(struct flock));
			break;
		}
	}

	return (error);
}

/*
 * Common code for dup, dup2, and fcntl(F_DUPFD).
 *
 * There are four type flags: DUP_FCNTL, DUP_FIXED, DUP_VARIABLE, and
 * DUP_CLOEXEC.
 *
 * DUP_FCNTL is for handling EINVAL vs. EBADF differences between
 * fcntl()'s F_DUPFD and F_DUPFD_CLOEXEC and dup2() (per POSIX).
 * The next two flags are mutually exclusive, and the fourth is optional.
 * DUP_FIXED tells kern_dup() to destructively dup over an existing file
 * descriptor if "new" is already open.  DUP_VARIABLE tells kern_dup()
 * to find the lowest unused file descriptor that is greater than or
 * equal to "new".  DUP_CLOEXEC, which works with either of the first
 * two flags, sets the close-on-exec flag on the "new" file descriptor.
 */
int
kern_dup(int flags, int old, int new, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct plimit *limit = readplimits(p);
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct file *delfp;
	int oldflags;
	int holdleaders;
	int dtsize;
	int error, newfd;

	/*
	 * Verify that we have a valid descriptor to dup from and
	 * possibly to dup to.  When the new descriptor is out of
	 * bounds, fcntl()'s F_DUPFD and F_DUPFD_CLOEXEC must
	 * return EINVAL, while dup2() returns EBADF in this case.
	 *
	 * NOTE: maxfilesperuser is not applicable to dup()
	 */
retry:
	if (limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
		dtsize = INT_MAX;
	else
		dtsize = (int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur;
	if (dtsize > maxfilesperproc)
		dtsize = maxfilesperproc;
	if (dtsize < minfilesperproc)
		dtsize = minfilesperproc;

	if (new < 0 || new > dtsize)
		return (flags & DUP_FCNTL ? EINVAL : EBADF);

	spin_lock(&fdp->fd_spin);
	if ((unsigned)old >= fdp->fd_nfiles || fdp->fd_files[old].fp == NULL) {
		spin_unlock(&fdp->fd_spin);
		return (EBADF);
	}
	if ((flags & DUP_FIXED) && old == new) {
		*res = new;
		if (flags & DUP_CLOEXEC)
			fdp->fd_files[new].fileflags |= UF_EXCLOSE;
		spin_unlock(&fdp->fd_spin);
		return (0);
	}
	fp = fdp->fd_files[old].fp;
	oldflags = fdp->fd_files[old].fileflags;
	fhold(fp);

	/*
	 * Allocate a new descriptor if DUP_VARIABLE, or expand the table
	 * if the requested descriptor is beyond the current table size.
	 *
	 * This can block.  Retry if the source descriptor no longer matches
	 * or if our expectation in the expansion case races.
	 *
	 * If we are not expanding or allocating a new descriptor, then reset
	 * the target descriptor to a reserved state so we have a uniform
	 * setup for the next code block.
	 */
	if ((flags & DUP_VARIABLE) || new >= fdp->fd_nfiles) {
		error = fdalloc_locked(p, fdp, new, &newfd);
		if (error) {
			spin_unlock(&fdp->fd_spin);
			fdrop(fp);
			return (error);
		}
		/*
		 * Check for ripout
		 */
		if (old >= fdp->fd_nfiles || fdp->fd_files[old].fp != fp) {
			fsetfd_locked(fdp, NULL, newfd);
			spin_unlock(&fdp->fd_spin);
			fdrop(fp);
			goto retry;
		}
		/*
		 * Check for expansion race
		 */
		if ((flags & DUP_VARIABLE) == 0 && new != newfd) {
			fsetfd_locked(fdp, NULL, newfd);
			spin_unlock(&fdp->fd_spin);
			fdrop(fp);
			goto retry;
		}
		/*
		 * Check for ripout, newfd reused old (this case probably
		 * can't occur).
		 */
		if (old == newfd) {
			fsetfd_locked(fdp, NULL, newfd);
			spin_unlock(&fdp->fd_spin);
			fdrop(fp);
			goto retry;
		}
		new = newfd;
		delfp = NULL;
	} else {
		if (fdp->fd_files[new].reserved) {
			spin_unlock(&fdp->fd_spin);
			fdrop(fp);
			kprintf("Warning: dup(): target descriptor %d is "
				"reserved, waiting for it to be resolved\n",
				new);
			tsleep(fdp, 0, "fdres", hz);
			goto retry;
		}

		/*
		 * If the target descriptor was never allocated we have
		 * to allocate it.  If it was we have to clean out the
		 * old descriptor.  delfp inherits the ref from the
		 * descriptor table.
		 */
		++fdp->fd_closedcounter;
		fclearcache(&fdp->fd_files[new], NULL, 0);
		++fdp->fd_closedcounter;
		delfp = fdp->fd_files[new].fp;
		fdp->fd_files[new].fp = NULL;
		fdp->fd_files[new].reserved = 1;
		if (delfp == NULL) {
			fdreserve_locked(fdp, new, 1);
			if (new > fdp->fd_lastfile)
				fdp->fd_lastfile = new;
		}
	}

	/*
	 * NOTE: still holding an exclusive spinlock
	 */

	/*
	 * If a descriptor is being overwritten we may have to tell
	 * fdfree() to sleep to ensure that all relevant process
	 * leaders can be traversed in closef().
	 */
	if (delfp != NULL && p->p_fdtol != NULL) {
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	} else {
		holdleaders = 0;
	}
	KASSERT(delfp == NULL || (flags & DUP_FIXED),
		("dup() picked an open file"));

	/*
	 * Duplicate the source descriptor, update lastfile.  If the new
	 * descriptor was not allocated and we aren't replacing an existing
	 * descriptor we have to mark the descriptor as being in use.
	 *
	 * The fd_files[] array inherits fp's hold reference.
	 */
	fsetfd_locked(fdp, fp, new);
	if ((flags & DUP_CLOEXEC) != 0)
		fdp->fd_files[new].fileflags = oldflags | UF_EXCLOSE;
	else
		fdp->fd_files[new].fileflags = oldflags & ~UF_EXCLOSE;
	spin_unlock(&fdp->fd_spin);
	fdrop(fp);
	*res = new;

	/*
	 * If we dup'd over a valid file, we now own the reference to it
	 * and must dispose of it using closef() semantics (as if a
	 * close() were performed on it).
	 */
	if (delfp) {
		if (SLIST_FIRST(&delfp->f_klist))
			knote_fdclose(delfp, fdp, new);
		closef(delfp, p);
		if (holdleaders) {
			spin_lock(&fdp->fd_spin);
			fdp->fd_holdleaderscount--;
			if (fdp->fd_holdleaderscount == 0 &&
			    fdp->fd_holdleaderswakeup != 0) {
				fdp->fd_holdleaderswakeup = 0;
				spin_unlock(&fdp->fd_spin);
				wakeup(&fdp->fd_holdleaderscount);
			} else {
				spin_unlock(&fdp->fd_spin);
			}
		}
	}
	return (0);
}
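
/*
 * Example (sketch, not in the original file): DUP_FCNTL only changes
 * the out-of-range error code, per POSIX.  With a descriptor limit of
 * 'dtsize', hypothetical outcomes would be:
 *
 *	kern_dup(DUP_VARIABLE | DUP_FCNTL, fd, dtsize + 1, &res);
 *						// -> EINVAL (fcntl F_DUPFD)
 *	kern_dup(DUP_FIXED, fd, dtsize + 1, &res);
 *						// -> EBADF  (dup2)
 */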
/*
 * If sigio is on the list associated with a process or process group,
 * disable signalling from the device, remove sigio from the list and
 * free sigio.
 */
void
funsetown(struct sigio **sigiop)
{
	struct pgrp *pgrp;
	struct proc *p;
	struct sigio *sigio;

	if ((sigio = *sigiop) != NULL) {
		lwkt_gettoken(&sigio_token);	/* protect sigio */
		KKASSERT(sigiop == sigio->sio_myref);
		sigio = *sigiop;
		*sigiop = NULL;
		lwkt_reltoken(&sigio_token);
	}
	if (sigio == NULL)
		return;

	if (sigio->sio_pgid < 0) {
		pgrp = sigio->sio_pgrp;
		sigio->sio_pgrp = NULL;
		lwkt_gettoken(&pgrp->pg_token);
		SLIST_REMOVE(&pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio);
		lwkt_reltoken(&pgrp->pg_token);
		pgrel(pgrp);
	} else /* if ((*sigiop)->sio_pgid > 0) */ {
		p = sigio->sio_proc;
		sigio->sio_proc = NULL;
		PHOLD(p);
		lwkt_gettoken(&p->p_token);
		SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
		lwkt_reltoken(&p->p_token);
		PRELE(p);
	}
	crfree(sigio->sio_ucred);
	sigio->sio_ucred = NULL;
	kfree(sigio, M_SIGIO);
}

/*
 * Free a list of sigio structures.  Caller is responsible for ensuring
 * that the list is MPSAFE.
 */
void
funsetownlst(struct sigiolst *sigiolst)
{
	struct sigio *sigio;

	while ((sigio = SLIST_FIRST(sigiolst)) != NULL)
		funsetown(sigio->sio_myref);
}

/*
 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 *
 * After permission checking, add a sigio structure to the sigio list for
 * the process or process group.
 */
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
	struct proc *proc = NULL;
	struct pgrp *pgrp = NULL;
	struct sigio *sigio;
	int error;

	if (pgid == 0) {
		funsetown(sigiop);
		return (0);
	}

	if (pgid > 0) {
		proc = pfind(pgid);
		if (proc == NULL) {
			error = ESRCH;
			goto done;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (proc->p_session != curproc->p_session) {
			error = EPERM;
			goto done;
		}
	} else /* if (pgid < 0) */ {
		pgrp = pgfind(-pgid);
		if (pgrp == NULL) {
			error = ESRCH;
			goto done;
		}

		/*
		 * Policy - Don't allow a process to FSETOWN a process
		 * in another session.
		 *
		 * Remove this test to allow maximum flexibility or
		 * restrict FSETOWN to the current process or process
		 * group for maximum safety.
		 */
		if (pgrp->pg_session != curproc->p_session) {
			error = EPERM;
			goto done;
		}
	}
	sigio = kmalloc(sizeof(struct sigio), M_SIGIO, M_WAITOK | M_ZERO);
	if (pgid > 0) {
		KKASSERT(pgrp == NULL);
		lwkt_gettoken(&proc->p_token);
		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
		sigio->sio_proc = proc;
		lwkt_reltoken(&proc->p_token);
	} else {
		KKASSERT(proc == NULL);
		lwkt_gettoken(&pgrp->pg_token);
		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
		sigio->sio_pgrp = pgrp;
		lwkt_reltoken(&pgrp->pg_token);
		pgrp = NULL;
	}
	sigio->sio_pgid = pgid;
	sigio->sio_ucred = crhold(curthread->td_ucred);
	/* It would be convenient if p_ruid was in ucred. */
	sigio->sio_ruid = sigio->sio_ucred->cr_ruid;
	sigio->sio_myref = sigiop;

	lwkt_gettoken(&sigio_token);
	while (*sigiop)
		funsetown(sigiop);
	*sigiop = sigio;
	lwkt_reltoken(&sigio_token);
	error = 0;
done:
	if (pgrp)
		pgrel(pgrp);
	if (proc)
		PRELE(proc);
	return (error);
}

/*
 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 */
pid_t
fgetown(struct sigio **sigiop)
{
	struct sigio *sigio;
	pid_t own;

	lwkt_gettoken_shared(&sigio_token);
	sigio = *sigiop;
	own = (sigio != NULL ? sigio->sio_pgid : 0);
	lwkt_reltoken(&sigio_token);

	return (own);
}
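
/*
 * Example (sketch, not in the original file): the pgid argument encodes
 * both targets in its sign, matching fcntl(F_SETOWN) semantics.
 * Hypothetical calls against a driver's sigio pointer 'sc->sc_sigio':
 *
 *	fsetown(pid, &sc->sc_sigio);	// deliver SIGIO to process 'pid'
 *	fsetown(-pgid, &sc->sc_sigio);	// deliver to process group 'pgid'
 *	fsetown(0, &sc->sc_sigio);	// equivalent to funsetown()
 */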
/*
 * Close many file descriptors.
 */
int
sys_closefrom(struct closefrom_args *uap)
{
	return(kern_closefrom(uap->fd));
}

/*
 * Close all file descriptors greater than or equal to fd.
 */
int
kern_closefrom(int fd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp;
	int error;
	int e2;

	KKASSERT(p);
	fdp = p->p_fd;

	if (fd < 0)
		return (EINVAL);

	/*
	 * NOTE: This function will skip unassociated descriptors and
	 *	 reserved descriptors that have not yet been assigned.
	 *	 fd_lastfile can change as a side effect of kern_close().
	 *
	 * NOTE: We accumulate EINTR errors and return EINTR if any
	 *	 close() returned EINTR.  However, the descriptor is
	 *	 still closed and we do not break out of the loop.
	 */
	error = 0;
	spin_lock(&fdp->fd_spin);
	while (fd <= fdp->fd_lastfile) {
		if (fdp->fd_files[fd].fp != NULL) {
			spin_unlock(&fdp->fd_spin);
			/* ok if this races another close */
			e2 = kern_close(fd);
			if (e2 == EINTR)
				error = EINTR;
			spin_lock(&fdp->fd_spin);
		}
		++fd;
	}
	spin_unlock(&fdp->fd_spin);

	return error;
}

/*
 * Close a file descriptor.
 */
int
sys_close(struct close_args *uap)
{
	return(kern_close(uap->fd));
}

/*
 * close() helper
 */
int
kern_close(int fd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp;
	struct file *fp;
	int error;
	int holdleaders;

	KKASSERT(p);
	fdp = p->p_fd;

	/*
	 * funsetfd*() also clears the fd cache
	 */
	spin_lock(&fdp->fd_spin);
	if ((fp = funsetfd_locked(fdp, fd)) == NULL) {
		spin_unlock(&fdp->fd_spin);
		return (EBADF);
	}
	holdleaders = 0;
	if (p->p_fdtol != NULL) {
		/*
		 * Ask fdfree() to sleep to ensure that all relevant
		 * process leaders can be traversed in closef().
		 */
		fdp->fd_holdleaderscount++;
		holdleaders = 1;
	}

	/*
	 * We now hold the fp reference that used to be owned by the
	 * descriptor array.
	 */
	spin_unlock(&fdp->fd_spin);
	if (SLIST_FIRST(&fp->f_klist))
		knote_fdclose(fp, fdp, fd);
	error = closef(fp, p);
	if (holdleaders) {
		spin_lock(&fdp->fd_spin);
		fdp->fd_holdleaderscount--;
		if (fdp->fd_holdleaderscount == 0 &&
		    fdp->fd_holdleaderswakeup != 0) {
			fdp->fd_holdleaderswakeup = 0;
			spin_unlock(&fdp->fd_spin);
			wakeup(&fdp->fd_holdleaderscount);
		} else {
			spin_unlock(&fdp->fd_spin);
		}
	}
	return (error);
}

/*
 * shutdown_args(int fd, int how)
 */
int
kern_shutdown(int fd, int how)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	if ((fp = holdfp(td, fd, -1)) == NULL)
		return (EBADF);
	error = fo_shutdown(fp, how);
	fdrop(fp);

	return (error);
}

/*
 * MPALMOSTSAFE
 */
int
sys_shutdown(struct shutdown_args *uap)
{
	int error;

	error = kern_shutdown(uap->s, uap->how);

	return (error);
}

/*
 * fstat() helper
 */
int
kern_fstat(int fd, struct stat *ub)
{
	struct thread *td = curthread;
	struct file *fp;
	int error;

	if ((fp = holdfp(td, fd, -1)) == NULL)
		return (EBADF);
	error = fo_stat(fp, ub, td->td_ucred);
	fdrop(fp);

	return (error);
}

/*
 * Return status information about a file descriptor.
 */
int
sys_fstat(struct fstat_args *uap)
{
	struct stat st;
	int error;

	error = kern_fstat(uap->fd, &st);

	if (error == 0)
		error = copyout(&st, uap->sb, sizeof(st));
	return (error);
}

/*
 * Return pathconf information about a file descriptor.
 *
 * MPALMOSTSAFE
 */
int
sys_fpathconf(struct fpathconf_args *uap)
{
	struct thread *td = curthread;
	struct file *fp;
	struct vnode *vp;
	int error = 0;

	if ((fp = holdfp(td, uap->fd, -1)) == NULL)
		return (EBADF);

	switch (fp->f_type) {
	case DTYPE_PIPE:
	case DTYPE_SOCKET:
		if (uap->name != _PC_PIPE_BUF) {
			error = EINVAL;
		} else {
			uap->sysmsg_result = PIPE_BUF;
			error = 0;
		}
		break;
	case DTYPE_FIFO:
	case DTYPE_VNODE:
		vp = (struct vnode *)fp->f_data;
		error = VOP_PATHCONF(vp, uap->name, &uap->sysmsg_reg);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	fdrop(fp);
	return(error);
}

/*
 * Grow the file table so it can hold through descriptor (want).
 *
 * The fdp's spinlock must be held exclusively on entry and may be held
 * exclusively on return.  The spinlock may be cycled by the routine.
 */
static void
fdgrow_locked(struct filedesc *fdp, int want)
{
	struct fdnode *newfiles;
	struct fdnode *oldfiles;
	int nf, extra;

	nf = fdp->fd_nfiles;
	do {
		/* nf has to be of the form 2^n - 1 */
		nf = 2 * nf + 1;
	} while (nf <= want);

	spin_unlock(&fdp->fd_spin);
	newfiles = kmalloc(nf * sizeof(struct fdnode), M_FILEDESC, M_WAITOK);
	spin_lock(&fdp->fd_spin);

	/*
	 * We could have raced another extend while we were not holding
	 * the spinlock.
	 */
	if (fdp->fd_nfiles >= nf) {
		spin_unlock(&fdp->fd_spin);
		kfree(newfiles, M_FILEDESC);
		spin_lock(&fdp->fd_spin);
		return;
	}
	/*
	 * Copy the existing ofile and ofileflags arrays
	 * and zero the new portion of each array.
	 */
	extra = nf - fdp->fd_nfiles;
	bcopy(fdp->fd_files, newfiles, fdp->fd_nfiles * sizeof(struct fdnode));
	bzero(&newfiles[fdp->fd_nfiles], extra * sizeof(struct fdnode));

	oldfiles = fdp->fd_files;
	fdp->fd_files = newfiles;
	fdp->fd_nfiles = nf;

	if (oldfiles != fdp->fd_builtin_files) {
		spin_unlock(&fdp->fd_spin);
		kfree(oldfiles, M_FILEDESC);
		spin_lock(&fdp->fd_spin);
	}
}

/*
 * Number of nodes in right subtree, including the root.
 */
static __inline int
right_subtree_size(int n)
{
	return (n ^ (n | (n + 1)));
}

/*
 * Bigger ancestor.
 */
static __inline int
right_ancestor(int n)
{
	return (n | (n + 1));
}

/*
 * Smaller ancestor.
 */
static __inline int
left_ancestor(int n)
{
	return ((n & (n + 1)) - 1);
}

/*
 * Traverse the in-place binary tree bottom-up adjusting the allocation
 * count so scans can determine where free descriptors are located.
 *
 * Caller must be holding an exclusive spinlock on fdp.
 */
static
void
fdreserve_locked(struct filedesc *fdp, int fd, int incr)
{
	while (fd >= 0) {
		fdp->fd_files[fd].allocated += incr;
		KKASSERT(fdp->fd_files[fd].allocated >= 0);
		fd = left_ancestor(fd);
	}
}
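
/*
 * Example (sketch, not in the original file): worked values for the
 * in-place binary tree helpers above, taking fd 5 (binary 101):
 *
 *	right_ancestor(5)     = 5 | 6       = 7
 *	left_ancestor(5)      = (5 & 6) - 1 = 3
 *	left_ancestor(3)      = (3 & 4) - 1 = -1   (terminates the walk)
 *	right_subtree_size(5) = 5 ^ (5 | 6) = 2
 *
 * So fdreserve_locked(fdp, 5, 1) bumps the 'allocated' count at nodes
 * 5 and 3, which is exactly the path fdalloc_locked() consults when
 * skipping full subtrees.
 */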
/*
 * Reserve a file descriptor for the process.  If no error occurs, the
 * caller MUST at some point call fsetfd() or assign a file pointer
 * or dispose of the reservation.
 */
static
int
fdalloc_locked(struct proc *p, struct filedesc *fdp, int want, int *result)
{
	struct plimit *limit = readplimits(p);
	struct uidinfo *uip;
	int fd, rsize, rsum, node, lim;

	/*
	 * Check dtable size limit
	 */
	*result = -1;	/* avoid gcc warnings */
	if (limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
		lim = INT_MAX;
	else
		lim = (int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur;

	if (lim > maxfilesperproc)
		lim = maxfilesperproc;
	if (lim < minfilesperproc)
		lim = minfilesperproc;
	if (want >= lim)
		return (EMFILE);

	/*
	 * Check that the user has not run out of descriptors (non-root only).
	 * As a safety measure the dtable is allowed to have at least
	 * minfilesperproc open fds regardless of the maxfilesperuser limit.
	 *
	 * This isn't as loose a spec as ui_posixlocks, so we use atomic
	 * ops to force synchronize and recheck if we would otherwise
	 * error.
	 */
	if (p->p_ucred->cr_uid && fdp->fd_nfiles >= minfilesperproc) {
		uip = p->p_ucred->cr_uidinfo;
		if (uip->ui_openfiles > maxfilesperuser) {
			int n;
			int count;

			count = 0;
			for (n = 0; n < ncpus; ++n) {
				count += atomic_swap_int(
					    &uip->ui_pcpu[n].pu_openfiles, 0);
			}
			atomic_add_int(&uip->ui_openfiles, count);
			if (uip->ui_openfiles > maxfilesperuser) {
				krateprintf(&krate_uidinfo,
					    "Warning: user %d pid %d (%s) "
					    "ran out of file descriptors "
					    "(%d/%d)\n",
					    p->p_ucred->cr_uid, (int)p->p_pid,
					    p->p_comm,
					    uip->ui_openfiles,
					    maxfilesperuser);
				return(ENFILE);
			}
		}
	}

	/*
	 * Grow the dtable if necessary
	 */
	if (want >= fdp->fd_nfiles)
		fdgrow_locked(fdp, want);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 *
	 * NOTE! the 'allocated' field is a cumulative recursive allocation
	 * count.  If we happen to see a value of 0 then we can shortcut
	 * our search.  Otherwise we run through the tree going
	 * down branches we know have free descriptor(s) until we hit a
	 * leaf node.  The leaf node will be free but will not necessarily
	 * have an allocated field of 0.
	 */
retry:
	/* move up the tree looking for a subtree with a free node */
	for (fd = max(want, fdp->fd_freefile); fd < min(fdp->fd_nfiles, lim);
	     fd = right_ancestor(fd)) {
		if (fdp->fd_files[fd].allocated == 0)
			goto found;

		rsize = right_subtree_size(fd);
		if (fdp->fd_files[fd].allocated == rsize)
			continue;	/* right subtree full */

		/*
		 * Free fd is in the right subtree of the tree rooted at fd.
		 * Call that subtree R.  Look for the smallest (leftmost)
		 * subtree of R with an unallocated fd: continue moving
		 * down the left branch until encountering a full left
		 * subtree, then move to the right.
		 */
		for (rsum = 0, rsize /= 2; rsize > 0; rsize /= 2) {
			node = fd + rsize;
			rsum += fdp->fd_files[node].allocated;
			if (fdp->fd_files[fd].allocated == rsum + rsize) {
				fd = node;	/* move to the right */
				if (fdp->fd_files[node].allocated == 0)
					goto found;
				rsum = 0;
			}
		}
		goto found;
	}

	/*
	 * No space in current array.  Expand?
	 */
	if (fdp->fd_nfiles >= lim) {
		return (EMFILE);
	}
	fdgrow_locked(fdp, want);
	goto retry;

found:
	KKASSERT(fd < fdp->fd_nfiles);
	if (fd > fdp->fd_lastfile)
		fdp->fd_lastfile = fd;
	if (want <= fdp->fd_freefile)
		fdp->fd_freefile = fd;
	*result = fd;
	KKASSERT(fdp->fd_files[fd].fp == NULL);
	KKASSERT(fdp->fd_files[fd].reserved == 0);
	fdp->fd_files[fd].fileflags = 0;
	fdp->fd_files[fd].reserved = 1;
	fdreserve_locked(fdp, fd, 1);

	return (0);
}

int
fdalloc(struct proc *p, int want, int *result)
{
	struct filedesc *fdp = p->p_fd;
	int error;

	spin_lock(&fdp->fd_spin);
	error = fdalloc_locked(p, fdp, want, result);
	spin_unlock(&fdp->fd_spin);

	return error;
}

/*
 * Check to see whether n user file descriptors
 * are available to the process p.
 */
int
fdavail(struct proc *p, int n)
{
	struct plimit *limit = readplimits(p);
	struct filedesc *fdp = p->p_fd;
	struct fdnode *fdnode;
	int i, lim, last;

	if (limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
		lim = INT_MAX;
	else
		lim = (int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur;

	if (lim > maxfilesperproc)
		lim = maxfilesperproc;
	if (lim < minfilesperproc)
		lim = minfilesperproc;

	spin_lock(&fdp->fd_spin);
	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) {
		spin_unlock(&fdp->fd_spin);
		return (1);
	}
	last = min(fdp->fd_nfiles, lim);
	fdnode = &fdp->fd_files[fdp->fd_freefile];
	for (i = last - fdp->fd_freefile; --i >= 0; ++fdnode) {
		if (fdnode->fp == NULL && --n <= 0) {
			spin_unlock(&fdp->fd_spin);
			return (1);
		}
	}
	spin_unlock(&fdp->fd_spin);
	return (0);
}

/*
 * Revoke open descriptors referencing (f_data, f_type)
 *
 * Any revoke executed within a prison is only able to
 * revoke descriptors for processes within that prison.
 *
 * Returns 0 on success or an error code.
 */
struct fdrevoke_info {
	void *data;
	short type;
	short unused;
	int found;
	struct ucred *cred;
	struct file *nfp;
};

static int fdrevoke_check_callback(struct file *fp, void *vinfo);
static int fdrevoke_proc_callback(struct proc *p, void *vinfo);

int
fdrevoke(void *f_data, short f_type, struct ucred *cred)
{
	struct fdrevoke_info info;
	int error;

	bzero(&info, sizeof(info));
	info.data = f_data;
	info.type = f_type;
	info.cred = cred;
	error = falloc(NULL, &info.nfp, NULL);
	if (error)
		return (error);

	/*
	 * Scan the file pointer table once.  dups do not dup file pointers,
	 * only descriptors, so there is no leak.  Set FREVOKED on the fps
	 * being revoked.
	 *
	 * Any fps sent over unix-domain sockets will be revoked by the
	 * socket code checking for FREVOKED when the fps are externalized.
	 * revoke_token is used to make sure that fps marked FREVOKED and
	 * externalized will be picked up by the following allproc_scan().
	 */
	lwkt_gettoken(&revoke_token);
	allfiles_scan_exclusive(fdrevoke_check_callback, &info);
	lwkt_reltoken(&revoke_token);

	/*
	 * If any fps were marked, track down the related descriptors
	 * and close them.  Any dup()s at this point will notice
	 * the FREVOKED already set in the fp and do the right thing.
	 */
	if (info.found)
		allproc_scan(fdrevoke_proc_callback, &info, 0);
	fdrop(info.nfp);
	return(0);
}

/*
 * Locate matching file pointers directly.
 *
 * WARNING: allfiles_scan_exclusive() holds a spinlock through these calls!
 */
static int
fdrevoke_check_callback(struct file *fp, void *vinfo)
{
	struct fdrevoke_info *info = vinfo;

	/*
	 * File pointers already flagged for revocation are skipped.
	 */
	if (fp->f_flag & FREVOKED)
		return(0);

	/*
	 * When revoking from within a prison, file pointers created
	 * outside of that prison, or file pointers without creds,
	 * cannot be revoked.
	 */
	if (info->cred->cr_prison &&
	    (fp->f_cred == NULL ||
	     info->cred->cr_prison != fp->f_cred->cr_prison)) {
		return(0);
	}

	/*
	 * If the file pointer matches then mark it for revocation.  The
	 * flag is currently only used by unp_revoke_gc().
	 *
	 * info->found is a heuristic and can race in a SMP environment.
	 */
	if (info->data == fp->f_data && info->type == fp->f_type) {
		atomic_set_int(&fp->f_flag, FREVOKED);
		info->found = 1;
	}
	return(0);
}

/*
 * Locate matching file pointers via process descriptor tables.
 */
static int
fdrevoke_proc_callback(struct proc *p, void *vinfo)
{
	struct fdrevoke_info *info = vinfo;
	struct filedesc *fdp;
	struct file *fp;
	int n;

	if (p->p_stat == SIDL || p->p_stat == SZOMB)
		return(0);
	if (info->cred->cr_prison &&
	    info->cred->cr_prison != p->p_ucred->cr_prison) {
		return(0);
	}

	/*
	 * If the controlling terminal of the process matches the
	 * vnode being revoked we clear the controlling terminal.
	 *
	 * The normal spec_close() may not catch this because it
	 * uses curproc instead of p.
	 */
	if (p->p_session && info->type == DTYPE_VNODE &&
	    info->data == p->p_session->s_ttyvp) {
		p->p_session->s_ttyvp = NULL;
		vrele(info->data);
	}

	/*
	 * Softref the fdp to prevent it from being destroyed
	 */
	spin_lock(&p->p_spin);
	if ((fdp = p->p_fd) == NULL) {
		spin_unlock(&p->p_spin);
		return(0);
	}
	atomic_add_int(&fdp->fd_softrefs, 1);
	spin_unlock(&p->p_spin);

	/*
	 * Locate and close any matching file descriptors, replacing
	 * them with info->nfp.
	 */
	spin_lock(&fdp->fd_spin);
	for (n = 0; n < fdp->fd_nfiles; ++n) {
		if ((fp = fdp->fd_files[n].fp) == NULL)
			continue;
		if (fp->f_flag & FREVOKED) {
			++fdp->fd_closedcounter;
			fclearcache(&fdp->fd_files[n], NULL, 0);
			++fdp->fd_closedcounter;
			fhold(info->nfp);
			fdp->fd_files[n].fp = info->nfp;
			spin_unlock(&fdp->fd_spin);
			knote_fdclose(fp, fdp, n);	/* XXX */
			closef(fp, p);
			spin_lock(&fdp->fd_spin);
		}
	}
	spin_unlock(&fdp->fd_spin);
	atomic_subtract_int(&fdp->fd_softrefs, 1);
	return(0);
}
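
/*
 * Example (sketch, not in the original file): the falloc()/fsetfd()
 * reservation protocol documented below.  A hypothetical open-style
 * path reserves a descriptor, binds the fp, then drops its own ref:
 *
 *	struct file *fp;
 *	int fd, error;
 *
 *	error = falloc(lp, &fp, &fd);	// fp refcount 1, fd reserved
 *	if (error)
 *		return (error);
 *	// ... initialize fp->f_ops, fp->f_data, fp->f_flag ...
 *	fsetfd(lp->lwp_proc->p_fd, fp, fd);	// table takes its own ref
 *	fdrop(fp);				// drop falloc's ref
 *	return (0);
 */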
/*
 * falloc:
 *	Create a new open file structure and reserve a file descriptor
 *	for the process that refers to it.
 *
 * Root creds are checked using lp, or assumed if lp is NULL.  If
 * resultfd is non-NULL then lp must also be non-NULL.  No file
 * descriptor is reserved (and no process context is needed) if
 * resultfd is NULL.
 *
 * A file pointer with a refcount of 1 is returned.  Note that the
 * file pointer is NOT associated with the descriptor.  If falloc
 * returns success, fsetfd() MUST be called to either associate the
 * file pointer or clear the reservation.
 */
int
falloc(struct lwp *lp, struct file **resultfp, int *resultfd)
{
	static struct timeval lastfail;
	static int curfail;
	struct filelist_head *head;
	struct file *fp;
	struct ucred *cred = lp ? lp->lwp_thread->td_ucred : proc0.p_ucred;
	int error;

	fp = NULL;

	/*
	 * Handle filetable full issues and root overfill.
	 */
	if (nfiles >= maxfiles - maxfilesrootres &&
	    (cred->cr_ruid != 0 || nfiles >= maxfiles)) {
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			kprintf("kern.maxfiles limit exceeded by uid %d, "
				"please see tuning(7).\n",
				cred->cr_ruid);
		}
		error = ENFILE;
		goto done;
	}

	/*
	 * Allocate a new open file structure.
	 */
	fp = objcache_get(file_objcache, M_WAITOK);
	bzero(fp, sizeof(*fp));
	spin_init(&fp->f_spin, "falloc");
	SLIST_INIT(&fp->f_klist);
	fp->f_count = 1;
	fp->f_ops = &badfileops;
	fp->f_seqcount = 1;
	fsetcred(fp, cred);
	atomic_add_int(&nfiles, 1);

	head = fp2filelist(fp);
	spin_lock(&head->spin);
	LIST_INSERT_HEAD(&head->list, fp, f_list);
	spin_unlock(&head->spin);

	if (resultfd) {
		if ((error = fdalloc(lp->lwp_proc, 0, resultfd)) != 0) {
			fdrop(fp);
			fp = NULL;
		}
	} else {
		error = 0;
	}
done:
	*resultfp = fp;
	return (error);
}

/*
 * Check for races against a file descriptor by determining that the
 * file pointer is still associated with the specified file descriptor,
 * and a close is not currently in progress.
 */
int
checkfdclosed(thread_t td, struct filedesc *fdp, int fd, struct file *fp,
	      int closedcounter)
{
	struct fdcache *fdc;
	int error;

	cpu_lfence();
	if (fdp->fd_closedcounter == closedcounter)
		return 0;

	if (td->td_proc && td->td_proc->p_fd == fdp) {
		for (fdc = &td->td_fdcache[0];
		     fdc < &td->td_fdcache[NFDCACHE]; ++fdc) {
			if (fdc->fd == fd && fdc->fp == fp)
				return 0;
		}
	}

	spin_lock_shared(&fdp->fd_spin);
	if ((unsigned)fd >= fdp->fd_nfiles || fp != fdp->fd_files[fd].fp)
		error = EBADF;
	else
		error = 0;
	spin_unlock_shared(&fdp->fd_spin);
	return (error);
}

/*
 * Associate a file pointer with a previously reserved file descriptor.
 * This function always succeeds.
 *
 * If fp is NULL, the file descriptor is returned to the pool.
 *
 * Caller must hold an exclusive spinlock on fdp->fd_spin.
 */
static void
fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd)
{
	KKASSERT((unsigned)fd < fdp->fd_nfiles);
	KKASSERT(fdp->fd_files[fd].reserved != 0);
	if (fp) {
		fhold(fp);
		/* fclearcache(&fdp->fd_files[fd], NULL, 0); */
		fdp->fd_files[fd].fp = fp;
		fdp->fd_files[fd].reserved = 0;
	} else {
		fdp->fd_files[fd].reserved = 0;
		fdreserve_locked(fdp, fd, -1);
		fdfixup_locked(fdp, fd);
	}
}
/*
 * Spinlock-acquiring wrapper for fsetfd_locked().
 */
void
fsetfd(struct filedesc *fdp, struct file *fp, int fd)
{
	spin_lock(&fdp->fd_spin);
	fsetfd_locked(fdp, fp, fd);
	spin_unlock(&fdp->fd_spin);
}

/*
 * Caller must hold an exclusive spinlock on fdp->fd_spin.
 */
static
struct file *
funsetfd_locked(struct filedesc *fdp, int fd)
{
	struct file *fp;

	if ((unsigned)fd >= fdp->fd_nfiles)
		return (NULL);
	if ((fp = fdp->fd_files[fd].fp) == NULL)
		return (NULL);
	++fdp->fd_closedcounter;
	fclearcache(&fdp->fd_files[fd], NULL, 0);
	fdp->fd_files[fd].fp = NULL;
	fdp->fd_files[fd].fileflags = 0;
	++fdp->fd_closedcounter;

	fdreserve_locked(fdp, fd, -1);
	fdfixup_locked(fdp, fd);

	return(fp);
}

/*
 * WARNING: May not be called before initial fsetfd().
 */
int
fgetfdflags(struct filedesc *fdp, int fd, int *flagsp)
{
	int error;

	spin_lock_shared(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		error = EBADF;
	} else if (fdp->fd_files[fd].fp == NULL) {
		error = EBADF;
	} else {
		*flagsp = fdp->fd_files[fd].fileflags;
		error = 0;
	}
	spin_unlock_shared(&fdp->fd_spin);

	return (error);
}

/*
 * WARNING: May not be called before initial fsetfd().
 */
int
fsetfdflags(struct filedesc *fdp, int fd, int add_flags)
{
	int error;

	spin_lock(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		error = EBADF;
	} else if (fdp->fd_files[fd].fp == NULL) {
		error = EBADF;
	} else {
		fdp->fd_files[fd].fileflags |= add_flags;
		error = 0;
	}
	spin_unlock(&fdp->fd_spin);

	return (error);
}

/*
 * WARNING: May not be called before initial fsetfd().
 */
int
fclrfdflags(struct filedesc *fdp, int fd, int rem_flags)
{
	int error;

	spin_lock(&fdp->fd_spin);
	if (((u_int)fd) >= fdp->fd_nfiles) {
		error = EBADF;
	} else if (fdp->fd_files[fd].fp == NULL) {
		error = EBADF;
	} else {
		fdp->fd_files[fd].fileflags &= ~rem_flags;
		error = 0;
	}
	spin_unlock(&fdp->fd_spin);

	return (error);
}

/*
 * Set/Change/Clear the creds for a fp and synchronize the uidinfo.
 */
void
fsetcred(struct file *fp, struct ucred *ncr)
{
	struct ucred *ocr;
	struct uidinfo *uip;
	struct uidcount *pup;
	int cpu = mycpuid;
	int count;

	ocr = fp->f_cred;
	if (ocr == NULL || ncr == NULL || ocr->cr_uidinfo != ncr->cr_uidinfo) {
		if (ocr) {
			uip = ocr->cr_uidinfo;
			pup = &uip->ui_pcpu[cpu];
			atomic_add_int(&pup->pu_openfiles, -1);
			if (pup->pu_openfiles < -PUP_LIMIT ||
			    pup->pu_openfiles > PUP_LIMIT) {
				count = atomic_swap_int(&pup->pu_openfiles, 0);
				atomic_add_int(&uip->ui_openfiles, count);
			}
		}
		if (ncr) {
			uip = ncr->cr_uidinfo;
			pup = &uip->ui_pcpu[cpu];
			atomic_add_int(&pup->pu_openfiles, 1);
			if (pup->pu_openfiles < -PUP_LIMIT ||
			    pup->pu_openfiles > PUP_LIMIT) {
				count = atomic_swap_int(&pup->pu_openfiles, 0);
				atomic_add_int(&uip->ui_openfiles, count);
			}
		}
	}
	if (ncr)
		crhold(ncr);
	fp->f_cred = ncr;
	if (ocr)
		crfree(ocr);
}
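
/*
 * Example (sketch, not in the original file): fsetcred() keeps the
 * per-uid open-file count in per-cpu buckets and only folds a bucket
 * into the global ui_openfiles once it drifts past +/-PUP_LIMIT.  The
 * same batching idiom works for any per-cpu delta counter:
 *
 *	atomic_add_int(&pup->pu_openfiles, delta);
 *	if (pup->pu_openfiles < -PUP_LIMIT ||
 *	    pup->pu_openfiles > PUP_LIMIT) {
 *		int count = atomic_swap_int(&pup->pu_openfiles, 0);
 *		atomic_add_int(&uip->ui_openfiles, count);
 *	}
 *
 * This keeps the hot path on a cpu-local cache line; ui_openfiles is
 * only approximately current, which fdalloc_locked() compensates for
 * by forcing a full rollup before returning ENFILE.
 */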
/*
 * Free a file structure.
 */
static void
ffree(struct file *fp)
{
	KASSERT((fp->f_count == 0), ("ffree: f_count not 0!"));
	fsetcred(fp, NULL);
	if (fp->f_nchandle.ncp)
		cache_drop(&fp->f_nchandle);
	objcache_put(file_objcache, fp);
}

/*
 * Called from init_main to initialize filedesc0 for proc0.
 */
void
fdinit_bootstrap(struct proc *p0, struct filedesc *fdp0, int cmask)
{
	p0->p_fd = fdp0;
	p0->p_fdtol = NULL;
	fdp0->fd_refcnt = 1;
	fdp0->fd_cmask = cmask;
	fdp0->fd_files = fdp0->fd_builtin_files;
	fdp0->fd_nfiles = NDFILE;
	fdp0->fd_lastfile = -1;
	spin_init(&fdp0->fd_spin, "fdinitbootstrap");
}

/*
 * Build a new filedesc structure.
 */
struct filedesc *
fdinit(struct proc *p)
{
	struct filedesc *newfdp;
	struct filedesc *fdp = p->p_fd;

	newfdp = kmalloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK|M_ZERO);
	spin_lock(&fdp->fd_spin);
	if (fdp->fd_cdir) {
		newfdp->fd_cdir = fdp->fd_cdir;
		vref(newfdp->fd_cdir);
		cache_copy(&fdp->fd_ncdir, &newfdp->fd_ncdir);
	}

	/*
	 * rdir may not be set in e.g. proc0 or anything vm_fork'd off of
	 * proc0, but should unconditionally exist in other processes.
	 */
	if (fdp->fd_rdir) {
		newfdp->fd_rdir = fdp->fd_rdir;
		vref(newfdp->fd_rdir);
		cache_copy(&fdp->fd_nrdir, &newfdp->fd_nrdir);
	}
	if (fdp->fd_jdir) {
		newfdp->fd_jdir = fdp->fd_jdir;
		vref(newfdp->fd_jdir);
		cache_copy(&fdp->fd_njdir, &newfdp->fd_njdir);
	}
	spin_unlock(&fdp->fd_spin);

	/* Create the file descriptor table. */
	newfdp->fd_refcnt = 1;
	newfdp->fd_cmask = cmask;
	newfdp->fd_files = newfdp->fd_builtin_files;
	newfdp->fd_nfiles = NDFILE;
	newfdp->fd_lastfile = -1;
	spin_init(&newfdp->fd_spin, "fdinit");

	return (newfdp);
}

/*
 * Share a filedesc structure.
 */
struct filedesc *
fdshare(struct proc *p)
{
	struct filedesc *fdp;

	fdp = p->p_fd;
	spin_lock(&fdp->fd_spin);
	fdp->fd_refcnt++;
	spin_unlock(&fdp->fd_spin);
	return (fdp);
}
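
/*
 * Lifecycle sketch (illustrative only; the share-vs-copy policy lives
 * in the fork path, outside this file):
 *
 *	fdp = fdshare(p);		parent and child see one table,
 *					fd_refcnt is bumped
 *	error = fdcopy(p, &newfdp);	child gets a private snapshot
 *					with its own file references
 *
 * In both cases the table is eventually released through fdfree(),
 * which only tears the table down on the last reference.
 */
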
/*
 * Copy a filedesc structure.
 */
int
fdcopy(struct proc *p, struct filedesc **fpp)
{
	struct filedesc *fdp = p->p_fd;
	struct filedesc *newfdp;
	struct fdnode *fdnode;
	int i;
	int ni;

	/*
	 * Certain daemons might not have file descriptors.
	 */
	if (fdp == NULL)
		return (0);

	/*
	 * Allocate the new filedesc and fd_files[] array.  This can race
	 * with operations by other threads on the fdp so we have to be
	 * careful.
	 */
	newfdp = kmalloc(sizeof(struct filedesc),
			 M_FILEDESC, M_WAITOK | M_ZERO | M_NULLOK);
	if (newfdp == NULL) {
		*fpp = NULL;
		return (-1);
	}
again:
	spin_lock(&fdp->fd_spin);
	if (fdp->fd_lastfile < NDFILE) {
		newfdp->fd_files = newfdp->fd_builtin_files;
		i = NDFILE;
	} else {
		/*
		 * We have to allocate (2^N-1) entries for our in-place
		 * binary tree.  Allow the table to shrink.
		 */
		i = fdp->fd_nfiles;
		ni = (i - 1) / 2;
		while (ni > fdp->fd_lastfile && ni > NDFILE) {
			i = ni;
			ni = (i - 1) / 2;
		}
		spin_unlock(&fdp->fd_spin);
		newfdp->fd_files = kmalloc(i * sizeof(struct fdnode),
					   M_FILEDESC, M_WAITOK | M_ZERO);

		/*
		 * Check for race, retry
		 */
		spin_lock(&fdp->fd_spin);
		if (i <= fdp->fd_lastfile) {
			spin_unlock(&fdp->fd_spin);
			kfree(newfdp->fd_files, M_FILEDESC);
			goto again;
		}
	}

	/*
	 * Dup the remaining fields.  vref() and cache_copy() can be
	 * safely called while holding the spinlock on fdp.
	 *
	 * The spinlock on fdp is still being held.
	 *
	 * NOTE: vref and cache_copy calls for the case where the vnode
	 * or cache entry already has at least one ref may be called
	 * while holding spin locks.
	 */
	if ((newfdp->fd_cdir = fdp->fd_cdir) != NULL) {
		vref(newfdp->fd_cdir);
		cache_copy(&fdp->fd_ncdir, &newfdp->fd_ncdir);
	}
	/*
	 * We must check for fd_rdir here, at least for now because
	 * the init process is created before we have access to the
	 * rootvnode to take a reference to it.
	 */
	if ((newfdp->fd_rdir = fdp->fd_rdir) != NULL) {
		vref(newfdp->fd_rdir);
		cache_copy(&fdp->fd_nrdir, &newfdp->fd_nrdir);
	}
	if ((newfdp->fd_jdir = fdp->fd_jdir) != NULL) {
		vref(newfdp->fd_jdir);
		cache_copy(&fdp->fd_njdir, &newfdp->fd_njdir);
	}
	newfdp->fd_refcnt = 1;
	newfdp->fd_nfiles = i;
	newfdp->fd_lastfile = fdp->fd_lastfile;
	newfdp->fd_freefile = fdp->fd_freefile;
	newfdp->fd_cmask = fdp->fd_cmask;
	spin_init(&newfdp->fd_spin, "fdcopy");

	/*
	 * Copy the descriptor table through (i).  This also copies the
	 * allocation state.  Then go through and ref the file pointers
	 * and clean up any KQ descriptors.
	 *
	 * kq descriptors cannot be copied.  Since we haven't ref'd the
	 * copied files yet we can ignore the return value from funsetfd().
	 *
	 * The spinlock on fdp is still being held.
	 *
	 * Be sure to clean out fdnode->tdcache, otherwise bad things will
	 * happen.
	 */
	bcopy(fdp->fd_files, newfdp->fd_files, i * sizeof(struct fdnode));
	for (i = 0; i < newfdp->fd_nfiles; ++i) {
		fdnode = &newfdp->fd_files[i];
		if (fdnode->reserved) {
			fdreserve_locked(newfdp, i, -1);
			fdnode->reserved = 0;
			fdfixup_locked(newfdp, i);
		} else if (fdnode->fp) {
			bzero(&fdnode->tdcache, sizeof(fdnode->tdcache));
			if (fdnode->fp->f_type == DTYPE_KQUEUE) {
				(void)funsetfd_locked(newfdp, i);
			} else {
				fhold(fdnode->fp);
			}
		}
	}
	spin_unlock(&fdp->fd_spin);
	*fpp = newfdp;
	return (0);
}
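
/*
 * Worked example of the sizing loop in fdcopy() (assuming NDFILE is 15,
 * its usual value here): with fd_nfiles = 255 and fd_lastfile = 20 the
 * candidate sizes walk 255 -> 127 -> 63 -> 31; the next candidate, 15,
 * is not greater than fd_lastfile, so the loop stops and 31 fdnodes are
 * allocated.  Every size is of the form 2^N-1, matching the in-place
 * binary tree layout of fd_files[].
 */
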
/*
 * Release a filedesc structure.
 *
 * NOT MPSAFE (MPSAFE for refs > 1, but the final cleanup code is not MPSAFE)
 */
void
fdfree(struct proc *p, struct filedesc *repl)
{
	struct filedesc *fdp;
	struct fdnode *fdnode;
	int i;
	struct filedesc_to_leader *fdtol;
	struct file *fp;
	struct vnode *vp;
	struct flock lf;

	/*
	 * Before destroying or replacing p->p_fd we must be sure to
	 * clean out the cache of the last thread, which should be
	 * curthread.
	 */
	fexitcache(curthread);

	/*
	 * Certain daemons might not have file descriptors.
	 */
	fdp = p->p_fd;
	if (fdp == NULL) {
		p->p_fd = repl;
		return;
	}

	/*
	 * Severe messing around to follow.
	 */
	spin_lock(&fdp->fd_spin);

	/* Check for special need to clear POSIX style locks */
	fdtol = p->p_fdtol;
	if (fdtol != NULL) {
		KASSERT(fdtol->fdl_refcount > 0,
			("filedesc_to_refcount botch: fdl_refcount=%d",
			 fdtol->fdl_refcount));
		if (fdtol->fdl_refcount == 1 && p->p_leader->p_advlock_flag) {
			for (i = 0; i <= fdp->fd_lastfile; ++i) {
				fdnode = &fdp->fd_files[i];
				if (fdnode->fp == NULL ||
				    fdnode->fp->f_type != DTYPE_VNODE) {
					continue;
				}
				fp = fdnode->fp;
				fhold(fp);
				spin_unlock(&fdp->fd_spin);

				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = (struct vnode *)fp->f_data;
				VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
					    F_UNLCK, &lf, F_POSIX);
				fdrop(fp);
				spin_lock(&fdp->fd_spin);
			}
		}
retry:
		if (fdtol->fdl_refcount == 1) {
			if (fdp->fd_holdleaderscount > 0 &&
			    p->p_leader->p_advlock_flag) {
				/*
				 * close() or do_dup() has cleared a reference
				 * in a shared file descriptor table.
				 */
				fdp->fd_holdleaderswakeup = 1;
				ssleep(&fdp->fd_holdleaderscount,
				       &fdp->fd_spin, 0, "fdlhold", 0);
				goto retry;
			}
			if (fdtol->fdl_holdcount > 0) {
				/*
				 * Ensure that fdtol->fdl_leader
				 * remains valid in closef().
				 */
				fdtol->fdl_wakeup = 1;
				ssleep(fdtol, &fdp->fd_spin, 0, "fdlhold", 0);
				goto retry;
			}
		}
		fdtol->fdl_refcount--;
		if (fdtol->fdl_refcount == 0 &&
		    fdtol->fdl_holdcount == 0) {
			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
		} else {
			fdtol = NULL;
		}
		p->p_fdtol = NULL;
		if (fdtol != NULL) {
			spin_unlock(&fdp->fd_spin);
			kfree(fdtol, M_FILEDESC_TO_LEADER);
			spin_lock(&fdp->fd_spin);
		}
	}
	if (--fdp->fd_refcnt > 0) {
		spin_unlock(&fdp->fd_spin);
		spin_lock(&p->p_spin);
		p->p_fd = repl;
		spin_unlock(&p->p_spin);
		return;
	}

	/*
	 * Even though we hold the last reference to the structure,
	 * allproc scans may still reference it.  Maintain proper locks
	 * until we can replace p->p_fd.
	 *
	 * Also note that kqueue's closef still needs to reference the
	 * fdp via p->p_fd, so we have to close the descriptors before
	 * we replace p->p_fd.
	 */
	for (i = 0; i <= fdp->fd_lastfile; ++i) {
		if (fdp->fd_files[i].fp) {
			fp = funsetfd_locked(fdp, i);
			if (fp) {
				spin_unlock(&fdp->fd_spin);
				if (SLIST_FIRST(&fp->f_klist))
					knote_fdclose(fp, fdp, i);
				closef(fp, p);
				spin_lock(&fdp->fd_spin);
			}
		}
	}
	spin_unlock(&fdp->fd_spin);

	/*
	 * Interlock against allproc scan operations (typically frevoke).
	 */
	spin_lock(&p->p_spin);
	p->p_fd = repl;
	spin_unlock(&p->p_spin);

	/*
	 * Wait for any softrefs to go away.  This race rarely occurs so
	 * we can use a non-critical-path style poll/sleep loop.  The
	 * race only occurs against allproc scans.
	 *
	 * No new softrefs can occur with the fdp disconnected from the
	 * process.
	 */
	if (fdp->fd_softrefs) {
		kprintf("pid %d: Warning, fdp race avoided\n", p->p_pid);
		while (fdp->fd_softrefs)
			tsleep(&fdp->fd_softrefs, 0, "fdsoft", 1);
	}

	if (fdp->fd_files != fdp->fd_builtin_files)
		kfree(fdp->fd_files, M_FILEDESC);
	if (fdp->fd_cdir) {
		cache_drop(&fdp->fd_ncdir);
		vrele(fdp->fd_cdir);
	}
	if (fdp->fd_rdir) {
		cache_drop(&fdp->fd_nrdir);
		vrele(fdp->fd_rdir);
	}
	if (fdp->fd_jdir) {
		cache_drop(&fdp->fd_njdir);
		vrele(fdp->fd_jdir);
	}
	kfree(fdp, M_FILEDESC);
}
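
/*
 * Sketch of the wakeup side of the fd_holdleaderscount sleep in
 * fdfree() above.  The close()/dup() paths (not part of this excerpt)
 * are expected to pair with it roughly as follows:
 *
 *	if (--fdp->fd_holdleaderscount == 0 &&
 *	    fdp->fd_holdleaderswakeup) {
 *		fdp->fd_holdleaderswakeup = 0;
 *		wakeup(&fdp->fd_holdleaderscount);
 *	}
 *
 * The fdl_holdcount/fdl_wakeup pair is handled the same way by
 * closef() further below.
 */
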
/*
 * Retrieve and reference the file pointer associated with a descriptor.
 *
 * td must be the current thread.
 */
struct file *
holdfp(thread_t td, int fd, int flag)
{
	struct file *fp;

	fp = _holdfp_cache(td, fd);
	if (fp) {
		if ((fp->f_flag & flag) == 0 && flag != -1) {
			fdrop(fp);
			fp = NULL;
		}
	}
	return fp;
}

/*
 * holdsock() - load the struct file pointer associated
 * with a socket into *fpp.  If an error occurs, non-zero
 * will be returned and *fpp will be set to NULL.
 *
 * td must be the current thread.
 */
int
holdsock(thread_t td, int fd, struct file **fpp)
{
	struct file *fp;
	int error;

	/*
	 * Lockless shortcut
	 */
	fp = _holdfp_cache(td, fd);
	if (fp) {
		if (fp->f_type != DTYPE_SOCKET) {
			fdrop(fp);
			fp = NULL;
			error = ENOTSOCK;
		} else {
			error = 0;
		}
	} else {
		error = EBADF;
	}
	*fpp = fp;

	return (error);
}

/*
 * Convert a user file descriptor to a held file pointer.
 *
 * td must be the current thread.
 */
int
holdvnode(thread_t td, int fd, struct file **fpp)
{
	struct file *fp;
	int error;

	fp = _holdfp_cache(td, fd);
	if (fp) {
		if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
			fdrop(fp);
			fp = NULL;
			error = EINVAL;
		} else {
			error = 0;
		}
	} else {
		error = EBADF;
	}
	*fpp = fp;

	return (error);
}
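
/*
 * Illustrative pairing for the hold*() helpers above; sys_flock()
 * further below uses exactly this shape ("do_something" is a
 * hypothetical consumer):
 *
 *	if ((fp = holdfp(td, fd, -1)) == NULL)
 *		return (EBADF);
 *	error = do_something(fp);
 *	fdrop(fp);
 *	return (error);
 *
 * A flag argument other than -1 additionally requires the requested
 * access bit (e.g. FREAD) to be present in fp->f_flag.
 */
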
/*
 * For setugid programs we don't want people to use that setugidness
 * to generate error messages which write to a file that would
 * otherwise be off-limits to the process.
 *
 * This is a gross hack to plug the hole.  A better solution would
 * involve a special vop or other form of generalized access control
 * mechanism.  We go ahead and just reject all procfs filesystem
 * accesses as dangerous.
 *
 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
 * sufficient.  We also don't check for setugidness since we know we
 * are.
 */
static int
is_unsafe(struct file *fp)
{
	if (fp->f_type == DTYPE_VNODE &&
	    ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS)
		return (1);
	return (0);
}

/*
 * Make this setugid thing safe, if at all possible.
 *
 * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
 */
void
setugidsafety(struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	int i;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return;

	/*
	 * note: fdp->fd_files may be reallocated out from under us while
	 * we are blocked in a close.  Be careful!
	 */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		if (i > 2)
			break;
		if (fdp->fd_files[i].fp && is_unsafe(fdp->fd_files[i].fp)) {
			struct file *fp;

			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 */
			if ((fp = funsetfd_locked(fdp, i)) != NULL) {
				knote_fdclose(fp, fdp, i);
				closef(fp, p);
			}
		}
	}
}

/*
 * Close all CLOEXEC files on exec.
 *
 * Only a single thread remains for the current process.
 *
 * NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
 */
void
fdcloseexec(struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	int i;

	/* Certain daemons might not have file descriptors. */
	if (fdp == NULL)
		return;

	/*
	 * We cannot cache fd_files since operations may block and rip
	 * them out from under us.
	 */
	for (i = 0; i <= fdp->fd_lastfile; i++) {
		if (fdp->fd_files[i].fp != NULL &&
		    (fdp->fd_files[i].fileflags & UF_EXCLOSE)) {
			struct file *fp;

			/*
			 * NULL-out descriptor prior to close to avoid
			 * a race while close blocks.
			 *
			 * (funsetfd*() also clears the fd cache)
			 */
			if ((fp = funsetfd_locked(fdp, i)) != NULL) {
				knote_fdclose(fp, fdp, i);
				closef(fp, p);
			}
		}
	}
}

/*
 * It is unsafe for set[ug]id processes to be started with file
 * descriptors 0..2 closed, as these descriptors are given implicit
 * significance in the Standard C library.  fdcheckstd() will create a
 * descriptor referencing /dev/null for each of stdin, stdout, and
 * stderr that is not already open.
 *
 * NOT MPSAFE - calls falloc, vn_open, etc
 */
int
fdcheckstd(struct lwp *lp)
{
	struct nlookupdata nd;
	struct filedesc *fdp;
	struct file *fp;
	int retval;
	int i, error, flags, devnull;

	fdp = lp->lwp_proc->p_fd;
	if (fdp == NULL)
		return (0);
	devnull = -1;
	error = 0;
	for (i = 0; i < 3; i++) {
		if (fdp->fd_files[i].fp != NULL)
			continue;
		if (devnull < 0) {
			if ((error = falloc(lp, &fp, &devnull)) != 0)
				break;

			error = nlookup_init(&nd, "/dev/null", UIO_SYSSPACE,
					     NLC_FOLLOW|NLC_LOCKVP);
			flags = FREAD | FWRITE;
			if (error == 0)
				error = vn_open(&nd, fp, flags, 0);
			if (error == 0)
				fsetfd(fdp, fp, devnull);
			else
				fsetfd(fdp, NULL, devnull);
			fdrop(fp);
			nlookup_done(&nd);
			if (error)
				break;
			KKASSERT(i == devnull);
		} else {
			error = kern_dup(DUP_FIXED, devnull, i, &retval);
			if (error != 0)
				break;
		}
	}
	return (error);
}
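
/*
 * For reference, a sketch of how a descriptor becomes subject to
 * fdcloseexec() above: close-on-exec is just the UF_EXCLOSE bit in the
 * per-descriptor fileflags, so (modulo the fcntl() plumbing, which is
 * not part of this excerpt) marking one amounts to
 *
 *	error = fsetfdflags(fdp, fd, UF_EXCLOSE);
 *
 * and clearing it to
 *
 *	error = fclrfdflags(fdp, fd, UF_EXCLOSE);
 */
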
/*
 * Internal form of close.
 * Decrement reference count on file structure.
 * Note: p may be NULL when closing a file that was
 * being passed in a message.
 *
 * MPALMOSTSAFE - acquires mplock for VOP operations
 */
int
closef(struct file *fp, struct proc *p)
{
	struct vnode *vp;
	struct flock lf;
	struct filedesc_to_leader *fdtol;

	if (fp == NULL)
		return (0);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (p != NULL && fp->f_type == DTYPE_VNODE &&
	    (((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)) {
		if (p->p_leader->p_advlock_flag) {
			lf.l_whence = SEEK_SET;
			lf.l_start = 0;
			lf.l_len = 0;
			lf.l_type = F_UNLCK;
			vp = (struct vnode *)fp->f_data;
			VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
				    &lf, F_POSIX);
		}
		fdtol = p->p_fdtol;
		if (fdtol != NULL) {
			lwkt_gettoken(&p->p_token);

			/*
			 * Handle special case where file descriptor table
			 * is shared between multiple process leaders.
			 */
			for (fdtol = fdtol->fdl_next;
			     fdtol != p->p_fdtol;
			     fdtol = fdtol->fdl_next) {
				if (fdtol->fdl_leader->p_advlock_flag == 0)
					continue;
				fdtol->fdl_holdcount++;
				lf.l_whence = SEEK_SET;
				lf.l_start = 0;
				lf.l_len = 0;
				lf.l_type = F_UNLCK;
				vp = (struct vnode *)fp->f_data;
				VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader,
					    F_UNLCK, &lf, F_POSIX);
				fdtol->fdl_holdcount--;
				if (fdtol->fdl_holdcount == 0 &&
				    fdtol->fdl_wakeup != 0) {
					fdtol->fdl_wakeup = 0;
					wakeup(fdtol);
				}
			}
			lwkt_reltoken(&p->p_token);
		}
	}
	return (fdrop(fp));
}

/*
 * fhold() can only be called if f_count is already at least 1 (i.e. the
 * caller of fhold() already has a reference to the file pointer in some
 * manner or other).
 *
 * Atomic ops are used for incrementing and decrementing f_count before
 * the 1->0 transition.  The f_count 1->0 transition is special, see the
 * comment in fdrop().
 */
void
fhold(struct file *fp)
{
	/* 0->1 transition will never work */
	KASSERT(fp->f_count > 0, ("fhold: invalid f_count %d", fp->f_count));
	atomic_add_int(&fp->f_count, 1);
}

/*
 * fdrop() - drop a reference to a descriptor
 */
int
fdrop(struct file *fp)
{
	struct flock lf;
	struct vnode *vp;
	int error, do_free = 0;

	/*
	 * NOTE:
	 * A simple atomic_fetchadd_int(f_count, -1) here would cause a
	 * use-after-free or double free (due to the f_count 0->1
	 * transition), if fhold() is called on fps found through
	 * filehead iteration.
	 */
	for (;;) {
		int count = fp->f_count;

		cpu_ccfence();
		KASSERT(count > 0, ("fdrop: invalid f_count %d", count));
		if (count == 1) {
			struct filelist_head *head = fp2filelist(fp);

			/*
			 * About to drop the last reference: hold the
			 * filehead spin lock while dropping it, so that
			 * no one can see this fp through filehead
			 * anymore, let alone fhold() this fp.
			 */
			spin_lock(&head->spin);
			if (atomic_cmpset_int(&fp->f_count, count, 0)) {
				LIST_REMOVE(fp, f_list);
				spin_unlock(&head->spin);
				atomic_subtract_int(&nfiles, 1);
				do_free = 1;	/* free this fp */
				break;
			}
			spin_unlock(&head->spin);
			/* retry */
		} else if (atomic_cmpset_int(&fp->f_count, count, count - 1)) {
			break;
		}
		/* retry */
	}
	if (!do_free)
		return (0);

	KKASSERT(SLIST_FIRST(&fp->f_klist) == NULL);

	/*
	 * The last reference has gone away, we own the fp structure free
	 * and clear.
	 */
	if (fp->f_count < 0)
		panic("fdrop: count < 0");
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE &&
	    (((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = (struct vnode *)fp->f_data;
		VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
	}
	if (fp->f_ops != &badfileops)
		error = fo_close(fp);
	else
		error = 0;
	ffree(fp);
	return (error);
}
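
/*
 * Illustrative note on the 1->0 transition above (not additional kernel
 * code): a scanner walking a filelist bucket can safely do
 *
 *	spin_lock(&head->spin);
 *	LIST_FOREACH(fp, &head->list, f_list)
 *		fhold(fp);		fp is still on the list here
 *	spin_unlock(&head->spin);
 *
 * only because fdrop() wins the cmpset to zero and unlinks fp while
 * holding the same bucket spinlock; a bare atomic decrement would let
 * such a scan resurrect a file already headed for ffree().
 */
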
/*
 * Apply an advisory lock on a file descriptor.
 *
 * Just attempt to get a record lock of the requested type on
 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
 *
 * MPALMOSTSAFE
 */
int
sys_flock(struct flock_args *uap)
{
	thread_t td = curthread;
	struct file *fp;
	struct vnode *vp;
	struct flock lf;
	int error;

	if ((fp = holdfp(td, uap->fd, -1)) == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_VNODE) {
		error = EOPNOTSUPP;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (uap->how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		atomic_clear_int(&fp->f_flag, FHASLOCK); /* race ok */
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
		goto done;
	}
	if (uap->how & LOCK_EX) {
		lf.l_type = F_WRLCK;
	} else if (uap->how & LOCK_SH) {
		lf.l_type = F_RDLCK;
	} else {
		error = EBADF;
		goto done;
	}
	if (uap->how & LOCK_NB)
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 0);
	else
		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_WAIT);
	atomic_set_int(&fp->f_flag, FHASLOCK);	/* race ok */
done:
	fdrop(fp);
	return (error);
}

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 */
static int
fdopen(struct dev_open_args *ap)
{
	thread_t td = curthread;

	KKASSERT(td->td_lwp != NULL);

	/*
	 * XXX Kludge: set curlwp->lwp_dupfd to contain the value of the
	 * file descriptor being sought for duplication.  The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen below.  Other callers of vn_open or
	 * VOP_OPEN will simply report the error.
	 */
	td->td_lwp->lwp_dupfd = minor(ap->a_head.a_dev);
	return (ENODEV);
}
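
/*
 * Sketch of the overall /dev/fd open flow (simplified; the vn_open()
 * plumbing lives outside this excerpt):
 *
 *	lwp_dupfd = minor(dev);		recorded by fdopen() above
 *	error = vn_open(...);		fails with ENODEV as arranged
 *	if (error == ENODEV || error == ENXIO)
 *		error = dupfdopen(td, dfd, lwp_dupfd, mode, error);
 *
 * so open("/dev/fd/N", ...) effectively behaves like a dup() of
 * descriptor N.
 */
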
/*
 * The caller has reserved the file descriptor dfd for us.  On success we
 * must fsetfd() it.  On failure the caller will clean it up.
 */
int
dupfdopen(thread_t td, int dfd, int sfd, int mode, int error)
{
	struct filedesc *fdp;
	struct file *wfp;
	struct file *xfp;
	int werror;

	if ((wfp = holdfp(td, sfd, -1)) == NULL)
		return (EBADF);

	/*
	 * Close a revoke/dup race.  Duping a descriptor marked as revoked
	 * will dup a dummy descriptor instead of the real one.
	 */
	if (wfp->f_flag & FREVOKED) {
		kprintf("Warning: attempt to dup() a revoked descriptor\n");
		fdrop(wfp);
		wfp = NULL;
		werror = falloc(NULL, &wfp, NULL);
		if (werror)
			return (werror);
	}

	fdp = td->td_proc->p_fd;

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup sfd to file descriptor dfd and return.
	 *
	 * For ENXIO steal away the file structure from sfd and store it
	 * in dfd.  sfd is effectively closed by this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
			error = EACCES;
			break;
		}
		spin_lock(&fdp->fd_spin);
		fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags;
		fsetfd_locked(fdp, wfp, dfd);
		spin_unlock(&fdp->fd_spin);
		error = 0;
		break;
	case ENXIO:
		/*
		 * Steal away the file pointer from sfd and stuff it into
		 * dfd, then clear sfd's slot.  The spinlock is held, so
		 * the locked form of fsetfd() must be used.
		 */
		spin_lock(&fdp->fd_spin);
		fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags;
		fsetfd_locked(fdp, wfp, dfd);
		if ((xfp = funsetfd_locked(fdp, sfd)) != NULL) {
			spin_unlock(&fdp->fd_spin);
			fdrop(xfp);
		} else {
			spin_unlock(&fdp->fd_spin);
		}
		error = 0;
		break;
	default:
		break;
	}
	fdrop(wfp);
	return (error);
}

/*
 * NOT MPSAFE - I think these refer to a common file descriptor table
 * and we need to spinlock that to link fdtol in.
 */
struct filedesc_to_leader *
filedesc_to_leader_alloc(struct filedesc_to_leader *old,
			 struct proc *leader)
{
	struct filedesc_to_leader *fdtol;

	fdtol = kmalloc(sizeof(struct filedesc_to_leader),
			M_FILEDESC_TO_LEADER, M_WAITOK | M_ZERO);
	fdtol->fdl_refcount = 1;
	fdtol->fdl_holdcount = 0;
	fdtol->fdl_wakeup = 0;
	fdtol->fdl_leader = leader;
	if (old != NULL) {
		fdtol->fdl_next = old->fdl_next;
		fdtol->fdl_prev = old;
		old->fdl_next = fdtol;
		fdtol->fdl_next->fdl_prev = fdtol;
	} else {
		fdtol->fdl_next = fdtol;
		fdtol->fdl_prev = fdtol;
	}
	return fdtol;
}

/*
 * Scan all file pointers in the system.  The callback is made with
 * the per-bucket list spinlock held exclusively; a negative return
 * value terminates the scan of the current bucket.
 */
void
allfiles_scan_exclusive(int (*callback)(struct file *, void *), void *data)
{
	int i;

	for (i = 0; i < NFILELIST_HEADS; ++i) {
		struct filelist_head *head = &filelist_heads[i];
		struct file *fp;

		spin_lock(&head->spin);
		LIST_FOREACH(fp, &head->list, f_list) {
			int res;

			res = callback(fp, data);
			if (res < 0)
				break;
		}
		spin_unlock(&head->spin);
	}
}
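
/*
 * Minimal callback sketch for allfiles_scan_exclusive() (hypothetical
 * example; a negative return aborts the current bucket only, the
 * remaining buckets are still visited):
 *
 *	static int
 *	count_sockets_callback(struct file *fp, void *data)
 *	{
 *		if (fp->f_type == DTYPE_SOCKET)
 *			++*(int *)data;
 *		return (0);
 *	}
 *
 *	int count = 0;
 *	allfiles_scan_exclusive(count_sockets_callback, &count);
 */
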
/*
 * Get file structures.
 *
 * NOT MPSAFE - process list scan, SYSCTL_OUT (probably not mpsafe)
 */

struct sysctl_kern_file_info {
	int count;
	int error;
	struct sysctl_req *req;
};

static int sysctl_kern_file_callback(struct proc *p, void *data);

static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_kern_file_info info;

	/*
	 * Note: because the number of file descriptors is calculated
	 * in different ways for sizing vs returning the data,
	 * there is information leakage from the first loop.  However,
	 * it is of a similar order of magnitude to the leakage from
	 * global system statistics such as kern.openfiles.
	 *
	 * When just doing a count, note that we cannot just count
	 * the elements and add f_count via the filehead list because
	 * threaded processes share their descriptor table and f_count
	 * might still be '1' in that case.
	 *
	 * Since the SYSCTL op can block, we must hold the process to
	 * prevent it being ripped out from under us either in the
	 * file descriptor loop or in the allproc scan.  The process
	 * may be in varying states of disrepair.  If the process is
	 * in SZOMB we may have caught it just as it is being removed
	 * from the allproc list; we must skip it in that case to
	 * maintain an unbroken chain through the allproc list.
	 */
	info.count = 0;
	info.error = 0;
	info.req = req;
	allproc_scan(sysctl_kern_file_callback, &info, 0);

	/*
	 * When just calculating the size, overestimate a bit to try to
	 * prevent system activity from causing the buffer-fill call
	 * to fail later on.
	 */
	if (req->oldptr == NULL) {
		info.count = (info.count + 16) + (info.count / 10);
		info.error = SYSCTL_OUT(req, NULL,
					info.count * sizeof(struct kinfo_file));
	}
	return (info.error);
}
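
/*
 * Userland consumes this handler through sysctl(3), e.g. (illustrative
 * two-pass pattern):
 *
 *	size_t len;
 *	sysctlbyname("kern.file", NULL, &len, NULL, 0);	   size pass
 *	void *buf = malloc(len);
 *	sysctlbyname("kern.file", buf, &len, NULL, 0);	   fill pass
 *
 * The deliberate overestimate in the size pass above gives the fill
 * pass headroom against descriptor churn between the two calls.
 */
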
static int
sysctl_kern_file_callback(struct proc *p, void *data)
{
	struct sysctl_kern_file_info *info = data;
	struct kinfo_file kf;
	struct filedesc *fdp;
	struct file *fp;
	uid_t uid;
	int n;

	if (p->p_stat == SIDL || p->p_stat == SZOMB)
		return (0);
	if (PRISON_CHECK(info->req->td->td_ucred, p->p_ucred) == 0)
		return (0);

	/*
	 * Softref the fdp to prevent it from being destroyed
	 */
	spin_lock(&p->p_spin);
	if ((fdp = p->p_fd) == NULL) {
		spin_unlock(&p->p_spin);
		return (0);
	}
	atomic_add_int(&fdp->fd_softrefs, 1);
	spin_unlock(&p->p_spin);

	/*
	 * The fdp's own spinlock prevents the contents from being
	 * modified.
	 */
	spin_lock_shared(&fdp->fd_spin);
	for (n = 0; n < fdp->fd_nfiles; ++n) {
		if ((fp = fdp->fd_files[n].fp) == NULL)
			continue;
		if (info->req->oldptr == NULL) {
			++info->count;
		} else {
			uid = p->p_ucred ? p->p_ucred->cr_uid : -1;
			kcore_make_file(&kf, fp, p->p_pid, uid, n);
			spin_unlock_shared(&fdp->fd_spin);
			info->error = SYSCTL_OUT(info->req, &kf, sizeof(kf));
			spin_lock_shared(&fdp->fd_spin);
			if (info->error)
				break;
		}
	}
	spin_unlock_shared(&fdp->fd_spin);
	atomic_subtract_int(&fdp->fd_softrefs, 1);
	if (info->error)
		return (-1);
	return (0);
}

SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
	    0, 0, sysctl_kern_file, "S,file", "Entire file table");

SYSCTL_INT(_kern, OID_AUTO, minfilesperproc, CTLFLAG_RW,
	   &minfilesperproc, 0, "Minimum files allowed open per process");
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
	   &maxfilesperproc, 0, "Maximum files allowed open per process");
SYSCTL_INT(_kern, OID_AUTO, maxfilesperuser, CTLFLAG_RW,
	   &maxfilesperuser, 0, "Maximum files allowed open per user");

SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
	   &maxfiles, 0, "Maximum number of files");

SYSCTL_INT(_kern, OID_AUTO, maxfilesrootres, CTLFLAG_RW,
	   &maxfilesrootres, 0, "Descriptors reserved for root use");

SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
	   &nfiles, 0, "System-wide number of open files");

static void
fildesc_drvinit(void *unused)
{
	int fd;

	for (fd = 0; fd < NUMFDESC; fd++) {
		make_dev(&fildesc_ops, fd,
			 UID_BIN, GID_BIN, 0666, "fd/%d", fd);
	}

	make_dev(&fildesc_ops, 0, UID_ROOT, GID_WHEEL, 0666, "stdin");
	make_dev(&fildesc_ops, 1, UID_ROOT, GID_WHEEL, 0666, "stdout");
	make_dev(&fildesc_ops, 2, UID_ROOT, GID_WHEEL, 0666, "stderr");
}

struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_ioctl = badfo_ioctl,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_shutdown = badfo_shutdown
};

int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *cred,
		int flags)
{
	return (EBADF);
}

int
badfo_ioctl(struct file *fp, u_long com, caddr_t data,
	    struct ucred *cred, struct sysmsg *msgv)
{
	return (EBADF);
}
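
/*
 * badfileops backs any struct file whose methods are invalid, e.g. a
 * freshly falloc()'d file before its real ops are installed, or a
 * revoked one.  An illustrative consequence: a hypothetical caller doing
 *
 *	error = fo_read(fp, uio, cred, 0);
 *
 * against such a file simply gets EBADF from badfo_readwrite() instead
 * of jumping through stale function pointers.
 */
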
/*
 * Must return an error to prevent registration, typically
 * due to a revoked descriptor (file_filtops assigned).
 */
int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
	return (EOPNOTSUPP);
}

int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *cred)
{
	return (EBADF);
}

int
badfo_close(struct file *fp)
{
	return (EBADF);
}

int
badfo_shutdown(struct file *fp, int how)
{
	return (EBADF);
}

int
nofo_shutdown(struct file *fp, int how)
{
	return (EOPNOTSUPP);
}

SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR,
	fildesc_drvinit, NULL);

static void
filelist_heads_init(void *arg __unused)
{
	int i;

	for (i = 0; i < NFILELIST_HEADS; ++i) {
		struct filelist_head *head = &filelist_heads[i];

		spin_init(&head->spin, "filehead_spin");
		LIST_INIT(&head->list);
	}
}

SYSINIT(filelistheads, SI_BOOT1_LOCK, SI_ORDER_ANY,
	filelist_heads_init, NULL);

static void
file_objcache_init(void *dummy __unused)
{
	file_objcache = objcache_create("file", maxfiles, maxfiles / 8,
					NULL, NULL, NULL, /* no ctor/dtor */
					objcache_malloc_alloc,
					objcache_malloc_free,
					&file_malloc_args);
}
SYSINIT(fpobjcache, SI_BOOT2_POST_SMP, SI_ORDER_ANY, file_objcache_init, NULL);