/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_init_counters(struct vnode *);
static void	v_incr_devcount(struct vnode *);
static void	v_decr_devcount(struct vnode *);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	vnlru_return_batches(struct vfsops *mnt_op);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
		    daddr_t startlbn, daddr_t endlbn);

/*
 * These fences are intended for cases where some synchronization is
 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
 * and v_usecount) updates.  Access to v_iflags is generally synchronized
 * by the interlock, but we have some internal assertions that check vnode
 * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
 * for now.
 */
#ifdef INVARIANTS
#define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
#define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
#else
#define	VNODE_REFCOUNT_FENCE_ACQ()
#define	VNODE_REFCOUNT_FENCE_REL()
#endif

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

static u_long mnt_free_list_batch = 128;
SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
    &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
 */
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
    &freevnodes, 0, "Number of \"free\" vnodes");

static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
    "Number of vnodes recycled to meet vnode cache targets");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW | CTLFLAG_STATS,
    &reassignbufcalls, 0, "Number of calls to reassignbuf");

static counter_u64_t free_owe_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
    "Number of times free vnodes kept on active list due to VFS "
    "owing inactivation");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx __exclusive_cache_line vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata updates are
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so they are delayed
 * only about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define	SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define	SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/* Target for maximum number of vnodes. */
int desiredvnodes;
static int gapvnodes;		/* gap between wanted and desired */
static int vhiwat;		/* enough extras after expansion */
static int vlowat;		/* minimal extras before expansion */
static int vstir;		/* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */

static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
{
	int error, old_desiredvnodes;

	old_desiredvnodes = desiredvnodes;
	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
		return (error);
	if (old_desiredvnodes != desiredvnodes) {
		wantfreevnodes = desiredvnodes / 4;
		/* XXX locking seems to be incomplete.
		 */
		vfs_hash_changesize(desiredvnodes);
		cache_changesize(desiredvnodes);
	}
	return (0);
}

SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	buf[req->newlen] = '\0';

	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the
		 * caller know that the sysctl had no effect.  Return EAGAIN
		 * because a subsequent call will likely succeed (since
		 * namei will create a new vnode if necessary).
		 */
		error = EAGAIN;
		goto putvnode;
	}

	counter_u64_add(recycles_count, 1);
	vgone(vp);
putvnode:
	NDFREE(&nd, 0);
out:
	free(buf, M_TEMP);
	return (error);
}

static int
sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct file *fp;
	int error;
	int fd;

	if (req->newptr == NULL)
		return (EBADF);

	error = sysctl_handle_int(oidp, &fd, 0, req);
	if (error != 0)
		return (error);
	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		goto drop;

	counter_u64_add(recycles_count, 1);
	vgone(vp);
	VOP_UNLOCK(vp);
drop:
	fdrop(fp, td);
	return (error);
}

SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_ftry_reclaim_vnode, "I",
    "Try to reclaim a vnode by its file descriptor");

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;

/*
 * Support for the bufobj clean & dirty pctrie.
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{

	return (uma_zalloc(buf_trie_zone, M_NOWAIT));
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{

	uma_zfree(buf_trie_zone, node);
}
PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);

/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.
 * In the limit, as the physical memory size grows, the ratio of the
 * memory size in KB to vnodes approaches 64:1.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
#endif

static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

static struct vnode *
vn_alloc_marker(struct mount *mp)
{
	struct vnode *vp;

	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	vp->v_type = VMARKER;
	vp->v_mount = mp;

	return (vp);
}

static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems (like UFS and ZFS) may use bigger or smaller
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define	NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ		148
#else
#define	NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ		92
#endif

static void
vntblinit(void *dummy __unused)
{
	u_int i;
	int physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the marginal ratio of desiredvnodes to the physical
	 * memory size is 1:64.  However, desiredvnodes is limited by the
	 * kernel's heap size.  The memory required by desiredvnodes vnodes
	 * and vm objects must not exceed 1/10th of the kernel's heap size.
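	 *
	 * For example (a rough sketch, not authoritative): with 4KB pages
	 * and about 4GB of physical memory, pgtok(vm_cnt.v_page_count) is
	 * roughly 4,194,304 KB, so the formula below gives
	 *
	 *	physvnodes ~= maxproc + 4194304 / 64 +
	 *	    3 * min(98304 * 16, 4194304) / 64
	 *	           ~= maxproc + 65536 + 73728 ~= 140,000 vnodes,
	 *
	 * still subject to the virtvnodes (kernel heap) limit computed
	 * right after it.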
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof(struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	free_owe_inact = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs			var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	      |
 *	      +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *	VOP_LOOKUP() obtains B while A is held
 *	vfs_busy() obtains a shared lock on F while A and B are held
 *	vput() releases lock on B
 *	vput() releases lock on A
 *	VFS_ROOT() obtains lock on D while shared lock on F is held
 *	vfs_unbusy() releases shared lock on F
 *	vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *	    Attempt to lock A (instead of vp_crossmp) while D is held would
 *	    violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.
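 *
 * A typical consumer pattern, as in vnlru_return_batches() below (a
 * sketch only; list iteration details elided).  Note that on success
 * vfs_busy() drops mountlist_mtx for us, while on failure it is left
 * held:
 *
 *	mtx_lock(&mountlist_mtx);
 *	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
 *		... operate on the vnodes of mp ...
 *		vfs_unbusy(mp);
 *	} else
 *		mtx_unlock(&mountlist_mtx);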
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mp, ref, 1);
		vfs_mp_count_add_pcpu(mp, lockref, 1);
		vfs_op_thread_exit(mp);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.  If the thread doing the unmounting
	 * fails, it will clear MNTK_UNMOUNT flag before waking us up,
	 * indicating that this mount point has survived the unmount
	 * attempt and vfs_busy should retry.  Otherwise the unmounter
	 * thread will set MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT,
	 * indicating that mount point is about to be really destroyed.
	 * vfs_busy needs to release its reference on the mount point in
	 * this case and return with ENOENT, telling the caller that the
	 * mount it tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mp, lockref, 1);
		vfs_mp_count_sub_pcpu(mp, ref, 1);
		vfs_op_thread_exit(mp);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In the worst case we may
 * get a pointer to an unmounted or even different filesystem, so we have
 * to check what we got, and fall back to the slow path if so.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL ||
	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if (jailed(td->td_ucred)) {
		/*
		 * If the jail of the calling thread lacks permission for
		 * this type of file system, deny immediately.
		 */
		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
			return (EPERM);

		/*
		 * If the file system was mounted outside the jail of the
		 * calling thread, deny immediately.
		 */
		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
			return (EPERM);
	}

	/*
	 * If the file system supports delegated administration, we don't
	 * check for the PRIV_VFS_MOUNT_OWNER privilege - it will be better
	 * verified by the file system itself.
	 * If this is not the user that did the original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8
 * calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
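 *
 * For example (a sketch with made-up numbers): for vfc_typenum 0x35 and
 * mntid_base 0x1234, the loop below first proposes
 *
 *	val[0] = makedev(255,
 *	    0x35000000 | ((0x1234 & 0xFF00) << 8) | (0x1234 & 0xFF))
 *	       = makedev(255, 0x35120034)
 *
 * and keeps incrementing mntid_base until vfs_getvfs() finds no existing
 * mount with the candidate fsid.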
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param mp		 Try to reclaim vnodes from this mountpoint
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true
 * @param trigger	 Only reclaim vnodes with fewer than this many resident
 *			 pages.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
{
	struct vnode *vp;
	int count, done, target;

	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize;
	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
	target = target / 10 + 1;
	while (count != 0 && done < target) {
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		/*
		 * XXX LRU is completely broken for non-free vnodes.  First
		 * by calling here in mountpoint order, then by moving
		 * unselected vnodes to the end here, and most grossly by
		 * removing the vlruvp() function that was supposed to
		 * maintain the order.  (This function was born broken
		 * since syncer problems prevented it doing anything.)  The
		 * order is closer to LRC (C = Created).
		 *
		 * LRU reclaiming of vnodes seems to have last worked in
		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
		 * Then there was no hold count, and inactive vnodes were
		 * simply put on the free list in LRU order.  The separate
		 * lists also break LRU.  We prefer to reclaim from the
		 * free list for technical reasons.  This tends to thrash
		 * the free list to keep very unrecently used held vnodes.
		 * The problem is mitigated by keeping the free list large.
		 */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    ((vp->v_iflag & VI_FREE) != 0) ||
		    VN_IS_DOOMED(vp) || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VIRF_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl(vp);
			goto next_iter_mntunlocked;
		}
		KASSERT(!VN_IS_DOOMED(vp),
		    ("VIRF_DOOMED unexpectedly detected in vlrureclaim()"));
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl(vp);
		done++;
next_iter_mntunlocked:
		if (!should_yield())
			goto relock_mnt;
		goto yield;
next_iter:
		if (!should_yield())
			continue;
		MNT_IUNLOCK(mp);
yield:
		kern_yield(PRI_USER);
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}

static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0,
    "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to reduce the free list by the requested amount.
 */
static void
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
	struct vnode *vp;
	struct mount *mp;
	bool tried_batches;

	tried_batches = false;
	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (count > max_vnlru_free)
		count = max_vnlru_free;
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (vp == NULL) {
			if (tried_batches)
				break;
			mtx_unlock(&vnode_free_list_mtx);
			vnlru_return_batches(mnt_op);
			tried_batches = true;
			mtx_lock(&vnode_free_list_mtx);
			continue;
		}

		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		KASSERT((vp->v_iflag & VI_FREE) != 0,
		    ("Removing vnode not on freelist"));
		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
		    ("Mangling active vnode"));
		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);

		/*
		 * Don't recycle if our vnode is from different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 * Don't recycle if we can't get the interlock without
		 * blocking.
		 */
		if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
			continue;
		}
		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
		    vp, ("vp inconsistent on freelist"));

		/*
		 * The clear of VI_FREE prevents activation of the
		 * vnode.  There is no sense in putting the vnode on
		 * the mount point active list, only to remove it
		 * later during recycling.  Inline the relevant part
		 * of vholdl(), to avoid triggering assertions or
		 * activating.
		 */
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		VNODE_REFCOUNT_FENCE_REL();
		refcount_acquire(&vp->v_holdcnt);

		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		/*
		 * If the recycle succeeded, this vdrop will actually free
		 * the vnode.  If not, it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}

void
vnlru_free(int count, struct vfsops *mnt_op)
{

	mtx_lock(&vnode_free_list_mtx);
	vnlru_free_locked(count, mnt_op);
	mtx_unlock(&vnode_free_list_mtx);
}

/* XXX some names and initialization are bad for limits and watermarks. */
static int
vspace(void)
{
	u_long rnumvnodes, rfreevnodes;
	int space;

	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
	rnumvnodes = atomic_load_long(&numvnodes);
	rfreevnodes = atomic_load_long(&freevnodes);
	if (rnumvnodes > desiredvnodes)
		return (0);
	space = desiredvnodes - rnumvnodes;
	if (rfreevnodes > wantfreevnodes)
		space += rfreevnodes - wantfreevnodes;
	return (space);
}

static void
vnlru_return_batch_locked(struct mount *mp)
{
	struct vnode *vp;

	mtx_assert(&mp->mnt_listmtx, MA_OWNED);

	if (mp->mnt_tmpfreevnodelistsize == 0)
		return;

	TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
		VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
		    ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
	}
	mtx_lock(&vnode_free_list_mtx);
	TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
	freevnodes += mp->mnt_tmpfreevnodelistsize;
	mtx_unlock(&vnode_free_list_mtx);
	mp->mnt_tmpfreevnodelistsize = 0;
}

static void
vnlru_return_batch(struct mount *mp)
{

	mtx_lock(&mp->mnt_listmtx);
	vnlru_return_batch_locked(mp);
	mtx_unlock(&mp->mnt_listmtx);
}

static void
vnlru_return_batches(struct vfsops *mnt_op)
{
	struct mount *mp, *nmp;
	bool need_unbusy;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		need_unbusy = false;
		if (mnt_op != NULL && mp->mnt_op != mnt_op)
			goto next;
		if (mp->mnt_tmpfreevnodelistsize == 0)
			goto next;
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
			vnlru_return_batch(mp);
			need_unbusy = true;
			mtx_lock(&mountlist_mtx);
		}
next:
		nmp = TAILQ_NEXT(mp, mnt_list);
		if (need_unbusy)
			vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes;
	struct mount *mp, *nmp;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes, vsp;
	bool reclaim_nc_src;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_free_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);
		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
		 */
		if (rnumvnodes > desiredvnodes)
			vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
		/*
		 * Sleep if the vnode cache is in a good state.  This is
		 * when it is not over-full and has space for about a 4%
		 * or 9% expansion (by growing its size or inexcessively
		 * reducing its free list).  Otherwise, try to reclaim
		 * space for a 10% expansion.
		 */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		vsp = vspace();
		if (vsp >= vlowat && force == 0) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		rnumvnodes = atomic_load_long(&numvnodes);
		rfreevnodes = atomic_load_long(&freevnodes);

		onumvnodes = rnumvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (rnumvnodes <= desiredvnodes)
			usevnodes = rnumvnodes - rfreevnodes;
		else
			usevnodes = rnumvnodes;
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to give a conservatively
		 * large value to ensure that it alone doesn't prevent
		 * making progress.  The value can easily be so large that
		 * it is effectively infinite in some congested and
		 * misconfigured cases, and this is necessary.  Normally
		 * it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp, reclaim_nc_src, trigger);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim(UMA_RECLAIM_DRAIN);
		if (done == 0) {
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			kern_yield(PRI_USER);
		/*
		 * After becoming active to expand above low water, keep
		 * active until above high water.
		 */
		vsp = vspace();
		force = vsp < vhiwat;
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp);
		VI_UNLOCK(vp);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if (!VN_IS_DOOMED(vp)) {
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
	}
	VOP_UNLOCK(vp);
	VI_UNLOCK(vp);
	vn_finished_write(vnmp);
	return (0);
}

static void
vcheckspace(void)
{
	int vsp;

	vsp = vspace();
	if (vsp < vlowat && vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		wakeup(vnlruproc);
	}
}

/*
 * Wait if necessary for space for a new vnode.
 */
static int
getnewvnode_wait(int suspended)
{

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (numvnodes >= desiredvnodes) {
		if (suspended) {
			/*
			 * The file system is being suspended.  We cannot
			 * risk a deadlock here, so allow allocation of
			 * another vnode even if this would give too many.
			 */
			return (0);
		}
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
	}
	/* Post-adjust like the pre-adjust in getnewvnode(). */
	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
		vnlru_free_locked(1, NULL);
	return (numvnodes >= desiredvnodes ? ENFILE : 0);
}

/*
 * This hack is fragile, and probably not needed any more now that the
 * watermark handling works.
 */
void
getnewvnode_reserve(u_int count)
{
	u_long rnumvnodes, rfreevnodes;
	struct thread *td;

	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
	/* XXX no longer so quick, but this part is not racy. */
	mtx_lock(&vnode_free_list_mtx);
	rnumvnodes = atomic_load_long(&numvnodes);
	rfreevnodes = atomic_load_long(&freevnodes);
	if (rnumvnodes + count > desiredvnodes && rfreevnodes > wantfreevnodes)
		vnlru_free_locked(ulmin(rnumvnodes + count - desiredvnodes,
		    rfreevnodes - wantfreevnodes), NULL);
	mtx_unlock(&vnode_free_list_mtx);

	td = curthread;
	/* First try to be quick and racy.
	 */
	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
		td->td_vp_reserv += count;
		vcheckspace();	/* XXX no longer so quick, but more racy */
		return;
	} else
		atomic_subtract_long(&numvnodes, count);

	mtx_lock(&vnode_free_list_mtx);
	while (count > 0) {
		if (getnewvnode_wait(0) == 0) {
			count--;
			td->td_vp_reserv++;
			atomic_add_long(&numvnodes, 1);
		}
	}
	vcheckspace();
	mtx_unlock(&vnode_free_list_mtx);
}

/*
 * This hack is fragile, especially if desiredvnodes or wantfreevnodes are
 * misconfigured or changed significantly.  Reducing desiredvnodes below
 * the reserved amount should cause bizarre behaviour like reducing it
 * below the number of active vnodes -- the system will try to reduce
 * numvnodes to match, but should fail, so the subtraction below should
 * not overflow.
 */
void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
	td->td_vp_reserv = 0;
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;
	static int cyclecount;
	int error __unused;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

	KASSERT(vops->registered,
	    ("%s: not registered vector op %p\n", __func__, vops));

	vp = NULL;
	td = curthread;
	if (td->td_vp_reserv > 0) {
		td->td_vp_reserv -= 1;
		goto alloc;
	}
	mtx_lock(&vnode_free_list_mtx);
	if (numvnodes < desiredvnodes)
		cyclecount = 0;
	else if (cyclecount++ >= freevnodes) {
		cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (numvnodes + 1 <= desiredvnodes)
		;
	else if (freevnodes > 0)
		vnlru_free_locked(1, NULL);
	else {
		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
		    MNTK_SUSPEND));
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (error != 0) {
			mtx_unlock(&vnode_free_list_mtx);
			return (error);
		}
#endif
	}
	vcheckspace();
	atomic_add_long(&numvnodes, 1);
	mtx_unlock(&vnode_free_list_mtx);
alloc:
	counter_u64_add(vnodes_created, 1);
	vp = (struct vnode *)uma_zalloc(vnode_zone, M_WAITOK);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name.  Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another.
	 * We ensure that each filesystem uses a single static name
	 * pointer for its tag so that we can compare pointers rather
	 * than doing a strcmp().
	 */
	lo = &vp->v_vnlock->lock_object;
	if (lo->lo_name != tag) {
		lo->lo_name = tag;
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
	 */
	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
	vp->v_type = VNON;
	vp->v_op = vops;
	v_init_counters(vp);
	vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
	if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
#endif
	if (mp != NULL) {
		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}

static void
freevnode(struct vnode *vp)
{
	struct bufobj *bo;

	/*
	 * The vnode has been marked for destruction, so free it.
	 *
	 * The vnode will be returned to the zone where it will
	 * normally remain until it is needed for another vnode.  We
	 * need to clean up (or verify that the cleanup has already
	 * been done) any residual data left from its current use
	 * so as not to contaminate the freshly allocated vnode.
	 */
	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
	atomic_subtract_long(&numvnodes, 1);
	bo = &vp->v_bufobj;
	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
	    ("cleaned vnode still on the free list."));
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
	    ("clean blk trie not empty"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
	    ("dirty blk trie not empty"));
	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
	    ("Dangling rangelock waiters"));
	VI_UNLOCK(vp);
#ifdef MAC
	mac_vnode_destroy(vp);
#endif
	if (vp->v_pollinfo != NULL) {
		destroy_vpollinfo(vp->v_pollinfo);
		vp->v_pollinfo = NULL;
	}
#ifdef INVARIANTS
	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
	vp->v_op = NULL;
#endif
	vp->v_mountedhere = NULL;
	vp->v_unpcb = NULL;
	vp->v_rdev = NULL;
	vp->v_fifoinfo = NULL;
	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	vp->v_irflag = 0;
	vp->v_iflag = 0;
	vp->v_vflag = 0;
	bo->bo_flag = 0;
	uma_zfree(vnode_zone, vp);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	mp = vp->v_mount;
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
	    ("Active vnode list size %d > Vnode list size %d",
	    mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
	if (vp->v_iflag & VI_ACTIVE) {
		vp->v_iflag &= ~VI_ACTIVE;
		mtx_lock(&mp->mnt_listmtx);
		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
		mp->mnt_activevnodelistsize--;
		mtx_unlock(&mp->mnt_listmtx);
	}
	vp->v_mount = NULL;
	VI_UNLOCK(vp);
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

	vp->v_data = NULL;
	vp->v_op = &dead_vnodeops;
	vgone(vp);
	vput(vp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
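 *
 * A typical filesystem VFS_VGET() path looks roughly like this (a
 * sketch only; the "myfs" names are hypothetical):
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	...
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 *
 * On failure, insmntque() has already destroyed the vnode via
 * insmntque_stddtr() above (vgone() + vput()), so the caller must not
 * touch vp again.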
1805 */ 1806 int 1807 insmntque1(struct vnode *vp, struct mount *mp, 1808 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1809 { 1810 1811 KASSERT(vp->v_mount == NULL, 1812 ("insmntque: vnode already on per mount vnode list")); 1813 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1814 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1815 1816 /* 1817 * We acquire the vnode interlock early to ensure that the 1818 * vnode cannot be recycled by another process releasing a 1819 * holdcnt on it before we get it on both the vnode list 1820 * and the active vnode list. The mount mutex protects only 1821 * manipulation of the vnode list and the vnode freelist 1822 * mutex protects only manipulation of the active vnode list. 1823 * Hence the need to hold the vnode interlock throughout. 1824 */ 1825 MNT_ILOCK(mp); 1826 VI_LOCK(vp); 1827 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 1828 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1829 mp->mnt_nvnodelistsize == 0)) && 1830 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1831 VI_UNLOCK(vp); 1832 MNT_IUNLOCK(mp); 1833 if (dtr != NULL) 1834 dtr(vp, dtr_arg); 1835 return (EBUSY); 1836 } 1837 vp->v_mount = mp; 1838 MNT_REF(mp); 1839 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1840 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1841 ("neg mount point vnode list size")); 1842 mp->mnt_nvnodelistsize++; 1843 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1844 ("Activating already active vnode")); 1845 vp->v_iflag |= VI_ACTIVE; 1846 mtx_lock(&mp->mnt_listmtx); 1847 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1848 mp->mnt_activevnodelistsize++; 1849 mtx_unlock(&mp->mnt_listmtx); 1850 VI_UNLOCK(vp); 1851 MNT_IUNLOCK(mp); 1852 return (0); 1853 } 1854 1855 int 1856 insmntque(struct vnode *vp, struct mount *mp) 1857 { 1858 1859 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1860 } 1861 1862 /* 1863 * Flush out and invalidate all buffers associated with a bufobj 1864 * Called with the underlying object locked. 1865 */ 1866 int 1867 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1868 { 1869 int error; 1870 1871 BO_LOCK(bo); 1872 if (flags & V_SAVE) { 1873 error = bufobj_wwait(bo, slpflag, slptimeo); 1874 if (error) { 1875 BO_UNLOCK(bo); 1876 return (error); 1877 } 1878 if (bo->bo_dirty.bv_cnt > 0) { 1879 BO_UNLOCK(bo); 1880 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1881 return (error); 1882 /* 1883 * XXX We could save a lock/unlock if this was only 1884 * enabled under INVARIANTS 1885 */ 1886 BO_LOCK(bo); 1887 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1888 panic("vinvalbuf: dirty bufs"); 1889 } 1890 } 1891 /* 1892 * If you alter this loop please notice that interlock is dropped and 1893 * reacquired in flushbuflist. Special care is needed to ensure that 1894 * no race conditions occur from this. 1895 */ 1896 do { 1897 error = flushbuflist(&bo->bo_clean, 1898 flags, bo, slpflag, slptimeo); 1899 if (error == 0 && !(flags & V_CLEANONLY)) 1900 error = flushbuflist(&bo->bo_dirty, 1901 flags, bo, slpflag, slptimeo); 1902 if (error != 0 && error != EAGAIN) { 1903 BO_UNLOCK(bo); 1904 return (error); 1905 } 1906 } while (error != 0); 1907 1908 /* 1909 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1910 * have write I/O in-progress but if there is a VM object then the 1911 * VM object can also have read-I/O in-progress. 
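 * Hence the loop below drains bo_numoutput with bufobj_wwait() and,
 * unless V_VMIO was passed, also waits out the backing VM object's
 * paging-in-progress count between rounds.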
1912 */ 1913 do { 1914 bufobj_wwait(bo, 0, 0); 1915 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 1916 BO_UNLOCK(bo); 1917 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 1918 BO_LOCK(bo); 1919 } 1920 } while (bo->bo_numoutput > 0); 1921 BO_UNLOCK(bo); 1922 1923 /* 1924 * Destroy the copy in the VM cache, too. 1925 */ 1926 if (bo->bo_object != NULL && 1927 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 1928 VM_OBJECT_WLOCK(bo->bo_object); 1929 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1930 OBJPR_CLEANONLY : 0); 1931 VM_OBJECT_WUNLOCK(bo->bo_object); 1932 } 1933 1934 #ifdef INVARIANTS 1935 BO_LOCK(bo); 1936 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 1937 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 1938 bo->bo_clean.bv_cnt > 0)) 1939 panic("vinvalbuf: flush failed"); 1940 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 1941 bo->bo_dirty.bv_cnt > 0) 1942 panic("vinvalbuf: flush dirty failed"); 1943 BO_UNLOCK(bo); 1944 #endif 1945 return (0); 1946 } 1947 1948 /* 1949 * Flush out and invalidate all buffers associated with a vnode. 1950 * Called with the underlying object locked. 1951 */ 1952 int 1953 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1954 { 1955 1956 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1957 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1958 if (vp->v_object != NULL && vp->v_object->handle != vp) 1959 return (0); 1960 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1961 } 1962 1963 /* 1964 * Flush out buffers on the specified list. 1965 * 1966 */ 1967 static int 1968 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1969 int slptimeo) 1970 { 1971 struct buf *bp, *nbp; 1972 int retval, error; 1973 daddr_t lblkno; 1974 b_xflags_t xflags; 1975 1976 ASSERT_BO_WLOCKED(bo); 1977 1978 retval = 0; 1979 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1980 /* 1981 * If we are flushing both V_NORMAL and V_ALT buffers then 1982 * do not skip any buffers. If we are flushing only V_NORMAL 1983 * buffers then skip buffers marked as BX_ALTDATA. If we are 1984 * flushing only V_ALT buffers then skip buffers not marked 1985 * as BX_ALTDATA. 1986 */ 1987 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 1988 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 1989 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 1990 continue; 1991 } 1992 if (nbp != NULL) { 1993 lblkno = nbp->b_lblkno; 1994 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 1995 } 1996 retval = EAGAIN; 1997 error = BUF_TIMELOCK(bp, 1998 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 1999 "flushbuf", slpflag, slptimeo); 2000 if (error) { 2001 BO_LOCK(bo); 2002 return (error != ENOLCK ? error : EAGAIN); 2003 } 2004 KASSERT(bp->b_bufobj == bo, 2005 ("bp %p wrong b_bufobj %p should be %p", 2006 bp, bp->b_bufobj, bo)); 2007 /* 2008 * XXX Since there are no node locks for NFS, I 2009 * believe there is a slight chance that a delayed 2010 * write will occur while sleeping just above, so 2011 * check for it. 2012 */ 2013 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2014 (flags & V_SAVE)) { 2015 bremfree(bp); 2016 bp->b_flags |= B_ASYNC; 2017 bwrite(bp); 2018 BO_LOCK(bo); 2019 return (EAGAIN); /* XXX: why not loop ? 
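 * Returning EAGAIN sends the caller back to the retry loop in
 * bufobj_invalbuf(), so the effect is much the same as looping
 * here.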
*/ 2020 } 2021 bremfree(bp); 2022 bp->b_flags |= (B_INVAL | B_RELBUF); 2023 bp->b_flags &= ~B_ASYNC; 2024 brelse(bp); 2025 BO_LOCK(bo); 2026 if (nbp == NULL) 2027 break; 2028 nbp = gbincore(bo, lblkno); 2029 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2030 != xflags) 2031 break; /* nbp invalid */ 2032 } 2033 return (retval); 2034 } 2035 2036 int 2037 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2038 { 2039 struct buf *bp; 2040 int error; 2041 daddr_t lblkno; 2042 2043 ASSERT_BO_LOCKED(bo); 2044 2045 for (lblkno = startn;;) { 2046 again: 2047 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2048 if (bp == NULL || bp->b_lblkno >= endn || 2049 bp->b_lblkno < startn) 2050 break; 2051 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2052 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2053 if (error != 0) { 2054 BO_RLOCK(bo); 2055 if (error == ENOLCK) 2056 goto again; 2057 return (error); 2058 } 2059 KASSERT(bp->b_bufobj == bo, 2060 ("bp %p wrong b_bufobj %p should be %p", 2061 bp, bp->b_bufobj, bo)); 2062 lblkno = bp->b_lblkno + 1; 2063 if ((bp->b_flags & B_MANAGED) == 0) 2064 bremfree(bp); 2065 bp->b_flags |= B_RELBUF; 2066 /* 2067 * In the VMIO case, use the B_NOREUSE flag to hint that the 2068 * pages backing each buffer in the range are unlikely to be 2069 * reused. Dirty buffers will have the hint applied once 2070 * they've been written. 2071 */ 2072 if ((bp->b_flags & B_VMIO) != 0) 2073 bp->b_flags |= B_NOREUSE; 2074 brelse(bp); 2075 BO_RLOCK(bo); 2076 } 2077 return (0); 2078 } 2079 2080 /* 2081 * Truncate a file's buffer and pages to a specified length. This 2082 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2083 * sync activity. 2084 */ 2085 int 2086 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2087 { 2088 struct buf *bp, *nbp; 2089 struct bufobj *bo; 2090 daddr_t startlbn; 2091 2092 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2093 vp, blksize, (uintmax_t)length); 2094 2095 /* 2096 * Round up to the *next* lbn. 2097 */ 2098 startlbn = howmany(length, blksize); 2099 2100 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2101 2102 bo = &vp->v_bufobj; 2103 restart_unlocked: 2104 BO_LOCK(bo); 2105 2106 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2107 ; 2108 2109 if (length > 0) { 2110 restartsync: 2111 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2112 if (bp->b_lblkno > 0) 2113 continue; 2114 /* 2115 * Since we hold the vnode lock this should only 2116 * fail if we're racing with the buf daemon. 2117 */ 2118 if (BUF_LOCK(bp, 2119 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2120 BO_LOCKPTR(bo)) == ENOLCK) 2121 goto restart_unlocked; 2122 2123 VNASSERT((bp->b_flags & B_DELWRI), vp, 2124 ("buf(%p) on dirty queue without DELWRI", bp)); 2125 2126 bremfree(bp); 2127 bawrite(bp); 2128 BO_LOCK(bo); 2129 goto restartsync; 2130 } 2131 } 2132 2133 bufobj_wwait(bo, 0, 0); 2134 BO_UNLOCK(bo); 2135 vnode_pager_setsize(vp, length); 2136 2137 return (0); 2138 } 2139 2140 /* 2141 * Invalidate the cached pages of a file's buffer within the range of block 2142 * numbers [startlbn, endlbn). 
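 *
 * As a sketch (with the vnode locked and bsize matching bo_bsize), a
 * filesystem discarding cached data for the byte range
 * [off, off + len) would use:
 *
 *	v_inval_buf_range(vp, off / bsize, howmany(off + len, bsize),
 *	    bsize);
 *
 * mirroring the howmany() rounding used by vtruncbuf() above.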
2143 */ 2144 void 2145 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2146 int blksize) 2147 { 2148 struct bufobj *bo; 2149 off_t start, end; 2150 2151 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2152 2153 start = blksize * startlbn; 2154 end = blksize * endlbn; 2155 2156 bo = &vp->v_bufobj; 2157 BO_LOCK(bo); 2158 MPASS(blksize == bo->bo_bsize); 2159 2160 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2161 ; 2162 2163 BO_UNLOCK(bo); 2164 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2165 } 2166 2167 static int 2168 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2169 daddr_t startlbn, daddr_t endlbn) 2170 { 2171 struct buf *bp, *nbp; 2172 bool anyfreed; 2173 2174 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2175 ASSERT_BO_LOCKED(bo); 2176 2177 do { 2178 anyfreed = false; 2179 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2180 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2181 continue; 2182 if (BUF_LOCK(bp, 2183 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2184 BO_LOCKPTR(bo)) == ENOLCK) { 2185 BO_LOCK(bo); 2186 return (EAGAIN); 2187 } 2188 2189 bremfree(bp); 2190 bp->b_flags |= B_INVAL | B_RELBUF; 2191 bp->b_flags &= ~B_ASYNC; 2192 brelse(bp); 2193 anyfreed = true; 2194 2195 BO_LOCK(bo); 2196 if (nbp != NULL && 2197 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2198 nbp->b_vp != vp || 2199 (nbp->b_flags & B_DELWRI) != 0)) 2200 return (EAGAIN); 2201 } 2202 2203 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2204 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2205 continue; 2206 if (BUF_LOCK(bp, 2207 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2208 BO_LOCKPTR(bo)) == ENOLCK) { 2209 BO_LOCK(bo); 2210 return (EAGAIN); 2211 } 2212 bremfree(bp); 2213 bp->b_flags |= B_INVAL | B_RELBUF; 2214 bp->b_flags &= ~B_ASYNC; 2215 brelse(bp); 2216 anyfreed = true; 2217 2218 BO_LOCK(bo); 2219 if (nbp != NULL && 2220 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2221 (nbp->b_vp != vp) || 2222 (nbp->b_flags & B_DELWRI) == 0)) 2223 return (EAGAIN); 2224 } 2225 } while (anyfreed); 2226 return (0); 2227 } 2228 2229 static void 2230 buf_vlist_remove(struct buf *bp) 2231 { 2232 struct bufv *bv; 2233 2234 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2235 ASSERT_BO_WLOCKED(bp->b_bufobj); 2236 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 2237 (BX_VNDIRTY|BX_VNCLEAN), 2238 ("buf_vlist_remove: Buf %p is on two lists", bp)); 2239 if (bp->b_xflags & BX_VNDIRTY) 2240 bv = &bp->b_bufobj->bo_dirty; 2241 else 2242 bv = &bp->b_bufobj->bo_clean; 2243 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2244 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2245 bv->bv_cnt--; 2246 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2247 } 2248 2249 /* 2250 * Add the buffer to the sorted clean or dirty block list. 2251 * 2252 * NOTE: xflags is passed as a constant, optimizing this inline function! 2253 */ 2254 static void 2255 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2256 { 2257 struct bufv *bv; 2258 struct buf *n; 2259 int error; 2260 2261 ASSERT_BO_WLOCKED(bo); 2262 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2263 ("dead bo %p", bo)); 2264 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2265 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2266 bp->b_xflags |= xflags; 2267 if (xflags & BX_VNDIRTY) 2268 bv = &bo->bo_dirty; 2269 else 2270 bv = &bo->bo_clean; 2271 2272 /* 2273 * Keep the list ordered. Optimize empty list insertion. 
Assume 2274 * we tend to grow at the tail so lookup_le should usually be cheaper 2275 * than _ge. 2276 */ 2277 if (bv->bv_cnt == 0 || 2278 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2279 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2280 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2281 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2282 else 2283 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2284 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2285 if (error) 2286 panic("buf_vlist_add: Preallocated nodes insufficient."); 2287 bv->bv_cnt++; 2288 } 2289 2290 /* 2291 * Look up a buffer using the buffer tries. 2292 */ 2293 struct buf * 2294 gbincore(struct bufobj *bo, daddr_t lblkno) 2295 { 2296 struct buf *bp; 2297 2298 ASSERT_BO_LOCKED(bo); 2299 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2300 if (bp != NULL) 2301 return (bp); 2302 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 2303 } 2304 2305 /* 2306 * Associate a buffer with a vnode. 2307 */ 2308 void 2309 bgetvp(struct vnode *vp, struct buf *bp) 2310 { 2311 struct bufobj *bo; 2312 2313 bo = &vp->v_bufobj; 2314 ASSERT_BO_WLOCKED(bo); 2315 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2316 2317 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2318 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2319 ("bgetvp: bp already attached! %p", bp)); 2320 2321 vhold(vp); 2322 bp->b_vp = vp; 2323 bp->b_bufobj = bo; 2324 /* 2325 * Insert onto list for new vnode. 2326 */ 2327 buf_vlist_add(bp, bo, BX_VNCLEAN); 2328 } 2329 2330 /* 2331 * Disassociate a buffer from a vnode. 2332 */ 2333 void 2334 brelvp(struct buf *bp) 2335 { 2336 struct bufobj *bo; 2337 struct vnode *vp; 2338 2339 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2340 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2341 2342 /* 2343 * Delete from old vnode list, if on one. 2344 */ 2345 vp = bp->b_vp; /* XXX */ 2346 bo = bp->b_bufobj; 2347 BO_LOCK(bo); 2348 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2349 buf_vlist_remove(bp); 2350 else 2351 panic("brelvp: Buffer %p not on queue.", bp); 2352 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2353 bo->bo_flag &= ~BO_ONWORKLST; 2354 mtx_lock(&sync_mtx); 2355 LIST_REMOVE(bo, bo_synclist); 2356 syncer_worklist_len--; 2357 mtx_unlock(&sync_mtx); 2358 } 2359 bp->b_vp = NULL; 2360 bp->b_bufobj = NULL; 2361 BO_UNLOCK(bo); 2362 vdrop(vp); 2363 } 2364 2365 /* 2366 * Add an item to the syncer work queue. 
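 *
 * The work queue is a wheel of one-second slots: the entry is placed
 * delay slots ahead of the position the syncer is currently draining,
 * so it will be visited roughly delay seconds from now (clamped to
 * syncer_maxdelay - 2). For example, with syncer_delayno == 10 and
 * delay == 30 the bufobj lands in slot (10 + 30) & syncer_mask.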
2367 */ 2368 static void 2369 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2370 { 2371 int slot; 2372 2373 ASSERT_BO_WLOCKED(bo); 2374 2375 mtx_lock(&sync_mtx); 2376 if (bo->bo_flag & BO_ONWORKLST) 2377 LIST_REMOVE(bo, bo_synclist); 2378 else { 2379 bo->bo_flag |= BO_ONWORKLST; 2380 syncer_worklist_len++; 2381 } 2382 2383 if (delay > syncer_maxdelay - 2) 2384 delay = syncer_maxdelay - 2; 2385 slot = (syncer_delayno + delay) & syncer_mask; 2386 2387 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2388 mtx_unlock(&sync_mtx); 2389 } 2390 2391 static int 2392 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2393 { 2394 int error, len; 2395 2396 mtx_lock(&sync_mtx); 2397 len = syncer_worklist_len - sync_vnode_count; 2398 mtx_unlock(&sync_mtx); 2399 error = SYSCTL_OUT(req, &len, sizeof(len)); 2400 return (error); 2401 } 2402 2403 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 2404 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2405 2406 static struct proc *updateproc; 2407 static void sched_sync(void); 2408 static struct kproc_desc up_kp = { 2409 "syncer", 2410 sched_sync, 2411 &updateproc 2412 }; 2413 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2414 2415 static int 2416 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2417 { 2418 struct vnode *vp; 2419 struct mount *mp; 2420 2421 *bo = LIST_FIRST(slp); 2422 if (*bo == NULL) 2423 return (0); 2424 vp = bo2vnode(*bo); 2425 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2426 return (1); 2427 /* 2428 * We use vhold in case the vnode does not 2429 * successfully sync. vhold prevents the vnode from 2430 * going away when we unlock the sync_mtx so that 2431 * we can acquire the vnode interlock. 2432 */ 2433 vholdl(vp); 2434 mtx_unlock(&sync_mtx); 2435 VI_UNLOCK(vp); 2436 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2437 vdrop(vp); 2438 mtx_lock(&sync_mtx); 2439 return (*bo == LIST_FIRST(slp)); 2440 } 2441 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2442 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2443 VOP_UNLOCK(vp); 2444 vn_finished_write(mp); 2445 BO_LOCK(*bo); 2446 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2447 /* 2448 * Put us back on the worklist. The worklist 2449 * routine will remove us from our current 2450 * position and then add us back in at a later 2451 * position. 2452 */ 2453 vn_syncer_add_to_worklist(*bo, syncdelay); 2454 } 2455 BO_UNLOCK(*bo); 2456 vdrop(vp); 2457 mtx_lock(&sync_mtx); 2458 return (0); 2459 } 2460 2461 static int first_printf = 1; 2462 2463 /* 2464 * System filesystem synchronizer daemon. 
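 *
 * Roughly: once a second, advance to the next slot of the wheel and
 * sync every bufobj found there via sync_vnode(); entries that cannot
 * be handled immediately are moved to the next slot. On shutdown the
 * wheel keeps being swept (printing progress) until only the syncer
 * vnodes themselves remain.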
2465 */
2466 static void
2467 sched_sync(void)
2468 {
2469 struct synclist *next, *slp;
2470 struct bufobj *bo;
2471 long starttime;
2472 struct thread *td = curthread;
2473 int last_work_seen;
2474 int net_worklist_len;
2475 int syncer_final_iter;
2476 int error;
2477
2478 last_work_seen = 0;
2479 syncer_final_iter = 0;
2480 syncer_state = SYNCER_RUNNING;
2481 starttime = time_uptime;
2482 td->td_pflags |= TDP_NORUNNINGBUF;
2483
2484 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
2485 SHUTDOWN_PRI_LAST);
2486
2487 mtx_lock(&sync_mtx);
2488 for (;;) {
2489 if (syncer_state == SYNCER_FINAL_DELAY &&
2490 syncer_final_iter == 0) {
2491 mtx_unlock(&sync_mtx);
2492 kproc_suspend_check(td->td_proc);
2493 mtx_lock(&sync_mtx);
2494 }
2495 net_worklist_len = syncer_worklist_len - sync_vnode_count;
2496 if (syncer_state != SYNCER_RUNNING &&
2497 starttime != time_uptime) {
2498 if (first_printf) {
2499 printf("\nSyncing disks, vnodes remaining... ");
2500 first_printf = 0;
2501 }
2502 printf("%d ", net_worklist_len);
2503 }
2504 starttime = time_uptime;
2505
2506 /*
2507 * Push files whose dirty time has expired. Be careful
2508 * of interrupt race on slp queue.
2509 *
2510 * Skip over empty worklist slots when shutting down.
2511 */
2512 do {
2513 slp = &syncer_workitem_pending[syncer_delayno];
2514 syncer_delayno += 1;
2515 if (syncer_delayno == syncer_maxdelay)
2516 syncer_delayno = 0;
2517 next = &syncer_workitem_pending[syncer_delayno];
2518 /*
2519 * If the worklist has wrapped since it was
2520 * emptied of all but syncer vnodes,
2521 * switch to the FINAL_DELAY state and run
2522 * for one more second.
2523 */
2524 if (syncer_state == SYNCER_SHUTTING_DOWN &&
2525 net_worklist_len == 0 &&
2526 last_work_seen == syncer_delayno) {
2527 syncer_state = SYNCER_FINAL_DELAY;
2528 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
2529 }
2530 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
2531 syncer_worklist_len > 0);
2532
2533 /*
2534 * Keep track of the last time there was anything
2535 * on the worklist other than syncer vnodes.
2536 * Return to the SHUTTING_DOWN state if any
2537 * new work appears.
2538 */
2539 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
2540 last_work_seen = syncer_delayno;
2541 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
2542 syncer_state = SYNCER_SHUTTING_DOWN;
2543 while (!LIST_EMPTY(slp)) {
2544 error = sync_vnode(slp, &bo, td);
2545 if (error == 1) {
2546 LIST_REMOVE(bo, bo_synclist);
2547 LIST_INSERT_HEAD(next, bo, bo_synclist);
2548 continue;
2549 }
2550
2551 if (first_printf == 0) {
2552 /*
2553 * Drop the sync mutex, because some watchdog
2554 * drivers need to sleep while patting the watchdog.
2555 */
2556 mtx_unlock(&sync_mtx);
2557 wdog_kern_pat(WD_LASTVAL);
2558 mtx_lock(&sync_mtx);
2559 }
2560
2561 }
2562 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
2563 syncer_final_iter--;
2564 /*
2565 * The variable rushjob allows the kernel to speed up the
2566 * processing of the filesystem syncer process. A rushjob
2567 * value of N tells the filesystem syncer to process the next
2568 * N seconds' worth of work on its queue ASAP. Currently rushjob
2569 * is used by the soft update code to speed up the filesystem
2570 * syncer process when the incore state is getting so far
2571 * ahead of the disk that the kernel memory pool is being
2572 * threatened with exhaustion.
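 *
 * Each unit of rushjob consumed below skips the one-second sleep and
 * immediately processes the next wheel slot, so each call to
 * speedup_syncer() buys one extra second's worth of the worklist
 * processed right away.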
2573 */
2574 if (rushjob > 0) {
2575 rushjob -= 1;
2576 continue;
2577 }
2578 /*
2579 * Just sleep for a short period of time between
2580 * iterations when shutting down to allow some I/O
2581 * to happen.
2582 *
2583 * If it has taken us less than a second to process the
2584 * current work, then wait. Otherwise start right over
2585 * again. We can still lose time if any single round
2586 * takes more than two seconds, but it does not really
2587 * matter as we are just trying to generally pace the
2588 * filesystem activity.
2589 */
2590 if (syncer_state != SYNCER_RUNNING ||
2591 time_uptime == starttime) {
2592 thread_lock(td);
2593 sched_prio(td, PPAUSE);
2594 thread_unlock(td);
2595 }
2596 if (syncer_state != SYNCER_RUNNING)
2597 cv_timedwait(&sync_wakeup, &sync_mtx,
2598 hz / SYNCER_SHUTDOWN_SPEEDUP);
2599 else if (time_uptime == starttime)
2600 cv_timedwait(&sync_wakeup, &sync_mtx, hz);
2601 }
2602 }
2603
2604 /*
2605 * Request the syncer daemon to speed up its work.
2606 * We never push it to speed up more than half of its
2607 * normal turn time, otherwise it could take over the cpu.
2608 */
2609 int
2610 speedup_syncer(void)
2611 {
2612 int ret = 0;
2613
2614 mtx_lock(&sync_mtx);
2615 if (rushjob < syncdelay / 2) {
2616 rushjob += 1;
2617 stat_rush_requests += 1;
2618 ret = 1;
2619 }
2620 mtx_unlock(&sync_mtx);
2621 cv_broadcast(&sync_wakeup);
2622 return (ret);
2623 }
2624
2625 /*
2626 * Tell the syncer to speed up its work and run through its work
2627 * list several times, then tell it to shut down.
2628 */
2629 static void
2630 syncer_shutdown(void *arg, int howto)
2631 {
2632
2633 if (howto & RB_NOSYNC)
2634 return;
2635 mtx_lock(&sync_mtx);
2636 syncer_state = SYNCER_SHUTTING_DOWN;
2637 rushjob = 0;
2638 mtx_unlock(&sync_mtx);
2639 cv_broadcast(&sync_wakeup);
2640 kproc_shutdown(arg, howto);
2641 }
2642
2643 void
2644 syncer_suspend(void)
2645 {
2646
2647 syncer_shutdown(updateproc, 0);
2648 }
2649
2650 void
2651 syncer_resume(void)
2652 {
2653
2654 mtx_lock(&sync_mtx);
2655 first_printf = 1;
2656 syncer_state = SYNCER_RUNNING;
2657 mtx_unlock(&sync_mtx);
2658 cv_broadcast(&sync_wakeup);
2659 kproc_resume(updateproc);
2660 }
2661
2662 /*
2663 * Reassign a buffer from one vnode to another.
2664 * Used to assign file-specific control information
2665 * (indirect blocks) to the vnode to which they belong.
2666 */
2667 void
2668 reassignbuf(struct buf *bp)
2669 {
2670 struct vnode *vp;
2671 struct bufobj *bo;
2672 int delay;
2673 #ifdef INVARIANTS
2674 struct bufv *bv;
2675 #endif
2676
2677 vp = bp->b_vp;
2678 bo = bp->b_bufobj;
2679 ++reassignbufcalls;
2680
2681 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2682 bp, bp->b_vp, bp->b_flags);
2683 /*
2684 * B_PAGING flagged buffers cannot be reassigned because their vp
2685 * is not fully linked in.
2686 */
2687 if (bp->b_flags & B_PAGING)
2688 panic("cannot reassign paging buffer");
2689
2690 /*
2691 * Delete from old vnode list, if on one.
2692 */
2693 BO_LOCK(bo);
2694 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2695 buf_vlist_remove(bp);
2696 else
2697 panic("reassignbuf: Buffer %p not on queue.", bp);
2698 /*
2699 * If dirty, put on list of dirty buffers; otherwise insert onto list
2700 * of clean buffers.
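 * This is how, for instance, a buffer that bdirty() just marked
 * B_DELWRI migrates from bo_clean to bo_dirty and, if the bufobj was
 * not already on the syncer worklist, gets it scheduled there.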
2701 */ 2702 if (bp->b_flags & B_DELWRI) { 2703 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2704 switch (vp->v_type) { 2705 case VDIR: 2706 delay = dirdelay; 2707 break; 2708 case VCHR: 2709 delay = metadelay; 2710 break; 2711 default: 2712 delay = filedelay; 2713 } 2714 vn_syncer_add_to_worklist(bo, delay); 2715 } 2716 buf_vlist_add(bp, bo, BX_VNDIRTY); 2717 } else { 2718 buf_vlist_add(bp, bo, BX_VNCLEAN); 2719 2720 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2721 mtx_lock(&sync_mtx); 2722 LIST_REMOVE(bo, bo_synclist); 2723 syncer_worklist_len--; 2724 mtx_unlock(&sync_mtx); 2725 bo->bo_flag &= ~BO_ONWORKLST; 2726 } 2727 } 2728 #ifdef INVARIANTS 2729 bv = &bo->bo_clean; 2730 bp = TAILQ_FIRST(&bv->bv_hd); 2731 KASSERT(bp == NULL || bp->b_bufobj == bo, 2732 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2733 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2734 KASSERT(bp == NULL || bp->b_bufobj == bo, 2735 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2736 bv = &bo->bo_dirty; 2737 bp = TAILQ_FIRST(&bv->bv_hd); 2738 KASSERT(bp == NULL || bp->b_bufobj == bo, 2739 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2740 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2741 KASSERT(bp == NULL || bp->b_bufobj == bo, 2742 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2743 #endif 2744 BO_UNLOCK(bo); 2745 } 2746 2747 static void 2748 v_init_counters(struct vnode *vp) 2749 { 2750 2751 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2752 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2753 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2754 2755 refcount_init(&vp->v_holdcnt, 1); 2756 refcount_init(&vp->v_usecount, 1); 2757 } 2758 2759 /* 2760 * Increment si_usecount of the associated device, if any. 2761 */ 2762 static void 2763 v_incr_devcount(struct vnode *vp) 2764 { 2765 2766 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2767 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2768 dev_lock(); 2769 vp->v_rdev->si_usecount++; 2770 dev_unlock(); 2771 } 2772 } 2773 2774 /* 2775 * Decrement si_usecount of the associated device, if any. 2776 */ 2777 static void 2778 v_decr_devcount(struct vnode *vp) 2779 { 2780 2781 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2782 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2783 dev_lock(); 2784 vp->v_rdev->si_usecount--; 2785 dev_unlock(); 2786 } 2787 } 2788 2789 /* 2790 * Grab a particular vnode from the free list, increment its 2791 * reference count and lock it. VIRF_DOOMED is set if the vnode 2792 * is being destroyed. Only callers who specify LK_RETRY will 2793 * see doomed vnodes. If inactive processing was delayed in 2794 * vput try to do it here. 2795 * 2796 * Both holdcnt and usecount can be manipulated using atomics without holding 2797 * any locks except in these cases which require the vnode interlock: 2798 * holdcnt: 1->0 and 0->1 2799 * usecount: 0->1 2800 * 2801 * usecount is permitted to transition 1->0 without the interlock because 2802 * vnode is kept live by holdcnt. 
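 *
 * A sketch of the two-step form used by lockless lookups (the hash
 * lock is illustrative):
 *
 *	enum vgetstate vs;
 *
 *	vs = vget_prep(vp);		/* grabs usecount or holdcnt */
 *	mtx_unlock(&hash_lock);
 *	error = vget_finish(vp, LK_EXCLUSIVE, vs);
 *	if (error != 0)
 *		return (error);
 *
 * vget_finish() drops the reference taken by vget_prep() if locking
 * fails, so the error path needs no vrele()/vdrop().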
2803 */
2804 static enum vgetstate __always_inline
2805 _vget_prep(struct vnode *vp, bool interlock)
2806 {
2807 enum vgetstate vs;
2808
2809 if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2810 vs = VGET_USECOUNT;
2811 } else {
2812 if (interlock)
2813 vholdl(vp);
2814 else
2815 vhold(vp);
2816 vs = VGET_HOLDCNT;
2817 }
2818 return (vs);
2819 }
2820
2821 enum vgetstate
2822 vget_prep(struct vnode *vp)
2823 {
2824
2825 return (_vget_prep(vp, false));
2826 }
2827
2828 int
2829 vget(struct vnode *vp, int flags, struct thread *td)
2830 {
2831 enum vgetstate vs;
2832
2833 MPASS(td == curthread);
2834
2835 vs = _vget_prep(vp, (flags & LK_INTERLOCK) != 0);
2836 return (vget_finish(vp, flags, vs));
2837 }
2838
2839 int
2840 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
2841 {
2842 int error, oweinact;
2843
2844 VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2845 ("%s: invalid lock operation", __func__));
2846
2847 if ((flags & LK_INTERLOCK) != 0)
2848 ASSERT_VI_LOCKED(vp, __func__);
2849 else
2850 ASSERT_VI_UNLOCKED(vp, __func__);
2851 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
2852 if (vs == VGET_USECOUNT) {
2853 VNASSERT(vp->v_usecount > 0, vp,
2854 ("%s: vnode without usecount when VGET_USECOUNT was passed",
2855 __func__));
2856 }
2857
2858 if ((error = vn_lock(vp, flags)) != 0) {
2859 if (vs == VGET_USECOUNT)
2860 vrele(vp);
2861 else
2862 vdrop(vp);
2863 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2864 vp);
2865 return (error);
2866 }
2867
2868 if (vs == VGET_USECOUNT) {
2869 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2870 ("%s: vnode with usecount and VI_OWEINACT set", __func__));
2871 return (0);
2872 }
2873
2874 /*
2875 * We hold the vnode. If the usecount is 0 it will be utilized to keep
2876 * the vnode around. Otherwise someone else lent their hold count and
2877 * we have to drop ours.
2878 */
2879 if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
2880 #ifdef INVARIANTS
2881 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
2882 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
2883 #else
2884 refcount_release(&vp->v_holdcnt);
2885 #endif
2886 VNODE_REFCOUNT_FENCE_ACQ();
2887 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2888 ("%s: vnode with usecount and VI_OWEINACT set", __func__));
2889 return (0);
2890 }
2891
2892 /*
2893 * We don't guarantee that any particular close will
2894 * trigger inactive processing so just make a best effort
2895 * here at preventing a reference to a removed file. If
2896 * we don't succeed no harm is done.
2897 *
2898 * Upgrade our holdcnt to a usecount.
2899 */
2900 VI_LOCK(vp);
2901 /*
2902 * See the previous section. By the time we get here we may find
2903 * ourselves in the same spot.
2904 */ 2905 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2906 #ifdef INVARIANTS 2907 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2908 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 2909 #else 2910 refcount_release(&vp->v_holdcnt); 2911 #endif 2912 VNODE_REFCOUNT_FENCE_ACQ(); 2913 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2914 ("%s: vnode with usecount and VI_OWEINACT set", 2915 __func__)); 2916 VI_UNLOCK(vp); 2917 return (0); 2918 } 2919 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2920 oweinact = 0; 2921 } else { 2922 oweinact = 1; 2923 vp->v_iflag &= ~VI_OWEINACT; 2924 VNODE_REFCOUNT_FENCE_REL(); 2925 } 2926 v_incr_devcount(vp); 2927 refcount_acquire(&vp->v_usecount); 2928 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2929 (flags & LK_NOWAIT) == 0) 2930 vinactive(vp); 2931 VI_UNLOCK(vp); 2932 return (0); 2933 } 2934 2935 /* 2936 * Increase the reference (use) and hold count of a vnode. 2937 * This will also remove the vnode from the free list if it is presently free. 2938 */ 2939 void 2940 vref(struct vnode *vp) 2941 { 2942 2943 ASSERT_VI_UNLOCKED(vp, __func__); 2944 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2945 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2946 VNODE_REFCOUNT_FENCE_ACQ(); 2947 VNASSERT(vp->v_holdcnt > 0, vp, 2948 ("%s: active vnode not held", __func__)); 2949 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2950 ("%s: vnode with usecount and VI_OWEINACT set", __func__)); 2951 return; 2952 } 2953 VI_LOCK(vp); 2954 vrefl(vp); 2955 VI_UNLOCK(vp); 2956 } 2957 2958 void 2959 vrefl(struct vnode *vp) 2960 { 2961 2962 ASSERT_VI_LOCKED(vp, __func__); 2963 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2964 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2965 VNODE_REFCOUNT_FENCE_ACQ(); 2966 VNASSERT(vp->v_holdcnt > 0, vp, 2967 ("%s: active vnode not held", __func__)); 2968 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2969 ("%s: vnode with usecount and VI_OWEINACT set", __func__)); 2970 return; 2971 } 2972 vholdl(vp); 2973 if ((vp->v_iflag & VI_OWEINACT) != 0) { 2974 vp->v_iflag &= ~VI_OWEINACT; 2975 VNODE_REFCOUNT_FENCE_REL(); 2976 } 2977 v_incr_devcount(vp); 2978 refcount_acquire(&vp->v_usecount); 2979 } 2980 2981 void 2982 vrefact(struct vnode *vp) 2983 { 2984 2985 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2986 #ifdef INVARIANTS 2987 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 2988 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 2989 #else 2990 refcount_acquire(&vp->v_usecount); 2991 #endif 2992 } 2993 2994 /* 2995 * Return reference count of a vnode. 2996 * 2997 * The results of this call are only guaranteed when some mechanism is used to 2998 * stop other processes from gaining references to the vnode. This may be the 2999 * case if the caller holds the only reference. This is also useful when stale 3000 * data is acceptable as race conditions may be accounted for by some other 3001 * means. 3002 */ 3003 int 3004 vrefcnt(struct vnode *vp) 3005 { 3006 3007 return (vp->v_usecount); 3008 } 3009 3010 enum vputx_op { VPUTX_VRELE, VPUTX_VPUT, VPUTX_VUNREF }; 3011 3012 /* 3013 * Decrement the use and hold counts for a vnode. 3014 * 3015 * See an explanation near vget() as to why atomic operation is safe. 
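 *
 * The three entry points differ only in their locking contract:
 *
 *	vrele(vp)  - vnode unlocked on entry and on exit
 *	vput(vp)   - vnode locked on entry, unlocked on exit
 *	vunref(vp) - vnode locked on entry and still locked on exit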
3016 */
3017 static void
3018 vputx(struct vnode *vp, enum vputx_op func)
3019 {
3020 int error;
3021
3022 KASSERT(vp != NULL, ("vputx: null vp"));
3023 if (func == VPUTX_VUNREF)
3024 ASSERT_VOP_LOCKED(vp, "vunref");
3025 ASSERT_VI_UNLOCKED(vp, __func__);
3026 VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
3027 ("%s: wrong ref counts", __func__));
3028
3029 CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3030
3031 /*
3032 * We want to hold the vnode until the inactive finishes to
3033 * prevent vgone() races. We drop the use count here and the
3034 * hold count below when we're done.
3035 *
3036 * If we release the last usecount we take ownership of the hold
3037 * count which provides liveness of the vnode, in which case we
3038 * have to vdrop.
3039 */
3040 if (!refcount_release(&vp->v_usecount))
3041 return;
3042 VI_LOCK(vp);
3043 v_decr_devcount(vp);
3044 /*
3045 * By the time we get here someone else might have transitioned
3046 * the count back to > 0.
3047 */
3048 if (vp->v_usecount > 0) {
3049 vdropl(vp);
3050 return;
3051 }
3052 if (vp->v_iflag & VI_DOINGINACT) {
3053 vdropl(vp);
3054 return;
3055 }
3056
3057 /*
3058 * Check if the fs wants to perform inactive processing. Note we
3059 * may be only holding the interlock, in which case it is possible
3060 * someone else called vgone on the vnode and ->v_data is now NULL.
3061 * Since vgone performs inactive on its own there is nothing to do
3062 * here but to drop our hold count.
3063 */
3064 if (__predict_false(VN_IS_DOOMED(vp)) ||
3065 VOP_NEED_INACTIVE(vp) == 0) {
3066 vdropl(vp);
3067 return;
3068 }
3069
3070 /*
3071 * We must call VOP_INACTIVE with the node locked. Mark
3072 * as VI_DOINGINACT to avoid recursion.
3073 */
3074 vp->v_iflag |= VI_OWEINACT;
3075 switch (func) {
3076 case VPUTX_VRELE:
3077 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
3078 VI_LOCK(vp);
3079 break;
3080 case VPUTX_VPUT:
3081 error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT);
3082 VI_LOCK(vp);
3083 break;
3084 case VPUTX_VUNREF:
3085 error = 0;
3086 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
3087 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
3088 VI_LOCK(vp);
3089 }
3090 break;
3091 }
3092 VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
3093 ("vnode with usecount and VI_OWEINACT set"));
3094 if (error == 0) {
3095 if (vp->v_iflag & VI_OWEINACT)
3096 vinactive(vp);
3097 if (func != VPUTX_VUNREF)
3098 VOP_UNLOCK(vp);
3099 }
3100 vdropl(vp);
3101 }
3102
3103 /*
3104 * Vnode put/release.
3105 * If count drops to zero, call inactive routine and return to freelist.
3106 */
3107 void
3108 vrele(struct vnode *vp)
3109 {
3110
3111 vputx(vp, VPUTX_VRELE);
3112 }
3113
3114 /*
3115 * Release an already locked vnode. This gives the same effect as
3116 * unlock+vrele(), but takes less time and avoids releasing and
3117 * re-acquiring the lock (as vrele() acquires the lock internally).
3118 *
3119 * It is an invariant that all VOP_* calls operate on a held vnode.
3120 * We may have only an implicit hold stemming from our usecount,
3121 * which we are about to release. If we unlock the vnode afterwards we
3122 * open a time window where someone else dropped the last usecount and
3123 * proceeded to free the vnode before our unlock finished. For this
3124 * reason we unlock the vnode early. This is a little bit wasteful as
3125 * it may be the vnode is exclusively locked and inactive processing is
3126 * needed, in which case we are adding work.
3127 */ 3128 void 3129 vput(struct vnode *vp) 3130 { 3131 3132 VOP_UNLOCK(vp); 3133 vputx(vp, VPUTX_VPUT); 3134 } 3135 3136 /* 3137 * Release an exclusively locked vnode. Do not unlock the vnode lock. 3138 */ 3139 void 3140 vunref(struct vnode *vp) 3141 { 3142 3143 vputx(vp, VPUTX_VUNREF); 3144 } 3145 3146 /* 3147 * Increase the hold count and activate if this is the first reference. 3148 */ 3149 static void 3150 vhold_activate(struct vnode *vp) 3151 { 3152 struct mount *mp; 3153 3154 ASSERT_VI_LOCKED(vp, __func__); 3155 VNASSERT(vp->v_holdcnt == 0, vp, 3156 ("%s: wrong hold count", __func__)); 3157 VNASSERT(vp->v_op != NULL, vp, 3158 ("%s: vnode already reclaimed.", __func__)); 3159 /* 3160 * Remove a vnode from the free list, mark it as in use, 3161 * and put it on the active list. 3162 */ 3163 VNASSERT(vp->v_mount != NULL, vp, 3164 ("_vhold: vnode not on per mount vnode list")); 3165 mp = vp->v_mount; 3166 mtx_lock(&mp->mnt_listmtx); 3167 if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { 3168 TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); 3169 mp->mnt_tmpfreevnodelistsize--; 3170 vp->v_mflag &= ~VMP_TMPMNTFREELIST; 3171 } else { 3172 mtx_lock(&vnode_free_list_mtx); 3173 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 3174 freevnodes--; 3175 mtx_unlock(&vnode_free_list_mtx); 3176 } 3177 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 3178 ("Activating already active vnode")); 3179 vp->v_iflag &= ~VI_FREE; 3180 vp->v_iflag |= VI_ACTIVE; 3181 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 3182 mp->mnt_activevnodelistsize++; 3183 mtx_unlock(&mp->mnt_listmtx); 3184 refcount_acquire(&vp->v_holdcnt); 3185 } 3186 3187 void 3188 vhold(struct vnode *vp) 3189 { 3190 3191 ASSERT_VI_UNLOCKED(vp, __func__); 3192 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3193 if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 3194 VNODE_REFCOUNT_FENCE_ACQ(); 3195 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 3196 ("vhold: vnode with holdcnt is free")); 3197 return; 3198 } 3199 VI_LOCK(vp); 3200 vholdl(vp); 3201 VI_UNLOCK(vp); 3202 } 3203 3204 void 3205 vholdl(struct vnode *vp) 3206 { 3207 3208 ASSERT_VI_LOCKED(vp, __func__); 3209 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3210 if ((vp->v_iflag & VI_FREE) == 0) { 3211 refcount_acquire(&vp->v_holdcnt); 3212 return; 3213 } 3214 vhold_activate(vp); 3215 } 3216 3217 void 3218 vholdnz(struct vnode *vp) 3219 { 3220 3221 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3222 #ifdef INVARIANTS 3223 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3224 VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old)); 3225 #else 3226 atomic_add_int(&vp->v_holdcnt, 1); 3227 #endif 3228 } 3229 3230 /* 3231 * Drop the hold count of the vnode. If this is the last reference to 3232 * the vnode we place it on the free list unless it has been vgone'd 3233 * (marked VIRF_DOOMED) in which case we will free it. 3234 * 3235 * Because the vnode vm object keeps a hold reference on the vnode if 3236 * there is at least one resident non-cached page, the vnode cannot 3237 * leave the active list without the page cleanup done. 3238 */ 3239 static void 3240 vdrop_deactivate(struct vnode *vp) 3241 { 3242 struct mount *mp; 3243 3244 ASSERT_VI_LOCKED(vp, __func__); 3245 /* 3246 * Mark a vnode as free: remove it from its active list 3247 * and put it up for recycling on the freelist. 
3248 */ 3249 VNASSERT(!VN_IS_DOOMED(vp), vp, 3250 ("vdrop: returning doomed vnode")); 3251 VNASSERT(vp->v_op != NULL, vp, 3252 ("vdrop: vnode already reclaimed.")); 3253 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 3254 ("vnode already free")); 3255 VNASSERT(vp->v_holdcnt == 0, vp, 3256 ("vdrop: freeing when we shouldn't")); 3257 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3258 mp = vp->v_mount; 3259 mtx_lock(&mp->mnt_listmtx); 3260 if (vp->v_iflag & VI_ACTIVE) { 3261 vp->v_iflag &= ~VI_ACTIVE; 3262 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 3263 mp->mnt_activevnodelistsize--; 3264 } 3265 TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); 3266 mp->mnt_tmpfreevnodelistsize++; 3267 vp->v_iflag |= VI_FREE; 3268 vp->v_mflag |= VMP_TMPMNTFREELIST; 3269 VI_UNLOCK(vp); 3270 if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch) 3271 vnlru_return_batch_locked(mp); 3272 mtx_unlock(&mp->mnt_listmtx); 3273 } else { 3274 VI_UNLOCK(vp); 3275 counter_u64_add(free_owe_inact, 1); 3276 } 3277 } 3278 3279 void 3280 vdrop(struct vnode *vp) 3281 { 3282 3283 ASSERT_VI_UNLOCKED(vp, __func__); 3284 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3285 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3286 return; 3287 VI_LOCK(vp); 3288 vdropl(vp); 3289 } 3290 3291 void 3292 vdropl(struct vnode *vp) 3293 { 3294 3295 ASSERT_VI_LOCKED(vp, __func__); 3296 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3297 if (!refcount_release(&vp->v_holdcnt)) { 3298 VI_UNLOCK(vp); 3299 return; 3300 } 3301 if (VN_IS_DOOMED(vp)) { 3302 freevnode(vp); 3303 return; 3304 } 3305 vdrop_deactivate(vp); 3306 } 3307 3308 /* 3309 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3310 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3311 * OWEINACT tracks whether a vnode missed a call to inactive due to a 3312 * failed lock upgrade. 3313 */ 3314 void 3315 vinactive(struct vnode *vp) 3316 { 3317 struct vm_object *obj; 3318 3319 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3320 ASSERT_VI_LOCKED(vp, "vinactive"); 3321 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3322 ("vinactive: recursed on VI_DOINGINACT")); 3323 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3324 vp->v_iflag |= VI_DOINGINACT; 3325 vp->v_iflag &= ~VI_OWEINACT; 3326 VI_UNLOCK(vp); 3327 /* 3328 * Before moving off the active list, we must be sure that any 3329 * modified pages are converted into the vnode's dirty 3330 * buffers, since these will no longer be checked once the 3331 * vnode is on the inactive list. 3332 * 3333 * The write-out of the dirty pages is asynchronous. At the 3334 * point that VOP_INACTIVE() is called, there could still be 3335 * pending I/O and dirty pages in the object. 3336 */ 3337 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3338 vm_object_mightbedirty(obj)) { 3339 VM_OBJECT_WLOCK(obj); 3340 vm_object_page_clean(obj, 0, 0, 0); 3341 VM_OBJECT_WUNLOCK(obj); 3342 } 3343 VOP_INACTIVE(vp, curthread); 3344 VI_LOCK(vp); 3345 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3346 ("vinactive: lost VI_DOINGINACT")); 3347 vp->v_iflag &= ~VI_DOINGINACT; 3348 } 3349 3350 /* 3351 * Remove any vnodes in the vnode table belonging to mount point mp. 3352 * 3353 * If FORCECLOSE is not specified, there should not be any active ones, 3354 * return error if any are found (nb: this is a user error, not a 3355 * system error). If FORCECLOSE is specified, detach any active vnodes 3356 * that are found. 3357 * 3358 * If WRITECLOSE is set, only flush out regular file vnodes open for 3359 * writing. 
3360 *
3361 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
3362 *
3363 * `rootrefs' specifies the base reference count for the root vnode
3364 * of this filesystem. The root vnode is considered busy if its
3365 * v_usecount exceeds this value. On a successful return, vflush()
3366 * will call vrele() on the root vnode exactly rootrefs times.
3367 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
3368 * be zero.
3369 */
3370 #ifdef DIAGNOSTIC
3371 static int busyprt = 0; /* print out busy vnodes */
3372 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
3373 #endif
3374
3375 int
3376 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
3377 {
3378 struct vnode *vp, *mvp, *rootvp = NULL;
3379 struct vattr vattr;
3380 int busy = 0, error;
3381
3382 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
3383 rootrefs, flags);
3384 if (rootrefs > 0) {
3385 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
3386 ("vflush: bad args"));
3387 /*
3388 * Get the filesystem root vnode. We can vput() it
3389 * immediately, since with rootrefs > 0, it won't go away.
3390 */
3391 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
3392 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
3393 __func__, error);
3394 return (error);
3395 }
3396 vput(rootvp);
3397 }
3398 loop:
3399 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
3400 vholdl(vp);
3401 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
3402 if (error) {
3403 vdrop(vp);
3404 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
3405 goto loop;
3406 }
3407 /*
3408 * Skip over any vnodes marked VV_SYSTEM.
3409 */
3410 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
3411 VOP_UNLOCK(vp);
3412 vdrop(vp);
3413 continue;
3414 }
3415 /*
3416 * If WRITECLOSE is set, flush out unlinked but still open
3417 * files (even if open only for reading) and regular file
3418 * vnodes open for writing.
3419 */
3420 if (flags & WRITECLOSE) {
3421 if (vp->v_object != NULL) {
3422 VM_OBJECT_WLOCK(vp->v_object);
3423 vm_object_page_clean(vp->v_object, 0, 0, 0);
3424 VM_OBJECT_WUNLOCK(vp->v_object);
3425 }
3426 error = VOP_FSYNC(vp, MNT_WAIT, td);
3427 if (error != 0) {
3428 VOP_UNLOCK(vp);
3429 vdrop(vp);
3430 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
3431 return (error);
3432 }
3433 error = VOP_GETATTR(vp, &vattr, td->td_ucred);
3434 VI_LOCK(vp);
3435
3436 if ((vp->v_type == VNON ||
3437 (error == 0 && vattr.va_nlink > 0)) &&
3438 (vp->v_writecount <= 0 || vp->v_type != VREG)) {
3439 VOP_UNLOCK(vp);
3440 vdropl(vp);
3441 continue;
3442 }
3443 } else
3444 VI_LOCK(vp);
3445 /*
3446 * With v_usecount == 0, all we need to do is clear out the
3447 * vnode data structures and we are done.
3448 *
3449 * If FORCECLOSE is set, forcibly close the vnode.
3450 */
3451 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
3452 vgonel(vp);
3453 } else {
3454 busy++;
3455 #ifdef DIAGNOSTIC
3456 if (busyprt)
3457 vn_printf(vp, "vflush: busy vnode ");
3458 #endif
3459 }
3460 VOP_UNLOCK(vp);
3461 vdropl(vp);
3462 }
3463 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
3464 /*
3465 * If just the root vnode is busy, and if its refcount
3466 * is equal to `rootrefs', then go ahead and kill it.
3467 */ 3468 VI_LOCK(rootvp); 3469 KASSERT(busy > 0, ("vflush: not busy")); 3470 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3471 ("vflush: usecount %d < rootrefs %d", 3472 rootvp->v_usecount, rootrefs)); 3473 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3474 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3475 vgone(rootvp); 3476 VOP_UNLOCK(rootvp); 3477 busy = 0; 3478 } else 3479 VI_UNLOCK(rootvp); 3480 } 3481 if (busy) { 3482 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3483 busy); 3484 return (EBUSY); 3485 } 3486 for (; rootrefs > 0; rootrefs--) 3487 vrele(rootvp); 3488 return (0); 3489 } 3490 3491 /* 3492 * Recycle an unused vnode to the front of the free list. 3493 */ 3494 int 3495 vrecycle(struct vnode *vp) 3496 { 3497 int recycled; 3498 3499 VI_LOCK(vp); 3500 recycled = vrecyclel(vp); 3501 VI_UNLOCK(vp); 3502 return (recycled); 3503 } 3504 3505 /* 3506 * vrecycle, with the vp interlock held. 3507 */ 3508 int 3509 vrecyclel(struct vnode *vp) 3510 { 3511 int recycled; 3512 3513 ASSERT_VOP_ELOCKED(vp, __func__); 3514 ASSERT_VI_LOCKED(vp, __func__); 3515 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3516 recycled = 0; 3517 if (vp->v_usecount == 0) { 3518 recycled = 1; 3519 vgonel(vp); 3520 } 3521 return (recycled); 3522 } 3523 3524 /* 3525 * Eliminate all activity associated with a vnode 3526 * in preparation for reuse. 3527 */ 3528 void 3529 vgone(struct vnode *vp) 3530 { 3531 VI_LOCK(vp); 3532 vgonel(vp); 3533 VI_UNLOCK(vp); 3534 } 3535 3536 static void 3537 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3538 struct vnode *lowervp __unused) 3539 { 3540 } 3541 3542 /* 3543 * Notify upper mounts about reclaimed or unlinked vnode. 3544 */ 3545 void 3546 vfs_notify_upper(struct vnode *vp, int event) 3547 { 3548 static struct vfsops vgonel_vfsops = { 3549 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3550 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3551 }; 3552 struct mount *mp, *ump, *mmp; 3553 3554 mp = vp->v_mount; 3555 if (mp == NULL) 3556 return; 3557 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3558 return; 3559 3560 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3561 mmp->mnt_op = &vgonel_vfsops; 3562 mmp->mnt_kern_flag |= MNTK_MARKER; 3563 MNT_ILOCK(mp); 3564 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3565 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3566 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3567 ump = TAILQ_NEXT(ump, mnt_upper_link); 3568 continue; 3569 } 3570 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3571 MNT_IUNLOCK(mp); 3572 switch (event) { 3573 case VFS_NOTIFY_UPPER_RECLAIM: 3574 VFS_RECLAIM_LOWERVP(ump, vp); 3575 break; 3576 case VFS_NOTIFY_UPPER_UNLINK: 3577 VFS_UNLINK_LOWERVP(ump, vp); 3578 break; 3579 default: 3580 KASSERT(0, ("invalid event %d", event)); 3581 break; 3582 } 3583 MNT_ILOCK(mp); 3584 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3585 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3586 } 3587 free(mmp, M_TEMP); 3588 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3589 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3590 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3591 wakeup(&mp->mnt_uppers); 3592 } 3593 MNT_IUNLOCK(mp); 3594 } 3595 3596 /* 3597 * vgone, with the vp interlock held. 
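 *
 * In outline: mark the vnode VIRF_DOOMED, VOP_CLOSE() and vinactive()
 * it if it was in use, flush and invalidate its buffers and VM object,
 * VOP_RECLAIM() it, purge advisory locks and namecache entries, remove
 * it from its mount's vnode list, and leave it with dead_vnodeops and
 * type VBAD.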
3598 */ 3599 static void 3600 vgonel(struct vnode *vp) 3601 { 3602 struct thread *td; 3603 struct mount *mp; 3604 vm_object_t object; 3605 bool active, oweinact; 3606 3607 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3608 ASSERT_VI_LOCKED(vp, "vgonel"); 3609 VNASSERT(vp->v_holdcnt, vp, 3610 ("vgonel: vp %p has no reference.", vp)); 3611 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3612 td = curthread; 3613 3614 /* 3615 * Don't vgonel if we're already doomed. 3616 */ 3617 if (vp->v_irflag & VIRF_DOOMED) 3618 return; 3619 vp->v_irflag |= VIRF_DOOMED; 3620 3621 /* 3622 * Check to see if the vnode is in use. If so, we have to call 3623 * VOP_CLOSE() and VOP_INACTIVE(). 3624 */ 3625 active = vp->v_usecount > 0; 3626 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 3627 VI_UNLOCK(vp); 3628 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3629 3630 /* 3631 * If purging an active vnode, it must be closed and 3632 * deactivated before being reclaimed. 3633 */ 3634 if (active) 3635 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3636 if (oweinact || active) { 3637 VI_LOCK(vp); 3638 if ((vp->v_iflag & VI_DOINGINACT) == 0) 3639 vinactive(vp); 3640 VI_UNLOCK(vp); 3641 } 3642 if (vp->v_type == VSOCK) 3643 vfs_unp_reclaim(vp); 3644 3645 /* 3646 * Clean out any buffers associated with the vnode. 3647 * If the flush fails, just toss the buffers. 3648 */ 3649 mp = NULL; 3650 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3651 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3652 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3653 while (vinvalbuf(vp, 0, 0, 0) != 0) 3654 ; 3655 } 3656 3657 BO_LOCK(&vp->v_bufobj); 3658 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3659 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3660 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3661 vp->v_bufobj.bo_clean.bv_cnt == 0, 3662 ("vp %p bufobj not invalidated", vp)); 3663 3664 /* 3665 * For VMIO bufobj, BO_DEAD is set later, or in 3666 * vm_object_terminate() after the object's page queue is 3667 * flushed. 3668 */ 3669 object = vp->v_bufobj.bo_object; 3670 if (object == NULL) 3671 vp->v_bufobj.bo_flag |= BO_DEAD; 3672 BO_UNLOCK(&vp->v_bufobj); 3673 3674 /* 3675 * Handle the VM part. Tmpfs handles v_object on its own (the 3676 * OBJT_VNODE check). Nullfs or other bypassing filesystems 3677 * should not touch the object borrowed from the lower vnode 3678 * (the handle check). 3679 */ 3680 if (object != NULL && object->type == OBJT_VNODE && 3681 object->handle == vp) 3682 vnode_destroy_vobject(vp); 3683 3684 /* 3685 * Reclaim the vnode. 3686 */ 3687 if (VOP_RECLAIM(vp, td)) 3688 panic("vgone: cannot reclaim"); 3689 if (mp != NULL) 3690 vn_finished_secondary_write(mp); 3691 VNASSERT(vp->v_object == NULL, vp, 3692 ("vop_reclaim left v_object vp=%p", vp)); 3693 /* 3694 * Clear the advisory locks and wake up waiting threads. 3695 */ 3696 (void)VOP_ADVLOCKPURGE(vp); 3697 vp->v_lockf = NULL; 3698 /* 3699 * Delete from old mount point vnode list. 3700 */ 3701 delmntque(vp); 3702 cache_purge(vp); 3703 /* 3704 * Done with purge, reset to the standard lock and invalidate 3705 * the vnode. 3706 */ 3707 VI_LOCK(vp); 3708 vp->v_vnlock = &vp->v_lock; 3709 vp->v_op = &dead_vnodeops; 3710 vp->v_type = VBAD; 3711 } 3712 3713 /* 3714 * Calculate the total number of references to a special device. 3715 */ 3716 int 3717 vcount(struct vnode *vp) 3718 { 3719 int count; 3720 3721 dev_lock(); 3722 count = vp->v_rdev->si_usecount; 3723 dev_unlock(); 3724 return (count); 3725 } 3726 3727 /* 3728 * Print out a description of a vnode. 
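 *
 * The fmt/... prefix is printed first, printf(9)-style, e.g. as used
 * by vflush() above:
 *
 *	vn_printf(vp, "vflush: busy vnode ");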
3729 */
3730 static char *typename[] =
3731 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
3732 "VMARKER"};
3733
3734 void
3735 vn_printf(struct vnode *vp, const char *fmt, ...)
3736 {
3737 va_list ap;
3738 char buf[256], buf2[16];
3739 u_long flags;
3740
3741 va_start(ap, fmt);
3742 vprintf(fmt, ap);
3743 va_end(ap);
3744 printf("%p: ", (void *)vp);
3745 printf("type %s\n", typename[vp->v_type]);
3746 printf(" usecount %d, writecount %d, refcount %d",
3747 vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
3748 switch (vp->v_type) {
3749 case VDIR:
3750 printf(" mountedhere %p\n", vp->v_mountedhere);
3751 break;
3752 case VCHR:
3753 printf(" rdev %p\n", vp->v_rdev);
3754 break;
3755 case VSOCK:
3756 printf(" socket %p\n", vp->v_unpcb);
3757 break;
3758 case VFIFO:
3759 printf(" fifoinfo %p\n", vp->v_fifoinfo);
3760 break;
3761 default:
3762 printf("\n");
3763 break;
3764 }
3765 buf[0] = '\0';
3766 buf[1] = '\0';
3767 if (vp->v_irflag & VIRF_DOOMED)
3768 strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
3769 flags = vp->v_irflag & ~(VIRF_DOOMED);
3770 if (flags != 0) {
3771 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
3772 strlcat(buf, buf2, sizeof(buf));
3773 }
3774 if (vp->v_vflag & VV_ROOT)
3775 strlcat(buf, "|VV_ROOT", sizeof(buf));
3776 if (vp->v_vflag & VV_ISTTY)
3777 strlcat(buf, "|VV_ISTTY", sizeof(buf));
3778 if (vp->v_vflag & VV_NOSYNC)
3779 strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3780 if (vp->v_vflag & VV_ETERNALDEV)
3781 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3782 if (vp->v_vflag & VV_CACHEDLABEL)
3783 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3784 if (vp->v_vflag & VV_VMSIZEVNLOCK)
3785 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
3786 if (vp->v_vflag & VV_COPYONWRITE)
3787 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3788 if (vp->v_vflag & VV_SYSTEM)
3789 strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3790 if (vp->v_vflag & VV_PROCDEP)
3791 strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3792 if (vp->v_vflag & VV_NOKNOTE)
3793 strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3794 if (vp->v_vflag & VV_DELETED)
3795 strlcat(buf, "|VV_DELETED", sizeof(buf));
3796 if (vp->v_vflag & VV_MD)
3797 strlcat(buf, "|VV_MD", sizeof(buf));
3798 if (vp->v_vflag & VV_FORCEINSMQ)
3799 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3800 if (vp->v_vflag & VV_READLINK)
3801 strlcat(buf, "|VV_READLINK", sizeof(buf));
3802 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3803 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
3804 VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK);
3805 if (flags != 0) {
3806 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3807 strlcat(buf, buf2, sizeof(buf));
3808 }
3809 if (vp->v_iflag & VI_TEXT_REF)
3810 strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
3811 if (vp->v_iflag & VI_MOUNT)
3812 strlcat(buf, "|VI_MOUNT", sizeof(buf));
3813 if (vp->v_iflag & VI_FREE)
3814 strlcat(buf, "|VI_FREE", sizeof(buf));
3815 if (vp->v_iflag & VI_ACTIVE)
3816 strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3817 if (vp->v_iflag & VI_DOINGINACT)
3818 strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3819 if (vp->v_iflag & VI_OWEINACT)
3820 strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3821 flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_FREE | VI_ACTIVE |
3822 VI_DOINGINACT | VI_OWEINACT);
3823 if (flags != 0) {
3824 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3825 strlcat(buf, buf2, sizeof(buf));
3826 }
3827 if (vp->v_mflag & VMP_TMPMNTFREELIST)
3828 strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
3829 flags = vp->v_mflag &
~(VMP_TMPMNTFREELIST); 3830 if (flags != 0) { 3831 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 3832 strlcat(buf, buf2, sizeof(buf)); 3833 } 3834 printf(" flags (%s)\n", buf + 1); 3835 if (mtx_owned(VI_MTX(vp))) 3836 printf(" VI_LOCKed"); 3837 if (vp->v_object != NULL) 3838 printf(" v_object %p ref %d pages %d " 3839 "cleanbuf %d dirtybuf %d\n", 3840 vp->v_object, vp->v_object->ref_count, 3841 vp->v_object->resident_page_count, 3842 vp->v_bufobj.bo_clean.bv_cnt, 3843 vp->v_bufobj.bo_dirty.bv_cnt); 3844 printf(" "); 3845 lockmgr_printinfo(vp->v_vnlock); 3846 if (vp->v_data != NULL) 3847 VOP_PRINT(vp); 3848 } 3849 3850 #ifdef DDB 3851 /* 3852 * List all of the locked vnodes in the system. 3853 * Called when debugging the kernel. 3854 */ 3855 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3856 { 3857 struct mount *mp; 3858 struct vnode *vp; 3859 3860 /* 3861 * Note: because this is DDB, we can't obey the locking semantics 3862 * for these structures, which means we could catch an inconsistent 3863 * state and dereference a nasty pointer. Not much to be done 3864 * about that. 3865 */ 3866 db_printf("Locked vnodes\n"); 3867 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3868 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3869 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 3870 vn_printf(vp, "vnode "); 3871 } 3872 } 3873 } 3874 3875 /* 3876 * Show details about the given vnode. 3877 */ 3878 DB_SHOW_COMMAND(vnode, db_show_vnode) 3879 { 3880 struct vnode *vp; 3881 3882 if (!have_addr) 3883 return; 3884 vp = (struct vnode *)addr; 3885 vn_printf(vp, "vnode "); 3886 } 3887 3888 /* 3889 * Show details about the given mount point. 3890 */ 3891 DB_SHOW_COMMAND(mount, db_show_mount) 3892 { 3893 struct mount *mp; 3894 struct vfsopt *opt; 3895 struct statfs *sp; 3896 struct vnode *vp; 3897 char buf[512]; 3898 uint64_t mflags; 3899 u_int flags; 3900 3901 if (!have_addr) { 3902 /* No address given, print short info about all mount points. 
*/ 3903 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3904 db_printf("%p %s on %s (%s)\n", mp, 3905 mp->mnt_stat.f_mntfromname, 3906 mp->mnt_stat.f_mntonname, 3907 mp->mnt_stat.f_fstypename); 3908 if (db_pager_quit) 3909 break; 3910 } 3911 db_printf("\nMore info: show mount <addr>\n"); 3912 return; 3913 } 3914 3915 mp = (struct mount *)addr; 3916 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3917 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3918 3919 buf[0] = '\0'; 3920 mflags = mp->mnt_flag; 3921 #define MNT_FLAG(flag) do { \ 3922 if (mflags & (flag)) { \ 3923 if (buf[0] != '\0') \ 3924 strlcat(buf, ", ", sizeof(buf)); \ 3925 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3926 mflags &= ~(flag); \ 3927 } \ 3928 } while (0) 3929 MNT_FLAG(MNT_RDONLY); 3930 MNT_FLAG(MNT_SYNCHRONOUS); 3931 MNT_FLAG(MNT_NOEXEC); 3932 MNT_FLAG(MNT_NOSUID); 3933 MNT_FLAG(MNT_NFS4ACLS); 3934 MNT_FLAG(MNT_UNION); 3935 MNT_FLAG(MNT_ASYNC); 3936 MNT_FLAG(MNT_SUIDDIR); 3937 MNT_FLAG(MNT_SOFTDEP); 3938 MNT_FLAG(MNT_NOSYMFOLLOW); 3939 MNT_FLAG(MNT_GJOURNAL); 3940 MNT_FLAG(MNT_MULTILABEL); 3941 MNT_FLAG(MNT_ACLS); 3942 MNT_FLAG(MNT_NOATIME); 3943 MNT_FLAG(MNT_NOCLUSTERR); 3944 MNT_FLAG(MNT_NOCLUSTERW); 3945 MNT_FLAG(MNT_SUJ); 3946 MNT_FLAG(MNT_EXRDONLY); 3947 MNT_FLAG(MNT_EXPORTED); 3948 MNT_FLAG(MNT_DEFEXPORTED); 3949 MNT_FLAG(MNT_EXPORTANON); 3950 MNT_FLAG(MNT_EXKERB); 3951 MNT_FLAG(MNT_EXPUBLIC); 3952 MNT_FLAG(MNT_LOCAL); 3953 MNT_FLAG(MNT_QUOTA); 3954 MNT_FLAG(MNT_ROOTFS); 3955 MNT_FLAG(MNT_USER); 3956 MNT_FLAG(MNT_IGNORE); 3957 MNT_FLAG(MNT_UPDATE); 3958 MNT_FLAG(MNT_DELEXPORT); 3959 MNT_FLAG(MNT_RELOAD); 3960 MNT_FLAG(MNT_FORCE); 3961 MNT_FLAG(MNT_SNAPSHOT); 3962 MNT_FLAG(MNT_BYFSID); 3963 #undef MNT_FLAG 3964 if (mflags != 0) { 3965 if (buf[0] != '\0') 3966 strlcat(buf, ", ", sizeof(buf)); 3967 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3968 "0x%016jx", mflags); 3969 } 3970 db_printf(" mnt_flag = %s\n", buf); 3971 3972 buf[0] = '\0'; 3973 flags = mp->mnt_kern_flag; 3974 #define MNT_KERN_FLAG(flag) do { \ 3975 if (flags & (flag)) { \ 3976 if (buf[0] != '\0') \ 3977 strlcat(buf, ", ", sizeof(buf)); \ 3978 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3979 flags &= ~(flag); \ 3980 } \ 3981 } while (0) 3982 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3983 MNT_KERN_FLAG(MNTK_ASYNC); 3984 MNT_KERN_FLAG(MNTK_SOFTDEP); 3985 MNT_KERN_FLAG(MNTK_DRAINING); 3986 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3987 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3988 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3989 MNT_KERN_FLAG(MNTK_NO_IOPF); 3990 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3991 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3992 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3993 MNT_KERN_FLAG(MNTK_MARKER); 3994 MNT_KERN_FLAG(MNTK_USES_BCACHE); 3995 MNT_KERN_FLAG(MNTK_NOASYNC); 3996 MNT_KERN_FLAG(MNTK_UNMOUNT); 3997 MNT_KERN_FLAG(MNTK_MWAIT); 3998 MNT_KERN_FLAG(MNTK_SUSPEND); 3999 MNT_KERN_FLAG(MNTK_SUSPEND2); 4000 MNT_KERN_FLAG(MNTK_SUSPENDED); 4001 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4002 MNT_KERN_FLAG(MNTK_NOKNOTE); 4003 #undef MNT_KERN_FLAG 4004 if (flags != 0) { 4005 if (buf[0] != '\0') 4006 strlcat(buf, ", ", sizeof(buf)); 4007 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4008 "0x%08x", flags); 4009 } 4010 db_printf(" mnt_kern_flag = %s\n", buf); 4011 4012 db_printf(" mnt_opt = "); 4013 opt = TAILQ_FIRST(mp->mnt_opt); 4014 if (opt != NULL) { 4015 db_printf("%s", opt->name); 4016 opt = TAILQ_NEXT(opt, link); 4017 while (opt != NULL) { 4018 db_printf(", %s", opt->name); 4019 opt = TAILQ_NEXT(opt, link); 4020 } 4021 } 4022 db_printf("\n"); 4023 4024 
sp = &mp->mnt_stat;
4025 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
4026 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
4027 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
4028 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
4029 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
4030 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
4031 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
4032 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
4033 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
4034 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
4035 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
4036 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
4037
4038 db_printf(" mnt_cred = { uid=%u ruid=%u",
4039 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
4040 if (jailed(mp->mnt_cred))
4041 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
4042 db_printf(" }\n");
4043 db_printf(" mnt_ref = %d (with %d in the struct)\n",
4044 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
4045 db_printf(" mnt_gen = %d\n", mp->mnt_gen);
4046 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
4047 db_printf(" mnt_activevnodelistsize = %d\n",
4048 mp->mnt_activevnodelistsize);
4049 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n",
4050 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
4051 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
4052 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
4053 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
4054 db_printf(" mnt_lockref = %d (with %d in the struct)\n",
4055 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
4056 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
4057 db_printf(" mnt_secondary_accwrites = %d\n",
4058 mp->mnt_secondary_accwrites);
4059 db_printf(" mnt_gjprovider = %s\n",
4060 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
4061 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
4062
4063 db_printf("\n\nList of active vnodes\n");
4064 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
4065 if (vp->v_type != VMARKER) {
4066 vn_printf(vp, "vnode ");
4067 if (db_pager_quit)
4068 break;
4069 }
4070 }
4071 db_printf("\n\nList of inactive vnodes\n");
4072 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
4073 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
4074 vn_printf(vp, "vnode ");
4075 if (db_pager_quit)
4076 break;
4077 }
4078 }
4079 }
4080 #endif /* DDB */
4081
4082 /*
4083 * Fill in a struct xvfsconf based on a struct vfsconf.
4084 */
4085 static int
4086 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
4087 {
4088 struct xvfsconf xvfsp;
4089
4090 bzero(&xvfsp, sizeof(xvfsp));
4091 strcpy(xvfsp.vfc_name, vfsp->vfc_name);
4092 xvfsp.vfc_typenum = vfsp->vfc_typenum;
4093 xvfsp.vfc_refcount = vfsp->vfc_refcount;
4094 xvfsp.vfc_flags = vfsp->vfc_flags;
4095 /*
4096 * These are unused in userland; we keep them
4097 * to preserve binary compatibility.
4098 */ 4099 xvfsp.vfc_vfsops = NULL; 4100 xvfsp.vfc_next = NULL; 4101 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4102 } 4103 4104 #ifdef COMPAT_FREEBSD32 4105 struct xvfsconf32 { 4106 uint32_t vfc_vfsops; 4107 char vfc_name[MFSNAMELEN]; 4108 int32_t vfc_typenum; 4109 int32_t vfc_refcount; 4110 int32_t vfc_flags; 4111 uint32_t vfc_next; 4112 }; 4113 4114 static int 4115 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4116 { 4117 struct xvfsconf32 xvfsp; 4118 4119 bzero(&xvfsp, sizeof(xvfsp)); 4120 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4121 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4122 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4123 xvfsp.vfc_flags = vfsp->vfc_flags; 4124 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4125 } 4126 #endif 4127 4128 /* 4129 * Top level filesystem related information gathering. 4130 */ 4131 static int 4132 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4133 { 4134 struct vfsconf *vfsp; 4135 int error; 4136 4137 error = 0; 4138 vfsconf_slock(); 4139 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4140 #ifdef COMPAT_FREEBSD32 4141 if (req->flags & SCTL_MASK32) 4142 error = vfsconf2x32(req, vfsp); 4143 else 4144 #endif 4145 error = vfsconf2x(req, vfsp); 4146 if (error) 4147 break; 4148 } 4149 vfsconf_sunlock(); 4150 return (error); 4151 } 4152 4153 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4154 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4155 "S,xvfsconf", "List of all configured filesystems"); 4156 4157 #ifndef BURN_BRIDGES 4158 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4159 4160 static int 4161 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4162 { 4163 int *name = (int *)arg1 - 1; /* XXX */ 4164 u_int namelen = arg2 + 1; /* XXX */ 4165 struct vfsconf *vfsp; 4166 4167 log(LOG_WARNING, "userland calling deprecated sysctl, " 4168 "please rebuild world\n"); 4169 4170 #if 1 || defined(COMPAT_PRELITE2) 4171 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
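 * An old-style request carries no explicit operation: a single-level
 * name (namelen == 1) is presumably a VFS_VFSCONF query, so it is
 * handed to the pre-Lite2 handler below.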
*/ 4172 if (namelen == 1) 4173 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4174 #endif 4175 4176 switch (name[1]) { 4177 case VFS_MAXTYPENUM: 4178 if (namelen != 2) 4179 return (ENOTDIR); 4180 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4181 case VFS_CONF: 4182 if (namelen != 3) 4183 return (ENOTDIR); /* overloaded */ 4184 vfsconf_slock(); 4185 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4186 if (vfsp->vfc_typenum == name[2]) 4187 break; 4188 } 4189 vfsconf_sunlock(); 4190 if (vfsp == NULL) 4191 return (EOPNOTSUPP); 4192 #ifdef COMPAT_FREEBSD32 4193 if (req->flags & SCTL_MASK32) 4194 return (vfsconf2x32(req, vfsp)); 4195 else 4196 #endif 4197 return (vfsconf2x(req, vfsp)); 4198 } 4199 return (EOPNOTSUPP); 4200 } 4201 4202 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4203 CTLFLAG_MPSAFE, vfs_sysctl, 4204 "Generic filesystem"); 4205 4206 #if 1 || defined(COMPAT_PRELITE2) 4207 4208 static int 4209 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4210 { 4211 int error; 4212 struct vfsconf *vfsp; 4213 struct ovfsconf ovfs; 4214 4215 vfsconf_slock(); 4216 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4217 bzero(&ovfs, sizeof(ovfs)); 4218 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4219 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4220 ovfs.vfc_index = vfsp->vfc_typenum; 4221 ovfs.vfc_refcount = vfsp->vfc_refcount; 4222 ovfs.vfc_flags = vfsp->vfc_flags; 4223 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4224 if (error != 0) { 4225 vfsconf_sunlock(); 4226 return (error); 4227 } 4228 } 4229 vfsconf_sunlock(); 4230 return (0); 4231 } 4232 4233 #endif /* 1 || COMPAT_PRELITE2 */ 4234 #endif /* !BURN_BRIDGES */ 4235 4236 #define KINFO_VNODESLOP 10 4237 #ifdef notyet 4238 /* 4239 * Dump vnode list (via sysctl). 4240 */ 4241 /* ARGSUSED */ 4242 static int 4243 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4244 { 4245 struct xvnode *xvn; 4246 struct mount *mp; 4247 struct vnode *vp; 4248 int error, len, n; 4249 4250 /* 4251 * Stale numvnodes access is not fatal here. 4252 */ 4253 req->lock = 0; 4254 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4255 if (!req->oldptr) 4256 /* Make an estimate */ 4257 return (SYSCTL_OUT(req, 0, len)); 4258 4259 error = sysctl_wire_old_buffer(req, 0); 4260 if (error != 0) 4261 return (error); 4262 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4263 n = 0; 4264 mtx_lock(&mountlist_mtx); 4265 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4266 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4267 continue; 4268 MNT_ILOCK(mp); 4269 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4270 if (n == len) 4271 break; 4272 vref(vp); 4273 xvn[n].xv_size = sizeof *xvn; 4274 xvn[n].xv_vnode = vp; 4275 xvn[n].xv_id = 0; /* XXX compat */ 4276 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4277 XV_COPY(usecount); 4278 XV_COPY(writecount); 4279 XV_COPY(holdcnt); 4280 XV_COPY(mount); 4281 XV_COPY(numoutput); 4282 XV_COPY(type); 4283 #undef XV_COPY 4284 xvn[n].xv_flag = vp->v_vflag; 4285 4286 switch (vp->v_type) { 4287 case VREG: 4288 case VDIR: 4289 case VLNK: 4290 break; 4291 case VBLK: 4292 case VCHR: 4293 if (vp->v_rdev == NULL) { 4294 vrele(vp); 4295 continue; 4296 } 4297 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4298 break; 4299 case VSOCK: 4300 xvn[n].xv_socket = vp->v_socket; 4301 break; 4302 case VFIFO: 4303 xvn[n].xv_fifo = vp->v_fifoinfo; 4304 break; 4305 case VNON: 4306 case VBAD: 4307 default: 4308 /* shouldn't happen? 
*/
4309 vrele(vp);
4310 continue;
4311 }
4312 vrele(vp);
4313 ++n;
4314 }
4315 MNT_IUNLOCK(mp);
4316 mtx_lock(&mountlist_mtx);
4317 vfs_unbusy(mp);
4318 if (n == len)
4319 break;
4320 }
4321 mtx_unlock(&mountlist_mtx);
4322
4323 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
4324 free(xvn, M_TEMP);
4325 return (error);
4326 }
4327
4328 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
4329 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
4330 "");
4331 #endif
4332
4333 static void
4334 unmount_or_warn(struct mount *mp)
4335 {
4336 int error;
4337
4338 error = dounmount(mp, MNT_FORCE, curthread);
4339 if (error != 0) {
4340 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
4341 if (error == EBUSY)
4342 printf("BUSY)\n");
4343 else
4344 printf("%d)\n", error);
4345 }
4346 }
4347
4348 /*
4349 * Unmount all filesystems. The list is traversed in reverse order
4350 * of mounting to avoid dependencies.
4351 */
4352 void
4353 vfs_unmountall(void)
4354 {
4355 struct mount *mp, *tmp;
4356
4357 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
4358
4359 /*
4360 * Since this only runs when rebooting, it is not interlocked.
4361 */
4362 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
4363 vfs_ref(mp);
4364
4365 /*
4366 * Forcibly unmounting "/dev" before "/" would prevent clean
4367 * unmount of the latter.
4368 */
4369 if (mp == rootdevmp)
4370 continue;
4371
4372 unmount_or_warn(mp);
4373 }
4374
4375 if (rootdevmp != NULL)
4376 unmount_or_warn(rootdevmp);
4377 }
4378
4379 /*
4380 * Perform msync on all vnodes under a mount point.
4381 * The mount point must be locked.
4382 */
4383 void
4384 vfs_msync(struct mount *mp, int flags)
4385 {
4386 struct vnode *vp, *mvp;
4387 struct vm_object *obj;
4388
4389 CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
4390
4391 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
4392 return;
4393
4394 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
4395 obj = vp->v_object;
4396 if (obj != NULL && vm_object_mightbedirty(obj) &&
4397 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
4398 if (!vget(vp,
4399 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
4400 curthread)) {
4401 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
4402 vput(vp);
4403 continue;
4404 }
4405
4406 obj = vp->v_object;
4407 if (obj != NULL) {
4408 VM_OBJECT_WLOCK(obj);
4409 vm_object_page_clean(obj, 0, 0,
4410 flags == MNT_WAIT ?
4411 OBJPC_SYNC : OBJPC_NOSYNC);
4412 VM_OBJECT_WUNLOCK(obj);
4413 }
4414 vput(vp);
4415 }
4416 } else
4417 VI_UNLOCK(vp);
4418 }
4419 }
4420
4421 static void
4422 destroy_vpollinfo_free(struct vpollinfo *vi)
4423 {
4424
4425 knlist_destroy(&vi->vpi_selinfo.si_note);
4426 mtx_destroy(&vi->vpi_lock);
4427 uma_zfree(vnodepoll_zone, vi);
4428 }
4429
4430 static void
4431 destroy_vpollinfo(struct vpollinfo *vi)
4432 {
4433
4434 knlist_clear(&vi->vpi_selinfo.si_note, 1);
4435 seldrain(&vi->vpi_selinfo);
4436 destroy_vpollinfo_free(vi);
4437 }
4438
4439 /*
4440 * Initialize per-vnode helper structure to hold poll-related state.
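 *
 * Safe to call with the structure already present; the loser of the
 * racing allocation below is simply freed.  A minimal sketch of the
 * usual call pattern (vfs_kqfilter() follows the same sequence):
 *
 *	v_addpollinfo(vp);
 *	if (vp->v_pollinfo == NULL)
 *		return (ENOMEM);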
4441 */
4442 void
4443 v_addpollinfo(struct vnode *vp)
4444 {
4445 struct vpollinfo *vi;
4446
4447 if (vp->v_pollinfo != NULL)
4448 return;
4449 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
4450 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
4451 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
4452 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
4453 VI_LOCK(vp);
4454 if (vp->v_pollinfo != NULL) {
4455 VI_UNLOCK(vp);
4456 destroy_vpollinfo_free(vi);
4457 return;
4458 }
4459 vp->v_pollinfo = vi;
4460 VI_UNLOCK(vp);
4461 }
4462
4463 /*
4464 * Record a process's interest in events which might happen to
4465 * a vnode. Because poll uses the historic select-style interface
4466 * internally, this routine serves as both the ``check for any
4467 * pending events'' and the ``record my interest in future events''
4468 * functions. (These are done together, while the lock is held,
4469 * to avoid race conditions.)
4470 */
4471 int
4472 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
4473 {
4474
4475 v_addpollinfo(vp);
4476 mtx_lock(&vp->v_pollinfo->vpi_lock);
4477 if (vp->v_pollinfo->vpi_revents & events) {
4478 /*
4479 * This leaves events we are not interested
4480 * in available for the other process which
4481 * presumably had requested them
4482 * (otherwise they would never have been
4483 * recorded).
4484 */
4485 events &= vp->v_pollinfo->vpi_revents;
4486 vp->v_pollinfo->vpi_revents &= ~events;
4487
4488 mtx_unlock(&vp->v_pollinfo->vpi_lock);
4489 return (events);
4490 }
4491 vp->v_pollinfo->vpi_events |= events;
4492 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
4493 mtx_unlock(&vp->v_pollinfo->vpi_lock);
4494 return (0);
4495 }
4496
4497 /*
4498 * Routine to create and manage a filesystem syncer vnode.
4499 */
4500 #define sync_close ((int (*)(struct vop_close_args *))nullop)
4501 static int sync_fsync(struct vop_fsync_args *);
4502 static int sync_inactive(struct vop_inactive_args *);
4503 static int sync_reclaim(struct vop_reclaim_args *);
4504
4505 static struct vop_vector sync_vnodeops = {
4506 .vop_bypass = VOP_EOPNOTSUPP,
4507 .vop_close = sync_close, /* close */
4508 .vop_fsync = sync_fsync, /* fsync */
4509 .vop_inactive = sync_inactive, /* inactive */
4510 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
4511 .vop_reclaim = sync_reclaim, /* reclaim */
4512 .vop_lock1 = vop_stdlock, /* lock */
4513 .vop_unlock = vop_stdunlock, /* unlock */
4514 .vop_islocked = vop_stdislocked, /* islocked */
4515 };
4516 VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
4517
4518 /*
4519 * Create a new filesystem syncer vnode for the specified mount point.
4520 */
4521 void
4522 vfs_allocate_syncvnode(struct mount *mp)
4523 {
4524 struct vnode *vp;
4525 struct bufobj *bo;
4526 static long start, incr, next;
4527 int error;
4528
4529 /* Allocate a new vnode */
4530 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
4531 if (error != 0)
4532 panic("vfs_allocate_syncvnode: getnewvnode() failed");
4533 vp->v_type = VNON;
4534 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4535 vp->v_vflag |= VV_FORCEINSMQ;
4536 error = insmntque(vp, mp);
4537 if (error != 0)
4538 panic("vfs_allocate_syncvnode: insmntque() failed");
4539 vp->v_vflag &= ~VV_FORCEINSMQ;
4540 VOP_UNLOCK(vp);
4541 /*
4542 * Place the vnode onto the syncer worklist. We attempt to
4543 * scatter them about on the list so that they will go off
4544 * at evenly distributed times even if all the filesystems
4545 * are mounted at once.
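 *
 * As a worked example, with a syncer_maxdelay of 32 the successive
 * values of "next" computed below are 16, 8, 24, 4, 12, 20, 28, 2, ...:
 * each time the range is exhausted the stride halves, filling in the
 * slots between the ones already handed out.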
4546 */
4547 next += incr;
4548 if (next == 0 || next > syncer_maxdelay) {
4549 start /= 2;
4550 incr /= 2;
4551 if (start == 0) {
4552 start = syncer_maxdelay / 2;
4553 incr = syncer_maxdelay;
4554 }
4555 next = start;
4556 }
4557 bo = &vp->v_bufobj;
4558 BO_LOCK(bo);
4559 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
4560 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
4561 mtx_lock(&sync_mtx);
4562 sync_vnode_count++;
4563 if (mp->mnt_syncer == NULL) {
4564 mp->mnt_syncer = vp;
4565 vp = NULL;
4566 }
4567 mtx_unlock(&sync_mtx);
4568 BO_UNLOCK(bo);
4569 if (vp != NULL) {
4570 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4571 vgone(vp);
4572 vput(vp);
4573 }
4574 }
4575
4576 void
4577 vfs_deallocate_syncvnode(struct mount *mp)
4578 {
4579 struct vnode *vp;
4580
4581 mtx_lock(&sync_mtx);
4582 vp = mp->mnt_syncer;
4583 if (vp != NULL)
4584 mp->mnt_syncer = NULL;
4585 mtx_unlock(&sync_mtx);
4586 if (vp != NULL)
4587 vrele(vp);
4588 }
4589
4590 /*
4591 * Do a lazy sync of the filesystem.
4592 */
4593 static int
4594 sync_fsync(struct vop_fsync_args *ap)
4595 {
4596 struct vnode *syncvp = ap->a_vp;
4597 struct mount *mp = syncvp->v_mount;
4598 int error, save;
4599 struct bufobj *bo;
4600
4601 /*
4602 * We only need to do something if this is a lazy evaluation.
4603 */
4604 if (ap->a_waitfor != MNT_LAZY)
4605 return (0);
4606
4607 /*
4608 * Move ourselves to the back of the sync list.
4609 */
4610 bo = &syncvp->v_bufobj;
4611 BO_LOCK(bo);
4612 vn_syncer_add_to_worklist(bo, syncdelay);
4613 BO_UNLOCK(bo);
4614
4615 /*
4616 * Walk the list of vnodes pushing all that are dirty and
4617 * not already on the sync list.
4618 */
4619 if (vfs_busy(mp, MBF_NOWAIT) != 0)
4620 return (0);
4621 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
4622 vfs_unbusy(mp);
4623 return (0);
4624 }
4625 save = curthread_pflags_set(TDP_SYNCIO);
4626 /*
4627 * The filesystem at hand may be idle with free vnodes stored in the
4628 * batch. Return them instead of letting them stay there indefinitely.
4629 */
4630 vnlru_return_batch(mp);
4631 vfs_msync(mp, MNT_NOWAIT);
4632 error = VFS_SYNC(mp, MNT_LAZY);
4633 curthread_pflags_restore(save);
4634 vn_finished_write(mp);
4635 vfs_unbusy(mp);
4636 return (error);
4637 }
4638
4639 /*
4640 * The syncer vnode is no longer referenced.
4641 */
4642 static int
4643 sync_inactive(struct vop_inactive_args *ap)
4644 {
4645
4646 vgone(ap->a_vp);
4647 return (0);
4648 }
4649
4650 /*
4651 * The syncer vnode is no longer needed and is being decommissioned.
4652 *
4653 * Modifications to the worklist must be protected by sync_mtx.
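 * This is reached via vgone() when the syncer vnode is recycled, e.g.
 * after vfs_deallocate_syncvnode() above drops the last use reference.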
4654 */ 4655 static int 4656 sync_reclaim(struct vop_reclaim_args *ap) 4657 { 4658 struct vnode *vp = ap->a_vp; 4659 struct bufobj *bo; 4660 4661 bo = &vp->v_bufobj; 4662 BO_LOCK(bo); 4663 mtx_lock(&sync_mtx); 4664 if (vp->v_mount->mnt_syncer == vp) 4665 vp->v_mount->mnt_syncer = NULL; 4666 if (bo->bo_flag & BO_ONWORKLST) { 4667 LIST_REMOVE(bo, bo_synclist); 4668 syncer_worklist_len--; 4669 sync_vnode_count--; 4670 bo->bo_flag &= ~BO_ONWORKLST; 4671 } 4672 mtx_unlock(&sync_mtx); 4673 BO_UNLOCK(bo); 4674 4675 return (0); 4676 } 4677 4678 int 4679 vn_need_pageq_flush(struct vnode *vp) 4680 { 4681 struct vm_object *obj; 4682 int need; 4683 4684 MPASS(mtx_owned(VI_MTX(vp))); 4685 need = 0; 4686 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 4687 vm_object_mightbedirty(obj)) 4688 need = 1; 4689 return (need); 4690 } 4691 4692 /* 4693 * Check if vnode represents a disk device 4694 */ 4695 int 4696 vn_isdisk(struct vnode *vp, int *errp) 4697 { 4698 int error; 4699 4700 if (vp->v_type != VCHR) { 4701 error = ENOTBLK; 4702 goto out; 4703 } 4704 error = 0; 4705 dev_lock(); 4706 if (vp->v_rdev == NULL) 4707 error = ENXIO; 4708 else if (vp->v_rdev->si_devsw == NULL) 4709 error = ENXIO; 4710 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 4711 error = ENOTBLK; 4712 dev_unlock(); 4713 out: 4714 if (errp != NULL) 4715 *errp = error; 4716 return (error == 0); 4717 } 4718 4719 /* 4720 * Common filesystem object access control check routine. Accepts a 4721 * vnode's type, "mode", uid and gid, requested access mode, credentials, 4722 * and optional call-by-reference privused argument allowing vaccess() 4723 * to indicate to the caller whether privilege was used to satisfy the 4724 * request (obsoleted). Returns 0 on success, or an errno on failure. 4725 */ 4726 int 4727 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 4728 accmode_t accmode, struct ucred *cred, int *privused) 4729 { 4730 accmode_t dac_granted; 4731 accmode_t priv_granted; 4732 4733 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 4734 ("invalid bit in accmode")); 4735 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 4736 ("VAPPEND without VWRITE")); 4737 4738 /* 4739 * Look for a normal, non-privileged way to access the file/directory 4740 * as requested. If it exists, go with that. 4741 */ 4742 4743 if (privused != NULL) 4744 *privused = 0; 4745 4746 dac_granted = 0; 4747 4748 /* Check the owner. */ 4749 if (cred->cr_uid == file_uid) { 4750 dac_granted |= VADMIN; 4751 if (file_mode & S_IXUSR) 4752 dac_granted |= VEXEC; 4753 if (file_mode & S_IRUSR) 4754 dac_granted |= VREAD; 4755 if (file_mode & S_IWUSR) 4756 dac_granted |= (VWRITE | VAPPEND); 4757 4758 if ((accmode & dac_granted) == accmode) 4759 return (0); 4760 4761 goto privcheck; 4762 } 4763 4764 /* Otherwise, check the groups (first match) */ 4765 if (groupmember(file_gid, cred)) { 4766 if (file_mode & S_IXGRP) 4767 dac_granted |= VEXEC; 4768 if (file_mode & S_IRGRP) 4769 dac_granted |= VREAD; 4770 if (file_mode & S_IWGRP) 4771 dac_granted |= (VWRITE | VAPPEND); 4772 4773 if ((accmode & dac_granted) == accmode) 4774 return (0); 4775 4776 goto privcheck; 4777 } 4778 4779 /* Otherwise, check everyone else. 
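 * The "other" mode bits apply only because the credential matched
 * neither the file's owner nor its group above.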
*/ 4780 if (file_mode & S_IXOTH) 4781 dac_granted |= VEXEC; 4782 if (file_mode & S_IROTH) 4783 dac_granted |= VREAD; 4784 if (file_mode & S_IWOTH) 4785 dac_granted |= (VWRITE | VAPPEND); 4786 if ((accmode & dac_granted) == accmode) 4787 return (0); 4788 4789 privcheck: 4790 /* 4791 * Build a privilege mask to determine if the set of privileges 4792 * satisfies the requirements when combined with the granted mask 4793 * from above. For each privilege, if the privilege is required, 4794 * bitwise or the request type onto the priv_granted mask. 4795 */ 4796 priv_granted = 0; 4797 4798 if (type == VDIR) { 4799 /* 4800 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 4801 * requests, instead of PRIV_VFS_EXEC. 4802 */ 4803 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4804 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 4805 priv_granted |= VEXEC; 4806 } else { 4807 /* 4808 * Ensure that at least one execute bit is on. Otherwise, 4809 * a privileged user will always succeed, and we don't want 4810 * this to happen unless the file really is executable. 4811 */ 4812 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 4813 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 4814 !priv_check_cred(cred, PRIV_VFS_EXEC)) 4815 priv_granted |= VEXEC; 4816 } 4817 4818 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 4819 !priv_check_cred(cred, PRIV_VFS_READ)) 4820 priv_granted |= VREAD; 4821 4822 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 4823 !priv_check_cred(cred, PRIV_VFS_WRITE)) 4824 priv_granted |= (VWRITE | VAPPEND); 4825 4826 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 4827 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 4828 priv_granted |= VADMIN; 4829 4830 if ((accmode & (priv_granted | dac_granted)) == accmode) { 4831 /* XXX audit: privilege used */ 4832 if (privused != NULL) 4833 *privused = 1; 4834 return (0); 4835 } 4836 4837 return ((accmode & VADMIN) ? EPERM : EACCES); 4838 } 4839 4840 /* 4841 * Credential check based on process requesting service, and per-attribute 4842 * permissions. 4843 */ 4844 int 4845 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 4846 struct thread *td, accmode_t accmode) 4847 { 4848 4849 /* 4850 * Kernel-invoked always succeeds. 4851 */ 4852 if (cred == NOCRED) 4853 return (0); 4854 4855 /* 4856 * Do not allow privileged processes in jail to directly manipulate 4857 * system attributes. 4858 */ 4859 switch (attrnamespace) { 4860 case EXTATTR_NAMESPACE_SYSTEM: 4861 /* Potentially should be: return (EPERM); */ 4862 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 4863 case EXTATTR_NAMESPACE_USER: 4864 return (VOP_ACCESS(vp, accmode, cred, td)); 4865 default: 4866 return (EPERM); 4867 } 4868 } 4869 4870 #ifdef DEBUG_VFS_LOCKS 4871 /* 4872 * This only exists to suppress warnings from unlocked specfs accesses. It is 4873 * no longer ok to have an unlocked VFS. 4874 */ 4875 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 4876 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 4877 4878 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 4879 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4880 "Drop into debugger on lock violation"); 4881 4882 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 4883 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4884 0, "Check for interlock across VOPs"); 4885 4886 int vfs_badlock_print = 1; /* Print lock violations. 
*/ 4887 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4888 0, "Print lock violations"); 4889 4890 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 4891 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 4892 0, "Print vnode details on lock violations"); 4893 4894 #ifdef KDB 4895 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 4896 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4897 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4898 #endif 4899 4900 static void 4901 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4902 { 4903 4904 #ifdef KDB 4905 if (vfs_badlock_backtrace) 4906 kdb_backtrace(); 4907 #endif 4908 if (vfs_badlock_vnode) 4909 vn_printf(vp, "vnode "); 4910 if (vfs_badlock_print) 4911 printf("%s: %p %s\n", str, (void *)vp, msg); 4912 if (vfs_badlock_ddb) 4913 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4914 } 4915 4916 void 4917 assert_vi_locked(struct vnode *vp, const char *str) 4918 { 4919 4920 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4921 vfs_badlock("interlock is not locked but should be", str, vp); 4922 } 4923 4924 void 4925 assert_vi_unlocked(struct vnode *vp, const char *str) 4926 { 4927 4928 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4929 vfs_badlock("interlock is locked but should not be", str, vp); 4930 } 4931 4932 void 4933 assert_vop_locked(struct vnode *vp, const char *str) 4934 { 4935 int locked; 4936 4937 if (!IGNORE_LOCK(vp)) { 4938 locked = VOP_ISLOCKED(vp); 4939 if (locked == 0 || locked == LK_EXCLOTHER) 4940 vfs_badlock("is not locked but should be", str, vp); 4941 } 4942 } 4943 4944 void 4945 assert_vop_unlocked(struct vnode *vp, const char *str) 4946 { 4947 4948 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4949 vfs_badlock("is locked but should not be", str, vp); 4950 } 4951 4952 void 4953 assert_vop_elocked(struct vnode *vp, const char *str) 4954 { 4955 4956 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4957 vfs_badlock("is not exclusive locked but should be", str, vp); 4958 } 4959 #endif /* DEBUG_VFS_LOCKS */ 4960 4961 void 4962 vop_rename_fail(struct vop_rename_args *ap) 4963 { 4964 4965 if (ap->a_tvp != NULL) 4966 vput(ap->a_tvp); 4967 if (ap->a_tdvp == ap->a_tvp) 4968 vrele(ap->a_tdvp); 4969 else 4970 vput(ap->a_tdvp); 4971 vrele(ap->a_fdvp); 4972 vrele(ap->a_fvp); 4973 } 4974 4975 void 4976 vop_rename_pre(void *ap) 4977 { 4978 struct vop_rename_args *a = ap; 4979 4980 #ifdef DEBUG_VFS_LOCKS 4981 if (a->a_tvp) 4982 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4983 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4984 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4985 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4986 4987 /* Check the source (from). */ 4988 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4989 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4990 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4991 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4992 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4993 4994 /* Check the target. 
*/ 4995 if (a->a_tvp) 4996 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4997 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4998 #endif 4999 if (a->a_tdvp != a->a_fdvp) 5000 vhold(a->a_fdvp); 5001 if (a->a_tvp != a->a_fvp) 5002 vhold(a->a_fvp); 5003 vhold(a->a_tdvp); 5004 if (a->a_tvp) 5005 vhold(a->a_tvp); 5006 } 5007 5008 #ifdef DEBUG_VFS_LOCKS 5009 void 5010 vop_strategy_pre(void *ap) 5011 { 5012 struct vop_strategy_args *a; 5013 struct buf *bp; 5014 5015 a = ap; 5016 bp = a->a_bp; 5017 5018 /* 5019 * Cluster ops lock their component buffers but not the IO container. 5020 */ 5021 if ((bp->b_flags & B_CLUSTER) != 0) 5022 return; 5023 5024 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 5025 if (vfs_badlock_print) 5026 printf( 5027 "VOP_STRATEGY: bp is not locked but should be\n"); 5028 if (vfs_badlock_ddb) 5029 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5030 } 5031 } 5032 5033 void 5034 vop_lock_pre(void *ap) 5035 { 5036 struct vop_lock1_args *a = ap; 5037 5038 if ((a->a_flags & LK_INTERLOCK) == 0) 5039 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5040 else 5041 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5042 } 5043 5044 void 5045 vop_lock_post(void *ap, int rc) 5046 { 5047 struct vop_lock1_args *a = ap; 5048 5049 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5050 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5051 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5052 } 5053 5054 void 5055 vop_unlock_pre(void *ap) 5056 { 5057 struct vop_unlock_args *a = ap; 5058 5059 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 5060 } 5061 5062 void 5063 vop_unlock_post(void *ap, int rc) 5064 { 5065 return; 5066 } 5067 5068 void 5069 vop_need_inactive_pre(void *ap) 5070 { 5071 struct vop_need_inactive_args *a = ap; 5072 5073 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5074 } 5075 5076 void 5077 vop_need_inactive_post(void *ap, int rc) 5078 { 5079 struct vop_need_inactive_args *a = ap; 5080 5081 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5082 } 5083 #endif 5084 5085 void 5086 vop_create_post(void *ap, int rc) 5087 { 5088 struct vop_create_args *a = ap; 5089 5090 if (!rc) 5091 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 5092 } 5093 5094 void 5095 vop_deleteextattr_post(void *ap, int rc) 5096 { 5097 struct vop_deleteextattr_args *a = ap; 5098 5099 if (!rc) 5100 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5101 } 5102 5103 void 5104 vop_link_post(void *ap, int rc) 5105 { 5106 struct vop_link_args *a = ap; 5107 5108 if (!rc) { 5109 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 5110 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 5111 } 5112 } 5113 5114 void 5115 vop_mkdir_post(void *ap, int rc) 5116 { 5117 struct vop_mkdir_args *a = ap; 5118 5119 if (!rc) 5120 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 5121 } 5122 5123 void 5124 vop_mknod_post(void *ap, int rc) 5125 { 5126 struct vop_mknod_args *a = ap; 5127 5128 if (!rc) 5129 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 5130 } 5131 5132 void 5133 vop_reclaim_post(void *ap, int rc) 5134 { 5135 struct vop_reclaim_args *a = ap; 5136 5137 if (!rc) 5138 VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); 5139 } 5140 5141 void 5142 vop_remove_post(void *ap, int rc) 5143 { 5144 struct vop_remove_args *a = ap; 5145 5146 if (!rc) { 5147 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 5148 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 5149 } 5150 } 5151 5152 void 5153 vop_rename_post(void *ap, int rc) 5154 { 5155 struct vop_rename_args *a = ap; 5156 long hint; 5157 5158 if (!rc) { 5159 hint = NOTE_WRITE; 5160 if (a->a_fdvp == a->a_tdvp) { 5161 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5162 
hint |= NOTE_LINK; 5163 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5164 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5165 } else { 5166 hint |= NOTE_EXTEND; 5167 if (a->a_fvp->v_type == VDIR) 5168 hint |= NOTE_LINK; 5169 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5170 5171 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5172 a->a_tvp->v_type == VDIR) 5173 hint &= ~NOTE_LINK; 5174 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5175 } 5176 5177 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5178 if (a->a_tvp) 5179 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5180 } 5181 if (a->a_tdvp != a->a_fdvp) 5182 vdrop(a->a_fdvp); 5183 if (a->a_tvp != a->a_fvp) 5184 vdrop(a->a_fvp); 5185 vdrop(a->a_tdvp); 5186 if (a->a_tvp) 5187 vdrop(a->a_tvp); 5188 } 5189 5190 void 5191 vop_rmdir_post(void *ap, int rc) 5192 { 5193 struct vop_rmdir_args *a = ap; 5194 5195 if (!rc) { 5196 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 5197 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 5198 } 5199 } 5200 5201 void 5202 vop_setattr_post(void *ap, int rc) 5203 { 5204 struct vop_setattr_args *a = ap; 5205 5206 if (!rc) 5207 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5208 } 5209 5210 void 5211 vop_setextattr_post(void *ap, int rc) 5212 { 5213 struct vop_setextattr_args *a = ap; 5214 5215 if (!rc) 5216 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5217 } 5218 5219 void 5220 vop_symlink_post(void *ap, int rc) 5221 { 5222 struct vop_symlink_args *a = ap; 5223 5224 if (!rc) 5225 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 5226 } 5227 5228 void 5229 vop_open_post(void *ap, int rc) 5230 { 5231 struct vop_open_args *a = ap; 5232 5233 if (!rc) 5234 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 5235 } 5236 5237 void 5238 vop_close_post(void *ap, int rc) 5239 { 5240 struct vop_close_args *a = ap; 5241 5242 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 5243 !VN_IS_DOOMED(a->a_vp))) { 5244 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 5245 NOTE_CLOSE_WRITE : NOTE_CLOSE); 5246 } 5247 } 5248 5249 void 5250 vop_read_post(void *ap, int rc) 5251 { 5252 struct vop_read_args *a = ap; 5253 5254 if (!rc) 5255 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 5256 } 5257 5258 void 5259 vop_readdir_post(void *ap, int rc) 5260 { 5261 struct vop_readdir_args *a = ap; 5262 5263 if (!rc) 5264 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 5265 } 5266 5267 static struct knlist fs_knlist; 5268 5269 static void 5270 vfs_event_init(void *arg) 5271 { 5272 knlist_init_mtx(&fs_knlist, NULL); 5273 } 5274 /* XXX - correct order? 
*/ 5275 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 5276 5277 void 5278 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 5279 { 5280 5281 KNOTE_UNLOCKED(&fs_knlist, event); 5282 } 5283 5284 static int filt_fsattach(struct knote *kn); 5285 static void filt_fsdetach(struct knote *kn); 5286 static int filt_fsevent(struct knote *kn, long hint); 5287 5288 struct filterops fs_filtops = { 5289 .f_isfd = 0, 5290 .f_attach = filt_fsattach, 5291 .f_detach = filt_fsdetach, 5292 .f_event = filt_fsevent 5293 }; 5294 5295 static int 5296 filt_fsattach(struct knote *kn) 5297 { 5298 5299 kn->kn_flags |= EV_CLEAR; 5300 knlist_add(&fs_knlist, kn, 0); 5301 return (0); 5302 } 5303 5304 static void 5305 filt_fsdetach(struct knote *kn) 5306 { 5307 5308 knlist_remove(&fs_knlist, kn, 0); 5309 } 5310 5311 static int 5312 filt_fsevent(struct knote *kn, long hint) 5313 { 5314 5315 kn->kn_fflags |= hint; 5316 return (kn->kn_fflags != 0); 5317 } 5318 5319 static int 5320 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 5321 { 5322 struct vfsidctl vc; 5323 int error; 5324 struct mount *mp; 5325 5326 error = SYSCTL_IN(req, &vc, sizeof(vc)); 5327 if (error) 5328 return (error); 5329 if (vc.vc_vers != VFS_CTL_VERS1) 5330 return (EINVAL); 5331 mp = vfs_getvfs(&vc.vc_fsid); 5332 if (mp == NULL) 5333 return (ENOENT); 5334 /* ensure that a specific sysctl goes to the right filesystem. */ 5335 if (strcmp(vc.vc_fstypename, "*") != 0 && 5336 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 5337 vfs_rel(mp); 5338 return (EINVAL); 5339 } 5340 VCTLTOREQ(&vc, req); 5341 error = VFS_SYSCTL(mp, vc.vc_op, req); 5342 vfs_rel(mp); 5343 return (error); 5344 } 5345 5346 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 5347 NULL, 0, sysctl_vfs_ctl, "", 5348 "Sysctl by fsid"); 5349 5350 /* 5351 * Function to initialize a va_filerev field sensibly. 5352 * XXX: Wouldn't a random number make a lot more sense ?? 
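 *
 * The returned value packs the system uptime: the high 32 bits hold
 * bt.sec and the low 32 bits the top half of bt.frac, so successive
 * calls never yield a smaller value and tick at sub-second granularity.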
5353 */ 5354 u_quad_t 5355 init_va_filerev(void) 5356 { 5357 struct bintime bt; 5358 5359 getbinuptime(&bt); 5360 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 5361 } 5362 5363 static int filt_vfsread(struct knote *kn, long hint); 5364 static int filt_vfswrite(struct knote *kn, long hint); 5365 static int filt_vfsvnode(struct knote *kn, long hint); 5366 static void filt_vfsdetach(struct knote *kn); 5367 static struct filterops vfsread_filtops = { 5368 .f_isfd = 1, 5369 .f_detach = filt_vfsdetach, 5370 .f_event = filt_vfsread 5371 }; 5372 static struct filterops vfswrite_filtops = { 5373 .f_isfd = 1, 5374 .f_detach = filt_vfsdetach, 5375 .f_event = filt_vfswrite 5376 }; 5377 static struct filterops vfsvnode_filtops = { 5378 .f_isfd = 1, 5379 .f_detach = filt_vfsdetach, 5380 .f_event = filt_vfsvnode 5381 }; 5382 5383 static void 5384 vfs_knllock(void *arg) 5385 { 5386 struct vnode *vp = arg; 5387 5388 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5389 } 5390 5391 static void 5392 vfs_knlunlock(void *arg) 5393 { 5394 struct vnode *vp = arg; 5395 5396 VOP_UNLOCK(vp); 5397 } 5398 5399 static void 5400 vfs_knl_assert_locked(void *arg) 5401 { 5402 #ifdef DEBUG_VFS_LOCKS 5403 struct vnode *vp = arg; 5404 5405 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 5406 #endif 5407 } 5408 5409 static void 5410 vfs_knl_assert_unlocked(void *arg) 5411 { 5412 #ifdef DEBUG_VFS_LOCKS 5413 struct vnode *vp = arg; 5414 5415 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 5416 #endif 5417 } 5418 5419 int 5420 vfs_kqfilter(struct vop_kqfilter_args *ap) 5421 { 5422 struct vnode *vp = ap->a_vp; 5423 struct knote *kn = ap->a_kn; 5424 struct knlist *knl; 5425 5426 switch (kn->kn_filter) { 5427 case EVFILT_READ: 5428 kn->kn_fop = &vfsread_filtops; 5429 break; 5430 case EVFILT_WRITE: 5431 kn->kn_fop = &vfswrite_filtops; 5432 break; 5433 case EVFILT_VNODE: 5434 kn->kn_fop = &vfsvnode_filtops; 5435 break; 5436 default: 5437 return (EINVAL); 5438 } 5439 5440 kn->kn_hook = (caddr_t)vp; 5441 5442 v_addpollinfo(vp); 5443 if (vp->v_pollinfo == NULL) 5444 return (ENOMEM); 5445 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 5446 vhold(vp); 5447 knlist_add(knl, kn, 0); 5448 5449 return (0); 5450 } 5451 5452 /* 5453 * Detach knote from vnode 5454 */ 5455 static void 5456 filt_vfsdetach(struct knote *kn) 5457 { 5458 struct vnode *vp = (struct vnode *)kn->kn_hook; 5459 5460 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 5461 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 5462 vdrop(vp); 5463 } 5464 5465 /*ARGSUSED*/ 5466 static int 5467 filt_vfsread(struct knote *kn, long hint) 5468 { 5469 struct vnode *vp = (struct vnode *)kn->kn_hook; 5470 struct vattr va; 5471 int res; 5472 5473 /* 5474 * filesystem is gone, so set the EOF flag and schedule 5475 * the knote for deletion. 5476 */ 5477 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 5478 VI_LOCK(vp); 5479 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 5480 VI_UNLOCK(vp); 5481 return (1); 5482 } 5483 5484 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 5485 return (0); 5486 5487 VI_LOCK(vp); 5488 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 5489 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 5490 VI_UNLOCK(vp); 5491 return (res); 5492 } 5493 5494 /*ARGSUSED*/ 5495 static int 5496 filt_vfswrite(struct knote *kn, long hint) 5497 { 5498 struct vnode *vp = (struct vnode *)kn->kn_hook; 5499 5500 VI_LOCK(vp); 5501 5502 /* 5503 * filesystem is gone, so set the EOF flag and schedule 5504 * the knote for deletion. 
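 * Either way the filter reports ready: a writer on a dead vnode
 * should wake up and observe EV_EOF rather than block forever.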
5505 */
5506 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
5507 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
5508
5509 kn->kn_data = 0;
5510 VI_UNLOCK(vp);
5511 return (1);
5512 }
5513
5514 static int
5515 filt_vfsvnode(struct knote *kn, long hint)
5516 {
5517 struct vnode *vp = (struct vnode *)kn->kn_hook;
5518 int res;
5519
5520 VI_LOCK(vp);
5521 if (kn->kn_sfflags & hint)
5522 kn->kn_fflags |= hint;
5523 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
5524 kn->kn_flags |= EV_EOF;
5525 VI_UNLOCK(vp);
5526 return (1);
5527 }
5528 res = (kn->kn_fflags != 0);
5529 VI_UNLOCK(vp);
5530 return (res);
5531 }
5532
5533 /*
5534 * Determine whether the directory is empty.
5535 * The return value is 0 if it is empty;
5536 * otherwise it is an error value (which may
5537 * be ENOTEMPTY).
5538 */
5539 int
5540 vfs_emptydir(struct vnode *vp)
5541 {
5542 struct uio uio;
5543 struct iovec iov;
5544 struct dirent *dirent, *dp, *endp;
5545 int error, eof;
5546
5547 error = 0;
5548 eof = 0;
5549
5550 ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
5551
5552 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
5553 iov.iov_base = dirent;
5554 iov.iov_len = sizeof(struct dirent);
5555
5556 uio.uio_iov = &iov;
5557 uio.uio_iovcnt = 1;
5558 uio.uio_offset = 0;
5559 uio.uio_resid = sizeof(struct dirent);
5560 uio.uio_segflg = UIO_SYSSPACE;
5561 uio.uio_rw = UIO_READ;
5562 uio.uio_td = curthread;
5563
5564 while (eof == 0 && error == 0) {
5565 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
5566 NULL, NULL);
5567 if (error != 0)
5568 break;
5569 endp = (void *)((uint8_t *)dirent +
5570 sizeof(struct dirent) - uio.uio_resid);
5571 for (dp = dirent; dp < endp;
5572 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
5573 if (dp->d_type == DT_WHT)
5574 continue;
5575 if (dp->d_namlen == 0)
5576 continue;
5577 if (dp->d_type != DT_DIR &&
5578 dp->d_type != DT_UNKNOWN) {
5579 error = ENOTEMPTY;
5580 break;
5581 }
5582 if (dp->d_namlen > 2) {
5583 error = ENOTEMPTY;
5584 break;
5585 }
5586 if (dp->d_namlen == 1 &&
5587 dp->d_name[0] != '.') {
5588 error = ENOTEMPTY;
5589 break;
5590 }
5591 if (dp->d_namlen == 2 &&
5592 (dp->d_name[0] != '.' || dp->d_name[1] != '.')) {
5593 error = ENOTEMPTY;
5594 break;
5595 }
5596 uio.uio_resid = sizeof(struct dirent);
5597 }
5598 }
5599 free(dirent, M_TEMP);
5600 return (error);
5601 }
5602
5603 int
5604 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
5605 {
5606 int error;
5607
5608 if (dp->d_reclen > ap->a_uio->uio_resid)
5609 return (ENAMETOOLONG);
5610 error = uiomove(dp, dp->d_reclen, ap->a_uio);
5611 if (error) {
5612 if (ap->a_ncookies != NULL) {
5613 if (ap->a_cookies != NULL)
5614 free(ap->a_cookies, M_TEMP);
5615 ap->a_cookies = NULL;
5616 *ap->a_ncookies = 0;
5617 }
5618 return (error);
5619 }
5620 if (ap->a_ncookies == NULL)
5621 return (0);
5622
5623 KASSERT(ap->a_cookies,
5624 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
5625
5626 *ap->a_cookies = realloc(*ap->a_cookies,
5627 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
5628 (*ap->a_cookies)[*ap->a_ncookies] = off;
5629 *ap->a_ncookies += 1;
5630 return (0);
5631 }
5632
5633 /*
5634 * Mark for update the access time of the file if the filesystem
5635 * supports VOP_MARKATIME. This functionality is used by execve and
5636 * mmap, so we want to avoid the I/O implied by directly setting
5637 * va_atime for the sake of efficiency.
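 *
 * A minimal sketch of the intended call pattern (the vnode must be
 * locked, as asserted below):
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	vfs_mark_atime(vp, td->td_ucred);
 *	VOP_UNLOCK(vp);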
5638 */
5639 void
5640 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
5641 {
5642 struct mount *mp;
5643
5644 mp = vp->v_mount;
5645 ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
5646 if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
5647 (void)VOP_MARKATIME(vp);
5648 }
5649
5650 /*
5651 * The purpose of this routine is to remove granularity from accmode_t,
5652 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
5653 * VADMIN and VAPPEND.
5654 *
5655 * If it returns 0, the caller is supposed to continue with the usual
5656 * access checks using 'accmode' as modified by this routine. If it
5657 * returns a nonzero value, the caller is supposed to return that value
5658 * as errno.
5659 *
5660 * Note that after this routine runs, accmode may be zero.
5661 */
5662 int
5663 vfs_unixify_accmode(accmode_t *accmode)
5664 {
5665 /*
5666 * There is no way to specify an explicit "deny" rule using
5667 * file mode or POSIX.1e ACLs.
5668 */
5669 if (*accmode & VEXPLICIT_DENY) {
5670 *accmode = 0;
5671 return (0);
5672 }
5673
5674 /*
5675 * None of these can be translated into usual access bits.
5676 * Also, the common case for NFSv4 ACLs is to not contain
5677 * either of these bits. Caller should check for VWRITE
5678 * on the containing directory instead.
5679 */
5680 if (*accmode & (VDELETE_CHILD | VDELETE))
5681 return (EPERM);
5682
5683 if (*accmode & VADMIN_PERMS) {
5684 *accmode &= ~VADMIN_PERMS;
5685 *accmode |= VADMIN;
5686 }
5687
5688 /*
5689 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
5690 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
5691 */
5692 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
5693
5694 return (0);
5695 }
5696
5697 /*
5698 * Clear out a doomed vnode (if any) and replace it with a new one as long
5699 * as the fs is not being unmounted. Return the root vnode to the caller.
5700 */
5701 static int __noinline
5702 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
5703 {
5704 struct vnode *vp;
5705 int error;
5706
5707 restart:
5708 if (mp->mnt_rootvnode != NULL) {
5709 MNT_ILOCK(mp);
5710 vp = mp->mnt_rootvnode;
5711 if (vp != NULL) {
5712 if (!VN_IS_DOOMED(vp)) {
5713 vrefact(vp);
5714 MNT_IUNLOCK(mp);
5715 error = vn_lock(vp, flags);
5716 if (error == 0) {
5717 *vpp = vp;
5718 return (0);
5719 }
5720 vrele(vp);
5721 goto restart;
5722 }
5723 /*
5724 * Clear the old one.
5725 */
5726 mp->mnt_rootvnode = NULL;
5727 }
5728 MNT_IUNLOCK(mp);
5729 if (vp != NULL) {
5730 /*
5731 * Paired with a fence in vfs_op_thread_exit().
*/
5733 atomic_thread_fence_acq();
5734 vfs_op_barrier_wait(mp);
5735 vrele(vp);
5736 }
5737 }
5738 error = VFS_CACHEDROOT(mp, flags, vpp);
5739 if (error != 0)
5740 return (error);
5741 if (mp->mnt_vfs_ops == 0) {
5742 MNT_ILOCK(mp);
5743 if (mp->mnt_vfs_ops != 0) {
5744 MNT_IUNLOCK(mp);
5745 return (0);
5746 }
5747 if (mp->mnt_rootvnode == NULL) {
5748 vrefact(*vpp);
5749 mp->mnt_rootvnode = *vpp;
5750 } else {
5751 if (mp->mnt_rootvnode != *vpp) {
5752 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
5753 panic("%s: mismatch between vnode returned "
5754 "by VFS_CACHEDROOT and the one cached "
5755 "(%p != %p)",
5756 __func__, *vpp, mp->mnt_rootvnode);
5757 }
5758 }
5759 }
5760 MNT_IUNLOCK(mp);
5761 }
5762 return (0);
5763 }
5764
5765 int
5766 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
5767 {
5768 struct vnode *vp;
5769 int error;
5770
5771 if (!vfs_op_thread_enter(mp))
5772 return (vfs_cache_root_fallback(mp, flags, vpp));
5773 vp = (struct vnode *)atomic_load_ptr(&mp->mnt_rootvnode);
5774 if (vp == NULL || VN_IS_DOOMED(vp)) {
5775 vfs_op_thread_exit(mp);
5776 return (vfs_cache_root_fallback(mp, flags, vpp));
5777 }
5778 vrefact(vp);
5779 vfs_op_thread_exit(mp);
5780 error = vn_lock(vp, flags);
5781 if (error != 0) {
5782 vrele(vp);
5783 return (vfs_cache_root_fallback(mp, flags, vpp));
5784 }
5785 *vpp = vp;
5786 return (0);
5787 }
5788
5789 struct vnode *
5790 vfs_cache_root_clear(struct mount *mp)
5791 {
5792 struct vnode *vp;
5793
5794 /*
5795 * ops > 0 guarantees there is nobody who can see this vnode
5796 */
5797 MPASS(mp->mnt_vfs_ops > 0);
5798 vp = mp->mnt_rootvnode;
5799 mp->mnt_rootvnode = NULL;
5800 return (vp);
5801 }
5802
5803 void
5804 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
5805 {
5806
5807 MPASS(mp->mnt_vfs_ops > 0);
5808 vrefact(vp);
5809 mp->mnt_rootvnode = vp;
5810 }
5811
5812 /*
5813 * These are helper functions for filesystems to traverse all
5814 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
5815 *
5816 * This interface replaces MNT_VNODE_FOREACH.
5817 */
5818
5819
5820 struct vnode *
5821 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
5822 {
5823 struct vnode *vp;
5824
5825 if (should_yield())
5826 kern_yield(PRI_USER);
5827 MNT_ILOCK(mp);
5828 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5829 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
5830 vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
5831 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
5832 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
5833 continue;
5834 VI_LOCK(vp);
5835 if (VN_IS_DOOMED(vp)) {
5836 VI_UNLOCK(vp);
5837 continue;
5838 }
5839 break;
5840 }
5841 if (vp == NULL) {
5842 __mnt_vnode_markerfree_all(mvp, mp);
5843 /* MNT_IUNLOCK(mp); -- done in above function */
5844 mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
5845 return (NULL);
5846 }
5847 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
5848 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
5849 MNT_IUNLOCK(mp);
5850 return (vp);
5851 }
5852
5853 struct vnode *
5854 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
5855 {
5856 struct vnode *vp;
5857
5858 *mvp = vn_alloc_marker(mp);
5859 MNT_ILOCK(mp);
5860 MNT_REF(mp);
5861
5862 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
5863 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition.
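 * The flag is re-checked under the vnode interlock before the vnode
 * is returned, so the unlocked read can at worst force an extra skip.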
*/ 5864 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 5865 continue; 5866 VI_LOCK(vp); 5867 if (VN_IS_DOOMED(vp)) { 5868 VI_UNLOCK(vp); 5869 continue; 5870 } 5871 break; 5872 } 5873 if (vp == NULL) { 5874 MNT_REL(mp); 5875 MNT_IUNLOCK(mp); 5876 vn_free_marker(*mvp); 5877 *mvp = NULL; 5878 return (NULL); 5879 } 5880 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 5881 MNT_IUNLOCK(mp); 5882 return (vp); 5883 } 5884 5885 void 5886 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 5887 { 5888 5889 if (*mvp == NULL) { 5890 MNT_IUNLOCK(mp); 5891 return; 5892 } 5893 5894 mtx_assert(MNT_MTX(mp), MA_OWNED); 5895 5896 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5897 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 5898 MNT_REL(mp); 5899 MNT_IUNLOCK(mp); 5900 vn_free_marker(*mvp); 5901 *mvp = NULL; 5902 } 5903 5904 /* 5905 * These are helper functions for filesystems to traverse their 5906 * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h 5907 */ 5908 static void 5909 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 5910 { 5911 5912 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 5913 5914 MNT_ILOCK(mp); 5915 MNT_REL(mp); 5916 MNT_IUNLOCK(mp); 5917 vn_free_marker(*mvp); 5918 *mvp = NULL; 5919 } 5920 5921 /* 5922 * Relock the mp mount vnode list lock with the vp vnode interlock in the 5923 * conventional lock order during mnt_vnode_next_active iteration. 5924 * 5925 * On entry, the mount vnode list lock is held and the vnode interlock is not. 5926 * The list lock is dropped and reacquired. On success, both locks are held. 5927 * On failure, the mount vnode list lock is held but the vnode interlock is 5928 * not, and the procedure may have yielded. 5929 */ 5930 static bool 5931 mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp, 5932 struct vnode *vp) 5933 { 5934 const struct vnode *tmp; 5935 bool held, ret; 5936 5937 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 5938 TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp, 5939 ("%s: bad marker", __func__)); 5940 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 5941 ("%s: inappropriate vnode", __func__)); 5942 ASSERT_VI_UNLOCKED(vp, __func__); 5943 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 5944 5945 ret = false; 5946 5947 TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist); 5948 TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist); 5949 5950 /* 5951 * Use a hold to prevent vp from disappearing while the mount vnode 5952 * list lock is dropped and reacquired. Normally a hold would be 5953 * acquired with vhold(), but that might try to acquire the vnode 5954 * interlock, which would be a LOR with the mount vnode list lock. 5955 */ 5956 held = refcount_acquire_if_not_zero(&vp->v_holdcnt); 5957 mtx_unlock(&mp->mnt_listmtx); 5958 if (!held) 5959 goto abort; 5960 VI_LOCK(vp); 5961 if (!refcount_release_if_not_last(&vp->v_holdcnt)) { 5962 vdropl(vp); 5963 goto abort; 5964 } 5965 mtx_lock(&mp->mnt_listmtx); 5966 5967 /* 5968 * Determine whether the vnode is still the next one after the marker, 5969 * excepting any other markers. If the vnode has not been doomed by 5970 * vgone() then the hold should have ensured that it remained on the 5971 * active list. If it has been doomed but is still on the active list, 5972 * don't abort, but rather skip over it (avoid spinning on doomed 5973 * vnodes). 
5974 */ 5975 tmp = mvp; 5976 do { 5977 tmp = TAILQ_NEXT(tmp, v_actfreelist); 5978 } while (tmp != NULL && tmp->v_type == VMARKER); 5979 if (tmp != vp) { 5980 mtx_unlock(&mp->mnt_listmtx); 5981 VI_UNLOCK(vp); 5982 goto abort; 5983 } 5984 5985 ret = true; 5986 goto out; 5987 abort: 5988 maybe_yield(); 5989 mtx_lock(&mp->mnt_listmtx); 5990 out: 5991 if (ret) 5992 ASSERT_VI_LOCKED(vp, __func__); 5993 else 5994 ASSERT_VI_UNLOCKED(vp, __func__); 5995 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 5996 return (ret); 5997 } 5998 5999 static struct vnode * 6000 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 6001 { 6002 struct vnode *vp, *nvp; 6003 6004 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6005 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6006 restart: 6007 vp = TAILQ_NEXT(*mvp, v_actfreelist); 6008 while (vp != NULL) { 6009 if (vp->v_type == VMARKER) { 6010 vp = TAILQ_NEXT(vp, v_actfreelist); 6011 continue; 6012 } 6013 /* 6014 * Try-lock because this is the wrong lock order. If that does 6015 * not succeed, drop the mount vnode list lock and try to 6016 * reacquire it and the vnode interlock in the right order. 6017 */ 6018 if (!VI_TRYLOCK(vp) && 6019 !mnt_vnode_next_active_relock(*mvp, mp, vp)) 6020 goto restart; 6021 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6022 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6023 ("alien vnode on the active list %p %p", vp, mp)); 6024 if (vp->v_mount == mp && !VN_IS_DOOMED(vp)) 6025 break; 6026 nvp = TAILQ_NEXT(vp, v_actfreelist); 6027 VI_UNLOCK(vp); 6028 vp = nvp; 6029 } 6030 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 6031 6032 /* Check if we are done */ 6033 if (vp == NULL) { 6034 mtx_unlock(&mp->mnt_listmtx); 6035 mnt_vnode_markerfree_active(mvp, mp); 6036 return (NULL); 6037 } 6038 TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); 6039 mtx_unlock(&mp->mnt_listmtx); 6040 ASSERT_VI_LOCKED(vp, "active iter"); 6041 KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); 6042 return (vp); 6043 } 6044 6045 struct vnode * 6046 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 6047 { 6048 6049 if (should_yield()) 6050 kern_yield(PRI_USER); 6051 mtx_lock(&mp->mnt_listmtx); 6052 return (mnt_vnode_next_active(mvp, mp)); 6053 } 6054 6055 struct vnode * 6056 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) 6057 { 6058 struct vnode *vp; 6059 6060 *mvp = vn_alloc_marker(mp); 6061 MNT_ILOCK(mp); 6062 MNT_REF(mp); 6063 MNT_IUNLOCK(mp); 6064 6065 mtx_lock(&mp->mnt_listmtx); 6066 vp = TAILQ_FIRST(&mp->mnt_activevnodelist); 6067 if (vp == NULL) { 6068 mtx_unlock(&mp->mnt_listmtx); 6069 mnt_vnode_markerfree_active(mvp, mp); 6070 return (NULL); 6071 } 6072 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 6073 return (mnt_vnode_next_active(mvp, mp)); 6074 } 6075 6076 void 6077 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 6078 { 6079 6080 if (*mvp == NULL) 6081 return; 6082 6083 mtx_lock(&mp->mnt_listmtx); 6084 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 6085 mtx_unlock(&mp->mnt_listmtx); 6086 mnt_vnode_markerfree_active(mvp, mp); 6087 } 6088
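
/*
 * A minimal sketch of how a filesystem typically drives the iterators
 * above, modelled on vfs_msync(): MNT_VNODE_FOREACH_ACTIVE hands back
 * each vnode with its interlock held, while the marker vnode keeps the
 * scan position stable whenever the list lock is dropped.  Here
 * "interesting" and "do_work" are stand-ins for whatever per-vnode
 * predicate and processing the caller supplies:
 *
 *	struct vnode *vp, *mvp;
 *
 *	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
 *		if (!interesting(vp)) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY,
 *		    curthread) == 0) {
 *			do_work(vp);
 *			vput(vp);
 *		}
 *	}
 */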