/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_init_counters(struct vnode *);
static void	v_incr_devcount(struct vnode *);
static void	v_decr_devcount(struct vnode *);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	vnlru_return_batches(struct vfsops *mnt_op);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
		    daddr_t startlbn, daddr_t endlbn);

/*
 * These fences are intended for cases where some synchronization is
 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
 * and v_usecount) updates.  Access to v_iflags is generally synchronized
 * by the interlock, but we have some internal assertions that check vnode
 * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
 * for now.
 */
#ifdef INVARIANTS
#define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
#define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
#else
#define	VNODE_REFCOUNT_FENCE_ACQ()
#define	VNODE_REFCOUNT_FENCE_REL()
#endif

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 */
static unsigned long	numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

static u_long mnt_free_list_batch = 128;
SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
    &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};
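/*
 * Illustrative example (editorial note, not in the original source):
 * IFTOVT() indexes iftovt_tab with the file type bits shifted down by
 * 12, so for a directory (S_IFDIR == 0040000) the index is 4 and
 * iftovt_tab[4] == VDIR; VTTOIF(VDIR) maps back via vttoif_tab[2] ==
 * S_IFDIR.
 */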
/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
 */
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
    &freevnodes, 0, "Number of \"free\" vnodes");

static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
    "Number of vnodes recycled to meet vnode cache targets");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW | CTLFLAG_STATS,
    &reassignbufcalls, 0, "Number of calls to reassignbuf");

static counter_u64_t free_owe_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
    "Number of times free vnodes kept on active list due to VFS "
    "owing inactivation");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata for filesystems
 * mounted on block devices is delayed only about half the time that
 * file data is delayed.  Similarly, directory updates are more critical,
 * so are only delayed about a third the time that file data is delayed.
 * Thus, there are SYNCER_MAXDELAY queues that are processed round-robin
 * at a rate of one each second (driven off the filesystem syncer process).
 * The syncer_delayno variable indicates the next queue that is to be
 * processed.  Items that need to be processed soon are placed in this
 * queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
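/*
 * Note (editorial): syncer_workitem_pending is allocated with hashinit()
 * in vntblinit(), so syncer_mask is a power of two minus one and the
 * "& syncer_mask" arithmetic above wraps around the ring correctly;
 * syncer_maxdelay is re-derived as syncer_mask + 1 for the same reason.
 */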
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/* Target for maximum number of vnodes. */
int desiredvnodes;
static int gapvnodes;		/* gap between wanted and desired */
static int vhiwat;		/* enough extras after expansion */
static int vlowat;		/* minimal extras before expansion */
static int vstir;		/* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */

static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
{
	int error, old_desiredvnodes;

	old_desiredvnodes = desiredvnodes;
	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
		return (error);
	if (old_desiredvnodes != desiredvnodes) {
		wantfreevnodes = desiredvnodes / 4;
		/* XXX locking seems to be incomplete. */
		vfs_hash_changesize(desiredvnodes);
		cache_changesize(desiredvnodes);
	}
	return (0);
}

SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
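/*
 * Example (editorial, hypothetical value): "sysctl kern.maxvnodes=200000"
 * invokes sysctl_update_desiredvnodes(), which resizes the VFS hash table
 * and the namecache and re-derives wantfreevnodes as one quarter of the
 * new target (50000 here).
 */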
static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	buf[req->newlen] = '\0';

	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the caller
		 * know that the sysctl had no effect.  Return EAGAIN because a
		 * subsequent call will likely succeed (since namei will create
		 * a new vnode if necessary)
		 */
		error = EAGAIN;
		goto putvnode;
	}

	counter_u64_add(recycles_count, 1);
	vgone(vp);
putvnode:
	NDFREE(&nd, 0);
out:
	free(buf, M_TEMP);
	return (error);
}

static int
sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct file *fp;
	int error;
	int fd;

	if (req->newptr == NULL)
		return (EBADF);

	error = sysctl_handle_int(oidp, &fd, 0, req);
	if (error != 0)
		return (error);
	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		goto drop;

	counter_u64_add(recycles_count, 1);
	vgone(vp);
	VOP_UNLOCK(vp);
drop:
	fdrop(fp, td);
	return (error);
}

SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_ftry_reclaim_vnode, "I",
    "Try to reclaim a vnode by its file descriptor");

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;

/*
 * Support for the bufobj clean & dirty pctrie.
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{

	return uma_zalloc(buf_trie_zone, M_NOWAIT);
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{

	uma_zfree(buf_trie_zone, node);
}
PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
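/*
 * Note (editorial): buf_trie_alloc() deliberately uses M_NOWAIT because
 * insertions happen in contexts that cannot sleep; vntblinit() below
 * pre-allocates one trie node per buf so that an insert can never fail
 * in practice, which reassignbuf() callers depend on.
 */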
/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
#endif

static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

static struct vnode *
vn_alloc_marker(struct mount *mp)
{
	struct vnode *vp;

	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	vp->v_type = VMARKER;
	vp->v_mount = mp;

	return (vp);
}

static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems may use bigger or smaller (like UFS and ZFS)
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define	NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ		148
#else
#define	NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ		92
#endif

static void
vntblinit(void *dummy __unused)
{
	u_int i;
	int physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the marginal ratio of desiredvnodes to the physical
	 * memory size is 1:64.  However, desiredvnodes is limited by the
	 * kernel's heap size.  The memory required by desiredvnodes vnodes
	 * and vm objects must not exceed 1/10th of the kernel's heap size.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we cannot fail an insert.  reassignbuf() callers cannot
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	free_owe_inact = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	/* Compute vnsz2log as floor(log2(sizeof(struct vnode))). */
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
 * lock of any vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs			var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	    |
 *	    +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  VOP_LOOKUP() obtains B while A is held
 *  vfs_busy() obtains a shared lock on F while A and B are held
 *  vput() releases lock on B
 *  vput() releases lock on A
 *  VFS_ROOT() obtains lock on D while shared lock on F is held
 *  vfs_unbusy() releases shared lock on F
 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *    Attempt to lock A (instead of vp_crossmp) while D is held would
 *    violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mp, ref, 1);
		vfs_mp_count_add_pcpu(mp, lockref, 1);
		vfs_op_thread_exit(mp);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.  If the thread doing the unmounting
	 * fails, it will clear MNTK_UNMOUNT flag before waking us up,
	 * indicating that this mount point has survived the unmount attempt
	 * and vfs_busy should retry.  Otherwise the unmounter thread will
	 * set MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
	 * that mount point is about to be really destroyed.  vfs_busy needs
	 * to release its reference on the mount point in this case and
	 * return with ENOENT, telling the caller that the mount it tried
	 * to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mp, lockref, 1);
		vfs_mp_count_sub_pcpu(mp, ref, 1);
		vfs_op_thread_exit(mp);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}
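/*
 * Note (editorial): vfs_getvfs() returns the mount point with an extra
 * reference from vfs_ref(); callers such as vfs_getnewfsid() below are
 * responsible for dropping it with vfs_rel() when done.
 */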
/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In the worst case we may
 * get a pointer to an unmounted or even a different filesystem, so we
 * have to check what we got, and take the slow path if so.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL ||
	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if (jailed(td->td_ucred)) {
		/*
		 * If the jail of the calling thread lacks permission for
		 * this type of file system, deny immediately.
		 */
		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
			return (EPERM);

		/*
		 * If the file system was mounted outside the jail of the
		 * calling thread, deny immediately.
		 */
		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
			return (EPERM);
	}

	/*
	 * If file system supports delegated administration, we don't check
	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
	 * by the file system itself.
	 * If this is not the user that did original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8
 * calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}
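/*
 * Illustrative layout (editorial, hypothetical values): the minor number
 * packed above carries the filesystem type in bits 24..31 and splits the
 * 16-bit mntid_base across bits 16..23 (high byte) and 0..7 (low byte).
 * For type 0x2a and mntid_base 0x1234 the minor is 0x2a120034, paired
 * with major 255 by makedev().
 */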
/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param mp		 Try to reclaim vnodes from this mountpoint
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true
 * @param trigger	 Only reclaim vnodes with fewer than this many resident
 *			 pages.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
{
	struct vnode *vp;
	int count, done, target;

	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize;
	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
	target = target / 10 + 1;
	while (count != 0 && done < target) {
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		/*
		 * XXX LRU is completely broken for non-free vnodes.  First
		 * by calling here in mountpoint order, then by moving
		 * unselected vnodes to the end here, and most grossly by
		 * removing the vlruvp() function that was supposed to
		 * maintain the order.  (This function was born broken
		 * since syncer problems prevented it doing anything.)  The
		 * order is closer to LRC (C = Created).
		 *
		 * LRU reclaiming of vnodes seems to have last worked in
		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
		 * Then there was no hold count, and inactive vnodes were
		 * simply put on the free list in LRU order.  The separate
		 * lists also break LRU.  We prefer to reclaim from the
		 * free list for technical reasons.  This tends to thrash
		 * the free list to keep very unrecently used held vnodes.
		 * The problem is mitigated by keeping the free list large.
		 */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    ((vp->v_iflag & VI_FREE) != 0) ||
		    VN_IS_DOOMED(vp) || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VIRF_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl(vp);
			goto next_iter_mntunlocked;
		}
		KASSERT(!VN_IS_DOOMED(vp),
		    ("VIRF_DOOMED unexpectedly detected in vlrureclaim()"));
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl(vp);
		done++;
next_iter_mntunlocked:
		if (!should_yield())
			goto relock_mnt;
		goto yield;
next_iter:
		if (!should_yield())
			continue;
		MNT_IUNLOCK(mp);
yield:
		kern_yield(PRI_USER);
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}

static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
    0, "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to reduce the free list by the requested amount.
 */
static void
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
	struct vnode *vp;
	struct mount *mp;
	bool tried_batches;

	tried_batches = false;
	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (count > max_vnlru_free)
		count = max_vnlru_free;
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (vp == NULL) {
			if (tried_batches)
				break;
			mtx_unlock(&vnode_free_list_mtx);
			vnlru_return_batches(mnt_op);
			tried_batches = true;
			mtx_lock(&vnode_free_list_mtx);
			continue;
		}

		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		KASSERT((vp->v_iflag & VI_FREE) != 0,
		    ("Removing vnode not on freelist"));
		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
		    ("Mangling active vnode"));
		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);

		/*
		 * Don't recycle if our vnode is from a different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 * Don't recycle if we can't get the interlock without
		 * blocking.
		 */
		if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
			continue;
		}
		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
		    vp, ("vp inconsistent on freelist"));

		/*
		 * The clear of VI_FREE prevents activation of the
		 * vnode.  There is no sense in putting the vnode on
		 * the mount point active list, only to remove it
		 * later during recycling.  Inline the relevant part
		 * of vholdl(), to avoid triggering assertions or
		 * activating.
		 */
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		VNODE_REFCOUNT_FENCE_REL();
		refcount_acquire(&vp->v_holdcnt);

		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		/*
		 * If the recycle succeeded this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}

void
vnlru_free(int count, struct vfsops *mnt_op)
{

	mtx_lock(&vnode_free_list_mtx);
	vnlru_free_locked(count, mnt_op);
	mtx_unlock(&vnode_free_list_mtx);
}


/* XXX some names and initialization are bad for limits and watermarks. */
static int
vspace(void)
{
	int space;

	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
	if (numvnodes > desiredvnodes)
		return (0);
	space = desiredvnodes - numvnodes;
	if (freevnodes > wantfreevnodes)
		space += freevnodes - wantfreevnodes;
	return (space);
}
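/*
 * Worked example (editorial, hypothetical numbers): with desiredvnodes
 * = 100000 and wantfreevnodes = 25000, gapvnodes = 75000, vhiwat ~ 6818
 * (about 9%) and vlowat ~ 3409.  vspace() then reports how much headroom
 * is left before the cache is considered full.
 */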
static void
vnlru_return_batch_locked(struct mount *mp)
{
	struct vnode *vp;

	mtx_assert(&mp->mnt_listmtx, MA_OWNED);

	if (mp->mnt_tmpfreevnodelistsize == 0)
		return;

	TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
		VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
		    ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
	}
	mtx_lock(&vnode_free_list_mtx);
	TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
	freevnodes += mp->mnt_tmpfreevnodelistsize;
	mtx_unlock(&vnode_free_list_mtx);
	mp->mnt_tmpfreevnodelistsize = 0;
}

static void
vnlru_return_batch(struct mount *mp)
{

	mtx_lock(&mp->mnt_listmtx);
	vnlru_return_batch_locked(mp);
	mtx_unlock(&mp->mnt_listmtx);
}

static void
vnlru_return_batches(struct vfsops *mnt_op)
{
	struct mount *mp, *nmp;
	bool need_unbusy;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		need_unbusy = false;
		if (mnt_op != NULL && mp->mnt_op != mnt_op)
			goto next;
		if (mp->mnt_tmpfreevnodelistsize == 0)
			goto next;
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
			vnlru_return_batch(mp);
			need_unbusy = true;
			mtx_lock(&mountlist_mtx);
		}
next:
		nmp = TAILQ_NEXT(mp, mnt_list);
		if (need_unbusy)
			vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes, vsp;
	bool reclaim_nc_src;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_free_list_mtx);
		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
		 */
		if (numvnodes > desiredvnodes)
			vnlru_free_locked(numvnodes - desiredvnodes, NULL);
		/*
		 * Sleep if the vnode cache is in a good state.  This is
		 * when it is not over-full and has space for about a 4%
		 * or 9% expansion (by growing its size or moderately
		 * reducing its free list).  Otherwise, try to reclaim
		 * space for a 10% expansion.
		 */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		vsp = vspace();
		if (vsp >= vlowat && force == 0) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		onumvnodes = numvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (numvnodes <= desiredvnodes)
			usevnodes = numvnodes - freevnodes;
		else
			usevnodes = numvnodes;
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to give a conservatively
		 * large value to ensure that it alone doesn't prevent
		 * making progress.  The value can easily be so large that
		 * it is effectively infinite in some congested and
		 * misconfigured cases, and this is necessary.  Normally
		 * it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp, reclaim_nc_src, trigger);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim(UMA_RECLAIM_DRAIN);
		if (done == 0) {
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			kern_yield(PRI_USER);
		/*
		 * After becoming active to expand above low water, keep
		 * active until above high water.
		 */
		vsp = vspace();
		force = vsp < vhiwat;
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);
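/*
 * Summary of the escalation in vnlru_proc() above (editorial): force ==
 * 0 is normal operation; force == 1 is a stir requested via vstir;
 * force >= 2 switches from vsmalltrigger to the computed page trigger;
 * and force == 3 additionally reclaims directories that still have
 * namecache source entries (reclaim_nc_src).
 */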
/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp);
		VI_UNLOCK(vp);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if (!VN_IS_DOOMED(vp)) {
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
	}
	VOP_UNLOCK(vp);
	VI_UNLOCK(vp);
	vn_finished_write(vnmp);
	return (0);
}

static void
vcheckspace(void)
{
	int vsp;

	vsp = vspace();
	if (vsp < vlowat && vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		wakeup(vnlruproc);
	}
}

/*
 * Wait if necessary for space for a new vnode.
 */
static int
getnewvnode_wait(int suspended)
{

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (numvnodes >= desiredvnodes) {
		if (suspended) {
			/*
			 * The file system is being suspended.  We cannot
			 * risk a deadlock here, so allow allocation of
			 * another vnode even if this would give too many.
			 */
			return (0);
		}
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
	}
	/* Post-adjust like the pre-adjust in getnewvnode(). */
	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
		vnlru_free_locked(1, NULL);
	return (numvnodes >= desiredvnodes ? ENFILE : 0);
}

/*
 * This hack is fragile, and probably not needed any more now that the
 * watermark handling works.
 */
void
getnewvnode_reserve(u_int count)
{
	struct thread *td;

	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
	/* XXX no longer so quick, but this part is not racy. */
	mtx_lock(&vnode_free_list_mtx);
	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
		vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes,
		    freevnodes - wantfreevnodes), NULL);
	mtx_unlock(&vnode_free_list_mtx);

	td = curthread;
	/* First try to be quick and racy. */
	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
		td->td_vp_reserv += count;
		vcheckspace();	/* XXX no longer so quick, but more racy */
		return;
	} else
		atomic_subtract_long(&numvnodes, count);

	mtx_lock(&vnode_free_list_mtx);
	while (count > 0) {
		if (getnewvnode_wait(0) == 0) {
			count--;
			td->td_vp_reserv++;
			atomic_add_long(&numvnodes, 1);
		}
	}
	vcheckspace();
	mtx_unlock(&vnode_free_list_mtx);
}
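/*
 * Typical use (editorial sketch): a filesystem that must allocate
 * several vnodes while holding locks calls getnewvnode_reserve(n) up
 * front, performs its getnewvnode() calls (each consumes one unit of
 * td_vp_reserv), and finally calls getnewvnode_drop_reserve() to return
 * whatever remains unused.
 */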
/*
 * This hack is fragile, especially if desiredvnodes or wantvnodes are
 * misconfigured or changed significantly.  Reducing desiredvnodes below
 * the reserved amount should cause bizarre behaviour like reducing it
 * below the number of active vnodes -- the system will try to reduce
 * numvnodes to match, but should fail, so the subtraction below should
 * not overflow.
 */
void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
	td->td_vp_reserv = 0;
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;
	static int cyclecount;
	int error __unused;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

	KASSERT(vops->registered,
	    ("%s: not registered vector op %p\n", __func__, vops));

	vp = NULL;
	td = curthread;
	if (td->td_vp_reserv > 0) {
		td->td_vp_reserv -= 1;
		goto alloc;
	}
	mtx_lock(&vnode_free_list_mtx);
	if (numvnodes < desiredvnodes)
		cyclecount = 0;
	else if (cyclecount++ >= freevnodes) {
		cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (numvnodes + 1 <= desiredvnodes)
		;
	else if (freevnodes > 0)
		vnlru_free_locked(1, NULL);
	else {
		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
		    MNTK_SUSPEND));
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (error != 0) {
			mtx_unlock(&vnode_free_list_mtx);
			return (error);
		}
#endif
	}
	vcheckspace();
	atomic_add_long(&numvnodes, 1);
	mtx_unlock(&vnode_free_list_mtx);
alloc:
	counter_u64_add(vnodes_created, 1);
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name.  Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another.  We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 */
	lo = &vp->v_vnlock->lock_object;
	if (lo->lo_name != tag) {
		lo->lo_name = tag;
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
	 */
	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	v_init_counters(vp);
	vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
	if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
#endif
	if (mp != NULL) {
		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}

static void
freevnode(struct vnode *vp)
{
	struct bufobj *bo;

	/*
	 * The vnode has been marked for destruction, so free it.
	 *
	 * The vnode will be returned to the zone where it will
	 * normally remain until it is needed for another vnode.  We
	 * need to clean up (or verify that the cleanup has already
	 * been done) any residual data left from its current use
	 * so as not to contaminate the freshly allocated vnode.
	 */
	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
	atomic_subtract_long(&numvnodes, 1);
	bo = &vp->v_bufobj;
	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
	    ("cleaned vnode still on the free list."));
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
	    ("clean blk trie not empty"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
	    ("dirty blk trie not empty"));
	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
	    ("Dangling rangelock waiters"));
	VI_UNLOCK(vp);
#ifdef MAC
	mac_vnode_destroy(vp);
#endif
	if (vp->v_pollinfo != NULL) {
		destroy_vpollinfo(vp->v_pollinfo);
		vp->v_pollinfo = NULL;
	}
#ifdef INVARIANTS
	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
	vp->v_op = NULL;
#endif
	vp->v_mountedhere = NULL;
	vp->v_unpcb = NULL;
	vp->v_rdev = NULL;
	vp->v_fifoinfo = NULL;
	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	vp->v_irflag = 0;
	vp->v_iflag = 0;
	vp->v_vflag = 0;
	bo->bo_flag = 0;
	uma_zfree(vnode_zone, vp);
}

/*
 * Delete from old mount point vnode list, if on one.
1752 */ 1753 static void 1754 delmntque(struct vnode *vp) 1755 { 1756 struct mount *mp; 1757 1758 mp = vp->v_mount; 1759 if (mp == NULL) 1760 return; 1761 MNT_ILOCK(mp); 1762 VI_LOCK(vp); 1763 KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, 1764 ("Active vnode list size %d > Vnode list size %d", 1765 mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); 1766 if (vp->v_iflag & VI_ACTIVE) { 1767 vp->v_iflag &= ~VI_ACTIVE; 1768 mtx_lock(&mp->mnt_listmtx); 1769 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 1770 mp->mnt_activevnodelistsize--; 1771 mtx_unlock(&mp->mnt_listmtx); 1772 } 1773 vp->v_mount = NULL; 1774 VI_UNLOCK(vp); 1775 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1776 ("bad mount point vnode list size")); 1777 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1778 mp->mnt_nvnodelistsize--; 1779 MNT_REL(mp); 1780 MNT_IUNLOCK(mp); 1781 } 1782 1783 static void 1784 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1785 { 1786 1787 vp->v_data = NULL; 1788 vp->v_op = &dead_vnodeops; 1789 vgone(vp); 1790 vput(vp); 1791 } 1792 1793 /* 1794 * Insert into list of vnodes for the new mount point, if available. 1795 */ 1796 int 1797 insmntque1(struct vnode *vp, struct mount *mp, 1798 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1799 { 1800 1801 KASSERT(vp->v_mount == NULL, 1802 ("insmntque: vnode already on per mount vnode list")); 1803 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1804 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1805 1806 /* 1807 * We acquire the vnode interlock early to ensure that the 1808 * vnode cannot be recycled by another process releasing a 1809 * holdcnt on it before we get it on both the vnode list 1810 * and the active vnode list. The mount mutex protects only 1811 * manipulation of the vnode list and the vnode freelist 1812 * mutex protects only manipulation of the active vnode list. 1813 * Hence the need to hold the vnode interlock throughout. 1814 */ 1815 MNT_ILOCK(mp); 1816 VI_LOCK(vp); 1817 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 1818 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1819 mp->mnt_nvnodelistsize == 0)) && 1820 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1821 VI_UNLOCK(vp); 1822 MNT_IUNLOCK(mp); 1823 if (dtr != NULL) 1824 dtr(vp, dtr_arg); 1825 return (EBUSY); 1826 } 1827 vp->v_mount = mp; 1828 MNT_REF(mp); 1829 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1830 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1831 ("neg mount point vnode list size")); 1832 mp->mnt_nvnodelistsize++; 1833 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1834 ("Activating already active vnode")); 1835 vp->v_iflag |= VI_ACTIVE; 1836 mtx_lock(&mp->mnt_listmtx); 1837 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1838 mp->mnt_activevnodelistsize++; 1839 mtx_unlock(&mp->mnt_listmtx); 1840 VI_UNLOCK(vp); 1841 MNT_IUNLOCK(mp); 1842 return (0); 1843 } 1844 1845 int 1846 insmntque(struct vnode *vp, struct mount *mp) 1847 { 1848 1849 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1850 } 1851 1852 /* 1853 * Flush out and invalidate all buffers associated with a bufobj 1854 * Called with the underlying object locked. 
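 *
 * A hedged usage sketch (mirroring the vgonel() pattern later in
 * this file): flush with V_SAVE so dirty buffers are written first,
 * and fall back to discarding everything if that fails:
 *
 *	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
 *		while (vinvalbuf(vp, 0, 0, 0) != 0)
 *			;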
1855 */ 1856 int 1857 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1858 { 1859 int error; 1860 1861 BO_LOCK(bo); 1862 if (flags & V_SAVE) { 1863 error = bufobj_wwait(bo, slpflag, slptimeo); 1864 if (error) { 1865 BO_UNLOCK(bo); 1866 return (error); 1867 } 1868 if (bo->bo_dirty.bv_cnt > 0) { 1869 BO_UNLOCK(bo); 1870 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1871 return (error); 1872 /* 1873 * XXX We could save a lock/unlock if this was only 1874 * enabled under INVARIANTS 1875 */ 1876 BO_LOCK(bo); 1877 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1878 panic("vinvalbuf: dirty bufs"); 1879 } 1880 } 1881 /* 1882 * If you alter this loop please notice that interlock is dropped and 1883 * reacquired in flushbuflist. Special care is needed to ensure that 1884 * no race conditions occur from this. 1885 */ 1886 do { 1887 error = flushbuflist(&bo->bo_clean, 1888 flags, bo, slpflag, slptimeo); 1889 if (error == 0 && !(flags & V_CLEANONLY)) 1890 error = flushbuflist(&bo->bo_dirty, 1891 flags, bo, slpflag, slptimeo); 1892 if (error != 0 && error != EAGAIN) { 1893 BO_UNLOCK(bo); 1894 return (error); 1895 } 1896 } while (error != 0); 1897 1898 /* 1899 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1900 * have write I/O in-progress but if there is a VM object then the 1901 * VM object can also have read-I/O in-progress. 1902 */ 1903 do { 1904 bufobj_wwait(bo, 0, 0); 1905 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 1906 BO_UNLOCK(bo); 1907 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 1908 BO_LOCK(bo); 1909 } 1910 } while (bo->bo_numoutput > 0); 1911 BO_UNLOCK(bo); 1912 1913 /* 1914 * Destroy the copy in the VM cache, too. 1915 */ 1916 if (bo->bo_object != NULL && 1917 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 1918 VM_OBJECT_WLOCK(bo->bo_object); 1919 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1920 OBJPR_CLEANONLY : 0); 1921 VM_OBJECT_WUNLOCK(bo->bo_object); 1922 } 1923 1924 #ifdef INVARIANTS 1925 BO_LOCK(bo); 1926 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 1927 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 1928 bo->bo_clean.bv_cnt > 0)) 1929 panic("vinvalbuf: flush failed"); 1930 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 1931 bo->bo_dirty.bv_cnt > 0) 1932 panic("vinvalbuf: flush dirty failed"); 1933 BO_UNLOCK(bo); 1934 #endif 1935 return (0); 1936 } 1937 1938 /* 1939 * Flush out and invalidate all buffers associated with a vnode. 1940 * Called with the underlying object locked. 1941 */ 1942 int 1943 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1944 { 1945 1946 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1947 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1948 if (vp->v_object != NULL && vp->v_object->handle != vp) 1949 return (0); 1950 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1951 } 1952 1953 /* 1954 * Flush out buffers on the specified list. 1955 * 1956 */ 1957 static int 1958 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1959 int slptimeo) 1960 { 1961 struct buf *bp, *nbp; 1962 int retval, error; 1963 daddr_t lblkno; 1964 b_xflags_t xflags; 1965 1966 ASSERT_BO_WLOCKED(bo); 1967 1968 retval = 0; 1969 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1970 /* 1971 * If we are flushing both V_NORMAL and V_ALT buffers then 1972 * do not skip any buffers. If we are flushing only V_NORMAL 1973 * buffers then skip buffers marked as BX_ALTDATA. 
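		 * (For illustration: UFS2 is one user of BX_ALTDATA,
		 * marking its extended-attribute buffers with it.)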
If we are 1974 * flushing only V_ALT buffers then skip buffers not marked 1975 * as BX_ALTDATA. 1976 */ 1977 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 1978 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 1979 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 1980 continue; 1981 } 1982 if (nbp != NULL) { 1983 lblkno = nbp->b_lblkno; 1984 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 1985 } 1986 retval = EAGAIN; 1987 error = BUF_TIMELOCK(bp, 1988 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 1989 "flushbuf", slpflag, slptimeo); 1990 if (error) { 1991 BO_LOCK(bo); 1992 return (error != ENOLCK ? error : EAGAIN); 1993 } 1994 KASSERT(bp->b_bufobj == bo, 1995 ("bp %p wrong b_bufobj %p should be %p", 1996 bp, bp->b_bufobj, bo)); 1997 /* 1998 * XXX Since there are no node locks for NFS, I 1999 * believe there is a slight chance that a delayed 2000 * write will occur while sleeping just above, so 2001 * check for it. 2002 */ 2003 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2004 (flags & V_SAVE)) { 2005 bremfree(bp); 2006 bp->b_flags |= B_ASYNC; 2007 bwrite(bp); 2008 BO_LOCK(bo); 2009 return (EAGAIN); /* XXX: why not loop ? */ 2010 } 2011 bremfree(bp); 2012 bp->b_flags |= (B_INVAL | B_RELBUF); 2013 bp->b_flags &= ~B_ASYNC; 2014 brelse(bp); 2015 BO_LOCK(bo); 2016 if (nbp == NULL) 2017 break; 2018 nbp = gbincore(bo, lblkno); 2019 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2020 != xflags) 2021 break; /* nbp invalid */ 2022 } 2023 return (retval); 2024 } 2025 2026 int 2027 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2028 { 2029 struct buf *bp; 2030 int error; 2031 daddr_t lblkno; 2032 2033 ASSERT_BO_LOCKED(bo); 2034 2035 for (lblkno = startn;;) { 2036 again: 2037 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2038 if (bp == NULL || bp->b_lblkno >= endn || 2039 bp->b_lblkno < startn) 2040 break; 2041 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2042 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2043 if (error != 0) { 2044 BO_RLOCK(bo); 2045 if (error == ENOLCK) 2046 goto again; 2047 return (error); 2048 } 2049 KASSERT(bp->b_bufobj == bo, 2050 ("bp %p wrong b_bufobj %p should be %p", 2051 bp, bp->b_bufobj, bo)); 2052 lblkno = bp->b_lblkno + 1; 2053 if ((bp->b_flags & B_MANAGED) == 0) 2054 bremfree(bp); 2055 bp->b_flags |= B_RELBUF; 2056 /* 2057 * In the VMIO case, use the B_NOREUSE flag to hint that the 2058 * pages backing each buffer in the range are unlikely to be 2059 * reused. Dirty buffers will have the hint applied once 2060 * they've been written. 2061 */ 2062 if ((bp->b_flags & B_VMIO) != 0) 2063 bp->b_flags |= B_NOREUSE; 2064 brelse(bp); 2065 BO_RLOCK(bo); 2066 } 2067 return (0); 2068 } 2069 2070 /* 2071 * Truncate a file's buffer and pages to a specified length. This 2072 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2073 * sync activity. 2074 */ 2075 int 2076 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2077 { 2078 struct buf *bp, *nbp; 2079 struct bufobj *bo; 2080 daddr_t startlbn; 2081 2082 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2083 vp, blksize, (uintmax_t)length); 2084 2085 /* 2086 * Round up to the *next* lbn. 
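 * For example (illustrative numbers): with blksize 4096, length 5000
 * gives startlbn = howmany(5000, 4096) = 2, so the partially valid
 * block 1 survives and invalidation starts at lbn 2; length 8192
 * also yields startlbn 2, keeping exactly two full blocks.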
2087 */ 2088 startlbn = howmany(length, blksize); 2089 2090 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2091 2092 bo = &vp->v_bufobj; 2093 restart_unlocked: 2094 BO_LOCK(bo); 2095 2096 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2097 ; 2098 2099 if (length > 0) { 2100 restartsync: 2101 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2102 if (bp->b_lblkno > 0) 2103 continue; 2104 /* 2105 * Since we hold the vnode lock this should only 2106 * fail if we're racing with the buf daemon. 2107 */ 2108 if (BUF_LOCK(bp, 2109 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2110 BO_LOCKPTR(bo)) == ENOLCK) 2111 goto restart_unlocked; 2112 2113 VNASSERT((bp->b_flags & B_DELWRI), vp, 2114 ("buf(%p) on dirty queue without DELWRI", bp)); 2115 2116 bremfree(bp); 2117 bawrite(bp); 2118 BO_LOCK(bo); 2119 goto restartsync; 2120 } 2121 } 2122 2123 bufobj_wwait(bo, 0, 0); 2124 BO_UNLOCK(bo); 2125 vnode_pager_setsize(vp, length); 2126 2127 return (0); 2128 } 2129 2130 /* 2131 * Invalidate the cached pages of a file's buffer within the range of block 2132 * numbers [startlbn, endlbn). 2133 */ 2134 void 2135 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2136 int blksize) 2137 { 2138 struct bufobj *bo; 2139 off_t start, end; 2140 2141 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2142 2143 start = blksize * startlbn; 2144 end = blksize * endlbn; 2145 2146 bo = &vp->v_bufobj; 2147 BO_LOCK(bo); 2148 MPASS(blksize == bo->bo_bsize); 2149 2150 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2151 ; 2152 2153 BO_UNLOCK(bo); 2154 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2155 } 2156 2157 static int 2158 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2159 daddr_t startlbn, daddr_t endlbn) 2160 { 2161 struct buf *bp, *nbp; 2162 bool anyfreed; 2163 2164 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2165 ASSERT_BO_LOCKED(bo); 2166 2167 do { 2168 anyfreed = false; 2169 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2170 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2171 continue; 2172 if (BUF_LOCK(bp, 2173 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2174 BO_LOCKPTR(bo)) == ENOLCK) { 2175 BO_LOCK(bo); 2176 return (EAGAIN); 2177 } 2178 2179 bremfree(bp); 2180 bp->b_flags |= B_INVAL | B_RELBUF; 2181 bp->b_flags &= ~B_ASYNC; 2182 brelse(bp); 2183 anyfreed = true; 2184 2185 BO_LOCK(bo); 2186 if (nbp != NULL && 2187 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2188 nbp->b_vp != vp || 2189 (nbp->b_flags & B_DELWRI) != 0)) 2190 return (EAGAIN); 2191 } 2192 2193 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2194 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2195 continue; 2196 if (BUF_LOCK(bp, 2197 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2198 BO_LOCKPTR(bo)) == ENOLCK) { 2199 BO_LOCK(bo); 2200 return (EAGAIN); 2201 } 2202 bremfree(bp); 2203 bp->b_flags |= B_INVAL | B_RELBUF; 2204 bp->b_flags &= ~B_ASYNC; 2205 brelse(bp); 2206 anyfreed = true; 2207 2208 BO_LOCK(bo); 2209 if (nbp != NULL && 2210 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2211 (nbp->b_vp != vp) || 2212 (nbp->b_flags & B_DELWRI) == 0)) 2213 return (EAGAIN); 2214 } 2215 } while (anyfreed); 2216 return (0); 2217 } 2218 2219 static void 2220 buf_vlist_remove(struct buf *bp) 2221 { 2222 struct bufv *bv; 2223 2224 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2225 ASSERT_BO_WLOCKED(bp->b_bufobj); 2226 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 2227 (BX_VNDIRTY|BX_VNCLEAN), 2228 ("buf_vlist_remove: 
Buf %p is on two lists", bp)); 2229 if (bp->b_xflags & BX_VNDIRTY) 2230 bv = &bp->b_bufobj->bo_dirty; 2231 else 2232 bv = &bp->b_bufobj->bo_clean; 2233 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2234 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2235 bv->bv_cnt--; 2236 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2237 } 2238 2239 /* 2240 * Add the buffer to the sorted clean or dirty block list. 2241 * 2242 * NOTE: xflags is passed as a constant, optimizing this inline function! 2243 */ 2244 static void 2245 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2246 { 2247 struct bufv *bv; 2248 struct buf *n; 2249 int error; 2250 2251 ASSERT_BO_WLOCKED(bo); 2252 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2253 ("dead bo %p", bo)); 2254 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2255 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2256 bp->b_xflags |= xflags; 2257 if (xflags & BX_VNDIRTY) 2258 bv = &bo->bo_dirty; 2259 else 2260 bv = &bo->bo_clean; 2261 2262 /* 2263 * Keep the list ordered. Optimize empty list insertion. Assume 2264 * we tend to grow at the tail so lookup_le should usually be cheaper 2265 * than _ge. 2266 */ 2267 if (bv->bv_cnt == 0 || 2268 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2269 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2270 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2271 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2272 else 2273 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2274 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2275 if (error) 2276 panic("buf_vlist_add: Preallocated nodes insufficient."); 2277 bv->bv_cnt++; 2278 } 2279 2280 /* 2281 * Look up a buffer using the buffer tries. 2282 */ 2283 struct buf * 2284 gbincore(struct bufobj *bo, daddr_t lblkno) 2285 { 2286 struct buf *bp; 2287 2288 ASSERT_BO_LOCKED(bo); 2289 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2290 if (bp != NULL) 2291 return (bp); 2292 return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); 2293 } 2294 2295 /* 2296 * Associate a buffer with a vnode. 2297 */ 2298 void 2299 bgetvp(struct vnode *vp, struct buf *bp) 2300 { 2301 struct bufobj *bo; 2302 2303 bo = &vp->v_bufobj; 2304 ASSERT_BO_WLOCKED(bo); 2305 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2306 2307 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2308 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2309 ("bgetvp: bp already attached! %p", bp)); 2310 2311 vhold(vp); 2312 bp->b_vp = vp; 2313 bp->b_bufobj = bo; 2314 /* 2315 * Insert onto list for new vnode. 2316 */ 2317 buf_vlist_add(bp, bo, BX_VNCLEAN); 2318 } 2319 2320 /* 2321 * Disassociate a buffer from a vnode. 2322 */ 2323 void 2324 brelvp(struct buf *bp) 2325 { 2326 struct bufobj *bo; 2327 struct vnode *vp; 2328 2329 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2330 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2331 2332 /* 2333 * Delete from old vnode list, if on one. 
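	 * This is the counterpart of bgetvp() above (a summary of this
	 * file's own pairing):
	 *
	 *	bgetvp(vp, bp);		-- vhold(vp), bp added to bo_clean
	 *	...
	 *	brelvp(bp);		-- bp removed from its list, vdrop(vp)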
2334 */ 2335 vp = bp->b_vp; /* XXX */ 2336 bo = bp->b_bufobj; 2337 BO_LOCK(bo); 2338 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2339 buf_vlist_remove(bp); 2340 else 2341 panic("brelvp: Buffer %p not on queue.", bp); 2342 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2343 bo->bo_flag &= ~BO_ONWORKLST; 2344 mtx_lock(&sync_mtx); 2345 LIST_REMOVE(bo, bo_synclist); 2346 syncer_worklist_len--; 2347 mtx_unlock(&sync_mtx); 2348 } 2349 bp->b_vp = NULL; 2350 bp->b_bufobj = NULL; 2351 BO_UNLOCK(bo); 2352 vdrop(vp); 2353 } 2354 2355 /* 2356 * Add an item to the syncer work queue. 2357 */ 2358 static void 2359 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2360 { 2361 int slot; 2362 2363 ASSERT_BO_WLOCKED(bo); 2364 2365 mtx_lock(&sync_mtx); 2366 if (bo->bo_flag & BO_ONWORKLST) 2367 LIST_REMOVE(bo, bo_synclist); 2368 else { 2369 bo->bo_flag |= BO_ONWORKLST; 2370 syncer_worklist_len++; 2371 } 2372 2373 if (delay > syncer_maxdelay - 2) 2374 delay = syncer_maxdelay - 2; 2375 slot = (syncer_delayno + delay) & syncer_mask; 2376 2377 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2378 mtx_unlock(&sync_mtx); 2379 } 2380 2381 static int 2382 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2383 { 2384 int error, len; 2385 2386 mtx_lock(&sync_mtx); 2387 len = syncer_worklist_len - sync_vnode_count; 2388 mtx_unlock(&sync_mtx); 2389 error = SYSCTL_OUT(req, &len, sizeof(len)); 2390 return (error); 2391 } 2392 2393 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 2394 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2395 2396 static struct proc *updateproc; 2397 static void sched_sync(void); 2398 static struct kproc_desc up_kp = { 2399 "syncer", 2400 sched_sync, 2401 &updateproc 2402 }; 2403 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2404 2405 static int 2406 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2407 { 2408 struct vnode *vp; 2409 struct mount *mp; 2410 2411 *bo = LIST_FIRST(slp); 2412 if (*bo == NULL) 2413 return (0); 2414 vp = bo2vnode(*bo); 2415 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2416 return (1); 2417 /* 2418 * We use vhold in case the vnode does not 2419 * successfully sync. vhold prevents the vnode from 2420 * going away when we unlock the sync_mtx so that 2421 * we can acquire the vnode interlock. 2422 */ 2423 vholdl(vp); 2424 mtx_unlock(&sync_mtx); 2425 VI_UNLOCK(vp); 2426 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2427 vdrop(vp); 2428 mtx_lock(&sync_mtx); 2429 return (*bo == LIST_FIRST(slp)); 2430 } 2431 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2432 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2433 VOP_UNLOCK(vp); 2434 vn_finished_write(mp); 2435 BO_LOCK(*bo); 2436 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2437 /* 2438 * Put us back on the worklist. The worklist 2439 * routine will remove us from our current 2440 * position and then add us back in at a later 2441 * position. 2442 */ 2443 vn_syncer_add_to_worklist(*bo, syncdelay); 2444 } 2445 BO_UNLOCK(*bo); 2446 vdrop(vp); 2447 mtx_lock(&sync_mtx); 2448 return (0); 2449 } 2450 2451 static int first_printf = 1; 2452 2453 /* 2454 * System filesystem synchronizer daemon. 
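 *
 * The daemon works off a wheel of syncer_maxdelay slots. A bufobj
 * that is due for writeback in "delay" seconds is hashed (after
 * clamping delay below syncer_maxdelay) into
 *
 *	slot = (syncer_delayno + delay) & syncer_mask;
 *
 * by vn_syncer_add_to_worklist() above; once per second the loop
 * below advances syncer_delayno and flushes that slot's entries.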
 */
static void
sched_sync(void)
{
	struct synclist *next, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining... ");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired. Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it was
			 * last emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}

			if (first_printf == 0) {
				/*
				 * Drop the sync mutex, because some watchdog
				 * drivers need to sleep while patting the
				 * watchdog.
				 */
				mtx_unlock(&sync_mtx);
				wdog_kern_pat(WD_LASTVAL);
				mtx_lock(&sync_mtx);
			}
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
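		 *
		 * For example (an illustrative sequence): three
		 * speedup_syncer() calls within one second leave rushjob
		 * at 3, so the check below makes three consecutive extra
		 * passes, draining roughly three seconds of the wheel
		 * before the daemon sleeps again.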
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING ||
		    time_uptime == starttime) {
			thread_lock(td);
			sched_prio(td, PPAUSE);
			thread_unlock(td);
		}
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer(void)
{
	int ret = 0;

	mtx_lock(&sync_mtx);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		ret = 1;
	}
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	return (ret);
}

/*
 * Tell the syncer to speed up its work and run through its work
 * list several times, then tell it to shut down.
 */
static void
syncer_shutdown(void *arg, int howto)
{

	if (howto & RB_NOSYNC)
		return;
	mtx_lock(&sync_mtx);
	syncer_state = SYNCER_SHUTTING_DOWN;
	rushjob = 0;
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	kproc_shutdown(arg, howto);
}

void
syncer_suspend(void)
{

	syncer_shutdown(updateproc, 0);
}

void
syncer_resume(void)
{

	mtx_lock(&sync_mtx);
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	kproc_resume(updateproc);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(struct buf *bp)
{
	struct vnode *vp;
	struct bufobj *bo;
	int delay;
#ifdef INVARIANTS
	struct bufv *bv;
#endif

	vp = bp->b_vp;
	bo = bp->b_bufobj;
	++reassignbufcalls;

	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	/*
	 * Delete from old vnode list, if on one.
	 */
	BO_LOCK(bo);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	else
		panic("reassignbuf: Buffer %p not on queue.", bp);
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
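	 * The syncer delay chosen below is staggered by vnode type
	 * (dirdelay, metadelay and filedelay are the tunables involved):
	 * directory data, device metadata, and ordinary file data each
	 * get their own writeback horizon.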
2691 */ 2692 if (bp->b_flags & B_DELWRI) { 2693 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2694 switch (vp->v_type) { 2695 case VDIR: 2696 delay = dirdelay; 2697 break; 2698 case VCHR: 2699 delay = metadelay; 2700 break; 2701 default: 2702 delay = filedelay; 2703 } 2704 vn_syncer_add_to_worklist(bo, delay); 2705 } 2706 buf_vlist_add(bp, bo, BX_VNDIRTY); 2707 } else { 2708 buf_vlist_add(bp, bo, BX_VNCLEAN); 2709 2710 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2711 mtx_lock(&sync_mtx); 2712 LIST_REMOVE(bo, bo_synclist); 2713 syncer_worklist_len--; 2714 mtx_unlock(&sync_mtx); 2715 bo->bo_flag &= ~BO_ONWORKLST; 2716 } 2717 } 2718 #ifdef INVARIANTS 2719 bv = &bo->bo_clean; 2720 bp = TAILQ_FIRST(&bv->bv_hd); 2721 KASSERT(bp == NULL || bp->b_bufobj == bo, 2722 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2723 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2724 KASSERT(bp == NULL || bp->b_bufobj == bo, 2725 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2726 bv = &bo->bo_dirty; 2727 bp = TAILQ_FIRST(&bv->bv_hd); 2728 KASSERT(bp == NULL || bp->b_bufobj == bo, 2729 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2730 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2731 KASSERT(bp == NULL || bp->b_bufobj == bo, 2732 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2733 #endif 2734 BO_UNLOCK(bo); 2735 } 2736 2737 static void 2738 v_init_counters(struct vnode *vp) 2739 { 2740 2741 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2742 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2743 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2744 2745 refcount_init(&vp->v_holdcnt, 1); 2746 refcount_init(&vp->v_usecount, 1); 2747 } 2748 2749 /* 2750 * Increment si_usecount of the associated device, if any. 2751 */ 2752 static void 2753 v_incr_devcount(struct vnode *vp) 2754 { 2755 2756 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2757 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2758 dev_lock(); 2759 vp->v_rdev->si_usecount++; 2760 dev_unlock(); 2761 } 2762 } 2763 2764 /* 2765 * Decrement si_usecount of the associated device, if any. 2766 */ 2767 static void 2768 v_decr_devcount(struct vnode *vp) 2769 { 2770 2771 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2772 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2773 dev_lock(); 2774 vp->v_rdev->si_usecount--; 2775 dev_unlock(); 2776 } 2777 } 2778 2779 /* 2780 * Grab a particular vnode from the free list, increment its 2781 * reference count and lock it. VIRF_DOOMED is set if the vnode 2782 * is being destroyed. Only callers who specify LK_RETRY will 2783 * see doomed vnodes. If inactive processing was delayed in 2784 * vput try to do it here. 2785 * 2786 * Both holdcnt and usecount can be manipulated using atomics without holding 2787 * any locks except in these cases which require the vnode interlock: 2788 * holdcnt: 1->0 and 0->1 2789 * usecount: 0->1 2790 * 2791 * usecount is permitted to transition 1->0 without the interlock because 2792 * vnode is kept live by holdcnt. 
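 *
 * A hedged sketch of the intended two-step pattern (the lock that
 * made the lookup of vp stable is hypothetical context):
 *
 *	vs = vget_prep(vp);		-- usecount or holdcnt bump
 *	-- drop the lock that protected the lookup of vp
 *	error = vget_finish(vp, LK_EXCLUSIVE, vs);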
 */
static enum vgetstate __always_inline
_vget_prep(struct vnode *vp, bool interlock)
{
	enum vgetstate vs;

	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
		vs = VGET_USECOUNT;
	} else {
		if (interlock)
			vholdl(vp);
		else
			vhold(vp);
		vs = VGET_HOLDCNT;
	}
	return (vs);
}

enum vgetstate
vget_prep(struct vnode *vp)
{

	return (_vget_prep(vp, false));
}

int
vget(struct vnode *vp, int flags, struct thread *td)
{
	enum vgetstate vs;

	MPASS(td == curthread);

	vs = _vget_prep(vp, (flags & LK_INTERLOCK) != 0);
	return (vget_finish(vp, flags, vs));
}

int
vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
{
	int error, oweinact;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("%s: invalid lock operation", __func__));

	if ((flags & LK_INTERLOCK) != 0)
		ASSERT_VI_LOCKED(vp, __func__);
	else
		ASSERT_VI_UNLOCKED(vp, __func__);
	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
	if (vs == VGET_USECOUNT) {
		VNASSERT(vp->v_usecount > 0, vp,
		    ("%s: vnode without usecount when VGET_USECOUNT was passed",
		    __func__));
	}

	if ((error = vn_lock(vp, flags)) != 0) {
		if (vs == VGET_USECOUNT)
			vrele(vp);
		else
			vdrop(vp);
		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
		    vp);
		return (error);
	}

	if (vs == VGET_USECOUNT) {
		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
		    ("%s: vnode with usecount and VI_OWEINACT set", __func__));
		return (0);
	}

	/*
	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
	 * the vnode around. Otherwise someone else lent their hold count and
	 * we have to drop ours.
	 */
	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
#ifdef INVARIANTS
		int old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
		VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
#else
		refcount_release(&vp->v_holdcnt);
#endif
		VNODE_REFCOUNT_FENCE_ACQ();
		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
		    ("%s: vnode with usecount and VI_OWEINACT set", __func__));
		return (0);
	}

	/*
	 * We don't guarantee that any particular close will
	 * trigger inactive processing so just make a best effort
	 * here at preventing a reference to a removed file. If
	 * we don't succeed no harm is done.
	 *
	 * Upgrade our holdcnt to a usecount.
	 */
	VI_LOCK(vp);
	/*
	 * See the previous section. By the time we get here we may find
	 * ourselves in the same spot.
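	 * That is, another thread may have given the vnode a fresh
	 * usecount while we waited for the interlock, and we then
	 * convert our hold into a reference against that count just
	 * as above.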
2894 */ 2895 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2896 #ifdef INVARIANTS 2897 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2898 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 2899 #else 2900 refcount_release(&vp->v_holdcnt); 2901 #endif 2902 VNODE_REFCOUNT_FENCE_ACQ(); 2903 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2904 ("%s: vnode with usecount and VI_OWEINACT set", 2905 __func__)); 2906 VI_UNLOCK(vp); 2907 return (0); 2908 } 2909 if ((vp->v_iflag & VI_OWEINACT) == 0) { 2910 oweinact = 0; 2911 } else { 2912 oweinact = 1; 2913 vp->v_iflag &= ~VI_OWEINACT; 2914 VNODE_REFCOUNT_FENCE_REL(); 2915 } 2916 v_incr_devcount(vp); 2917 refcount_acquire(&vp->v_usecount); 2918 if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2919 (flags & LK_NOWAIT) == 0) 2920 vinactive(vp); 2921 VI_UNLOCK(vp); 2922 return (0); 2923 } 2924 2925 /* 2926 * Increase the reference (use) and hold count of a vnode. 2927 * This will also remove the vnode from the free list if it is presently free. 2928 */ 2929 void 2930 vref(struct vnode *vp) 2931 { 2932 2933 ASSERT_VI_UNLOCKED(vp, __func__); 2934 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2935 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2936 VNODE_REFCOUNT_FENCE_ACQ(); 2937 VNASSERT(vp->v_holdcnt > 0, vp, 2938 ("%s: active vnode not held", __func__)); 2939 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2940 ("%s: vnode with usecount and VI_OWEINACT set", __func__)); 2941 return; 2942 } 2943 VI_LOCK(vp); 2944 vrefl(vp); 2945 VI_UNLOCK(vp); 2946 } 2947 2948 void 2949 vrefl(struct vnode *vp) 2950 { 2951 2952 ASSERT_VI_LOCKED(vp, __func__); 2953 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2954 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2955 VNODE_REFCOUNT_FENCE_ACQ(); 2956 VNASSERT(vp->v_holdcnt > 0, vp, 2957 ("%s: active vnode not held", __func__)); 2958 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2959 ("%s: vnode with usecount and VI_OWEINACT set", __func__)); 2960 return; 2961 } 2962 vholdl(vp); 2963 if ((vp->v_iflag & VI_OWEINACT) != 0) { 2964 vp->v_iflag &= ~VI_OWEINACT; 2965 VNODE_REFCOUNT_FENCE_REL(); 2966 } 2967 v_incr_devcount(vp); 2968 refcount_acquire(&vp->v_usecount); 2969 } 2970 2971 void 2972 vrefact(struct vnode *vp) 2973 { 2974 2975 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2976 #ifdef INVARIANTS 2977 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 2978 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 2979 #else 2980 refcount_acquire(&vp->v_usecount); 2981 #endif 2982 } 2983 2984 /* 2985 * Return reference count of a vnode. 2986 * 2987 * The results of this call are only guaranteed when some mechanism is used to 2988 * stop other processes from gaining references to the vnode. This may be the 2989 * case if the caller holds the only reference. This is also useful when stale 2990 * data is acceptable as race conditions may be accounted for by some other 2991 * means. 2992 */ 2993 int 2994 vrefcnt(struct vnode *vp) 2995 { 2996 2997 return (vp->v_usecount); 2998 } 2999 3000 enum vputx_op { VPUTX_VRELE, VPUTX_VPUT, VPUTX_VUNREF }; 3001 3002 /* 3003 * Decrement the use and hold counts for a vnode. 3004 * 3005 * See an explanation near vget() as to why atomic operation is safe. 
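 *
 * The three wrappers below differ only in the lock state of the
 * vnode (an overview of the code that follows):
 *
 *	vrele(vp);	-- vp unlocked on entry and on exit
 *	vput(vp);	-- vp locked on entry, unlocked on exit
 *	vunref(vp);	-- vp locked on entry and on exit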
 */
static void
vputx(struct vnode *vp, enum vputx_op func)
{
	int error;

	KASSERT(vp != NULL, ("vputx: null vp"));
	if (func == VPUTX_VUNREF)
		ASSERT_VOP_LOCKED(vp, "vunref");
	ASSERT_VI_UNLOCKED(vp, __func__);
	VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
	    ("%s: wrong ref counts", __func__));

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);

	/*
	 * We want to hold the vnode until inactive processing finishes to
	 * prevent vgone() races. We drop the use count here and the
	 * hold count below when we're done.
	 *
	 * If we release the last usecount we take ownership of the hold
	 * count which provides liveness of the vnode, in which case we
	 * have to vdrop.
	 */
	if (!refcount_release(&vp->v_usecount))
		return;
	VI_LOCK(vp);
	v_decr_devcount(vp);
	/*
	 * By the time we got here someone else might have transitioned
	 * the count back to > 0.
	 */
	if (vp->v_usecount > 0) {
		vdropl(vp);
		return;
	}
	if (vp->v_iflag & VI_DOINGINACT) {
		vdropl(vp);
		return;
	}

	/*
	 * Check if the fs wants to perform inactive processing. Note we
	 * may be only holding the interlock, in which case it is possible
	 * someone else called vgone on the vnode and ->v_data is now NULL.
	 * Since vgone performs inactive on its own there is nothing to do
	 * here but to drop our hold count.
	 */
	if (__predict_false(VN_IS_DOOMED(vp)) ||
	    VOP_NEED_INACTIVE(vp) == 0) {
		vdropl(vp);
		return;
	}

	/*
	 * We must call VOP_INACTIVE with the node locked. Mark
	 * as VI_DOINGINACT to avoid recursion.
	 */
	vp->v_iflag |= VI_OWEINACT;
	switch (func) {
	case VPUTX_VRELE:
		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
		VI_LOCK(vp);
		break;
	case VPUTX_VPUT:
		error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT);
		VI_LOCK(vp);
		break;
	case VPUTX_VUNREF:
		error = 0;
		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
			VI_LOCK(vp);
		}
		break;
	}
	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
	    ("vnode with usecount and VI_OWEINACT set"));
	if (error == 0) {
		if (vp->v_iflag & VI_OWEINACT)
			vinactive(vp);
		if (func != VPUTX_VUNREF)
			VOP_UNLOCK(vp);
	}
	vdropl(vp);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(struct vnode *vp)
{

	vputx(vp, VPUTX_VRELE);
}

/*
 * Release an already locked vnode. This gives the same effects as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
 *
 * It is an invariant that all VOP_* calls operate on a held vnode.
 * We may hold only an implicit reference stemming from our usecount,
 * which we are about to release. If we unlock the vnode afterwards we
 * open a time window where someone else dropped the last usecount and
 * proceeded to free the vnode before our unlock finished. For this
 * reason we unlock the vnode early. This is a little bit wasteful as
 * it may be the vnode is exclusively locked and inactive processing is
 * needed, in which case we are adding work.
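 *
 * An illustrative interleaving of the race described above: had we
 * dropped the usecount first, another thread could vdrop() the final
 * hold and free the vnode while our VOP_UNLOCK() was still
 * dereferencing it.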
3117 */ 3118 void 3119 vput(struct vnode *vp) 3120 { 3121 3122 VOP_UNLOCK(vp); 3123 vputx(vp, VPUTX_VPUT); 3124 } 3125 3126 /* 3127 * Release an exclusively locked vnode. Do not unlock the vnode lock. 3128 */ 3129 void 3130 vunref(struct vnode *vp) 3131 { 3132 3133 vputx(vp, VPUTX_VUNREF); 3134 } 3135 3136 /* 3137 * Increase the hold count and activate if this is the first reference. 3138 */ 3139 static void 3140 vhold_activate(struct vnode *vp) 3141 { 3142 struct mount *mp; 3143 3144 ASSERT_VI_LOCKED(vp, __func__); 3145 VNASSERT(vp->v_holdcnt == 0, vp, 3146 ("%s: wrong hold count", __func__)); 3147 VNASSERT(vp->v_op != NULL, vp, 3148 ("%s: vnode already reclaimed.", __func__)); 3149 /* 3150 * Remove a vnode from the free list, mark it as in use, 3151 * and put it on the active list. 3152 */ 3153 VNASSERT(vp->v_mount != NULL, vp, 3154 ("_vhold: vnode not on per mount vnode list")); 3155 mp = vp->v_mount; 3156 mtx_lock(&mp->mnt_listmtx); 3157 if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { 3158 TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); 3159 mp->mnt_tmpfreevnodelistsize--; 3160 vp->v_mflag &= ~VMP_TMPMNTFREELIST; 3161 } else { 3162 mtx_lock(&vnode_free_list_mtx); 3163 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 3164 freevnodes--; 3165 mtx_unlock(&vnode_free_list_mtx); 3166 } 3167 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 3168 ("Activating already active vnode")); 3169 vp->v_iflag &= ~VI_FREE; 3170 vp->v_iflag |= VI_ACTIVE; 3171 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 3172 mp->mnt_activevnodelistsize++; 3173 mtx_unlock(&mp->mnt_listmtx); 3174 refcount_acquire(&vp->v_holdcnt); 3175 } 3176 3177 void 3178 vhold(struct vnode *vp) 3179 { 3180 3181 ASSERT_VI_UNLOCKED(vp, __func__); 3182 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3183 if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { 3184 VNODE_REFCOUNT_FENCE_ACQ(); 3185 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 3186 ("vhold: vnode with holdcnt is free")); 3187 return; 3188 } 3189 VI_LOCK(vp); 3190 vholdl(vp); 3191 VI_UNLOCK(vp); 3192 } 3193 3194 void 3195 vholdl(struct vnode *vp) 3196 { 3197 3198 ASSERT_VI_LOCKED(vp, __func__); 3199 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3200 if ((vp->v_iflag & VI_FREE) == 0) { 3201 refcount_acquire(&vp->v_holdcnt); 3202 return; 3203 } 3204 vhold_activate(vp); 3205 } 3206 3207 void 3208 vholdnz(struct vnode *vp) 3209 { 3210 3211 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3212 #ifdef INVARIANTS 3213 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3214 VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old)); 3215 #else 3216 atomic_add_int(&vp->v_holdcnt, 1); 3217 #endif 3218 } 3219 3220 /* 3221 * Drop the hold count of the vnode. If this is the last reference to 3222 * the vnode we place it on the free list unless it has been vgone'd 3223 * (marked VIRF_DOOMED) in which case we will free it. 3224 * 3225 * Because the vnode vm object keeps a hold reference on the vnode if 3226 * there is at least one resident non-cached page, the vnode cannot 3227 * leave the active list without the page cleanup done. 3228 */ 3229 static void 3230 vdrop_deactivate(struct vnode *vp) 3231 { 3232 struct mount *mp; 3233 3234 ASSERT_VI_LOCKED(vp, __func__); 3235 /* 3236 * Mark a vnode as free: remove it from its active list 3237 * and put it up for recycling on the freelist. 
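 * The vnode first lands on the per-mount tmp free list; once
 * mnt_free_list_batch entries accumulate there, the batch is handed
 * back to the global free list, as the code below arranges.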
3238 */ 3239 VNASSERT(!VN_IS_DOOMED(vp), vp, 3240 ("vdrop: returning doomed vnode")); 3241 VNASSERT(vp->v_op != NULL, vp, 3242 ("vdrop: vnode already reclaimed.")); 3243 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 3244 ("vnode already free")); 3245 VNASSERT(vp->v_holdcnt == 0, vp, 3246 ("vdrop: freeing when we shouldn't")); 3247 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3248 mp = vp->v_mount; 3249 mtx_lock(&mp->mnt_listmtx); 3250 if (vp->v_iflag & VI_ACTIVE) { 3251 vp->v_iflag &= ~VI_ACTIVE; 3252 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 3253 mp->mnt_activevnodelistsize--; 3254 } 3255 TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); 3256 mp->mnt_tmpfreevnodelistsize++; 3257 vp->v_iflag |= VI_FREE; 3258 vp->v_mflag |= VMP_TMPMNTFREELIST; 3259 VI_UNLOCK(vp); 3260 if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch) 3261 vnlru_return_batch_locked(mp); 3262 mtx_unlock(&mp->mnt_listmtx); 3263 } else { 3264 VI_UNLOCK(vp); 3265 counter_u64_add(free_owe_inact, 1); 3266 } 3267 } 3268 3269 void 3270 vdrop(struct vnode *vp) 3271 { 3272 3273 ASSERT_VI_UNLOCKED(vp, __func__); 3274 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3275 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3276 return; 3277 VI_LOCK(vp); 3278 vdropl(vp); 3279 } 3280 3281 void 3282 vdropl(struct vnode *vp) 3283 { 3284 3285 ASSERT_VI_LOCKED(vp, __func__); 3286 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3287 if (!refcount_release(&vp->v_holdcnt)) { 3288 VI_UNLOCK(vp); 3289 return; 3290 } 3291 if (VN_IS_DOOMED(vp)) { 3292 freevnode(vp); 3293 return; 3294 } 3295 vdrop_deactivate(vp); 3296 } 3297 3298 /* 3299 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3300 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3301 * OWEINACT tracks whether a vnode missed a call to inactive due to a 3302 * failed lock upgrade. 3303 */ 3304 void 3305 vinactive(struct vnode *vp) 3306 { 3307 struct vm_object *obj; 3308 3309 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3310 ASSERT_VI_LOCKED(vp, "vinactive"); 3311 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3312 ("vinactive: recursed on VI_DOINGINACT")); 3313 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3314 vp->v_iflag |= VI_DOINGINACT; 3315 vp->v_iflag &= ~VI_OWEINACT; 3316 VI_UNLOCK(vp); 3317 /* 3318 * Before moving off the active list, we must be sure that any 3319 * modified pages are converted into the vnode's dirty 3320 * buffers, since these will no longer be checked once the 3321 * vnode is on the inactive list. 3322 * 3323 * The write-out of the dirty pages is asynchronous. At the 3324 * point that VOP_INACTIVE() is called, there could still be 3325 * pending I/O and dirty pages in the object. 3326 */ 3327 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3328 vm_object_mightbedirty(obj)) { 3329 VM_OBJECT_WLOCK(obj); 3330 vm_object_page_clean(obj, 0, 0, 0); 3331 VM_OBJECT_WUNLOCK(obj); 3332 } 3333 VOP_INACTIVE(vp, curthread); 3334 VI_LOCK(vp); 3335 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3336 ("vinactive: lost VI_DOINGINACT")); 3337 vp->v_iflag &= ~VI_DOINGINACT; 3338 } 3339 3340 /* 3341 * Remove any vnodes in the vnode table belonging to mount point mp. 3342 * 3343 * If FORCECLOSE is not specified, there should not be any active ones, 3344 * return error if any are found (nb: this is a user error, not a 3345 * system error). If FORCECLOSE is specified, detach any active vnodes 3346 * that are found. 3347 * 3348 * If WRITECLOSE is set, only flush out regular file vnodes open for 3349 * writing. 
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem. The root vnode is considered busy if its
 * v_usecount exceeds this value. On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0,
    "Print out busy vnodes");
#endif

int
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
{
	struct vnode *vp, *mvp, *rootvp = NULL;
	struct vattr vattr;
	int busy = 0, error;

	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
	    rootrefs, flags);
	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode. We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
			    __func__, error);
			return (error);
		}
		vput(rootvp);
	}
loop:
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		vholdl(vp);
		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
		if (error) {
			vdrop(vp);
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			goto loop;
		}
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			VOP_UNLOCK(vp);
			vdrop(vp);
			continue;
		}
		/*
		 * If WRITECLOSE is set, flush out unlinked but still open
		 * files (even if open only for reading) and regular file
		 * vnodes open for writing.
		 */
		if (flags & WRITECLOSE) {
			if (vp->v_object != NULL) {
				VM_OBJECT_WLOCK(vp->v_object);
				vm_object_page_clean(vp->v_object, 0, 0, 0);
				VM_OBJECT_WUNLOCK(vp->v_object);
			}
			error = VOP_FSYNC(vp, MNT_WAIT, td);
			if (error != 0) {
				VOP_UNLOCK(vp);
				vdrop(vp);
				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
				return (error);
			}
			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
			VI_LOCK(vp);

			if ((vp->v_type == VNON ||
			    (error == 0 && vattr.va_nlink > 0)) &&
			    (vp->v_writecount <= 0 || vp->v_type != VREG)) {
				VOP_UNLOCK(vp);
				vdropl(vp);
				continue;
			}
		} else
			VI_LOCK(vp);
		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 *
		 * If FORCECLOSE is set, forcibly close the vnode.
		 */
		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
			vgonel(vp);
		} else {
			busy++;
#ifdef DIAGNOSTIC
			if (busyprt)
				vn_printf(vp, "vflush: busy vnode ");
#endif
		}
		VOP_UNLOCK(vp);
		vdropl(vp);
	}
	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
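		 * For example (an illustrative unmount): a caller holding
		 * one root reference invokes vflush(mp, 1, 0, td); if the
		 * scan counted busy == 1 and the root's v_usecount is
		 * exactly 1, the only remaining user is that caller, so
		 * the root may be reclaimed.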
3457 */ 3458 VI_LOCK(rootvp); 3459 KASSERT(busy > 0, ("vflush: not busy")); 3460 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3461 ("vflush: usecount %d < rootrefs %d", 3462 rootvp->v_usecount, rootrefs)); 3463 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3464 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3465 vgone(rootvp); 3466 VOP_UNLOCK(rootvp); 3467 busy = 0; 3468 } else 3469 VI_UNLOCK(rootvp); 3470 } 3471 if (busy) { 3472 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3473 busy); 3474 return (EBUSY); 3475 } 3476 for (; rootrefs > 0; rootrefs--) 3477 vrele(rootvp); 3478 return (0); 3479 } 3480 3481 /* 3482 * Recycle an unused vnode to the front of the free list. 3483 */ 3484 int 3485 vrecycle(struct vnode *vp) 3486 { 3487 int recycled; 3488 3489 VI_LOCK(vp); 3490 recycled = vrecyclel(vp); 3491 VI_UNLOCK(vp); 3492 return (recycled); 3493 } 3494 3495 /* 3496 * vrecycle, with the vp interlock held. 3497 */ 3498 int 3499 vrecyclel(struct vnode *vp) 3500 { 3501 int recycled; 3502 3503 ASSERT_VOP_ELOCKED(vp, __func__); 3504 ASSERT_VI_LOCKED(vp, __func__); 3505 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3506 recycled = 0; 3507 if (vp->v_usecount == 0) { 3508 recycled = 1; 3509 vgonel(vp); 3510 } 3511 return (recycled); 3512 } 3513 3514 /* 3515 * Eliminate all activity associated with a vnode 3516 * in preparation for reuse. 3517 */ 3518 void 3519 vgone(struct vnode *vp) 3520 { 3521 VI_LOCK(vp); 3522 vgonel(vp); 3523 VI_UNLOCK(vp); 3524 } 3525 3526 static void 3527 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3528 struct vnode *lowervp __unused) 3529 { 3530 } 3531 3532 /* 3533 * Notify upper mounts about reclaimed or unlinked vnode. 3534 */ 3535 void 3536 vfs_notify_upper(struct vnode *vp, int event) 3537 { 3538 static struct vfsops vgonel_vfsops = { 3539 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3540 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3541 }; 3542 struct mount *mp, *ump, *mmp; 3543 3544 mp = vp->v_mount; 3545 if (mp == NULL) 3546 return; 3547 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3548 return; 3549 3550 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3551 mmp->mnt_op = &vgonel_vfsops; 3552 mmp->mnt_kern_flag |= MNTK_MARKER; 3553 MNT_ILOCK(mp); 3554 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3555 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3556 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3557 ump = TAILQ_NEXT(ump, mnt_upper_link); 3558 continue; 3559 } 3560 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3561 MNT_IUNLOCK(mp); 3562 switch (event) { 3563 case VFS_NOTIFY_UPPER_RECLAIM: 3564 VFS_RECLAIM_LOWERVP(ump, vp); 3565 break; 3566 case VFS_NOTIFY_UPPER_UNLINK: 3567 VFS_UNLINK_LOWERVP(ump, vp); 3568 break; 3569 default: 3570 KASSERT(0, ("invalid event %d", event)); 3571 break; 3572 } 3573 MNT_ILOCK(mp); 3574 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3575 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 3576 } 3577 free(mmp, M_TEMP); 3578 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 3579 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 3580 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 3581 wakeup(&mp->mnt_uppers); 3582 } 3583 MNT_IUNLOCK(mp); 3584 } 3585 3586 /* 3587 * vgone, with the vp interlock held. 
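 *
 * Callers follow the same shape as vrecycle()/vrecyclel() above
 * (this file's own pattern, shown for orientation; vp must be
 * exclusively locked):
 *
 *	VI_LOCK(vp);
 *	if (vp->v_usecount == 0)
 *		vgonel(vp);
 *	VI_UNLOCK(vp);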
3588 */ 3589 static void 3590 vgonel(struct vnode *vp) 3591 { 3592 struct thread *td; 3593 struct mount *mp; 3594 vm_object_t object; 3595 bool active, oweinact; 3596 3597 ASSERT_VOP_ELOCKED(vp, "vgonel"); 3598 ASSERT_VI_LOCKED(vp, "vgonel"); 3599 VNASSERT(vp->v_holdcnt, vp, 3600 ("vgonel: vp %p has no reference.", vp)); 3601 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3602 td = curthread; 3603 3604 /* 3605 * Don't vgonel if we're already doomed. 3606 */ 3607 if (vp->v_irflag & VIRF_DOOMED) 3608 return; 3609 vp->v_irflag |= VIRF_DOOMED; 3610 3611 /* 3612 * Check to see if the vnode is in use. If so, we have to call 3613 * VOP_CLOSE() and VOP_INACTIVE(). 3614 */ 3615 active = vp->v_usecount > 0; 3616 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 3617 VI_UNLOCK(vp); 3618 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 3619 3620 /* 3621 * If purging an active vnode, it must be closed and 3622 * deactivated before being reclaimed. 3623 */ 3624 if (active) 3625 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 3626 if (oweinact || active) { 3627 VI_LOCK(vp); 3628 if ((vp->v_iflag & VI_DOINGINACT) == 0) 3629 vinactive(vp); 3630 VI_UNLOCK(vp); 3631 } 3632 if (vp->v_type == VSOCK) 3633 vfs_unp_reclaim(vp); 3634 3635 /* 3636 * Clean out any buffers associated with the vnode. 3637 * If the flush fails, just toss the buffers. 3638 */ 3639 mp = NULL; 3640 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 3641 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 3642 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 3643 while (vinvalbuf(vp, 0, 0, 0) != 0) 3644 ; 3645 } 3646 3647 BO_LOCK(&vp->v_bufobj); 3648 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 3649 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 3650 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 3651 vp->v_bufobj.bo_clean.bv_cnt == 0, 3652 ("vp %p bufobj not invalidated", vp)); 3653 3654 /* 3655 * For VMIO bufobj, BO_DEAD is set later, or in 3656 * vm_object_terminate() after the object's page queue is 3657 * flushed. 3658 */ 3659 object = vp->v_bufobj.bo_object; 3660 if (object == NULL) 3661 vp->v_bufobj.bo_flag |= BO_DEAD; 3662 BO_UNLOCK(&vp->v_bufobj); 3663 3664 /* 3665 * Handle the VM part. Tmpfs handles v_object on its own (the 3666 * OBJT_VNODE check). Nullfs or other bypassing filesystems 3667 * should not touch the object borrowed from the lower vnode 3668 * (the handle check). 3669 */ 3670 if (object != NULL && object->type == OBJT_VNODE && 3671 object->handle == vp) 3672 vnode_destroy_vobject(vp); 3673 3674 /* 3675 * Reclaim the vnode. 3676 */ 3677 if (VOP_RECLAIM(vp, td)) 3678 panic("vgone: cannot reclaim"); 3679 if (mp != NULL) 3680 vn_finished_secondary_write(mp); 3681 VNASSERT(vp->v_object == NULL, vp, 3682 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 3683 /* 3684 * Clear the advisory locks and wake up waiting threads. 3685 */ 3686 (void)VOP_ADVLOCKPURGE(vp); 3687 vp->v_lockf = NULL; 3688 /* 3689 * Delete from old mount point vnode list. 3690 */ 3691 delmntque(vp); 3692 cache_purge(vp); 3693 /* 3694 * Done with purge, reset to the standard lock and invalidate 3695 * the vnode. 3696 */ 3697 VI_LOCK(vp); 3698 vp->v_vnlock = &vp->v_lock; 3699 vp->v_op = &dead_vnodeops; 3700 vp->v_tag = "none"; 3701 vp->v_type = VBAD; 3702 } 3703 3704 /* 3705 * Calculate the total number of references to a special device. 3706 */ 3707 int 3708 vcount(struct vnode *vp) 3709 { 3710 int count; 3711 3712 dev_lock(); 3713 count = vp->v_rdev->si_usecount; 3714 dev_unlock(); 3715 return (count); 3716 } 3717 3718 /* 3719 * Print out a description of a vnode. 
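 * Used by "busyprt" reporting in vflush() above and by the DDB
 * commands below; the format string argument becomes a printf-style
 * prefix for the dump.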
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
 "VMARKER"};

void
vn_printf(struct vnode *vp, const char *fmt, ...)
{
	va_list ap;
	char buf[256], buf2[16];
	u_long flags;

	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
	printf("%p: ", (void *)vp);
	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
	printf(" usecount %d, writecount %d, refcount %d",
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	switch (vp->v_type) {
	case VDIR:
		printf(" mountedhere %p\n", vp->v_mountedhere);
		break;
	case VCHR:
		printf(" rdev %p\n", vp->v_rdev);
		break;
	case VSOCK:
		printf(" socket %p\n", vp->v_unpcb);
		break;
	case VFIFO:
		printf(" fifoinfo %p\n", vp->v_fifoinfo);
		break;
	default:
		printf("\n");
		break;
	}
	buf[0] = '\0';
	buf[1] = '\0';
	if (vp->v_irflag & VIRF_DOOMED)
		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
	flags = vp->v_irflag & ~(VIRF_DOOMED);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	if (vp->v_vflag & VV_ROOT)
		strlcat(buf, "|VV_ROOT", sizeof(buf));
	if (vp->v_vflag & VV_ISTTY)
		strlcat(buf, "|VV_ISTTY", sizeof(buf));
	if (vp->v_vflag & VV_NOSYNC)
		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
	if (vp->v_vflag & VV_ETERNALDEV)
		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
	if (vp->v_vflag & VV_CACHEDLABEL)
		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
	if (vp->v_vflag & VV_VMSIZEVNLOCK)
		strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
	if (vp->v_vflag & VV_COPYONWRITE)
		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
	if (vp->v_vflag & VV_SYSTEM)
		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
	if (vp->v_vflag & VV_PROCDEP)
		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
	if (vp->v_vflag & VV_NOKNOTE)
		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
	if (vp->v_vflag & VV_DELETED)
		strlcat(buf, "|VV_DELETED", sizeof(buf));
	if (vp->v_vflag & VV_MD)
		strlcat(buf, "|VV_MD", sizeof(buf));
	if (vp->v_vflag & VV_FORCEINSMQ)
		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
	if (vp->v_vflag & VV_READLINK)
		strlcat(buf, "|VV_READLINK", sizeof(buf));
	/* Keep this mask in sync with the VV_* flags printed above. */
	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
	    VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
	    VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ |
	    VV_READLINK);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	if (vp->v_iflag & VI_TEXT_REF)
		strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
	if (vp->v_iflag & VI_MOUNT)
		strlcat(buf, "|VI_MOUNT", sizeof(buf));
	if (vp->v_iflag & VI_FREE)
		strlcat(buf, "|VI_FREE", sizeof(buf));
	if (vp->v_iflag & VI_ACTIVE)
		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
	if (vp->v_iflag & VI_DOINGINACT)
		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
	if (vp->v_iflag & VI_OWEINACT)
		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_FREE | VI_ACTIVE |
	    VI_DOINGINACT | VI_OWEINACT);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	if (vp->v_mflag & VMP_TMPMNTFREELIST)
		strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
	flags =
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	printf("    flags (%s)\n", buf + 1);
	if (mtx_owned(VI_MTX(vp)))
		printf(" VI_LOCKed");
	if (vp->v_object != NULL)
		printf("    v_object %p ref %d pages %d "
		    "cleanbuf %d dirtybuf %d\n",
		    vp->v_object, vp->v_object->ref_count,
		    vp->v_object->resident_page_count,
		    vp->v_bufobj.bo_clean.bv_cnt,
		    vp->v_bufobj.bo_dirty.bv_cnt);
	printf("    ");
	lockmgr_printinfo(vp->v_vnlock);
	if (vp->v_data != NULL)
		VOP_PRINT(vp);
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
{
	struct mount *mp;
	struct vnode *vp;

	/*
	 * Note: because this is DDB, we can't obey the locking semantics
	 * for these structures, which means we could catch an inconsistent
	 * state and dereference a nasty pointer.  Not much to be done
	 * about that.
	 */
	db_printf("Locked vnodes\n");
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
				vn_printf(vp, "vnode ");
		}
	}
}

/*
 * Show details about the given vnode.
 */
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
	struct vnode *vp;

	if (!have_addr)
		return;
	vp = (struct vnode *)addr;
	vn_printf(vp, "vnode ");
}

/*
 * Show details about the given mount point.
 */
DB_SHOW_COMMAND(mount, db_show_mount)
{
	struct mount *mp;
	struct vfsopt *opt;
	struct statfs *sp;
	struct vnode *vp;
	char buf[512];
	uint64_t mflags;
	u_int flags;

	if (!have_addr) {
		/* No address given, print short info about all mount points. */
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			db_printf("%p %s on %s (%s)\n", mp,
			    mp->mnt_stat.f_mntfromname,
			    mp->mnt_stat.f_mntonname,
			    mp->mnt_stat.f_fstypename);
			if (db_pager_quit)
				break;
		}
		db_printf("\nMore info: show mount <addr>\n");
		return;
	}

	mp = (struct mount *)addr;
	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);

	buf[0] = '\0';
	mflags = mp->mnt_flag;
#define	MNT_FLAG(flag)	do {						\
	if (mflags & (flag)) {						\
		if (buf[0] != '\0')					\
			strlcat(buf, ", ", sizeof(buf));		\
		strlcat(buf, (#flag) + 4, sizeof(buf));			\
		mflags &= ~(flag);					\
	}								\
} while (0)
	MNT_FLAG(MNT_RDONLY);
	MNT_FLAG(MNT_SYNCHRONOUS);
	MNT_FLAG(MNT_NOEXEC);
	MNT_FLAG(MNT_NOSUID);
	MNT_FLAG(MNT_NFS4ACLS);
	MNT_FLAG(MNT_UNION);
	MNT_FLAG(MNT_ASYNC);
	MNT_FLAG(MNT_SUIDDIR);
	MNT_FLAG(MNT_SOFTDEP);
	MNT_FLAG(MNT_NOSYMFOLLOW);
	MNT_FLAG(MNT_GJOURNAL);
	MNT_FLAG(MNT_MULTILABEL);
	MNT_FLAG(MNT_ACLS);
	MNT_FLAG(MNT_NOATIME);
	MNT_FLAG(MNT_NOCLUSTERR);
	MNT_FLAG(MNT_NOCLUSTERW);
	MNT_FLAG(MNT_SUJ);
	MNT_FLAG(MNT_EXRDONLY);
	MNT_FLAG(MNT_EXPORTED);
	MNT_FLAG(MNT_DEFEXPORTED);
	MNT_FLAG(MNT_EXPORTANON);
	MNT_FLAG(MNT_EXKERB);
	MNT_FLAG(MNT_EXPUBLIC);
	MNT_FLAG(MNT_LOCAL);
	MNT_FLAG(MNT_QUOTA);
	MNT_FLAG(MNT_ROOTFS);
	MNT_FLAG(MNT_USER);
	MNT_FLAG(MNT_IGNORE);
	MNT_FLAG(MNT_UPDATE);
	MNT_FLAG(MNT_DELEXPORT);
	MNT_FLAG(MNT_RELOAD);
	MNT_FLAG(MNT_FORCE);
	MNT_FLAG(MNT_SNAPSHOT);
	MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
	if (mflags != 0) {
		if (buf[0] != '\0')
			strlcat(buf, ", ", sizeof(buf));
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
		    "0x%016jx", mflags);
	}
	db_printf("    mnt_flag = %s\n", buf);

	buf[0] = '\0';
	flags = mp->mnt_kern_flag;
#define	MNT_KERN_FLAG(flag)	do {					\
	if (flags & (flag)) {						\
		if (buf[0] != '\0')					\
			strlcat(buf, ", ", sizeof(buf));		\
		strlcat(buf, (#flag) + 5, sizeof(buf));			\
		flags &= ~(flag);					\
	}								\
} while (0)
	MNT_KERN_FLAG(MNTK_UNMOUNTF);
	MNT_KERN_FLAG(MNTK_ASYNC);
	MNT_KERN_FLAG(MNTK_SOFTDEP);
	MNT_KERN_FLAG(MNTK_DRAINING);
	MNT_KERN_FLAG(MNTK_REFEXPIRE);
	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
	MNT_KERN_FLAG(MNTK_NO_IOPF);
	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
	MNT_KERN_FLAG(MNTK_MARKER);
	MNT_KERN_FLAG(MNTK_USES_BCACHE);
	MNT_KERN_FLAG(MNTK_NOASYNC);
	MNT_KERN_FLAG(MNTK_UNMOUNT);
	MNT_KERN_FLAG(MNTK_MWAIT);
	MNT_KERN_FLAG(MNTK_SUSPEND);
	MNT_KERN_FLAG(MNTK_SUSPEND2);
	MNT_KERN_FLAG(MNTK_SUSPENDED);
	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
	MNT_KERN_FLAG(MNTK_NOKNOTE);
#undef MNT_KERN_FLAG
	if (flags != 0) {
		if (buf[0] != '\0')
			strlcat(buf, ", ", sizeof(buf));
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
		    "0x%08x", flags);
	}
	db_printf("    mnt_kern_flag = %s\n", buf);

	db_printf("    mnt_opt = ");
	opt = TAILQ_FIRST(mp->mnt_opt);
	if (opt != NULL) {
		db_printf("%s", opt->name);
		opt = TAILQ_NEXT(opt, link);
		while (opt != NULL) {
			db_printf(", %s", opt->name);
			opt = TAILQ_NEXT(opt, link);
		}
	}
	db_printf("\n");
	sp = &mp->mnt_stat;
	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);

	db_printf("    mnt_cred = { uid=%u ruid=%u",
	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
	if (jailed(mp->mnt_cred))
		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
	db_printf(" }\n");
	db_printf("    mnt_ref = %d (with %d in the struct)\n",
	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
	db_printf("    mnt_activevnodelistsize = %d\n",
	    mp->mnt_activevnodelistsize);
	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT),
	    mp->mnt_writeopcount);
	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
	db_printf("    mnt_lockref = %d (with %d in the struct)\n",
	    vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
	db_printf("    mnt_secondary_accwrites = %d\n",
	    mp->mnt_secondary_accwrites);
	db_printf("    mnt_gjprovider = %s\n",
	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);

	db_printf("\n\nList of active vnodes\n");
	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
		if (vp->v_type != VMARKER) {
			vn_printf(vp, "vnode ");
			if (db_pager_quit)
				break;
		}
	}
	db_printf("\n\nList of inactive vnodes\n");
	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
			vn_printf(vp, "vnode ");
			if (db_pager_quit)
				break;
		}
	}
}
#endif	/* DDB */

/*
 * Fill in a struct xvfsconf based on a struct vfsconf.
 */
static int
vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
{
	struct xvfsconf xvfsp;

	bzero(&xvfsp, sizeof(xvfsp));
	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
	xvfsp.vfc_typenum = vfsp->vfc_typenum;
	xvfsp.vfc_refcount = vfsp->vfc_refcount;
	xvfsp.vfc_flags = vfsp->vfc_flags;
	/*
	 * These are unused in userland, we keep them
	 * to not break binary compatibility.
	 */
	xvfsp.vfc_vfsops = NULL;
	xvfsp.vfc_next = NULL;
	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}

#ifdef COMPAT_FREEBSD32
struct xvfsconf32 {
	uint32_t	vfc_vfsops;
	char		vfc_name[MFSNAMELEN];
	int32_t		vfc_typenum;
	int32_t		vfc_refcount;
	int32_t		vfc_flags;
	uint32_t	vfc_next;
};

static int
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
{
	struct xvfsconf32 xvfsp;

	bzero(&xvfsp, sizeof(xvfsp));
	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
	xvfsp.vfc_typenum = vfsp->vfc_typenum;
	xvfsp.vfc_refcount = vfsp->vfc_refcount;
	xvfsp.vfc_flags = vfsp->vfc_flags;
	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int
sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
{
	struct vfsconf *vfsp;
	int error;

	error = 0;
	vfsconf_slock();
	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
#ifdef COMPAT_FREEBSD32
		if (req->flags & SCTL_MASK32)
			error = vfsconf2x32(req, vfsp);
		else
#endif
			error = vfsconf2x(req, vfsp);
		if (error)
			break;
	}
	vfsconf_sunlock();
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
    "S,xvfsconf", "List of all configured filesystems");
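/*
 * Example (userland sketch, not compiled here): the vfs.conflist sysctl
 * registered above exports an array of struct xvfsconf and is typically
 * consumed with sysctlbyname(3).  Error handling is elided for brevity.
 *
 *	size_t i, len;
 *	struct xvfsconf *xvfsp;
 *
 *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
 *	xvfsp = malloc(len);
 *	sysctlbyname("vfs.conflist", xvfsp, &len, NULL, 0);
 *	for (i = 0; i < len / sizeof(*xvfsp); i++)
 *		printf("%s\n", xvfsp[i].vfc_name);
 */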
#ifndef BURN_BRIDGES
static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

	log(LOG_WARNING, "userland calling deprecated sysctl, "
	    "please rebuild world\n");

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		vfsconf_slock();
		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
			if (vfsp->vfc_typenum == name[2])
				break;
		}
		vfsconf_sunlock();
		if (vfsp == NULL)
			return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
		if (req->flags & SCTL_MASK32)
			return (vfsconf2x32(req, vfsp));
		else
#endif
			return (vfsconf2x(req, vfsp));
	}
	return (EOPNOTSUPP);
}

static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
    CTLFLAG_MPSAFE, vfs_sysctl,
    "Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	vfsconf_slock();
	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
		bzero(&ovfs, sizeof(ovfs));
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error != 0) {
			vfsconf_sunlock();
			return (error);
		}
	}
	vfsconf_sunlock();
	return (0);
}

#endif /* 1 || COMPAT_PRELITE2 */
#endif /* !BURN_BRIDGES */

#define	KINFO_VNODESLOP		10
#ifdef notyet
/*
 * Dump vnode list (via sysctl).
 */
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
	struct xvnode *xvn;
	struct mount *mp;
	struct vnode *vp;
	int error, len, n;

	/*
	 * Stale numvnodes access is not fatal here.
	 */
	req->lock = 0;
	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
	if (!req->oldptr)
		/* Make an estimate */
		return (SYSCTL_OUT(req, 0, len));

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
	n = 0;
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
			continue;
		MNT_ILOCK(mp);
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (n == len)
				break;
			vref(vp);
			xvn[n].xv_size = sizeof *xvn;
			xvn[n].xv_vnode = vp;
			xvn[n].xv_id = 0;	/* XXX compat */
#define	XV_COPY(field)	xvn[n].xv_##field = vp->v_##field
			XV_COPY(usecount);
			XV_COPY(writecount);
			XV_COPY(holdcnt);
			XV_COPY(mount);
			XV_COPY(numoutput);
			XV_COPY(type);
#undef XV_COPY
			xvn[n].xv_flag = vp->v_vflag;

			switch (vp->v_type) {
			case VREG:
			case VDIR:
			case VLNK:
				break;
			case VBLK:
			case VCHR:
				if (vp->v_rdev == NULL) {
					vrele(vp);
					continue;
				}
				xvn[n].xv_dev = dev2udev(vp->v_rdev);
				break;
			case VSOCK:
				xvn[n].xv_socket = vp->v_socket;
				break;
			case VFIFO:
				xvn[n].xv_fifo = vp->v_fifoinfo;
				break;
			case VNON:
			case VBAD:
			default:
				/* shouldn't happen? */
				vrele(vp);
				continue;
			}
			vrele(vp);
			++n;
		}
		MNT_IUNLOCK(mp);
		mtx_lock(&mountlist_mtx);
		vfs_unbusy(mp);
		if (n == len)
			break;
	}
	mtx_unlock(&mountlist_mtx);

	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
	free(xvn, M_TEMP);
	return (error);
}

SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
    "");
#endif

static void
unmount_or_warn(struct mount *mp)
{
	int error;

	error = dounmount(mp, MNT_FORCE, curthread);
	if (error != 0) {
		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall(void)
{
	struct mount *mp, *tmp;

	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);

	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
		vfs_ref(mp);

		/*
		 * Forcibly unmounting "/dev" before "/" would prevent clean
		 * unmount of the latter.
		 */
		if (mp == rootdevmp)
			continue;

		unmount_or_warn(mp);
	}

	if (rootdevmp != NULL)
		unmount_or_warn(rootdevmp);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *mvp;
	struct vm_object *obj;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
		return;

	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
		obj = vp->v_object;
		if (obj != NULL && vm_object_mightbedirty(obj) &&
		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
			if (!vget(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
			    curthread)) {
				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
					vput(vp);
					continue;
				}

				obj = vp->v_object;
				if (obj != NULL) {
					VM_OBJECT_WLOCK(obj);
					vm_object_page_clean(obj, 0, 0,
					    flags == MNT_WAIT ?
					    OBJPC_SYNC : OBJPC_NOSYNC);
					VM_OBJECT_WUNLOCK(obj);
				}
				vput(vp);
			}
		} else
			VI_UNLOCK(vp);
	}
}

static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{

	knlist_destroy(&vi->vpi_selinfo.si_note);
	mtx_destroy(&vi->vpi_lock);
	uma_zfree(vnodepoll_zone, vi);
}

static void
destroy_vpollinfo(struct vpollinfo *vi)
{

	knlist_clear(&vi->vpi_selinfo.si_note, 1);
	seldrain(&vi->vpi_selinfo);
	destroy_vpollinfo_free(vi);
}

/*
 * Initialize per-vnode helper structure to hold poll-related state.
 */
void
v_addpollinfo(struct vnode *vp)
{
	struct vpollinfo *vi;

	if (vp->v_pollinfo != NULL)
		return;
	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
	VI_LOCK(vp);
	if (vp->v_pollinfo != NULL) {
		VI_UNLOCK(vp);
		destroy_vpollinfo_free(vi);
		return;
	}
	vp->v_pollinfo = vi;
	VI_UNLOCK(vp);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{

	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	if (vp->v_pollinfo->vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo->vpi_revents;
		vp->v_pollinfo->vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo->vpi_lock);
		return (events);
	}
	vp->v_pollinfo->vpi_events |= events;
	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	return (0);
}
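/*
 * Example (illustrative sketch): a filesystem with no poll machinery of
 * its own can implement VOP_POLL by delegating to vn_pollrecord(), much
 * like the stock vop_stdpoll() does for non-standard events.  The "exfs"
 * name below is hypothetical.
 *
 *	static int
 *	exfs_poll(struct vop_poll_args *ap)
 *	{
 *
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	}
 */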
/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define	sync_close ((int (*)(struct vop_close_args *))nullop)
static int	sync_fsync(struct vop_fsync_args *);
static int	sync_inactive(struct vop_inactive_args *);
static int	sync_reclaim(struct vop_reclaim_args *);

static struct vop_vector sync_vnodeops = {
	.vop_bypass =	VOP_EOPNOTSUPP,
	.vop_close =	sync_close,		/* close */
	.vop_fsync =	sync_fsync,		/* fsync */
	.vop_inactive =	sync_inactive,		/* inactive */
	.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
	.vop_reclaim =	sync_reclaim,		/* reclaim */
	.vop_lock1 =	vop_stdlock,		/* lock */
	.vop_unlock =	vop_stdunlock,		/* unlock */
	.vop_islocked =	vop_stdislocked,	/* islocked */
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	VOP_UNLOCK(vp);
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	if (vp != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}

void
vfs_deallocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;

	mtx_lock(&sync_mtx);
	vp = mp->mnt_syncer;
	if (vp != NULL)
		mp->mnt_syncer = NULL;
	mtx_unlock(&sync_mtx);
	if (vp != NULL)
		vrele(vp);
}
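/*
 * Example (illustrative sketch): a disk filesystem typically attaches a
 * syncer vnode once the mount is (or becomes) read-write, and the syncer
 * is torn down again at unmount time; the exact guard varies, but the
 * pattern is along these lines:
 *
 *	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 *		vfs_allocate_syncvnode(mp);
 */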
/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(struct vop_fsync_args *ap)
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int error, save;
	struct bufobj *bo;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	bo = &syncvp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay);
	BO_UNLOCK(bo);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, MBF_NOWAIT) != 0)
		return (0);
	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
		vfs_unbusy(mp);
		return (0);
	}
	save = curthread_pflags_set(TDP_SYNCIO);
	/*
	 * The filesystem at hand may be idle with free vnodes stored in the
	 * batch.  Return them instead of letting them stay there indefinitely.
	 */
	vnlru_return_batch(mp);
	vfs_msync(mp, MNT_NOWAIT);
	error = VFS_SYNC(mp, MNT_LAZY);
	curthread_pflags_restore(save);
	vn_finished_write(mp);
	vfs_unbusy(mp);
	return (error);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(struct vop_inactive_args *ap)
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
 */
static int
sync_reclaim(struct vop_reclaim_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	mtx_lock(&sync_mtx);
	if (vp->v_mount->mnt_syncer == vp)
		vp->v_mount->mnt_syncer = NULL;
	if (bo->bo_flag & BO_ONWORKLST) {
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		sync_vnode_count--;
		bo->bo_flag &= ~BO_ONWORKLST;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);

	return (0);
}

int
vn_need_pageq_flush(struct vnode *vp)
{
	struct vm_object *obj;
	int need;

	MPASS(mtx_owned(VI_MTX(vp)));
	need = 0;
	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
	    vm_object_mightbedirty(obj))
		need = 1;
	return (need);
}

/*
 * Check if vnode represents a disk device.
 */
int
vn_isdisk(struct vnode *vp, int *errp)
{
	int error;

	if (vp->v_type != VCHR) {
		error = ENOTBLK;
		goto out;
	}
	error = 0;
	dev_lock();
	if (vp->v_rdev == NULL)
		error = ENXIO;
	else if (vp->v_rdev->si_devsw == NULL)
		error = ENXIO;
	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
		error = ENOTBLK;
	dev_unlock();
out:
	if (errp != NULL)
		*errp = error;
	return (error == 0);
}
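/*
 * Example (illustrative sketch): callers treat vn_isdisk() as a boolean
 * and propagate the detailed error when the check fails:
 *
 *	int error;
 *
 *	if (!vn_isdisk(vp, &error))
 *		return (error);
 */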
/*
 * Common filesystem object access control check routine.  Accepts a
 * vnode's type, "mode", uid and gid, requested access mode, credentials,
 * and optional call-by-reference privused argument allowing vaccess()
 * to indicate to the caller whether privilege was used to satisfy the
 * request (obsoleted).  Returns 0 on success, or an errno on failure.
 */
int
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
    accmode_t accmode, struct ucred *cred, int *privused)
{
	accmode_t dac_granted;
	accmode_t priv_granted;

	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
	    ("invalid bit in accmode"));
	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
	    ("VAPPEND without VWRITE"));

	/*
	 * Look for a normal, non-privileged way to access the file/directory
	 * as requested.  If it exists, go with that.
	 */

	if (privused != NULL)
		*privused = 0;

	dac_granted = 0;

	/* Check the owner. */
	if (cred->cr_uid == file_uid) {
		dac_granted |= VADMIN;
		if (file_mode & S_IXUSR)
			dac_granted |= VEXEC;
		if (file_mode & S_IRUSR)
			dac_granted |= VREAD;
		if (file_mode & S_IWUSR)
			dac_granted |= (VWRITE | VAPPEND);

		if ((accmode & dac_granted) == accmode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check the groups (first match) */
	if (groupmember(file_gid, cred)) {
		if (file_mode & S_IXGRP)
			dac_granted |= VEXEC;
		if (file_mode & S_IRGRP)
			dac_granted |= VREAD;
		if (file_mode & S_IWGRP)
			dac_granted |= (VWRITE | VAPPEND);

		if ((accmode & dac_granted) == accmode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check everyone else. */
	if (file_mode & S_IXOTH)
		dac_granted |= VEXEC;
	if (file_mode & S_IROTH)
		dac_granted |= VREAD;
	if (file_mode & S_IWOTH)
		dac_granted |= (VWRITE | VAPPEND);
	if ((accmode & dac_granted) == accmode)
		return (0);

privcheck:
	/*
	 * Build a privilege mask to determine if the set of privileges
	 * satisfies the requirements when combined with the granted mask
	 * from above.  For each privilege, if the privilege is required,
	 * bitwise or the request type onto the priv_granted mask.
	 */
	priv_granted = 0;

	if (type == VDIR) {
		/*
		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
		 * requests, instead of PRIV_VFS_EXEC.
		 */
		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
		    !priv_check_cred(cred, PRIV_VFS_LOOKUP))
			priv_granted |= VEXEC;
	} else {
		/*
		 * Ensure that at least one execute bit is on.  Otherwise,
		 * a privileged user will always succeed, and we don't want
		 * this to happen unless the file really is executable.
		 */
		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
		    !priv_check_cred(cred, PRIV_VFS_EXEC))
			priv_granted |= VEXEC;
	}

	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
	    !priv_check_cred(cred, PRIV_VFS_READ))
		priv_granted |= VREAD;

	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
	    !priv_check_cred(cred, PRIV_VFS_WRITE))
		priv_granted |= (VWRITE | VAPPEND);

	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
	    !priv_check_cred(cred, PRIV_VFS_ADMIN))
		priv_granted |= VADMIN;

	if ((accmode & (priv_granted | dac_granted)) == accmode) {
		/* XXX audit: privilege used */
		if (privused != NULL)
			*privused = 1;
		return (0);
	}

	return ((accmode & VADMIN) ? EPERM : EACCES);
}
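/*
 * Example (illustrative sketch): a typical VOP_ACCESS implementation
 * maps its on-disk attributes into vaccess().  The "exfs" node type and
 * its ex_* fields below are hypothetical.
 *
 *	static int
 *	exfs_access(struct vop_access_args *ap)
 *	{
 *		struct exfs_node *ip = ap->a_vp->v_data;
 *
 *		return (vaccess(ap->a_vp->v_type, ip->ex_mode, ip->ex_uid,
 *		    ip->ex_gid, ap->a_accmode, ap->a_cred, NULL));
 *	}
 */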
/*
 * Credential check based on process requesting service, and per-attribute
 * permissions.
 */
int
extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
    struct thread *td, accmode_t accmode)
{

	/*
	 * Kernel-invoked always succeeds.
	 */
	if (cred == NOCRED)
		return (0);

	/*
	 * Do not allow privileged processes in jail to directly manipulate
	 * system attributes.
	 */
	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_SYSTEM:
		/* Potentially should be: return (EPERM); */
		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
	case EXTATTR_NAMESPACE_USER:
		return (VOP_ACCESS(vp, accmode, cred, td));
	default:
		return (EPERM);
	}
}

#ifdef DEBUG_VFS_LOCKS
/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 */
#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
	(vp)->v_type == VCHR || (vp)->v_type == VBAD)

int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;	/* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

int vfs_badlock_vnode = 1;	/* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
    0, "Print vnode details on lock violations");

#ifdef KDB
int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif

static void
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
{

#ifdef KDB
	if (vfs_badlock_backtrace)
		kdb_backtrace();
#endif
	if (vfs_badlock_vnode)
		vn_printf(vp, "vnode ");
	if (vfs_badlock_print)
		printf("%s: %p %s\n", str, (void *)vp, msg);
	if (vfs_badlock_ddb)
		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}

void
assert_vi_locked(struct vnode *vp, const char *str)
{

	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
		vfs_badlock("interlock is not locked but should be", str, vp);
}

void
assert_vi_unlocked(struct vnode *vp, const char *str)
{

	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
		vfs_badlock("interlock is locked but should not be", str, vp);
}

void
assert_vop_locked(struct vnode *vp, const char *str)
{
	int locked;

	if (!IGNORE_LOCK(vp)) {
		locked = VOP_ISLOCKED(vp);
		if (locked == 0 || locked == LK_EXCLOTHER)
			vfs_badlock("is not locked but should be", str, vp);
	}
}

void
assert_vop_unlocked(struct vnode *vp, const char *str)
{

	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
		vfs_badlock("is locked but should not be", str, vp);
}

void
assert_vop_elocked(struct vnode *vp, const char *str)
{

	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
		vfs_badlock("is not exclusive locked but should be", str, vp);
}
#endif /* DEBUG_VFS_LOCKS */
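/*
 * Example (illustrative sketch): filesystem code can state its locking
 * contract explicitly with the ASSERT_VOP_* wrappers around the asserts
 * above; without DEBUG_VFS_LOCKS they compile away.  The "exfs" name is
 * hypothetical.
 *
 *	static int
 *	exfs_fsync(struct vop_fsync_args *ap)
 *	{
 *
 *		ASSERT_VOP_ELOCKED(ap->a_vp, "exfs_fsync");
 *		return (vop_stdfsync(ap));
 *	}
 */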
void
vop_rename_fail(struct vop_rename_args *ap)
{

	if (ap->a_tvp != NULL)
		vput(ap->a_tvp);
	if (ap->a_tdvp == ap->a_tvp)
		vrele(ap->a_tdvp);
	else
		vput(ap->a_tdvp);
	vrele(ap->a_fdvp);
	vrele(ap->a_fvp);
}

void
vop_rename_pre(void *ap)
{
	struct vop_rename_args *a = ap;

#ifdef DEBUG_VFS_LOCKS
	if (a->a_tvp)
		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");

	/* Check the source (from). */
	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");

	/* Check the target. */
	if (a->a_tvp)
		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
	if (a->a_tdvp != a->a_fdvp)
		vhold(a->a_fdvp);
	if (a->a_tvp != a->a_fvp)
		vhold(a->a_fvp);
	vhold(a->a_tdvp);
	if (a->a_tvp)
		vhold(a->a_tvp);
}

#ifdef DEBUG_VFS_LOCKS
void
vop_strategy_pre(void *ap)
{
	struct vop_strategy_args *a;
	struct buf *bp;

	a = ap;
	bp = a->a_bp;

	/*
	 * Cluster ops lock their component buffers but not the IO container.
	 */
	if ((bp->b_flags & B_CLUSTER) != 0)
		return;

	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
		if (vfs_badlock_print)
			printf(
			    "VOP_STRATEGY: bp is not locked but should be\n");
		if (vfs_badlock_ddb)
			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
	}
}

void
vop_lock_pre(void *ap)
{
	struct vop_lock1_args *a = ap;

	if ((a->a_flags & LK_INTERLOCK) == 0)
		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	else
		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
}

void
vop_lock_post(void *ap, int rc)
{
	struct vop_lock1_args *a = ap;

	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
}

void
vop_unlock_pre(void *ap)
{
	struct vop_unlock_args *a = ap;

	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
}

void
vop_unlock_post(void *ap, int rc)
{
	return;
}

void
vop_need_inactive_pre(void *ap)
{
	struct vop_need_inactive_args *a = ap;

	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
}

void
vop_need_inactive_post(void *ap, int rc)
{
	struct vop_need_inactive_args *a = ap;

	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
}
#endif

void
vop_create_post(void *ap, int rc)
{
	struct vop_create_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}

void
vop_deleteextattr_post(void *ap, int rc)
{
	struct vop_deleteextattr_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}

void
vop_link_post(void *ap, int rc)
{
	struct vop_link_args *a = ap;

	if (!rc) {
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
	}
}

void
vop_mkdir_post(void *ap, int rc)
{
	struct vop_mkdir_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
}

void
vop_mknod_post(void *ap, int rc)
{
	struct vop_mknod_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}

void
vop_reclaim_post(void *ap, int rc)
{
	struct vop_reclaim_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
}

void
vop_remove_post(void *ap, int rc)
{
	struct vop_remove_args *a = ap;

	if (!rc) {
		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
	}
}

void
vop_rename_post(void *ap, int rc)
{
	struct vop_rename_args *a = ap;
	long hint;

	if (!rc) {
		hint = NOTE_WRITE;
		if (a->a_fdvp == a->a_tdvp) {
			if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
				hint |= NOTE_LINK;
			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
		} else {
			hint |= NOTE_EXTEND;
			if (a->a_fvp->v_type == VDIR)
				hint |= NOTE_LINK;
			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);

			if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
			    a->a_tvp->v_type == VDIR)
				hint &= ~NOTE_LINK;
			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
		}

		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
		if (a->a_tvp)
			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
	}
	if (a->a_tdvp != a->a_fdvp)
		vdrop(a->a_fdvp);
	if (a->a_tvp != a->a_fvp)
		vdrop(a->a_fvp);
	vdrop(a->a_tdvp);
	if (a->a_tvp)
		vdrop(a->a_tvp);
}

void
vop_rmdir_post(void *ap, int rc)
{
	struct vop_rmdir_args *a = ap;

	if (!rc) {
		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
	}
}

void
vop_setattr_post(void *ap, int rc)
{
	struct vop_setattr_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}

void
vop_setextattr_post(void *ap, int rc)
{
	struct vop_setextattr_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}

void
vop_symlink_post(void *ap, int rc)
{
	struct vop_symlink_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}

void
vop_open_post(void *ap, int rc)
{
	struct vop_open_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
}

void
vop_close_post(void *ap, int rc)
{
	struct vop_close_args *a = ap;

	if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
	    !VN_IS_DOOMED(a->a_vp))) {
		VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
		    NOTE_CLOSE_WRITE : NOTE_CLOSE);
	}
}

void
vop_read_post(void *ap, int rc)
{
	struct vop_read_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}

void
vop_readdir_post(void *ap, int rc)
{
	struct vop_readdir_args *a = ap;

	if (!rc)
		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}

static struct knlist fs_knlist;

static void
vfs_event_init(void *arg)
{
	knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);

void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{

	KNOTE_UNLOCKED(&fs_knlist, event);
}

static int	filt_fsattach(struct knote *kn);
static void	filt_fsdetach(struct knote *kn);
static int	filt_fsevent(struct knote *kn, long hint);

struct filterops fs_filtops = {
	.f_isfd = 0,
	.f_attach = filt_fsattach,
	.f_detach = filt_fsdetach,
	.f_event = filt_fsevent
};

static int
filt_fsattach(struct knote *kn)
{

	kn->kn_flags |= EV_CLEAR;
	knlist_add(&fs_knlist, kn, 0);
	return (0);
}

static void
filt_fsdetach(struct knote *kn)
{

	knlist_remove(&fs_knlist, kn, 0);
}

static int
filt_fsevent(struct knote *kn, long hint)
{

	kn->kn_fflags |= hint;
	return (kn->kn_fflags != 0);
}

static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
{
	struct vfsidctl vc;
	int error;
	struct mount *mp;

	error = SYSCTL_IN(req, &vc, sizeof(vc));
	if (error)
		return (error);
	if (vc.vc_vers != VFS_CTL_VERS1)
		return (EINVAL);
	mp = vfs_getvfs(&vc.vc_fsid);
	if (mp == NULL)
		return (ENOENT);
	/* ensure that a specific sysctl goes to the right filesystem. */
	if (strcmp(vc.vc_fstypename, "*") != 0 &&
	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
		vfs_rel(mp);
		return (EINVAL);
	}
	VCTLTOREQ(&vc, req);
	error = VFS_SYSCTL(mp, vc.vc_op, req);
	vfs_rel(mp);
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
    NULL, 0, sysctl_vfs_ctl, "",
    "Sysctl by fsid");
/*
 * Function to initialize a va_filerev field sensibly.
 * XXX: Wouldn't a random number make a lot more sense ??
 */
u_quad_t
init_va_filerev(void)
{
	struct bintime bt;

	getbinuptime(&bt);
	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
}

static int	filt_vfsread(struct knote *kn, long hint);
static int	filt_vfswrite(struct knote *kn, long hint);
static int	filt_vfsvnode(struct knote *kn, long hint);
static void	filt_vfsdetach(struct knote *kn);
static struct filterops vfsread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfsvnode
};

static void
vfs_knllock(void *arg)
{
	struct vnode *vp = arg;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}

static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp = arg;

	VOP_UNLOCK(vp);
}

static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp = arg;

	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}

static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp = arg;

	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}

int
vfs_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;
	struct knlist *knl;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &vfsread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &vfswrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &vfsvnode_filtops;
		break;
	default:
		return (EINVAL);
	}

	kn->kn_hook = (caddr_t)vp;

	v_addpollinfo(vp);
	if (vp->v_pollinfo == NULL)
		return (ENOMEM);
	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
	vhold(vp);
	knlist_add(knl, kn, 0);

	return (0);
}
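/*
 * Example (illustrative sketch): vfs_kqfilter() already matches the
 * VOP_KQFILTER signature, so a filesystem can get the stock kqueue
 * behaviour simply by pointing its vop vector at it.  The "exfs" vector
 * below is hypothetical.
 *
 *	static struct vop_vector exfs_vnodeops = {
 *		.vop_default =	&default_vnodeops,
 *		.vop_kqfilter =	vfs_kqfilter,
 *	};
 */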
/*
 * Detach knote from vnode
 */
static void
filt_vfsdetach(struct knote *kn)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;

	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
	vdrop(vp);
}

/*ARGSUSED*/
static int
filt_vfsread(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;
	struct vattr va;
	int res;

	/*
	 * filesystem is gone, so set the EOF flag and schedule
	 * the knote for deletion.
	 */
	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
		VI_LOCK(vp);
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		VI_UNLOCK(vp);
		return (1);
	}

	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
		return (0);

	VI_LOCK(vp);
	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
	VI_UNLOCK(vp);
	return (res);
}

/*ARGSUSED*/
static int
filt_vfswrite(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;

	VI_LOCK(vp);

	/*
	 * filesystem is gone, so set the EOF flag and schedule
	 * the knote for deletion.
	 */
	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);

	kn->kn_data = 0;
	VI_UNLOCK(vp);
	return (1);
}

static int
filt_vfsvnode(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;
	int res;

	VI_LOCK(vp);
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
		kn->kn_flags |= EV_EOF;
		VI_UNLOCK(vp);
		return (1);
	}
	res = (kn->kn_fflags != 0);
	VI_UNLOCK(vp);
	return (res);
}

/*
 * Returns whether the directory is empty or not.
 * If it is empty, the return value is 0; otherwise
 * the return value is an error value (which may
 * be ENOTEMPTY).
 */
int
vfs_emptydir(struct vnode *vp)
{
	struct uio uio;
	struct iovec iov;
	struct dirent *dirent, *dp, *endp;
	int error, eof;

	error = 0;
	eof = 0;

	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");

	dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
	iov.iov_base = dirent;
	iov.iov_len = sizeof(struct dirent);

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = sizeof(struct dirent);
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;

	while (eof == 0 && error == 0) {
		error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
		    NULL, NULL);
		if (error != 0)
			break;
		endp = (void *)((uint8_t *)dirent +
		    sizeof(struct dirent) - uio.uio_resid);
		for (dp = dirent; dp < endp;
		     dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
			if (dp->d_type == DT_WHT)
				continue;
			if (dp->d_namlen == 0)
				continue;
			if (dp->d_type != DT_DIR &&
			    dp->d_type != DT_UNKNOWN) {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_namlen > 2) {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_namlen == 1 &&
			    dp->d_name[0] != '.') {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_namlen == 2 &&
			    dp->d_name[1] != '.') {
				error = ENOTEMPTY;
				break;
			}
			uio.uio_resid = sizeof(struct dirent);
		}
	}
	free(dirent, M_TEMP);
	return (error);
}
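/*
 * Example (illustrative sketch): a caller that wants rmdir-like "is it
 * empty?" semantics on a locked directory vnode:
 *
 *	error = vfs_emptydir(dvp);
 *	if (error != 0)
 *		return (error);		(ENOTEMPTY or a read error)
 */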
int
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
{
	int error;

	if (dp->d_reclen > ap->a_uio->uio_resid)
		return (ENAMETOOLONG);
	error = uiomove(dp, dp->d_reclen, ap->a_uio);
	if (error) {
		if (ap->a_ncookies != NULL) {
			if (ap->a_cookies != NULL)
				free(ap->a_cookies, M_TEMP);
			ap->a_cookies = NULL;
			*ap->a_ncookies = 0;
		}
		return (error);
	}
	if (ap->a_ncookies == NULL)
		return (0);

	KASSERT(ap->a_cookies,
	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));

	*ap->a_cookies = realloc(*ap->a_cookies,
	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
	(*ap->a_cookies)[*ap->a_ncookies] = off;
	*ap->a_ncookies += 1;
	return (0);
}

/*
 * Mark for update the access time of the file if the filesystem
 * supports VOP_MARKATIME.  This functionality is used by execve and
 * mmap, so we want to avoid the I/O implied by directly setting
 * va_atime for the sake of efficiency.
 */
void
vfs_mark_atime(struct vnode *vp, struct ucred *cred)
{
	struct mount *mp;

	mp = vp->v_mount;
	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
		(void)VOP_MARKATIME(vp);
}

/*
 * The purpose of this routine is to remove granularity from accmode_t,
 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 * VADMIN and VAPPEND.
 *
 * If it returns 0, the caller is supposed to continue with the usual
 * access checks using 'accmode' as modified by this routine.  If it
 * returns nonzero value, the caller is supposed to return that value
 * as errno.
 *
 * Note that after this routine runs, accmode may be zero.
 */
int
vfs_unixify_accmode(accmode_t *accmode)
{
	/*
	 * There is no way to specify explicit "deny" rule using
	 * file mode or POSIX.1e ACLs.
	 */
	if (*accmode & VEXPLICIT_DENY) {
		*accmode = 0;
		return (0);
	}

	/*
	 * None of these can be translated into usual access bits.
	 * Also, the common case for NFSv4 ACLs is to not contain
	 * either of these bits.  Caller should check for VWRITE
	 * on the containing directory instead.
	 */
	if (*accmode & (VDELETE_CHILD | VDELETE))
		return (EPERM);

	if (*accmode & VADMIN_PERMS) {
		*accmode &= ~VADMIN_PERMS;
		*accmode |= VADMIN;
	}

	/*
	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
	 */
	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);

	return (0);
}
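/*
 * Example (illustrative sketch): a VOP_ACCESS implementation that only
 * understands the classic permission bits can reduce an NFSv4-style
 * accmode first and then fall through to vaccess().  The node_* names
 * below are hypothetical.
 *
 *	error = vfs_unixify_accmode(&accmode);
 *	if (error != 0)
 *		return (error);
 *	if (accmode == 0)
 *		return (0);
 *	return (vaccess(vp->v_type, node_mode, node_uid, node_gid,
 *	    accmode, cred, NULL));
 */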
/*
 * Clear out a doomed vnode (if any) and replace it with a new one as long
 * as the fs is not being unmounted. Return the root vnode to the caller.
 */
static int __noinline
vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

restart:
	if (mp->mnt_rootvnode != NULL) {
		MNT_ILOCK(mp);
		vp = mp->mnt_rootvnode;
		if (vp != NULL) {
			if (!VN_IS_DOOMED(vp)) {
				vrefact(vp);
				MNT_IUNLOCK(mp);
				error = vn_lock(vp, flags);
				if (error == 0) {
					*vpp = vp;
					return (0);
				}
				vrele(vp);
				goto restart;
			}
			/*
			 * Clear the old one.
			 */
			mp->mnt_rootvnode = NULL;
		}
		MNT_IUNLOCK(mp);
		if (vp != NULL) {
			/*
			 * Paired with a fence in vfs_op_thread_exit().
			 */
			atomic_thread_fence_acq();
			vfs_op_barrier_wait(mp);
			vrele(vp);
		}
	}
	error = VFS_CACHEDROOT(mp, flags, vpp);
	if (error != 0)
		return (error);
	if (mp->mnt_vfs_ops == 0) {
		MNT_ILOCK(mp);
		if (mp->mnt_vfs_ops != 0) {
			MNT_IUNLOCK(mp);
			return (0);
		}
		if (mp->mnt_rootvnode == NULL) {
			vrefact(*vpp);
			mp->mnt_rootvnode = *vpp;
		} else {
			if (mp->mnt_rootvnode != *vpp) {
				if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
					panic("%s: mismatch between vnode "
					    "returned by VFS_CACHEDROOT and "
					    "the one cached (%p != %p)",
					    __func__, *vpp, mp->mnt_rootvnode);
				}
			}
		}
		MNT_IUNLOCK(mp);
	}
	return (0);
}

int
vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	if (!vfs_op_thread_enter(mp))
		return (vfs_cache_root_fallback(mp, flags, vpp));
	vp = (struct vnode *)atomic_load_ptr(&mp->mnt_rootvnode);
	if (vp == NULL || VN_IS_DOOMED(vp)) {
		vfs_op_thread_exit(mp);
		return (vfs_cache_root_fallback(mp, flags, vpp));
	}
	vrefact(vp);
	vfs_op_thread_exit(mp);
	error = vn_lock(vp, flags);
	if (error != 0) {
		vrele(vp);
		return (vfs_cache_root_fallback(mp, flags, vpp));
	}
	*vpp = vp;
	return (0);
}

struct vnode *
vfs_cache_root_clear(struct mount *mp)
{
	struct vnode *vp;

	/*
	 * ops > 0 guarantees there is nobody who can see this vnode
	 */
	MPASS(mp->mnt_vfs_ops > 0);
	vp = mp->mnt_rootvnode;
	mp->mnt_rootvnode = NULL;
	return (vp);
}

void
vfs_cache_root_set(struct mount *mp, struct vnode *vp)
{

	MPASS(mp->mnt_vfs_ops > 0);
	vrefact(vp);
	mp->mnt_rootvnode = vp;
}

/*
 * These are helper functions for filesystems to traverse all
 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 *
 * This interface replaces MNT_VNODE_FOREACH.
 */
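/*
 * Example (illustrative sketch): the canonical iteration pattern.  The
 * loop body is entered with the vnode interlock held and must release it
 * on every path:
 *
 *	struct vnode *vp, *mvp;
 *	int nreg;
 *
 *	nreg = 0;
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type == VREG)
 *			nreg++;
 *		VI_UNLOCK(vp);
 *	}
 */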
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	if (should_yield())
		kern_yield(PRI_USER);
	MNT_ILOCK(mp);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		break;
	}
	if (vp == NULL) {
		__mnt_vnode_markerfree_all(mvp, mp);
		/* MNT_IUNLOCK(mp); -- done in above function */
		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
		return (NULL);
	}
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}

struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	*mvp = vn_alloc_marker(mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		break;
	}
	if (vp == NULL) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		vn_free_marker(*mvp);
		*mvp = NULL;
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}

void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL) {
		MNT_IUNLOCK(mp);
		return;
	}

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	vn_free_marker(*mvp);
	*mvp = NULL;
}

/*
 * These are helper functions for filesystems to traverse their
 * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
 */
static void
mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	vn_free_marker(*mvp);
	*mvp = NULL;
}

/*
 * Relock the mp mount vnode list lock with the vp vnode interlock in the
 * conventional lock order during mnt_vnode_next_active iteration.
 *
 * On entry, the mount vnode list lock is held and the vnode interlock is not.
 * The list lock is dropped and reacquired.  On success, both locks are held.
 * On failure, the mount vnode list lock is held but the vnode interlock is
 * not, and the procedure may have yielded.
 */
static bool
mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
    struct vnode *vp)
{
	const struct vnode *tmp;
	bool held, ret;

	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
	    TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
	    ("%s: bad marker", __func__));
	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
	    ("%s: inappropriate vnode", __func__));
	ASSERT_VI_UNLOCKED(vp, __func__);
	mtx_assert(&mp->mnt_listmtx, MA_OWNED);

	ret = false;

	TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
	TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);

	/*
	 * Use a hold to prevent vp from disappearing while the mount vnode
	 * list lock is dropped and reacquired.  Normally a hold would be
	 * acquired with vhold(), but that might try to acquire the vnode
	 * interlock, which would be a LOR with the mount vnode list lock.
	 */
	held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
	mtx_unlock(&mp->mnt_listmtx);
	if (!held)
		goto abort;
	VI_LOCK(vp);
	if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
		vdropl(vp);
		goto abort;
	}
	mtx_lock(&mp->mnt_listmtx);
	/*
	 * Determine whether the vnode is still the next one after the marker,
	 * excepting any other markers.  If the vnode has not been doomed by
	 * vgone() then the hold should have ensured that it remained on the
	 * active list.  If it has been doomed but is still on the active list,
	 * don't abort, but rather skip over it (avoid spinning on doomed
	 * vnodes).
	 */
	tmp = mvp;
	do {
		tmp = TAILQ_NEXT(tmp, v_actfreelist);
	} while (tmp != NULL && tmp->v_type == VMARKER);
	if (tmp != vp) {
		mtx_unlock(&mp->mnt_listmtx);
		VI_UNLOCK(vp);
		goto abort;
	}

	ret = true;
	goto out;
abort:
	maybe_yield();
	mtx_lock(&mp->mnt_listmtx);
out:
	if (ret)
		ASSERT_VI_LOCKED(vp, __func__);
	else
		ASSERT_VI_UNLOCKED(vp, __func__);
	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
	return (ret);
}

static struct vnode *
mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp, *nvp;

	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
	vp = TAILQ_NEXT(*mvp, v_actfreelist);
	while (vp != NULL) {
		if (vp->v_type == VMARKER) {
			vp = TAILQ_NEXT(vp, v_actfreelist);
			continue;
		}
		/*
		 * Try-lock because this is the wrong lock order.  If that does
		 * not succeed, drop the mount vnode list lock and try to
		 * reacquire it and the vnode interlock in the right order.
		 */
		if (!VI_TRYLOCK(vp) &&
		    !mnt_vnode_next_active_relock(*mvp, mp, vp))
			goto restart;
		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
		    ("alien vnode on the active list %p %p", vp, mp));
		if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
			break;
		nvp = TAILQ_NEXT(vp, v_actfreelist);
		VI_UNLOCK(vp);
		vp = nvp;
	}
	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);

	/* Check if we are done */
	if (vp == NULL) {
		mtx_unlock(&mp->mnt_listmtx);
		mnt_vnode_markerfree_active(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
	mtx_unlock(&mp->mnt_listmtx);
	ASSERT_VI_LOCKED(vp, "active iter");
	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
	return (vp);
}

struct vnode *
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{

	if (should_yield())
		kern_yield(PRI_USER);
	mtx_lock(&mp->mnt_listmtx);
	return (mnt_vnode_next_active(mvp, mp));
}

struct vnode *
__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	*mvp = vn_alloc_marker(mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);

	mtx_lock(&mp->mnt_listmtx);
	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
	if (vp == NULL) {
		mtx_unlock(&mp->mnt_listmtx);
		mnt_vnode_markerfree_active(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
	return (mnt_vnode_next_active(mvp, mp));
}

void
__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL)
		return;

	mtx_lock(&mp->mnt_listmtx);
	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
	mtx_unlock(&mp->mnt_listmtx);
	mnt_vnode_markerfree_active(mvp, mp);
}