1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS))
#error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS
#endif

#ifdef DDB
#include <ddb/ddb.h>
#endif

/* Forward declarations for helpers defined later in this file. */
static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_init_counters(struct vnode *);
static void	vn_seqc_init(struct vnode *);
static void	vn_seqc_write_end_free(struct vnode *vp);
static void	vgonel(struct vnode *);
static bool	vhold_recycle_free(struct vnode *);
static void	vdropl_recycle(struct vnode *vp);
static void	vdrop_recycle(struct vnode *vp);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_lock(void *arg, int what);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
		    daddr_t startlbn, daddr_t endlbn);
static void	vnlru_recalc(void);

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.
 * Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
186 */ 187 static long wantfreevnodes; 188 static long __exclusive_cache_line freevnodes; 189 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 190 &freevnodes, 0, "Number of \"free\" vnodes"); 191 static long freevnodes_old; 192 193 static counter_u64_t recycles_count; 194 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 195 "Number of vnodes recycled to meet vnode cache targets"); 196 197 static counter_u64_t recycles_free_count; 198 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, 199 "Number of free vnodes recycled to meet vnode cache targets"); 200 201 static counter_u64_t deferred_inact; 202 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 203 "Number of times inactive processing was deferred"); 204 205 /* To keep more than one thread at a time from running vfs_getnewfsid */ 206 static struct mtx mntid_mtx; 207 208 /* 209 * Lock for any access to the following: 210 * vnode_list 211 * numvnodes 212 * freevnodes 213 */ 214 static struct mtx __exclusive_cache_line vnode_list_mtx; 215 216 /* Publicly exported FS */ 217 struct nfs_public nfs_pub; 218 219 static uma_zone_t buf_trie_zone; 220 static smr_t buf_trie_smr; 221 222 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 223 static uma_zone_t vnode_zone; 224 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 225 226 __read_frequently smr_t vfs_smr; 227 228 /* 229 * The workitem queue. 230 * 231 * It is useful to delay writes of file data and filesystem metadata 232 * for tens of seconds so that quickly created and deleted files need 233 * not waste disk bandwidth being created and removed. To realize this, 234 * we append vnodes to a "workitem" queue. When running with a soft 235 * updates implementation, most pending metadata dependencies should 236 * not wait for more than a few seconds. Thus, mounted on block devices 237 * are delayed only about a half the time that file data is delayed. 
238 * Similarly, directory updates are more critical, so are only delayed 239 * about a third the time that file data is delayed. Thus, there are 240 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 241 * one each second (driven off the filesystem syncer process). The 242 * syncer_delayno variable indicates the next queue that is to be processed. 243 * Items that need to be processed soon are placed in this queue: 244 * 245 * syncer_workitem_pending[syncer_delayno] 246 * 247 * A delay of fifteen seconds is done by placing the request fifteen 248 * entries later in the queue: 249 * 250 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 251 * 252 */ 253 static int syncer_delayno; 254 static long syncer_mask; 255 LIST_HEAD(synclist, bufobj); 256 static struct synclist *syncer_workitem_pending; 257 /* 258 * The sync_mtx protects: 259 * bo->bo_synclist 260 * sync_vnode_count 261 * syncer_delayno 262 * syncer_state 263 * syncer_workitem_pending 264 * syncer_worklist_len 265 * rushjob 266 */ 267 static struct mtx sync_mtx; 268 static struct cv sync_wakeup; 269 270 #define SYNCER_MAXDELAY 32 271 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 272 static int syncdelay = 30; /* max time to delay syncing data */ 273 static int filedelay = 30; /* time to delay syncing files */ 274 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 275 "Time to delay syncing files (in seconds)"); 276 static int dirdelay = 29; /* time to delay syncing directories */ 277 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 278 "Time to delay syncing directories (in seconds)"); 279 static int metadelay = 28; /* time to delay syncing metadata */ 280 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 281 "Time to delay syncing metadata (in seconds)"); 282 static int rushjob; /* number of slots to run ASAP */ 283 static int stat_rush_requests; /* number of times I/O speeded up */ 284 SYSCTL_INT(_debug, 
OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 285 "Number of times I/O speeded up (rush requests)"); 286 287 #define VDBATCH_SIZE 8 288 struct vdbatch { 289 u_int index; 290 long freevnodes; 291 struct mtx lock; 292 struct vnode *tab[VDBATCH_SIZE]; 293 }; 294 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 295 296 static void vdbatch_dequeue(struct vnode *vp); 297 298 /* 299 * When shutting down the syncer, run it at four times normal speed. 300 */ 301 #define SYNCER_SHUTDOWN_SPEEDUP 4 302 static int sync_vnode_count; 303 static int syncer_worklist_len; 304 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 305 syncer_state; 306 307 /* Target for maximum number of vnodes. */ 308 u_long desiredvnodes; 309 static u_long gapvnodes; /* gap between wanted and desired */ 310 static u_long vhiwat; /* enough extras after expansion */ 311 static u_long vlowat; /* minimal extras before expansion */ 312 static u_long vstir; /* nonzero to stir non-free vnodes */ 313 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 314 315 static u_long vnlru_read_freevnodes(void); 316 317 /* 318 * Note that no attempt is made to sanitize these parameters. 319 */ 320 static int 321 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 322 { 323 u_long val; 324 int error; 325 326 val = desiredvnodes; 327 error = sysctl_handle_long(oidp, &val, 0, req); 328 if (error != 0 || req->newptr == NULL) 329 return (error); 330 331 if (val == desiredvnodes) 332 return (0); 333 mtx_lock(&vnode_list_mtx); 334 desiredvnodes = val; 335 wantfreevnodes = desiredvnodes / 4; 336 vnlru_recalc(); 337 mtx_unlock(&vnode_list_mtx); 338 /* 339 * XXX There is no protection against multiple threads changing 340 * desiredvnodes at the same time. Locking above only helps vnlru and 341 * getnewvnode. 
342 */ 343 vfs_hash_changesize(desiredvnodes); 344 cache_changesize(desiredvnodes); 345 return (0); 346 } 347 348 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 349 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 350 "LU", "Target for maximum number of vnodes"); 351 352 static int 353 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 354 { 355 u_long val; 356 int error; 357 358 val = wantfreevnodes; 359 error = sysctl_handle_long(oidp, &val, 0, req); 360 if (error != 0 || req->newptr == NULL) 361 return (error); 362 363 if (val == wantfreevnodes) 364 return (0); 365 mtx_lock(&vnode_list_mtx); 366 wantfreevnodes = val; 367 vnlru_recalc(); 368 mtx_unlock(&vnode_list_mtx); 369 return (0); 370 } 371 372 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 373 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 374 "LU", "Target for minimum number of \"free\" vnodes"); 375 376 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 377 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 378 static int vnlru_nowhere; 379 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS, 380 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 381 382 static int 383 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 384 { 385 struct vnode *vp; 386 struct nameidata nd; 387 char *buf; 388 unsigned long ndflags; 389 int error; 390 391 if (req->newptr == NULL) 392 return (EINVAL); 393 if (req->newlen >= PATH_MAX) 394 return (E2BIG); 395 396 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 397 error = SYSCTL_IN(req, buf, req->newlen); 398 if (error != 0) 399 goto out; 400 401 buf[req->newlen] = '\0'; 402 403 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 404 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 405 if ((error = namei(&nd)) != 0) 406 goto out; 407 vp = nd.ni_vp; 408 409 if (VN_IS_DOOMED(vp)) { 410 /* 411 * This vnode is being recycled. Return != 0 to let the caller 412 * know that the sysctl had no effect. 
Return EAGAIN because a 413 * subsequent call will likely succeed (since namei will create 414 * a new vnode if necessary) 415 */ 416 error = EAGAIN; 417 goto putvnode; 418 } 419 420 counter_u64_add(recycles_count, 1); 421 vgone(vp); 422 putvnode: 423 vput(vp); 424 NDFREE_PNBUF(&nd); 425 out: 426 free(buf, M_TEMP); 427 return (error); 428 } 429 430 static int 431 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 432 { 433 struct thread *td = curthread; 434 struct vnode *vp; 435 struct file *fp; 436 int error; 437 int fd; 438 439 if (req->newptr == NULL) 440 return (EBADF); 441 442 error = sysctl_handle_int(oidp, &fd, 0, req); 443 if (error != 0) 444 return (error); 445 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 446 if (error != 0) 447 return (error); 448 vp = fp->f_vnode; 449 450 error = vn_lock(vp, LK_EXCLUSIVE); 451 if (error != 0) 452 goto drop; 453 454 counter_u64_add(recycles_count, 1); 455 vgone(vp); 456 VOP_UNLOCK(vp); 457 drop: 458 fdrop(fp, td); 459 return (error); 460 } 461 462 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 463 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 464 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 465 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 466 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 467 sysctl_ftry_reclaim_vnode, "I", 468 "Try to reclaim a vnode by its file descriptor"); 469 470 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 471 #define vnsz2log 8 472 #ifndef DEBUG_LOCKS 473 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 474 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 475 "vnsz2log needs to be updated"); 476 #endif 477 478 /* 479 * Support for the bufobj clean & dirty pctrie. 
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{
	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{
	uma_zfree_smr(buf_trie_zone, node);
}
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
    buf_trie_smr);

/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
#endif

static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

/*
 * Allocate a marker vnode (type VMARKER) associated with mount point mp.
 */
static struct vnode *
vn_alloc_marker(struct mount *mp)
{
	struct vnode *vp;

	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	vp->v_type = VMARKER;
	vp->v_mount = mp;

	return (vp);
}

/*
 * Release a marker vnode previously obtained from vn_alloc_marker().
 */
static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}

#ifdef KASAN
static int
vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
{
	kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
	return (0);
}

static void
vnode_dtor(void *mem, int size, void *arg __unused)
{
	size_t end1, end2, off1, off2;

	_Static_assert(offsetof(struct vnode, v_vnodelist) <
	    offsetof(struct vnode, v_dbatchcpu),
	    "KASAN marks require updating");

	off1 = offsetof(struct vnode, v_vnodelist);
	off2 = offsetof(struct vnode, v_dbatchcpu);
	end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
	end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);

	/*
	 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even
	 * after the vnode has been freed.  Try to get some KASAN coverage by
	 * marking everything except those two fields as invalid.  Because
	 * KASAN's tracking is not byte-granular, any preceding fields sharing
	 * the same 8-byte aligned word must also be marked valid.
	 */

	/* Handle the area from the start until v_vnodelist... */
	off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
	kasan_mark(mem, off1, off1, KASAN_UMA_FREED);

	/* ... then the area between v_vnodelist and v_dbatchcpu ... */
	off1 = roundup2(end1, KASAN_SHADOW_SCALE);
	off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
	if (off2 > off1)
		kasan_mark((void *)((char *)mem + off1), off2 - off1,
		    off2 - off1, KASAN_UMA_FREED);

	/* ... and finally the area from v_dbatchcpu to the end. */
	off2 = roundup2(end2, KASAN_SHADOW_SCALE);
	kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
	    KASAN_UMA_FREED);
}
#endif /* KASAN */

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	cache_vnode_init(vp);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);

	vp->v_dbatchcpu = NOCPU;

	vp->v_state = VSTATE_DEAD;

	/*
	 * Check vhold_recycle_free for an explanation.
	 */
	vp->v_holdcnt = VHOLD_NO_SMR;
	vp->v_type = VNON;
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));

	kasan_mark(mem, size, size, 0);
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems may use bigger or smaller (like UFS and ZFS)
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define	NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ	148
#else
#define	NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ	92
#endif

/*
 * Boot-time initialization of the vnode subsystem: sizing of the vnode
 * cache, creation of the vnode and buf-trie UMA zones, and setup of the
 * syncer work queues and per-CPU vdbatch structures.
 */
static void
vntblinit(void *dummy __unused)
{
	struct vdbatch *vd;
	uma_ctor ctor;
	uma_dtor dtor;
	int cpu, physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the
	 * marginal ratio of desiredvnodes to the physical memory size is
	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
	 * size.  The memory required by desiredvnodes vnodes and vm objects
	 * must not exceed 1/10th of the kernel's heap size.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %lu -> %lu\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_list);
	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
	/*
	 * The lock is taken to appease WITNESS.
	 */
	mtx_lock(&vnode_list_mtx);
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	vnode_list_free_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);

#ifdef KASAN
	ctor = vnode_ctor;
	dtor = vnode_dtor;
#else
	ctor = NULL;
	dtor = NULL;
#endif
	vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
	uma_zone_set_smr(vnode_zone, vfs_smr);

	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	recycles_free_count = counter_u64_alloc(M_WAITOK);
	deferred_inact = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Eventually, mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs			var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	    |
 *	    +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  1. VOP_LOOKUP() obtains B while A is held
 *  2. vfs_busy() obtains a shared lock on F while A and B are held
 *  3.
 *     vput() releases lock on B
 *  4. vput() releases lock on A
 *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 *  6. vfs_unbusy() releases shared lock on F
 *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *     Attempt to lock A (instead of vp_crossmp) while D is held would
 *     violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.  Note that for stacked
 * filesystems, D and B in the example above may be the same lock,
 * which introduces potential lock order reversal deadlock between
 * dounmount() and step 5 above.  These filesystems may avoid the LOR
 * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 * remain held until after step 5.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	struct mount_pcpu *mpcpu;

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	/*
	 * Fast path: when no unmount is in progress the reference and
	 * lockref counts can be taken on the per-CPU counters without
	 * the mount interlock.
	 */
	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.  If thread doing the unmounting fails,
	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
	 * that this mount point has survived the unmount attempt and vfs_busy
	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
	 * about to be really destroyed.  vfs_busy needs to release its
	 * reference on the mount point in this case and return with ENOENT,
	 * telling the caller the mount it tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
		    ("%s: non-empty upper mount list with pending unmount",
		    __func__));
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		/* Drop mountlist_mtx across the sleep to keep lock order. */
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	/* Fast path mirroring the per-CPU count taken in vfs_busy(). */
	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	/* Wake a draining unmounter once the last lockref is gone. */
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			/* Return a referenced mount; caller does vfs_rel(). */
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, using
 * the fact that struct mount's are never freed.  In worst case we may
 * get pointer to unmounted or even different filesystem, so we have to
 * check what we got, and go slow way if so.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	/* Re-check after busying: the cache entry may have been stale. */
	if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if (jailed(td->td_ucred)) {
		/*
		 * If the jail of the calling thread lacks permission for
		 * this type of file system, deny immediately.
		 */
		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
			return (EPERM);

		/*
		 * If the file system was mounted outside the jail of the
		 * calling thread, deny immediately.
		 */
		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
			return (EPERM);
	}

	/*
	 * If file system supports delegated administration, we don't check
	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
	 * by the file system itself.
	 * If this is not the user that did original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;	/* next id to try; protected by mntid_mtx */
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		/*
		 * Pack the filesystem type into the top byte and split the
		 * 16-bit mntid across two byte fields of the minor number,
		 * then probe for collisions with existing mounts.
		 */
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		/* Collision: drop the reference vfs_getvfs() took, retry. */
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes.
It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.   It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *			 entries if this argument is true
 * @param trigger	 Only reclaim vnodes with fewer than this many resident
 *			 pages.
 * @param target	 How many vnodes to reclaim.
 * @return		 The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	/*
	 * The marker records our position in the global vnode list across
	 * drops of vnode_list_mtx; we always resume scanning after it.
	 */
	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			goto next_iter;
		}

		/*
		 * Handle races against vnode allocation. Filesystems lock the
		 * vnode some time after it gets returned from getnewvnode,
		 * despite type and hold count being manipulated earlier.
		 * Resorting to checking v_mount restores guarantees present
		 * before the global list was reworked to contain all vnodes.
		 */
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		if (vp->v_mount == NULL) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		vholdl(vp);
		VI_UNLOCK(vp);
		/* Requeue the marker after vp so we resume past it. */
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			vdrop_recycle(vp);
			goto next_iter_unlocked;
		}
		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
			vdrop_recycle(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}

		/*
		 * Re-check eligibility now that the vnode lock is held; the
		 * vnode may have gained users or pages in the meantime.
		 */
		VI_LOCK(vp);
		if (vp->v_usecount > 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL && vp->v_object->handle == vp &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl_recycle(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl_recycle(vp);
		vn_finished_write(mp);
		done++;
next_iter_unlocked:
		maybe_yield();
		mtx_lock(&vnode_list_mtx);
		goto restart;
next_iter:
		MPASS(vp->v_type != VMARKER);
		if (!should_yield())
			continue;
		/* Yield the CPU without losing our scan position. */
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	/* One retry from the head of the list if nothing was reclaimed. */
	if (done == 0 && !retried) {
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
		retried = true;
		goto restart;
	}
	return (done);
}

static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
    0,
    "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to reduce the free list by the requested amount.
 */
static int
vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
{
	struct vnode *vp;
	struct mount *mp;
	int ocount;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (count > max_vnlru_free)
		count = max_vnlru_free;
	ocount = count;
	vp = mvp;
	for (;;) {
		if (count == 0) {
			break;
		}
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL)) {
			/* End of list: park the marker at the tail. */
			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
			break;
		}
		if (__predict_false(vp->v_type == VMARKER))
			continue;
		if (vp->v_holdcnt > 0)
			continue;
		/*
		 * Don't recycle if our vnode is from different type
		 * of mount point.  Note that mp is type-safe, the
		 * check does not reach unmapped address even if
		 * vnode is reclaimed.
		 */
		if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) {
			continue;
		}
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			continue;
		}
		if (!vhold_recycle_free(vp))
			continue;
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		/*
		 * FIXME: ignores the return value, meaning it may be nothing
		 * got recycled but it claims otherwise to the caller.
		 *
		 * Originally the value started being ignored in 2005 with
		 * 114a1006a8204aa156e1f9ad6476cdff89cada7f .
		 *
		 * Respecting the value can run into significant stalls if most
		 * vnodes belong to one file system and it has writes
		 * suspended.  In presence of many threads and millions of
		 * vnodes they keep contending on the vnode_list_mtx lock only
		 * to find vnodes they can't recycle.
		 *
		 * The solution would be to pre-check if the vnode is likely to
		 * be recycle-able, but it needs to happen with the
		 * vnode_list_mtx lock held.  This runs into a problem where
		 * VOP_GETWRITEMOUNT (currently needed to find out about if
		 * writes are frozen) can take locks which LOR against it.
		 *
		 * Check nullfs for one example (null_getwritemount).
		 */
		vtryrecycle(vp);
		count--;
		mtx_lock(&vnode_list_mtx);
		vp = mvp;
	}
	return (ocount - count);
}

/*
 * Free vnodes from the global free list.  Caller must hold vnode_list_mtx.
 */
static int
vnlru_free_locked(int count)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
}

/*
 * Free vnodes belonging to mounts with the given vfsops, scanning from a
 * caller-supplied marker (see vnlru_alloc_marker()).
 */
void
vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
{

	MPASS(mnt_op != NULL);
	MPASS(mvp != NULL);
	VNPASS(mvp->v_type == VMARKER, mvp);
	mtx_lock(&vnode_list_mtx);
	vnlru_free_impl(count, mnt_op, mvp);
	mtx_unlock(&vnode_list_mtx);
}

/*
 * Allocate a marker vnode and link it into the global vnode list, for use
 * with vnlru_free_vfsops().
 */
struct vnode *
vnlru_alloc_marker(void)
{
	struct vnode *mvp;

	mvp = vn_alloc_marker(NULL);
	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (mvp);
}

/*
 * Unlink and free a marker allocated by vnlru_alloc_marker().
 */
void
vnlru_free_marker(struct vnode *mvp)
{
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	vn_free_marker(mvp);
}

/*
 * Recompute the gapvnodes/vhiwat/vlowat watermarks after desiredvnodes or
 * wantfreevnodes changed.  Caller must hold vnode_list_mtx.
 */
static void
vnlru_recalc(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrurecycle() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

/*
 * The main freevnodes counter is only updated when threads requeue their vnode
 * batches. CPUs are conditionally walked to compute a more accurate total.
 *
 * Limit how much of a slop are we willing to tolerate.
Note: the actual value
 * at any given moment can still exceed slop, but it should not be by significant
 * margin in practice.
 */
#define	VNLRU_FREEVNODES_SLOP	128

/*
 * Bump the per-CPU batched free-vnode count; folded into the global
 * freevnodes total by vnlru_read_freevnodes().
 */
static __inline void
vfs_freevnodes_inc(void)
{
	struct vdbatch *vd;

	critical_enter();
	vd = DPCPU_PTR(vd);
	vd->freevnodes++;
	critical_exit();
}

/*
 * Decrement the per-CPU batched free-vnode count.
 */
static __inline void
vfs_freevnodes_dec(void)
{
	struct vdbatch *vd;

	critical_enter();
	vd = DPCPU_PTR(vd);
	vd->freevnodes--;
	critical_exit();
}

/*
 * Return an estimate of the number of free vnodes.  Only walks the per-CPU
 * deltas when the global counter has drifted by more than
 * VNLRU_FREEVNODES_SLOP since the last walk.  Caller holds vnode_list_mtx.
 */
static u_long
vnlru_read_freevnodes(void)
{
	struct vdbatch *vd;
	long slop;
	int cpu;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (freevnodes > freevnodes_old)
		slop = freevnodes - freevnodes_old;
	else
		slop = freevnodes_old - freevnodes;
	if (slop < VNLRU_FREEVNODES_SLOP)
		return (freevnodes >= 0 ? freevnodes : 0);
	freevnodes_old = freevnodes;
	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		freevnodes_old += vd->freevnodes;
	}
	return (freevnodes_old >= 0 ? freevnodes_old : 0);
}

/*
 * Is the vnode cache within 'limit' vnodes of its configured capacity?
 * Free vnodes above wantfreevnodes count as available space.
 */
static bool
vnlru_under(u_long rnumvnodes, u_long limit)
{
	u_long rfreevnodes, space;

	if (__predict_false(rnumvnodes > desiredvnodes))
		return (true);

	space = desiredvnodes - rnumvnodes;
	if (space < limit) {
		rfreevnodes = vnlru_read_freevnodes();
		if (rfreevnodes > wantfreevnodes)
			space += rfreevnodes - wantfreevnodes;
	}
	return (space < limit);
}

/*
 * Lock-free variant of vnlru_under() using only the (possibly stale)
 * global freevnodes counter.
 */
static bool
vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
{
	long rfreevnodes, space;

	if (__predict_false(rnumvnodes > desiredvnodes))
		return (true);

	space = desiredvnodes - rnumvnodes;
	if (space < limit) {
		rfreevnodes = atomic_load_long(&freevnodes);
		if (rfreevnodes > wantfreevnodes)
			space += rfreevnodes - wantfreevnodes;
	}
	return (space < limit);
}

/*
 * Wake up the vnlru kernel process if it is not already signalled.
 * Caller holds vnode_list_mtx.
 */
static void
vnlru_kick(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		wakeup(vnlruproc);
	}
}

/*
 * Main loop of the vnlru kernel process: reclaims vnodes when the cache
 * is short on space, escalating 'force' (0..3) as attempts fail.
 */
static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes, target;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes;
	bool reclaim_nc_src, want_reread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	want_reread = false;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);

		if (want_reread) {
			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
			want_reread = false;
		}

		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
		 */
		if (rnumvnodes > desiredvnodes) {
			vnlru_free_locked(rnumvnodes - desiredvnodes);
			rnumvnodes = atomic_load_long(&numvnodes);
		}
		/*
		 * Sleep if the vnode cache is in a good state.  This is
		 * when it is not over-full and has space for about a 4%
		 * or 9% expansion (by growing its size or inexcessively
		 * reducing its free list).  Otherwise, try to reclaim
		 * space for a 10% expansion.
		 */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		rfreevnodes = vnlru_read_freevnodes();

		onumvnodes = rnumvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (rnumvnodes <= desiredvnodes)
			usevnodes = rnumvnodes - rfreevnodes;
		else
			usevnodes = rnumvnodes;
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to give a conservatively
		 * large value to ensure that it alone doesn't prevent
		 * making progress.  The value can easily be so large that
		 * it is effectively infinite in some congested and
		 * misconfigured cases, and this is necessary.  Normally
		 * it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
		target = target / 10 + 1;
		done = vlrureclaim(reclaim_nc_src, trigger, target);
		mtx_unlock(&vnode_list_mtx);
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim(UMA_RECLAIM_DRAIN);
		if (done == 0) {
			/* Nothing reclaimed: escalate, then back off. */
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			want_reread = true;
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else {
			want_reread = true;
			kern_yield(PRI_USER);
		}
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		vdrop_recycle(vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		vdrop_recycle(vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp);
		vdropl_recycle(vp);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if (!VN_IS_DOOMED(vp)) {
		counter_u64_add(recycles_free_count, 1);
		vgonel(vp);
	}
	VOP_UNLOCK(vp);
	vdropl_recycle(vp);
	vn_finished_write(vnmp);
	return (0);
}

/*
 * Allocate a new vnode.
 *
 * The operation never returns an error. Returning an error was disabled
 * in r145385 (dated 2005) with the following comment:
 *
 * XXX Not all VFS_VGET/ffs_vget callers check returns.
 *
 * Given the age of this commit (almost 15 years at the time of writing this
 * comment) restoring the ability to fail requires a significant audit of
 * all codepaths.
 *
 * The routine can try to free a vnode or stall for up to 1 second waiting for
 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
 */
static u_long vn_alloc_cyclecount;

/*
 * Slow path of vn_alloc(): the cache is at or near capacity.  Try to free
 * one vnode, possibly sleeping for vnlru, before allocating anyway.
 */
static struct vnode * __noinline
vn_alloc_hard(struct mount *mp)
{
	u_long rnumvnodes, rfreevnodes;

	mtx_lock(&vnode_list_mtx);
	rnumvnodes = atomic_load_long(&numvnodes);
	if (rnumvnodes + 1 < desiredvnodes) {
		vn_alloc_cyclecount = 0;
		goto alloc;
	}
	rfreevnodes = vnlru_read_freevnodes();
	if (vn_alloc_cyclecount++ >= rfreevnodes) {
		vn_alloc_cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (vnlru_free_locked(1) > 0)
		goto alloc;
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		/*
		 * Wait for space for a new vnode.
		 */
		vnlru_kick();
		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
		    vnlru_read_freevnodes() > 1)
			vnlru_free_locked(1);
	}
alloc:
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (vnlru_under(rnumvnodes, vlowat))
		vnlru_kick();
	mtx_unlock(&vnode_list_mtx);
	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}

/*
 * Allocate a vnode, taking the lock-free fast path when the cache has
 * plenty of space and falling back to vn_alloc_hard() otherwise.
 */
static struct vnode *
vn_alloc(struct mount *mp)
{
	u_long rnumvnodes;

	if (__predict_false(vn_alloc_cyclecount != 0))
		return (vn_alloc_hard(mp));
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
		/* Undo the optimistic increment; the slow path redoes it. */
		atomic_subtract_long(&numvnodes, 1);
		return (vn_alloc_hard(mp));
	}

	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}

/*
 * Release a vnode's memory back to the zone and drop the global count.
 */
static void
vn_free(struct vnode *vp)
{

	atomic_subtract_long(&numvnodes, 1);
	uma_zfree_smr(vnode_zone, vp);
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

	KASSERT(vops->registered,
	    ("%s: not registered vector op %p\n", __func__, vops));

	/* Use a vnode pre-allocated by getnewvnode_reserve(), if any. */
	td = curthread;
	if (td->td_vp_reserved != NULL) {
		vp = td->td_vp_reserved;
		td->td_vp_reserved = NULL;
	} else {
		vp = vn_alloc(mp);
	}
	counter_u64_add(vnodes_created, 1);

	vn_set_state(vp, VSTATE_UNINITIALIZED);

	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name. Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another. We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 *
	 * Note the braces below are balanced across the #ifdef/#endif
	 * pairs: without WITNESS the conditional and its closing brace
	 * both compile away, leaving just the assignment.
	 */
	lo = &vp->v_vnlock->lock_object;
#ifdef WITNESS
	if (lo->lo_name != tag) {
#endif
		lo->lo_name = tag;
#ifdef WITNESS
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
#endif
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
	 */
	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
	vp->v_type = VNON;
	vp->v_op = vops;
	vp->v_irflag = 0;
	v_init_counters(vp);
	vn_seqc_init(vp);
	vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
	if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
#endif
	if (mp != NULL) {
		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
	}

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}

/*
 * Pre-allocate a vnode for the current thread, to be consumed by a later
 * getnewvnode() call in a context where allocation stalls are undesirable.
 */
void
getnewvnode_reserve(void)
{
	struct thread *td;

	td = curthread;
	MPASS(td->td_vp_reserved == NULL);
	td->td_vp_reserved = vn_alloc(NULL);
}

/*
 * Release an unused vnode reservation made by getnewvnode_reserve().
 */
void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	if (td->td_vp_reserved != NULL) {
		vn_free(td->td_vp_reserved);
		td->td_vp_reserved = NULL;
	}
}

/*
 * Destroy a doomed vnode: verify it carries no residual state, detach any
 * remaining subsystem data, and return the memory to the zone.
 * Called with the vnode interlock held; drops it.
 */
static void __noinline
freevnode(struct vnode *vp)
{
	struct bufobj *bo;

	/*
	 * The vnode has been marked for destruction, so free it.
	 *
	 * The vnode will be returned to the zone where it will
	 * normally remain until it is needed for another vnode. We
	 * need to cleanup (or verify that the cleanup has already
	 * been done) any residual data left from its current use
	 * so as not to contaminate the freshly allocated vnode.
	 */
	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
	/*
	 * Paired with vgone.
	 */
	vn_seqc_write_end_free(vp);

	bo = &vp->v_bufobj;
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
	    ("clean blk trie not empty"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
	    ("dirty blk trie not empty"));
	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
	    ("Dangling rangelock waiters"));
	VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
	    ("Leaked inactivation"));
	VI_UNLOCK(vp);
	cache_assert_no_entries(vp);

#ifdef MAC
	mac_vnode_destroy(vp);
#endif
	if (vp->v_pollinfo != NULL) {
		/*
		 * Use LK_NOWAIT to shut up witness about the lock. We may get
		 * here while having another vnode locked when trying to
		 * satisfy a lookup and needing to recycle.
		 */
		VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT);
		destroy_vpollinfo(vp->v_pollinfo);
		VOP_UNLOCK(vp);
		vp->v_pollinfo = NULL;
	}
	vp->v_mountedhere = NULL;
	vp->v_unpcb = NULL;
	vp->v_rdev = NULL;
	vp->v_fifoinfo = NULL;
	vp->v_iflag = 0;
	vp->v_vflag = 0;
	bo->bo_flag = 0;
	vn_free(vp);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);

	mp = vp->v_mount;
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	vp->v_mount = NULL;
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	/*
	 * The caller expects the interlock to be still held.
	 */
	ASSERT_VI_LOCKED(vp, __func__);
}

/*
 * Common implementation for insmntque()/insmntque1(): link vp onto mp's
 * vnode list unless the mount is being (forcibly) unmounted.  When 'dtr'
 * is true, the vnode is destroyed on failure.  Returns 0 or EBUSY.
 */
static int
insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr)
{

	KASSERT(vp->v_mount == NULL,
	    ("insmntque: vnode already on per mount vnode list"));
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) {
		ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
	} else {
		KASSERT(!dtr,
		    ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup",
		    __func__));
	}

	/*
	 * We acquire the vnode interlock early to ensure that the
	 * vnode cannot be recycled by another process releasing a
	 * holdcnt on it before we get it on both the vnode list
	 * and the active vnode list. The mount mutex protects only
	 * manipulation of the vnode list and the vnode freelist
	 * mutex protects only manipulation of the active vnode list.
	 * Hence the need to hold the vnode interlock throughout.
	 */
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
	    mp->mnt_nvnodelistsize == 0)) &&
	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
		VI_UNLOCK(vp);
		MNT_IUNLOCK(mp);
		if (dtr) {
			/* Reclaim the vnode on behalf of the caller. */
			vp->v_data = NULL;
			vp->v_op = &dead_vnodeops;
			vgone(vp);
			vput(vp);
		}
		return (EBUSY);
	}
	vp->v_mount = mp;
	MNT_REF(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
	    ("neg mount point vnode list size"));
	mp->mnt_nvnodelistsize++;
	VI_UNLOCK(vp);
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
 * insmntque() reclaims the vnode on insertion failure, insmntque1()
 * leaves handling of the vnode to the caller.
 */
int
insmntque(struct vnode *vp, struct mount *mp)
{
	return (insmntque1_int(vp, mp, true));
}

int
insmntque1(struct vnode *vp, struct mount *mp)
{
	return (insmntque1_int(vp, mp, false));
}

/*
 * Flush out and invalidate all buffers associated with a bufobj
 * Called with the underlying object locked.
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		/* Wait out pending writes, then sync dirty buffers to disk. */
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			/* ERELOOKUP asks us to retry the sync from scratch. */
			do {
				error = BO_SYNC(bo, MNT_WAIT);
			} while (error == ERELOOKUP);
			if (error != 0)
				return (error);
			BO_LOCK(bo);
			/*
			 * The sync dropped the lock; anything dirtied or
			 * started in the meantime means we cannot guarantee
			 * a clean flush.
			 */
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
				BO_UNLOCK(bo);
				return (EBUSY);
			}
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist. Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0 && !(flags & V_CLEANONLY))
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);	/* EAGAIN: list changed under us, rescan. */

	/*
	 * Wait for I/O to complete. XXX needs cleaning up. The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
			BO_UNLOCK(bo);
			vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
			BO_LOCK(bo);
		}
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL &&
	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
		VM_OBJECT_WLOCK(bo->bo_object);
		/* With V_SAVE the data was just synced; only drop clean pages. */
		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
		    OBJPR_CLEANONLY : 0);
		VM_OBJECT_WUNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
	    V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
	    bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
	    bo->bo_dirty.bv_cnt > 0)
		panic("vinvalbuf: flush dirty failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{

	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	/*
	 * If the VM object is not backed by this vnode there is nothing
	 * for us to invalidate here.
	 */
	if (vp->v_object != NULL && vp->v_object->handle != vp)
		return (0);
	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}

/*
 * Flush out buffers on the specified list.
 *
 * Returns 0 when the list was fully traversed, EAGAIN when the caller
 * must rescan (the bufobj lock was dropped and the list may have
 * changed), or another errno from BUF_TIMELOCK.
 */
static int
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
    int slptimeo)
{
	struct buf *bp, *nbp;
	int retval, error;
	daddr_t lblkno;
	b_xflags_t xflags;

	ASSERT_BO_WLOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		/*
		 * If we are flushing both V_NORMAL and V_ALT buffers then
		 * do not skip any buffers. If we are flushing only V_NORMAL
		 * buffers then skip buffers marked as BX_ALTDATA. If we are
		 * flushing only V_ALT buffers then skip buffers not marked
		 * as BX_ALTDATA.
		 */
		if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
		    (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
			continue;
		}
		/*
		 * Remember how to revalidate the successor after we drop
		 * the bufobj lock in BUF_TIMELOCK below.
		 */
		if (nbp != NULL) {
			lblkno = nbp->b_lblkno;
			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
		}
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			/* ENOLCK just means "retry the scan". */
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
		if (nbp == NULL)
			break;
		/* Revalidate the saved successor; bail if it moved lists. */
		nbp = gbincore(bo, lblkno);
		if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		    != xflags)
			break;			/* nbp invalid */
	}
	return (retval);
}

/*
 * Release buffers in [startn, endn) with a B_NOREUSE hint, telling the
 * buffer cache the backing pages are unlikely to be reused soon.
 */
int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
	struct buf *bp;
	int error;
	daddr_t lblkno;

	ASSERT_BO_LOCKED(bo);

	for (lblkno = startn;;) {
again:
		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
		if (bp == NULL || bp->b_lblkno >= endn ||
		    bp->b_lblkno < startn)
			break;
		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
		if (error != 0) {
			BO_RLOCK(bo);
			if (error == ENOLCK)
				goto again;
			return (error);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		lblkno = bp->b_lblkno + 1;
		if ((bp->b_flags & B_MANAGED) == 0)
			bremfree(bp);
		bp->b_flags |= B_RELBUF;
		/*
		 * In the VMIO case, use the B_NOREUSE flag to hint that the
		 * pages backing each buffer in the range are unlikely to be
		 * reused. Dirty buffers will have the hint applied once
		 * they've been written.
		 */
		if ((bp->b_flags & B_VMIO) != 0)
			bp->b_flags |= B_NOREUSE;
		brelse(bp);
		BO_RLOCK(bo);
	}
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length. This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
	struct buf *bp, *nbp;
	struct bufobj *bo;
	daddr_t startlbn;

	CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
	    vp, blksize, (uintmax_t)length);

	/*
	 * Round up to the *next* lbn.
	 */
	startlbn = howmany(length, blksize);

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");

	bo = &vp->v_bufobj;
restart_unlocked:
	BO_LOCK(bo);

	/* Invalidate everything at or past the new end of file. */
	while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
		;

	if (length > 0) {
		/*
		 * Write out any dirty buffers with non-positive logical
		 * block numbers (metadata such as indirect blocks use
		 * negative lblkno values), restarting the scan after each
		 * write since the lock is dropped.
		 */
restartsync:
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK)
				goto restart_unlocked;

			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			BO_LOCK(bo);
			goto restartsync;
		}
	}

	bufobj_wwait(bo, 0, 0);
	BO_UNLOCK(bo);
	/* Shrink (or grow) the VM object to match the new length. */
	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Invalidate the cached pages of a file's buffer within the range of block
 * numbers [startlbn, endlbn).
 */
void
v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
    int blksize)
{
	struct bufobj *bo;
	off_t start, end;

	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");

	start = blksize * startlbn;
	end = blksize * endlbn;

	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	MPASS(blksize == bo->bo_bsize);

	/* EAGAIN means the lists changed while the lock was dropped; rescan. */
	while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
		;

	BO_UNLOCK(bo);
	/* Also drop the VM pages backing the byte range. */
	vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
}

/*
 * Invalidate all clean and dirty buffers in [startlbn, endlbn) with the
 * bufobj lock held.  Returns EAGAIN (with the lock re-held) whenever the
 * lock had to be dropped and the lists may have changed, 0 once a full
 * pass completed with nothing freed.
 */
static int
v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
    daddr_t startlbn, daddr_t endlbn)
{
	struct buf *bp, *nbp;
	bool anyfreed;

	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
	ASSERT_BO_LOCKED(bo);

	do {
		anyfreed = false;
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK) {
				BO_LOCK(bo);
				return (EAGAIN);
			}

			bremfree(bp);
			bp->b_flags |= B_INVAL | B_RELBUF;
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = true;

			BO_LOCK(bo);
			/*
			 * The saved successor is stale if it left the clean
			 * list, changed vnodes, or became dirty while the
			 * lock was dropped.
			 */
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    nbp->b_vp != vp ||
			    (nbp->b_flags & B_DELWRI) != 0))
				return (EAGAIN);
		}

		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) == ENOLCK) {
				BO_LOCK(bo);
				return (EAGAIN);
			}
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_RELBUF;
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = true;

			BO_LOCK(bo);
			/* Same staleness check, mirrored for the dirty list. */
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0))
				return (EAGAIN);
		}
	} while (anyfreed);
	return (0);
}

/*
 * Remove a buffer from the clean or dirty list of its bufobj, keeping
 * the tail queue, the pctrie index, and the BX_VN* flags in sync.
 */
static void
buf_vlist_remove(struct buf *bp)
{
	struct bufv *bv;
	b_xflags_t flags;

	flags = bp->b_xflags;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_WLOCKED(bp->b_bufobj);
	/* The buffer must be on exactly one of the two lists. */
	KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
	    (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
	    ("%s: buffer %p has invalid queue state", __func__, bp));

	if ((flags & BX_VNDIRTY) != 0)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
 */
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct bufv *bv;
	struct buf *n;
	int error;

	ASSERT_BO_WLOCKED(bo);
	KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
	    ("buf_vlist_add: bo %p does not allow bufs", bo));
	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
	    ("dead bo %p", bo));
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
	bp->b_xflags |= xflags;
	if (xflags & BX_VNDIRTY)
		bv = &bo->bo_dirty;
	else
		bv = &bo->bo_clean;

	/*
	 * Keep the list ordered. Optimize empty list insertion. Assume
	 * we tend to grow at the tail so lookup_le should usually be cheaper
	 * than _ge.
	 */
	if (bv->bv_cnt == 0 ||
	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
	else
		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
	if (error)
		panic("buf_vlist_add: Preallocated nodes insufficient.");
	bv->bv_cnt++;
}

/*
 * Look up a buffer using the buffer tries.
 */
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
	if (bp != NULL)
		return (bp);
	return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
}

/*
 * Look up a buf using the buffer tries, without the bufobj lock. This relies
 * on SMR for safe lookup, and bufs being in a no-free zone to provide type
 * stability of the result. Like other lockless lookups, the found buf may
 * already be invalid by the time this function returns.
 */
struct buf *
gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_UNLOCKED(bo);
	bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
	if (bp != NULL)
		return (bp);
	return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_BO_WLOCKED(bo);
	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));

	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
	    ("bgetvp: bp already attached! %p", bp));

	/* The hold keeps the vnode alive while it owns buffers. */
	vhold(vp);
	bp->b_vp = vp;
	bp->b_bufobj = bo;
	/*
	 * Insert onto list for new vnode.
	 */
	buf_vlist_add(bp, bo, BX_VNCLEAN);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct bufobj *bo;
	struct vnode *vp;

	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;		/* XXX */
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	buf_vlist_remove(bp);
	/* Last dirty buffer gone: take the bufobj off the syncer worklist. */
	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
		bo->bo_flag &= ~BO_ONWORKLST;
		mtx_lock(&sync_mtx);
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		mtx_unlock(&sync_mtx);
	}
	bp->b_vp = NULL;
	bp->b_bufobj = NULL;
	BO_UNLOCK(bo);
	/* Drop the hold taken in bgetvp(). */
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
	int slot;

	ASSERT_BO_WLOCKED(bo);

	mtx_lock(&sync_mtx);
	if (bo->bo_flag & BO_ONWORKLST)
		LIST_REMOVE(bo, bo_synclist);
	else {
		bo->bo_flag |= BO_ONWORKLST;
		syncer_worklist_len++;
	}

	/* Clamp the delay so the slot stays within the wheel. */
	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
	mtx_unlock(&sync_mtx);
}

/*
 * Sysctl handler reporting the worklist length, excluding the syncer
 * vnodes themselves.
 */
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
	int error, len;

	mtx_lock(&sync_mtx);
	len = syncer_worklist_len - sync_vnode_count;
	mtx_unlock(&sync_mtx);
	error = SYSCTL_OUT(req, &len, sizeof(len));
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);

/*
 * Try to sync the first vnode on the worklist slot.  Called with
 * sync_mtx held; the mutex is dropped and reacquired around the fsync.
 *
 * Returns 1 when the caller should requeue the bufobj to the next slot
 * (vnode busy, or a writer could not be started and the list head is
 * unchanged), 0 otherwise.
 */
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	*bo = LIST_FIRST(slp);
	if (*bo == NULL)
		return (0);
	vp = bo2vnode(*bo);
	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync. vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		return (*bo == LIST_FIRST(slp));
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	BO_LOCK(*bo);
	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist. The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(*bo, syncdelay);
	}
	BO_UNLOCK(*bo);
	vdrop(vp);
	mtx_lock(&sync_mtx);
	return (0);
}

/* Nonzero until the first shutdown progress message has been printed. */
static int first_printf = 1;

/*
 * System filesystem synchronizer daemon.
 */
static void
sched_sync(void)
{
	struct synclist *next, *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = curthread;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int error;

	last_work_seen = 0;
	syncer_final_iter = 0;
	syncer_state = SYNCER_RUNNING;
	starttime = time_uptime;
	td->td_pflags |= TDP_NORUNNINGBUF;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	mtx_lock(&sync_mtx);
	for (;;) {
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kproc_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_uptime) {
			/* Report shutdown-sync progress on the console. */
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining... ");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_uptime;

		/*
		 * Push files whose dirty time has expired. Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since the
			 * it was emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while (!LIST_EMPTY(slp)) {
			error = sync_vnode(slp, &bo, td);
			if (error == 1) {
				/* Busy vnode: defer it to the next slot. */
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}

			if (first_printf == 0) {
				/*
				 * Drop the sync mutex, because some watchdog
				 * drivers need to sleep while patting
				 */
				mtx_unlock(&sync_mtx);
				wdog_kern_pat(WD_LASTVAL);
				mtx_lock(&sync_mtx);
			}
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING ||
		    time_uptime == starttime) {
			thread_lock(td);
			sched_prio(td, PPAUSE);
			thread_unlock(td);
		}
		if (syncer_state != SYNCER_RUNNING)
			cv_timedwait(&sync_wakeup, &sync_mtx,
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_uptime == starttime)
			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer(void)
{
	int ret = 0;

	mtx_lock(&sync_mtx);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		ret = 1;
	}
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	return (ret);
}

/*
 * Tell the syncer to speed up its work and run though its work
 * list several times, then tell it to shut down.
 */
static void
syncer_shutdown(void *arg, int howto)
{

	/* RB_NOSYNC: the caller asked for no disk syncing at all. */
	if (howto & RB_NOSYNC)
		return;
	mtx_lock(&sync_mtx);
	syncer_state = SYNCER_SHUTTING_DOWN;
	rushjob = 0;
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	kproc_shutdown(arg, howto);
}

void
syncer_suspend(void)
{

	syncer_shutdown(updateproc, 0);
}

void
syncer_resume(void)
{

	mtx_lock(&sync_mtx);
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	mtx_unlock(&sync_mtx);
	cv_broadcast(&sync_wakeup);
	kproc_resume(updateproc);
}

/*
 * Move the buffer between the clean and dirty lists of its vnode.
 */
void
reassignbuf(struct buf *bp)
{
	struct vnode *vp;
	struct bufobj *bo;
	int delay;
#ifdef INVARIANTS
	struct bufv *bv;
#endif

	vp = bp->b_vp;
	bo = bp->b_bufobj;

	KASSERT((bp->b_flags & B_PAGING) == 0,
	    ("%s: cannot reassign paging buffer %p", __func__, bp));

	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);

	BO_LOCK(bo);
	buf_vlist_remove(bp);

	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
			/* Pick a syncer delay based on the vnode type. */
			switch (vp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				delay = metadelay;
				break;
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(bo, delay);
		}
		buf_vlist_add(bp, bo, BX_VNDIRTY);
	} else {
		buf_vlist_add(bp, bo, BX_VNCLEAN);

		/* Last dirty buffer gone: leave the syncer worklist. */
		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
			mtx_lock(&sync_mtx);
			LIST_REMOVE(bo, bo_synclist);
			syncer_worklist_len--;
			mtx_unlock(&sync_mtx);
			bo->bo_flag &= ~BO_ONWORKLST;
		}
	}
#ifdef INVARIANTS
	/* Sanity: both list heads and tails must still point at this bufobj. */
	bv = &bo->bo_clean;
	bp = TAILQ_FIRST(&bv->bv_hd);
	KASSERT(bp == NULL || bp->b_bufobj == bo,
	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
	bp = TAILQ_LAST(&bv->bv_hd, buflists);
	KASSERT(bp == NULL || bp->b_bufobj == bo,
	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
	bv = &bo->bo_dirty;
	bp = TAILQ_FIRST(&bv->bv_hd);
	KASSERT(bp == NULL || bp->b_bufobj == bo,
	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
	bp = TAILQ_LAST(&bv->bv_hd, buflists);
	KASSERT(bp == NULL || bp->b_bufobj == bo,
	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
#endif
	BO_UNLOCK(bo);
}

/*
 * Initialize the hold and use counts of a freshly constructed vnode.
 * Both start at 1: the caller owns the initial reference.
 */
static void
v_init_counters(struct vnode *vp)
{

	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
	    vp, ("%s called for an initialized vnode", __FUNCTION__));
	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);

	refcount_init(&vp->v_holdcnt, 1);
	refcount_init(&vp->v_usecount, 1);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. VIRF_DOOMED is set if the vnode
 * is being destroyed. Only callers who specify LK_RETRY will
 * see doomed vnodes.
 * If inactive processing was delayed in
 * vput try to do it here.
 *
 * usecount is manipulated using atomics without holding any locks.
 *
 * holdcnt can be manipulated using atomics without holding any locks,
 * except when transitioning 1<->0, in which case the interlock is held.
 *
 * Consumers which don't guarantee liveness of the vnode can use SMR to
 * try to get a reference. Note this operation can fail since the vnode
 * may be awaiting getting freed by the time they get to it.
 */
enum vgetstate
vget_prep_smr(struct vnode *vp)
{
	enum vgetstate vs;

	VFS_SMR_ASSERT_ENTERED();

	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
		vs = VGET_USECOUNT;
	} else {
		/* vhold_smr() may fail if the vnode is being freed. */
		if (vhold_smr(vp))
			vs = VGET_HOLDCNT;
		else
			vs = VGET_NONE;
	}
	return (vs);
}

/*
 * Like vget_prep_smr() but for callers who already guarantee the vnode
 * stays alive; cannot fail.
 */
enum vgetstate
vget_prep(struct vnode *vp)
{
	enum vgetstate vs;

	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
		vs = VGET_USECOUNT;
	} else {
		vhold(vp);
		vs = VGET_HOLDCNT;
	}
	return (vs);
}

/*
 * Undo a vget_prep*() when the get is abandoned, releasing whichever
 * reference kind was taken.
 */
void
vget_abort(struct vnode *vp, enum vgetstate vs)
{

	switch (vs) {
	case VGET_USECOUNT:
		vrele(vp);
		break;
	case VGET_HOLDCNT:
		vdrop(vp);
		break;
	default:
		__assert_unreachable();
	}
}

int
vget(struct vnode *vp, int flags)
{
	enum vgetstate vs;

	vs = vget_prep(vp);
	return (vget_finish(vp, flags, vs));
}

/*
 * Lock the vnode and convert the vget_prep*() state into a full use
 * reference.  On lock failure the prepared reference is released.
 */
int
vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
{
	int error;

	if ((flags & LK_INTERLOCK) != 0)
		ASSERT_VI_LOCKED(vp, __func__);
	else
		ASSERT_VI_UNLOCKED(vp, __func__);
	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
	VNPASS(vp->v_holdcnt > 0, vp);
	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);

	error = vn_lock(vp, flags);
	if (__predict_false(error != 0)) {
		vget_abort(vp, vs);
		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
		    vp);
		return (error);
	}

	vget_finish_ref(vp, vs);
	return (0);
}

/*
 * Turn a prepared hold into a use reference without locking the vnode.
 */
void
vget_finish_ref(struct vnode *vp, enum vgetstate vs)
{
	int old;

	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
	VNPASS(vp->v_holdcnt > 0, vp);
	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);

	if (vs == VGET_USECOUNT)
		return;

	/*
	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
	 * the vnode around. Otherwise someone else lended their hold count and
	 * we have to drop ours.
	 */
	old = atomic_fetchadd_int(&vp->v_usecount, 1);
	VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
	if (old != 0) {
#ifdef INVARIANTS
		old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
		VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
#else
		refcount_release(&vp->v_holdcnt);
#endif
	}
}

/*
 * Acquire an additional use reference on a vnode known to be alive.
 */
void
vref(struct vnode *vp)
{
	enum vgetstate vs;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	vs = vget_prep(vp);
	vget_finish_ref(vp, vs);
}

/*
 * Like vref() but requires the usecount to already be > 0: a plain
 * atomic increment suffices.
 */
void
vrefact(struct vnode *vp)
{

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
#ifdef INVARIANTS
	int old = atomic_fetchadd_int(&vp->v_usecount, 1);
	VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
#else
	refcount_acquire(&vp->v_usecount);
#endif
}

/*
 * Place a held vnode on its mount's lazy list, if it is not already there.
 */
void
vlazy(struct vnode *vp)
{
	struct mount *mp;

	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));

	/* Unlocked early check; rechecked under mnt_listmtx below. */
	if ((vp->v_mflag & VMP_LAZYLIST) != 0)
		return;
	/*
	 * We may get here for inactive routines after the vnode got doomed.
	 */
	if (VN_IS_DOOMED(vp))
		return;
	mp = vp->v_mount;
	mtx_lock(&mp->mnt_listmtx);
	if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
		vp->v_mflag |= VMP_LAZYLIST;
		TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
		mp->mnt_lazyvnodelistsize++;
	}
	mtx_unlock(&mp->mnt_listmtx);
}

/*
 * Remove the vnode from its mount's lazy list; called with the vnode
 * interlock held.
 */
static void
vunlazy(struct vnode *vp)
{
	struct mount *mp;

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(!VN_IS_DOOMED(vp), vp);

	mp = vp->v_mount;
	mtx_lock(&mp->mnt_listmtx);
	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
	/*
	 * Don't remove the vnode from the lazy list if another thread
	 * has increased the hold count. It may have re-enqueued the
	 * vnode to the lazy list and is now responsible for its
	 * removal.
	 */
	if (vp->v_holdcnt == 0) {
		vp->v_mflag &= ~VMP_LAZYLIST;
		TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
		mp->mnt_lazyvnodelistsize--;
	}
	mtx_unlock(&mp->mnt_listmtx);
}

/*
 * This routine is only meant to be called from vgonel prior to dooming
 * the vnode.
 */
static void
vunlazy_gone(struct vnode *vp)
{
	struct mount *mp;

	ASSERT_VOP_ELOCKED(vp, __func__);
	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(!VN_IS_DOOMED(vp), vp);

	if (vp->v_mflag & VMP_LAZYLIST) {
		mp = vp->v_mount;
		mtx_lock(&mp->mnt_listmtx);
		VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
		/* Unconditional removal: the vnode is about to be doomed. */
		vp->v_mflag &= ~VMP_LAZYLIST;
		TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
		mp->mnt_lazyvnodelistsize--;
		mtx_unlock(&mp->mnt_listmtx);
	}
}

/*
 * Defer inactive processing (VI_DEFINACT) to the syncer, unless it is
 * unnecessary (doomed, already deferred, or usecount went back up).
 * Called with the vnode interlock held; consumes the caller's hold
 * count in all paths (vdropl or handing it to the deferred state).
 */
static void
vdefer_inactive(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNASSERT(vp->v_holdcnt > 0, vp,
	    ("%s: vnode without hold count", __func__));
	if (VN_IS_DOOMED(vp)) {
		vdropl(vp);
		return;
	}
	if (vp->v_iflag & VI_DEFINACT) {
		/* Already deferred; that state owns its own hold count. */
		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
		vdropl(vp);
		return;
	}
	if (vp->v_usecount > 0) {
		/* The vnode got re-used; inactive is no longer owed. */
		vp->v_iflag &= ~VI_OWEINACT;
		vdropl(vp);
		return;
	}
	vlazy(vp);
	vp->v_iflag |= VI_DEFINACT;
	VI_UNLOCK(vp);
	counter_u64_add(deferred_inact, 1);
}

/*
 * As vdefer_inactive(), for callers not holding the interlock.  Drops
 * the hold count if inactive processing is not owed.
 */
static void
vdefer_inactive_unlocked(struct vnode *vp)
{

	VI_LOCK(vp);
	if ((vp->v_iflag & VI_OWEINACT) == 0) {
		vdropl(vp);
		return;
	}
	vdefer_inactive(vp);
}

/* Which public entry point invoked vput_final(); selects lock handling. */
enum vput_op { VRELE, VPUT, VUNREF };

/*
 * Handle ->v_usecount transitioning to 0.
 *
 * By releasing the last usecount we take ownership of the hold count which
 * provides liveness of the vnode, meaning we have to vdrop.
 *
 * For all vnodes we may need to perform inactive processing. It requires an
 * exclusive lock on the vnode, while it is legal to call here with only a
 * shared lock (or no locks). If locking the vnode in an expected manner fails,
 * inactive processing gets deferred to the syncer.
 *
 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend
 * on the lock being held all the way until VOP_INACTIVE. This in particular
 * happens with UFS which adds half-constructed vnodes to the hash, where they
 * can be found by other code.
 */
static void
vput_final(struct vnode *vp, enum vput_op func)
{
	int error;
	bool want_unlock;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNPASS(vp->v_holdcnt > 0, vp);

	VI_LOCK(vp);

	/*
	 * By the time we got here someone else might have transitioned
	 * the count back to > 0.
	 */
	if (vp->v_usecount > 0)
		goto out;

	/*
	 * If the vnode is doomed vgone already performed inactive processing
	 * (if needed).
	 */
	if (VN_IS_DOOMED(vp))
		goto out;

	if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
		goto out;

	/* Avoid recursing into inactive processing. */
	if (vp->v_iflag & VI_DOINGINACT)
		goto out;

	/*
	 * Locking operations here will drop the interlock and possibly the
	 * vnode lock, opening a window where the vnode can get doomed all the
	 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
	 * perform inactive.
	 */
	vp->v_iflag |= VI_OWEINACT;
	want_unlock = false;
	error = 0;
	/*
	 * Obtain an exclusive vnode lock for VOP_INACTIVE; how depends on
	 * the entry point's locking contract (see vrele/vput/vunref below).
	 */
	switch (func) {
	case VRELE:
		switch (VOP_ISLOCKED(vp)) {
		case LK_EXCLUSIVE:
			break;
		case LK_EXCLOTHER:
		case 0:
			want_unlock = true;
			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
			VI_LOCK(vp);
			break;
		default:
			/*
			 * The lock has at least one sharer, but we have no way
			 * to conclude whether this is us. Play it safe and
			 * defer processing.
			 */
			error = EAGAIN;
			break;
		}
		break;
	case VPUT:
		want_unlock = true;
		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
			    LK_NOWAIT);
			VI_LOCK(vp);
		}
		break;
	case VUNREF:
		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
			VI_LOCK(vp);
		}
		break;
	}
	if (error == 0) {
		if (func == VUNREF) {
			VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
			    ("recursive vunref"));
			vp->v_vflag |= VV_UNREF;
		}
		for (;;) {
			error = vinactive(vp);
			if (want_unlock)
				VOP_UNLOCK(vp);
			/* Retry on ERELOOKUP, but only if we may relock. */
			if (error != ERELOOKUP || !want_unlock)
				break;
			VOP_LOCK(vp, LK_EXCLUSIVE);
		}
		if (func == VUNREF)
			vp->v_vflag &= ~VV_UNREF;
		vdropl(vp);
	} else {
		/* Could not lock the vnode; defer to the syncer. */
		vdefer_inactive(vp);
	}
	return;
out:
	/*
	 * No inactive processing needed (or possible); honor the entry
	 * point's locking contract and release the hold we now own.
	 */
	if (func == VPUT)
		VOP_UNLOCK(vp);
	vdropl(vp);
}

/*
 * Decrement ->v_usecount for a vnode.
 *
 * Releasing the last use count requires additional processing, see vput_final
 * above for details.
 *
 * Comment above each variant denotes lock state on entry and exit.
 */

/*
 * in: any
 * out: same as passed in
 */
void
vrele(struct vnode *vp)
{

	ASSERT_VI_UNLOCKED(vp, __func__);
	if (!refcount_release(&vp->v_usecount))
		return;
	vput_final(vp, VRELE);
}

/*
 * in: locked
 * out: unlocked
 */
void
vput(struct vnode *vp)
{

	ASSERT_VOP_LOCKED(vp, __func__);
	ASSERT_VI_UNLOCKED(vp, __func__);
	if (!refcount_release(&vp->v_usecount)) {
		VOP_UNLOCK(vp);
		return;
	}
	vput_final(vp, VPUT);
}

/*
 * in: locked
 * out: locked
 */
void
vunref(struct vnode *vp)
{

	ASSERT_VOP_LOCKED(vp, __func__);
	ASSERT_VI_UNLOCKED(vp, __func__);
	if (!refcount_release(&vp->v_usecount))
		return;
	vput_final(vp, VUNREF);
}

/*
 * Bump the hold count of the vnode. A 0->1 transition takes the vnode
 * off the free count.
 */
void
vhold(struct vnode *vp)
{
	int old;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
	    ("%s: wrong hold count %d", __func__, old));
	if (old == 0)
		vfs_freevnodes_dec();
}

/*
 * As vhold, but the caller guarantees the count is already non-zero
 * ("nz"), which allows a plain atomic add when INVARIANTS is off.
 */
void
vholdnz(struct vnode *vp)
{

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
#ifdef INVARIANTS
	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
	    ("%s: wrong hold count %d", __func__, old));
#else
	atomic_add_int(&vp->v_holdcnt, 1);
#endif
}

/*
 * Grab a hold count unless the vnode is freed.
 *
 * Only use this routine if vfs smr is the only protection you have against
 * freeing the vnode.
 *
 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
 * is not set. After the flag is set the vnode becomes immutable to anyone but
 * the thread which managed to set the flag.
 *
 * It may be tempting to replace the loop with:
 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 * if (count & VHOLD_NO_SMR) {
 *     backpedal and error out;
 * }
 *
 * However, while this is more performant, it hinders debugging by eliminating
 * the previously mentioned invariant.
 */
bool
vhold_smr(struct vnode *vp)
{
	int count;

	VFS_SMR_ASSERT_ENTERED();

	count = atomic_load_int(&vp->v_holdcnt);
	for (;;) {
		if (count & VHOLD_NO_SMR) {
			/* The vnode is being freed; we cannot hold it. */
			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
			    ("non-zero hold count with flags %d\n", count));
			return (false);
		}
		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
			if (count == 0)
				vfs_freevnodes_dec();
			return (true);
		}
		/* fcmpset reloaded 'count'; loop to re-validate it. */
	}
}

/*
 * Hold a free vnode for recycling.
 *
 * Note: vnode_init references this comment.
 *
 * Attempts to recycle only need the global vnode list lock and have no use for
 * SMR.
 *
 * However, vnodes get inserted into the global list before they get fully
 * initialized and stay there until UMA decides to free the memory. This in
 * particular means the target can be found before it becomes usable and after
 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
 * VHOLD_NO_SMR.
 *
 * Note: the vnode may gain more references after we transition the count 0->1.
 */
static bool
vhold_recycle_free(struct vnode *vp)
{
	int count;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	count = atomic_load_int(&vp->v_holdcnt);
	for (;;) {
		if (count & VHOLD_NO_SMR) {
			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
			    ("non-zero hold count with flags %d\n", count));
			return (false);
		}
		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
		/* Only a free (count == 0) vnode may be picked up here. */
		if (count > 0) {
			return (false);
		}
		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
			vfs_freevnodes_dec();
			return (true);
		}
	}
}

/*
 * Flush a full per-CPU deferred-requeue batch: move every batched vnode
 * to the tail of the global vnode list and fold the batched freevnodes
 * delta into the global counter.
 */
static void __noinline
vdbatch_process(struct vdbatch *vd)
{
	struct vnode *vp;
	int i;

	mtx_assert(&vd->lock, MA_OWNED);
	MPASS(curthread->td_pinned > 0);
	MPASS(vd->index == VDBATCH_SIZE);

	mtx_lock(&vnode_list_mtx);
	critical_enter();
	freevnodes += vd->freevnodes;
	for (i = 0; i < VDBATCH_SIZE; i++) {
		vp = vd->tab[i];
		/* Requeue the vnode at the tail of the global list. */
		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
		MPASS(vp->v_dbatchcpu != NOCPU);
		vp->v_dbatchcpu = NOCPU;
	}
	mtx_unlock(&vnode_list_mtx);
	vd->freevnodes = 0;
	bzero(vd->tab, sizeof(vd->tab));
	vd->index = 0;
	critical_exit();
}

/*
 * Defer requeueing the vnode on the global list by staging it in a
 * per-CPU batch; the batch is processed once full. Consumes the vnode
 * interlock.
 */
static void
vdbatch_enqueue(struct vnode *vp)
{
	struct vdbatch *vd;

	ASSERT_VI_LOCKED(vp, __func__);
	VNASSERT(!VN_IS_DOOMED(vp), vp,
	    ("%s: deferring requeue of a doomed vnode", __func__));

	/* Already batched on some CPU; nothing to do. */
	if (vp->v_dbatchcpu != NOCPU) {
		VI_UNLOCK(vp);
		return;
	}

	sched_pin();
	vd = DPCPU_PTR(vd);
	mtx_lock(&vd->lock);
	MPASS(vd->index < VDBATCH_SIZE);
	MPASS(vd->tab[vd->index] == NULL);
	/*
	 * A hack: we depend on being pinned so that we know what to put in
	 * ->v_dbatchcpu.
	 */
	vp->v_dbatchcpu = curcpu;
	vd->tab[vd->index] = vp;
	vd->index++;
	VI_UNLOCK(vp);
	if (vd->index == VDBATCH_SIZE)
		vdbatch_process(vd);
	mtx_unlock(&vd->lock);
	sched_unpin();
}

/*
 * This routine must only be called for vnodes which are about to be
 * deallocated. Supporting dequeue for arbitrary vnodes would require
 * validating that the locked batch matches.
 */
static void
vdbatch_dequeue(struct vnode *vp)
{
	struct vdbatch *vd;
	int i;
	short cpu;

	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
	    ("%s: called for a used vnode\n", __func__));

	cpu = vp->v_dbatchcpu;
	if (cpu == NOCPU)
		return;

	vd = DPCPU_ID_PTR(cpu, vd);
	mtx_lock(&vd->lock);
	for (i = 0; i < vd->index; i++) {
		if (vd->tab[i] != vp)
			continue;
		vp->v_dbatchcpu = NOCPU;
		/* Swap the last entry into the vacated slot. */
		vd->index--;
		vd->tab[i] = vd->tab[vd->index];
		vd->tab[vd->index] = NULL;
		break;
	}
	mtx_unlock(&vd->lock);
	/*
	 * Either we dequeued the vnode above or the target CPU beat us to it.
	 */
	MPASS(vp->v_dbatchcpu == NOCPU);
}

/*
 * Drop the hold count of the vnode. If this is the last reference to
 * the vnode we place it on the free list unless it has been vgone'd
 * (marked VIRF_DOOMED) in which case we will free it.
 *
 * Because the vnode vm object keeps a hold reference on the vnode if
 * there is at least one resident non-cached page, the vnode cannot
 * leave the active list without the page cleanup done.
 */
static void __noinline
vdropl_final(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(VN_IS_DOOMED(vp), vp);
	/*
	 * Set the VHOLD_NO_SMR flag.
	 *
	 * We may be racing against vhold_smr. If they win we can just pretend
	 * we never got this far, they will vdrop later.
	 */
	if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
		vfs_freevnodes_inc();
		VI_UNLOCK(vp);
		/*
		 * We lost the aforementioned race. Any subsequent access is
		 * invalid as they might have managed to vdropl on their own.
		 */
		return;
	}
	/*
	 * Don't bump freevnodes as this one is going away.
	 */
	freevnode(vp);
}

/*
 * Drop a hold count; takes the interlock only if this may be the last
 * reference.
 */
void
vdrop(struct vnode *vp)
{

	ASSERT_VI_UNLOCKED(vp, __func__);
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	if (refcount_release_if_not_last(&vp->v_holdcnt))
		return;
	VI_LOCK(vp);
	vdropl(vp);
}

/*
 * Common code for vdropl and vdropl_recycle: 'enqueue' selects whether
 * a vnode whose hold count dropped to 0 gets batched for requeue on the
 * global list. Consumes the vnode interlock.
 */
static void __always_inline
vdropl_impl(struct vnode *vp, bool enqueue)
{

	ASSERT_VI_LOCKED(vp, __func__);
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	if (!refcount_release(&vp->v_holdcnt)) {
		VI_UNLOCK(vp);
		return;
	}
	VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
	VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
	if (VN_IS_DOOMED(vp)) {
		/* The last hold on a doomed vnode frees it. */
		vdropl_final(vp);
		return;
	}

	vfs_freevnodes_inc();
	if (vp->v_mflag & VMP_LAZYLIST) {
		vunlazy(vp);
	}

	if (!enqueue) {
		VI_UNLOCK(vp);
		return;
	}

	/*
	 * Also unlocks the interlock. We can't assert on it as we
	 * released our hold and by now the vnode might have been
	 * freed.
	 */
	vdbatch_enqueue(vp);
}

void
vdropl(struct vnode *vp)
{

	vdropl_impl(vp, true);
}

/*
 * vdrop a vnode when recycling
 *
 * This is a special case routine only to be used when recycling, differs from
 * regular vdrop by not requeueing the vnode on LRU.
 *
 * Consider a case where vtryrecycle continuously fails with all vnodes (due to
 * e.g., frozen writes on the filesystem), filling the batch and causing it to
 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a
 * loop which can last for as long as writes are frozen.
 */
static void
vdropl_recycle(struct vnode *vp)
{

	vdropl_impl(vp, false);
}

/*
 * As vdropl_recycle, but takes the interlock.
 */
static void
vdrop_recycle(struct vnode *vp)
{

	VI_LOCK(vp);
	vdropl_recycle(vp);
}

/*
 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
 * flags. DOINGINACT prevents us from recursing in calls to vinactive.
 */
static int
vinactivef(struct vnode *vp)
{
	struct vm_object *obj;
	int error;

	ASSERT_VOP_ELOCKED(vp, "vinactive");
	ASSERT_VI_LOCKED(vp, "vinactive");
	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
	    ("vinactive: recursed on VI_DOINGINACT"));
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	vp->v_iflag |= VI_DOINGINACT;
	vp->v_iflag &= ~VI_OWEINACT;
	VI_UNLOCK(vp);
	/*
	 * Before moving off the active list, we must be sure that any
	 * modified pages are converted into the vnode's dirty
	 * buffers, since these will no longer be checked once the
	 * vnode is on the inactive list.
	 *
	 * The write-out of the dirty pages is asynchronous. At the
	 * point that VOP_INACTIVE() is called, there could still be
	 * pending I/O and dirty pages in the object.
	 */
	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
	    vm_object_mightbedirty(obj)) {
		VM_OBJECT_WLOCK(obj);
		vm_object_page_clean(obj, 0, 0, 0);
		VM_OBJECT_WUNLOCK(obj);
	}
	error = VOP_INACTIVE(vp);
	VI_LOCK(vp);
	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
	    ("vinactive: lost VI_DOINGINACT"));
	vp->v_iflag &= ~VI_DOINGINACT;
	return (error);
}

/*
 * Perform inactive processing if it is both owed and possible right
 * now; otherwise return 0 without calling VOP_INACTIVE.
 */
int
vinactive(struct vnode *vp)
{

	ASSERT_VOP_ELOCKED(vp, "vinactive");
	ASSERT_VI_LOCKED(vp, "vinactive");
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);

	if ((vp->v_iflag & VI_OWEINACT) == 0)
		return (0);
	if (vp->v_iflag & VI_DOINGINACT)
		return (0);
	if (vp->v_usecount > 0) {
		/* The vnode is in use again; inactive is no longer owed. */
		vp->v_iflag &= ~VI_OWEINACT;
		return (0);
	}
	return (vinactivef(vp));
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem. The root vnode is considered busy if its
 * v_usecount exceeds this value. On a successful return, vflush(, td)
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
#endif

int
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
{
	struct vnode *vp, *mvp, *rootvp = NULL;
	struct vattr vattr;
	int busy = 0, error;

	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
	    rootrefs, flags);
	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode. We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
			    __func__, error);
			return (error);
		}
		vput(rootvp);
	}
loop:
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		vholdl(vp);
		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
		if (error) {
			/* Failed to lock the vnode; restart the scan. */
			vdrop(vp);
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			goto loop;
		}
		/*
		 * Skip over a vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			VOP_UNLOCK(vp);
			vdrop(vp);
			continue;
		}
		/*
		 * If WRITECLOSE is set, flush out unlinked but still open
		 * files (even if open only for reading) and regular file
		 * vnodes open for writing.
		 */
		if (flags & WRITECLOSE) {
			if (vp->v_object != NULL) {
				VM_OBJECT_WLOCK(vp->v_object);
				vm_object_page_clean(vp->v_object, 0, 0, 0);
				VM_OBJECT_WUNLOCK(vp->v_object);
			}
			do {
				error = VOP_FSYNC(vp, MNT_WAIT, td);
			} while (error == ERELOOKUP);
			if (error != 0) {
				VOP_UNLOCK(vp);
				vdrop(vp);
				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
				return (error);
			}
			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
			VI_LOCK(vp);

			/*
			 * Keep the vnode only if it is unlinked (VNON or
			 * nlink == 0 fall through) or open for writing.
			 */
			if ((vp->v_type == VNON ||
			    (error == 0 && vattr.va_nlink > 0)) &&
			    (vp->v_writecount <= 0 || vp->v_type != VREG)) {
				VOP_UNLOCK(vp);
				vdropl(vp);
				continue;
			}
		} else
			VI_LOCK(vp);
		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 *
		 * If FORCECLOSE is set, forcibly close the vnode.
		 */
		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
			vgonel(vp);
		} else {
			busy++;
#ifdef DIAGNOSTIC
			if (busyprt)
				vn_printf(vp, "vflush: busy vnode ");
#endif
		}
		VOP_UNLOCK(vp);
		vdropl(vp);
	}
	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		VI_LOCK(rootvp);
		KASSERT(busy > 0, ("vflush: not busy"));
		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
		    ("vflush: usecount %d < rootrefs %d",
		    rootvp->v_usecount, rootrefs));
		if (busy == 1 && rootvp->v_usecount == rootrefs) {
			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
			vgone(rootvp);
			VOP_UNLOCK(rootvp);
			busy = 0;
		} else
			VI_UNLOCK(rootvp);
	}
	if (busy) {
		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
		    busy);
		return (EBUSY);
	}
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 */
int
vrecycle(struct vnode *vp)
{
	int recycled;

	VI_LOCK(vp);
	recycled = vrecyclel(vp);
	VI_UNLOCK(vp);
	return (recycled);
}

/*
 * vrecycle, with the vp interlock held.
 */
int
vrecyclel(struct vnode *vp)
{
	int recycled;

	ASSERT_VOP_ELOCKED(vp, __func__);
	ASSERT_VI_LOCKED(vp, __func__);
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	recycled = 0;
	if (vp->v_usecount == 0) {
		recycled = 1;
		vgonel(vp);
	}
	return (recycled);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(struct vnode *vp)
{
	VI_LOCK(vp);
	vgonel(vp);
	VI_UNLOCK(vp);
}

/*
 * Notify upper mounts about reclaimed or unlinked vnode.
 */
void
vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event)
{
	struct mount *mp;
	struct mount_upper_node *ump;

	mp = atomic_load_ptr(&vp->v_mount);
	if (mp == NULL)
		return;
	if (TAILQ_EMPTY(&mp->mnt_notify))
		return;

	MNT_ILOCK(mp);
	/*
	 * mnt_upper_pending keeps the notify list stable while the
	 * interlock is dropped around the VFS_*_LOWERVP calls; see the
	 * MNTK_UPPER_WAITER wakeup below.
	 */
	mp->mnt_upper_pending++;
	KASSERT(mp->mnt_upper_pending > 0,
	    ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending));
	TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) {
		MNT_IUNLOCK(mp);
		switch (event) {
		case VFS_NOTIFY_UPPER_RECLAIM:
			VFS_RECLAIM_LOWERVP(ump->mp, vp);
			break;
		case VFS_NOTIFY_UPPER_UNLINK:
			VFS_UNLINK_LOWERVP(ump->mp, vp);
			break;
		}
		MNT_ILOCK(mp);
	}
	mp->mnt_upper_pending--;
	if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
	    mp->mnt_upper_pending == 0) {
		mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
		wakeup(&mp->mnt_uppers);
	}
	MNT_IUNLOCK(mp);
}

/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(struct vnode *vp)
{
	struct thread *td;
	struct mount *mp;
	vm_object_t object;
	bool active, doinginact, oweinact;

	ASSERT_VOP_ELOCKED(vp, "vgonel");
	ASSERT_VI_LOCKED(vp, "vgonel");
	VNASSERT(vp->v_holdcnt, vp,
	    ("vgonel: vp %p has no reference.", vp));
	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	td = curthread;

	/*
	 * Don't vgonel if we're already doomed.
	 */
	if (VN_IS_DOOMED(vp)) {
		VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \
		    vn_get_state(vp) == VSTATE_DEAD, vp);
		return;
	}
	/*
	 * Paired with freevnode.
	 */
	vn_seqc_write_begin_locked(vp);
	vunlazy_gone(vp);
	vn_irflag_set_locked(vp, VIRF_DOOMED);
	vn_set_state(vp, VSTATE_DESTROYING);

	/*
	 * Check to see if the vnode is in use. If so, we have to
	 * call VOP_CLOSE() and VOP_INACTIVE().
	 *
	 * It could be that VOP_INACTIVE() requested reclamation, in
	 * which case we should avoid recursion, so check
	 * VI_DOINGINACT. This is not precise but good enough.
	 */
	active = vp->v_usecount > 0;
	oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
	doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;

	/*
	 * If we need to do inactive VI_OWEINACT will be set.
	 */
	if (vp->v_iflag & VI_DEFINACT) {
		/* Take over the hold owned by the deferred processing. */
		VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
		vp->v_iflag &= ~VI_DEFINACT;
		vdropl(vp);
	} else {
		VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
		VI_UNLOCK(vp);
	}
	cache_purge_vgone(vp);
	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.
	 */
	if (active)
		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
	if (!doinginact) {
		do {
			if (oweinact || active) {
				VI_LOCK(vp);
				vinactivef(vp);
				oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
				VI_UNLOCK(vp);
			}
		} while (oweinact);
	}
	if (vp->v_type == VSOCK)
		vfs_unp_reclaim(vp);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	mp = NULL;
	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
		while (vinvalbuf(vp, 0, 0, 0) != 0)
			;
	}

	BO_LOCK(&vp->v_bufobj);
	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
	    vp->v_bufobj.bo_clean.bv_cnt == 0,
	    ("vp %p bufobj not invalidated", vp));

	/*
	 * For VMIO bufobj, BO_DEAD is set later, or in
	 * vm_object_terminate() after the object's page queue is
	 * flushed.
	 */
	object = vp->v_bufobj.bo_object;
	if (object == NULL)
		vp->v_bufobj.bo_flag |= BO_DEAD;
	BO_UNLOCK(&vp->v_bufobj);

	/*
	 * Handle the VM part. Tmpfs handles v_object on its own (the
	 * OBJT_VNODE check). Nullfs or other bypassing filesystems
	 * should not touch the object borrowed from the lower vnode
	 * (the handle check).
	 */
	if (object != NULL && object->type == OBJT_VNODE &&
	    object->handle == vp)
		vnode_destroy_vobject(vp);

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp))
		panic("vgone: cannot reclaim");
	if (mp != NULL)
		vn_finished_secondary_write(mp);
	VNASSERT(vp->v_object == NULL, vp,
	    ("vop_reclaim left v_object vp=%p", vp));
	/*
	 * Clear the advisory locks and wake up waiting threads.
	 */
	if (vp->v_lockf != NULL) {
		(void)VOP_ADVLOCKPURGE(vp);
		vp->v_lockf = NULL;
	}
	/*
	 * Delete from old mount point vnode list.
	 */
	if (vp->v_mount == NULL) {
		VI_LOCK(vp);
	} else {
		delmntque(vp);
		ASSERT_VI_LOCKED(vp, "vgonel 2");
	}
	/*
	 * Done with purge, reset to the standard lock and invalidate
	 * the vnode.
	 */
	vp->v_vnlock = &vp->v_lock;
	vp->v_op = &dead_vnodeops;
	vp->v_type = VBAD;
	vn_set_state(vp, VSTATE_DEAD);
}

/*
 * Print out a description of a vnode.
 */
static const char *const vtypename[] = {
	[VNON] = "VNON",
	[VREG] = "VREG",
	[VDIR] = "VDIR",
	[VBLK] = "VBLK",
	[VCHR] = "VCHR",
	[VLNK] = "VLNK",
	[VSOCK] = "VSOCK",
	[VFIFO] = "VFIFO",
	[VBAD] = "VBAD",
	[VMARKER] = "VMARKER",
};
_Static_assert(nitems(vtypename) == VLASTTYPE + 1,
    "vnode type name not added to vtypename");

static const char *const vstatename[] = {
	[VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED",
	[VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED",
	[VSTATE_DESTROYING] = "VSTATE_DESTROYING",
	[VSTATE_DEAD] = "VSTATE_DEAD",
};
_Static_assert(nitems(vstatename) == VLASTSTATE + 1,
    "vnode state name not added to vstatename");

_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
    "new hold count flag not added to vn_printf");

void
vn_printf(struct vnode *vp, const char *fmt, ...)
{
	va_list ap;
	char buf[256], buf2[16];
	u_long flags;
	u_int holdcnt;
	short irflag;

	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
	printf("%p: ", (void *)vp);
	printf("type %s state %s\n", vtypename[vp->v_type], vstatename[vp->v_state]);
	holdcnt = atomic_load_int(&vp->v_holdcnt);
	printf(" usecount %d, writecount %d, refcount %d seqc users %d",
	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
	    vp->v_seqc_users);
	switch (vp->v_type) {
	case VDIR:
		printf(" mountedhere %p\n", vp->v_mountedhere);
		break;
	case VCHR:
		printf(" rdev %p\n", vp->v_rdev);
		break;
	case VSOCK:
		printf(" socket %p\n", vp->v_unpcb);
		break;
	case VFIFO:
		printf(" fifoinfo %p\n", vp->v_fifoinfo);
		break;
	default:
		printf("\n");
		break;
	}
	/*
	 * buf accumulates "|FLAG" strings; printing from buf + 1 drops
	 * the leading '|' (buf[1] = '\0' covers the no-flags case).
	 */
	buf[0] = '\0';
	buf[1] = '\0';
	if (holdcnt & VHOLD_NO_SMR)
		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
	printf(" hold count flags (%s)\n", buf + 1);

	buf[0] = '\0';
	buf[1] = '\0';
	irflag = vn_irflag_read(vp);
	if (irflag & VIRF_DOOMED)
		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
	if (irflag & VIRF_PGREAD)
		strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
	if (irflag & VIRF_MOUNTPOINT)
		strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf));
	if (irflag & VIRF_TEXT_REF)
		strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf));
	/* Any remaining (unnamed) bits are printed in hex. */
	flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	if (vp->v_vflag & VV_ROOT)
		strlcat(buf, "|VV_ROOT", sizeof(buf));
	if (vp->v_vflag & VV_ISTTY)
		strlcat(buf, "|VV_ISTTY", sizeof(buf));
	if (vp->v_vflag & VV_NOSYNC)
		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
	if (vp->v_vflag & VV_ETERNALDEV)
		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
	if (vp->v_vflag & VV_CACHEDLABEL)
		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
	if (vp->v_vflag & VV_VMSIZEVNLOCK)
		strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
	if (vp->v_vflag & VV_COPYONWRITE)
		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
	if (vp->v_vflag & VV_SYSTEM)
		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
	if (vp->v_vflag & VV_PROCDEP)
		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
	if (vp->v_vflag & VV_DELETED)
		strlcat(buf, "|VV_DELETED", sizeof(buf));
	if (vp->v_vflag & VV_MD)
		strlcat(buf, "|VV_MD", sizeof(buf));
	if (vp->v_vflag & VV_FORCEINSMQ)
		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
	if (vp->v_vflag & VV_READLINK)
		strlcat(buf, "|VV_READLINK", sizeof(buf));
	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
	    VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
	    VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	if (vp->v_iflag & VI_MOUNT)
		strlcat(buf, "|VI_MOUNT", sizeof(buf));
	if (vp->v_iflag & VI_DOINGINACT)
		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
	if (vp->v_iflag & VI_OWEINACT)
		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
	if (vp->v_iflag & VI_DEFINACT)
		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
	if (vp->v_iflag & VI_FOPENING)
		strlcat(buf, "|VI_FOPENING", sizeof(buf));
	flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT |
	    VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	if (vp->v_mflag & VMP_LAZYLIST)
		strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
	flags = vp->v_mflag & ~(VMP_LAZYLIST);
	if (flags != 0) {
		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
		strlcat(buf, buf2, sizeof(buf));
	}
	printf(" flags (%s)", buf + 1);
	if (mtx_owned(VI_MTX(vp)))
		printf(" VI_LOCKed");
	printf("\n");
	if (vp->v_object != NULL)
		printf(" v_object %p ref %d pages %d "
		    "cleanbuf %d dirtybuf %d\n",
		    vp->v_object, vp->v_object->ref_count,
		    vp->v_object->resident_page_count,
		    vp->v_bufobj.bo_clean.bv_cnt,
		    vp->v_bufobj.bo_dirty.bv_cnt);
	printf(" ");
	lockmgr_printinfo(vp->v_vnlock);
	if (vp->v_data != NULL)
		VOP_PRINT(vp);
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE)
{
	struct mount *mp;
	struct vnode *vp;

	/*
	 * Note: because this is DDB, we can't obey the locking semantics
	 * for these structures, which means we could catch an inconsistent
	 * state and dereference a nasty pointer. Not much to be done
	 * about that.
	 */
	db_printf("Locked vnodes\n");
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
				vn_printf(vp, "vnode ");
		}
	}
}

/*
 * Show details about the given vnode.
 */
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
	struct vnode *vp;

	if (!have_addr)
		return;
	vp = (struct vnode *)addr;
	vn_printf(vp, "vnode ");
}

/*
 * Show details about the given mount point.
 */
DB_SHOW_COMMAND(mount, db_show_mount)
{
	struct mount *mp;
	struct vfsopt *opt;
	struct statfs *sp;
	struct vnode *vp;
	char buf[512];
	uint64_t mflags;
	u_int flags;

	if (!have_addr) {
		/* No address given, print short info about all mount points. */
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			db_printf("%p %s on %s (%s)\n", mp,
			    mp->mnt_stat.f_mntfromname,
			    mp->mnt_stat.f_mntonname,
			    mp->mnt_stat.f_fstypename);
			if (db_pager_quit)
				break;
		}
		db_printf("\nMore info: show mount <addr>\n");
		return;
	}

	mp = (struct mount *)addr;
	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);

	/*
	 * Decode mnt_flag into a comma-separated list of names; any
	 * unnamed leftover bits are printed in hex afterwards.
	 * The "+ 4" skips the "MNT_" prefix of the stringified flag.
	 */
	buf[0] = '\0';
	mflags = mp->mnt_flag;
#define	MNT_FLAG(flag)	do {						\
	if (mflags & (flag)) {						\
		if (buf[0] != '\0')					\
			strlcat(buf, ", ", sizeof(buf));		\
		strlcat(buf, (#flag) + 4, sizeof(buf));			\
		mflags &= ~(flag);					\
	}								\
} while (0)
	MNT_FLAG(MNT_RDONLY);
	MNT_FLAG(MNT_SYNCHRONOUS);
	MNT_FLAG(MNT_NOEXEC);
	MNT_FLAG(MNT_NOSUID);
	MNT_FLAG(MNT_NFS4ACLS);
	MNT_FLAG(MNT_UNION);
	MNT_FLAG(MNT_ASYNC);
	MNT_FLAG(MNT_SUIDDIR);
	MNT_FLAG(MNT_SOFTDEP);
	MNT_FLAG(MNT_NOSYMFOLLOW);
	MNT_FLAG(MNT_GJOURNAL);
	MNT_FLAG(MNT_MULTILABEL);
	MNT_FLAG(MNT_ACLS);
	MNT_FLAG(MNT_NOATIME);
	MNT_FLAG(MNT_NOCLUSTERR);
	MNT_FLAG(MNT_NOCLUSTERW);
	MNT_FLAG(MNT_SUJ);
	MNT_FLAG(MNT_EXRDONLY);
	MNT_FLAG(MNT_EXPORTED);
	MNT_FLAG(MNT_DEFEXPORTED);
	MNT_FLAG(MNT_EXPORTANON);
	MNT_FLAG(MNT_EXKERB);
	MNT_FLAG(MNT_EXPUBLIC);
	MNT_FLAG(MNT_LOCAL);
	MNT_FLAG(MNT_QUOTA);
	MNT_FLAG(MNT_ROOTFS);
	MNT_FLAG(MNT_USER);
	MNT_FLAG(MNT_IGNORE);
	MNT_FLAG(MNT_UPDATE);
	MNT_FLAG(MNT_DELEXPORT);
	MNT_FLAG(MNT_RELOAD);
	MNT_FLAG(MNT_FORCE);
	MNT_FLAG(MNT_SNAPSHOT);
	MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
	if (mflags != 0) {
		if (buf[0] != '\0')
			strlcat(buf, ", ", sizeof(buf));
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
		    "0x%016jx", mflags);
	}
	db_printf(" mnt_flag = %s\n", buf);

	/* Same decoding for mnt_kern_flag; "+ 5" skips "MNTK_". */
	buf[0] = '\0';
	flags = mp->mnt_kern_flag;
#define	MNT_KERN_FLAG(flag)	do {					\
	if (flags & (flag)) {						\
		if (buf[0] != '\0')					\
			strlcat(buf, ", ", sizeof(buf));		\
		strlcat(buf, (#flag) + 5, sizeof(buf));			\
		flags &= ~(flag);					\
	}								\
} while (0)
	MNT_KERN_FLAG(MNTK_UNMOUNTF);
	MNT_KERN_FLAG(MNTK_ASYNC);
	MNT_KERN_FLAG(MNTK_SOFTDEP);
	MNT_KERN_FLAG(MNTK_NOMSYNC);
	MNT_KERN_FLAG(MNTK_DRAINING);
	MNT_KERN_FLAG(MNTK_REFEXPIRE);
	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
	MNT_KERN_FLAG(MNTK_NO_IOPF);
	MNT_KERN_FLAG(MNTK_RECURSE);
	MNT_KERN_FLAG(MNTK_UPPER_WAITER);
	MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE);
	MNT_KERN_FLAG(MNTK_USES_BCACHE);
	MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG);
	MNT_KERN_FLAG(MNTK_FPLOOKUP);
	MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER);
	MNT_KERN_FLAG(MNTK_NOASYNC);
	MNT_KERN_FLAG(MNTK_UNMOUNT);
	MNT_KERN_FLAG(MNTK_MWAIT);
	MNT_KERN_FLAG(MNTK_SUSPEND);
	MNT_KERN_FLAG(MNTK_SUSPEND2);
	MNT_KERN_FLAG(MNTK_SUSPENDED);
	MNT_KERN_FLAG(MNTK_NULL_NOCACHE);
	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
#undef MNT_KERN_FLAG
	if (flags != 0) {
		if (buf[0] != '\0')
			strlcat(buf, ", ", sizeof(buf));
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
		    "0x%08x", flags);
	}
	db_printf(" mnt_kern_flag = %s\n", buf);

	db_printf(" mnt_opt = ");
	opt = TAILQ_FIRST(mp->mnt_opt);
	if (opt != NULL) {
		db_printf("%s", opt->name);
		opt = TAILQ_NEXT(opt, link);
		while (opt != NULL) {
			db_printf(", %s", opt->name);
			opt = TAILQ_NEXT(opt, link);
		}
	}
	db_printf("\n");

	sp = &mp->mnt_stat;
	db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);

	db_printf(" mnt_cred = { uid=%u ruid=%u",
	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
	if (jailed(mp->mnt_cred))
		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
	db_printf(" }\n");
	db_printf(" mnt_ref = %d (with %d in the struct)\n",
	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
	db_printf(" mnt_gen = %d\n", mp->mnt_gen);
	db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
	db_printf(" mnt_lazyvnodelistsize = %d\n",
	    mp->mnt_lazyvnodelistsize);
	db_printf(" mnt_writeopcount = %d (with %d in the struct)\n",
	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT),
mp->mnt_writeopcount); 4523 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4524 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4525 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4526 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4527 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4528 db_printf(" mnt_secondary_accwrites = %d\n", 4529 mp->mnt_secondary_accwrites); 4530 db_printf(" mnt_gjprovider = %s\n", 4531 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4532 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4533 4534 db_printf("\n\nList of active vnodes\n"); 4535 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4536 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4537 vn_printf(vp, "vnode "); 4538 if (db_pager_quit) 4539 break; 4540 } 4541 } 4542 db_printf("\n\nList of inactive vnodes\n"); 4543 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4544 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4545 vn_printf(vp, "vnode "); 4546 if (db_pager_quit) 4547 break; 4548 } 4549 } 4550 } 4551 #endif /* DDB */ 4552 4553 /* 4554 * Fill in a struct xvfsconf based on a struct vfsconf. 4555 */ 4556 static int 4557 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4558 { 4559 struct xvfsconf xvfsp; 4560 4561 bzero(&xvfsp, sizeof(xvfsp)); 4562 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4563 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4564 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4565 xvfsp.vfc_flags = vfsp->vfc_flags; 4566 /* 4567 * These are unused in userland, we keep them 4568 * to not break binary compatibility. 
	 */
	xvfsp.vfc_vfsops = NULL;
	xvfsp.vfc_next = NULL;
	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}

#ifdef COMPAT_FREEBSD32
/* 32-bit layout of struct xvfsconf; pointers shrink to uint32_t. */
struct xvfsconf32 {
	uint32_t	vfc_vfsops;
	char		vfc_name[MFSNAMELEN];
	int32_t		vfc_typenum;
	int32_t		vfc_refcount;
	int32_t		vfc_flags;
	uint32_t	vfc_next;
};

/*
 * 32-bit compat counterpart of vfsconf2x(): copy a vfsconf out to a
 * 32-bit userland process.  The pointer fields are left zeroed.
 */
static int
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
{
	struct xvfsconf32 xvfsp;

	bzero(&xvfsp, sizeof(xvfsp));
	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
	xvfsp.vfc_typenum = vfsp->vfc_typenum;
	xvfsp.vfc_refcount = vfsp->vfc_refcount;
	xvfsp.vfc_flags = vfsp->vfc_flags;
	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int
sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
{
	struct vfsconf *vfsp;
	int error;

	error = 0;
	vfsconf_slock();
	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
#ifdef COMPAT_FREEBSD32
		if (req->flags & SCTL_MASK32)
			error = vfsconf2x32(req, vfsp);
		else
#endif
			error = vfsconf2x(req, vfsp);
		if (error)
			break;
	}
	vfsconf_sunlock();
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
    "S,xvfsconf", "List of all configured filesystems");

#ifndef BURN_BRIDGES
static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);

/*
 * Legacy VFS_GENERIC sysctl handler kept for old binaries; new code
 * should use vfs.conflist instead.
 */
static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

	log(LOG_WARNING, "userland calling deprecated sysctl, "
	    "please rebuild world\n");

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		vfsconf_slock();
		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
			if (vfsp->vfc_typenum == name[2])
				break;
		}
		vfsconf_sunlock();
		if (vfsp == NULL)
			return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
		if (req->flags & SCTL_MASK32)
			return (vfsconf2x32(req, vfsp));
		else
#endif
			return (vfsconf2x(req, vfsp));
	}
	return (EOPNOTSUPP);
}

static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
    CTLFLAG_MPSAFE, vfs_sysctl,
    "Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

/*
 * Dump the list of configured filesystems in the pre-Lite2 ovfsconf
 * format for very old userland consumers.
 */
static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	vfsconf_slock();
	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
		bzero(&ovfs, sizeof(ovfs));
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error != 0) {
			vfsconf_sunlock();
			return (error);
		}
	}
	vfsconf_sunlock();
	return (0);
}

#endif /* 1 || COMPAT_PRELITE2 */
#endif /* !BURN_BRIDGES */

/*
 * Forcibly unmount mp, logging the failure (EBUSY specially) instead of
 * returning it; used by vfs_unmountall() during shutdown.
 */
static void
unmount_or_warn(struct mount *mp)
{
	int error;

	error = dounmount(mp, MNT_FORCE, curthread);
	if (error != 0) {
		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall(void)
{
	struct mount *mp, *tmp;

	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);

	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
		/*
		 * The reference is consumed by dounmount(); for rootdevmp it
		 * is consumed by the deferred unmount_or_warn() call below.
		 */
		vfs_ref(mp);

		/*
		 * Forcibly unmounting "/dev" before "/" would prevent clean
		 * unmount of the latter.
		 */
		if (mp == rootdevmp)
			continue;

		unmount_or_warn(mp);
	}

	if (rootdevmp != NULL)
		unmount_or_warn(rootdevmp);
}

/*
 * Run the deferred inactive processing for vp, which is interlocked on
 * entry and has a hold reference that is dropped here.  If the vnode lock
 * cannot be obtained with lkflags, re-defer the inactivation.
 */
static void
vfs_deferred_inactive(struct vnode *vp, int lkflags)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
	if ((vp->v_iflag & VI_OWEINACT) == 0) {
		vdropl(vp);
		return;
	}
	if (vn_lock(vp, lkflags) == 0) {
		VI_LOCK(vp);
		vinactive(vp);
		VOP_UNLOCK(vp);
		vdropl(vp);
		return;
	}
	vdefer_inactive_unlocked(vp);
}

/* Lazy-list filter: only visit vnodes with deferred inactivation pending. */
static int
vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
{

	return (vp->v_iflag & VI_DEFINACT);
}

/*
 * Periodic pass over the mount's lazy list processing only deferred
 * inactivations (used when the filesystem opts out of msync, MNTK_NOMSYNC).
 */
static void __noinline
vfs_periodic_inactive(struct mount *mp, int flags)
{
	struct vnode *vp, *mvp;
	int lkflags;

	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
	if (flags != MNT_WAIT)
		lkflags |= LK_NOWAIT;

	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
		/* Re-check under the interlock; the filter ran unlocked. */
		if ((vp->v_iflag & VI_DEFINACT) == 0) {
			VI_UNLOCK(vp);
			continue;
		}
		vp->v_iflag &= ~VI_DEFINACT;
		vfs_deferred_inactive(vp, lkflags);
	}
}

static inline bool
vfs_want_msync(struct vnode *vp)
{
	struct vm_object *obj;

	/*
	 * This test may be performed without any locks held.
	 * We rely on vm_object's type stability.
	 */
	if (vp->v_vflag & VV_NOSYNC)
		return (false);
	obj = vp->v_object;
	return (obj != NULL && vm_object_mightbedirty(obj));
}

/* Lazy-list filter: visit vnodes needing msync or deferred inactivation. */
static int
vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
{

	if (vp->v_vflag & VV_NOSYNC)
		return (false);
	if (vp->v_iflag & VI_DEFINACT)
		return (true);
	return (vfs_want_msync(vp));
}

/*
 * Periodic pass over the mount's lazy list: sync dirty pages back to the
 * filesystem and run any deferred inactivations encountered along the way.
 */
static void __noinline
vfs_periodic_msync_inactive(struct mount *mp, int flags)
{
	struct vnode *vp, *mvp;
	struct vm_object *obj;
	int lkflags, objflags;
	bool seen_defer;

	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
	if (flags != MNT_WAIT) {
		lkflags |= LK_NOWAIT;
		objflags = OBJPC_NOSYNC;
	} else {
		objflags = OBJPC_SYNC;
	}

	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
		seen_defer = false;
		if (vp->v_iflag & VI_DEFINACT) {
			vp->v_iflag &= ~VI_DEFINACT;
			seen_defer = true;
		}
		if (!vfs_want_msync(vp)) {
			if (seen_defer)
				vfs_deferred_inactive(vp, lkflags);
			else
				VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, lkflags) == 0) {
			obj = vp->v_object;
			if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
				VM_OBJECT_WLOCK(obj);
				vm_object_page_clean(obj, 0, 0, objflags);
				VM_OBJECT_WUNLOCK(obj);
			}
			vput(vp);
			/* Drop the hold we inherited by clearing VI_DEFINACT. */
			if (seen_defer)
				vdrop(vp);
		} else {
			if (seen_defer)
				vdefer_inactive_unlocked(vp);
		}
	}
}

/*
 * Periodic filesystem maintenance entry point, called from the syncer.
 */
void
vfs_periodic(struct mount *mp, int flags)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
		vfs_periodic_inactive(mp, flags);
	else
		vfs_periodic_msync_inactive(mp, flags);
}

/*
 * Release the resources backing a vpollinfo without draining waiters;
 * only safe when no one else can see vi (see destroy_vpollinfo()).
 */
static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{

	knlist_destroy(&vi->vpi_selinfo.si_note);
	mtx_destroy(&vi->vpi_lock);
	free(vi, M_VNODEPOLL);
}

/*
 * Tear down a published vpollinfo: detach knotes and drain select/poll
 * waiters before freeing.
 */
static void
destroy_vpollinfo(struct vpollinfo *vi)
{

	knlist_clear(&vi->vpi_selinfo.si_note, 1);
	seldrain(&vi->vpi_selinfo);
	destroy_vpollinfo_free(vi);
}

/*
 * Initialize per-vnode helper structure to hold poll-related state.
 */
void
v_addpollinfo(struct vnode *vp)
{
	struct vpollinfo *vi;

	if (vp->v_pollinfo != NULL)
		return;
	vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	    vfs_knlunlock, vfs_knl_assert_lock);
	VI_LOCK(vp);
	/* Lost the race with another thread installing pollinfo. */
	if (vp->v_pollinfo != NULL) {
		VI_UNLOCK(vp);
		destroy_vpollinfo_free(vi);
		return;
	}
	vp->v_pollinfo = vi;
	VI_UNLOCK(vp);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{

	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	if (vp->v_pollinfo->vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo->vpi_revents;
		vp->v_pollinfo->vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo->vpi_lock);
		return (events);
	}
	vp->v_pollinfo->vpi_events |= events;
	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	return (0);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int	sync_fsync(struct vop_fsync_args *);
static int	sync_inactive(struct vop_inactive_args *);
static int	sync_reclaim(struct vop_reclaim_args *);

static struct vop_vector sync_vnodeops = {
	.vop_bypass =	VOP_EOPNOTSUPP,
	.vop_close =	sync_close,		/* close */
	.vop_fsync =	sync_fsync,		/* fsync */
	.vop_inactive =	sync_inactive,	/* inactive */
	.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
	.vop_reclaim =	sync_reclaim,	/* reclaim */
	.vop_lock1 =	vop_stdlock,	/* lock */
	.vop_unlock =	vop_stdunlock,	/* unlock */
	.vop_islocked =	vop_stdislocked,	/* islocked */
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque1(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	vn_set_state(vp, VSTATE_CONSTRUCTED);
	VOP_UNLOCK(vp);
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	/* Lost the race installing mnt_syncer; destroy our extra vnode. */
	if (vp != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}

/*
 * Detach and release the syncer vnode for the given mount point, if any.
 */
void
vfs_deallocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;

	mtx_lock(&sync_mtx);
	vp = mp->mnt_syncer;
	if (vp != NULL)
		mp->mnt_syncer = NULL;
	mtx_unlock(&sync_mtx);
	if (vp != NULL)
		vrele(vp);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(struct vop_fsync_args *ap)
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int error, save;
	struct bufobj *bo;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	bo = &syncvp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay);
	BO_UNLOCK(bo);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, MBF_NOWAIT) != 0)
		return (0);
	VOP_UNLOCK(syncvp);
	save = curthread_pflags_set(TDP_SYNCIO);
	/*
	 * The filesystem at hand may be idle with free vnodes stored in the
	 * batch.  Return them instead of letting them stay there indefinitely.
	 */
	vfs_periodic(mp, MNT_NOWAIT);
	error = VFS_SYNC(mp, MNT_LAZY);
	curthread_pflags_restore(save);
	vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
	vfs_unbusy(mp);
	return (error);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(struct vop_inactive_args *ap)
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
 */
static int
sync_reclaim(struct vop_reclaim_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	mtx_lock(&sync_mtx);
	if (vp->v_mount->mnt_syncer == vp)
		vp->v_mount->mnt_syncer = NULL;
	if (bo->bo_flag & BO_ONWORKLST) {
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		sync_vnode_count--;
		bo->bo_flag &= ~BO_ONWORKLST;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);

	return (0);
}

/*
 * Return non-zero if the vnode's VM object may hold dirty pages that
 * need to be flushed.
 */
int
vn_need_pageq_flush(struct vnode *vp)
{
	struct vm_object *obj;

	obj = vp->v_object;
	return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
	    vm_object_mightbedirty(obj));
}

/*
 * Check if vnode represents a disk device
 */
bool
vn_isdisk_error(struct vnode *vp, int *errp)
{
	int error;

	if (vp->v_type != VCHR) {
		error = ENOTBLK;
		goto out;
	}
	error = 0;
	dev_lock();
	if (vp->v_rdev == NULL)
		error = ENXIO;
	else if (vp->v_rdev->si_devsw == NULL)
		error = ENXIO;
	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
		error = ENOTBLK;
	dev_unlock();
out:
	*errp = error;
	return (error == 0);
}

/* Convenience wrapper around vn_isdisk_error() when the errno is unwanted. */
bool
vn_isdisk(struct vnode *vp)
{
	int error;

	return (vn_isdisk_error(vp, &error));
}

/*
 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
 * the comment above cache_fplookup for details.
 */
int
vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
{
	int error;

	VFS_SMR_ASSERT_ENTERED();

	/* Check the owner. */
	if (cred->cr_uid == file_uid) {
		if (file_mode & S_IXUSR)
			return (0);
		goto out_error;
	}

	/* Otherwise, check the groups (first match) */
	if (groupmember(file_gid, cred)) {
		if (file_mode & S_IXGRP)
			return (0);
		goto out_error;
	}

	/* Otherwise, check everyone else. */
	if (file_mode & S_IXOTH)
		return (0);
out_error:
	/*
	 * Permission check failed, but it is possible denial will get overwritten
	 * (e.g., when root is traversing through a 700 directory owned by someone
	 * else).
	 *
	 * vaccess() calls priv_check_cred which in turn can descend into MAC
	 * modules overriding this result. It's quite unclear what semantics
	 * are allowed for them to operate, thus for safety we don't call them
	 * from within the SMR section. This also means if any such modules
	 * are present, we have to let the regular lookup decide.
	 */
	error = priv_check_cred_vfs_lookup_nomac(cred);
	switch (error) {
	case 0:
		return (0);
	case EAGAIN:
		/*
		 * MAC modules present.
		 */
		return (EAGAIN);
	case EPERM:
		return (EACCES);
	default:
		return (error);
	}
}

/*
 * Common filesystem object access control check routine.  Accepts a
 * vnode's type, "mode", uid and gid, requested access mode, and credentials.
 * Returns 0 on success, or an errno on failure.
 */
int
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
    accmode_t accmode, struct ucred *cred)
{
	accmode_t dac_granted;
	accmode_t priv_granted;

	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
	    ("invalid bit in accmode"));
	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
	    ("VAPPEND without VWRITE"));

	/*
	 * Look for a normal, non-privileged way to access the file/directory
	 * as requested.  If it exists, go with that.
	 */

	dac_granted = 0;

	/* Check the owner. */
	if (cred->cr_uid == file_uid) {
		dac_granted |= VADMIN;
		if (file_mode & S_IXUSR)
			dac_granted |= VEXEC;
		if (file_mode & S_IRUSR)
			dac_granted |= VREAD;
		if (file_mode & S_IWUSR)
			dac_granted |= (VWRITE | VAPPEND);

		if ((accmode & dac_granted) == accmode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check the groups (first match) */
	if (groupmember(file_gid, cred)) {
		if (file_mode & S_IXGRP)
			dac_granted |= VEXEC;
		if (file_mode & S_IRGRP)
			dac_granted |= VREAD;
		if (file_mode & S_IWGRP)
			dac_granted |= (VWRITE | VAPPEND);

		if ((accmode & dac_granted) == accmode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check everyone else. */
	if (file_mode & S_IXOTH)
		dac_granted |= VEXEC;
	if (file_mode & S_IROTH)
		dac_granted |= VREAD;
	if (file_mode & S_IWOTH)
		dac_granted |= (VWRITE | VAPPEND);
	if ((accmode & dac_granted) == accmode)
		return (0);

privcheck:
	/*
	 * Build a privilege mask to determine if the set of privileges
	 * satisfies the requirements when combined with the granted mask
	 * from above.  For each privilege, if the privilege is required,
	 * bitwise or the request type onto the priv_granted mask.
	 */
	priv_granted = 0;

	if (type == VDIR) {
		/*
		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
		 * requests, instead of PRIV_VFS_EXEC.
		 */
		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
		    !priv_check_cred(cred, PRIV_VFS_LOOKUP))
			priv_granted |= VEXEC;
	} else {
		/*
		 * Ensure that at least one execute bit is on.  Otherwise,
		 * a privileged user will always succeed, and we don't want
		 * this to happen unless the file really is executable.
		 */
		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
		    !priv_check_cred(cred, PRIV_VFS_EXEC))
			priv_granted |= VEXEC;
	}

	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
	    !priv_check_cred(cred, PRIV_VFS_READ))
		priv_granted |= VREAD;

	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
	    !priv_check_cred(cred, PRIV_VFS_WRITE))
		priv_granted |= (VWRITE | VAPPEND);

	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
	    !priv_check_cred(cred, PRIV_VFS_ADMIN))
		priv_granted |= VADMIN;

	if ((accmode & (priv_granted | dac_granted)) == accmode) {
		return (0);
	}

	return ((accmode & VADMIN) ? EPERM : EACCES);
}

/*
 * Credential check based on process requesting service, and per-attribute
 * permissions.
 */
int
extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
    struct thread *td, accmode_t accmode)
{

	/*
	 * Kernel-invoked always succeeds.
	 */
	if (cred == NOCRED)
		return (0);

	/*
	 * Do not allow privileged processes in jail to directly manipulate
	 * system attributes.
	 */
	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_SYSTEM:
		/* Potentially should be: return (EPERM); */
		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
	case EXTATTR_NAMESPACE_USER:
		return (VOP_ACCESS(vp, accmode, cred, td));
	default:
		return (EPERM);
	}
}

#ifdef DEBUG_VFS_LOCKS
int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;	/* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

int vfs_badlock_vnode = 1;	/* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
    0, "Print vnode details on lock violations");

#ifdef KDB
int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif

/*
 * Report a vnode locking assertion failure, honoring the debug.vfs_badlock_*
 * knobs above (backtrace, vnode dump, message, debugger entry).
 */
static void
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
{

#ifdef KDB
	if (vfs_badlock_backtrace)
		kdb_backtrace();
#endif
	if (vfs_badlock_vnode)
		vn_printf(vp, "vnode ");
	if (vfs_badlock_print)
		printf("%s: %p %s\n", str, (void *)vp, msg);
	if (vfs_badlock_ddb)
		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}

/* Assert that the vnode interlock is held by the current thread. */
void
assert_vi_locked(struct vnode *vp, const char *str)
{

	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
		vfs_badlock("interlock is not locked but should be", str, vp);
}

/* Assert that the vnode interlock is not held by the current thread. */
void
assert_vi_unlocked(struct vnode *vp, const char *str)
{

	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
		vfs_badlock("interlock is locked but should not be", str, vp);
}

/* Assert that the vnode lock is held (shared or exclusive) by this thread. */
void
assert_vop_locked(struct vnode *vp, const char *str)
{
	int locked;

	if (KERNEL_PANICKED() || vp == NULL)
		return;

	locked = VOP_ISLOCKED(vp);
	if (locked == 0 || locked == LK_EXCLOTHER)
		vfs_badlock("is not locked but should be", str, vp);
}

/* Assert that the vnode lock is not held exclusively by this thread. */
void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
	if (KERNEL_PANICKED() || vp == NULL)
		return;

	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
		vfs_badlock("is locked but should not be", str, vp);
}

/* Assert that the vnode lock is held exclusively by this thread. */
void
assert_vop_elocked(struct vnode *vp, const char *str)
{
	if (KERNEL_PANICKED() || vp == NULL)
		return;

	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
		vfs_badlock("is not exclusive locked but should be", str, vp);
}
#endif /* DEBUG_VFS_LOCKS */

/*
 * Release the vnode references/locks held across a failed VOP_RENAME,
 * per the rename contract (tvp/tdvp locked, fdvp/fvp referenced).
 */
void
vop_rename_fail(struct vop_rename_args *ap)
{

	if (ap->a_tvp != NULL)
		vput(ap->a_tvp);
	if (ap->a_tdvp == ap->a_tvp)
		vrele(ap->a_tdvp);
	else
		vput(ap->a_tdvp);
	vrele(ap->a_fdvp);
	vrele(ap->a_fvp);
}

/*
 * Pre-VOP_RENAME hook: sanity-check the locking contract and take hold
 * references on all four vnodes for the duration of the operation.
 */
void
vop_rename_pre(void *ap)
{
	struct vop_rename_args *a = ap;

#ifdef DEBUG_VFS_LOCKS
	if (a->a_tvp)
		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");

	/* Check the source (from). */
	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");

	/* Check the target. */
	if (a->a_tvp)
		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
	/*
	 * It may be tempting to add vn_seqc_write_begin/end calls here and
	 * in vop_rename_post but that's not going to work out since some
	 * filesystems relookup vnodes mid-rename. This is probably a bug.
	 *
	 * For now filesystems are expected to do the relevant calls after they
	 * decide what vnodes to operate on.
	 */
	if (a->a_tdvp != a->a_fdvp)
		vhold(a->a_fdvp);
	if (a->a_tvp != a->a_fvp)
		vhold(a->a_fvp);
	vhold(a->a_tdvp);
	if (a->a_tvp)
		vhold(a->a_tvp);
}

#ifdef DEBUG_VFS_LOCKS
/* Fast-path lookup hooks must run entirely inside the vfs SMR section. */
void
vop_fplookup_vexec_debugpre(void *ap __unused)
{

	VFS_SMR_ASSERT_ENTERED();
}

void
vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
{

	VFS_SMR_ASSERT_ENTERED();
}

void
vop_fplookup_symlink_debugpre(void *ap __unused)
{

	VFS_SMR_ASSERT_ENTERED();
}

void
vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
{

	VFS_SMR_ASSERT_ENTERED();
}

/*
 * Shared lock assertion for fsync-like VOPs: devices are exempt; an
 * extended-shared mount only needs the lock held, others need it exclusive.
 */
static void
vop_fsync_debugprepost(struct vnode *vp, const char *name)
{
	if (vp->v_type == VCHR)
		;
	else if (MNT_EXTENDED_SHARED(vp->v_mount))
		ASSERT_VOP_LOCKED(vp, name);
	else
		ASSERT_VOP_ELOCKED(vp, name);
}

void
vop_fsync_debugpre(void *a)
{
	struct vop_fsync_args *ap;

	ap = a;
	vop_fsync_debugprepost(ap->a_vp, "fsync");
}

void
vop_fsync_debugpost(void *a, int rc __unused)
{
	struct vop_fsync_args *ap;

	ap = a;
	vop_fsync_debugprepost(ap->a_vp, "fsync");
}

void
vop_fdatasync_debugpre(void *a)
{
	struct vop_fdatasync_args *ap;

	ap = a;
	vop_fsync_debugprepost(ap->a_vp, "fsync");
}

void
vop_fdatasync_debugpost(void *a, int rc __unused)
{
	struct vop_fdatasync_args *ap;

	ap = a;
	vop_fsync_debugprepost(ap->a_vp, "fsync");
}

void
vop_strategy_debugpre(void *ap)
{
	struct vop_strategy_args *a;
	struct buf *bp;

	a = ap;
	bp = a->a_bp;

	/*
	 * Cluster ops lock their component buffers but not the IO container.
	 */
	if ((bp->b_flags & B_CLUSTER) != 0)
		return;

	if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
		if (vfs_badlock_print)
			printf(
			    "VOP_STRATEGY: bp is not locked but should be\n");
		if (vfs_badlock_ddb)
			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
	}
}

void
vop_lock_debugpre(void *ap)
{
	struct vop_lock1_args *a = ap;

	if ((a->a_flags & LK_INTERLOCK) == 0)
		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	else
		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
}

void
vop_lock_debugpost(void *ap, int rc)
{
	struct vop_lock1_args *a = ap;

	/* VOP_LOCK consumes the interlock even on failure. */
	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
}

void
vop_unlock_debugpre(void *ap)
{
	struct vop_unlock_args *a = ap;
	struct vnode *vp = a->a_vp;

	VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp);
	ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK");
}

void
vop_need_inactive_debugpre(void *ap)
{
	struct vop_need_inactive_args *a = ap;

	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
}

void
vop_need_inactive_debugpost(void *ap, int rc)
{
	struct vop_need_inactive_args *a = ap;

	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
}
#endif

/*
 * Pre-VOP_CREATE hook: open a seqc write window on the directory so
 * lockless lookup will not trust it mid-modification.
 */
void
vop_create_pre(void *ap)
{
	struct vop_create_args *a;
	struct vnode *dvp;

	a = ap;
	dvp = a->a_dvp;
	vn_seqc_write_begin(dvp);
}

/*
 * Post-VOP_CREATE hook: close the seqc window and, on success, post a
 * NOTE_WRITE kevent on the directory.
 */
void
vop_create_post(void *ap, int rc)
{
	struct vop_create_args *a;
	struct vnode *dvp;

	a = ap;
	dvp = a->a_dvp;
	vn_seqc_write_end(dvp);
	if (!rc)
		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
}

void
vop_whiteout_pre(void *ap)
{
	struct vop_whiteout_args *a;
	struct vnode *dvp;

	a = ap;
	dvp = a->a_dvp;
5705 vn_seqc_write_begin(dvp); 5706 } 5707 5708 void 5709 vop_whiteout_post(void *ap, int rc) 5710 { 5711 struct vop_whiteout_args *a; 5712 struct vnode *dvp; 5713 5714 a = ap; 5715 dvp = a->a_dvp; 5716 vn_seqc_write_end(dvp); 5717 } 5718 5719 void 5720 vop_deleteextattr_pre(void *ap) 5721 { 5722 struct vop_deleteextattr_args *a; 5723 struct vnode *vp; 5724 5725 a = ap; 5726 vp = a->a_vp; 5727 vn_seqc_write_begin(vp); 5728 } 5729 5730 void 5731 vop_deleteextattr_post(void *ap, int rc) 5732 { 5733 struct vop_deleteextattr_args *a; 5734 struct vnode *vp; 5735 5736 a = ap; 5737 vp = a->a_vp; 5738 vn_seqc_write_end(vp); 5739 if (!rc) 5740 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5741 } 5742 5743 void 5744 vop_link_pre(void *ap) 5745 { 5746 struct vop_link_args *a; 5747 struct vnode *vp, *tdvp; 5748 5749 a = ap; 5750 vp = a->a_vp; 5751 tdvp = a->a_tdvp; 5752 vn_seqc_write_begin(vp); 5753 vn_seqc_write_begin(tdvp); 5754 } 5755 5756 void 5757 vop_link_post(void *ap, int rc) 5758 { 5759 struct vop_link_args *a; 5760 struct vnode *vp, *tdvp; 5761 5762 a = ap; 5763 vp = a->a_vp; 5764 tdvp = a->a_tdvp; 5765 vn_seqc_write_end(vp); 5766 vn_seqc_write_end(tdvp); 5767 if (!rc) { 5768 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5769 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5770 } 5771 } 5772 5773 void 5774 vop_mkdir_pre(void *ap) 5775 { 5776 struct vop_mkdir_args *a; 5777 struct vnode *dvp; 5778 5779 a = ap; 5780 dvp = a->a_dvp; 5781 vn_seqc_write_begin(dvp); 5782 } 5783 5784 void 5785 vop_mkdir_post(void *ap, int rc) 5786 { 5787 struct vop_mkdir_args *a; 5788 struct vnode *dvp; 5789 5790 a = ap; 5791 dvp = a->a_dvp; 5792 vn_seqc_write_end(dvp); 5793 if (!rc) 5794 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5795 } 5796 5797 #ifdef DEBUG_VFS_LOCKS 5798 void 5799 vop_mkdir_debugpost(void *ap, int rc) 5800 { 5801 struct vop_mkdir_args *a; 5802 5803 a = ap; 5804 if (!rc) 5805 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 5806 } 5807 #endif 5808 5809 void 5810 vop_mknod_pre(void *ap) 5811 { 5812 
struct vop_mknod_args *a; 5813 struct vnode *dvp; 5814 5815 a = ap; 5816 dvp = a->a_dvp; 5817 vn_seqc_write_begin(dvp); 5818 } 5819 5820 void 5821 vop_mknod_post(void *ap, int rc) 5822 { 5823 struct vop_mknod_args *a; 5824 struct vnode *dvp; 5825 5826 a = ap; 5827 dvp = a->a_dvp; 5828 vn_seqc_write_end(dvp); 5829 if (!rc) 5830 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5831 } 5832 5833 void 5834 vop_reclaim_post(void *ap, int rc) 5835 { 5836 struct vop_reclaim_args *a; 5837 struct vnode *vp; 5838 5839 a = ap; 5840 vp = a->a_vp; 5841 ASSERT_VOP_IN_SEQC(vp); 5842 if (!rc) 5843 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5844 } 5845 5846 void 5847 vop_remove_pre(void *ap) 5848 { 5849 struct vop_remove_args *a; 5850 struct vnode *dvp, *vp; 5851 5852 a = ap; 5853 dvp = a->a_dvp; 5854 vp = a->a_vp; 5855 vn_seqc_write_begin(dvp); 5856 vn_seqc_write_begin(vp); 5857 } 5858 5859 void 5860 vop_remove_post(void *ap, int rc) 5861 { 5862 struct vop_remove_args *a; 5863 struct vnode *dvp, *vp; 5864 5865 a = ap; 5866 dvp = a->a_dvp; 5867 vp = a->a_vp; 5868 vn_seqc_write_end(dvp); 5869 vn_seqc_write_end(vp); 5870 if (!rc) { 5871 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5872 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5873 } 5874 } 5875 5876 void 5877 vop_rename_post(void *ap, int rc) 5878 { 5879 struct vop_rename_args *a = ap; 5880 long hint; 5881 5882 if (!rc) { 5883 hint = NOTE_WRITE; 5884 if (a->a_fdvp == a->a_tdvp) { 5885 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5886 hint |= NOTE_LINK; 5887 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5888 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5889 } else { 5890 hint |= NOTE_EXTEND; 5891 if (a->a_fvp->v_type == VDIR) 5892 hint |= NOTE_LINK; 5893 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5894 5895 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5896 a->a_tvp->v_type == VDIR) 5897 hint &= ~NOTE_LINK; 5898 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5899 } 5900 5901 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5902 if (a->a_tvp) 5903 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5904 } 5905 
if (a->a_tdvp != a->a_fdvp) 5906 vdrop(a->a_fdvp); 5907 if (a->a_tvp != a->a_fvp) 5908 vdrop(a->a_fvp); 5909 vdrop(a->a_tdvp); 5910 if (a->a_tvp) 5911 vdrop(a->a_tvp); 5912 } 5913 5914 void 5915 vop_rmdir_pre(void *ap) 5916 { 5917 struct vop_rmdir_args *a; 5918 struct vnode *dvp, *vp; 5919 5920 a = ap; 5921 dvp = a->a_dvp; 5922 vp = a->a_vp; 5923 vn_seqc_write_begin(dvp); 5924 vn_seqc_write_begin(vp); 5925 } 5926 5927 void 5928 vop_rmdir_post(void *ap, int rc) 5929 { 5930 struct vop_rmdir_args *a; 5931 struct vnode *dvp, *vp; 5932 5933 a = ap; 5934 dvp = a->a_dvp; 5935 vp = a->a_vp; 5936 vn_seqc_write_end(dvp); 5937 vn_seqc_write_end(vp); 5938 if (!rc) { 5939 vp->v_vflag |= VV_UNLINKED; 5940 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5941 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5942 } 5943 } 5944 5945 void 5946 vop_setattr_pre(void *ap) 5947 { 5948 struct vop_setattr_args *a; 5949 struct vnode *vp; 5950 5951 a = ap; 5952 vp = a->a_vp; 5953 vn_seqc_write_begin(vp); 5954 } 5955 5956 void 5957 vop_setattr_post(void *ap, int rc) 5958 { 5959 struct vop_setattr_args *a; 5960 struct vnode *vp; 5961 5962 a = ap; 5963 vp = a->a_vp; 5964 vn_seqc_write_end(vp); 5965 if (!rc) 5966 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5967 } 5968 5969 void 5970 vop_setacl_pre(void *ap) 5971 { 5972 struct vop_setacl_args *a; 5973 struct vnode *vp; 5974 5975 a = ap; 5976 vp = a->a_vp; 5977 vn_seqc_write_begin(vp); 5978 } 5979 5980 void 5981 vop_setacl_post(void *ap, int rc __unused) 5982 { 5983 struct vop_setacl_args *a; 5984 struct vnode *vp; 5985 5986 a = ap; 5987 vp = a->a_vp; 5988 vn_seqc_write_end(vp); 5989 } 5990 5991 void 5992 vop_setextattr_pre(void *ap) 5993 { 5994 struct vop_setextattr_args *a; 5995 struct vnode *vp; 5996 5997 a = ap; 5998 vp = a->a_vp; 5999 vn_seqc_write_begin(vp); 6000 } 6001 6002 void 6003 vop_setextattr_post(void *ap, int rc) 6004 { 6005 struct vop_setextattr_args *a; 6006 struct vnode *vp; 6007 6008 a = ap; 6009 vp = a->a_vp; 6010 vn_seqc_write_end(vp); 6011 if (!rc) 
6012 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6013 } 6014 6015 void 6016 vop_symlink_pre(void *ap) 6017 { 6018 struct vop_symlink_args *a; 6019 struct vnode *dvp; 6020 6021 a = ap; 6022 dvp = a->a_dvp; 6023 vn_seqc_write_begin(dvp); 6024 } 6025 6026 void 6027 vop_symlink_post(void *ap, int rc) 6028 { 6029 struct vop_symlink_args *a; 6030 struct vnode *dvp; 6031 6032 a = ap; 6033 dvp = a->a_dvp; 6034 vn_seqc_write_end(dvp); 6035 if (!rc) 6036 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6037 } 6038 6039 void 6040 vop_open_post(void *ap, int rc) 6041 { 6042 struct vop_open_args *a = ap; 6043 6044 if (!rc) 6045 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6046 } 6047 6048 void 6049 vop_close_post(void *ap, int rc) 6050 { 6051 struct vop_close_args *a = ap; 6052 6053 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6054 !VN_IS_DOOMED(a->a_vp))) { 6055 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6056 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6057 } 6058 } 6059 6060 void 6061 vop_read_post(void *ap, int rc) 6062 { 6063 struct vop_read_args *a = ap; 6064 6065 if (!rc) 6066 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6067 } 6068 6069 void 6070 vop_read_pgcache_post(void *ap, int rc) 6071 { 6072 struct vop_read_pgcache_args *a = ap; 6073 6074 if (!rc) 6075 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6076 } 6077 6078 void 6079 vop_readdir_post(void *ap, int rc) 6080 { 6081 struct vop_readdir_args *a = ap; 6082 6083 if (!rc) 6084 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6085 } 6086 6087 static struct knlist fs_knlist; 6088 6089 static void 6090 vfs_event_init(void *arg) 6091 { 6092 knlist_init_mtx(&fs_knlist, NULL); 6093 } 6094 /* XXX - correct order? 
*/ 6095 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6096 6097 void 6098 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6099 { 6100 6101 KNOTE_UNLOCKED(&fs_knlist, event); 6102 } 6103 6104 static int filt_fsattach(struct knote *kn); 6105 static void filt_fsdetach(struct knote *kn); 6106 static int filt_fsevent(struct knote *kn, long hint); 6107 6108 struct filterops fs_filtops = { 6109 .f_isfd = 0, 6110 .f_attach = filt_fsattach, 6111 .f_detach = filt_fsdetach, 6112 .f_event = filt_fsevent 6113 }; 6114 6115 static int 6116 filt_fsattach(struct knote *kn) 6117 { 6118 6119 kn->kn_flags |= EV_CLEAR; 6120 knlist_add(&fs_knlist, kn, 0); 6121 return (0); 6122 } 6123 6124 static void 6125 filt_fsdetach(struct knote *kn) 6126 { 6127 6128 knlist_remove(&fs_knlist, kn, 0); 6129 } 6130 6131 static int 6132 filt_fsevent(struct knote *kn, long hint) 6133 { 6134 6135 kn->kn_fflags |= kn->kn_sfflags & hint; 6136 6137 return (kn->kn_fflags != 0); 6138 } 6139 6140 static int 6141 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6142 { 6143 struct vfsidctl vc; 6144 int error; 6145 struct mount *mp; 6146 6147 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6148 if (error) 6149 return (error); 6150 if (vc.vc_vers != VFS_CTL_VERS1) 6151 return (EINVAL); 6152 mp = vfs_getvfs(&vc.vc_fsid); 6153 if (mp == NULL) 6154 return (ENOENT); 6155 /* ensure that a specific sysctl goes to the right filesystem. */ 6156 if (strcmp(vc.vc_fstypename, "*") != 0 && 6157 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6158 vfs_rel(mp); 6159 return (EINVAL); 6160 } 6161 VCTLTOREQ(&vc, req); 6162 error = VFS_SYSCTL(mp, vc.vc_op, req); 6163 vfs_rel(mp); 6164 return (error); 6165 } 6166 6167 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6168 NULL, 0, sysctl_vfs_ctl, "", 6169 "Sysctl by fsid"); 6170 6171 /* 6172 * Function to initialize a va_filerev field sensibly. 6173 * XXX: Wouldn't a random number make a lot more sense ?? 
6174 */ 6175 u_quad_t 6176 init_va_filerev(void) 6177 { 6178 struct bintime bt; 6179 6180 getbinuptime(&bt); 6181 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6182 } 6183 6184 static int filt_vfsread(struct knote *kn, long hint); 6185 static int filt_vfswrite(struct knote *kn, long hint); 6186 static int filt_vfsvnode(struct knote *kn, long hint); 6187 static void filt_vfsdetach(struct knote *kn); 6188 static struct filterops vfsread_filtops = { 6189 .f_isfd = 1, 6190 .f_detach = filt_vfsdetach, 6191 .f_event = filt_vfsread 6192 }; 6193 static struct filterops vfswrite_filtops = { 6194 .f_isfd = 1, 6195 .f_detach = filt_vfsdetach, 6196 .f_event = filt_vfswrite 6197 }; 6198 static struct filterops vfsvnode_filtops = { 6199 .f_isfd = 1, 6200 .f_detach = filt_vfsdetach, 6201 .f_event = filt_vfsvnode 6202 }; 6203 6204 static void 6205 vfs_knllock(void *arg) 6206 { 6207 struct vnode *vp = arg; 6208 6209 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6210 } 6211 6212 static void 6213 vfs_knlunlock(void *arg) 6214 { 6215 struct vnode *vp = arg; 6216 6217 VOP_UNLOCK(vp); 6218 } 6219 6220 static void 6221 vfs_knl_assert_lock(void *arg, int what) 6222 { 6223 #ifdef DEBUG_VFS_LOCKS 6224 struct vnode *vp = arg; 6225 6226 if (what == LA_LOCKED) 6227 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6228 else 6229 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6230 #endif 6231 } 6232 6233 int 6234 vfs_kqfilter(struct vop_kqfilter_args *ap) 6235 { 6236 struct vnode *vp = ap->a_vp; 6237 struct knote *kn = ap->a_kn; 6238 struct knlist *knl; 6239 6240 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6241 kn->kn_filter != EVFILT_WRITE), 6242 ("READ/WRITE filter on a FIFO leaked through")); 6243 switch (kn->kn_filter) { 6244 case EVFILT_READ: 6245 kn->kn_fop = &vfsread_filtops; 6246 break; 6247 case EVFILT_WRITE: 6248 kn->kn_fop = &vfswrite_filtops; 6249 break; 6250 case EVFILT_VNODE: 6251 kn->kn_fop = &vfsvnode_filtops; 6252 break; 6253 default: 6254 return 
(EINVAL); 6255 } 6256 6257 kn->kn_hook = (caddr_t)vp; 6258 6259 v_addpollinfo(vp); 6260 if (vp->v_pollinfo == NULL) 6261 return (ENOMEM); 6262 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6263 vhold(vp); 6264 knlist_add(knl, kn, 0); 6265 6266 return (0); 6267 } 6268 6269 /* 6270 * Detach knote from vnode 6271 */ 6272 static void 6273 filt_vfsdetach(struct knote *kn) 6274 { 6275 struct vnode *vp = (struct vnode *)kn->kn_hook; 6276 6277 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6278 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6279 vdrop(vp); 6280 } 6281 6282 /*ARGSUSED*/ 6283 static int 6284 filt_vfsread(struct knote *kn, long hint) 6285 { 6286 struct vnode *vp = (struct vnode *)kn->kn_hook; 6287 off_t size; 6288 int res; 6289 6290 /* 6291 * filesystem is gone, so set the EOF flag and schedule 6292 * the knote for deletion. 6293 */ 6294 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6295 VI_LOCK(vp); 6296 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6297 VI_UNLOCK(vp); 6298 return (1); 6299 } 6300 6301 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6302 return (0); 6303 6304 VI_LOCK(vp); 6305 kn->kn_data = size - kn->kn_fp->f_offset; 6306 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6307 VI_UNLOCK(vp); 6308 return (res); 6309 } 6310 6311 /*ARGSUSED*/ 6312 static int 6313 filt_vfswrite(struct knote *kn, long hint) 6314 { 6315 struct vnode *vp = (struct vnode *)kn->kn_hook; 6316 6317 VI_LOCK(vp); 6318 6319 /* 6320 * filesystem is gone, so set the EOF flag and schedule 6321 * the knote for deletion. 
6322 */ 6323 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6324 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6325 6326 kn->kn_data = 0; 6327 VI_UNLOCK(vp); 6328 return (1); 6329 } 6330 6331 static int 6332 filt_vfsvnode(struct knote *kn, long hint) 6333 { 6334 struct vnode *vp = (struct vnode *)kn->kn_hook; 6335 int res; 6336 6337 VI_LOCK(vp); 6338 if (kn->kn_sfflags & hint) 6339 kn->kn_fflags |= hint; 6340 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6341 kn->kn_flags |= EV_EOF; 6342 VI_UNLOCK(vp); 6343 return (1); 6344 } 6345 res = (kn->kn_fflags != 0); 6346 VI_UNLOCK(vp); 6347 return (res); 6348 } 6349 6350 /* 6351 * Returns whether the directory is empty or not. 6352 * If it is empty, the return value is 0; otherwise 6353 * the return value is an error value (which may 6354 * be ENOTEMPTY). 6355 */ 6356 int 6357 vfs_emptydir(struct vnode *vp) 6358 { 6359 struct uio uio; 6360 struct iovec iov; 6361 struct dirent *dirent, *dp, *endp; 6362 int error, eof; 6363 6364 error = 0; 6365 eof = 0; 6366 6367 ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); 6368 VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); 6369 6370 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); 6371 iov.iov_base = dirent; 6372 iov.iov_len = sizeof(struct dirent); 6373 6374 uio.uio_iov = &iov; 6375 uio.uio_iovcnt = 1; 6376 uio.uio_offset = 0; 6377 uio.uio_resid = sizeof(struct dirent); 6378 uio.uio_segflg = UIO_SYSSPACE; 6379 uio.uio_rw = UIO_READ; 6380 uio.uio_td = curthread; 6381 6382 while (eof == 0 && error == 0) { 6383 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, 6384 NULL, NULL); 6385 if (error != 0) 6386 break; 6387 endp = (void *)((uint8_t *)dirent + 6388 sizeof(struct dirent) - uio.uio_resid); 6389 for (dp = dirent; dp < endp; 6390 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { 6391 if (dp->d_type == DT_WHT) 6392 continue; 6393 if (dp->d_namlen == 0) 6394 continue; 6395 if (dp->d_type != DT_DIR && 6396 dp->d_type != DT_UNKNOWN) { 6397 
error = ENOTEMPTY; 6398 break; 6399 } 6400 if (dp->d_namlen > 2) { 6401 error = ENOTEMPTY; 6402 break; 6403 } 6404 if (dp->d_namlen == 1 && 6405 dp->d_name[0] != '.') { 6406 error = ENOTEMPTY; 6407 break; 6408 } 6409 if (dp->d_namlen == 2 && 6410 dp->d_name[1] != '.') { 6411 error = ENOTEMPTY; 6412 break; 6413 } 6414 uio.uio_resid = sizeof(struct dirent); 6415 } 6416 } 6417 free(dirent, M_TEMP); 6418 return (error); 6419 } 6420 6421 int 6422 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6423 { 6424 int error; 6425 6426 if (dp->d_reclen > ap->a_uio->uio_resid) 6427 return (ENAMETOOLONG); 6428 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6429 if (error) { 6430 if (ap->a_ncookies != NULL) { 6431 if (ap->a_cookies != NULL) 6432 free(ap->a_cookies, M_TEMP); 6433 ap->a_cookies = NULL; 6434 *ap->a_ncookies = 0; 6435 } 6436 return (error); 6437 } 6438 if (ap->a_ncookies == NULL) 6439 return (0); 6440 6441 KASSERT(ap->a_cookies, 6442 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6443 6444 *ap->a_cookies = realloc(*ap->a_cookies, 6445 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6446 (*ap->a_cookies)[*ap->a_ncookies] = off; 6447 *ap->a_ncookies += 1; 6448 return (0); 6449 } 6450 6451 /* 6452 * The purpose of this routine is to remove granularity from accmode_t, 6453 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6454 * VADMIN and VAPPEND. 6455 * 6456 * If it returns 0, the caller is supposed to continue with the usual 6457 * access checks using 'accmode' as modified by this routine. If it 6458 * returns nonzero value, the caller is supposed to return that value 6459 * as errno. 6460 * 6461 * Note that after this routine runs, accmode may be zero. 6462 */ 6463 int 6464 vfs_unixify_accmode(accmode_t *accmode) 6465 { 6466 /* 6467 * There is no way to specify explicit "deny" rule using 6468 * file mode or POSIX.1e ACLs. 
6469 */ 6470 if (*accmode & VEXPLICIT_DENY) { 6471 *accmode = 0; 6472 return (0); 6473 } 6474 6475 /* 6476 * None of these can be translated into usual access bits. 6477 * Also, the common case for NFSv4 ACLs is to not contain 6478 * either of these bits. Caller should check for VWRITE 6479 * on the containing directory instead. 6480 */ 6481 if (*accmode & (VDELETE_CHILD | VDELETE)) 6482 return (EPERM); 6483 6484 if (*accmode & VADMIN_PERMS) { 6485 *accmode &= ~VADMIN_PERMS; 6486 *accmode |= VADMIN; 6487 } 6488 6489 /* 6490 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6491 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6492 */ 6493 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6494 6495 return (0); 6496 } 6497 6498 /* 6499 * Clear out a doomed vnode (if any) and replace it with a new one as long 6500 * as the fs is not being unmounted. Return the root vnode to the caller. 6501 */ 6502 static int __noinline 6503 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6504 { 6505 struct vnode *vp; 6506 int error; 6507 6508 restart: 6509 if (mp->mnt_rootvnode != NULL) { 6510 MNT_ILOCK(mp); 6511 vp = mp->mnt_rootvnode; 6512 if (vp != NULL) { 6513 if (!VN_IS_DOOMED(vp)) { 6514 vrefact(vp); 6515 MNT_IUNLOCK(mp); 6516 error = vn_lock(vp, flags); 6517 if (error == 0) { 6518 *vpp = vp; 6519 return (0); 6520 } 6521 vrele(vp); 6522 goto restart; 6523 } 6524 /* 6525 * Clear the old one. 
6526 */ 6527 mp->mnt_rootvnode = NULL; 6528 } 6529 MNT_IUNLOCK(mp); 6530 if (vp != NULL) { 6531 vfs_op_barrier_wait(mp); 6532 vrele(vp); 6533 } 6534 } 6535 error = VFS_CACHEDROOT(mp, flags, vpp); 6536 if (error != 0) 6537 return (error); 6538 if (mp->mnt_vfs_ops == 0) { 6539 MNT_ILOCK(mp); 6540 if (mp->mnt_vfs_ops != 0) { 6541 MNT_IUNLOCK(mp); 6542 return (0); 6543 } 6544 if (mp->mnt_rootvnode == NULL) { 6545 vrefact(*vpp); 6546 mp->mnt_rootvnode = *vpp; 6547 } else { 6548 if (mp->mnt_rootvnode != *vpp) { 6549 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6550 panic("%s: mismatch between vnode returned " 6551 " by VFS_CACHEDROOT and the one cached " 6552 " (%p != %p)", 6553 __func__, *vpp, mp->mnt_rootvnode); 6554 } 6555 } 6556 } 6557 MNT_IUNLOCK(mp); 6558 } 6559 return (0); 6560 } 6561 6562 int 6563 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6564 { 6565 struct mount_pcpu *mpcpu; 6566 struct vnode *vp; 6567 int error; 6568 6569 if (!vfs_op_thread_enter(mp, mpcpu)) 6570 return (vfs_cache_root_fallback(mp, flags, vpp)); 6571 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6572 if (vp == NULL || VN_IS_DOOMED(vp)) { 6573 vfs_op_thread_exit(mp, mpcpu); 6574 return (vfs_cache_root_fallback(mp, flags, vpp)); 6575 } 6576 vrefact(vp); 6577 vfs_op_thread_exit(mp, mpcpu); 6578 error = vn_lock(vp, flags); 6579 if (error != 0) { 6580 vrele(vp); 6581 return (vfs_cache_root_fallback(mp, flags, vpp)); 6582 } 6583 *vpp = vp; 6584 return (0); 6585 } 6586 6587 struct vnode * 6588 vfs_cache_root_clear(struct mount *mp) 6589 { 6590 struct vnode *vp; 6591 6592 /* 6593 * ops > 0 guarantees there is nobody who can see this vnode 6594 */ 6595 MPASS(mp->mnt_vfs_ops > 0); 6596 vp = mp->mnt_rootvnode; 6597 if (vp != NULL) 6598 vn_seqc_write_begin(vp); 6599 mp->mnt_rootvnode = NULL; 6600 return (vp); 6601 } 6602 6603 void 6604 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6605 { 6606 6607 MPASS(mp->mnt_vfs_ops > 0); 6608 vrefact(vp); 6609 mp->mnt_rootvnode = vp; 6610 } 6611 
/*
 * These are helper functions for filesystems to traverse all
 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 *
 * This interface replaces MNT_VNODE_FOREACH.
 */

/*
 * Advance the iteration marker *mvp to the next live vnode on mp's vnode
 * list and return that vnode with its interlock held, or NULL (with the
 * marker freed and the mount interlock released) when exhausted.
 */
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	maybe_yield();
	MNT_ILOCK(mp);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		/* Re-check under the interlock; doomed vnodes are skipped. */
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		break;
	}
	if (vp == NULL) {
		__mnt_vnode_markerfree_all(mvp, mp);
		/* MNT_IUNLOCK(mp); -- done in above function */
		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
		return (NULL);
	}
	/* Re-park the marker right after the vnode being handed out. */
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}

/*
 * Begin a MNT_VNODE_FOREACH_ALL pass: allocate the marker vnode, take a
 * mount reference and return the first live vnode with its interlock
 * held (or NULL and no marker if the list has no live vnodes).
 */
struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	*mvp = vn_alloc_marker(mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		break;
	}
	if (vp == NULL) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		vn_free_marker(*mvp);
		*mvp = NULL;
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}

/*
 * Finish (or abort) an ALL iteration: unlink and free the marker and
 * drop the mount reference.  Called with the mount interlock held;
 * releases it in all cases.
 */
void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL) {
		MNT_IUNLOCK(mp);
		return;
	}

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	vn_free_marker(*mvp);
	*mvp = NULL;
}

/*
 * These are helper functions for filesystems to traverse their
 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
 */
/* Free the lazy-iteration marker and drop the mount reference it pinned. */
static void
mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	vn_free_marker(*mvp);
	*mvp = NULL;
}

/*
 * Relock the mp mount vnode list lock with the vp vnode interlock in the
 * conventional lock order during mnt_vnode_next_lazy iteration.
 *
 * On entry, the mount vnode list lock is held and the vnode interlock is not.
 * The list lock is dropped and reacquired.  On success, both locks are held.
 * On failure, the mount vnode list lock is held but the vnode interlock is
 * not, and the procedure may have yielded.
 */
static bool
mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
    struct vnode *vp)
{

	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
	    TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
	    ("%s: bad marker", __func__));
	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
	    ("%s: inappropriate vnode", __func__));
	ASSERT_VI_UNLOCKED(vp, __func__);
	mtx_assert(&mp->mnt_listmtx, MA_OWNED);

	/* Park the marker just before vp so iteration can resume from it. */
	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
	TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);

	/*
	 * Note we may be racing against vdrop which transitioned the hold
	 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
	 * if we are the only user after we get the interlock we will just
	 * vdrop.
	 */
	vhold(vp);
	mtx_unlock(&mp->mnt_listmtx);
	VI_LOCK(vp);
	if (VN_IS_DOOMED(vp)) {
		VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
		goto out_lost;
	}
	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
	/*
	 * There is nothing to do if we are the last user.
	 */
	if (!refcount_release_if_not_last(&vp->v_holdcnt))
		goto out_lost;
	mtx_lock(&mp->mnt_listmtx);
	return (true);
out_lost:
	vdropl(vp);
	maybe_yield();
	mtx_lock(&mp->mnt_listmtx);
	return (false);
}

/*
 * Walk mp's lazy vnode list past the marker *mvp, returning the next
 * vnode accepted by cb(vp, cbarg) with its interlock held, or NULL (and
 * the marker freed) when the list is exhausted.  Called with mnt_listmtx
 * held; the lock is released before returning.
 */
static struct vnode *
mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{
	struct vnode *vp;

	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
	vp = TAILQ_NEXT(*mvp, v_lazylist);
	while (vp != NULL) {
		if (vp->v_type == VMARKER) {
			vp = TAILQ_NEXT(vp, v_lazylist);
			continue;
		}
		/*
		 * See if we want to process the vnode. Note we may encounter a
		 * long string of vnodes we don't care about and hog the list
		 * as a result. Check for it and requeue the marker.
		 */
		VNPASS(!VN_IS_DOOMED(vp), vp);
		if (!cb(vp, cbarg)) {
			if (!should_yield()) {
				vp = TAILQ_NEXT(vp, v_lazylist);
				continue;
			}
			/* Requeue after vp, drop the lock and yield. */
			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
			    v_lazylist);
			TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
			    v_lazylist);
			mtx_unlock(&mp->mnt_listmtx);
			kern_yield(PRI_USER);
			mtx_lock(&mp->mnt_listmtx);
			goto restart;
		}
		/*
		 * Try-lock because this is the wrong lock order.
		 */
		if (!VI_TRYLOCK(vp) &&
		    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
			goto restart;
		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
		    ("alien vnode on the lazy list %p %p", vp, mp));
		VNPASS(vp->v_mount == mp, vp);
		VNPASS(!VN_IS_DOOMED(vp), vp);
		break;
	}
	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);

	/* Check if we are done */
	if (vp == NULL) {
		mtx_unlock(&mp->mnt_listmtx);
		mnt_vnode_markerfree_lazy(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
	mtx_unlock(&mp->mnt_listmtx);
	ASSERT_VI_LOCKED(vp, "lazy iter");
	return (vp);
}

/* Continue a MNT_VNODE_FOREACH_LAZY pass. */
struct vnode *
__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{

	maybe_yield();
	mtx_lock(&mp->mnt_listmtx);
	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
}

/*
 * Begin a MNT_VNODE_FOREACH_LAZY pass: allocate the marker, take a mount
 * reference and return the first matching vnode (interlock held).
 */
struct vnode *
__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{
	struct vnode *vp;

	/* Unlocked peek; an empty list means nothing to iterate. */
	if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
		return (NULL);

	*mvp = vn_alloc_marker(mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);

	mtx_lock(&mp->mnt_listmtx);
	vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
	if (vp == NULL) {
		mtx_unlock(&mp->mnt_listmtx);
		mnt_vnode_markerfree_lazy(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
}

/* Abort a LAZY iteration early: unlink and free the marker. */
void
__mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL)
		return;

	mtx_lock(&mp->mnt_listmtx);
	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
	mtx_unlock(&mp->mnt_listmtx);
	mnt_vnode_markerfree_lazy(mvp, mp);
}

/*
 * Check VEXEC on a directory during lookup, honoring a one-shot
 * NOEXECCHECK request (the flag is consumed here).
 */
int
vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
{

	if ((cnp->cn_flags & NOEXECCHECK) != 0) {
		cnp->cn_flags &= ~NOEXECCHECK;
		return (0);
	}

	return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread));
}

/*
 * Do not use this variant unless you have means other than the hold count
 * to prevent the vnode from getting freed.
 */
void
vn_seqc_write_begin_locked(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(vp->v_holdcnt > 0, vp);
	VNPASS(vp->v_seqc_users >= 0, vp);
	/* Writers nest; only the first entry flips the seqc into modify. */
	vp->v_seqc_users++;
	if (vp->v_seqc_users == 1)
		seqc_sleepable_write_begin(&vp->v_seqc);
}

void
vn_seqc_write_begin(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_begin_locked(vp);
	VI_UNLOCK(vp);
}

void
vn_seqc_write_end_locked(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(vp->v_seqc_users > 0, vp);
	/* Only the last writer leaving ends the modify window. */
	vp->v_seqc_users--;
	if (vp->v_seqc_users == 0)
		seqc_sleepable_write_end(&vp->v_seqc);
}

void
vn_seqc_write_end(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_end_locked(vp);
	VI_UNLOCK(vp);
}

/*
 * Special case handling for allocating and freeing vnodes.
 *
 * The counter remains unchanged on free so that a doomed vnode will
 * keep testing as in modify as long as it is accessible with SMR.
 */
/* Reset seqc state when a vnode is (re)allocated. */
static void
vn_seqc_init(struct vnode *vp)
{

	vp->v_seqc = 0;
	vp->v_seqc_users = 0;
}

/*
 * Assert-only counterpart of vn_seqc_write_end used on vnode free: the
 * seqc deliberately stays in modify (see the comment above).
 */
static void
vn_seqc_write_end_free(struct vnode *vp)
{

	VNPASS(seqc_in_modify(vp->v_seqc), vp);
	VNPASS(vp->v_seqc_users == 1, vp);
}

/*
 * v_irflag is read locklessly; updates happen under the interlock with an
 * atomic store so readers never observe torn values.  The _set variant
 * insists the bits are not already set; _set_cond tolerates that.
 */
void
vn_irflag_set_locked(struct vnode *vp, short toset)
{
	short flags;

	ASSERT_VI_LOCKED(vp, __func__);
	flags = vn_irflag_read(vp);
	VNASSERT((flags & toset) == 0, vp,
	    ("%s: some of the passed flags already set (have %d, passed %d)\n",
	    __func__, flags, toset));
	atomic_store_short(&vp->v_irflag, flags | toset);
}

void
vn_irflag_set(struct vnode *vp, short toset)
{

	VI_LOCK(vp);
	vn_irflag_set_locked(vp, toset);
	VI_UNLOCK(vp);
}

void
vn_irflag_set_cond_locked(struct vnode *vp, short toset)
{
	short flags;

	ASSERT_VI_LOCKED(vp, __func__);
	flags = vn_irflag_read(vp);
	atomic_store_short(&vp->v_irflag, flags | toset);
}

void
vn_irflag_set_cond(struct vnode *vp, short toset)
{

	VI_LOCK(vp);
	vn_irflag_set_cond_locked(vp, toset);
	VI_UNLOCK(vp);
}

void
vn_irflag_unset_locked(struct vnode *vp, short tounset)
{
	short flags;

	ASSERT_VI_LOCKED(vp, __func__);
	flags = vn_irflag_read(vp);
	VNASSERT((flags & tounset) == tounset, vp,
	    ("%s: some of the passed flags not set (have %d, passed %d)\n",
	    __func__, flags, tounset));
	atomic_store_short(&vp->v_irflag, flags & ~tounset);
}

void
vn_irflag_unset(struct vnode *vp, short tounset)
{

	VI_LOCK(vp);
	vn_irflag_unset_locked(vp, tounset);
	VI_UNLOCK(vp);
}

/*
 * Fetch the file size via VOP_GETATTR into *size.  Returns EFBIG when
 * the size does not fit in off_t; the vnode must be locked.
 */
int
vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred)
{
	struct vattr vattr;
	int error;

	ASSERT_VOP_LOCKED(vp, __func__);
	error = VOP_GETATTR(vp, &vattr, cred);
	if (__predict_true(error == 0)) {
		if (vattr.va_size <= OFF_MAX)
			*size = vattr.va_size;
		else
			error = EFBIG;
	}
	return (error);
}

/* Unlocked convenience wrapper: takes a shared lock around the getattr. */
int
vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred)
{
	int error;

	VOP_LOCK(vp, LK_SHARED);
	error = vn_getsize_locked(vp, size, cred);
	VOP_UNLOCK(vp);
	return (error);
}

#ifdef INVARIANTS
/*
 * INVARIANTS-only validator for vnode lifecycle transitions; any
 * transition not allowed by the state machine below panics.
 * UNINITIALIZED -> CONSTRUCTED | DESTROYING -> DEAD -> UNINITIALIZED.
 */
void
vn_set_state_validate(struct vnode *vp, enum vstate state)
{

	switch (vp->v_state) {
	case VSTATE_UNINITIALIZED:
		switch (state) {
		case VSTATE_CONSTRUCTED:
		case VSTATE_DESTROYING:
			return;
		default:
			break;
		}
		break;
	case VSTATE_CONSTRUCTED:
		ASSERT_VOP_ELOCKED(vp, __func__);
		switch (state) {
		case VSTATE_DESTROYING:
			return;
		default:
			break;
		}
		break;
	case VSTATE_DESTROYING:
		ASSERT_VOP_ELOCKED(vp, __func__);
		switch (state) {
		case VSTATE_DEAD:
			return;
		default:
			break;
		}
		break;
	case VSTATE_DEAD:
		switch (state) {
		case VSTATE_UNINITIALIZED:
			return;
		default:
			break;
		}
		break;
	}

	vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state);
	panic("invalid state transition %d -> %d\n", vp->v_state, state);
}
#endif