1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 */ 36 37 /* 38 * External virtual filesystem routines 39 */ 40 41 #include <sys/cdefs.h> 42 #include "opt_ddb.h" 43 #include "opt_watchdog.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/asan.h> 48 #include <sys/bio.h> 49 #include <sys/buf.h> 50 #include <sys/capsicum.h> 51 #include <sys/condvar.h> 52 #include <sys/conf.h> 53 #include <sys/counter.h> 54 #include <sys/dirent.h> 55 #include <sys/event.h> 56 #include <sys/eventhandler.h> 57 #include <sys/extattr.h> 58 #include <sys/file.h> 59 #include <sys/fcntl.h> 60 #include <sys/jail.h> 61 #include <sys/kdb.h> 62 #include <sys/kernel.h> 63 #include <sys/kthread.h> 64 #include <sys/ktr.h> 65 #include <sys/limits.h> 66 #include <sys/lockf.h> 67 #include <sys/malloc.h> 68 #include <sys/mount.h> 69 #include <sys/namei.h> 70 #include <sys/pctrie.h> 71 #include <sys/priv.h> 72 #include <sys/reboot.h> 73 #include <sys/refcount.h> 74 #include <sys/rwlock.h> 75 #include <sys/sched.h> 76 #include <sys/sleepqueue.h> 77 #include <sys/smr.h> 78 #include <sys/smp.h> 79 #include <sys/stat.h> 80 #include <sys/sysctl.h> 81 #include <sys/syslog.h> 82 #include <sys/vmmeter.h> 83 #include <sys/vnode.h> 84 #include <sys/watchdog.h> 85 86 #include <machine/stdarg.h> 87 88 #include <security/mac/mac_framework.h> 89 90 #include <vm/vm.h> 91 #include <vm/vm_object.h> 92 #include <vm/vm_extern.h> 93 #include <vm/pmap.h> 94 #include <vm/vm_map.h> 95 #include <vm/vm_page.h> 96 #include <vm/vm_kern.h> 97 #include <vm/uma.h> 98 99 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) 100 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS 101 #endif 102 103 #ifdef DDB 104 #include <ddb/ddb.h> 105 #endif 106 107 static void delmntque(struct vnode *vp); 108 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 109 int slpflag, int slptimeo); 110 static void syncer_shutdown(void *arg, int howto); 111 static int vtryrecycle(struct vnode *vp, bool isvnlru); 112 static void v_init_counters(struct vnode *); 113 static void vn_seqc_init(struct vnode *); 114 static void vn_seqc_write_end_free(struct vnode *vp); 115 static void vgonel(struct vnode *); 116 static bool vhold_recycle_free(struct vnode *); 117 static void vdropl_recycle(struct vnode *vp); 118 static void vdrop_recycle(struct vnode *vp); 119 static void vfs_knllock(void *arg); 120 static void vfs_knlunlock(void *arg); 121 static void vfs_knl_assert_lock(void *arg, int what); 122 static void destroy_vpollinfo(struct vpollinfo *vi); 123 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 124 daddr_t startlbn, daddr_t endlbn); 125 static void vnlru_recalc(void); 126 127 static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 128 "vnode configuration and statistics"); 129 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 130 "vnode configuration"); 131 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 132 "vnode statistics"); 133 static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 134 "vnode recycling"); 135 136 /* 137 * Number of vnodes in existence. Increased whenever getnewvnode() 138 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence (legacy)");
SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode (legacy)");
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
__enum_uint8(vtype) iftovt_tab[16] = {
        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
        S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
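 * (For concreteness, using the MAXVNODES_MAX default of 8388608 defined
 * further below: 0.09 * 0.75 * 8388608 is roughly 566k vnodes, which is
 * where the figure above comes from.)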
195 */ 196 static long wantfreevnodes; 197 static long __exclusive_cache_line freevnodes; 198 static long freevnodes_old; 199 200 static u_long recycles_count; 201 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, 202 "Number of vnodes recycled to meet vnode cache targets (legacy)"); 203 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, 204 &recycles_count, 0, 205 "Number of vnodes recycled to meet vnode cache targets"); 206 207 static u_long recycles_free_count; 208 SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, 209 &recycles_free_count, 0, 210 "Number of free vnodes recycled to meet vnode cache targets (legacy)"); 211 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, 212 &recycles_free_count, 0, 213 "Number of free vnodes recycled to meet vnode cache targets"); 214 215 static counter_u64_t direct_recycles_free_count; 216 SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD, 217 &direct_recycles_free_count, 218 "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets"); 219 220 static counter_u64_t vnode_skipped_requeues; 221 SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues, 222 "Number of times LRU requeue was skipped due to lock contention"); 223 224 static u_long deferred_inact; 225 SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, 226 &deferred_inact, 0, "Number of times inactive processing was deferred"); 227 228 /* To keep more than one thread at a time from running vfs_getnewfsid */ 229 static struct mtx mntid_mtx; 230 231 /* 232 * Lock for any access to the following: 233 * vnode_list 234 * numvnodes 235 * freevnodes 236 */ 237 static struct mtx __exclusive_cache_line vnode_list_mtx; 238 239 /* Publicly exported FS */ 240 struct nfs_public nfs_pub; 241 242 static uma_zone_t buf_trie_zone; 243 static smr_t buf_trie_smr; 244 245 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 246 static uma_zone_t vnode_zone; 247 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 248 249 __read_frequently smr_t vfs_smr; 250 251 /* 252 * The workitem queue. 253 * 254 * It is useful to delay writes of file data and filesystem metadata 255 * for tens of seconds so that quickly created and deleted files need 256 * not waste disk bandwidth being created and removed. To realize this, 257 * we append vnodes to a "workitem" queue. When running with a soft 258 * updates implementation, most pending metadata dependencies should 259 * not wait for more than a few seconds. Thus, mounted on block devices 260 * are delayed only about a half the time that file data is delayed. 261 * Similarly, directory updates are more critical, so are only delayed 262 * about a third the time that file data is delayed. Thus, there are 263 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 264 * one each second (driven off the filesystem syncer process). The 265 * syncer_delayno variable indicates the next queue that is to be processed. 
266 * Items that need to be processed soon are placed in this queue: 267 * 268 * syncer_workitem_pending[syncer_delayno] 269 * 270 * A delay of fifteen seconds is done by placing the request fifteen 271 * entries later in the queue: 272 * 273 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 274 * 275 */ 276 static int syncer_delayno; 277 static long syncer_mask; 278 LIST_HEAD(synclist, bufobj); 279 static struct synclist *syncer_workitem_pending; 280 /* 281 * The sync_mtx protects: 282 * bo->bo_synclist 283 * sync_vnode_count 284 * syncer_delayno 285 * syncer_state 286 * syncer_workitem_pending 287 * syncer_worklist_len 288 * rushjob 289 */ 290 static struct mtx sync_mtx; 291 static struct cv sync_wakeup; 292 293 #define SYNCER_MAXDELAY 32 294 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 295 static int syncdelay = 30; /* max time to delay syncing data */ 296 static int filedelay = 30; /* time to delay syncing files */ 297 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 298 "Time to delay syncing files (in seconds)"); 299 static int dirdelay = 29; /* time to delay syncing directories */ 300 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 301 "Time to delay syncing directories (in seconds)"); 302 static int metadelay = 28; /* time to delay syncing metadata */ 303 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 304 "Time to delay syncing metadata (in seconds)"); 305 static int rushjob; /* number of slots to run ASAP */ 306 static int stat_rush_requests; /* number of times I/O speeded up */ 307 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 308 "Number of times I/O speeded up (rush requests)"); 309 310 #define VDBATCH_SIZE 8 311 struct vdbatch { 312 u_int index; 313 struct mtx lock; 314 struct vnode *tab[VDBATCH_SIZE]; 315 }; 316 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 317 318 static void vdbatch_dequeue(struct vnode *vp); 319 320 /* 321 * When shutting down the syncer, run it at four times normal speed. 322 */ 323 #define SYNCER_SHUTDOWN_SPEEDUP 4 324 static int sync_vnode_count; 325 static int syncer_worklist_len; 326 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 327 syncer_state; 328 329 /* Target for maximum number of vnodes. */ 330 u_long desiredvnodes; 331 static u_long gapvnodes; /* gap between wanted and desired */ 332 static u_long vhiwat; /* enough extras after expansion */ 333 static u_long vlowat; /* minimal extras before expansion */ 334 static bool vstir; /* nonzero to stir non-free vnodes */ 335 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 336 337 static u_long vnlru_read_freevnodes(void); 338 339 /* 340 * Note that no attempt is made to sanitize these parameters. 341 */ 342 static int 343 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 344 { 345 u_long val; 346 int error; 347 348 val = desiredvnodes; 349 error = sysctl_handle_long(oidp, &val, 0, req); 350 if (error != 0 || req->newptr == NULL) 351 return (error); 352 353 if (val == desiredvnodes) 354 return (0); 355 mtx_lock(&vnode_list_mtx); 356 desiredvnodes = val; 357 wantfreevnodes = desiredvnodes / 4; 358 vnlru_recalc(); 359 mtx_unlock(&vnode_list_mtx); 360 /* 361 * XXX There is no protection against multiple threads changing 362 * desiredvnodes at the same time. Locking above only helps vnlru and 363 * getnewvnode. 
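         * Both the legacy kern.maxvnodes sysctl and the newer
         * vfs.vnode.param.limit alias declared below funnel into this
         * handler, so e.g. "sysctl kern.maxvnodes=2000000" from userland
         * is one way such a concurrent change can occur.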
364 */ 365 vfs_hash_changesize(desiredvnodes); 366 cache_changesize(desiredvnodes); 367 return (0); 368 } 369 370 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 371 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 372 "LU", "Target for maximum number of vnodes (legacy)"); 373 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit, 374 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 375 "LU", "Target for maximum number of vnodes"); 376 377 static int 378 sysctl_freevnodes(SYSCTL_HANDLER_ARGS) 379 { 380 u_long rfreevnodes; 381 382 rfreevnodes = vnlru_read_freevnodes(); 383 return (sysctl_handle_long(oidp, &rfreevnodes, 0, req)); 384 } 385 386 SYSCTL_PROC(_vfs, OID_AUTO, freevnodes, 387 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, 388 "LU", "Number of \"free\" vnodes (legacy)"); 389 SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free, 390 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, 391 "LU", "Number of \"free\" vnodes"); 392 393 static int 394 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 395 { 396 u_long val; 397 int error; 398 399 val = wantfreevnodes; 400 error = sysctl_handle_long(oidp, &val, 0, req); 401 if (error != 0 || req->newptr == NULL) 402 return (error); 403 404 if (val == wantfreevnodes) 405 return (0); 406 mtx_lock(&vnode_list_mtx); 407 wantfreevnodes = val; 408 vnlru_recalc(); 409 mtx_unlock(&vnode_list_mtx); 410 return (0); 411 } 412 413 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 414 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 415 "LU", "Target for minimum number of \"free\" vnodes (legacy)"); 416 SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree, 417 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 418 "LU", "Target for minimum number of \"free\" vnodes"); 419 420 static int vnlru_nowhere; 421 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS, 422 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 423 424 static int 425 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 426 { 427 struct vnode *vp; 428 struct nameidata nd; 429 char *buf; 430 unsigned long ndflags; 431 int error; 432 433 if (req->newptr == NULL) 434 return (EINVAL); 435 if (req->newlen >= PATH_MAX) 436 return (E2BIG); 437 438 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 439 error = SYSCTL_IN(req, buf, req->newlen); 440 if (error != 0) 441 goto out; 442 443 buf[req->newlen] = '\0'; 444 445 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 446 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 447 if ((error = namei(&nd)) != 0) 448 goto out; 449 vp = nd.ni_vp; 450 451 if (VN_IS_DOOMED(vp)) { 452 /* 453 * This vnode is being recycled. Return != 0 to let the caller 454 * know that the sysctl had no effect. 
Return EAGAIN because a 455 * subsequent call will likely succeed (since namei will create 456 * a new vnode if necessary) 457 */ 458 error = EAGAIN; 459 goto putvnode; 460 } 461 462 vgone(vp); 463 putvnode: 464 vput(vp); 465 NDFREE_PNBUF(&nd); 466 out: 467 free(buf, M_TEMP); 468 return (error); 469 } 470 471 static int 472 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 473 { 474 struct thread *td = curthread; 475 struct vnode *vp; 476 struct file *fp; 477 int error; 478 int fd; 479 480 if (req->newptr == NULL) 481 return (EBADF); 482 483 error = sysctl_handle_int(oidp, &fd, 0, req); 484 if (error != 0) 485 return (error); 486 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 487 if (error != 0) 488 return (error); 489 vp = fp->f_vnode; 490 491 error = vn_lock(vp, LK_EXCLUSIVE); 492 if (error != 0) 493 goto drop; 494 495 vgone(vp); 496 VOP_UNLOCK(vp); 497 drop: 498 fdrop(fp, td); 499 return (error); 500 } 501 502 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 503 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 504 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 505 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 506 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 507 sysctl_ftry_reclaim_vnode, "I", 508 "Try to reclaim a vnode by its file descriptor"); 509 510 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 511 #define vnsz2log 8 512 #ifndef DEBUG_LOCKS 513 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 514 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 515 "vnsz2log needs to be updated"); 516 #endif 517 518 /* 519 * Support for the bufobj clean & dirty pctrie. 520 */ 521 static void * 522 buf_trie_alloc(struct pctrie *ptree) 523 { 524 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 525 } 526 527 static void 528 buf_trie_free(struct pctrie *ptree, void *node) 529 { 530 uma_zfree_smr(buf_trie_zone, node); 531 } 532 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 533 buf_trie_smr); 534 535 /* 536 * Initialize the vnode management data structures. 537 * 538 * Reevaluate the following cap on the number of vnodes after the physical 539 * memory size exceeds 512GB. In the limit, as the physical memory size 540 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
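 * As a quick arithmetic check of that ratio: 512GB is 512 * 1024 * 1024 KB,
 * and dividing by 64 yields 8388608, which is the MAXVNODES_MAX default
 * defined below.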
541 */ 542 #ifndef MAXVNODES_MAX 543 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 544 #endif 545 546 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 547 548 static struct vnode * 549 vn_alloc_marker(struct mount *mp) 550 { 551 struct vnode *vp; 552 553 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 554 vp->v_type = VMARKER; 555 vp->v_mount = mp; 556 557 return (vp); 558 } 559 560 static void 561 vn_free_marker(struct vnode *vp) 562 { 563 564 MPASS(vp->v_type == VMARKER); 565 free(vp, M_VNODE_MARKER); 566 } 567 568 #ifdef KASAN 569 static int 570 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 571 { 572 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 573 return (0); 574 } 575 576 static void 577 vnode_dtor(void *mem, int size, void *arg __unused) 578 { 579 size_t end1, end2, off1, off2; 580 581 _Static_assert(offsetof(struct vnode, v_vnodelist) < 582 offsetof(struct vnode, v_dbatchcpu), 583 "KASAN marks require updating"); 584 585 off1 = offsetof(struct vnode, v_vnodelist); 586 off2 = offsetof(struct vnode, v_dbatchcpu); 587 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 588 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 589 590 /* 591 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 592 * after the vnode has been freed. Try to get some KASAN coverage by 593 * marking everything except those two fields as invalid. Because 594 * KASAN's tracking is not byte-granular, any preceding fields sharing 595 * the same 8-byte aligned word must also be marked valid. 596 */ 597 598 /* Handle the area from the start until v_vnodelist... */ 599 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 600 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 601 602 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 603 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 604 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 605 if (off2 > off1) 606 kasan_mark((void *)((char *)mem + off1), off2 - off1, 607 off2 - off1, KASAN_UMA_FREED); 608 609 /* ... and finally the area from v_dbatchcpu to the end. */ 610 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 611 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 612 KASAN_UMA_FREED); 613 } 614 #endif /* KASAN */ 615 616 /* 617 * Initialize a vnode as it first enters the zone. 618 */ 619 static int 620 vnode_init(void *mem, int size, int flags) 621 { 622 struct vnode *vp; 623 624 vp = mem; 625 bzero(vp, size); 626 /* 627 * Setup locks. 628 */ 629 vp->v_vnlock = &vp->v_lock; 630 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 631 /* 632 * By default, don't allow shared locks unless filesystems opt-in. 633 */ 634 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 635 LK_NOSHARE | LK_IS_VNODE); 636 /* 637 * Initialize bufobj. 638 */ 639 bufobj_init(&vp->v_bufobj, vp); 640 /* 641 * Initialize namecache. 642 */ 643 cache_vnode_init(vp); 644 /* 645 * Initialize rangelocks. 646 */ 647 rangelock_init(&vp->v_rl); 648 649 vp->v_dbatchcpu = NOCPU; 650 651 vp->v_state = VSTATE_DEAD; 652 653 /* 654 * Check vhold_recycle_free for an explanation. 655 */ 656 vp->v_holdcnt = VHOLD_NO_SMR; 657 vp->v_type = VNON; 658 mtx_lock(&vnode_list_mtx); 659 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 660 mtx_unlock(&vnode_list_mtx); 661 return (0); 662 } 663 664 /* 665 * Free a vnode when it is cleared from the zone. 
666 */ 667 static void 668 vnode_fini(void *mem, int size) 669 { 670 struct vnode *vp; 671 struct bufobj *bo; 672 673 vp = mem; 674 vdbatch_dequeue(vp); 675 mtx_lock(&vnode_list_mtx); 676 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 677 mtx_unlock(&vnode_list_mtx); 678 rangelock_destroy(&vp->v_rl); 679 lockdestroy(vp->v_vnlock); 680 mtx_destroy(&vp->v_interlock); 681 bo = &vp->v_bufobj; 682 rw_destroy(BO_LOCKPTR(bo)); 683 684 kasan_mark(mem, size, size, 0); 685 } 686 687 /* 688 * Provide the size of NFS nclnode and NFS fh for calculation of the 689 * vnode memory consumption. The size is specified directly to 690 * eliminate dependency on NFS-private header. 691 * 692 * Other filesystems may use bigger or smaller (like UFS and ZFS) 693 * private inode data, but the NFS-based estimation is ample enough. 694 * Still, we care about differences in the size between 64- and 32-bit 695 * platforms. 696 * 697 * Namecache structure size is heuristically 698 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 699 */ 700 #ifdef _LP64 701 #define NFS_NCLNODE_SZ (528 + 64) 702 #define NC_SZ 148 703 #else 704 #define NFS_NCLNODE_SZ (360 + 32) 705 #define NC_SZ 92 706 #endif 707 708 static void 709 vntblinit(void *dummy __unused) 710 { 711 struct vdbatch *vd; 712 uma_ctor ctor; 713 uma_dtor dtor; 714 int cpu, physvnodes, virtvnodes; 715 716 /* 717 * Desiredvnodes is a function of the physical memory size and the 718 * kernel's heap size. Generally speaking, it scales with the 719 * physical memory size. The ratio of desiredvnodes to the physical 720 * memory size is 1:16 until desiredvnodes exceeds 98,304. 721 * Thereafter, the 722 * marginal ratio of desiredvnodes to the physical memory size is 723 * 1:64. However, desiredvnodes is limited by the kernel's heap 724 * size. The memory required by desiredvnodes vnodes and vm objects 725 * must not exceed 1/10th of the kernel's heap size. 726 */ 727 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 728 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 729 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 730 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 731 desiredvnodes = min(physvnodes, virtvnodes); 732 if (desiredvnodes > MAXVNODES_MAX) { 733 if (bootverbose) 734 printf("Reducing kern.maxvnodes %lu -> %lu\n", 735 desiredvnodes, MAXVNODES_MAX); 736 desiredvnodes = MAXVNODES_MAX; 737 } 738 wantfreevnodes = desiredvnodes / 4; 739 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 740 TAILQ_INIT(&vnode_list); 741 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 742 /* 743 * The lock is taken to appease WITNESS. 744 */ 745 mtx_lock(&vnode_list_mtx); 746 vnlru_recalc(); 747 mtx_unlock(&vnode_list_mtx); 748 vnode_list_free_marker = vn_alloc_marker(NULL); 749 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 750 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 751 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 752 753 #ifdef KASAN 754 ctor = vnode_ctor; 755 dtor = vnode_dtor; 756 #else 757 ctor = NULL; 758 dtor = NULL; 759 #endif 760 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 761 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 762 uma_zone_set_smr(vnode_zone, vfs_smr); 763 764 /* 765 * Preallocate enough nodes to support one-per buf so that 766 * we can not fail an insert. reassignbuf() callers can not 767 * tolerate the insertion failure. 
768 */ 769 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 770 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 771 UMA_ZONE_NOFREE | UMA_ZONE_SMR); 772 buf_trie_smr = uma_zone_get_smr(buf_trie_zone); 773 uma_prealloc(buf_trie_zone, nbuf); 774 775 vnodes_created = counter_u64_alloc(M_WAITOK); 776 direct_recycles_free_count = counter_u64_alloc(M_WAITOK); 777 vnode_skipped_requeues = counter_u64_alloc(M_WAITOK); 778 779 /* 780 * Initialize the filesystem syncer. 781 */ 782 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 783 &syncer_mask); 784 syncer_maxdelay = syncer_mask + 1; 785 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 786 cv_init(&sync_wakeup, "syncer"); 787 788 CPU_FOREACH(cpu) { 789 vd = DPCPU_ID_PTR((cpu), vd); 790 bzero(vd, sizeof(*vd)); 791 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); 792 } 793 } 794 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 795 796 /* 797 * Mark a mount point as busy. Used to synchronize access and to delay 798 * unmounting. Eventually, mountlist_mtx is not released on failure. 799 * 800 * vfs_busy() is a custom lock, it can block the caller. 801 * vfs_busy() only sleeps if the unmount is active on the mount point. 802 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 803 * vnode belonging to mp. 804 * 805 * Lookup uses vfs_busy() to traverse mount points. 806 * root fs var fs 807 * / vnode lock A / vnode lock (/var) D 808 * /var vnode lock B /log vnode lock(/var/log) E 809 * vfs_busy lock C vfs_busy lock F 810 * 811 * Within each file system, the lock order is C->A->B and F->D->E. 812 * 813 * When traversing across mounts, the system follows that lock order: 814 * 815 * C->A->B 816 * | 817 * +->F->D->E 818 * 819 * The lookup() process for namei("/var") illustrates the process: 820 * 1. VOP_LOOKUP() obtains B while A is held 821 * 2. vfs_busy() obtains a shared lock on F while A and B are held 822 * 3. vput() releases lock on B 823 * 4. vput() releases lock on A 824 * 5. VFS_ROOT() obtains lock on D while shared lock on F is held 825 * 6. vfs_unbusy() releases shared lock on F 826 * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 827 * Attempt to lock A (instead of vp_crossmp) while D is held would 828 * violate the global order, causing deadlocks. 829 * 830 * dounmount() locks B while F is drained. Note that for stacked 831 * filesystems, D and B in the example above may be the same lock, 832 * which introdues potential lock order reversal deadlock between 833 * dounmount() and step 5 above. These filesystems may avoid the LOR 834 * by setting VV_CROSSLOCK on the covered vnode so that lock B will 835 * remain held until after step 5. 836 */ 837 int 838 vfs_busy(struct mount *mp, int flags) 839 { 840 struct mount_pcpu *mpcpu; 841 842 MPASS((flags & ~MBF_MASK) == 0); 843 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 844 845 if (vfs_op_thread_enter(mp, mpcpu)) { 846 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 847 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); 848 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); 849 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 850 vfs_mp_count_add_pcpu(mpcpu, lockref, 1); 851 vfs_op_thread_exit(mp, mpcpu); 852 if (flags & MBF_MNTLSTLOCK) 853 mtx_unlock(&mountlist_mtx); 854 return (0); 855 } 856 857 MNT_ILOCK(mp); 858 vfs_assert_mount_counters(mp); 859 MNT_REF(mp); 860 /* 861 * If mount point is currently being unmounted, sleep until the 862 * mount point fate is decided. 
If thread doing the unmounting fails, 863 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 864 * that this mount point has survived the unmount attempt and vfs_busy 865 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 866 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 867 * about to be really destroyed. vfs_busy needs to release its 868 * reference on the mount point in this case and return with ENOENT, 869 * telling the caller the mount it tried to busy is no longer valid. 870 */ 871 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 872 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), 873 ("%s: non-empty upper mount list with pending unmount", 874 __func__)); 875 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 876 MNT_REL(mp); 877 MNT_IUNLOCK(mp); 878 CTR1(KTR_VFS, "%s: failed busying before sleeping", 879 __func__); 880 return (ENOENT); 881 } 882 if (flags & MBF_MNTLSTLOCK) 883 mtx_unlock(&mountlist_mtx); 884 mp->mnt_kern_flag |= MNTK_MWAIT; 885 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 886 if (flags & MBF_MNTLSTLOCK) 887 mtx_lock(&mountlist_mtx); 888 MNT_ILOCK(mp); 889 } 890 if (flags & MBF_MNTLSTLOCK) 891 mtx_unlock(&mountlist_mtx); 892 mp->mnt_lockref++; 893 MNT_IUNLOCK(mp); 894 return (0); 895 } 896 897 /* 898 * Free a busy filesystem. 899 */ 900 void 901 vfs_unbusy(struct mount *mp) 902 { 903 struct mount_pcpu *mpcpu; 904 int c; 905 906 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 907 908 if (vfs_op_thread_enter(mp, mpcpu)) { 909 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 910 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 911 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 912 vfs_op_thread_exit(mp, mpcpu); 913 return; 914 } 915 916 MNT_ILOCK(mp); 917 vfs_assert_mount_counters(mp); 918 MNT_REL(mp); 919 c = --mp->mnt_lockref; 920 if (mp->mnt_vfs_ops == 0) { 921 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 922 MNT_IUNLOCK(mp); 923 return; 924 } 925 if (c < 0) 926 vfs_dump_mount_counters(mp); 927 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 928 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 929 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 930 mp->mnt_kern_flag &= ~MNTK_DRAINING; 931 wakeup(&mp->mnt_lockref); 932 } 933 MNT_IUNLOCK(mp); 934 } 935 936 /* 937 * Lookup a mount point by filesystem identifier. 938 */ 939 struct mount * 940 vfs_getvfs(fsid_t *fsid) 941 { 942 struct mount *mp; 943 944 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 945 mtx_lock(&mountlist_mtx); 946 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 947 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 948 vfs_ref(mp); 949 mtx_unlock(&mountlist_mtx); 950 return (mp); 951 } 952 } 953 mtx_unlock(&mountlist_mtx); 954 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 955 return ((struct mount *) 0); 956 } 957 958 /* 959 * Lookup a mount point by filesystem identifier, busying it before 960 * returning. 961 * 962 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 963 * cache for popular filesystem identifiers. The cache is lockess, using 964 * the fact that struct mount's are never freed. In worst case we may 965 * get pointer to unmounted or even different filesystem, so we have to 966 * check what we got, and go slow way if so. 
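 * (The fsidcmp() re-check after vfs_busy() succeeds is what makes a stale
 * cache entry harmless: on a mismatch the mount is unbusied again and the
 * lookup falls back to the locked mountlist walk, which re-populates the
 * cache entry on success.)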
967 */ 968 struct mount * 969 vfs_busyfs(fsid_t *fsid) 970 { 971 #define FSID_CACHE_SIZE 256 972 typedef struct mount * volatile vmp_t; 973 static vmp_t cache[FSID_CACHE_SIZE]; 974 struct mount *mp; 975 int error; 976 uint32_t hash; 977 978 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 979 hash = fsid->val[0] ^ fsid->val[1]; 980 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 981 mp = cache[hash]; 982 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 983 goto slow; 984 if (vfs_busy(mp, 0) != 0) { 985 cache[hash] = NULL; 986 goto slow; 987 } 988 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 989 return (mp); 990 else 991 vfs_unbusy(mp); 992 993 slow: 994 mtx_lock(&mountlist_mtx); 995 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 996 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 997 error = vfs_busy(mp, MBF_MNTLSTLOCK); 998 if (error) { 999 cache[hash] = NULL; 1000 mtx_unlock(&mountlist_mtx); 1001 return (NULL); 1002 } 1003 cache[hash] = mp; 1004 return (mp); 1005 } 1006 } 1007 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 1008 mtx_unlock(&mountlist_mtx); 1009 return ((struct mount *) 0); 1010 } 1011 1012 /* 1013 * Check if a user can access privileged mount options. 1014 */ 1015 int 1016 vfs_suser(struct mount *mp, struct thread *td) 1017 { 1018 int error; 1019 1020 if (jailed(td->td_ucred)) { 1021 /* 1022 * If the jail of the calling thread lacks permission for 1023 * this type of file system, deny immediately. 1024 */ 1025 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 1026 return (EPERM); 1027 1028 /* 1029 * If the file system was mounted outside the jail of the 1030 * calling thread, deny immediately. 1031 */ 1032 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 1033 return (EPERM); 1034 } 1035 1036 /* 1037 * If file system supports delegated administration, we don't check 1038 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 1039 * by the file system itself. 1040 * If this is not the user that did original mount, we check for 1041 * the PRIV_VFS_MOUNT_OWNER privilege. 1042 */ 1043 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 1044 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 1045 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1046 return (error); 1047 } 1048 return (0); 1049 } 1050 1051 /* 1052 * Get a new unique fsid. Try to make its val[0] unique, since this value 1053 * will be used to create fake device numbers for stat(). Also try (but 1054 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1055 * support 16-bit device numbers. We end up with unique val[0]'s for the 1056 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1057 * 1058 * Keep in mind that several mounts may be running in parallel. Starting 1059 * the search one past where the previous search terminated is both a 1060 * micro-optimization and a defense against returning the same fsid to 1061 * different mounts. 
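 *
 * Purely as an illustration (made-up numbers): with a vfc_typenum of 0x35
 * and mntid_base at 0x1234, the loop below hands
 * 0x35000000 | 0x00120000 | 0x34 (i.e. 0x35120034) to makedev(255, ...) as
 * the minor number, so the filesystem type occupies the top byte and the
 * two bytes of mntid_base are spread across the lower bytes.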
1062 */ 1063 void 1064 vfs_getnewfsid(struct mount *mp) 1065 { 1066 static uint16_t mntid_base; 1067 struct mount *nmp; 1068 fsid_t tfsid; 1069 int mtype; 1070 1071 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1072 mtx_lock(&mntid_mtx); 1073 mtype = mp->mnt_vfc->vfc_typenum; 1074 tfsid.val[1] = mtype; 1075 mtype = (mtype & 0xFF) << 24; 1076 for (;;) { 1077 tfsid.val[0] = makedev(255, 1078 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1079 mntid_base++; 1080 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1081 break; 1082 vfs_rel(nmp); 1083 } 1084 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1085 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1086 mtx_unlock(&mntid_mtx); 1087 } 1088 1089 /* 1090 * Knob to control the precision of file timestamps: 1091 * 1092 * 0 = seconds only; nanoseconds zeroed. 1093 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1094 * 2 = seconds and nanoseconds, truncated to microseconds. 1095 * >=3 = seconds and nanoseconds, maximum precision. 1096 */ 1097 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1098 1099 static int timestamp_precision = TSP_USEC; 1100 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1101 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1102 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1103 "3+: sec + ns (max. precision))"); 1104 1105 /* 1106 * Get a current timestamp. 1107 */ 1108 void 1109 vfs_timestamp(struct timespec *tsp) 1110 { 1111 struct timeval tv; 1112 1113 switch (timestamp_precision) { 1114 case TSP_SEC: 1115 tsp->tv_sec = time_second; 1116 tsp->tv_nsec = 0; 1117 break; 1118 case TSP_HZ: 1119 getnanotime(tsp); 1120 break; 1121 case TSP_USEC: 1122 microtime(&tv); 1123 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1124 break; 1125 case TSP_NSEC: 1126 default: 1127 nanotime(tsp); 1128 break; 1129 } 1130 } 1131 1132 /* 1133 * Set vnode attributes to VNOVAL 1134 */ 1135 void 1136 vattr_null(struct vattr *vap) 1137 { 1138 1139 vap->va_type = VNON; 1140 vap->va_size = VNOVAL; 1141 vap->va_bytes = VNOVAL; 1142 vap->va_mode = VNOVAL; 1143 vap->va_nlink = VNOVAL; 1144 vap->va_uid = VNOVAL; 1145 vap->va_gid = VNOVAL; 1146 vap->va_fsid = VNOVAL; 1147 vap->va_fileid = VNOVAL; 1148 vap->va_blocksize = VNOVAL; 1149 vap->va_rdev = VNOVAL; 1150 vap->va_atime.tv_sec = VNOVAL; 1151 vap->va_atime.tv_nsec = VNOVAL; 1152 vap->va_mtime.tv_sec = VNOVAL; 1153 vap->va_mtime.tv_nsec = VNOVAL; 1154 vap->va_ctime.tv_sec = VNOVAL; 1155 vap->va_ctime.tv_nsec = VNOVAL; 1156 vap->va_birthtime.tv_sec = VNOVAL; 1157 vap->va_birthtime.tv_nsec = VNOVAL; 1158 vap->va_flags = VNOVAL; 1159 vap->va_gen = VNOVAL; 1160 vap->va_vaflags = 0; 1161 } 1162 1163 /* 1164 * Try to reduce the total number of vnodes. 1165 * 1166 * This routine (and its user) are buggy in at least the following ways: 1167 * - all parameters were picked years ago when RAM sizes were significantly 1168 * smaller 1169 * - it can pick vnodes based on pages used by the vm object, but filesystems 1170 * like ZFS don't use it making the pick broken 1171 * - since ZFS has its own aging policy it gets partially combated by this one 1172 * - a dedicated method should be provided for filesystems to let them decide 1173 * whether the vnode should be recycled 1174 * 1175 * This routine is called when we have too many vnodes. It attempts 1176 * to free <count> vnodes and will potentially free vnodes that still 1177 * have VM backing store (VM backing store is typically the cause 1178 * of a vnode blowout so we want to do this). Therefore, this operation 1179 * is not considered cheap. 
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *                       entries if this argument is true
 * @param trigger        Only reclaim vnodes with fewer than this many resident
 *                       pages.
 * @param target         How many vnodes to reclaim.
 * @return               The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
        struct vnode *vp, *mvp;
        struct mount *mp;
        struct vm_object *object;
        u_long done;
        bool retried;

        mtx_assert(&vnode_list_mtx, MA_OWNED);

        retried = false;
        done = 0;

        mvp = vnode_list_reclaim_marker;
restart:
        vp = mvp;
        while (done < target) {
                vp = TAILQ_NEXT(vp, v_vnodelist);
                if (__predict_false(vp == NULL))
                        break;

                if (__predict_false(vp->v_type == VMARKER))
                        continue;

                /*
                 * If it's been deconstructed already, it's still
                 * referenced, or it exceeds the trigger, skip it.
                 * Also skip free vnodes.  We are trying to make space
                 * for more free vnodes, not reduce their count.
                 */
                if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
                    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
                        goto next_iter;

                if (vp->v_type == VBAD || vp->v_type == VNON)
                        goto next_iter;

                object = atomic_load_ptr(&vp->v_object);
                if (object == NULL || object->resident_page_count > trigger) {
                        goto next_iter;
                }

                /*
                 * Handle races against vnode allocation. Filesystems lock the
                 * vnode some time after it gets returned from getnewvnode,
                 * despite type and hold count being manipulated earlier.
                 * Resorting to checking v_mount restores guarantees present
                 * before the global list was reworked to contain all vnodes.
1245 */ 1246 if (!VI_TRYLOCK(vp)) 1247 goto next_iter; 1248 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1249 VI_UNLOCK(vp); 1250 goto next_iter; 1251 } 1252 if (vp->v_mount == NULL) { 1253 VI_UNLOCK(vp); 1254 goto next_iter; 1255 } 1256 vholdl(vp); 1257 VI_UNLOCK(vp); 1258 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1259 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1260 mtx_unlock(&vnode_list_mtx); 1261 1262 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1263 vdrop_recycle(vp); 1264 goto next_iter_unlocked; 1265 } 1266 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1267 vdrop_recycle(vp); 1268 vn_finished_write(mp); 1269 goto next_iter_unlocked; 1270 } 1271 1272 VI_LOCK(vp); 1273 if (vp->v_usecount > 0 || 1274 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1275 (vp->v_object != NULL && vp->v_object->handle == vp && 1276 vp->v_object->resident_page_count > trigger)) { 1277 VOP_UNLOCK(vp); 1278 vdropl_recycle(vp); 1279 vn_finished_write(mp); 1280 goto next_iter_unlocked; 1281 } 1282 recycles_count++; 1283 vgonel(vp); 1284 VOP_UNLOCK(vp); 1285 vdropl_recycle(vp); 1286 vn_finished_write(mp); 1287 done++; 1288 next_iter_unlocked: 1289 maybe_yield(); 1290 mtx_lock(&vnode_list_mtx); 1291 goto restart; 1292 next_iter: 1293 MPASS(vp->v_type != VMARKER); 1294 if (!should_yield()) 1295 continue; 1296 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1297 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1298 mtx_unlock(&vnode_list_mtx); 1299 kern_yield(PRI_USER); 1300 mtx_lock(&vnode_list_mtx); 1301 goto restart; 1302 } 1303 if (done == 0 && !retried) { 1304 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1305 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1306 retried = true; 1307 goto restart; 1308 } 1309 return (done); 1310 } 1311 1312 static int max_free_per_call = 10000; 1313 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0, 1314 "limit on vnode free requests per call to the vnlru_free routine (legacy)"); 1315 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW, 1316 &max_free_per_call, 0, 1317 "limit on vnode free requests per call to the vnlru_free routine"); 1318 1319 /* 1320 * Attempt to recycle requested amount of free vnodes. 1321 */ 1322 static int 1323 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru) 1324 { 1325 struct vnode *vp; 1326 struct mount *mp; 1327 int ocount; 1328 bool retried; 1329 1330 mtx_assert(&vnode_list_mtx, MA_OWNED); 1331 if (count > max_free_per_call) 1332 count = max_free_per_call; 1333 if (count == 0) { 1334 mtx_unlock(&vnode_list_mtx); 1335 return (0); 1336 } 1337 ocount = count; 1338 retried = false; 1339 vp = mvp; 1340 for (;;) { 1341 vp = TAILQ_NEXT(vp, v_vnodelist); 1342 if (__predict_false(vp == NULL)) { 1343 /* 1344 * The free vnode marker can be past eligible vnodes: 1345 * 1. if vdbatch_process trylock failed 1346 * 2. if vtryrecycle failed 1347 * 1348 * If so, start the scan from scratch. 
1349 */ 1350 if (!retried && vnlru_read_freevnodes() > 0) { 1351 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1352 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1353 vp = mvp; 1354 retried = true; 1355 continue; 1356 } 1357 1358 /* 1359 * Give up 1360 */ 1361 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1362 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1363 mtx_unlock(&vnode_list_mtx); 1364 break; 1365 } 1366 if (__predict_false(vp->v_type == VMARKER)) 1367 continue; 1368 if (vp->v_holdcnt > 0) 1369 continue; 1370 /* 1371 * Don't recycle if our vnode is from different type 1372 * of mount point. Note that mp is type-safe, the 1373 * check does not reach unmapped address even if 1374 * vnode is reclaimed. 1375 */ 1376 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1377 mp->mnt_op != mnt_op) { 1378 continue; 1379 } 1380 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1381 continue; 1382 } 1383 if (!vhold_recycle_free(vp)) 1384 continue; 1385 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1386 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1387 mtx_unlock(&vnode_list_mtx); 1388 /* 1389 * FIXME: ignores the return value, meaning it may be nothing 1390 * got recycled but it claims otherwise to the caller. 1391 * 1392 * Originally the value started being ignored in 2005 with 1393 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1394 * 1395 * Respecting the value can run into significant stalls if most 1396 * vnodes belong to one file system and it has writes 1397 * suspended. In presence of many threads and millions of 1398 * vnodes they keep contending on the vnode_list_mtx lock only 1399 * to find vnodes they can't recycle. 1400 * 1401 * The solution would be to pre-check if the vnode is likely to 1402 * be recycle-able, but it needs to happen with the 1403 * vnode_list_mtx lock held. This runs into a problem where 1404 * VOP_GETWRITEMOUNT (currently needed to find out about if 1405 * writes are frozen) can take locks which LOR against it. 1406 * 1407 * Check nullfs for one example (null_getwritemount). 1408 */ 1409 vtryrecycle(vp, isvnlru); 1410 count--; 1411 if (count == 0) { 1412 break; 1413 } 1414 mtx_lock(&vnode_list_mtx); 1415 vp = mvp; 1416 } 1417 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1418 return (ocount - count); 1419 } 1420 1421 /* 1422 * XXX: returns without vnode_list_mtx locked! 
1423 */ 1424 static int 1425 vnlru_free_locked_direct(int count) 1426 { 1427 int ret; 1428 1429 mtx_assert(&vnode_list_mtx, MA_OWNED); 1430 ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false); 1431 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1432 return (ret); 1433 } 1434 1435 static int 1436 vnlru_free_locked_vnlru(int count) 1437 { 1438 int ret; 1439 1440 mtx_assert(&vnode_list_mtx, MA_OWNED); 1441 ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true); 1442 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1443 return (ret); 1444 } 1445 1446 static int 1447 vnlru_free_vnlru(int count) 1448 { 1449 1450 mtx_lock(&vnode_list_mtx); 1451 return (vnlru_free_locked_vnlru(count)); 1452 } 1453 1454 void 1455 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1456 { 1457 1458 MPASS(mnt_op != NULL); 1459 MPASS(mvp != NULL); 1460 VNPASS(mvp->v_type == VMARKER, mvp); 1461 mtx_lock(&vnode_list_mtx); 1462 vnlru_free_impl(count, mnt_op, mvp, true); 1463 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1464 } 1465 1466 struct vnode * 1467 vnlru_alloc_marker(void) 1468 { 1469 struct vnode *mvp; 1470 1471 mvp = vn_alloc_marker(NULL); 1472 mtx_lock(&vnode_list_mtx); 1473 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1474 mtx_unlock(&vnode_list_mtx); 1475 return (mvp); 1476 } 1477 1478 void 1479 vnlru_free_marker(struct vnode *mvp) 1480 { 1481 mtx_lock(&vnode_list_mtx); 1482 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1483 mtx_unlock(&vnode_list_mtx); 1484 vn_free_marker(mvp); 1485 } 1486 1487 static void 1488 vnlru_recalc(void) 1489 { 1490 1491 mtx_assert(&vnode_list_mtx, MA_OWNED); 1492 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1493 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1494 vlowat = vhiwat / 2; 1495 } 1496 1497 /* 1498 * Attempt to recycle vnodes in a context that is always safe to block. 1499 * Calling vlrurecycle() from the bowels of filesystem code has some 1500 * interesting deadlock problems. 1501 */ 1502 static struct proc *vnlruproc; 1503 static int vnlruproc_sig; 1504 static u_long vnlruproc_kicks; 1505 1506 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0, 1507 "Number of times vnlru awakened due to vnode shortage"); 1508 1509 #define VNLRU_COUNT_SLOP 100 1510 1511 /* 1512 * The main freevnodes counter is only updated when a counter local to CPU 1513 * diverges from 0 by more than VNLRU_FREEVNODES_SLOP. CPUs are conditionally 1514 * walked to compute a more accurate total. 1515 * 1516 * Note: the actual value at any given moment can still exceed slop, but it 1517 * should not be by significant margin in practice. 
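 * For example (hypothetical machine): with the slop of 126 defined below,
 * each CPU can be holding up to 125 not-yet-folded-in increments or
 * decrements, so on a 64-CPU box the global counter can momentarily be off
 * by as much as 64 * 125 = 8000 vnodes.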
1518 */ 1519 #define VNLRU_FREEVNODES_SLOP 126 1520 1521 static void __noinline 1522 vfs_freevnodes_rollup(int8_t *lfreevnodes) 1523 { 1524 1525 atomic_add_long(&freevnodes, *lfreevnodes); 1526 *lfreevnodes = 0; 1527 critical_exit(); 1528 } 1529 1530 static __inline void 1531 vfs_freevnodes_inc(void) 1532 { 1533 int8_t *lfreevnodes; 1534 1535 critical_enter(); 1536 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1537 (*lfreevnodes)++; 1538 if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP)) 1539 vfs_freevnodes_rollup(lfreevnodes); 1540 else 1541 critical_exit(); 1542 } 1543 1544 static __inline void 1545 vfs_freevnodes_dec(void) 1546 { 1547 int8_t *lfreevnodes; 1548 1549 critical_enter(); 1550 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1551 (*lfreevnodes)--; 1552 if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP)) 1553 vfs_freevnodes_rollup(lfreevnodes); 1554 else 1555 critical_exit(); 1556 } 1557 1558 static u_long 1559 vnlru_read_freevnodes(void) 1560 { 1561 long slop, rfreevnodes, rfreevnodes_old; 1562 int cpu; 1563 1564 rfreevnodes = atomic_load_long(&freevnodes); 1565 rfreevnodes_old = atomic_load_long(&freevnodes_old); 1566 1567 if (rfreevnodes > rfreevnodes_old) 1568 slop = rfreevnodes - rfreevnodes_old; 1569 else 1570 slop = rfreevnodes_old - rfreevnodes; 1571 if (slop < VNLRU_FREEVNODES_SLOP) 1572 return (rfreevnodes >= 0 ? rfreevnodes : 0); 1573 CPU_FOREACH(cpu) { 1574 rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes; 1575 } 1576 atomic_store_long(&freevnodes_old, rfreevnodes); 1577 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1578 } 1579 1580 static bool 1581 vnlru_under(u_long rnumvnodes, u_long limit) 1582 { 1583 u_long rfreevnodes, space; 1584 1585 if (__predict_false(rnumvnodes > desiredvnodes)) 1586 return (true); 1587 1588 space = desiredvnodes - rnumvnodes; 1589 if (space < limit) { 1590 rfreevnodes = vnlru_read_freevnodes(); 1591 if (rfreevnodes > wantfreevnodes) 1592 space += rfreevnodes - wantfreevnodes; 1593 } 1594 return (space < limit); 1595 } 1596 1597 static void 1598 vnlru_kick_locked(void) 1599 { 1600 1601 mtx_assert(&vnode_list_mtx, MA_OWNED); 1602 if (vnlruproc_sig == 0) { 1603 vnlruproc_sig = 1; 1604 vnlruproc_kicks++; 1605 wakeup(vnlruproc); 1606 } 1607 } 1608 1609 static void 1610 vnlru_kick_cond(void) 1611 { 1612 1613 if (vnlru_read_freevnodes() > wantfreevnodes) 1614 return; 1615 1616 if (vnlruproc_sig) 1617 return; 1618 mtx_lock(&vnode_list_mtx); 1619 vnlru_kick_locked(); 1620 mtx_unlock(&vnode_list_mtx); 1621 } 1622 1623 static void 1624 vnlru_proc_sleep(void) 1625 { 1626 1627 if (vnlruproc_sig) { 1628 vnlruproc_sig = 0; 1629 wakeup(&vnlruproc_sig); 1630 } 1631 msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); 1632 } 1633 1634 /* 1635 * A lighter version of the machinery below. 1636 * 1637 * Tries to reach goals only by recycling free vnodes and does not invoke 1638 * uma_reclaim(UMA_RECLAIM_DRAIN). 1639 * 1640 * This works around pathological behavior in vnlru in presence of tons of free 1641 * vnodes, but without having to rewrite the machinery at this time. Said 1642 * behavior boils down to continuously trying to reclaim all kinds of vnodes 1643 * (cycling through all levels of "force") when the count is transiently above 1644 * limit. This happens a lot when all vnodes are used up and vn_alloc 1645 * speculatively increments the counter. 
1646 * 1647 * Sample testcase: vnode limit 8388608, 20 separate directory trees each with 1648 * 1 million files in total and 20 find(1) processes stating them in parallel 1649 * (one per each tree). 1650 * 1651 * On a kernel with only stock machinery this needs anywhere between 60 and 120 1652 * seconds to execute (time varies *wildly* between runs). With the workaround 1653 * it consistently stays around 20 seconds [it got further down with later 1654 * changes]. 1655 * 1656 * That is to say the entire thing needs a fundamental redesign (most notably 1657 * to accommodate faster recycling), the above only tries to get it ouf the way. 1658 * 1659 * Return values are: 1660 * -1 -- fallback to regular vnlru loop 1661 * 0 -- do nothing, go to sleep 1662 * >0 -- recycle this many vnodes 1663 */ 1664 static long 1665 vnlru_proc_light_pick(void) 1666 { 1667 u_long rnumvnodes, rfreevnodes; 1668 1669 if (vstir || vnlruproc_sig == 1) 1670 return (-1); 1671 1672 rnumvnodes = atomic_load_long(&numvnodes); 1673 rfreevnodes = vnlru_read_freevnodes(); 1674 1675 /* 1676 * vnode limit might have changed and now we may be at a significant 1677 * excess. Bail if we can't sort it out with free vnodes. 1678 * 1679 * Due to atomic updates the count can legitimately go above 1680 * the limit for a short period, don't bother doing anything in 1681 * that case. 1682 */ 1683 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) { 1684 if (rnumvnodes - rfreevnodes >= desiredvnodes || 1685 rfreevnodes <= wantfreevnodes) { 1686 return (-1); 1687 } 1688 1689 return (rnumvnodes - desiredvnodes); 1690 } 1691 1692 /* 1693 * Don't try to reach wantfreevnodes target if there are too few vnodes 1694 * to begin with. 1695 */ 1696 if (rnumvnodes < wantfreevnodes) { 1697 return (0); 1698 } 1699 1700 if (rfreevnodes < wantfreevnodes) { 1701 return (-1); 1702 } 1703 1704 return (0); 1705 } 1706 1707 static bool 1708 vnlru_proc_light(void) 1709 { 1710 long freecount; 1711 1712 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1713 1714 freecount = vnlru_proc_light_pick(); 1715 if (freecount == -1) 1716 return (false); 1717 1718 if (freecount != 0) { 1719 vnlru_free_vnlru(freecount); 1720 } 1721 1722 mtx_lock(&vnode_list_mtx); 1723 vnlru_proc_sleep(); 1724 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1725 return (true); 1726 } 1727 1728 static u_long uma_reclaim_calls; 1729 SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS, 1730 &uma_reclaim_calls, 0, "Number of calls to uma_reclaim"); 1731 1732 static void 1733 vnlru_proc(void) 1734 { 1735 u_long rnumvnodes, rfreevnodes, target; 1736 unsigned long onumvnodes; 1737 int done, force, trigger, usevnodes; 1738 bool reclaim_nc_src, want_reread; 1739 1740 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1741 SHUTDOWN_PRI_FIRST); 1742 1743 force = 0; 1744 want_reread = false; 1745 for (;;) { 1746 kproc_suspend_check(vnlruproc); 1747 1748 if (force == 0 && vnlru_proc_light()) 1749 continue; 1750 1751 mtx_lock(&vnode_list_mtx); 1752 rnumvnodes = atomic_load_long(&numvnodes); 1753 1754 if (want_reread) { 1755 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1756 want_reread = false; 1757 } 1758 1759 /* 1760 * If numvnodes is too large (due to desiredvnodes being 1761 * adjusted using its sysctl, or emergency growth), first 1762 * try to reduce it by discarding free vnodes. 
1763 */ 1764 if (rnumvnodes > desiredvnodes + 10) { 1765 vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); 1766 mtx_lock(&vnode_list_mtx); 1767 rnumvnodes = atomic_load_long(&numvnodes); 1768 } 1769 /* 1770 * Sleep if the vnode cache is in a good state. This is 1771 * when it is not over-full and has space for about a 4% 1772 * or 9% expansion (by growing its size or inexcessively 1773 * reducing free vnode count). Otherwise, try to reclaim 1774 * space for a 10% expansion. 1775 */ 1776 if (vstir && force == 0) { 1777 force = 1; 1778 vstir = false; 1779 } 1780 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1781 vnlru_proc_sleep(); 1782 continue; 1783 } 1784 rfreevnodes = vnlru_read_freevnodes(); 1785 1786 onumvnodes = rnumvnodes; 1787 /* 1788 * Calculate parameters for recycling. These are the same 1789 * throughout the loop to give some semblance of fairness. 1790 * The trigger point is to avoid recycling vnodes with lots 1791 * of resident pages. We aren't trying to free memory; we 1792 * are trying to recycle or at least free vnodes. 1793 */ 1794 if (rnumvnodes <= desiredvnodes) 1795 usevnodes = rnumvnodes - rfreevnodes; 1796 else 1797 usevnodes = rnumvnodes; 1798 if (usevnodes <= 0) 1799 usevnodes = 1; 1800 /* 1801 * The trigger value is chosen to give a conservatively 1802 * large value to ensure that it alone doesn't prevent 1803 * making progress. The value can easily be so large that 1804 * it is effectively infinite in some congested and 1805 * misconfigured cases, and this is necessary. Normally 1806 * it is about 8 to 100 (pages), which is quite large. 1807 */ 1808 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1809 if (force < 2) 1810 trigger = vsmalltrigger; 1811 reclaim_nc_src = force >= 3; 1812 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1813 target = target / 10 + 1; 1814 done = vlrureclaim(reclaim_nc_src, trigger, target); 1815 mtx_unlock(&vnode_list_mtx); 1816 /* 1817 * Total number of vnodes can transiently go slightly above the 1818 * limit (see vn_alloc_hard); no need to call uma_reclaim if 1819 * this happens. 1820 */ 1821 if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && 1822 numvnodes <= desiredvnodes) { 1823 uma_reclaim_calls++; 1824 uma_reclaim(UMA_RECLAIM_DRAIN); 1825 } 1826 if (done == 0) { 1827 if (force == 0 || force == 1) { 1828 force = 2; 1829 continue; 1830 } 1831 if (force == 2) { 1832 force = 3; 1833 continue; 1834 } 1835 want_reread = true; 1836 force = 0; 1837 vnlru_nowhere++; 1838 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1839 } else { 1840 want_reread = true; 1841 kern_yield(PRI_USER); 1842 } 1843 } 1844 } 1845 1846 static struct kproc_desc vnlru_kp = { 1847 "vnlru", 1848 vnlru_proc, 1849 &vnlruproc 1850 }; 1851 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1852 &vnlru_kp); 1853 1854 /* 1855 * Routines having to do with the management of the vnode table. 1856 */ 1857 1858 /* 1859 * Try to recycle a freed vnode. 1860 */ 1861 static int 1862 vtryrecycle(struct vnode *vp, bool isvnlru) 1863 { 1864 struct mount *vnmp; 1865 1866 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1867 VNPASS(vp->v_holdcnt > 0, vp); 1868 /* 1869 * This vnode may be found and locked via some other list; if so, we 1870 * can't recycle it yet.
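* Hence the LK_NOWAIT trylock below; if it fails we return EWOULDBLOCK so the caller can move on to another candidate.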
1871 */ 1872 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1873 CTR2(KTR_VFS, 1874 "%s: impossible to recycle, vp %p lock is already held", 1875 __func__, vp); 1876 vdrop_recycle(vp); 1877 return (EWOULDBLOCK); 1878 } 1879 /* 1880 * Don't recycle if its filesystem is being suspended. 1881 */ 1882 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1883 VOP_UNLOCK(vp); 1884 CTR2(KTR_VFS, 1885 "%s: impossible to recycle, cannot start the write for %p", 1886 __func__, vp); 1887 vdrop_recycle(vp); 1888 return (EBUSY); 1889 } 1890 /* 1891 * If we got this far, we need to acquire the interlock and see if 1892 * anyone picked up this vnode from another list. If not, we will 1893 * mark it with DOOMED via vgonel() so that anyone who does find it 1894 * will skip over it. 1895 */ 1896 VI_LOCK(vp); 1897 if (vp->v_usecount) { 1898 VOP_UNLOCK(vp); 1899 vdropl_recycle(vp); 1900 vn_finished_write(vnmp); 1901 CTR2(KTR_VFS, 1902 "%s: impossible to recycle, %p is already referenced", 1903 __func__, vp); 1904 return (EBUSY); 1905 } 1906 if (!VN_IS_DOOMED(vp)) { 1907 if (isvnlru) 1908 recycles_free_count++; 1909 else 1910 counter_u64_add(direct_recycles_free_count, 1); 1911 vgonel(vp); 1912 } 1913 VOP_UNLOCK(vp); 1914 vdropl_recycle(vp); 1915 vn_finished_write(vnmp); 1916 return (0); 1917 } 1918 1919 /* 1920 * Allocate a new vnode. 1921 * 1922 * The operation never returns an error. Returning an error was disabled 1923 * in r145385 (dated 2005) with the following comment: 1924 * 1925 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1926 * 1927 * Given the age of this commit (almost 15 years at the time of writing this 1928 * comment) restoring the ability to fail requires a significant audit of 1929 * all codepaths. 1930 * 1931 * The routine can try to free a vnode or stall for up to 1 second waiting for 1932 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1933 */ 1934 static u_long vn_alloc_cyclecount; 1935 static u_long vn_alloc_sleeps; 1936 1937 SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, 1938 "Number of times vnode allocation blocked waiting on vnlru"); 1939 1940 static struct vnode * __noinline 1941 vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) 1942 { 1943 u_long rfreevnodes; 1944 1945 if (bumped) { 1946 if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { 1947 atomic_subtract_long(&numvnodes, 1); 1948 bumped = false; 1949 } 1950 } 1951 1952 mtx_lock(&vnode_list_mtx); 1953 1954 if (vn_alloc_cyclecount != 0) { 1955 rnumvnodes = atomic_load_long(&numvnodes); 1956 if (rnumvnodes + 1 < desiredvnodes) { 1957 vn_alloc_cyclecount = 0; 1958 mtx_unlock(&vnode_list_mtx); 1959 goto alloc; 1960 } 1961 1962 rfreevnodes = vnlru_read_freevnodes(); 1963 if (rfreevnodes < wantfreevnodes) { 1964 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1965 vn_alloc_cyclecount = 0; 1966 vstir = true; 1967 } 1968 } else { 1969 vn_alloc_cyclecount = 0; 1970 } 1971 } 1972 1973 /* 1974 * Grow the vnode cache if it will not be above its target max after 1975 * growing. Otherwise, if there is at least one free vnode, try to 1976 * reclaim 1 item from it before growing the cache (possibly above its 1977 * target max if the reclamation failed or is delayed). 1978 */ 1979 if (vnlru_free_locked_direct(1) > 0) 1980 goto alloc; 1981 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 1982 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1983 /* 1984 * Wait for space for a new vnode. 
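* Drop our speculative numvnodes bump first so the count stays accurate while we sleep waiting for vnlru.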
1985 */ 1986 if (bumped) { 1987 atomic_subtract_long(&numvnodes, 1); 1988 bumped = false; 1989 } 1990 mtx_lock(&vnode_list_mtx); 1991 vnlru_kick_locked(); 1992 vn_alloc_sleeps++; 1993 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 1994 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 1995 vnlru_read_freevnodes() > 1) 1996 vnlru_free_locked_direct(1); 1997 else 1998 mtx_unlock(&vnode_list_mtx); 1999 } 2000 alloc: 2001 mtx_assert(&vnode_list_mtx, MA_NOTOWNED); 2002 if (!bumped) 2003 atomic_add_long(&numvnodes, 1); 2004 vnlru_kick_cond(); 2005 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2006 } 2007 2008 static struct vnode * 2009 vn_alloc(struct mount *mp) 2010 { 2011 u_long rnumvnodes; 2012 2013 if (__predict_false(vn_alloc_cyclecount != 0)) 2014 return (vn_alloc_hard(mp, 0, false)); 2015 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 2016 if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { 2017 return (vn_alloc_hard(mp, rnumvnodes, true)); 2018 } 2019 2020 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 2021 } 2022 2023 static void 2024 vn_free(struct vnode *vp) 2025 { 2026 2027 atomic_subtract_long(&numvnodes, 1); 2028 uma_zfree_smr(vnode_zone, vp); 2029 } 2030 2031 /* 2032 * Allocate a new vnode. 2033 */ 2034 int 2035 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 2036 struct vnode **vpp) 2037 { 2038 struct vnode *vp; 2039 struct thread *td; 2040 struct lock_object *lo; 2041 2042 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 2043 2044 KASSERT(vops->registered, 2045 ("%s: not registered vector op %p\n", __func__, vops)); 2046 cache_validate_vop_vector(mp, vops); 2047 2048 td = curthread; 2049 if (td->td_vp_reserved != NULL) { 2050 vp = td->td_vp_reserved; 2051 td->td_vp_reserved = NULL; 2052 } else { 2053 vp = vn_alloc(mp); 2054 } 2055 counter_u64_add(vnodes_created, 1); 2056 2057 vn_set_state(vp, VSTATE_UNINITIALIZED); 2058 2059 /* 2060 * Locks are given the generic name "vnode" when created. 2061 * Follow the historic practice of using the filesystem 2062 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc. 2063 * 2064 * Locks live in a witness group keyed on their name. Thus, 2065 * when a lock is renamed, it must also move from the witness 2066 * group of its old name to the witness group of its new name. 2067 * 2068 * The change only needs to be made when the vnode moves 2069 * from one filesystem type to another. We ensure that each 2070 * filesystem uses a single static name pointer for its tag so 2071 * that we can compare pointers rather than doing a strcmp(). 2072 */ 2073 lo = &vp->v_vnlock->lock_object; 2074 #ifdef WITNESS 2075 if (lo->lo_name != tag) { 2076 #endif 2077 lo->lo_name = tag; 2078 #ifdef WITNESS 2079 WITNESS_DESTROY(lo); 2080 WITNESS_INIT(lo, tag); 2081 } 2082 #endif 2083 /* 2084 * By default, don't allow shared locks unless filesystems opt in. 2085 */ 2086 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 2087 /* 2088 * Finalize various vnode identity bits.
2089 */ 2090 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 2091 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 2092 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 2093 vp->v_type = VNON; 2094 vp->v_op = vops; 2095 vp->v_irflag = 0; 2096 v_init_counters(vp); 2097 vn_seqc_init(vp); 2098 vp->v_bufobj.bo_ops = &buf_ops_bio; 2099 #ifdef DIAGNOSTIC 2100 if (mp == NULL && vops != &dead_vnodeops) 2101 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 2102 #endif 2103 #ifdef MAC 2104 mac_vnode_init(vp); 2105 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 2106 mac_vnode_associate_singlelabel(mp, vp); 2107 #endif 2108 if (mp != NULL) { 2109 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 2110 } 2111 2112 /* 2113 * For the filesystems which do not use vfs_hash_insert(), 2114 * still initialize v_hash to have vfs_hash_index() useful. 2115 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 2116 * its own hashing. 2117 */ 2118 vp->v_hash = (uintptr_t)vp >> vnsz2log; 2119 2120 *vpp = vp; 2121 return (0); 2122 } 2123 2124 void 2125 getnewvnode_reserve(void) 2126 { 2127 struct thread *td; 2128 2129 td = curthread; 2130 MPASS(td->td_vp_reserved == NULL); 2131 td->td_vp_reserved = vn_alloc(NULL); 2132 } 2133 2134 void 2135 getnewvnode_drop_reserve(void) 2136 { 2137 struct thread *td; 2138 2139 td = curthread; 2140 if (td->td_vp_reserved != NULL) { 2141 vn_free(td->td_vp_reserved); 2142 td->td_vp_reserved = NULL; 2143 } 2144 } 2145 2146 static void __noinline 2147 freevnode(struct vnode *vp) 2148 { 2149 struct bufobj *bo; 2150 2151 /* 2152 * The vnode has been marked for destruction, so free it. 2153 * 2154 * The vnode will be returned to the zone where it will 2155 * normally remain until it is needed for another vnode. We 2156 * need to cleanup (or verify that the cleanup has already 2157 * been done) any residual data left from its current use 2158 * so as not to contaminate the freshly allocated vnode. 2159 */ 2160 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2161 /* 2162 * Paired with vgone. 2163 */ 2164 vn_seqc_write_end_free(vp); 2165 2166 bo = &vp->v_bufobj; 2167 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2168 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 2169 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2170 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2171 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2172 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2173 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 2174 ("clean blk trie not empty")); 2175 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2176 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 2177 ("dirty blk trie not empty")); 2178 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 2179 ("Dangling rangelock waiters")); 2180 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 2181 ("Leaked inactivation")); 2182 VI_UNLOCK(vp); 2183 cache_assert_no_entries(vp); 2184 2185 #ifdef MAC 2186 mac_vnode_destroy(vp); 2187 #endif 2188 if (vp->v_pollinfo != NULL) { 2189 /* 2190 * Use LK_NOWAIT to shut up witness about the lock. We may get 2191 * here while having another vnode locked when trying to 2192 * satisfy a lookup and needing to recycle. 
2193 */ 2194 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 2195 destroy_vpollinfo(vp->v_pollinfo); 2196 VOP_UNLOCK(vp); 2197 vp->v_pollinfo = NULL; 2198 } 2199 vp->v_mountedhere = NULL; 2200 vp->v_unpcb = NULL; 2201 vp->v_rdev = NULL; 2202 vp->v_fifoinfo = NULL; 2203 vp->v_iflag = 0; 2204 vp->v_vflag = 0; 2205 bo->bo_flag = 0; 2206 vn_free(vp); 2207 } 2208 2209 /* 2210 * Delete from old mount point vnode list, if on one. 2211 */ 2212 static void 2213 delmntque(struct vnode *vp) 2214 { 2215 struct mount *mp; 2216 2217 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 2218 2219 mp = vp->v_mount; 2220 MNT_ILOCK(mp); 2221 VI_LOCK(vp); 2222 vp->v_mount = NULL; 2223 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 2224 ("bad mount point vnode list size")); 2225 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2226 mp->mnt_nvnodelistsize--; 2227 MNT_REL(mp); 2228 MNT_IUNLOCK(mp); 2229 /* 2230 * The caller expects the interlock to be still held. 2231 */ 2232 ASSERT_VI_LOCKED(vp, __func__); 2233 } 2234 2235 static int 2236 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 2237 { 2238 2239 KASSERT(vp->v_mount == NULL, 2240 ("insmntque: vnode already on per mount vnode list")); 2241 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 2242 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 2243 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 2244 } else { 2245 KASSERT(!dtr, 2246 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 2247 __func__)); 2248 } 2249 2250 /* 2251 * We acquire the vnode interlock early to ensure that the 2252 * vnode cannot be recycled by another process releasing a 2253 * holdcnt on it before we get it on both the vnode list 2254 * and the active vnode list. The mount mutex protects only 2255 * manipulation of the vnode list and the vnode freelist 2256 * mutex protects only manipulation of the active vnode list. 2257 * Hence the need to hold the vnode interlock throughout. 2258 */ 2259 MNT_ILOCK(mp); 2260 VI_LOCK(vp); 2261 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2262 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2263 mp->mnt_nvnodelistsize == 0)) && 2264 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2265 VI_UNLOCK(vp); 2266 MNT_IUNLOCK(mp); 2267 if (dtr) { 2268 vp->v_data = NULL; 2269 vp->v_op = &dead_vnodeops; 2270 vgone(vp); 2271 vput(vp); 2272 } 2273 return (EBUSY); 2274 } 2275 vp->v_mount = mp; 2276 MNT_REF(mp); 2277 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2278 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2279 ("neg mount point vnode list size")); 2280 mp->mnt_nvnodelistsize++; 2281 VI_UNLOCK(vp); 2282 MNT_IUNLOCK(mp); 2283 return (0); 2284 } 2285 2286 /* 2287 * Insert into list of vnodes for the new mount point, if available. 2288 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2289 * leaves handling of the vnode to the caller. 2290 */ 2291 int 2292 insmntque(struct vnode *vp, struct mount *mp) 2293 { 2294 return (insmntque1_int(vp, mp, true)); 2295 } 2296 2297 int 2298 insmntque1(struct vnode *vp, struct mount *mp) 2299 { 2300 return (insmntque1_int(vp, mp, false)); 2301 } 2302 2303 /* 2304 * Flush out and invalidate all buffers associated with a bufobj 2305 * Called with the underlying object locked. 
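* Returns 0 on success or an errno value; in particular EBUSY if a V_SAVE sync still leaves dirty buffers or pending writes behind.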
2306 */ 2307 int 2308 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2309 { 2310 int error; 2311 2312 BO_LOCK(bo); 2313 if (flags & V_SAVE) { 2314 error = bufobj_wwait(bo, slpflag, slptimeo); 2315 if (error) { 2316 BO_UNLOCK(bo); 2317 return (error); 2318 } 2319 if (bo->bo_dirty.bv_cnt > 0) { 2320 BO_UNLOCK(bo); 2321 do { 2322 error = BO_SYNC(bo, MNT_WAIT); 2323 } while (error == ERELOOKUP); 2324 if (error != 0) 2325 return (error); 2326 BO_LOCK(bo); 2327 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2328 BO_UNLOCK(bo); 2329 return (EBUSY); 2330 } 2331 } 2332 } 2333 /* 2334 * If you alter this loop please notice that interlock is dropped and 2335 * reacquired in flushbuflist. Special care is needed to ensure that 2336 * no race conditions occur from this. 2337 */ 2338 do { 2339 error = flushbuflist(&bo->bo_clean, 2340 flags, bo, slpflag, slptimeo); 2341 if (error == 0 && !(flags & V_CLEANONLY)) 2342 error = flushbuflist(&bo->bo_dirty, 2343 flags, bo, slpflag, slptimeo); 2344 if (error != 0 && error != EAGAIN) { 2345 BO_UNLOCK(bo); 2346 return (error); 2347 } 2348 } while (error != 0); 2349 2350 /* 2351 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2352 * have write I/O in-progress but if there is a VM object then the 2353 * VM object can also have read-I/O in-progress. 2354 */ 2355 do { 2356 bufobj_wwait(bo, 0, 0); 2357 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2358 BO_UNLOCK(bo); 2359 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2360 BO_LOCK(bo); 2361 } 2362 } while (bo->bo_numoutput > 0); 2363 BO_UNLOCK(bo); 2364 2365 /* 2366 * Destroy the copy in the VM cache, too. 2367 */ 2368 if (bo->bo_object != NULL && 2369 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2370 VM_OBJECT_WLOCK(bo->bo_object); 2371 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2372 OBJPR_CLEANONLY : 0); 2373 VM_OBJECT_WUNLOCK(bo->bo_object); 2374 } 2375 2376 #ifdef INVARIANTS 2377 BO_LOCK(bo); 2378 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2379 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2380 bo->bo_clean.bv_cnt > 0)) 2381 panic("vinvalbuf: flush failed"); 2382 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2383 bo->bo_dirty.bv_cnt > 0) 2384 panic("vinvalbuf: flush dirty failed"); 2385 BO_UNLOCK(bo); 2386 #endif 2387 return (0); 2388 } 2389 2390 /* 2391 * Flush out and invalidate all buffers associated with a vnode. 2392 * Called with the underlying object locked. 2393 */ 2394 int 2395 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2396 { 2397 2398 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2399 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2400 if (vp->v_object != NULL && vp->v_object->handle != vp) 2401 return (0); 2402 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2403 } 2404 2405 /* 2406 * Flush out buffers on the specified list. 2407 * 2408 */ 2409 static int 2410 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2411 int slptimeo) 2412 { 2413 struct buf *bp, *nbp; 2414 int retval, error; 2415 daddr_t lblkno; 2416 b_xflags_t xflags; 2417 2418 ASSERT_BO_WLOCKED(bo); 2419 2420 retval = 0; 2421 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2422 /* 2423 * If we are flushing both V_NORMAL and V_ALT buffers then 2424 * do not skip any buffers. If we are flushing only V_NORMAL 2425 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2426 * flushing only V_ALT buffers then skip buffers not marked 2427 * as BX_ALTDATA. 2428 */ 2429 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2430 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2431 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2432 continue; 2433 } 2434 if (nbp != NULL) { 2435 lblkno = nbp->b_lblkno; 2436 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2437 } 2438 retval = EAGAIN; 2439 error = BUF_TIMELOCK(bp, 2440 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2441 "flushbuf", slpflag, slptimeo); 2442 if (error) { 2443 BO_LOCK(bo); 2444 return (error != ENOLCK ? error : EAGAIN); 2445 } 2446 KASSERT(bp->b_bufobj == bo, 2447 ("bp %p wrong b_bufobj %p should be %p", 2448 bp, bp->b_bufobj, bo)); 2449 /* 2450 * XXX Since there are no node locks for NFS, I 2451 * believe there is a slight chance that a delayed 2452 * write will occur while sleeping just above, so 2453 * check for it. 2454 */ 2455 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2456 (flags & V_SAVE)) { 2457 bremfree(bp); 2458 bp->b_flags |= B_ASYNC; 2459 bwrite(bp); 2460 BO_LOCK(bo); 2461 return (EAGAIN); /* XXX: why not loop ? */ 2462 } 2463 bremfree(bp); 2464 bp->b_flags |= (B_INVAL | B_RELBUF); 2465 bp->b_flags &= ~B_ASYNC; 2466 brelse(bp); 2467 BO_LOCK(bo); 2468 if (nbp == NULL) 2469 break; 2470 nbp = gbincore(bo, lblkno); 2471 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2472 != xflags) 2473 break; /* nbp invalid */ 2474 } 2475 return (retval); 2476 } 2477 2478 int 2479 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2480 { 2481 struct buf *bp; 2482 int error; 2483 daddr_t lblkno; 2484 2485 ASSERT_BO_LOCKED(bo); 2486 2487 for (lblkno = startn;;) { 2488 again: 2489 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2490 if (bp == NULL || bp->b_lblkno >= endn || 2491 bp->b_lblkno < startn) 2492 break; 2493 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2494 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2495 if (error != 0) { 2496 BO_RLOCK(bo); 2497 if (error == ENOLCK) 2498 goto again; 2499 return (error); 2500 } 2501 KASSERT(bp->b_bufobj == bo, 2502 ("bp %p wrong b_bufobj %p should be %p", 2503 bp, bp->b_bufobj, bo)); 2504 lblkno = bp->b_lblkno + 1; 2505 if ((bp->b_flags & B_MANAGED) == 0) 2506 bremfree(bp); 2507 bp->b_flags |= B_RELBUF; 2508 /* 2509 * In the VMIO case, use the B_NOREUSE flag to hint that the 2510 * pages backing each buffer in the range are unlikely to be 2511 * reused. Dirty buffers will have the hint applied once 2512 * they've been written. 2513 */ 2514 if ((bp->b_flags & B_VMIO) != 0) 2515 bp->b_flags |= B_NOREUSE; 2516 brelse(bp); 2517 BO_RLOCK(bo); 2518 } 2519 return (0); 2520 } 2521 2522 /* 2523 * Truncate a file's buffer and pages to a specified length. This 2524 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2525 * sync activity. 2526 */ 2527 int 2528 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2529 { 2530 struct buf *bp, *nbp; 2531 struct bufobj *bo; 2532 daddr_t startlbn; 2533 2534 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2535 vp, blksize, (uintmax_t)length); 2536 2537 /* 2538 * Round up to the *next* lbn. 
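* E.g., with a 32768 byte block size, truncating to length 1 keeps lbn 0 and starts invalidating at lbn 1, while truncating to length 0 gives startlbn 0 and drops every buffer (howmany() rounds up).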
2539 */ 2540 startlbn = howmany(length, blksize); 2541 2542 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2543 2544 bo = &vp->v_bufobj; 2545 restart_unlocked: 2546 BO_LOCK(bo); 2547 2548 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2549 ; 2550 2551 if (length > 0) { 2552 restartsync: 2553 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2554 if (bp->b_lblkno > 0) 2555 continue; 2556 /* 2557 * Since we hold the vnode lock this should only 2558 * fail if we're racing with the buf daemon. 2559 */ 2560 if (BUF_LOCK(bp, 2561 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2562 BO_LOCKPTR(bo)) == ENOLCK) 2563 goto restart_unlocked; 2564 2565 VNASSERT((bp->b_flags & B_DELWRI), vp, 2566 ("buf(%p) on dirty queue without DELWRI", bp)); 2567 2568 bremfree(bp); 2569 bawrite(bp); 2570 BO_LOCK(bo); 2571 goto restartsync; 2572 } 2573 } 2574 2575 bufobj_wwait(bo, 0, 0); 2576 BO_UNLOCK(bo); 2577 vnode_pager_setsize(vp, length); 2578 2579 return (0); 2580 } 2581 2582 /* 2583 * Invalidate the cached pages of a file's buffer within the range of block 2584 * numbers [startlbn, endlbn). 2585 */ 2586 void 2587 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2588 int blksize) 2589 { 2590 struct bufobj *bo; 2591 off_t start, end; 2592 2593 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2594 2595 start = blksize * startlbn; 2596 end = blksize * endlbn; 2597 2598 bo = &vp->v_bufobj; 2599 BO_LOCK(bo); 2600 MPASS(blksize == bo->bo_bsize); 2601 2602 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2603 ; 2604 2605 BO_UNLOCK(bo); 2606 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2607 } 2608 2609 static int 2610 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2611 daddr_t startlbn, daddr_t endlbn) 2612 { 2613 struct buf *bp, *nbp; 2614 bool anyfreed; 2615 2616 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2617 ASSERT_BO_LOCKED(bo); 2618 2619 do { 2620 anyfreed = false; 2621 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2622 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2623 continue; 2624 if (BUF_LOCK(bp, 2625 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2626 BO_LOCKPTR(bo)) == ENOLCK) { 2627 BO_LOCK(bo); 2628 return (EAGAIN); 2629 } 2630 2631 bremfree(bp); 2632 bp->b_flags |= B_INVAL | B_RELBUF; 2633 bp->b_flags &= ~B_ASYNC; 2634 brelse(bp); 2635 anyfreed = true; 2636 2637 BO_LOCK(bo); 2638 if (nbp != NULL && 2639 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2640 nbp->b_vp != vp || 2641 (nbp->b_flags & B_DELWRI) != 0)) 2642 return (EAGAIN); 2643 } 2644 2645 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2646 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2647 continue; 2648 if (BUF_LOCK(bp, 2649 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2650 BO_LOCKPTR(bo)) == ENOLCK) { 2651 BO_LOCK(bo); 2652 return (EAGAIN); 2653 } 2654 bremfree(bp); 2655 bp->b_flags |= B_INVAL | B_RELBUF; 2656 bp->b_flags &= ~B_ASYNC; 2657 brelse(bp); 2658 anyfreed = true; 2659 2660 BO_LOCK(bo); 2661 if (nbp != NULL && 2662 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2663 (nbp->b_vp != vp) || 2664 (nbp->b_flags & B_DELWRI) == 0)) 2665 return (EAGAIN); 2666 } 2667 } while (anyfreed); 2668 return (0); 2669 } 2670 2671 static void 2672 buf_vlist_remove(struct buf *bp) 2673 { 2674 struct bufv *bv; 2675 b_xflags_t flags; 2676 2677 flags = bp->b_xflags; 2678 2679 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2680 ASSERT_BO_WLOCKED(bp->b_bufobj); 2681 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 
2682 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2683 ("%s: buffer %p has invalid queue state", __func__, bp)); 2684 2685 if ((flags & BX_VNDIRTY) != 0) 2686 bv = &bp->b_bufobj->bo_dirty; 2687 else 2688 bv = &bp->b_bufobj->bo_clean; 2689 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2690 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2691 bv->bv_cnt--; 2692 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2693 } 2694 2695 /* 2696 * Add the buffer to the sorted clean or dirty block list. 2697 * 2698 * NOTE: xflags is passed as a constant, optimizing this inline function! 2699 */ 2700 static void 2701 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2702 { 2703 struct bufv *bv; 2704 struct buf *n; 2705 int error; 2706 2707 ASSERT_BO_WLOCKED(bo); 2708 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2709 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2710 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2711 ("dead bo %p", bo)); 2712 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2713 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2714 bp->b_xflags |= xflags; 2715 if (xflags & BX_VNDIRTY) 2716 bv = &bo->bo_dirty; 2717 else 2718 bv = &bo->bo_clean; 2719 2720 /* 2721 * Keep the list ordered. Optimize empty list insertion. Assume 2722 * we tend to grow at the tail so lookup_le should usually be cheaper 2723 * than _ge. 2724 */ 2725 if (bv->bv_cnt == 0 || 2726 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2727 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2728 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2729 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2730 else 2731 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2732 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2733 if (error) 2734 panic("buf_vlist_add: Preallocated nodes insufficient."); 2735 bv->bv_cnt++; 2736 } 2737 2738 /* 2739 * Look up a buffer using the buffer tries. 2740 */ 2741 struct buf * 2742 gbincore(struct bufobj *bo, daddr_t lblkno) 2743 { 2744 struct buf *bp; 2745 2746 ASSERT_BO_LOCKED(bo); 2747 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2748 if (bp != NULL) 2749 return (bp); 2750 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2751 } 2752 2753 /* 2754 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2755 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2756 * stability of the result. Like other lockless lookups, the found buf may 2757 * already be invalid by the time this function returns. 2758 */ 2759 struct buf * 2760 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2761 { 2762 struct buf *bp; 2763 2764 ASSERT_BO_UNLOCKED(bo); 2765 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2766 if (bp != NULL) 2767 return (bp); 2768 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2769 } 2770 2771 /* 2772 * Associate a buffer with a vnode. 2773 */ 2774 void 2775 bgetvp(struct vnode *vp, struct buf *bp) 2776 { 2777 struct bufobj *bo; 2778 2779 bo = &vp->v_bufobj; 2780 ASSERT_BO_WLOCKED(bo); 2781 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2782 2783 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2784 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2785 ("bgetvp: bp already attached! %p", bp)); 2786 2787 vhold(vp); 2788 bp->b_vp = vp; 2789 bp->b_bufobj = bo; 2790 /* 2791 * Insert onto list for new vnode. 
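* Buffers always start out on the clean list; reassignbuf() moves them to the dirty list once B_DELWRI is set.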
2792 */ 2793 buf_vlist_add(bp, bo, BX_VNCLEAN); 2794 } 2795 2796 /* 2797 * Disassociate a buffer from a vnode. 2798 */ 2799 void 2800 brelvp(struct buf *bp) 2801 { 2802 struct bufobj *bo; 2803 struct vnode *vp; 2804 2805 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2806 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2807 2808 /* 2809 * Delete from old vnode list, if on one. 2810 */ 2811 vp = bp->b_vp; /* XXX */ 2812 bo = bp->b_bufobj; 2813 BO_LOCK(bo); 2814 buf_vlist_remove(bp); 2815 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2816 bo->bo_flag &= ~BO_ONWORKLST; 2817 mtx_lock(&sync_mtx); 2818 LIST_REMOVE(bo, bo_synclist); 2819 syncer_worklist_len--; 2820 mtx_unlock(&sync_mtx); 2821 } 2822 bp->b_vp = NULL; 2823 bp->b_bufobj = NULL; 2824 BO_UNLOCK(bo); 2825 vdrop(vp); 2826 } 2827 2828 /* 2829 * Add an item to the syncer work queue. 2830 */ 2831 static void 2832 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2833 { 2834 int slot; 2835 2836 ASSERT_BO_WLOCKED(bo); 2837 2838 mtx_lock(&sync_mtx); 2839 if (bo->bo_flag & BO_ONWORKLST) 2840 LIST_REMOVE(bo, bo_synclist); 2841 else { 2842 bo->bo_flag |= BO_ONWORKLST; 2843 syncer_worklist_len++; 2844 } 2845 2846 if (delay > syncer_maxdelay - 2) 2847 delay = syncer_maxdelay - 2; 2848 slot = (syncer_delayno + delay) & syncer_mask; 2849 2850 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2851 mtx_unlock(&sync_mtx); 2852 } 2853 2854 static int 2855 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2856 { 2857 int error, len; 2858 2859 mtx_lock(&sync_mtx); 2860 len = syncer_worklist_len - sync_vnode_count; 2861 mtx_unlock(&sync_mtx); 2862 error = SYSCTL_OUT(req, &len, sizeof(len)); 2863 return (error); 2864 } 2865 2866 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2867 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2868 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2869 2870 static struct proc *updateproc; 2871 static void sched_sync(void); 2872 static struct kproc_desc up_kp = { 2873 "syncer", 2874 sched_sync, 2875 &updateproc 2876 }; 2877 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2878 2879 static int 2880 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2881 { 2882 struct vnode *vp; 2883 struct mount *mp; 2884 2885 *bo = LIST_FIRST(slp); 2886 if (*bo == NULL) 2887 return (0); 2888 vp = bo2vnode(*bo); 2889 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2890 return (1); 2891 /* 2892 * We use vhold in case the vnode does not 2893 * successfully sync. vhold prevents the vnode from 2894 * going away when we unlock the sync_mtx so that 2895 * we can acquire the vnode interlock. 2896 */ 2897 vholdl(vp); 2898 mtx_unlock(&sync_mtx); 2899 VI_UNLOCK(vp); 2900 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2901 vdrop(vp); 2902 mtx_lock(&sync_mtx); 2903 return (*bo == LIST_FIRST(slp)); 2904 } 2905 MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || 2906 (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, 2907 ("suspended mp syncing vp %p", vp)); 2908 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2909 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2910 VOP_UNLOCK(vp); 2911 vn_finished_write(mp); 2912 BO_LOCK(*bo); 2913 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2914 /* 2915 * Put us back on the worklist. The worklist 2916 * routine will remove us from our current 2917 * position and then add us back in at a later 2918 * position. 
2919 */ 2920 vn_syncer_add_to_worklist(*bo, syncdelay); 2921 } 2922 BO_UNLOCK(*bo); 2923 vdrop(vp); 2924 mtx_lock(&sync_mtx); 2925 return (0); 2926 } 2927 2928 static int first_printf = 1; 2929 2930 /* 2931 * System filesystem synchronizer daemon. 2932 */ 2933 static void 2934 sched_sync(void) 2935 { 2936 struct synclist *next, *slp; 2937 struct bufobj *bo; 2938 long starttime; 2939 struct thread *td = curthread; 2940 int last_work_seen; 2941 int net_worklist_len; 2942 int syncer_final_iter; 2943 int error; 2944 2945 last_work_seen = 0; 2946 syncer_final_iter = 0; 2947 syncer_state = SYNCER_RUNNING; 2948 starttime = time_uptime; 2949 td->td_pflags |= TDP_NORUNNINGBUF; 2950 2951 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2952 SHUTDOWN_PRI_LAST); 2953 2954 mtx_lock(&sync_mtx); 2955 for (;;) { 2956 if (syncer_state == SYNCER_FINAL_DELAY && 2957 syncer_final_iter == 0) { 2958 mtx_unlock(&sync_mtx); 2959 kproc_suspend_check(td->td_proc); 2960 mtx_lock(&sync_mtx); 2961 } 2962 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2963 if (syncer_state != SYNCER_RUNNING && 2964 starttime != time_uptime) { 2965 if (first_printf) { 2966 printf("\nSyncing disks, vnodes remaining... "); 2967 first_printf = 0; 2968 } 2969 printf("%d ", net_worklist_len); 2970 } 2971 starttime = time_uptime; 2972 2973 /* 2974 * Push files whose dirty time has expired. Be careful 2975 * of interrupt race on slp queue. 2976 * 2977 * Skip over empty worklist slots when shutting down. 2978 */ 2979 do { 2980 slp = &syncer_workitem_pending[syncer_delayno]; 2981 syncer_delayno += 1; 2982 if (syncer_delayno == syncer_maxdelay) 2983 syncer_delayno = 0; 2984 next = &syncer_workitem_pending[syncer_delayno]; 2985 /* 2986 * If the worklist has wrapped since 2987 * it was emptied of all but syncer vnodes, 2988 * switch to the FINAL_DELAY state and run 2989 * for one more second. 2990 */ 2991 if (syncer_state == SYNCER_SHUTTING_DOWN && 2992 net_worklist_len == 0 && 2993 last_work_seen == syncer_delayno) { 2994 syncer_state = SYNCER_FINAL_DELAY; 2995 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2996 } 2997 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2998 syncer_worklist_len > 0); 2999 3000 /* 3001 * Keep track of the last time there was anything 3002 * on the worklist other than syncer vnodes. 3003 * Return to the SHUTTING_DOWN state if any 3004 * new work appears. 3005 */ 3006 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 3007 last_work_seen = syncer_delayno; 3008 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 3009 syncer_state = SYNCER_SHUTTING_DOWN; 3010 while (!LIST_EMPTY(slp)) { 3011 error = sync_vnode(slp, &bo, td); 3012 if (error == 1) { 3013 LIST_REMOVE(bo, bo_synclist); 3014 LIST_INSERT_HEAD(next, bo, bo_synclist); 3015 continue; 3016 } 3017 3018 if (first_printf == 0) { 3019 /* 3020 * Drop the sync mutex, because some watchdog 3021 * drivers need to sleep while patting the watchdog. 3022 */ 3023 mtx_unlock(&sync_mtx); 3024 wdog_kern_pat(WD_LASTVAL); 3025 mtx_lock(&sync_mtx); 3026 } 3027 } 3028 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 3029 syncer_final_iter--; 3030 /* 3031 * The variable rushjob allows the kernel to speed up the 3032 * processing of the filesystem syncer process. A rushjob 3033 * value of N tells the filesystem syncer to process the next 3034 * N seconds worth of work on its queue ASAP.
Currently rushjob 3035 * is used by the soft update code to speed up the filesystem 3036 * syncer process when the incore state is getting so far 3037 * ahead of the disk that the kernel memory pool is being 3038 * threatened with exhaustion. 3039 */ 3040 if (rushjob > 0) { 3041 rushjob -= 1; 3042 continue; 3043 } 3044 /* 3045 * Just sleep for a short period of time between 3046 * iterations when shutting down to allow some I/O 3047 * to happen. 3048 * 3049 * If it has taken us less than a second to process the 3050 * current work, then wait. Otherwise start right over 3051 * again. We can still lose time if any single round 3052 * takes more than two seconds, but it does not really 3053 * matter as we are just trying to generally pace the 3054 * filesystem activity. 3055 */ 3056 if (syncer_state != SYNCER_RUNNING || 3057 time_uptime == starttime) { 3058 thread_lock(td); 3059 sched_prio(td, PPAUSE); 3060 thread_unlock(td); 3061 } 3062 if (syncer_state != SYNCER_RUNNING) 3063 cv_timedwait(&sync_wakeup, &sync_mtx, 3064 hz / SYNCER_SHUTDOWN_SPEEDUP); 3065 else if (time_uptime == starttime) 3066 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 3067 } 3068 } 3069 3070 /* 3071 * Request the syncer daemon to speed up its work. 3072 * We never push it to speed up more than half of its 3073 * normal turn time, otherwise it could take over the cpu. 3074 */ 3075 int 3076 speedup_syncer(void) 3077 { 3078 int ret = 0; 3079 3080 mtx_lock(&sync_mtx); 3081 if (rushjob < syncdelay / 2) { 3082 rushjob += 1; 3083 stat_rush_requests += 1; 3084 ret = 1; 3085 } 3086 mtx_unlock(&sync_mtx); 3087 cv_broadcast(&sync_wakeup); 3088 return (ret); 3089 } 3090 3091 /* 3092 * Tell the syncer to speed up its work and run though its work 3093 * list several times, then tell it to shut down. 3094 */ 3095 static void 3096 syncer_shutdown(void *arg, int howto) 3097 { 3098 3099 if (howto & RB_NOSYNC) 3100 return; 3101 mtx_lock(&sync_mtx); 3102 syncer_state = SYNCER_SHUTTING_DOWN; 3103 rushjob = 0; 3104 mtx_unlock(&sync_mtx); 3105 cv_broadcast(&sync_wakeup); 3106 kproc_shutdown(arg, howto); 3107 } 3108 3109 void 3110 syncer_suspend(void) 3111 { 3112 3113 syncer_shutdown(updateproc, 0); 3114 } 3115 3116 void 3117 syncer_resume(void) 3118 { 3119 3120 mtx_lock(&sync_mtx); 3121 first_printf = 1; 3122 syncer_state = SYNCER_RUNNING; 3123 mtx_unlock(&sync_mtx); 3124 cv_broadcast(&sync_wakeup); 3125 kproc_resume(updateproc); 3126 } 3127 3128 /* 3129 * Move the buffer between the clean and dirty lists of its vnode. 3130 */ 3131 void 3132 reassignbuf(struct buf *bp) 3133 { 3134 struct vnode *vp; 3135 struct bufobj *bo; 3136 int delay; 3137 #ifdef INVARIANTS 3138 struct bufv *bv; 3139 #endif 3140 3141 vp = bp->b_vp; 3142 bo = bp->b_bufobj; 3143 3144 KASSERT((bp->b_flags & B_PAGING) == 0, 3145 ("%s: cannot reassign paging buffer %p", __func__, bp)); 3146 3147 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 3148 bp, bp->b_vp, bp->b_flags); 3149 3150 BO_LOCK(bo); 3151 buf_vlist_remove(bp); 3152 3153 /* 3154 * If dirty, put on list of dirty buffers; otherwise insert onto list 3155 * of clean buffers. 
3156 */ 3157 if (bp->b_flags & B_DELWRI) { 3158 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 3159 switch (vp->v_type) { 3160 case VDIR: 3161 delay = dirdelay; 3162 break; 3163 case VCHR: 3164 delay = metadelay; 3165 break; 3166 default: 3167 delay = filedelay; 3168 } 3169 vn_syncer_add_to_worklist(bo, delay); 3170 } 3171 buf_vlist_add(bp, bo, BX_VNDIRTY); 3172 } else { 3173 buf_vlist_add(bp, bo, BX_VNCLEAN); 3174 3175 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 3176 mtx_lock(&sync_mtx); 3177 LIST_REMOVE(bo, bo_synclist); 3178 syncer_worklist_len--; 3179 mtx_unlock(&sync_mtx); 3180 bo->bo_flag &= ~BO_ONWORKLST; 3181 } 3182 } 3183 #ifdef INVARIANTS 3184 bv = &bo->bo_clean; 3185 bp = TAILQ_FIRST(&bv->bv_hd); 3186 KASSERT(bp == NULL || bp->b_bufobj == bo, 3187 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3188 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3189 KASSERT(bp == NULL || bp->b_bufobj == bo, 3190 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3191 bv = &bo->bo_dirty; 3192 bp = TAILQ_FIRST(&bv->bv_hd); 3193 KASSERT(bp == NULL || bp->b_bufobj == bo, 3194 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3195 bp = TAILQ_LAST(&bv->bv_hd, buflists); 3196 KASSERT(bp == NULL || bp->b_bufobj == bo, 3197 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 3198 #endif 3199 BO_UNLOCK(bo); 3200 } 3201 3202 static void 3203 v_init_counters(struct vnode *vp) 3204 { 3205 3206 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 3207 vp, ("%s called for an initialized vnode", __FUNCTION__)); 3208 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 3209 3210 refcount_init(&vp->v_holdcnt, 1); 3211 refcount_init(&vp->v_usecount, 1); 3212 } 3213 3214 /* 3215 * Get a usecount on a vnode. 3216 * 3217 * vget and vget_finish may fail to lock the vnode if they lose a race against 3218 * it being doomed. LK_RETRY can be passed in flags to lock it anyway. 3219 * 3220 * Consumers which don't guarantee liveness of the vnode can use SMR to 3221 * try to get a reference. Note this operation can fail since the vnode 3222 * may be awaiting getting freed by the time they get to it. 
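* vget_prep_smr() reports such a failure by returning VGET_NONE, which the caller must check for before calling vget_finish().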
3223 */ 3224 enum vgetstate 3225 vget_prep_smr(struct vnode *vp) 3226 { 3227 enum vgetstate vs; 3228 3229 VFS_SMR_ASSERT_ENTERED(); 3230 3231 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3232 vs = VGET_USECOUNT; 3233 } else { 3234 if (vhold_smr(vp)) 3235 vs = VGET_HOLDCNT; 3236 else 3237 vs = VGET_NONE; 3238 } 3239 return (vs); 3240 } 3241 3242 enum vgetstate 3243 vget_prep(struct vnode *vp) 3244 { 3245 enum vgetstate vs; 3246 3247 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3248 vs = VGET_USECOUNT; 3249 } else { 3250 vhold(vp); 3251 vs = VGET_HOLDCNT; 3252 } 3253 return (vs); 3254 } 3255 3256 void 3257 vget_abort(struct vnode *vp, enum vgetstate vs) 3258 { 3259 3260 switch (vs) { 3261 case VGET_USECOUNT: 3262 vrele(vp); 3263 break; 3264 case VGET_HOLDCNT: 3265 vdrop(vp); 3266 break; 3267 default: 3268 __assert_unreachable(); 3269 } 3270 } 3271 3272 int 3273 vget(struct vnode *vp, int flags) 3274 { 3275 enum vgetstate vs; 3276 3277 vs = vget_prep(vp); 3278 return (vget_finish(vp, flags, vs)); 3279 } 3280 3281 int 3282 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3283 { 3284 int error; 3285 3286 if ((flags & LK_INTERLOCK) != 0) 3287 ASSERT_VI_LOCKED(vp, __func__); 3288 else 3289 ASSERT_VI_UNLOCKED(vp, __func__); 3290 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3291 VNPASS(vp->v_holdcnt > 0, vp); 3292 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3293 3294 error = vn_lock(vp, flags); 3295 if (__predict_false(error != 0)) { 3296 vget_abort(vp, vs); 3297 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3298 vp); 3299 return (error); 3300 } 3301 3302 vget_finish_ref(vp, vs); 3303 return (0); 3304 } 3305 3306 void 3307 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3308 { 3309 int old; 3310 3311 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3312 VNPASS(vp->v_holdcnt > 0, vp); 3313 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3314 3315 if (vs == VGET_USECOUNT) 3316 return; 3317 3318 /* 3319 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3320 * the vnode around. Otherwise someone else lended their hold count and 3321 * we have to drop ours. 3322 */ 3323 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3324 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3325 if (old != 0) { 3326 #ifdef INVARIANTS 3327 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3328 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3329 #else 3330 refcount_release(&vp->v_holdcnt); 3331 #endif 3332 } 3333 } 3334 3335 void 3336 vref(struct vnode *vp) 3337 { 3338 enum vgetstate vs; 3339 3340 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3341 vs = vget_prep(vp); 3342 vget_finish_ref(vp, vs); 3343 } 3344 3345 void 3346 vrefact(struct vnode *vp) 3347 { 3348 int old __diagused; 3349 3350 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3351 old = refcount_acquire(&vp->v_usecount); 3352 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3353 } 3354 3355 void 3356 vlazy(struct vnode *vp) 3357 { 3358 struct mount *mp; 3359 3360 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3361 3362 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3363 return; 3364 /* 3365 * We may get here for inactive routines after the vnode got doomed. 
3366 */ 3367 if (VN_IS_DOOMED(vp)) 3368 return; 3369 mp = vp->v_mount; 3370 mtx_lock(&mp->mnt_listmtx); 3371 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3372 vp->v_mflag |= VMP_LAZYLIST; 3373 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3374 mp->mnt_lazyvnodelistsize++; 3375 } 3376 mtx_unlock(&mp->mnt_listmtx); 3377 } 3378 3379 static void 3380 vunlazy(struct vnode *vp) 3381 { 3382 struct mount *mp; 3383 3384 ASSERT_VI_LOCKED(vp, __func__); 3385 VNPASS(!VN_IS_DOOMED(vp), vp); 3386 3387 mp = vp->v_mount; 3388 mtx_lock(&mp->mnt_listmtx); 3389 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3390 /* 3391 * Don't remove the vnode from the lazy list if another thread 3392 * has increased the hold count. It may have re-enqueued the 3393 * vnode to the lazy list and is now responsible for its 3394 * removal. 3395 */ 3396 if (vp->v_holdcnt == 0) { 3397 vp->v_mflag &= ~VMP_LAZYLIST; 3398 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3399 mp->mnt_lazyvnodelistsize--; 3400 } 3401 mtx_unlock(&mp->mnt_listmtx); 3402 } 3403 3404 /* 3405 * This routine is only meant to be called from vgonel prior to dooming 3406 * the vnode. 3407 */ 3408 static void 3409 vunlazy_gone(struct vnode *vp) 3410 { 3411 struct mount *mp; 3412 3413 ASSERT_VOP_ELOCKED(vp, __func__); 3414 ASSERT_VI_LOCKED(vp, __func__); 3415 VNPASS(!VN_IS_DOOMED(vp), vp); 3416 3417 if (vp->v_mflag & VMP_LAZYLIST) { 3418 mp = vp->v_mount; 3419 mtx_lock(&mp->mnt_listmtx); 3420 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3421 vp->v_mflag &= ~VMP_LAZYLIST; 3422 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3423 mp->mnt_lazyvnodelistsize--; 3424 mtx_unlock(&mp->mnt_listmtx); 3425 } 3426 } 3427 3428 static void 3429 vdefer_inactive(struct vnode *vp) 3430 { 3431 3432 ASSERT_VI_LOCKED(vp, __func__); 3433 VNPASS(vp->v_holdcnt > 0, vp); 3434 if (VN_IS_DOOMED(vp)) { 3435 vdropl(vp); 3436 return; 3437 } 3438 if (vp->v_iflag & VI_DEFINACT) { 3439 VNPASS(vp->v_holdcnt > 1, vp); 3440 vdropl(vp); 3441 return; 3442 } 3443 if (vp->v_usecount > 0) { 3444 vp->v_iflag &= ~VI_OWEINACT; 3445 vdropl(vp); 3446 return; 3447 } 3448 vlazy(vp); 3449 vp->v_iflag |= VI_DEFINACT; 3450 VI_UNLOCK(vp); 3451 atomic_add_long(&deferred_inact, 1); 3452 } 3453 3454 static void 3455 vdefer_inactive_unlocked(struct vnode *vp) 3456 { 3457 3458 VI_LOCK(vp); 3459 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3460 vdropl(vp); 3461 return; 3462 } 3463 vdefer_inactive(vp); 3464 } 3465 3466 enum vput_op { VRELE, VPUT, VUNREF }; 3467 3468 /* 3469 * Handle ->v_usecount transitioning to 0. 3470 * 3471 * By releasing the last usecount we take ownership of the hold count which 3472 * provides liveness of the vnode, meaning we have to vdrop. 3473 * 3474 * For all vnodes we may need to perform inactive processing. It requires an 3475 * exclusive lock on the vnode, while it is legal to call here with only a 3476 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3477 * inactive processing gets deferred to the syncer. 3478 * 3479 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3480 * on the lock being held all the way until VOP_INACTIVE. This in particular 3481 * happens with UFS which adds half-constructed vnodes to the hash, where they 3482 * can be found by other code. 
3483 */ 3484 static void 3485 vput_final(struct vnode *vp, enum vput_op func) 3486 { 3487 int error; 3488 bool want_unlock; 3489 3490 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3491 VNPASS(vp->v_holdcnt > 0, vp); 3492 3493 VI_LOCK(vp); 3494 3495 /* 3496 * By the time we got here someone else might have transitioned 3497 * the count back to > 0. 3498 */ 3499 if (vp->v_usecount > 0) 3500 goto out; 3501 3502 /* 3503 * If the vnode is doomed vgone already performed inactive processing 3504 * (if needed). 3505 */ 3506 if (VN_IS_DOOMED(vp)) 3507 goto out; 3508 3509 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3510 goto out; 3511 3512 if (vp->v_iflag & VI_DOINGINACT) 3513 goto out; 3514 3515 /* 3516 * Locking operations here will drop the interlock and possibly the 3517 * vnode lock, opening a window where the vnode can get doomed all the 3518 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3519 * perform inactive. 3520 */ 3521 vp->v_iflag |= VI_OWEINACT; 3522 want_unlock = false; 3523 error = 0; 3524 switch (func) { 3525 case VRELE: 3526 switch (VOP_ISLOCKED(vp)) { 3527 case LK_EXCLUSIVE: 3528 break; 3529 case LK_EXCLOTHER: 3530 case 0: 3531 want_unlock = true; 3532 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3533 VI_LOCK(vp); 3534 break; 3535 default: 3536 /* 3537 * The lock has at least one sharer, but we have no way 3538 * to conclude whether this is us. Play it safe and 3539 * defer processing. 3540 */ 3541 error = EAGAIN; 3542 break; 3543 } 3544 break; 3545 case VPUT: 3546 want_unlock = true; 3547 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3548 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3549 LK_NOWAIT); 3550 VI_LOCK(vp); 3551 } 3552 break; 3553 case VUNREF: 3554 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3555 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3556 VI_LOCK(vp); 3557 } 3558 break; 3559 } 3560 if (error == 0) { 3561 if (func == VUNREF) { 3562 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3563 ("recursive vunref")); 3564 vp->v_vflag |= VV_UNREF; 3565 } 3566 for (;;) { 3567 error = vinactive(vp); 3568 if (want_unlock) 3569 VOP_UNLOCK(vp); 3570 if (error != ERELOOKUP || !want_unlock) 3571 break; 3572 VOP_LOCK(vp, LK_EXCLUSIVE); 3573 } 3574 if (func == VUNREF) 3575 vp->v_vflag &= ~VV_UNREF; 3576 vdropl(vp); 3577 } else { 3578 vdefer_inactive(vp); 3579 } 3580 return; 3581 out: 3582 if (func == VPUT) 3583 VOP_UNLOCK(vp); 3584 vdropl(vp); 3585 } 3586 3587 /* 3588 * Decrement ->v_usecount for a vnode. 3589 * 3590 * Releasing the last use count requires additional processing, see vput_final 3591 * above for details. 3592 * 3593 * Comment above each variant denotes lock state on entry and exit. 
3594 */ 3595 3596 /* 3597 * in: any 3598 * out: same as passed in 3599 */ 3600 void 3601 vrele(struct vnode *vp) 3602 { 3603 3604 ASSERT_VI_UNLOCKED(vp, __func__); 3605 if (!refcount_release(&vp->v_usecount)) 3606 return; 3607 vput_final(vp, VRELE); 3608 } 3609 3610 /* 3611 * in: locked 3612 * out: unlocked 3613 */ 3614 void 3615 vput(struct vnode *vp) 3616 { 3617 3618 ASSERT_VOP_LOCKED(vp, __func__); 3619 ASSERT_VI_UNLOCKED(vp, __func__); 3620 if (!refcount_release(&vp->v_usecount)) { 3621 VOP_UNLOCK(vp); 3622 return; 3623 } 3624 vput_final(vp, VPUT); 3625 } 3626 3627 /* 3628 * in: locked 3629 * out: locked 3630 */ 3631 void 3632 vunref(struct vnode *vp) 3633 { 3634 3635 ASSERT_VOP_LOCKED(vp, __func__); 3636 ASSERT_VI_UNLOCKED(vp, __func__); 3637 if (!refcount_release(&vp->v_usecount)) 3638 return; 3639 vput_final(vp, VUNREF); 3640 } 3641 3642 void 3643 vhold(struct vnode *vp) 3644 { 3645 int old; 3646 3647 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3648 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3649 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3650 ("%s: wrong hold count %d", __func__, old)); 3651 if (old == 0) 3652 vfs_freevnodes_dec(); 3653 } 3654 3655 void 3656 vholdnz(struct vnode *vp) 3657 { 3658 3659 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3660 #ifdef INVARIANTS 3661 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3662 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3663 ("%s: wrong hold count %d", __func__, old)); 3664 #else 3665 atomic_add_int(&vp->v_holdcnt, 1); 3666 #endif 3667 } 3668 3669 /* 3670 * Grab a hold count unless the vnode is freed. 3671 * 3672 * Only use this routine if vfs smr is the only protection you have against 3673 * freeing the vnode. 3674 * 3675 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3676 * is not set. After the flag is set the vnode becomes immutable to anyone but 3677 * the thread which managed to set the flag. 3678 * 3679 * It may be tempting to replace the loop with: 3680 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3681 * if (count & VHOLD_NO_SMR) { 3682 * backpedal and error out; 3683 * } 3684 * 3685 * However, while this is more performant, it hinders debugging by eliminating 3686 * the previously mentioned invariant. 3687 */ 3688 bool 3689 vhold_smr(struct vnode *vp) 3690 { 3691 int count; 3692 3693 VFS_SMR_ASSERT_ENTERED(); 3694 3695 count = atomic_load_int(&vp->v_holdcnt); 3696 for (;;) { 3697 if (count & VHOLD_NO_SMR) { 3698 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3699 ("non-zero hold count with flags %d\n", count)); 3700 return (false); 3701 } 3702 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3703 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3704 if (count == 0) 3705 vfs_freevnodes_dec(); 3706 return (true); 3707 } 3708 } 3709 } 3710 3711 /* 3712 * Hold a free vnode for recycling. 3713 * 3714 * Note: vnode_init references this comment. 3715 * 3716 * Attempts to recycle only need the global vnode list lock and have no use for 3717 * SMR. 3718 * 3719 * However, vnodes get inserted into the global list before they get fully 3720 * initialized and stay there until UMA decides to free the memory. This in 3721 * particular means the target can be found before it becomes usable and after 3722 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3723 * VHOLD_NO_SMR. 3724 * 3725 * Note: the vnode may gain more references after we transition the count 0->1. 
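* If that happens, a later vtryrecycle() re-checks the use count under the interlock and bails out with EBUSY.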
3726 */ 3727 static bool 3728 vhold_recycle_free(struct vnode *vp) 3729 { 3730 int count; 3731 3732 mtx_assert(&vnode_list_mtx, MA_OWNED); 3733 3734 count = atomic_load_int(&vp->v_holdcnt); 3735 for (;;) { 3736 if (count & VHOLD_NO_SMR) { 3737 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3738 ("non-zero hold count with flags %d\n", count)); 3739 return (false); 3740 } 3741 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3742 if (count > 0) { 3743 return (false); 3744 } 3745 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3746 vfs_freevnodes_dec(); 3747 return (true); 3748 } 3749 } 3750 } 3751 3752 static void __noinline 3753 vdbatch_process(struct vdbatch *vd) 3754 { 3755 struct vnode *vp; 3756 int i; 3757 3758 mtx_assert(&vd->lock, MA_OWNED); 3759 MPASS(curthread->td_pinned > 0); 3760 MPASS(vd->index == VDBATCH_SIZE); 3761 3762 /* 3763 * Attempt to requeue the passed batch, but give up easily. 3764 * 3765 * Despite batching, the mechanism is prone to transient *significant* 3766 * lock contention, where vnode_list_mtx becomes the primary bottleneck 3767 * if multiple CPUs get here (one real-world example is highly parallel 3768 * do-nothing make, which will stat *tons* of vnodes). Since it is 3769 * quasi-LRU (read: not that great even if fully honoured) just dodge 3770 * the problem. Parties which don't like it are welcome to implement 3771 * something better. 3772 */ 3773 critical_enter(); 3774 if (mtx_trylock(&vnode_list_mtx)) { 3775 for (i = 0; i < VDBATCH_SIZE; i++) { 3776 vp = vd->tab[i]; 3777 vd->tab[i] = NULL; 3778 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3779 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3780 MPASS(vp->v_dbatchcpu != NOCPU); 3781 vp->v_dbatchcpu = NOCPU; 3782 } 3783 mtx_unlock(&vnode_list_mtx); 3784 } else { 3785 counter_u64_add(vnode_skipped_requeues, 1); 3786 3787 for (i = 0; i < VDBATCH_SIZE; i++) { 3788 vp = vd->tab[i]; 3789 vd->tab[i] = NULL; 3790 MPASS(vp->v_dbatchcpu != NOCPU); 3791 vp->v_dbatchcpu = NOCPU; 3792 } 3793 } 3794 vd->index = 0; 3795 critical_exit(); 3796 } 3797 3798 static void 3799 vdbatch_enqueue(struct vnode *vp) 3800 { 3801 struct vdbatch *vd; 3802 3803 ASSERT_VI_LOCKED(vp, __func__); 3804 VNPASS(!VN_IS_DOOMED(vp), vp); 3805 3806 if (vp->v_dbatchcpu != NOCPU) { 3807 VI_UNLOCK(vp); 3808 return; 3809 } 3810 3811 sched_pin(); 3812 vd = DPCPU_PTR(vd); 3813 mtx_lock(&vd->lock); 3814 MPASS(vd->index < VDBATCH_SIZE); 3815 MPASS(vd->tab[vd->index] == NULL); 3816 /* 3817 * A hack: we depend on being pinned so that we know what to put in 3818 * ->v_dbatchcpu. 3819 */ 3820 vp->v_dbatchcpu = curcpu; 3821 vd->tab[vd->index] = vp; 3822 vd->index++; 3823 VI_UNLOCK(vp); 3824 if (vd->index == VDBATCH_SIZE) 3825 vdbatch_process(vd); 3826 mtx_unlock(&vd->lock); 3827 sched_unpin(); 3828 } 3829 3830 /* 3831 * This routine must only be called for vnodes which are about to be 3832 * deallocated. Supporting dequeue for arbitrary vnodes would require 3833 * validating that the locked batch matches.
3834 */ 3835 static void 3836 vdbatch_dequeue(struct vnode *vp) 3837 { 3838 struct vdbatch *vd; 3839 int i; 3840 short cpu; 3841 3842 VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp); 3843 3844 cpu = vp->v_dbatchcpu; 3845 if (cpu == NOCPU) 3846 return; 3847 3848 vd = DPCPU_ID_PTR(cpu, vd); 3849 mtx_lock(&vd->lock); 3850 for (i = 0; i < vd->index; i++) { 3851 if (vd->tab[i] != vp) 3852 continue; 3853 vp->v_dbatchcpu = NOCPU; 3854 vd->index--; 3855 vd->tab[i] = vd->tab[vd->index]; 3856 vd->tab[vd->index] = NULL; 3857 break; 3858 } 3859 mtx_unlock(&vd->lock); 3860 /* 3861 * Either we dequeued the vnode above or the target CPU beat us to it. 3862 */ 3863 MPASS(vp->v_dbatchcpu == NOCPU); 3864 } 3865 3866 /* 3867 * Drop the hold count of the vnode. 3868 * 3869 * It will only get freed if this is the last hold *and* it has been vgone'd. 3870 * 3871 * Because the vnode vm object keeps a hold reference on the vnode if 3872 * there is at least one resident non-cached page, the vnode cannot 3873 * leave the active list without the page cleanup done. 3874 */ 3875 static void __noinline 3876 vdropl_final(struct vnode *vp) 3877 { 3878 3879 ASSERT_VI_LOCKED(vp, __func__); 3880 VNPASS(VN_IS_DOOMED(vp), vp); 3881 /* 3882 * Set the VHOLD_NO_SMR flag. 3883 * 3884 * We may be racing against vhold_smr. If they win we can just pretend 3885 * we never got this far, they will vdrop later. 3886 */ 3887 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3888 vfs_freevnodes_inc(); 3889 VI_UNLOCK(vp); 3890 /* 3891 * We lost the aforementioned race. Any subsequent access is 3892 * invalid as they might have managed to vdropl on their own. 3893 */ 3894 return; 3895 } 3896 /* 3897 * Don't bump freevnodes as this one is going away. 3898 */ 3899 freevnode(vp); 3900 } 3901 3902 void 3903 vdrop(struct vnode *vp) 3904 { 3905 3906 ASSERT_VI_UNLOCKED(vp, __func__); 3907 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3908 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3909 return; 3910 VI_LOCK(vp); 3911 vdropl(vp); 3912 } 3913 3914 static void __always_inline 3915 vdropl_impl(struct vnode *vp, bool enqueue) 3916 { 3917 3918 ASSERT_VI_LOCKED(vp, __func__); 3919 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3920 if (!refcount_release(&vp->v_holdcnt)) { 3921 VI_UNLOCK(vp); 3922 return; 3923 } 3924 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 3925 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 3926 if (VN_IS_DOOMED(vp)) { 3927 vdropl_final(vp); 3928 return; 3929 } 3930 3931 vfs_freevnodes_inc(); 3932 if (vp->v_mflag & VMP_LAZYLIST) { 3933 vunlazy(vp); 3934 } 3935 3936 if (!enqueue) { 3937 VI_UNLOCK(vp); 3938 return; 3939 } 3940 3941 /* 3942 * Also unlocks the interlock. We can't assert on it as we 3943 * released our hold and by now the vnode might have been 3944 * freed. 3945 */ 3946 vdbatch_enqueue(vp); 3947 } 3948 3949 void 3950 vdropl(struct vnode *vp) 3951 { 3952 3953 vdropl_impl(vp, true); 3954 } 3955 3956 /* 3957 * vdrop a vnode when recycling 3958 * 3959 * This is a special case routine only to be used when recycling, differs from 3960 * regular vdrop by not requeieing the vnode on LRU. 3961 * 3962 * Consider a case where vtryrecycle continuously fails with all vnodes (due to 3963 * e.g., frozen writes on the filesystem), filling the batch and causing it to 3964 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 3965 * loop which can last for as long as writes are frozen. 
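 * A minimal sketch of the intended pairing on the recycling path
 * (illustrative only):
 *
 *    if (vhold_recycle_free(vp)) {
 *            ... attempt the recycle ...
 *            vdrop_recycle(vp);    drops the hold without requeueing, so a
 *                                  failed attempt leaves vp where it was on
 *                                  the LRU instead of moving it to the tail
 *    }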
3966 */ 3967 static void 3968 vdropl_recycle(struct vnode *vp) 3969 { 3970 3971 vdropl_impl(vp, false); 3972 } 3973 3974 static void 3975 vdrop_recycle(struct vnode *vp) 3976 { 3977 3978 VI_LOCK(vp); 3979 vdropl_recycle(vp); 3980 } 3981 3982 /* 3983 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3984 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3985 */ 3986 static int 3987 vinactivef(struct vnode *vp) 3988 { 3989 struct vm_object *obj; 3990 int error; 3991 3992 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3993 ASSERT_VI_LOCKED(vp, "vinactive"); 3994 VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); 3995 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3996 vp->v_iflag |= VI_DOINGINACT; 3997 vp->v_iflag &= ~VI_OWEINACT; 3998 VI_UNLOCK(vp); 3999 /* 4000 * Before moving off the active list, we must be sure that any 4001 * modified pages are converted into the vnode's dirty 4002 * buffers, since these will no longer be checked once the 4003 * vnode is on the inactive list. 4004 * 4005 * The write-out of the dirty pages is asynchronous. At the 4006 * point that VOP_INACTIVE() is called, there could still be 4007 * pending I/O and dirty pages in the object. 4008 */ 4009 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 4010 vm_object_mightbedirty(obj)) { 4011 VM_OBJECT_WLOCK(obj); 4012 vm_object_page_clean(obj, 0, 0, 0); 4013 VM_OBJECT_WUNLOCK(obj); 4014 } 4015 error = VOP_INACTIVE(vp); 4016 VI_LOCK(vp); 4017 VNPASS(vp->v_iflag & VI_DOINGINACT, vp); 4018 vp->v_iflag &= ~VI_DOINGINACT; 4019 return (error); 4020 } 4021 4022 int 4023 vinactive(struct vnode *vp) 4024 { 4025 4026 ASSERT_VOP_ELOCKED(vp, "vinactive"); 4027 ASSERT_VI_LOCKED(vp, "vinactive"); 4028 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4029 4030 if ((vp->v_iflag & VI_OWEINACT) == 0) 4031 return (0); 4032 if (vp->v_iflag & VI_DOINGINACT) 4033 return (0); 4034 if (vp->v_usecount > 0) { 4035 vp->v_iflag &= ~VI_OWEINACT; 4036 return (0); 4037 } 4038 return (vinactivef(vp)); 4039 } 4040 4041 /* 4042 * Remove any vnodes in the vnode table belonging to mount point mp. 4043 * 4044 * If FORCECLOSE is not specified, there should not be any active ones, 4045 * return error if any are found (nb: this is a user error, not a 4046 * system error). If FORCECLOSE is specified, detach any active vnodes 4047 * that are found. 4048 * 4049 * If WRITECLOSE is set, only flush out regular file vnodes open for 4050 * writing. 4051 * 4052 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 4053 * 4054 * `rootrefs' specifies the base reference count for the root vnode 4055 * of this filesystem. The root vnode is considered busy if its 4056 * v_usecount exceeds this value. On a successful return, vflush(, td) 4057 * will call vrele() on the root vnode exactly rootrefs times. 4058 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 4059 * be zero. 4060 */ 4061 #ifdef DIAGNOSTIC 4062 static int busyprt = 0; /* print out busy vnodes */ 4063 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 4064 #endif 4065 4066 int 4067 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 4068 { 4069 struct vnode *vp, *mvp, *rootvp = NULL; 4070 struct vattr vattr; 4071 int busy = 0, error; 4072 4073 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 4074 rootrefs, flags); 4075 if (rootrefs > 0) { 4076 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 4077 ("vflush: bad args")); 4078 /* 4079 * Get the filesystem root vnode. 
We can vput() it 4080 * immediately, since with rootrefs > 0, it won't go away. 4081 */ 4082 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 4083 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 4084 __func__, error); 4085 return (error); 4086 } 4087 vput(rootvp); 4088 } 4089 loop: 4090 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 4091 vholdl(vp); 4092 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 4093 if (error) { 4094 vdrop(vp); 4095 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4096 goto loop; 4097 } 4098 /* 4099 * Skip over a vnodes marked VV_SYSTEM. 4100 */ 4101 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 4102 VOP_UNLOCK(vp); 4103 vdrop(vp); 4104 continue; 4105 } 4106 /* 4107 * If WRITECLOSE is set, flush out unlinked but still open 4108 * files (even if open only for reading) and regular file 4109 * vnodes open for writing. 4110 */ 4111 if (flags & WRITECLOSE) { 4112 if (vp->v_object != NULL) { 4113 VM_OBJECT_WLOCK(vp->v_object); 4114 vm_object_page_clean(vp->v_object, 0, 0, 0); 4115 VM_OBJECT_WUNLOCK(vp->v_object); 4116 } 4117 do { 4118 error = VOP_FSYNC(vp, MNT_WAIT, td); 4119 } while (error == ERELOOKUP); 4120 if (error != 0) { 4121 VOP_UNLOCK(vp); 4122 vdrop(vp); 4123 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 4124 return (error); 4125 } 4126 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 4127 VI_LOCK(vp); 4128 4129 if ((vp->v_type == VNON || 4130 (error == 0 && vattr.va_nlink > 0)) && 4131 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 4132 VOP_UNLOCK(vp); 4133 vdropl(vp); 4134 continue; 4135 } 4136 } else 4137 VI_LOCK(vp); 4138 /* 4139 * With v_usecount == 0, all we need to do is clear out the 4140 * vnode data structures and we are done. 4141 * 4142 * If FORCECLOSE is set, forcibly close the vnode. 4143 */ 4144 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 4145 vgonel(vp); 4146 } else { 4147 busy++; 4148 #ifdef DIAGNOSTIC 4149 if (busyprt) 4150 vn_printf(vp, "vflush: busy vnode "); 4151 #endif 4152 } 4153 VOP_UNLOCK(vp); 4154 vdropl(vp); 4155 } 4156 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 4157 /* 4158 * If just the root vnode is busy, and if its refcount 4159 * is equal to `rootrefs', then go ahead and kill it. 4160 */ 4161 VI_LOCK(rootvp); 4162 KASSERT(busy > 0, ("vflush: not busy")); 4163 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 4164 ("vflush: usecount %d < rootrefs %d", 4165 rootvp->v_usecount, rootrefs)); 4166 if (busy == 1 && rootvp->v_usecount == rootrefs) { 4167 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 4168 vgone(rootvp); 4169 VOP_UNLOCK(rootvp); 4170 busy = 0; 4171 } else 4172 VI_UNLOCK(rootvp); 4173 } 4174 if (busy) { 4175 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 4176 busy); 4177 return (EBUSY); 4178 } 4179 for (; rootrefs > 0; rootrefs--) 4180 vrele(rootvp); 4181 return (0); 4182 } 4183 4184 /* 4185 * Recycle an unused vnode. 4186 */ 4187 int 4188 vrecycle(struct vnode *vp) 4189 { 4190 int recycled; 4191 4192 VI_LOCK(vp); 4193 recycled = vrecyclel(vp); 4194 VI_UNLOCK(vp); 4195 return (recycled); 4196 } 4197 4198 /* 4199 * vrecycle, with the vp interlock held. 4200 */ 4201 int 4202 vrecyclel(struct vnode *vp) 4203 { 4204 int recycled; 4205 4206 ASSERT_VOP_ELOCKED(vp, __func__); 4207 ASSERT_VI_LOCKED(vp, __func__); 4208 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4209 recycled = 0; 4210 if (vp->v_usecount == 0) { 4211 recycled = 1; 4212 vgonel(vp); 4213 } 4214 return (recycled); 4215 } 4216 4217 /* 4218 * Eliminate all activity associated with a vnode 4219 * in preparation for reuse. 
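 * The caller must hold a reference and have the vnode exclusively locked;
 * a typical pattern (illustrative) is:
 *
 *    vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *    vgone(vp);
 *    vput(vp);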
4220 */ 4221 void 4222 vgone(struct vnode *vp) 4223 { 4224 VI_LOCK(vp); 4225 vgonel(vp); 4226 VI_UNLOCK(vp); 4227 } 4228 4229 /* 4230 * Notify upper mounts about reclaimed or unlinked vnode. 4231 */ 4232 void 4233 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 4234 { 4235 struct mount *mp; 4236 struct mount_upper_node *ump; 4237 4238 mp = atomic_load_ptr(&vp->v_mount); 4239 if (mp == NULL) 4240 return; 4241 if (TAILQ_EMPTY(&mp->mnt_notify)) 4242 return; 4243 4244 MNT_ILOCK(mp); 4245 mp->mnt_upper_pending++; 4246 KASSERT(mp->mnt_upper_pending > 0, 4247 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 4248 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 4249 MNT_IUNLOCK(mp); 4250 switch (event) { 4251 case VFS_NOTIFY_UPPER_RECLAIM: 4252 VFS_RECLAIM_LOWERVP(ump->mp, vp); 4253 break; 4254 case VFS_NOTIFY_UPPER_UNLINK: 4255 VFS_UNLINK_LOWERVP(ump->mp, vp); 4256 break; 4257 } 4258 MNT_ILOCK(mp); 4259 } 4260 mp->mnt_upper_pending--; 4261 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 4262 mp->mnt_upper_pending == 0) { 4263 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4264 wakeup(&mp->mnt_uppers); 4265 } 4266 MNT_IUNLOCK(mp); 4267 } 4268 4269 /* 4270 * vgone, with the vp interlock held. 4271 */ 4272 static void 4273 vgonel(struct vnode *vp) 4274 { 4275 struct thread *td; 4276 struct mount *mp; 4277 vm_object_t object; 4278 bool active, doinginact, oweinact; 4279 4280 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4281 ASSERT_VI_LOCKED(vp, "vgonel"); 4282 VNASSERT(vp->v_holdcnt, vp, 4283 ("vgonel: vp %p has no reference.", vp)); 4284 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4285 td = curthread; 4286 4287 /* 4288 * Don't vgonel if we're already doomed. 4289 */ 4290 if (VN_IS_DOOMED(vp)) { 4291 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4292 vn_get_state(vp) == VSTATE_DEAD, vp); 4293 return; 4294 } 4295 /* 4296 * Paired with freevnode. 4297 */ 4298 vn_seqc_write_begin_locked(vp); 4299 vunlazy_gone(vp); 4300 vn_irflag_set_locked(vp, VIRF_DOOMED); 4301 vn_set_state(vp, VSTATE_DESTROYING); 4302 4303 /* 4304 * Check to see if the vnode is in use. If so, we have to 4305 * call VOP_CLOSE() and VOP_INACTIVE(). 4306 * 4307 * It could be that VOP_INACTIVE() requested reclamation, in 4308 * which case we should avoid recursion, so check 4309 * VI_DOINGINACT. This is not precise but good enough. 4310 */ 4311 active = vp->v_usecount > 0; 4312 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4313 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4314 4315 /* 4316 * If we need to do inactive VI_OWEINACT will be set. 4317 */ 4318 if (vp->v_iflag & VI_DEFINACT) { 4319 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4320 vp->v_iflag &= ~VI_DEFINACT; 4321 vdropl(vp); 4322 } else { 4323 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4324 VI_UNLOCK(vp); 4325 } 4326 cache_purge_vgone(vp); 4327 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4328 4329 /* 4330 * If purging an active vnode, it must be closed and 4331 * deactivated before being reclaimed. 4332 */ 4333 if (active) 4334 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4335 if (!doinginact) { 4336 do { 4337 if (oweinact || active) { 4338 VI_LOCK(vp); 4339 vinactivef(vp); 4340 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4341 VI_UNLOCK(vp); 4342 } 4343 } while (oweinact); 4344 } 4345 if (vp->v_type == VSOCK) 4346 vfs_unp_reclaim(vp); 4347 4348 /* 4349 * Clean out any buffers associated with the vnode. 4350 * If the flush fails, just toss the buffers. 
4351 */ 4352 mp = NULL; 4353 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4354 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4355 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4356 while (vinvalbuf(vp, 0, 0, 0) != 0) 4357 ; 4358 } 4359 4360 BO_LOCK(&vp->v_bufobj); 4361 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4362 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4363 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4364 vp->v_bufobj.bo_clean.bv_cnt == 0, 4365 ("vp %p bufobj not invalidated", vp)); 4366 4367 /* 4368 * For VMIO bufobj, BO_DEAD is set later, or in 4369 * vm_object_terminate() after the object's page queue is 4370 * flushed. 4371 */ 4372 object = vp->v_bufobj.bo_object; 4373 if (object == NULL) 4374 vp->v_bufobj.bo_flag |= BO_DEAD; 4375 BO_UNLOCK(&vp->v_bufobj); 4376 4377 /* 4378 * Handle the VM part. Tmpfs handles v_object on its own (the 4379 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4380 * should not touch the object borrowed from the lower vnode 4381 * (the handle check). 4382 */ 4383 if (object != NULL && object->type == OBJT_VNODE && 4384 object->handle == vp) 4385 vnode_destroy_vobject(vp); 4386 4387 /* 4388 * Reclaim the vnode. 4389 */ 4390 if (VOP_RECLAIM(vp)) 4391 panic("vgone: cannot reclaim"); 4392 if (mp != NULL) 4393 vn_finished_secondary_write(mp); 4394 VNASSERT(vp->v_object == NULL, vp, 4395 ("vop_reclaim left v_object vp=%p", vp)); 4396 /* 4397 * Clear the advisory locks and wake up waiting threads. 4398 */ 4399 if (vp->v_lockf != NULL) { 4400 (void)VOP_ADVLOCKPURGE(vp); 4401 vp->v_lockf = NULL; 4402 } 4403 /* 4404 * Delete from old mount point vnode list. 4405 */ 4406 if (vp->v_mount == NULL) { 4407 VI_LOCK(vp); 4408 } else { 4409 delmntque(vp); 4410 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4411 } 4412 /* 4413 * Done with purge, reset to the standard lock and invalidate 4414 * the vnode. 4415 */ 4416 vp->v_vnlock = &vp->v_lock; 4417 vp->v_op = &dead_vnodeops; 4418 vp->v_type = VBAD; 4419 vn_set_state(vp, VSTATE_DEAD); 4420 } 4421 4422 /* 4423 * Print out a description of a vnode. 4424 */ 4425 static const char *const vtypename[] = { 4426 [VNON] = "VNON", 4427 [VREG] = "VREG", 4428 [VDIR] = "VDIR", 4429 [VBLK] = "VBLK", 4430 [VCHR] = "VCHR", 4431 [VLNK] = "VLNK", 4432 [VSOCK] = "VSOCK", 4433 [VFIFO] = "VFIFO", 4434 [VBAD] = "VBAD", 4435 [VMARKER] = "VMARKER", 4436 }; 4437 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4438 "vnode type name not added to vtypename"); 4439 4440 static const char *const vstatename[] = { 4441 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4442 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4443 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4444 [VSTATE_DEAD] = "VSTATE_DEAD", 4445 }; 4446 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4447 "vnode state name not added to vstatename"); 4448 4449 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4450 "new hold count flag not added to vn_printf"); 4451 4452 void 4453 vn_printf(struct vnode *vp, const char *fmt, ...) 
4454 { 4455 va_list ap; 4456 char buf[256], buf2[16]; 4457 u_long flags; 4458 u_int holdcnt; 4459 short irflag; 4460 4461 va_start(ap, fmt); 4462 vprintf(fmt, ap); 4463 va_end(ap); 4464 printf("%p: ", (void *)vp); 4465 printf("type %s state %s op %p\n", vtypename[vp->v_type], 4466 vstatename[vp->v_state], vp->v_op); 4467 holdcnt = atomic_load_int(&vp->v_holdcnt); 4468 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4469 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4470 vp->v_seqc_users); 4471 switch (vp->v_type) { 4472 case VDIR: 4473 printf(" mountedhere %p\n", vp->v_mountedhere); 4474 break; 4475 case VCHR: 4476 printf(" rdev %p\n", vp->v_rdev); 4477 break; 4478 case VSOCK: 4479 printf(" socket %p\n", vp->v_unpcb); 4480 break; 4481 case VFIFO: 4482 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4483 break; 4484 default: 4485 printf("\n"); 4486 break; 4487 } 4488 buf[0] = '\0'; 4489 buf[1] = '\0'; 4490 if (holdcnt & VHOLD_NO_SMR) 4491 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4492 printf(" hold count flags (%s)\n", buf + 1); 4493 4494 buf[0] = '\0'; 4495 buf[1] = '\0'; 4496 irflag = vn_irflag_read(vp); 4497 if (irflag & VIRF_DOOMED) 4498 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4499 if (irflag & VIRF_PGREAD) 4500 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4501 if (irflag & VIRF_MOUNTPOINT) 4502 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4503 if (irflag & VIRF_TEXT_REF) 4504 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4505 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4506 if (flags != 0) { 4507 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4508 strlcat(buf, buf2, sizeof(buf)); 4509 } 4510 if (vp->v_vflag & VV_ROOT) 4511 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4512 if (vp->v_vflag & VV_ISTTY) 4513 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4514 if (vp->v_vflag & VV_NOSYNC) 4515 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4516 if (vp->v_vflag & VV_ETERNALDEV) 4517 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4518 if (vp->v_vflag & VV_CACHEDLABEL) 4519 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4520 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4521 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4522 if (vp->v_vflag & VV_COPYONWRITE) 4523 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4524 if (vp->v_vflag & VV_SYSTEM) 4525 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4526 if (vp->v_vflag & VV_PROCDEP) 4527 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4528 if (vp->v_vflag & VV_DELETED) 4529 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4530 if (vp->v_vflag & VV_MD) 4531 strlcat(buf, "|VV_MD", sizeof(buf)); 4532 if (vp->v_vflag & VV_FORCEINSMQ) 4533 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4534 if (vp->v_vflag & VV_READLINK) 4535 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4536 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4537 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4538 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4539 if (flags != 0) { 4540 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4541 strlcat(buf, buf2, sizeof(buf)); 4542 } 4543 if (vp->v_iflag & VI_MOUNT) 4544 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4545 if (vp->v_iflag & VI_DOINGINACT) 4546 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4547 if (vp->v_iflag & VI_OWEINACT) 4548 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4549 if (vp->v_iflag & VI_DEFINACT) 4550 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4551 if (vp->v_iflag & VI_FOPENING) 4552 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4553 flags = 
vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4554 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4555 if (flags != 0) { 4556 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4557 strlcat(buf, buf2, sizeof(buf)); 4558 } 4559 if (vp->v_mflag & VMP_LAZYLIST) 4560 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4561 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4562 if (flags != 0) { 4563 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4564 strlcat(buf, buf2, sizeof(buf)); 4565 } 4566 printf(" flags (%s)", buf + 1); 4567 if (mtx_owned(VI_MTX(vp))) 4568 printf(" VI_LOCKed"); 4569 printf("\n"); 4570 if (vp->v_object != NULL) 4571 printf(" v_object %p ref %d pages %d " 4572 "cleanbuf %d dirtybuf %d\n", 4573 vp->v_object, vp->v_object->ref_count, 4574 vp->v_object->resident_page_count, 4575 vp->v_bufobj.bo_clean.bv_cnt, 4576 vp->v_bufobj.bo_dirty.bv_cnt); 4577 printf(" "); 4578 lockmgr_printinfo(vp->v_vnlock); 4579 if (vp->v_data != NULL) 4580 VOP_PRINT(vp); 4581 } 4582 4583 #ifdef DDB 4584 /* 4585 * List all of the locked vnodes in the system. 4586 * Called when debugging the kernel. 4587 */ 4588 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4589 { 4590 struct mount *mp; 4591 struct vnode *vp; 4592 4593 /* 4594 * Note: because this is DDB, we can't obey the locking semantics 4595 * for these structures, which means we could catch an inconsistent 4596 * state and dereference a nasty pointer. Not much to be done 4597 * about that. 4598 */ 4599 db_printf("Locked vnodes\n"); 4600 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4601 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4602 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4603 vn_printf(vp, "vnode "); 4604 } 4605 } 4606 } 4607 4608 /* 4609 * Show details about the given vnode. 4610 */ 4611 DB_SHOW_COMMAND(vnode, db_show_vnode) 4612 { 4613 struct vnode *vp; 4614 4615 if (!have_addr) 4616 return; 4617 vp = (struct vnode *)addr; 4618 vn_printf(vp, "vnode "); 4619 } 4620 4621 /* 4622 * Show details about the given mount point. 4623 */ 4624 DB_SHOW_COMMAND(mount, db_show_mount) 4625 { 4626 struct mount *mp; 4627 struct vfsopt *opt; 4628 struct statfs *sp; 4629 struct vnode *vp; 4630 char buf[512]; 4631 uint64_t mflags; 4632 u_int flags; 4633 4634 if (!have_addr) { 4635 /* No address given, print short info about all mount points. 
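 * Example ddb(4) usage (the address below is illustrative):
 *
 *    db> show mount
 *    db> show mount 0xfffff80004a8e000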
*/ 4636 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4637 db_printf("%p %s on %s (%s)\n", mp, 4638 mp->mnt_stat.f_mntfromname, 4639 mp->mnt_stat.f_mntonname, 4640 mp->mnt_stat.f_fstypename); 4641 if (db_pager_quit) 4642 break; 4643 } 4644 db_printf("\nMore info: show mount <addr>\n"); 4645 return; 4646 } 4647 4648 mp = (struct mount *)addr; 4649 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4650 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4651 4652 buf[0] = '\0'; 4653 mflags = mp->mnt_flag; 4654 #define MNT_FLAG(flag) do { \ 4655 if (mflags & (flag)) { \ 4656 if (buf[0] != '\0') \ 4657 strlcat(buf, ", ", sizeof(buf)); \ 4658 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4659 mflags &= ~(flag); \ 4660 } \ 4661 } while (0) 4662 MNT_FLAG(MNT_RDONLY); 4663 MNT_FLAG(MNT_SYNCHRONOUS); 4664 MNT_FLAG(MNT_NOEXEC); 4665 MNT_FLAG(MNT_NOSUID); 4666 MNT_FLAG(MNT_NFS4ACLS); 4667 MNT_FLAG(MNT_UNION); 4668 MNT_FLAG(MNT_ASYNC); 4669 MNT_FLAG(MNT_SUIDDIR); 4670 MNT_FLAG(MNT_SOFTDEP); 4671 MNT_FLAG(MNT_NOSYMFOLLOW); 4672 MNT_FLAG(MNT_GJOURNAL); 4673 MNT_FLAG(MNT_MULTILABEL); 4674 MNT_FLAG(MNT_ACLS); 4675 MNT_FLAG(MNT_NOATIME); 4676 MNT_FLAG(MNT_NOCLUSTERR); 4677 MNT_FLAG(MNT_NOCLUSTERW); 4678 MNT_FLAG(MNT_SUJ); 4679 MNT_FLAG(MNT_EXRDONLY); 4680 MNT_FLAG(MNT_EXPORTED); 4681 MNT_FLAG(MNT_DEFEXPORTED); 4682 MNT_FLAG(MNT_EXPORTANON); 4683 MNT_FLAG(MNT_EXKERB); 4684 MNT_FLAG(MNT_EXPUBLIC); 4685 MNT_FLAG(MNT_LOCAL); 4686 MNT_FLAG(MNT_QUOTA); 4687 MNT_FLAG(MNT_ROOTFS); 4688 MNT_FLAG(MNT_USER); 4689 MNT_FLAG(MNT_IGNORE); 4690 MNT_FLAG(MNT_UPDATE); 4691 MNT_FLAG(MNT_DELEXPORT); 4692 MNT_FLAG(MNT_RELOAD); 4693 MNT_FLAG(MNT_FORCE); 4694 MNT_FLAG(MNT_SNAPSHOT); 4695 MNT_FLAG(MNT_BYFSID); 4696 #undef MNT_FLAG 4697 if (mflags != 0) { 4698 if (buf[0] != '\0') 4699 strlcat(buf, ", ", sizeof(buf)); 4700 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4701 "0x%016jx", mflags); 4702 } 4703 db_printf(" mnt_flag = %s\n", buf); 4704 4705 buf[0] = '\0'; 4706 flags = mp->mnt_kern_flag; 4707 #define MNT_KERN_FLAG(flag) do { \ 4708 if (flags & (flag)) { \ 4709 if (buf[0] != '\0') \ 4710 strlcat(buf, ", ", sizeof(buf)); \ 4711 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4712 flags &= ~(flag); \ 4713 } \ 4714 } while (0) 4715 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4716 MNT_KERN_FLAG(MNTK_ASYNC); 4717 MNT_KERN_FLAG(MNTK_SOFTDEP); 4718 MNT_KERN_FLAG(MNTK_NOMSYNC); 4719 MNT_KERN_FLAG(MNTK_DRAINING); 4720 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4721 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4722 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4723 MNT_KERN_FLAG(MNTK_NO_IOPF); 4724 MNT_KERN_FLAG(MNTK_RECURSE); 4725 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4726 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4727 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4728 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4729 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4730 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4731 MNT_KERN_FLAG(MNTK_NOASYNC); 4732 MNT_KERN_FLAG(MNTK_UNMOUNT); 4733 MNT_KERN_FLAG(MNTK_MWAIT); 4734 MNT_KERN_FLAG(MNTK_SUSPEND); 4735 MNT_KERN_FLAG(MNTK_SUSPEND2); 4736 MNT_KERN_FLAG(MNTK_SUSPENDED); 4737 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4738 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4739 #undef MNT_KERN_FLAG 4740 if (flags != 0) { 4741 if (buf[0] != '\0') 4742 strlcat(buf, ", ", sizeof(buf)); 4743 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4744 "0x%08x", flags); 4745 } 4746 db_printf(" mnt_kern_flag = %s\n", buf); 4747 4748 db_printf(" mnt_opt = "); 4749 opt = TAILQ_FIRST(mp->mnt_opt); 4750 if (opt != NULL) { 4751 db_printf("%s", opt->name); 4752 opt = TAILQ_NEXT(opt, link); 4753 while (opt != 
NULL) { 4754 db_printf(", %s", opt->name); 4755 opt = TAILQ_NEXT(opt, link); 4756 } 4757 } 4758 db_printf("\n"); 4759 4760 sp = &mp->mnt_stat; 4761 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4762 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4763 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4764 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4765 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4766 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4767 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4768 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4769 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4770 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4771 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4772 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4773 4774 db_printf(" mnt_cred = { uid=%u ruid=%u", 4775 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4776 if (jailed(mp->mnt_cred)) 4777 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4778 db_printf(" }\n"); 4779 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4780 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4781 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4782 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4783 db_printf(" mnt_lazyvnodelistsize = %d\n", 4784 mp->mnt_lazyvnodelistsize); 4785 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4786 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4787 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4788 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4789 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4790 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4791 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4792 db_printf(" mnt_secondary_accwrites = %d\n", 4793 mp->mnt_secondary_accwrites); 4794 db_printf(" mnt_gjprovider = %s\n", 4795 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4796 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4797 4798 db_printf("\n\nList of active vnodes\n"); 4799 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4800 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4801 vn_printf(vp, "vnode "); 4802 if (db_pager_quit) 4803 break; 4804 } 4805 } 4806 db_printf("\n\nList of inactive vnodes\n"); 4807 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4808 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4809 vn_printf(vp, "vnode "); 4810 if (db_pager_quit) 4811 break; 4812 } 4813 } 4814 } 4815 #endif /* DDB */ 4816 4817 /* 4818 * Fill in a struct xvfsconf based on a struct vfsconf. 4819 */ 4820 static int 4821 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4822 { 4823 struct xvfsconf xvfsp; 4824 4825 bzero(&xvfsp, sizeof(xvfsp)); 4826 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4827 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4828 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4829 xvfsp.vfc_flags = vfsp->vfc_flags; 4830 /* 4831 * These are unused in userland, we keep them 4832 * to not break binary compatibility. 
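 * Userland normally obtains these records through the vfs.conflist sysctl
 * defined below, e.g. (illustrative):
 *
 *    size_t len;
 *    if (sysctlbyname("vfs.conflist", NULL, &len, NULL, 0) == 0) {
 *            buf = malloc(len);
 *            sysctlbyname("vfs.conflist", buf, &len, NULL, 0);
 *    }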
4833 */ 4834 xvfsp.vfc_vfsops = NULL; 4835 xvfsp.vfc_next = NULL; 4836 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4837 } 4838 4839 #ifdef COMPAT_FREEBSD32 4840 struct xvfsconf32 { 4841 uint32_t vfc_vfsops; 4842 char vfc_name[MFSNAMELEN]; 4843 int32_t vfc_typenum; 4844 int32_t vfc_refcount; 4845 int32_t vfc_flags; 4846 uint32_t vfc_next; 4847 }; 4848 4849 static int 4850 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4851 { 4852 struct xvfsconf32 xvfsp; 4853 4854 bzero(&xvfsp, sizeof(xvfsp)); 4855 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4856 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4857 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4858 xvfsp.vfc_flags = vfsp->vfc_flags; 4859 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4860 } 4861 #endif 4862 4863 /* 4864 * Top level filesystem related information gathering. 4865 */ 4866 static int 4867 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4868 { 4869 struct vfsconf *vfsp; 4870 int error; 4871 4872 error = 0; 4873 vfsconf_slock(); 4874 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4875 #ifdef COMPAT_FREEBSD32 4876 if (req->flags & SCTL_MASK32) 4877 error = vfsconf2x32(req, vfsp); 4878 else 4879 #endif 4880 error = vfsconf2x(req, vfsp); 4881 if (error) 4882 break; 4883 } 4884 vfsconf_sunlock(); 4885 return (error); 4886 } 4887 4888 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4889 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4890 "S,xvfsconf", "List of all configured filesystems"); 4891 4892 #ifndef BURN_BRIDGES 4893 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4894 4895 static int 4896 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4897 { 4898 int *name = (int *)arg1 - 1; /* XXX */ 4899 u_int namelen = arg2 + 1; /* XXX */ 4900 struct vfsconf *vfsp; 4901 4902 log(LOG_WARNING, "userland calling deprecated sysctl, " 4903 "please rebuild world\n"); 4904 4905 #if 1 || defined(COMPAT_PRELITE2) 4906 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
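 * For reference, the unambiguous form of this legacy interface looks
 * roughly like this from userland (illustrative only):
 *
 *    int mib[4] = { CTL_VFS, VFS_GENERIC, VFS_CONF, typenum };
 *    struct xvfsconf xvfsp;
 *    size_t len = sizeof(xvfsp);
 *    sysctl(mib, 4, &xvfsp, &len, NULL, 0);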
*/ 4907 if (namelen == 1) 4908 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4909 #endif 4910 4911 switch (name[1]) { 4912 case VFS_MAXTYPENUM: 4913 if (namelen != 2) 4914 return (ENOTDIR); 4915 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4916 case VFS_CONF: 4917 if (namelen != 3) 4918 return (ENOTDIR); /* overloaded */ 4919 vfsconf_slock(); 4920 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4921 if (vfsp->vfc_typenum == name[2]) 4922 break; 4923 } 4924 vfsconf_sunlock(); 4925 if (vfsp == NULL) 4926 return (EOPNOTSUPP); 4927 #ifdef COMPAT_FREEBSD32 4928 if (req->flags & SCTL_MASK32) 4929 return (vfsconf2x32(req, vfsp)); 4930 else 4931 #endif 4932 return (vfsconf2x(req, vfsp)); 4933 } 4934 return (EOPNOTSUPP); 4935 } 4936 4937 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4938 CTLFLAG_MPSAFE, vfs_sysctl, 4939 "Generic filesystem"); 4940 4941 #if 1 || defined(COMPAT_PRELITE2) 4942 4943 static int 4944 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4945 { 4946 int error; 4947 struct vfsconf *vfsp; 4948 struct ovfsconf ovfs; 4949 4950 vfsconf_slock(); 4951 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4952 bzero(&ovfs, sizeof(ovfs)); 4953 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4954 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4955 ovfs.vfc_index = vfsp->vfc_typenum; 4956 ovfs.vfc_refcount = vfsp->vfc_refcount; 4957 ovfs.vfc_flags = vfsp->vfc_flags; 4958 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4959 if (error != 0) { 4960 vfsconf_sunlock(); 4961 return (error); 4962 } 4963 } 4964 vfsconf_sunlock(); 4965 return (0); 4966 } 4967 4968 #endif /* 1 || COMPAT_PRELITE2 */ 4969 #endif /* !BURN_BRIDGES */ 4970 4971 static void 4972 unmount_or_warn(struct mount *mp) 4973 { 4974 int error; 4975 4976 error = dounmount(mp, MNT_FORCE, curthread); 4977 if (error != 0) { 4978 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4979 if (error == EBUSY) 4980 printf("BUSY)\n"); 4981 else 4982 printf("%d)\n", error); 4983 } 4984 } 4985 4986 /* 4987 * Unmount all filesystems. The list is traversed in reverse order 4988 * of mounting to avoid dependencies. 4989 */ 4990 void 4991 vfs_unmountall(void) 4992 { 4993 struct mount *mp, *tmp; 4994 4995 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4996 4997 /* 4998 * Since this only runs when rebooting, it is not interlocked. 4999 */ 5000 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 5001 vfs_ref(mp); 5002 5003 /* 5004 * Forcibly unmounting "/dev" before "/" would prevent clean 5005 * unmount of the latter. 
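 * For example, with /, /dev and /usr mounted in that order, the reverse
 * traversal unmounts /usr first, skips /dev here, unmounts /, and /dev is
 * finally unmounted after the loop via rootdevmp.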
5006 */ 5007 if (mp == rootdevmp) 5008 continue; 5009 5010 unmount_or_warn(mp); 5011 } 5012 5013 if (rootdevmp != NULL) 5014 unmount_or_warn(rootdevmp); 5015 } 5016 5017 static void 5018 vfs_deferred_inactive(struct vnode *vp, int lkflags) 5019 { 5020 5021 ASSERT_VI_LOCKED(vp, __func__); 5022 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 5023 if ((vp->v_iflag & VI_OWEINACT) == 0) { 5024 vdropl(vp); 5025 return; 5026 } 5027 if (vn_lock(vp, lkflags) == 0) { 5028 VI_LOCK(vp); 5029 vinactive(vp); 5030 VOP_UNLOCK(vp); 5031 vdropl(vp); 5032 return; 5033 } 5034 vdefer_inactive_unlocked(vp); 5035 } 5036 5037 static int 5038 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 5039 { 5040 5041 return (vp->v_iflag & VI_DEFINACT); 5042 } 5043 5044 static void __noinline 5045 vfs_periodic_inactive(struct mount *mp, int flags) 5046 { 5047 struct vnode *vp, *mvp; 5048 int lkflags; 5049 5050 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5051 if (flags != MNT_WAIT) 5052 lkflags |= LK_NOWAIT; 5053 5054 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 5055 if ((vp->v_iflag & VI_DEFINACT) == 0) { 5056 VI_UNLOCK(vp); 5057 continue; 5058 } 5059 vp->v_iflag &= ~VI_DEFINACT; 5060 vfs_deferred_inactive(vp, lkflags); 5061 } 5062 } 5063 5064 static inline bool 5065 vfs_want_msync(struct vnode *vp) 5066 { 5067 struct vm_object *obj; 5068 5069 /* 5070 * This test may be performed without any locks held. 5071 * We rely on vm_object's type stability. 5072 */ 5073 if (vp->v_vflag & VV_NOSYNC) 5074 return (false); 5075 obj = vp->v_object; 5076 return (obj != NULL && vm_object_mightbedirty(obj)); 5077 } 5078 5079 static int 5080 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 5081 { 5082 5083 if (vp->v_vflag & VV_NOSYNC) 5084 return (false); 5085 if (vp->v_iflag & VI_DEFINACT) 5086 return (true); 5087 return (vfs_want_msync(vp)); 5088 } 5089 5090 static void __noinline 5091 vfs_periodic_msync_inactive(struct mount *mp, int flags) 5092 { 5093 struct vnode *vp, *mvp; 5094 struct vm_object *obj; 5095 int lkflags, objflags; 5096 bool seen_defer; 5097 5098 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 5099 if (flags != MNT_WAIT) { 5100 lkflags |= LK_NOWAIT; 5101 objflags = OBJPC_NOSYNC; 5102 } else { 5103 objflags = OBJPC_SYNC; 5104 } 5105 5106 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 5107 seen_defer = false; 5108 if (vp->v_iflag & VI_DEFINACT) { 5109 vp->v_iflag &= ~VI_DEFINACT; 5110 seen_defer = true; 5111 } 5112 if (!vfs_want_msync(vp)) { 5113 if (seen_defer) 5114 vfs_deferred_inactive(vp, lkflags); 5115 else 5116 VI_UNLOCK(vp); 5117 continue; 5118 } 5119 if (vget(vp, lkflags) == 0) { 5120 obj = vp->v_object; 5121 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 5122 VM_OBJECT_WLOCK(obj); 5123 vm_object_page_clean(obj, 0, 0, objflags); 5124 VM_OBJECT_WUNLOCK(obj); 5125 } 5126 vput(vp); 5127 if (seen_defer) 5128 vdrop(vp); 5129 } else { 5130 if (seen_defer) 5131 vdefer_inactive_unlocked(vp); 5132 } 5133 } 5134 } 5135 5136 void 5137 vfs_periodic(struct mount *mp, int flags) 5138 { 5139 5140 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 5141 5142 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 5143 vfs_periodic_inactive(mp, flags); 5144 else 5145 vfs_periodic_msync_inactive(mp, flags); 5146 } 5147 5148 static void 5149 destroy_vpollinfo_free(struct vpollinfo *vi) 5150 { 5151 5152 knlist_destroy(&vi->vpi_selinfo.si_note); 5153 mtx_destroy(&vi->vpi_lock); 5154 free(vi, M_VNODEPOLL); 5155 } 5156 5157 static void 5158 destroy_vpollinfo(struct vpollinfo 
*vi) 5159 { 5160 5161 knlist_clear(&vi->vpi_selinfo.si_note, 1); 5162 seldrain(&vi->vpi_selinfo); 5163 destroy_vpollinfo_free(vi); 5164 } 5165 5166 /* 5167 * Initialize per-vnode helper structure to hold poll-related state. 5168 */ 5169 void 5170 v_addpollinfo(struct vnode *vp) 5171 { 5172 struct vpollinfo *vi; 5173 5174 if (vp->v_pollinfo != NULL) 5175 return; 5176 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 5177 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 5178 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 5179 vfs_knlunlock, vfs_knl_assert_lock); 5180 VI_LOCK(vp); 5181 if (vp->v_pollinfo != NULL) { 5182 VI_UNLOCK(vp); 5183 destroy_vpollinfo_free(vi); 5184 return; 5185 } 5186 vp->v_pollinfo = vi; 5187 VI_UNLOCK(vp); 5188 } 5189 5190 /* 5191 * Record a process's interest in events which might happen to 5192 * a vnode. Because poll uses the historic select-style interface 5193 * internally, this routine serves as both the ``check for any 5194 * pending events'' and the ``record my interest in future events'' 5195 * functions. (These are done together, while the lock is held, 5196 * to avoid race conditions.) 5197 */ 5198 int 5199 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 5200 { 5201 5202 v_addpollinfo(vp); 5203 mtx_lock(&vp->v_pollinfo->vpi_lock); 5204 if (vp->v_pollinfo->vpi_revents & events) { 5205 /* 5206 * This leaves events we are not interested 5207 * in available for the other process which 5208 * which presumably had requested them 5209 * (otherwise they would never have been 5210 * recorded). 5211 */ 5212 events &= vp->v_pollinfo->vpi_revents; 5213 vp->v_pollinfo->vpi_revents &= ~events; 5214 5215 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5216 return (events); 5217 } 5218 vp->v_pollinfo->vpi_events |= events; 5219 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5220 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5221 return (0); 5222 } 5223 5224 /* 5225 * Routine to create and manage a filesystem syncer vnode. 5226 */ 5227 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5228 static int sync_fsync(struct vop_fsync_args *); 5229 static int sync_inactive(struct vop_inactive_args *); 5230 static int sync_reclaim(struct vop_reclaim_args *); 5231 5232 static struct vop_vector sync_vnodeops = { 5233 .vop_bypass = VOP_EOPNOTSUPP, 5234 .vop_close = sync_close, 5235 .vop_fsync = sync_fsync, 5236 .vop_getwritemount = vop_stdgetwritemount, 5237 .vop_inactive = sync_inactive, 5238 .vop_need_inactive = vop_stdneed_inactive, 5239 .vop_reclaim = sync_reclaim, 5240 .vop_lock1 = vop_stdlock, 5241 .vop_unlock = vop_stdunlock, 5242 .vop_islocked = vop_stdislocked, 5243 .vop_fplookup_vexec = VOP_EAGAIN, 5244 .vop_fplookup_symlink = VOP_EAGAIN, 5245 }; 5246 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5247 5248 /* 5249 * Create a new filesystem syncer vnode for the specified mount point. 
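 * Typically called from a filesystem's mount path once the mount is known
 * to be read-write, e.g. (illustrative):
 *
 *    if ((mp->mnt_flag & MNT_RDONLY) == 0)
 *            vfs_allocate_syncvnode(mp);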
5250 */ 5251 void 5252 vfs_allocate_syncvnode(struct mount *mp) 5253 { 5254 struct vnode *vp; 5255 struct bufobj *bo; 5256 static long start, incr, next; 5257 int error; 5258 5259 /* Allocate a new vnode */ 5260 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5261 if (error != 0) 5262 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5263 vp->v_type = VNON; 5264 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5265 vp->v_vflag |= VV_FORCEINSMQ; 5266 error = insmntque1(vp, mp); 5267 if (error != 0) 5268 panic("vfs_allocate_syncvnode: insmntque() failed"); 5269 vp->v_vflag &= ~VV_FORCEINSMQ; 5270 vn_set_state(vp, VSTATE_CONSTRUCTED); 5271 VOP_UNLOCK(vp); 5272 /* 5273 * Place the vnode onto the syncer worklist. We attempt to 5274 * scatter them about on the list so that they will go off 5275 * at evenly distributed times even if all the filesystems 5276 * are mounted at once. 5277 */ 5278 next += incr; 5279 if (next == 0 || next > syncer_maxdelay) { 5280 start /= 2; 5281 incr /= 2; 5282 if (start == 0) { 5283 start = syncer_maxdelay / 2; 5284 incr = syncer_maxdelay; 5285 } 5286 next = start; 5287 } 5288 bo = &vp->v_bufobj; 5289 BO_LOCK(bo); 5290 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5291 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 5292 mtx_lock(&sync_mtx); 5293 sync_vnode_count++; 5294 if (mp->mnt_syncer == NULL) { 5295 mp->mnt_syncer = vp; 5296 vp = NULL; 5297 } 5298 mtx_unlock(&sync_mtx); 5299 BO_UNLOCK(bo); 5300 if (vp != NULL) { 5301 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5302 vgone(vp); 5303 vput(vp); 5304 } 5305 } 5306 5307 void 5308 vfs_deallocate_syncvnode(struct mount *mp) 5309 { 5310 struct vnode *vp; 5311 5312 mtx_lock(&sync_mtx); 5313 vp = mp->mnt_syncer; 5314 if (vp != NULL) 5315 mp->mnt_syncer = NULL; 5316 mtx_unlock(&sync_mtx); 5317 if (vp != NULL) 5318 vrele(vp); 5319 } 5320 5321 /* 5322 * Do a lazy sync of the filesystem. 5323 */ 5324 static int 5325 sync_fsync(struct vop_fsync_args *ap) 5326 { 5327 struct vnode *syncvp = ap->a_vp; 5328 struct mount *mp = syncvp->v_mount; 5329 int error, save; 5330 struct bufobj *bo; 5331 5332 /* 5333 * We only need to do something if this is a lazy evaluation. 5334 */ 5335 if (ap->a_waitfor != MNT_LAZY) 5336 return (0); 5337 5338 /* 5339 * Move ourselves to the back of the sync list. 5340 */ 5341 bo = &syncvp->v_bufobj; 5342 BO_LOCK(bo); 5343 vn_syncer_add_to_worklist(bo, syncdelay); 5344 BO_UNLOCK(bo); 5345 5346 /* 5347 * Walk the list of vnodes pushing all that are dirty and 5348 * not already on the sync list. 5349 */ 5350 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5351 return (0); 5352 VOP_UNLOCK(syncvp); 5353 save = curthread_pflags_set(TDP_SYNCIO); 5354 /* 5355 * The filesystem at hand may be idle with free vnodes stored in the 5356 * batch. Return them instead of letting them stay there indefinitely. 5357 */ 5358 vfs_periodic(mp, MNT_NOWAIT); 5359 error = VFS_SYNC(mp, MNT_LAZY); 5360 curthread_pflags_restore(save); 5361 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); 5362 vfs_unbusy(mp); 5363 return (error); 5364 } 5365 5366 /* 5367 * The syncer vnode is no referenced. 5368 */ 5369 static int 5370 sync_inactive(struct vop_inactive_args *ap) 5371 { 5372 5373 vgone(ap->a_vp); 5374 return (0); 5375 } 5376 5377 /* 5378 * The syncer vnode is no longer needed and is being decommissioned. 5379 * 5380 * Modifications to the worklist must be protected by sync_mtx. 
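 * The lock order used below (and by vfs_allocate_syncvnode() above) is:
 *
 *    BO_LOCK(bo);
 *    mtx_lock(&sync_mtx);
 *    ... adjust bo_synclist membership and mnt_syncer ...
 *    mtx_unlock(&sync_mtx);
 *    BO_UNLOCK(bo);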
5381 */ 5382 static int 5383 sync_reclaim(struct vop_reclaim_args *ap) 5384 { 5385 struct vnode *vp = ap->a_vp; 5386 struct bufobj *bo; 5387 5388 bo = &vp->v_bufobj; 5389 BO_LOCK(bo); 5390 mtx_lock(&sync_mtx); 5391 if (vp->v_mount->mnt_syncer == vp) 5392 vp->v_mount->mnt_syncer = NULL; 5393 if (bo->bo_flag & BO_ONWORKLST) { 5394 LIST_REMOVE(bo, bo_synclist); 5395 syncer_worklist_len--; 5396 sync_vnode_count--; 5397 bo->bo_flag &= ~BO_ONWORKLST; 5398 } 5399 mtx_unlock(&sync_mtx); 5400 BO_UNLOCK(bo); 5401 5402 return (0); 5403 } 5404 5405 int 5406 vn_need_pageq_flush(struct vnode *vp) 5407 { 5408 struct vm_object *obj; 5409 5410 obj = vp->v_object; 5411 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5412 vm_object_mightbedirty(obj)); 5413 } 5414 5415 /* 5416 * Check if vnode represents a disk device 5417 */ 5418 bool 5419 vn_isdisk_error(struct vnode *vp, int *errp) 5420 { 5421 int error; 5422 5423 if (vp->v_type != VCHR) { 5424 error = ENOTBLK; 5425 goto out; 5426 } 5427 error = 0; 5428 dev_lock(); 5429 if (vp->v_rdev == NULL) 5430 error = ENXIO; 5431 else if (vp->v_rdev->si_devsw == NULL) 5432 error = ENXIO; 5433 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5434 error = ENOTBLK; 5435 dev_unlock(); 5436 out: 5437 *errp = error; 5438 return (error == 0); 5439 } 5440 5441 bool 5442 vn_isdisk(struct vnode *vp) 5443 { 5444 int error; 5445 5446 return (vn_isdisk_error(vp, &error)); 5447 } 5448 5449 /* 5450 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5451 * the comment above cache_fplookup for details. 5452 */ 5453 int 5454 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5455 { 5456 int error; 5457 5458 VFS_SMR_ASSERT_ENTERED(); 5459 5460 /* Check the owner. */ 5461 if (cred->cr_uid == file_uid) { 5462 if (file_mode & S_IXUSR) 5463 return (0); 5464 goto out_error; 5465 } 5466 5467 /* Otherwise, check the groups (first match) */ 5468 if (groupmember(file_gid, cred)) { 5469 if (file_mode & S_IXGRP) 5470 return (0); 5471 goto out_error; 5472 } 5473 5474 /* Otherwise, check everyone else. */ 5475 if (file_mode & S_IXOTH) 5476 return (0); 5477 out_error: 5478 /* 5479 * Permission check failed, but it is possible denial will get overwritten 5480 * (e.g., when root is traversing through a 700 directory owned by someone 5481 * else). 5482 * 5483 * vaccess() calls priv_check_cred which in turn can descent into MAC 5484 * modules overriding this result. It's quite unclear what semantics 5485 * are allowed for them to operate, thus for safety we don't call them 5486 * from within the SMR section. This also means if any such modules 5487 * are present, we have to let the regular lookup decide. 5488 */ 5489 error = priv_check_cred_vfs_lookup_nomac(cred); 5490 switch (error) { 5491 case 0: 5492 return (0); 5493 case EAGAIN: 5494 /* 5495 * MAC modules present. 5496 */ 5497 return (EAGAIN); 5498 case EPERM: 5499 return (EACCES); 5500 default: 5501 return (error); 5502 } 5503 } 5504 5505 /* 5506 * Common filesystem object access control check routine. Accepts a 5507 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5508 * Returns 0 on success, or an errno on failure. 
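 * A filesystem's VOP_ACCESS() implementation typically ends by delegating
 * here; a sketch, with "node" standing in for the filesystem's private
 * inode data (illustrative only):
 *
 *    return (vaccess(vp->v_type, node->mode, node->uid, node->gid,
 *        ap->a_accmode, ap->a_cred));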
5509 */ 5510 int 5511 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5512 accmode_t accmode, struct ucred *cred) 5513 { 5514 accmode_t dac_granted; 5515 accmode_t priv_granted; 5516 5517 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5518 ("invalid bit in accmode")); 5519 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5520 ("VAPPEND without VWRITE")); 5521 5522 /* 5523 * Look for a normal, non-privileged way to access the file/directory 5524 * as requested. If it exists, go with that. 5525 */ 5526 5527 dac_granted = 0; 5528 5529 /* Check the owner. */ 5530 if (cred->cr_uid == file_uid) { 5531 dac_granted |= VADMIN; 5532 if (file_mode & S_IXUSR) 5533 dac_granted |= VEXEC; 5534 if (file_mode & S_IRUSR) 5535 dac_granted |= VREAD; 5536 if (file_mode & S_IWUSR) 5537 dac_granted |= (VWRITE | VAPPEND); 5538 5539 if ((accmode & dac_granted) == accmode) 5540 return (0); 5541 5542 goto privcheck; 5543 } 5544 5545 /* Otherwise, check the groups (first match) */ 5546 if (groupmember(file_gid, cred)) { 5547 if (file_mode & S_IXGRP) 5548 dac_granted |= VEXEC; 5549 if (file_mode & S_IRGRP) 5550 dac_granted |= VREAD; 5551 if (file_mode & S_IWGRP) 5552 dac_granted |= (VWRITE | VAPPEND); 5553 5554 if ((accmode & dac_granted) == accmode) 5555 return (0); 5556 5557 goto privcheck; 5558 } 5559 5560 /* Otherwise, check everyone else. */ 5561 if (file_mode & S_IXOTH) 5562 dac_granted |= VEXEC; 5563 if (file_mode & S_IROTH) 5564 dac_granted |= VREAD; 5565 if (file_mode & S_IWOTH) 5566 dac_granted |= (VWRITE | VAPPEND); 5567 if ((accmode & dac_granted) == accmode) 5568 return (0); 5569 5570 privcheck: 5571 /* 5572 * Build a privilege mask to determine if the set of privileges 5573 * satisfies the requirements when combined with the granted mask 5574 * from above. For each privilege, if the privilege is required, 5575 * bitwise or the request type onto the priv_granted mask. 5576 */ 5577 priv_granted = 0; 5578 5579 if (type == VDIR) { 5580 /* 5581 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5582 * requests, instead of PRIV_VFS_EXEC. 5583 */ 5584 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5585 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5586 priv_granted |= VEXEC; 5587 } else { 5588 /* 5589 * Ensure that at least one execute bit is on. Otherwise, 5590 * a privileged user will always succeed, and we don't want 5591 * this to happen unless the file really is executable. 5592 */ 5593 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5594 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5595 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5596 priv_granted |= VEXEC; 5597 } 5598 5599 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5600 !priv_check_cred(cred, PRIV_VFS_READ)) 5601 priv_granted |= VREAD; 5602 5603 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5604 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5605 priv_granted |= (VWRITE | VAPPEND); 5606 5607 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5608 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5609 priv_granted |= VADMIN; 5610 5611 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5612 return (0); 5613 } 5614 5615 return ((accmode & VADMIN) ? EPERM : EACCES); 5616 } 5617 5618 /* 5619 * Credential check based on process requesting service, and per-attribute 5620 * permissions. 
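 * Extended attribute VOPs are expected to call this before touching the
 * attribute, e.g. (illustrative; VREAD for get/list, VWRITE for set/delete):
 *
 *    error = extattr_check_cred(vp, attrnamespace, cred, td, VREAD);
 *    if (error != 0)
 *            return (error);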
5621 */ 5622 int 5623 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5624 struct thread *td, accmode_t accmode) 5625 { 5626 5627 /* 5628 * Kernel-invoked always succeeds. 5629 */ 5630 if (cred == NOCRED) 5631 return (0); 5632 5633 /* 5634 * Do not allow privileged processes in jail to directly manipulate 5635 * system attributes. 5636 */ 5637 switch (attrnamespace) { 5638 case EXTATTR_NAMESPACE_SYSTEM: 5639 /* Potentially should be: return (EPERM); */ 5640 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5641 case EXTATTR_NAMESPACE_USER: 5642 return (VOP_ACCESS(vp, accmode, cred, td)); 5643 default: 5644 return (EPERM); 5645 } 5646 } 5647 5648 #ifdef DEBUG_VFS_LOCKS 5649 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5650 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5651 "Drop into debugger on lock violation"); 5652 5653 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5654 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5655 0, "Check for interlock across VOPs"); 5656 5657 int vfs_badlock_print = 1; /* Print lock violations. */ 5658 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5659 0, "Print lock violations"); 5660 5661 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5662 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5663 0, "Print vnode details on lock violations"); 5664 5665 #ifdef KDB 5666 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5667 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5668 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5669 #endif 5670 5671 static void 5672 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5673 { 5674 5675 #ifdef KDB 5676 if (vfs_badlock_backtrace) 5677 kdb_backtrace(); 5678 #endif 5679 if (vfs_badlock_vnode) 5680 vn_printf(vp, "vnode "); 5681 if (vfs_badlock_print) 5682 printf("%s: %p %s\n", str, (void *)vp, msg); 5683 if (vfs_badlock_ddb) 5684 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5685 } 5686 5687 void 5688 assert_vi_locked(struct vnode *vp, const char *str) 5689 { 5690 5691 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5692 vfs_badlock("interlock is not locked but should be", str, vp); 5693 } 5694 5695 void 5696 assert_vi_unlocked(struct vnode *vp, const char *str) 5697 { 5698 5699 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5700 vfs_badlock("interlock is locked but should not be", str, vp); 5701 } 5702 5703 void 5704 assert_vop_locked(struct vnode *vp, const char *str) 5705 { 5706 if (KERNEL_PANICKED() || vp == NULL) 5707 return; 5708 5709 #ifdef WITNESS 5710 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5711 witness_is_owned(&vp->v_vnlock->lock_object) == -1) 5712 #else 5713 int locked = VOP_ISLOCKED(vp); 5714 if (locked == 0 || locked == LK_EXCLOTHER) 5715 #endif 5716 vfs_badlock("is not locked but should be", str, vp); 5717 } 5718 5719 void 5720 assert_vop_unlocked(struct vnode *vp, const char *str) 5721 { 5722 if (KERNEL_PANICKED() || vp == NULL) 5723 return; 5724 5725 #ifdef WITNESS 5726 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5727 witness_is_owned(&vp->v_vnlock->lock_object) == 1) 5728 #else 5729 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5730 #endif 5731 vfs_badlock("is locked but should not be", str, vp); 5732 } 5733 5734 void 5735 assert_vop_elocked(struct vnode *vp, const char *str) 5736 { 5737 if (KERNEL_PANICKED() || vp == 
NULL) 5738 return; 5739 5740 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5741 vfs_badlock("is not exclusive locked but should be", str, vp); 5742 } 5743 #endif /* DEBUG_VFS_LOCKS */ 5744 5745 void 5746 vop_rename_fail(struct vop_rename_args *ap) 5747 { 5748 5749 if (ap->a_tvp != NULL) 5750 vput(ap->a_tvp); 5751 if (ap->a_tdvp == ap->a_tvp) 5752 vrele(ap->a_tdvp); 5753 else 5754 vput(ap->a_tdvp); 5755 vrele(ap->a_fdvp); 5756 vrele(ap->a_fvp); 5757 } 5758 5759 void 5760 vop_rename_pre(void *ap) 5761 { 5762 struct vop_rename_args *a = ap; 5763 5764 #ifdef DEBUG_VFS_LOCKS 5765 if (a->a_tvp) 5766 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5767 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5768 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5769 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5770 5771 /* Check the source (from). */ 5772 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5773 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5774 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5775 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5776 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5777 5778 /* Check the target. */ 5779 if (a->a_tvp) 5780 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5781 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5782 #endif 5783 /* 5784 * It may be tempting to add vn_seqc_write_begin/end calls here and 5785 * in vop_rename_post but that's not going to work out since some 5786 * filesystems relookup vnodes mid-rename. This is probably a bug. 5787 * 5788 * For now filesystems are expected to do the relevant calls after they 5789 * decide what vnodes to operate on. 5790 */ 5791 if (a->a_tdvp != a->a_fdvp) 5792 vhold(a->a_fdvp); 5793 if (a->a_tvp != a->a_fvp) 5794 vhold(a->a_fvp); 5795 vhold(a->a_tdvp); 5796 if (a->a_tvp) 5797 vhold(a->a_tvp); 5798 } 5799 5800 #ifdef DEBUG_VFS_LOCKS 5801 void 5802 vop_fplookup_vexec_debugpre(void *ap __unused) 5803 { 5804 5805 VFS_SMR_ASSERT_ENTERED(); 5806 } 5807 5808 void 5809 vop_fplookup_vexec_debugpost(void *ap, int rc) 5810 { 5811 struct vop_fplookup_vexec_args *a; 5812 struct vnode *vp; 5813 5814 a = ap; 5815 vp = a->a_vp; 5816 5817 VFS_SMR_ASSERT_ENTERED(); 5818 if (rc == EOPNOTSUPP) 5819 VNPASS(VN_IS_DOOMED(vp), vp); 5820 } 5821 5822 void 5823 vop_fplookup_symlink_debugpre(void *ap __unused) 5824 { 5825 5826 VFS_SMR_ASSERT_ENTERED(); 5827 } 5828 5829 void 5830 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5831 { 5832 5833 VFS_SMR_ASSERT_ENTERED(); 5834 } 5835 5836 static void 5837 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5838 { 5839 if (vp->v_type == VCHR) 5840 ; 5841 else if (MNT_EXTENDED_SHARED(vp->v_mount)) 5842 ASSERT_VOP_LOCKED(vp, name); 5843 else 5844 ASSERT_VOP_ELOCKED(vp, name); 5845 } 5846 5847 void 5848 vop_fsync_debugpre(void *a) 5849 { 5850 struct vop_fsync_args *ap; 5851 5852 ap = a; 5853 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5854 } 5855 5856 void 5857 vop_fsync_debugpost(void *a, int rc __unused) 5858 { 5859 struct vop_fsync_args *ap; 5860 5861 ap = a; 5862 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5863 } 5864 5865 void 5866 vop_fdatasync_debugpre(void *a) 5867 { 5868 struct vop_fdatasync_args *ap; 5869 5870 ap = a; 5871 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5872 } 5873 5874 void 5875 vop_fdatasync_debugpost(void *a, int rc __unused) 5876 { 5877 struct vop_fdatasync_args *ap; 5878 5879 ap = a; 5880 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5881 } 5882 5883 void 5884 
vop_strategy_debugpre(void *ap) 5885 { 5886 struct vop_strategy_args *a; 5887 struct buf *bp; 5888 5889 a = ap; 5890 bp = a->a_bp; 5891 5892 /* 5893 * Cluster ops lock their component buffers but not the IO container. 5894 */ 5895 if ((bp->b_flags & B_CLUSTER) != 0) 5896 return; 5897 5898 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5899 if (vfs_badlock_print) 5900 printf( 5901 "VOP_STRATEGY: bp is not locked but should be\n"); 5902 if (vfs_badlock_ddb) 5903 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5904 } 5905 } 5906 5907 void 5908 vop_lock_debugpre(void *ap) 5909 { 5910 struct vop_lock1_args *a = ap; 5911 5912 if ((a->a_flags & LK_INTERLOCK) == 0) 5913 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5914 else 5915 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5916 } 5917 5918 void 5919 vop_lock_debugpost(void *ap, int rc) 5920 { 5921 struct vop_lock1_args *a = ap; 5922 5923 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5924 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5925 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5926 } 5927 5928 void 5929 vop_unlock_debugpre(void *ap) 5930 { 5931 struct vop_unlock_args *a = ap; 5932 struct vnode *vp = a->a_vp; 5933 5934 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 5935 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 5936 } 5937 5938 void 5939 vop_need_inactive_debugpre(void *ap) 5940 { 5941 struct vop_need_inactive_args *a = ap; 5942 5943 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5944 } 5945 5946 void 5947 vop_need_inactive_debugpost(void *ap, int rc) 5948 { 5949 struct vop_need_inactive_args *a = ap; 5950 5951 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5952 } 5953 #endif 5954 5955 void 5956 vop_create_pre(void *ap) 5957 { 5958 struct vop_create_args *a; 5959 struct vnode *dvp; 5960 5961 a = ap; 5962 dvp = a->a_dvp; 5963 vn_seqc_write_begin(dvp); 5964 } 5965 5966 void 5967 vop_create_post(void *ap, int rc) 5968 { 5969 struct vop_create_args *a; 5970 struct vnode *dvp; 5971 5972 a = ap; 5973 dvp = a->a_dvp; 5974 vn_seqc_write_end(dvp); 5975 if (!rc) 5976 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5977 } 5978 5979 void 5980 vop_whiteout_pre(void *ap) 5981 { 5982 struct vop_whiteout_args *a; 5983 struct vnode *dvp; 5984 5985 a = ap; 5986 dvp = a->a_dvp; 5987 vn_seqc_write_begin(dvp); 5988 } 5989 5990 void 5991 vop_whiteout_post(void *ap, int rc) 5992 { 5993 struct vop_whiteout_args *a; 5994 struct vnode *dvp; 5995 5996 a = ap; 5997 dvp = a->a_dvp; 5998 vn_seqc_write_end(dvp); 5999 } 6000 6001 void 6002 vop_deleteextattr_pre(void *ap) 6003 { 6004 struct vop_deleteextattr_args *a; 6005 struct vnode *vp; 6006 6007 a = ap; 6008 vp = a->a_vp; 6009 vn_seqc_write_begin(vp); 6010 } 6011 6012 void 6013 vop_deleteextattr_post(void *ap, int rc) 6014 { 6015 struct vop_deleteextattr_args *a; 6016 struct vnode *vp; 6017 6018 a = ap; 6019 vp = a->a_vp; 6020 vn_seqc_write_end(vp); 6021 if (!rc) 6022 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 6023 } 6024 6025 void 6026 vop_link_pre(void *ap) 6027 { 6028 struct vop_link_args *a; 6029 struct vnode *vp, *tdvp; 6030 6031 a = ap; 6032 vp = a->a_vp; 6033 tdvp = a->a_tdvp; 6034 vn_seqc_write_begin(vp); 6035 vn_seqc_write_begin(tdvp); 6036 } 6037 6038 void 6039 vop_link_post(void *ap, int rc) 6040 { 6041 struct vop_link_args *a; 6042 struct vnode *vp, *tdvp; 6043 6044 a = ap; 6045 vp = a->a_vp; 6046 tdvp = a->a_tdvp; 6047 vn_seqc_write_end(vp); 6048 vn_seqc_write_end(tdvp); 6049 if (!rc) { 6050 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 6051 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 6052 } 6053 } 6054 6055 void 6056 vop_mkdir_pre(void *ap) 6057 { 6058 struct 
vop_mkdir_args *a; 6059 struct vnode *dvp; 6060 6061 a = ap; 6062 dvp = a->a_dvp; 6063 vn_seqc_write_begin(dvp); 6064 } 6065 6066 void 6067 vop_mkdir_post(void *ap, int rc) 6068 { 6069 struct vop_mkdir_args *a; 6070 struct vnode *dvp; 6071 6072 a = ap; 6073 dvp = a->a_dvp; 6074 vn_seqc_write_end(dvp); 6075 if (!rc) 6076 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6077 } 6078 6079 #ifdef DEBUG_VFS_LOCKS 6080 void 6081 vop_mkdir_debugpost(void *ap, int rc) 6082 { 6083 struct vop_mkdir_args *a; 6084 6085 a = ap; 6086 if (!rc) 6087 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 6088 } 6089 #endif 6090 6091 void 6092 vop_mknod_pre(void *ap) 6093 { 6094 struct vop_mknod_args *a; 6095 struct vnode *dvp; 6096 6097 a = ap; 6098 dvp = a->a_dvp; 6099 vn_seqc_write_begin(dvp); 6100 } 6101 6102 void 6103 vop_mknod_post(void *ap, int rc) 6104 { 6105 struct vop_mknod_args *a; 6106 struct vnode *dvp; 6107 6108 a = ap; 6109 dvp = a->a_dvp; 6110 vn_seqc_write_end(dvp); 6111 if (!rc) 6112 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6113 } 6114 6115 void 6116 vop_reclaim_post(void *ap, int rc) 6117 { 6118 struct vop_reclaim_args *a; 6119 struct vnode *vp; 6120 6121 a = ap; 6122 vp = a->a_vp; 6123 ASSERT_VOP_IN_SEQC(vp); 6124 if (!rc) 6125 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 6126 } 6127 6128 void 6129 vop_remove_pre(void *ap) 6130 { 6131 struct vop_remove_args *a; 6132 struct vnode *dvp, *vp; 6133 6134 a = ap; 6135 dvp = a->a_dvp; 6136 vp = a->a_vp; 6137 vn_seqc_write_begin(dvp); 6138 vn_seqc_write_begin(vp); 6139 } 6140 6141 void 6142 vop_remove_post(void *ap, int rc) 6143 { 6144 struct vop_remove_args *a; 6145 struct vnode *dvp, *vp; 6146 6147 a = ap; 6148 dvp = a->a_dvp; 6149 vp = a->a_vp; 6150 vn_seqc_write_end(dvp); 6151 vn_seqc_write_end(vp); 6152 if (!rc) { 6153 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6154 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6155 } 6156 } 6157 6158 void 6159 vop_rename_post(void *ap, int rc) 6160 { 6161 struct vop_rename_args *a = ap; 6162 long hint; 6163 6164 if (!rc) { 6165 hint = NOTE_WRITE; 6166 if (a->a_fdvp == a->a_tdvp) { 6167 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 6168 hint |= NOTE_LINK; 6169 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6170 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6171 } else { 6172 hint |= NOTE_EXTEND; 6173 if (a->a_fvp->v_type == VDIR) 6174 hint |= NOTE_LINK; 6175 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 6176 6177 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 6178 a->a_tvp->v_type == VDIR) 6179 hint &= ~NOTE_LINK; 6180 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 6181 } 6182 6183 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 6184 if (a->a_tvp) 6185 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 6186 } 6187 if (a->a_tdvp != a->a_fdvp) 6188 vdrop(a->a_fdvp); 6189 if (a->a_tvp != a->a_fvp) 6190 vdrop(a->a_fvp); 6191 vdrop(a->a_tdvp); 6192 if (a->a_tvp) 6193 vdrop(a->a_tvp); 6194 } 6195 6196 void 6197 vop_rmdir_pre(void *ap) 6198 { 6199 struct vop_rmdir_args *a; 6200 struct vnode *dvp, *vp; 6201 6202 a = ap; 6203 dvp = a->a_dvp; 6204 vp = a->a_vp; 6205 vn_seqc_write_begin(dvp); 6206 vn_seqc_write_begin(vp); 6207 } 6208 6209 void 6210 vop_rmdir_post(void *ap, int rc) 6211 { 6212 struct vop_rmdir_args *a; 6213 struct vnode *dvp, *vp; 6214 6215 a = ap; 6216 dvp = a->a_dvp; 6217 vp = a->a_vp; 6218 vn_seqc_write_end(dvp); 6219 vn_seqc_write_end(vp); 6220 if (!rc) { 6221 vp->v_vflag |= VV_UNLINKED; 6222 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6223 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6224 } 6225 } 6226 6227 void 6228 vop_setattr_pre(void *ap) 6229 { 6230 struct vop_setattr_args *a; 6231 
struct vnode *vp; 6232 6233 a = ap; 6234 vp = a->a_vp; 6235 vn_seqc_write_begin(vp); 6236 } 6237 6238 void 6239 vop_setattr_post(void *ap, int rc) 6240 { 6241 struct vop_setattr_args *a; 6242 struct vnode *vp; 6243 6244 a = ap; 6245 vp = a->a_vp; 6246 vn_seqc_write_end(vp); 6247 if (!rc) 6248 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6249 } 6250 6251 void 6252 vop_setacl_pre(void *ap) 6253 { 6254 struct vop_setacl_args *a; 6255 struct vnode *vp; 6256 6257 a = ap; 6258 vp = a->a_vp; 6259 vn_seqc_write_begin(vp); 6260 } 6261 6262 void 6263 vop_setacl_post(void *ap, int rc __unused) 6264 { 6265 struct vop_setacl_args *a; 6266 struct vnode *vp; 6267 6268 a = ap; 6269 vp = a->a_vp; 6270 vn_seqc_write_end(vp); 6271 } 6272 6273 void 6274 vop_setextattr_pre(void *ap) 6275 { 6276 struct vop_setextattr_args *a; 6277 struct vnode *vp; 6278 6279 a = ap; 6280 vp = a->a_vp; 6281 vn_seqc_write_begin(vp); 6282 } 6283 6284 void 6285 vop_setextattr_post(void *ap, int rc) 6286 { 6287 struct vop_setextattr_args *a; 6288 struct vnode *vp; 6289 6290 a = ap; 6291 vp = a->a_vp; 6292 vn_seqc_write_end(vp); 6293 if (!rc) 6294 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6295 } 6296 6297 void 6298 vop_symlink_pre(void *ap) 6299 { 6300 struct vop_symlink_args *a; 6301 struct vnode *dvp; 6302 6303 a = ap; 6304 dvp = a->a_dvp; 6305 vn_seqc_write_begin(dvp); 6306 } 6307 6308 void 6309 vop_symlink_post(void *ap, int rc) 6310 { 6311 struct vop_symlink_args *a; 6312 struct vnode *dvp; 6313 6314 a = ap; 6315 dvp = a->a_dvp; 6316 vn_seqc_write_end(dvp); 6317 if (!rc) 6318 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6319 } 6320 6321 void 6322 vop_open_post(void *ap, int rc) 6323 { 6324 struct vop_open_args *a = ap; 6325 6326 if (!rc) 6327 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6328 } 6329 6330 void 6331 vop_close_post(void *ap, int rc) 6332 { 6333 struct vop_close_args *a = ap; 6334 6335 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6336 !VN_IS_DOOMED(a->a_vp))) { 6337 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6338 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6339 } 6340 } 6341 6342 void 6343 vop_read_post(void *ap, int rc) 6344 { 6345 struct vop_read_args *a = ap; 6346 6347 if (!rc) 6348 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6349 } 6350 6351 void 6352 vop_read_pgcache_post(void *ap, int rc) 6353 { 6354 struct vop_read_pgcache_args *a = ap; 6355 6356 if (!rc) 6357 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6358 } 6359 6360 void 6361 vop_readdir_post(void *ap, int rc) 6362 { 6363 struct vop_readdir_args *a = ap; 6364 6365 if (!rc) 6366 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6367 } 6368 6369 static struct knlist fs_knlist; 6370 6371 static void 6372 vfs_event_init(void *arg) 6373 { 6374 knlist_init_mtx(&fs_knlist, NULL); 6375 } 6376 /* XXX - correct order? 
*/ 6377 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6378 6379 void 6380 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6381 { 6382 6383 KNOTE_UNLOCKED(&fs_knlist, event); 6384 } 6385 6386 static int filt_fsattach(struct knote *kn); 6387 static void filt_fsdetach(struct knote *kn); 6388 static int filt_fsevent(struct knote *kn, long hint); 6389 6390 struct filterops fs_filtops = { 6391 .f_isfd = 0, 6392 .f_attach = filt_fsattach, 6393 .f_detach = filt_fsdetach, 6394 .f_event = filt_fsevent 6395 }; 6396 6397 static int 6398 filt_fsattach(struct knote *kn) 6399 { 6400 6401 kn->kn_flags |= EV_CLEAR; 6402 knlist_add(&fs_knlist, kn, 0); 6403 return (0); 6404 } 6405 6406 static void 6407 filt_fsdetach(struct knote *kn) 6408 { 6409 6410 knlist_remove(&fs_knlist, kn, 0); 6411 } 6412 6413 static int 6414 filt_fsevent(struct knote *kn, long hint) 6415 { 6416 6417 kn->kn_fflags |= kn->kn_sfflags & hint; 6418 6419 return (kn->kn_fflags != 0); 6420 } 6421 6422 static int 6423 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6424 { 6425 struct vfsidctl vc; 6426 int error; 6427 struct mount *mp; 6428 6429 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6430 if (error) 6431 return (error); 6432 if (vc.vc_vers != VFS_CTL_VERS1) 6433 return (EINVAL); 6434 mp = vfs_getvfs(&vc.vc_fsid); 6435 if (mp == NULL) 6436 return (ENOENT); 6437 /* ensure that a specific sysctl goes to the right filesystem. */ 6438 if (strcmp(vc.vc_fstypename, "*") != 0 && 6439 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6440 vfs_rel(mp); 6441 return (EINVAL); 6442 } 6443 VCTLTOREQ(&vc, req); 6444 error = VFS_SYSCTL(mp, vc.vc_op, req); 6445 vfs_rel(mp); 6446 return (error); 6447 } 6448 6449 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6450 NULL, 0, sysctl_vfs_ctl, "", 6451 "Sysctl by fsid"); 6452 6453 /* 6454 * Function to initialize a va_filerev field sensibly. 6455 * XXX: Wouldn't a random number make a lot more sense ?? 
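 * The returned value packs the boot-relative uptime: seconds in the
 * upper 32 bits and the most significant half of the sub-second
 * fraction in the lower 32, so it is monotonically non-decreasing
 * within a single boot.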
6456 */ 6457 u_quad_t 6458 init_va_filerev(void) 6459 { 6460 struct bintime bt; 6461 6462 getbinuptime(&bt); 6463 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6464 } 6465 6466 static int filt_vfsread(struct knote *kn, long hint); 6467 static int filt_vfswrite(struct knote *kn, long hint); 6468 static int filt_vfsvnode(struct knote *kn, long hint); 6469 static void filt_vfsdetach(struct knote *kn); 6470 static struct filterops vfsread_filtops = { 6471 .f_isfd = 1, 6472 .f_detach = filt_vfsdetach, 6473 .f_event = filt_vfsread 6474 }; 6475 static struct filterops vfswrite_filtops = { 6476 .f_isfd = 1, 6477 .f_detach = filt_vfsdetach, 6478 .f_event = filt_vfswrite 6479 }; 6480 static struct filterops vfsvnode_filtops = { 6481 .f_isfd = 1, 6482 .f_detach = filt_vfsdetach, 6483 .f_event = filt_vfsvnode 6484 }; 6485 6486 static void 6487 vfs_knllock(void *arg) 6488 { 6489 struct vnode *vp = arg; 6490 6491 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6492 } 6493 6494 static void 6495 vfs_knlunlock(void *arg) 6496 { 6497 struct vnode *vp = arg; 6498 6499 VOP_UNLOCK(vp); 6500 } 6501 6502 static void 6503 vfs_knl_assert_lock(void *arg, int what) 6504 { 6505 #ifdef DEBUG_VFS_LOCKS 6506 struct vnode *vp = arg; 6507 6508 if (what == LA_LOCKED) 6509 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6510 else 6511 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6512 #endif 6513 } 6514 6515 int 6516 vfs_kqfilter(struct vop_kqfilter_args *ap) 6517 { 6518 struct vnode *vp = ap->a_vp; 6519 struct knote *kn = ap->a_kn; 6520 struct knlist *knl; 6521 6522 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6523 kn->kn_filter != EVFILT_WRITE), 6524 ("READ/WRITE filter on a FIFO leaked through")); 6525 switch (kn->kn_filter) { 6526 case EVFILT_READ: 6527 kn->kn_fop = &vfsread_filtops; 6528 break; 6529 case EVFILT_WRITE: 6530 kn->kn_fop = &vfswrite_filtops; 6531 break; 6532 case EVFILT_VNODE: 6533 kn->kn_fop = &vfsvnode_filtops; 6534 break; 6535 default: 6536 return (EINVAL); 6537 } 6538 6539 kn->kn_hook = (caddr_t)vp; 6540 6541 v_addpollinfo(vp); 6542 if (vp->v_pollinfo == NULL) 6543 return (ENOMEM); 6544 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6545 vhold(vp); 6546 knlist_add(knl, kn, 0); 6547 6548 return (0); 6549 } 6550 6551 /* 6552 * Detach knote from vnode 6553 */ 6554 static void 6555 filt_vfsdetach(struct knote *kn) 6556 { 6557 struct vnode *vp = (struct vnode *)kn->kn_hook; 6558 6559 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6560 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6561 vdrop(vp); 6562 } 6563 6564 /*ARGSUSED*/ 6565 static int 6566 filt_vfsread(struct knote *kn, long hint) 6567 { 6568 struct vnode *vp = (struct vnode *)kn->kn_hook; 6569 off_t size; 6570 int res; 6571 6572 /* 6573 * filesystem is gone, so set the EOF flag and schedule 6574 * the knote for deletion. 
6575 */ 6576 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6577 VI_LOCK(vp); 6578 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6579 VI_UNLOCK(vp); 6580 return (1); 6581 } 6582 6583 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6584 return (0); 6585 6586 VI_LOCK(vp); 6587 kn->kn_data = size - kn->kn_fp->f_offset; 6588 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6589 VI_UNLOCK(vp); 6590 return (res); 6591 } 6592 6593 /*ARGSUSED*/ 6594 static int 6595 filt_vfswrite(struct knote *kn, long hint) 6596 { 6597 struct vnode *vp = (struct vnode *)kn->kn_hook; 6598 6599 VI_LOCK(vp); 6600 6601 /* 6602 * filesystem is gone, so set the EOF flag and schedule 6603 * the knote for deletion. 6604 */ 6605 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6606 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6607 6608 kn->kn_data = 0; 6609 VI_UNLOCK(vp); 6610 return (1); 6611 } 6612 6613 static int 6614 filt_vfsvnode(struct knote *kn, long hint) 6615 { 6616 struct vnode *vp = (struct vnode *)kn->kn_hook; 6617 int res; 6618 6619 VI_LOCK(vp); 6620 if (kn->kn_sfflags & hint) 6621 kn->kn_fflags |= hint; 6622 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6623 kn->kn_flags |= EV_EOF; 6624 VI_UNLOCK(vp); 6625 return (1); 6626 } 6627 res = (kn->kn_fflags != 0); 6628 VI_UNLOCK(vp); 6629 return (res); 6630 } 6631 6632 int 6633 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6634 { 6635 int error; 6636 6637 if (dp->d_reclen > ap->a_uio->uio_resid) 6638 return (ENAMETOOLONG); 6639 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6640 if (error) { 6641 if (ap->a_ncookies != NULL) { 6642 if (ap->a_cookies != NULL) 6643 free(ap->a_cookies, M_TEMP); 6644 ap->a_cookies = NULL; 6645 *ap->a_ncookies = 0; 6646 } 6647 return (error); 6648 } 6649 if (ap->a_ncookies == NULL) 6650 return (0); 6651 6652 KASSERT(ap->a_cookies, 6653 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6654 6655 *ap->a_cookies = realloc(*ap->a_cookies, 6656 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6657 (*ap->a_cookies)[*ap->a_ncookies] = off; 6658 *ap->a_ncookies += 1; 6659 return (0); 6660 } 6661 6662 /* 6663 * The purpose of this routine is to remove granularity from accmode_t, 6664 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6665 * VADMIN and VAPPEND. 6666 * 6667 * If it returns 0, the caller is supposed to continue with the usual 6668 * access checks using 'accmode' as modified by this routine. If it 6669 * returns nonzero value, the caller is supposed to return that value 6670 * as errno. 6671 * 6672 * Note that after this routine runs, accmode may be zero. 6673 */ 6674 int 6675 vfs_unixify_accmode(accmode_t *accmode) 6676 { 6677 /* 6678 * There is no way to specify explicit "deny" rule using 6679 * file mode or POSIX.1e ACLs. 6680 */ 6681 if (*accmode & VEXPLICIT_DENY) { 6682 *accmode = 0; 6683 return (0); 6684 } 6685 6686 /* 6687 * None of these can be translated into usual access bits. 6688 * Also, the common case for NFSv4 ACLs is to not contain 6689 * either of these bits. Caller should check for VWRITE 6690 * on the containing directory instead. 6691 */ 6692 if (*accmode & (VDELETE_CHILD | VDELETE)) 6693 return (EPERM); 6694 6695 if (*accmode & VADMIN_PERMS) { 6696 *accmode &= ~VADMIN_PERMS; 6697 *accmode |= VADMIN; 6698 } 6699 6700 /* 6701 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6702 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 
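	 * They are simply stripped below, which means requests carrying
	 * them are never denied by this translation.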
6703 */ 6704 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6705 6706 return (0); 6707 } 6708 6709 /* 6710 * Clear out a doomed vnode (if any) and replace it with a new one as long 6711 * as the fs is not being unmounted. Return the root vnode to the caller. 6712 */ 6713 static int __noinline 6714 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6715 { 6716 struct vnode *vp; 6717 int error; 6718 6719 restart: 6720 if (mp->mnt_rootvnode != NULL) { 6721 MNT_ILOCK(mp); 6722 vp = mp->mnt_rootvnode; 6723 if (vp != NULL) { 6724 if (!VN_IS_DOOMED(vp)) { 6725 vrefact(vp); 6726 MNT_IUNLOCK(mp); 6727 error = vn_lock(vp, flags); 6728 if (error == 0) { 6729 *vpp = vp; 6730 return (0); 6731 } 6732 vrele(vp); 6733 goto restart; 6734 } 6735 /* 6736 * Clear the old one. 6737 */ 6738 mp->mnt_rootvnode = NULL; 6739 } 6740 MNT_IUNLOCK(mp); 6741 if (vp != NULL) { 6742 vfs_op_barrier_wait(mp); 6743 vrele(vp); 6744 } 6745 } 6746 error = VFS_CACHEDROOT(mp, flags, vpp); 6747 if (error != 0) 6748 return (error); 6749 if (mp->mnt_vfs_ops == 0) { 6750 MNT_ILOCK(mp); 6751 if (mp->mnt_vfs_ops != 0) { 6752 MNT_IUNLOCK(mp); 6753 return (0); 6754 } 6755 if (mp->mnt_rootvnode == NULL) { 6756 vrefact(*vpp); 6757 mp->mnt_rootvnode = *vpp; 6758 } else { 6759 if (mp->mnt_rootvnode != *vpp) { 6760 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6761 panic("%s: mismatch between vnode returned " 6762 " by VFS_CACHEDROOT and the one cached " 6763 " (%p != %p)", 6764 __func__, *vpp, mp->mnt_rootvnode); 6765 } 6766 } 6767 } 6768 MNT_IUNLOCK(mp); 6769 } 6770 return (0); 6771 } 6772 6773 int 6774 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6775 { 6776 struct mount_pcpu *mpcpu; 6777 struct vnode *vp; 6778 int error; 6779 6780 if (!vfs_op_thread_enter(mp, mpcpu)) 6781 return (vfs_cache_root_fallback(mp, flags, vpp)); 6782 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6783 if (vp == NULL || VN_IS_DOOMED(vp)) { 6784 vfs_op_thread_exit(mp, mpcpu); 6785 return (vfs_cache_root_fallback(mp, flags, vpp)); 6786 } 6787 vrefact(vp); 6788 vfs_op_thread_exit(mp, mpcpu); 6789 error = vn_lock(vp, flags); 6790 if (error != 0) { 6791 vrele(vp); 6792 return (vfs_cache_root_fallback(mp, flags, vpp)); 6793 } 6794 *vpp = vp; 6795 return (0); 6796 } 6797 6798 struct vnode * 6799 vfs_cache_root_clear(struct mount *mp) 6800 { 6801 struct vnode *vp; 6802 6803 /* 6804 * ops > 0 guarantees there is nobody who can see this vnode 6805 */ 6806 MPASS(mp->mnt_vfs_ops > 0); 6807 vp = mp->mnt_rootvnode; 6808 if (vp != NULL) 6809 vn_seqc_write_begin(vp); 6810 mp->mnt_rootvnode = NULL; 6811 return (vp); 6812 } 6813 6814 void 6815 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6816 { 6817 6818 MPASS(mp->mnt_vfs_ops > 0); 6819 vrefact(vp); 6820 mp->mnt_rootvnode = vp; 6821 } 6822 6823 /* 6824 * These are helper functions for filesystems to traverse all 6825 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6826 * 6827 * This interface replaces MNT_VNODE_FOREACH. 6828 */ 6829 6830 struct vnode * 6831 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6832 { 6833 struct vnode *vp; 6834 6835 maybe_yield(); 6836 MNT_ILOCK(mp); 6837 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6838 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6839 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6840 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
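		 * The flag is re-checked under the vnode interlock before the
		 * vnode is returned to the caller.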
*/ 6841 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6842 continue; 6843 VI_LOCK(vp); 6844 if (VN_IS_DOOMED(vp)) { 6845 VI_UNLOCK(vp); 6846 continue; 6847 } 6848 break; 6849 } 6850 if (vp == NULL) { 6851 __mnt_vnode_markerfree_all(mvp, mp); 6852 /* MNT_IUNLOCK(mp); -- done in above function */ 6853 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6854 return (NULL); 6855 } 6856 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6857 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6858 MNT_IUNLOCK(mp); 6859 return (vp); 6860 } 6861 6862 struct vnode * 6863 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6864 { 6865 struct vnode *vp; 6866 6867 *mvp = vn_alloc_marker(mp); 6868 MNT_ILOCK(mp); 6869 MNT_REF(mp); 6870 6871 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6872 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6873 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6874 continue; 6875 VI_LOCK(vp); 6876 if (VN_IS_DOOMED(vp)) { 6877 VI_UNLOCK(vp); 6878 continue; 6879 } 6880 break; 6881 } 6882 if (vp == NULL) { 6883 MNT_REL(mp); 6884 MNT_IUNLOCK(mp); 6885 vn_free_marker(*mvp); 6886 *mvp = NULL; 6887 return (NULL); 6888 } 6889 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6890 MNT_IUNLOCK(mp); 6891 return (vp); 6892 } 6893 6894 void 6895 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6896 { 6897 6898 if (*mvp == NULL) { 6899 MNT_IUNLOCK(mp); 6900 return; 6901 } 6902 6903 mtx_assert(MNT_MTX(mp), MA_OWNED); 6904 6905 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6906 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6907 MNT_REL(mp); 6908 MNT_IUNLOCK(mp); 6909 vn_free_marker(*mvp); 6910 *mvp = NULL; 6911 } 6912 6913 /* 6914 * These are helper functions for filesystems to traverse their 6915 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6916 */ 6917 static void 6918 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6919 { 6920 6921 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6922 6923 MNT_ILOCK(mp); 6924 MNT_REL(mp); 6925 MNT_IUNLOCK(mp); 6926 vn_free_marker(*mvp); 6927 *mvp = NULL; 6928 } 6929 6930 /* 6931 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6932 * conventional lock order during mnt_vnode_next_lazy iteration. 6933 * 6934 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6935 * The list lock is dropped and reacquired. On success, both locks are held. 6936 * On failure, the mount vnode list lock is held but the vnode interlock is 6937 * not, and the procedure may have yielded. 6938 */ 6939 static bool 6940 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6941 struct vnode *vp) 6942 { 6943 6944 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6945 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6946 ("%s: bad marker", __func__)); 6947 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6948 ("%s: inappropriate vnode", __func__)); 6949 ASSERT_VI_UNLOCKED(vp, __func__); 6950 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6951 6952 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6953 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6954 6955 /* 6956 * Note we may be racing against vdrop which transitioned the hold 6957 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6958 * if we are the only user after we get the interlock we will just 6959 * vdrop. 
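	 * The hold taken below only bridges the window between dropping the
	 * list lock and acquiring the interlock; it is released on every
	 * path before this function returns.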
6960 */ 6961 vhold(vp); 6962 mtx_unlock(&mp->mnt_listmtx); 6963 VI_LOCK(vp); 6964 if (VN_IS_DOOMED(vp)) { 6965 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6966 goto out_lost; 6967 } 6968 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6969 /* 6970 * There is nothing to do if we are the last user. 6971 */ 6972 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6973 goto out_lost; 6974 mtx_lock(&mp->mnt_listmtx); 6975 return (true); 6976 out_lost: 6977 vdropl(vp); 6978 maybe_yield(); 6979 mtx_lock(&mp->mnt_listmtx); 6980 return (false); 6981 } 6982 6983 static struct vnode * 6984 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6985 void *cbarg) 6986 { 6987 struct vnode *vp; 6988 6989 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6990 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6991 restart: 6992 vp = TAILQ_NEXT(*mvp, v_lazylist); 6993 while (vp != NULL) { 6994 if (vp->v_type == VMARKER) { 6995 vp = TAILQ_NEXT(vp, v_lazylist); 6996 continue; 6997 } 6998 /* 6999 * See if we want to process the vnode. Note we may encounter a 7000 * long string of vnodes we don't care about and hog the list 7001 * as a result. Check for it and requeue the marker. 7002 */ 7003 VNPASS(!VN_IS_DOOMED(vp), vp); 7004 if (!cb(vp, cbarg)) { 7005 if (!should_yield()) { 7006 vp = TAILQ_NEXT(vp, v_lazylist); 7007 continue; 7008 } 7009 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 7010 v_lazylist); 7011 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 7012 v_lazylist); 7013 mtx_unlock(&mp->mnt_listmtx); 7014 kern_yield(PRI_USER); 7015 mtx_lock(&mp->mnt_listmtx); 7016 goto restart; 7017 } 7018 /* 7019 * Try-lock because this is the wrong lock order. 7020 */ 7021 if (!VI_TRYLOCK(vp) && 7022 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 7023 goto restart; 7024 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 7025 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 7026 ("alien vnode on the lazy list %p %p", vp, mp)); 7027 VNPASS(vp->v_mount == mp, vp); 7028 VNPASS(!VN_IS_DOOMED(vp), vp); 7029 break; 7030 } 7031 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7032 7033 /* Check if we are done */ 7034 if (vp == NULL) { 7035 mtx_unlock(&mp->mnt_listmtx); 7036 mnt_vnode_markerfree_lazy(mvp, mp); 7037 return (NULL); 7038 } 7039 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 7040 mtx_unlock(&mp->mnt_listmtx); 7041 ASSERT_VI_LOCKED(vp, "lazy iter"); 7042 return (vp); 7043 } 7044 7045 struct vnode * 7046 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7047 void *cbarg) 7048 { 7049 7050 maybe_yield(); 7051 mtx_lock(&mp->mnt_listmtx); 7052 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7053 } 7054 7055 struct vnode * 7056 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 7057 void *cbarg) 7058 { 7059 struct vnode *vp; 7060 7061 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 7062 return (NULL); 7063 7064 *mvp = vn_alloc_marker(mp); 7065 MNT_ILOCK(mp); 7066 MNT_REF(mp); 7067 MNT_IUNLOCK(mp); 7068 7069 mtx_lock(&mp->mnt_listmtx); 7070 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 7071 if (vp == NULL) { 7072 mtx_unlock(&mp->mnt_listmtx); 7073 mnt_vnode_markerfree_lazy(mvp, mp); 7074 return (NULL); 7075 } 7076 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 7077 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 7078 } 7079 7080 void 7081 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 7082 { 7083 7084 if (*mvp == NULL) 7085 return; 7086 7087 mtx_lock(&mp->mnt_listmtx); 7088 
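	/*
	 * Unlink the marker while the list lock is held; the mount
	 * reference and the marker itself are released by
	 * mnt_vnode_markerfree_lazy() below.
	 */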
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 7089 mtx_unlock(&mp->mnt_listmtx); 7090 mnt_vnode_markerfree_lazy(mvp, mp); 7091 } 7092 7093 int 7094 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 7095 { 7096 7097 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 7098 cnp->cn_flags &= ~NOEXECCHECK; 7099 return (0); 7100 } 7101 7102 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 7103 } 7104 7105 /* 7106 * Do not use this variant unless you have means other than the hold count 7107 * to prevent the vnode from getting freed. 7108 */ 7109 void 7110 vn_seqc_write_begin_locked(struct vnode *vp) 7111 { 7112 7113 ASSERT_VI_LOCKED(vp, __func__); 7114 VNPASS(vp->v_holdcnt > 0, vp); 7115 VNPASS(vp->v_seqc_users >= 0, vp); 7116 vp->v_seqc_users++; 7117 if (vp->v_seqc_users == 1) 7118 seqc_sleepable_write_begin(&vp->v_seqc); 7119 } 7120 7121 void 7122 vn_seqc_write_begin(struct vnode *vp) 7123 { 7124 7125 VI_LOCK(vp); 7126 vn_seqc_write_begin_locked(vp); 7127 VI_UNLOCK(vp); 7128 } 7129 7130 void 7131 vn_seqc_write_end_locked(struct vnode *vp) 7132 { 7133 7134 ASSERT_VI_LOCKED(vp, __func__); 7135 VNPASS(vp->v_seqc_users > 0, vp); 7136 vp->v_seqc_users--; 7137 if (vp->v_seqc_users == 0) 7138 seqc_sleepable_write_end(&vp->v_seqc); 7139 } 7140 7141 void 7142 vn_seqc_write_end(struct vnode *vp) 7143 { 7144 7145 VI_LOCK(vp); 7146 vn_seqc_write_end_locked(vp); 7147 VI_UNLOCK(vp); 7148 } 7149 7150 /* 7151 * Special case handling for allocating and freeing vnodes. 7152 * 7153 * The counter remains unchanged on free so that a doomed vnode will 7154 * keep testing as in modify as long as it is accessible with SMR. 7155 */ 7156 static void 7157 vn_seqc_init(struct vnode *vp) 7158 { 7159 7160 vp->v_seqc = 0; 7161 vp->v_seqc_users = 0; 7162 } 7163 7164 static void 7165 vn_seqc_write_end_free(struct vnode *vp) 7166 { 7167 7168 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7169 VNPASS(vp->v_seqc_users == 1, vp); 7170 } 7171 7172 void 7173 vn_irflag_set_locked(struct vnode *vp, short toset) 7174 { 7175 short flags; 7176 7177 ASSERT_VI_LOCKED(vp, __func__); 7178 flags = vn_irflag_read(vp); 7179 VNASSERT((flags & toset) == 0, vp, 7180 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7181 __func__, flags, toset)); 7182 atomic_store_short(&vp->v_irflag, flags | toset); 7183 } 7184 7185 void 7186 vn_irflag_set(struct vnode *vp, short toset) 7187 { 7188 7189 VI_LOCK(vp); 7190 vn_irflag_set_locked(vp, toset); 7191 VI_UNLOCK(vp); 7192 } 7193 7194 void 7195 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7196 { 7197 short flags; 7198 7199 ASSERT_VI_LOCKED(vp, __func__); 7200 flags = vn_irflag_read(vp); 7201 atomic_store_short(&vp->v_irflag, flags | toset); 7202 } 7203 7204 void 7205 vn_irflag_set_cond(struct vnode *vp, short toset) 7206 { 7207 7208 VI_LOCK(vp); 7209 vn_irflag_set_cond_locked(vp, toset); 7210 VI_UNLOCK(vp); 7211 } 7212 7213 void 7214 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7215 { 7216 short flags; 7217 7218 ASSERT_VI_LOCKED(vp, __func__); 7219 flags = vn_irflag_read(vp); 7220 VNASSERT((flags & tounset) == tounset, vp, 7221 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7222 __func__, flags, tounset)); 7223 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7224 } 7225 7226 void 7227 vn_irflag_unset(struct vnode *vp, short tounset) 7228 { 7229 7230 VI_LOCK(vp); 7231 vn_irflag_unset_locked(vp, tounset); 7232 VI_UNLOCK(vp); 7233 } 7234 7235 int 7236 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7237 { 7238 
struct vattr vattr; 7239 int error; 7240 7241 ASSERT_VOP_LOCKED(vp, __func__); 7242 error = VOP_GETATTR(vp, &vattr, cred); 7243 if (__predict_true(error == 0)) { 7244 if (vattr.va_size <= OFF_MAX) 7245 *size = vattr.va_size; 7246 else 7247 error = EFBIG; 7248 } 7249 return (error); 7250 } 7251 7252 int 7253 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7254 { 7255 int error; 7256 7257 VOP_LOCK(vp, LK_SHARED); 7258 error = vn_getsize_locked(vp, size, cred); 7259 VOP_UNLOCK(vp); 7260 return (error); 7261 } 7262 7263 #ifdef INVARIANTS 7264 void 7265 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) 7266 { 7267 7268 switch (vp->v_state) { 7269 case VSTATE_UNINITIALIZED: 7270 switch (state) { 7271 case VSTATE_CONSTRUCTED: 7272 case VSTATE_DESTROYING: 7273 return; 7274 default: 7275 break; 7276 } 7277 break; 7278 case VSTATE_CONSTRUCTED: 7279 ASSERT_VOP_ELOCKED(vp, __func__); 7280 switch (state) { 7281 case VSTATE_DESTROYING: 7282 return; 7283 default: 7284 break; 7285 } 7286 break; 7287 case VSTATE_DESTROYING: 7288 ASSERT_VOP_ELOCKED(vp, __func__); 7289 switch (state) { 7290 case VSTATE_DEAD: 7291 return; 7292 default: 7293 break; 7294 } 7295 break; 7296 case VSTATE_DEAD: 7297 switch (state) { 7298 case VSTATE_UNINITIALIZED: 7299 return; 7300 default: 7301 break; 7302 } 7303 break; 7304 } 7305 7306 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7307 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7308 } 7309 #endif 7310
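/*
 * Illustrative sketch (kept under "#if 0", not compiled): how a filesystem
 * might consume the MNT_VNODE_FOREACH_ALL() traversal helpers defined above.
 * The function name "example_visit_vnodes" and the VREG filter are
 * hypothetical; real consumers include vflush() and per-filesystem sync
 * routines.  The iterator hands back each vnode with its interlock held.
 */
#if 0
static void
example_visit_vnodes(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type != VREG) {
			VI_UNLOCK(vp);
			continue;
		}
		/*
		 * Acquire a hold reference and drop the interlock before
		 * doing anything that may sleep.
		 */
		vhold(vp);
		VI_UNLOCK(vp);

		/* ... per-vnode work would go here ... */

		vdrop(vp);
	}
}
#endif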