1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Poul-Henning Kamp of the FreeBSD Project. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include "opt_ddb.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/capsicum.h> 46 #include <sys/counter.h> 47 #include <sys/filedesc.h> 48 #include <sys/fnv_hash.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/fcntl.h> 54 #include <sys/mount.h> 55 #include <sys/namei.h> 56 #include <sys/proc.h> 57 #include <sys/rwlock.h> 58 #include <sys/sdt.h> 59 #include <sys/smr.h> 60 #include <sys/smp.h> 61 #include <sys/syscallsubr.h> 62 #include <sys/sysctl.h> 63 #include <sys/sysproto.h> 64 #include <sys/vnode.h> 65 #include <ck_queue.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 70 #ifdef DDB 71 #include <ddb/ddb.h> 72 #endif 73 74 #include <vm/uma.h> 75 76 SDT_PROVIDER_DECLARE(vfs); 77 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", 78 "struct vnode *"); 79 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", 80 "char *"); 81 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); 82 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", 83 "char *", "struct vnode *"); 84 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); 85 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", 86 "struct vnode *", "char *"); 87 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", 88 "struct vnode *"); 89 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, 90 "struct vnode *", "char *"); 91 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", 92 "char *"); 93 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); 94 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); 95 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); 96 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", 97 "struct vnode *"); 98 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", 99 "char *"); 100 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", 101 "char *"); 102 103 /* 104 * This structure describes the elements in the cache of recent 105 * names looked up by namei. 106 */ 107 struct negstate { 108 u_char neg_flag; 109 }; 110 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), 111 "the state must fit in a union with a pointer without growing it"); 112 113 struct namecache { 114 CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */ 115 LIST_ENTRY(namecache) nc_src; /* source vnode list */ 116 TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ 117 struct vnode *nc_dvp; /* vnode of parent of name */ 118 union { 119 struct vnode *nu_vp; /* vnode the name refers to */ 120 struct negstate nu_neg;/* negative entry state */ 121 } n_un; 122 u_char nc_flag; /* flag bits */ 123 u_char nc_nlen; /* length of name */ 124 char nc_name[0]; /* segment name + nul */ 125 }; 126 127 /* 128 * struct namecache_ts repeats struct namecache layout up to the 129 * nc_nlen member. 130 * struct namecache_ts is used in place of struct namecache when time(s) need 131 * to be stored. The nc_dotdottime field is used when a cache entry is mapping 132 * both a non-dotdot directory name plus dotdot for the directory's 133 * parent. 134 */ 135 struct namecache_ts { 136 struct timespec nc_time; /* timespec provided by fs */ 137 struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ 138 int nc_ticks; /* ticks value when entry was added */ 139 struct namecache nc_nc; 140 }; 141 142 #define nc_vp n_un.nu_vp 143 #define nc_neg n_un.nu_neg 144 145 /* 146 * Flags in namecache.nc_flag 147 */ 148 #define NCF_WHITE 0x01 149 #define NCF_ISDOTDOT 0x02 150 #define NCF_TS 0x04 151 #define NCF_DTS 0x08 152 #define NCF_DVDROP 0x10 153 #define NCF_NEGATIVE 0x20 154 #define NCF_INVALID 0x40 155 156 /* 157 * Flags in negstate.neg_flag 158 */ 159 #define NEG_HOT 0x01 160 161 /* 162 * Mark an entry as invalid. 163 * 164 * This is called before it starts getting deconstructed. 165 */ 166 static void 167 cache_ncp_invalidate(struct namecache *ncp) 168 { 169 170 KASSERT((ncp->nc_flag & NCF_INVALID) == 0, 171 ("%s: entry %p already invalid", __func__, ncp)); 172 ncp->nc_flag |= NCF_INVALID; 173 atomic_thread_fence_rel(); 174 } 175 176 /* 177 * Verify validity of an entry. 178 * 179 * All places which elide locks are supposed to call this after they are 180 * done with reading from an entry. 181 */ 182 static bool 183 cache_ncp_invalid(struct namecache *ncp) 184 { 185 186 atomic_thread_fence_acq(); 187 return ((ncp->nc_flag & NCF_INVALID) != 0); 188 } 189 190 /* 191 * Name caching works as follows: 192 * 193 * Names found by directory scans are retained in a cache 194 * for future reference. It is managed LRU, so frequently 195 * used names will hang around. Cache is indexed by hash value 196 * obtained from (dvp, name) where dvp refers to the directory 197 * containing name. 198 * 199 * If it is a "negative" entry, (i.e. for a name that is known NOT to 200 * exist) the vnode pointer will be NULL. 201 * 202 * Upon reaching the last segment of a path, if the reference 203 * is for DELETE, or NOCACHE is set (rewrite), and the 204 * name is located in the cache, it will be dropped. 205 * 206 * These locks are used (in the order in which they can be taken): 207 * NAME TYPE ROLE 208 * vnodelock mtx vnode lists and v_cache_dd field protection 209 * bucketlock rwlock for access to given set of hash buckets 210 * neglist mtx negative entry LRU management 211 * 212 * Additionally, ncneg_shrink_lock mtx is used to have at most one thread 213 * shrinking the LRU list. 214 * 215 * It is legal to take multiple vnodelock and bucketlock locks. The locking 216 * order is lower address first. Both are recursive. 217 * 218 * "." lookups are lockless. 219 * 220 * ".." and vnode -> name lookups require vnodelock. 221 * 222 * name -> vnode lookup requires the relevant bucketlock to be held for reading. 223 * 224 * Insertions and removals of entries require involved vnodes and bucketlocks 225 * to be write-locked to prevent other threads from seeing the entry. 226 * 227 * Some lookups result in removal of the found entry (e.g. getting rid of a 228 * negative entry with the intent to create a positive one), which poses a 229 * problem when multiple threads reach the state. Similarly, two different 230 * threads can purge two different vnodes and try to remove the same name. 231 * 232 * If the already held vnode lock is lower than the second required lock, we 233 * can just take the other lock. However, in the opposite case, this could 234 * deadlock. As such, this is resolved by trylocking and if that fails unlocking 235 * the first node, locking everything in order and revalidating the state. 236 */ 237 238 VFS_SMR_DECLARE; 239 240 /* 241 * Structures associated with name caching. 242 */ 243 #define NCHHASH(hash) \ 244 (&nchashtbl[(hash) & nchash]) 245 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ 246 static u_long __read_mostly nchash; /* size of hash table */ 247 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 248 "Size of namecache hash table"); 249 static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ 250 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 251 "Ratio of negative namecache entries"); 252 static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ 253 static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ 254 u_int ncsizefactor = 2; 255 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, 256 "Size factor for namecache"); 257 static u_int __read_mostly ncpurgeminvnodes; 258 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, 259 "Number of vnodes below which purgevfs ignores the request"); 260 static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ 261 262 struct nchstats nchstats; /* cache effectiveness statistics */ 263 264 static struct mtx __exclusive_cache_line ncneg_shrink_lock; 265 266 struct neglist { 267 struct mtx nl_lock; 268 TAILQ_HEAD(, namecache) nl_list; 269 } __aligned(CACHE_LINE_SIZE); 270 271 static struct neglist __read_mostly *neglists; 272 static struct neglist ncneg_hot; 273 static u_long numhotneg; 274 275 #define numneglists (ncneghash + 1) 276 static u_int __read_mostly ncneghash; 277 static inline struct neglist * 278 NCP2NEGLIST(struct namecache *ncp) 279 { 280 281 return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); 282 } 283 284 static inline struct negstate * 285 NCP2NEGSTATE(struct namecache *ncp) 286 { 287 288 MPASS(ncp->nc_flag & NCF_NEGATIVE); 289 return (&ncp->nc_neg); 290 } 291 292 #define numbucketlocks (ncbuckethash + 1) 293 static u_int __read_mostly ncbuckethash; 294 static struct rwlock_padalign __read_mostly *bucketlocks; 295 #define HASH2BUCKETLOCK(hash) \ 296 ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) 297 298 #define numvnodelocks (ncvnodehash + 1) 299 static u_int __read_mostly ncvnodehash; 300 static struct mtx __read_mostly *vnodelocks; 301 static inline struct mtx * 302 VP2VNODELOCK(struct vnode *vp) 303 { 304 305 return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); 306 } 307 308 /* 309 * UMA zones for the VFS cache. 310 * 311 * The small cache is used for entries with short names, which are the 312 * most common. The large cache is used for entries which are too big to 313 * fit in the small cache. 314 */ 315 static uma_zone_t __read_mostly cache_zone_small; 316 static uma_zone_t __read_mostly cache_zone_small_ts; 317 static uma_zone_t __read_mostly cache_zone_large; 318 static uma_zone_t __read_mostly cache_zone_large_ts; 319 320 #define CACHE_PATH_CUTOFF 35 321 322 static struct namecache * 323 cache_alloc(int len, int ts) 324 { 325 struct namecache_ts *ncp_ts; 326 struct namecache *ncp; 327 328 if (__predict_false(ts)) { 329 if (len <= CACHE_PATH_CUTOFF) 330 ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); 331 else 332 ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); 333 ncp = &ncp_ts->nc_nc; 334 } else { 335 if (len <= CACHE_PATH_CUTOFF) 336 ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); 337 else 338 ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); 339 } 340 return (ncp); 341 } 342 343 static void 344 cache_free(struct namecache *ncp) 345 { 346 struct namecache_ts *ncp_ts; 347 348 if (ncp == NULL) 349 return; 350 if ((ncp->nc_flag & NCF_DVDROP) != 0) 351 vdrop(ncp->nc_dvp); 352 if (__predict_false(ncp->nc_flag & NCF_TS)) { 353 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 354 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 355 uma_zfree_smr(cache_zone_small_ts, ncp_ts); 356 else 357 uma_zfree_smr(cache_zone_large_ts, ncp_ts); 358 } else { 359 if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) 360 uma_zfree_smr(cache_zone_small, ncp); 361 else 362 uma_zfree_smr(cache_zone_large, ncp); 363 } 364 } 365 366 static void 367 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) 368 { 369 struct namecache_ts *ncp_ts; 370 371 KASSERT((ncp->nc_flag & NCF_TS) != 0 || 372 (tsp == NULL && ticksp == NULL), 373 ("No NCF_TS")); 374 375 if (tsp == NULL && ticksp == NULL) 376 return; 377 378 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 379 if (tsp != NULL) 380 *tsp = ncp_ts->nc_time; 381 if (ticksp != NULL) 382 *ticksp = ncp_ts->nc_ticks; 383 } 384 385 #ifdef DEBUG_CACHE 386 static int __read_mostly doingcache = 1; /* 1 => enable the cache */ 387 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, 388 "VFS namecache enabled"); 389 #endif 390 391 /* Export size information to userland */ 392 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 393 sizeof(struct namecache), "sizeof(struct namecache)"); 394 395 /* 396 * The new name cache statistics 397 */ 398 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 399 "Name cache statistics"); 400 #define STATNODE_ULONG(name, descr) \ 401 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); 402 #define STATNODE_COUNTER(name, descr) \ 403 static COUNTER_U64_DEFINE_EARLY(name); \ 404 SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ 405 descr); 406 STATNODE_ULONG(numneg, "Number of negative cache entries"); 407 STATNODE_ULONG(numcache, "Number of cache entries"); 408 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); 409 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); 410 STATNODE_COUNTER(dothits, "Number of '.' hits"); 411 STATNODE_COUNTER(dotdothits, "Number of '..' hits"); 412 STATNODE_COUNTER(numchecks, "Number of checks in lookup"); 413 STATNODE_COUNTER(nummiss, "Number of cache misses"); 414 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); 415 STATNODE_COUNTER(numposzaps, 416 "Number of cache hits (positive) we do not want to cache"); 417 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); 418 STATNODE_COUNTER(numnegzaps, 419 "Number of cache hits (negative) we do not want to cache"); 420 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); 421 /* These count for vn_getcwd(), too. */ 422 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); 423 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); 424 STATNODE_COUNTER(numfullpathfail2, 425 "Number of fullpath search errors (VOP_VPTOCNP failures)"); 426 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); 427 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); 428 STATNODE_COUNTER(zap_and_exit_bucket_relock_success, 429 "Number of successful removals after relocking"); 430 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, 431 "Number of times zap_and_exit failed to lock"); 432 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, 433 "Number of times zap_and_exit failed to lock"); 434 static long cache_lock_vnodes_cel_3_failures; 435 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, 436 "Number of times 3-way vnode locking failed"); 437 STATNODE_ULONG(numhotneg, "Number of hot negative entries"); 438 STATNODE_COUNTER(numneg_evicted, 439 "Number of negative entries evicted when adding a new entry"); 440 STATNODE_COUNTER(shrinking_skipped, 441 "Number of times shrinking was already in progress"); 442 443 static void cache_zap_locked(struct namecache *ncp); 444 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 445 char **freebuf, size_t *buflen); 446 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 447 char *buf, char **retbuf, size_t *buflen); 448 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 449 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); 450 451 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 452 453 static int cache_yield; 454 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, 455 "Number of times cache called yield"); 456 457 static void __noinline 458 cache_maybe_yield(void) 459 { 460 461 if (should_yield()) { 462 cache_yield++; 463 kern_yield(PRI_USER); 464 } 465 } 466 467 static inline void 468 cache_assert_vlp_locked(struct mtx *vlp) 469 { 470 471 if (vlp != NULL) 472 mtx_assert(vlp, MA_OWNED); 473 } 474 475 static inline void 476 cache_assert_vnode_locked(struct vnode *vp) 477 { 478 struct mtx *vlp; 479 480 vlp = VP2VNODELOCK(vp); 481 cache_assert_vlp_locked(vlp); 482 } 483 484 static uint32_t 485 cache_get_hash(char *name, u_char len, struct vnode *dvp) 486 { 487 uint32_t hash; 488 489 hash = fnv_32_buf(name, len, FNV1_32_INIT); 490 hash = fnv_32_buf(&dvp, sizeof(dvp), hash); 491 return (hash); 492 } 493 494 static inline struct rwlock * 495 NCP2BUCKETLOCK(struct namecache *ncp) 496 { 497 uint32_t hash; 498 499 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); 500 return (HASH2BUCKETLOCK(hash)); 501 } 502 503 #ifdef INVARIANTS 504 static void 505 cache_assert_bucket_locked(struct namecache *ncp, int mode) 506 { 507 struct rwlock *blp; 508 509 blp = NCP2BUCKETLOCK(ncp); 510 rw_assert(blp, mode); 511 } 512 #else 513 #define cache_assert_bucket_locked(x, y) do { } while (0) 514 #endif 515 516 #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) 517 static void 518 _cache_sort_vnodes(void **p1, void **p2) 519 { 520 void *tmp; 521 522 MPASS(*p1 != NULL || *p2 != NULL); 523 524 if (*p1 > *p2) { 525 tmp = *p2; 526 *p2 = *p1; 527 *p1 = tmp; 528 } 529 } 530 531 static void 532 cache_lock_all_buckets(void) 533 { 534 u_int i; 535 536 for (i = 0; i < numbucketlocks; i++) 537 rw_wlock(&bucketlocks[i]); 538 } 539 540 static void 541 cache_unlock_all_buckets(void) 542 { 543 u_int i; 544 545 for (i = 0; i < numbucketlocks; i++) 546 rw_wunlock(&bucketlocks[i]); 547 } 548 549 static void 550 cache_lock_all_vnodes(void) 551 { 552 u_int i; 553 554 for (i = 0; i < numvnodelocks; i++) 555 mtx_lock(&vnodelocks[i]); 556 } 557 558 static void 559 cache_unlock_all_vnodes(void) 560 { 561 u_int i; 562 563 for (i = 0; i < numvnodelocks; i++) 564 mtx_unlock(&vnodelocks[i]); 565 } 566 567 static int 568 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 569 { 570 571 cache_sort_vnodes(&vlp1, &vlp2); 572 573 if (vlp1 != NULL) { 574 if (!mtx_trylock(vlp1)) 575 return (EAGAIN); 576 } 577 if (!mtx_trylock(vlp2)) { 578 if (vlp1 != NULL) 579 mtx_unlock(vlp1); 580 return (EAGAIN); 581 } 582 583 return (0); 584 } 585 586 static void 587 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 588 { 589 590 MPASS(vlp1 != NULL || vlp2 != NULL); 591 MPASS(vlp1 <= vlp2); 592 593 if (vlp1 != NULL) 594 mtx_lock(vlp1); 595 if (vlp2 != NULL) 596 mtx_lock(vlp2); 597 } 598 599 static void 600 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) 601 { 602 603 MPASS(vlp1 != NULL || vlp2 != NULL); 604 605 if (vlp1 != NULL) 606 mtx_unlock(vlp1); 607 if (vlp2 != NULL) 608 mtx_unlock(vlp2); 609 } 610 611 static int 612 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 613 { 614 struct nchstats snap; 615 616 if (req->oldptr == NULL) 617 return (SYSCTL_OUT(req, 0, sizeof(snap))); 618 619 snap = nchstats; 620 snap.ncs_goodhits = counter_u64_fetch(numposhits); 621 snap.ncs_neghits = counter_u64_fetch(numneghits); 622 snap.ncs_badhits = counter_u64_fetch(numposzaps) + 623 counter_u64_fetch(numnegzaps); 624 snap.ncs_miss = counter_u64_fetch(nummisszap) + 625 counter_u64_fetch(nummiss); 626 627 return (SYSCTL_OUT(req, &snap, sizeof(snap))); 628 } 629 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | 630 CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", 631 "VFS cache effectiveness statistics"); 632 633 #ifdef DIAGNOSTIC 634 /* 635 * Grab an atomic snapshot of the name cache hash chain lengths 636 */ 637 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, 638 CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 639 "hash table stats"); 640 641 static int 642 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) 643 { 644 struct nchashhead *ncpp; 645 struct namecache *ncp; 646 int i, error, n_nchash, *cntbuf; 647 648 retry: 649 n_nchash = nchash + 1; /* nchash is max index, not count */ 650 if (req->oldptr == NULL) 651 return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); 652 cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); 653 cache_lock_all_buckets(); 654 if (n_nchash != nchash + 1) { 655 cache_unlock_all_buckets(); 656 free(cntbuf, M_TEMP); 657 goto retry; 658 } 659 /* Scan hash tables counting entries */ 660 for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) 661 CK_LIST_FOREACH(ncp, ncpp, nc_hash) 662 cntbuf[i]++; 663 cache_unlock_all_buckets(); 664 for (error = 0, i = 0; i < n_nchash; i++) 665 if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) 666 break; 667 free(cntbuf, M_TEMP); 668 return (error); 669 } 670 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| 671 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", 672 "nchash chain lengths"); 673 674 static int 675 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) 676 { 677 int error; 678 struct nchashhead *ncpp; 679 struct namecache *ncp; 680 int n_nchash; 681 int count, maxlength, used, pct; 682 683 if (!req->oldptr) 684 return SYSCTL_OUT(req, 0, 4 * sizeof(int)); 685 686 cache_lock_all_buckets(); 687 n_nchash = nchash + 1; /* nchash is max index, not count */ 688 used = 0; 689 maxlength = 0; 690 691 /* Scan hash tables for applicable entries */ 692 for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { 693 count = 0; 694 CK_LIST_FOREACH(ncp, ncpp, nc_hash) { 695 count++; 696 } 697 if (count) 698 used++; 699 if (maxlength < count) 700 maxlength = count; 701 } 702 n_nchash = nchash + 1; 703 cache_unlock_all_buckets(); 704 pct = (used * 100) / (n_nchash / 100); 705 error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); 706 if (error) 707 return (error); 708 error = SYSCTL_OUT(req, &used, sizeof(used)); 709 if (error) 710 return (error); 711 error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); 712 if (error) 713 return (error); 714 error = SYSCTL_OUT(req, &pct, sizeof(pct)); 715 if (error) 716 return (error); 717 return (0); 718 } 719 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| 720 CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", 721 "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); 722 #endif 723 724 /* 725 * Negative entries management 726 * 727 * A variation of LRU scheme is used. New entries are hashed into one of 728 * numneglists cold lists. Entries get promoted to the hot list on first hit. 729 * 730 * The shrinker will demote hot list head and evict from the cold list in a 731 * round-robin manner. 732 */ 733 static void 734 cache_negative_init(struct namecache *ncp) 735 { 736 struct negstate *negstate; 737 738 ncp->nc_flag |= NCF_NEGATIVE; 739 negstate = NCP2NEGSTATE(ncp); 740 negstate->neg_flag = 0; 741 } 742 743 static void 744 cache_negative_hit(struct namecache *ncp) 745 { 746 struct neglist *neglist; 747 struct negstate *negstate; 748 749 negstate = NCP2NEGSTATE(ncp); 750 if ((negstate->neg_flag & NEG_HOT) != 0) 751 return; 752 neglist = NCP2NEGLIST(ncp); 753 mtx_lock(&ncneg_hot.nl_lock); 754 mtx_lock(&neglist->nl_lock); 755 if ((negstate->neg_flag & NEG_HOT) == 0) { 756 numhotneg++; 757 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 758 TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); 759 negstate->neg_flag |= NEG_HOT; 760 } 761 mtx_unlock(&neglist->nl_lock); 762 mtx_unlock(&ncneg_hot.nl_lock); 763 } 764 765 static void 766 cache_negative_insert(struct namecache *ncp) 767 { 768 struct neglist *neglist; 769 770 MPASS(ncp->nc_flag & NCF_NEGATIVE); 771 cache_assert_bucket_locked(ncp, RA_WLOCKED); 772 neglist = NCP2NEGLIST(ncp); 773 mtx_lock(&neglist->nl_lock); 774 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 775 mtx_unlock(&neglist->nl_lock); 776 atomic_add_rel_long(&numneg, 1); 777 } 778 779 static void 780 cache_negative_remove(struct namecache *ncp) 781 { 782 struct neglist *neglist; 783 struct negstate *negstate; 784 bool hot_locked = false; 785 bool list_locked = false; 786 787 cache_assert_bucket_locked(ncp, RA_WLOCKED); 788 neglist = NCP2NEGLIST(ncp); 789 negstate = NCP2NEGSTATE(ncp); 790 if ((negstate->neg_flag & NEG_HOT) != 0) { 791 hot_locked = true; 792 mtx_lock(&ncneg_hot.nl_lock); 793 if ((negstate->neg_flag & NEG_HOT) == 0) { 794 list_locked = true; 795 mtx_lock(&neglist->nl_lock); 796 } 797 } else { 798 list_locked = true; 799 mtx_lock(&neglist->nl_lock); 800 } 801 if ((negstate->neg_flag & NEG_HOT) != 0) { 802 mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); 803 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 804 numhotneg--; 805 } else { 806 mtx_assert(&neglist->nl_lock, MA_OWNED); 807 TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); 808 } 809 if (list_locked) 810 mtx_unlock(&neglist->nl_lock); 811 if (hot_locked) 812 mtx_unlock(&ncneg_hot.nl_lock); 813 atomic_subtract_rel_long(&numneg, 1); 814 } 815 816 static void 817 cache_negative_shrink_select(struct namecache **ncpp, 818 struct neglist **neglistpp) 819 { 820 struct neglist *neglist; 821 struct namecache *ncp; 822 static u_int cycle; 823 u_int i; 824 825 *ncpp = ncp = NULL; 826 827 for (i = 0; i < numneglists; i++) { 828 neglist = &neglists[(cycle + i) % numneglists]; 829 if (TAILQ_FIRST(&neglist->nl_list) == NULL) 830 continue; 831 mtx_lock(&neglist->nl_lock); 832 ncp = TAILQ_FIRST(&neglist->nl_list); 833 if (ncp != NULL) 834 break; 835 mtx_unlock(&neglist->nl_lock); 836 } 837 838 *neglistpp = neglist; 839 *ncpp = ncp; 840 cycle++; 841 } 842 843 static void 844 cache_negative_zap_one(void) 845 { 846 struct namecache *ncp, *ncp2; 847 struct neglist *neglist; 848 struct negstate *negstate; 849 struct mtx *dvlp; 850 struct rwlock *blp; 851 852 if (mtx_owner(&ncneg_shrink_lock) != NULL || 853 !mtx_trylock(&ncneg_shrink_lock)) { 854 counter_u64_add(shrinking_skipped, 1); 855 return; 856 } 857 858 mtx_lock(&ncneg_hot.nl_lock); 859 ncp = TAILQ_FIRST(&ncneg_hot.nl_list); 860 if (ncp != NULL) { 861 neglist = NCP2NEGLIST(ncp); 862 negstate = NCP2NEGSTATE(ncp); 863 mtx_lock(&neglist->nl_lock); 864 MPASS((negstate->neg_flag & NEG_HOT) != 0); 865 TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); 866 TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); 867 negstate->neg_flag &= ~NEG_HOT; 868 numhotneg--; 869 mtx_unlock(&neglist->nl_lock); 870 } 871 mtx_unlock(&ncneg_hot.nl_lock); 872 873 cache_negative_shrink_select(&ncp, &neglist); 874 875 mtx_unlock(&ncneg_shrink_lock); 876 if (ncp == NULL) 877 return; 878 879 MPASS(ncp->nc_flag & NCF_NEGATIVE); 880 dvlp = VP2VNODELOCK(ncp->nc_dvp); 881 blp = NCP2BUCKETLOCK(ncp); 882 mtx_unlock(&neglist->nl_lock); 883 mtx_lock(dvlp); 884 rw_wlock(blp); 885 /* 886 * Enter SMR to safely check the negative list. 887 * Even if the found pointer matches, the entry may now be reallocated 888 * and used by a different vnode. 889 */ 890 vfs_smr_enter(); 891 ncp2 = TAILQ_FIRST(&neglist->nl_list); 892 if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || 893 blp != NCP2BUCKETLOCK(ncp2)) { 894 vfs_smr_exit(); 895 ncp = NULL; 896 } else { 897 vfs_smr_exit(); 898 SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, 899 ncp->nc_name); 900 cache_zap_locked(ncp); 901 counter_u64_add(numneg_evicted, 1); 902 } 903 rw_wunlock(blp); 904 mtx_unlock(dvlp); 905 cache_free(ncp); 906 } 907 908 /* 909 * cache_zap_locked(): 910 * 911 * Removes a namecache entry from cache, whether it contains an actual 912 * pointer to a vnode or if it is just a negative cache entry. 913 */ 914 static void 915 cache_zap_locked(struct namecache *ncp) 916 { 917 918 if (!(ncp->nc_flag & NCF_NEGATIVE)) 919 cache_assert_vnode_locked(ncp->nc_vp); 920 cache_assert_vnode_locked(ncp->nc_dvp); 921 cache_assert_bucket_locked(ncp, RA_WLOCKED); 922 923 CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, 924 (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); 925 926 cache_ncp_invalidate(ncp); 927 928 CK_LIST_REMOVE(ncp, nc_hash); 929 if (!(ncp->nc_flag & NCF_NEGATIVE)) { 930 SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, 931 ncp->nc_name, ncp->nc_vp); 932 TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); 933 if (ncp == ncp->nc_vp->v_cache_dd) 934 ncp->nc_vp->v_cache_dd = NULL; 935 } else { 936 SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, 937 ncp->nc_name); 938 cache_negative_remove(ncp); 939 } 940 if (ncp->nc_flag & NCF_ISDOTDOT) { 941 if (ncp == ncp->nc_dvp->v_cache_dd) 942 ncp->nc_dvp->v_cache_dd = NULL; 943 } else { 944 LIST_REMOVE(ncp, nc_src); 945 if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { 946 ncp->nc_flag |= NCF_DVDROP; 947 counter_u64_add(numcachehv, -1); 948 } 949 } 950 atomic_subtract_rel_long(&numcache, 1); 951 } 952 953 static void 954 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) 955 { 956 struct rwlock *blp; 957 958 MPASS(ncp->nc_dvp == vp); 959 MPASS(ncp->nc_flag & NCF_NEGATIVE); 960 cache_assert_vnode_locked(vp); 961 962 blp = NCP2BUCKETLOCK(ncp); 963 rw_wlock(blp); 964 cache_zap_locked(ncp); 965 rw_wunlock(blp); 966 } 967 968 static bool 969 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, 970 struct mtx **vlpp) 971 { 972 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 973 struct rwlock *blp; 974 975 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 976 cache_assert_vnode_locked(vp); 977 978 if (ncp->nc_flag & NCF_NEGATIVE) { 979 if (*vlpp != NULL) { 980 mtx_unlock(*vlpp); 981 *vlpp = NULL; 982 } 983 cache_zap_negative_locked_vnode_kl(ncp, vp); 984 return (true); 985 } 986 987 pvlp = VP2VNODELOCK(vp); 988 blp = NCP2BUCKETLOCK(ncp); 989 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 990 vlp2 = VP2VNODELOCK(ncp->nc_vp); 991 992 if (*vlpp == vlp1 || *vlpp == vlp2) { 993 to_unlock = *vlpp; 994 *vlpp = NULL; 995 } else { 996 if (*vlpp != NULL) { 997 mtx_unlock(*vlpp); 998 *vlpp = NULL; 999 } 1000 cache_sort_vnodes(&vlp1, &vlp2); 1001 if (vlp1 == pvlp) { 1002 mtx_lock(vlp2); 1003 to_unlock = vlp2; 1004 } else { 1005 if (!mtx_trylock(vlp1)) 1006 goto out_relock; 1007 to_unlock = vlp1; 1008 } 1009 } 1010 rw_wlock(blp); 1011 cache_zap_locked(ncp); 1012 rw_wunlock(blp); 1013 if (to_unlock != NULL) 1014 mtx_unlock(to_unlock); 1015 return (true); 1016 1017 out_relock: 1018 mtx_unlock(vlp2); 1019 mtx_lock(vlp1); 1020 mtx_lock(vlp2); 1021 MPASS(*vlpp == NULL); 1022 *vlpp = vlp1; 1023 return (false); 1024 } 1025 1026 static int __noinline 1027 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) 1028 { 1029 struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; 1030 struct rwlock *blp; 1031 int error = 0; 1032 1033 MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); 1034 cache_assert_vnode_locked(vp); 1035 1036 pvlp = VP2VNODELOCK(vp); 1037 if (ncp->nc_flag & NCF_NEGATIVE) { 1038 cache_zap_negative_locked_vnode_kl(ncp, vp); 1039 goto out; 1040 } 1041 1042 blp = NCP2BUCKETLOCK(ncp); 1043 vlp1 = VP2VNODELOCK(ncp->nc_dvp); 1044 vlp2 = VP2VNODELOCK(ncp->nc_vp); 1045 cache_sort_vnodes(&vlp1, &vlp2); 1046 if (vlp1 == pvlp) { 1047 mtx_lock(vlp2); 1048 to_unlock = vlp2; 1049 } else { 1050 if (!mtx_trylock(vlp1)) { 1051 error = EAGAIN; 1052 goto out; 1053 } 1054 to_unlock = vlp1; 1055 } 1056 rw_wlock(blp); 1057 cache_zap_locked(ncp); 1058 rw_wunlock(blp); 1059 mtx_unlock(to_unlock); 1060 out: 1061 mtx_unlock(pvlp); 1062 return (error); 1063 } 1064 1065 /* 1066 * If trylocking failed we can get here. We know enough to take all needed locks 1067 * in the right order and re-lookup the entry. 1068 */ 1069 static int 1070 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1071 struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, 1072 struct rwlock *blp) 1073 { 1074 struct namecache *rncp; 1075 1076 cache_assert_bucket_locked(ncp, RA_UNLOCKED); 1077 1078 cache_sort_vnodes(&dvlp, &vlp); 1079 cache_lock_vnodes(dvlp, vlp); 1080 rw_wlock(blp); 1081 CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { 1082 if (rncp == ncp && rncp->nc_dvp == dvp && 1083 rncp->nc_nlen == cnp->cn_namelen && 1084 !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) 1085 break; 1086 } 1087 if (rncp != NULL) { 1088 cache_zap_locked(rncp); 1089 rw_wunlock(blp); 1090 cache_unlock_vnodes(dvlp, vlp); 1091 counter_u64_add(zap_and_exit_bucket_relock_success, 1); 1092 return (0); 1093 } 1094 1095 rw_wunlock(blp); 1096 cache_unlock_vnodes(dvlp, vlp); 1097 return (EAGAIN); 1098 } 1099 1100 static int __noinline 1101 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1102 uint32_t hash, struct rwlock *blp) 1103 { 1104 struct mtx *dvlp, *vlp; 1105 struct vnode *dvp; 1106 1107 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1108 1109 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1110 vlp = NULL; 1111 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1112 vlp = VP2VNODELOCK(ncp->nc_vp); 1113 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1114 cache_zap_locked(ncp); 1115 rw_wunlock(blp); 1116 cache_unlock_vnodes(dvlp, vlp); 1117 return (0); 1118 } 1119 1120 dvp = ncp->nc_dvp; 1121 rw_wunlock(blp); 1122 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1123 } 1124 1125 static int __noinline 1126 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, 1127 uint32_t hash, struct rwlock *blp) 1128 { 1129 struct mtx *dvlp, *vlp; 1130 struct vnode *dvp; 1131 1132 cache_assert_bucket_locked(ncp, RA_RLOCKED); 1133 1134 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1135 vlp = NULL; 1136 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1137 vlp = VP2VNODELOCK(ncp->nc_vp); 1138 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1139 rw_runlock(blp); 1140 rw_wlock(blp); 1141 cache_zap_locked(ncp); 1142 rw_wunlock(blp); 1143 cache_unlock_vnodes(dvlp, vlp); 1144 return (0); 1145 } 1146 1147 dvp = ncp->nc_dvp; 1148 rw_runlock(blp); 1149 return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); 1150 } 1151 1152 static int 1153 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, 1154 struct mtx **vlpp1, struct mtx **vlpp2) 1155 { 1156 struct mtx *dvlp, *vlp; 1157 1158 cache_assert_bucket_locked(ncp, RA_WLOCKED); 1159 1160 dvlp = VP2VNODELOCK(ncp->nc_dvp); 1161 vlp = NULL; 1162 if (!(ncp->nc_flag & NCF_NEGATIVE)) 1163 vlp = VP2VNODELOCK(ncp->nc_vp); 1164 cache_sort_vnodes(&dvlp, &vlp); 1165 1166 if (*vlpp1 == dvlp && *vlpp2 == vlp) { 1167 cache_zap_locked(ncp); 1168 cache_unlock_vnodes(dvlp, vlp); 1169 *vlpp1 = NULL; 1170 *vlpp2 = NULL; 1171 return (0); 1172 } 1173 1174 if (*vlpp1 != NULL) 1175 mtx_unlock(*vlpp1); 1176 if (*vlpp2 != NULL) 1177 mtx_unlock(*vlpp2); 1178 *vlpp1 = NULL; 1179 *vlpp2 = NULL; 1180 1181 if (cache_trylock_vnodes(dvlp, vlp) == 0) { 1182 cache_zap_locked(ncp); 1183 cache_unlock_vnodes(dvlp, vlp); 1184 return (0); 1185 } 1186 1187 rw_wunlock(blp); 1188 *vlpp1 = dvlp; 1189 *vlpp2 = vlp; 1190 if (*vlpp1 != NULL) 1191 mtx_lock(*vlpp1); 1192 mtx_lock(*vlpp2); 1193 rw_wlock(blp); 1194 return (EAGAIN); 1195 } 1196 1197 static void 1198 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) 1199 { 1200 1201 if (blp != NULL) { 1202 rw_runlock(blp); 1203 } else { 1204 mtx_unlock(vlp); 1205 } 1206 } 1207 1208 static int __noinline 1209 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1210 struct timespec *tsp, int *ticksp) 1211 { 1212 int ltype; 1213 1214 *vpp = dvp; 1215 CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", 1216 dvp, cnp->cn_nameptr); 1217 counter_u64_add(dothits, 1); 1218 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); 1219 if (tsp != NULL) 1220 timespecclear(tsp); 1221 if (ticksp != NULL) 1222 *ticksp = ticks; 1223 vrefact(*vpp); 1224 /* 1225 * When we lookup "." we still can be asked to lock it 1226 * differently... 1227 */ 1228 ltype = cnp->cn_lkflags & LK_TYPE_MASK; 1229 if (ltype != VOP_ISLOCKED(*vpp)) { 1230 if (ltype == LK_EXCLUSIVE) { 1231 vn_lock(*vpp, LK_UPGRADE | LK_RETRY); 1232 if (VN_IS_DOOMED((*vpp))) { 1233 /* forced unmount */ 1234 vrele(*vpp); 1235 *vpp = NULL; 1236 return (ENOENT); 1237 } 1238 } else 1239 vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); 1240 } 1241 return (-1); 1242 } 1243 1244 static __noinline int 1245 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, 1246 struct componentname *cnp, struct timespec *tsp, int *ticksp) 1247 { 1248 struct namecache *ncp; 1249 struct rwlock *blp; 1250 struct mtx *dvlp, *dvlp2; 1251 uint32_t hash; 1252 int error; 1253 1254 if (cnp->cn_namelen == 2 && 1255 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1256 counter_u64_add(dotdothits, 1); 1257 dvlp = VP2VNODELOCK(dvp); 1258 dvlp2 = NULL; 1259 mtx_lock(dvlp); 1260 retry_dotdot: 1261 ncp = dvp->v_cache_dd; 1262 if (ncp == NULL) { 1263 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1264 "..", NULL); 1265 mtx_unlock(dvlp); 1266 if (dvlp2 != NULL) 1267 mtx_unlock(dvlp2); 1268 return (0); 1269 } 1270 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1271 if (ncp->nc_dvp != dvp) 1272 panic("dvp %p v_cache_dd %p\n", dvp, ncp); 1273 if (!cache_zap_locked_vnode_kl2(ncp, 1274 dvp, &dvlp2)) 1275 goto retry_dotdot; 1276 MPASS(dvp->v_cache_dd == NULL); 1277 mtx_unlock(dvlp); 1278 if (dvlp2 != NULL) 1279 mtx_unlock(dvlp2); 1280 cache_free(ncp); 1281 } else { 1282 dvp->v_cache_dd = NULL; 1283 mtx_unlock(dvlp); 1284 if (dvlp2 != NULL) 1285 mtx_unlock(dvlp2); 1286 } 1287 return (0); 1288 } 1289 1290 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1291 blp = HASH2BUCKETLOCK(hash); 1292 retry: 1293 if (CK_LIST_EMPTY(NCHHASH(hash))) 1294 goto out_no_entry; 1295 1296 rw_wlock(blp); 1297 1298 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1299 counter_u64_add(numchecks, 1); 1300 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1301 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1302 break; 1303 } 1304 1305 /* We failed to find an entry */ 1306 if (ncp == NULL) { 1307 rw_wunlock(blp); 1308 goto out_no_entry; 1309 } 1310 1311 error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); 1312 if (__predict_false(error != 0)) { 1313 zap_and_exit_bucket_fail++; 1314 cache_maybe_yield(); 1315 goto retry; 1316 } 1317 counter_u64_add(numposzaps, 1); 1318 cache_free(ncp); 1319 return (0); 1320 out_no_entry: 1321 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); 1322 counter_u64_add(nummisszap, 1); 1323 return (0); 1324 } 1325 1326 /** 1327 * Lookup a name in the name cache 1328 * 1329 * # Arguments 1330 * 1331 * - dvp: Parent directory in which to search. 1332 * - vpp: Return argument. Will contain desired vnode on cache hit. 1333 * - cnp: Parameters of the name search. The most interesting bits of 1334 * the cn_flags field have the following meanings: 1335 * - MAKEENTRY: If clear, free an entry from the cache rather than look 1336 * it up. 1337 * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." 1338 * - tsp: Return storage for cache timestamp. On a successful (positive 1339 * or negative) lookup, tsp will be filled with any timespec that 1340 * was stored when this cache entry was created. However, it will 1341 * be clear for "." entries. 1342 * - ticks: Return storage for alternate cache timestamp. On a successful 1343 * (positive or negative) lookup, it will contain the ticks value 1344 * that was current when the cache entry was created, unless cnp 1345 * was ".". 1346 * 1347 * # Returns 1348 * 1349 * - -1: A positive cache hit. vpp will contain the desired vnode. 1350 * - ENOENT: A negative cache hit, or dvp was recycled out from under us due 1351 * to a forced unmount. vpp will not be modified. If the entry 1352 * is a whiteout, then the ISWHITEOUT flag will be set in 1353 * cnp->cn_flags. 1354 * - 0: A cache miss. vpp will not be modified. 1355 * 1356 * # Locking 1357 * 1358 * On a cache hit, vpp will be returned locked and ref'd. If we're looking up 1359 * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the 1360 * lock is not recursively acquired. 1361 */ 1362 int 1363 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, 1364 struct timespec *tsp, int *ticksp) 1365 { 1366 struct namecache_ts *ncp_ts; 1367 struct namecache *ncp; 1368 struct negstate *negstate; 1369 struct rwlock *blp; 1370 struct mtx *dvlp; 1371 uint32_t hash; 1372 enum vgetstate vs; 1373 int error, ltype; 1374 bool try_smr, doing_smr, whiteout; 1375 1376 #ifdef DEBUG_CACHE 1377 if (__predict_false(!doingcache)) { 1378 cnp->cn_flags &= ~MAKEENTRY; 1379 return (0); 1380 } 1381 #endif 1382 1383 if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) 1384 return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); 1385 1386 if ((cnp->cn_flags & MAKEENTRY) == 0) 1387 return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); 1388 1389 try_smr = true; 1390 if (cnp->cn_nameiop == CREATE) 1391 try_smr = false; 1392 retry: 1393 doing_smr = false; 1394 blp = NULL; 1395 dvlp = NULL; 1396 error = 0; 1397 if (cnp->cn_namelen == 2 && 1398 cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { 1399 counter_u64_add(dotdothits, 1); 1400 dvlp = VP2VNODELOCK(dvp); 1401 mtx_lock(dvlp); 1402 ncp = dvp->v_cache_dd; 1403 if (ncp == NULL) { 1404 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, 1405 "..", NULL); 1406 mtx_unlock(dvlp); 1407 return (0); 1408 } 1409 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { 1410 if (ncp->nc_flag & NCF_NEGATIVE) 1411 *vpp = NULL; 1412 else 1413 *vpp = ncp->nc_vp; 1414 } else 1415 *vpp = ncp->nc_dvp; 1416 /* Return failure if negative entry was found. */ 1417 if (*vpp == NULL) 1418 goto negative_success; 1419 CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", 1420 dvp, cnp->cn_nameptr, *vpp); 1421 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", 1422 *vpp); 1423 cache_out_ts(ncp, tsp, ticksp); 1424 if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == 1425 NCF_DTS && tsp != NULL) { 1426 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1427 *tsp = ncp_ts->nc_dotdottime; 1428 } 1429 goto success; 1430 } 1431 1432 hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); 1433 retry_hashed: 1434 if (try_smr) { 1435 vfs_smr_enter(); 1436 doing_smr = true; 1437 try_smr = false; 1438 } else { 1439 blp = HASH2BUCKETLOCK(hash); 1440 rw_rlock(blp); 1441 } 1442 1443 CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { 1444 counter_u64_add(numchecks, 1); 1445 if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && 1446 !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) 1447 break; 1448 } 1449 1450 /* We failed to find an entry */ 1451 if (__predict_false(ncp == NULL)) { 1452 if (doing_smr) 1453 vfs_smr_exit(); 1454 else 1455 rw_runlock(blp); 1456 SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, 1457 NULL); 1458 counter_u64_add(nummiss, 1); 1459 return (0); 1460 } 1461 1462 if (ncp->nc_flag & NCF_NEGATIVE) 1463 goto negative_success; 1464 1465 /* We found a "positive" match, return the vnode */ 1466 counter_u64_add(numposhits, 1); 1467 *vpp = ncp->nc_vp; 1468 CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", 1469 dvp, cnp->cn_nameptr, *vpp, ncp); 1470 SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, 1471 *vpp); 1472 cache_out_ts(ncp, tsp, ticksp); 1473 success: 1474 /* 1475 * On success we return a locked and ref'd vnode as per the lookup 1476 * protocol. 1477 */ 1478 MPASS(dvp != *vpp); 1479 ltype = 0; /* silence gcc warning */ 1480 if (cnp->cn_flags & ISDOTDOT) { 1481 ltype = VOP_ISLOCKED(dvp); 1482 VOP_UNLOCK(dvp); 1483 } 1484 if (doing_smr) { 1485 if (cache_ncp_invalid(ncp)) { 1486 vfs_smr_exit(); 1487 *vpp = NULL; 1488 goto retry; 1489 } 1490 vs = vget_prep_smr(*vpp); 1491 vfs_smr_exit(); 1492 if (vs == VGET_NONE) { 1493 *vpp = NULL; 1494 goto retry; 1495 } 1496 } else { 1497 vs = vget_prep(*vpp); 1498 cache_lookup_unlock(blp, dvlp); 1499 } 1500 error = vget_finish(*vpp, cnp->cn_lkflags, vs); 1501 if (cnp->cn_flags & ISDOTDOT) { 1502 vn_lock(dvp, ltype | LK_RETRY); 1503 if (VN_IS_DOOMED(dvp)) { 1504 if (error == 0) 1505 vput(*vpp); 1506 *vpp = NULL; 1507 return (ENOENT); 1508 } 1509 } 1510 if (error) { 1511 *vpp = NULL; 1512 goto retry; 1513 } 1514 if ((cnp->cn_flags & ISLASTCN) && 1515 (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { 1516 ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); 1517 } 1518 return (-1); 1519 1520 negative_success: 1521 /* We found a negative match, and want to create it, so purge */ 1522 if (cnp->cn_nameiop == CREATE) { 1523 MPASS(!doing_smr); 1524 counter_u64_add(numnegzaps, 1); 1525 goto zap_and_exit; 1526 } 1527 1528 SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); 1529 cache_out_ts(ncp, tsp, ticksp); 1530 counter_u64_add(numneghits, 1); 1531 whiteout = (ncp->nc_flag & NCF_WHITE); 1532 1533 if (doing_smr) { 1534 /* 1535 * We need to take locks to promote an entry. 1536 */ 1537 negstate = NCP2NEGSTATE(ncp); 1538 if ((negstate->neg_flag & NEG_HOT) == 0 || 1539 cache_ncp_invalid(ncp)) { 1540 vfs_smr_exit(); 1541 doing_smr = false; 1542 goto retry_hashed; 1543 } 1544 vfs_smr_exit(); 1545 } else { 1546 cache_negative_hit(ncp); 1547 cache_lookup_unlock(blp, dvlp); 1548 } 1549 if (whiteout) 1550 cnp->cn_flags |= ISWHITEOUT; 1551 return (ENOENT); 1552 1553 zap_and_exit: 1554 MPASS(!doing_smr); 1555 if (blp != NULL) 1556 error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); 1557 else 1558 error = cache_zap_locked_vnode(ncp, dvp); 1559 if (__predict_false(error != 0)) { 1560 zap_and_exit_bucket_fail2++; 1561 cache_maybe_yield(); 1562 goto retry; 1563 } 1564 cache_free(ncp); 1565 return (0); 1566 } 1567 1568 struct celockstate { 1569 struct mtx *vlp[3]; 1570 struct rwlock *blp[2]; 1571 }; 1572 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); 1573 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); 1574 1575 static inline void 1576 cache_celockstate_init(struct celockstate *cel) 1577 { 1578 1579 bzero(cel, sizeof(*cel)); 1580 } 1581 1582 static void 1583 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, 1584 struct vnode *dvp) 1585 { 1586 struct mtx *vlp1, *vlp2; 1587 1588 MPASS(cel->vlp[0] == NULL); 1589 MPASS(cel->vlp[1] == NULL); 1590 MPASS(cel->vlp[2] == NULL); 1591 1592 MPASS(vp != NULL || dvp != NULL); 1593 1594 vlp1 = VP2VNODELOCK(vp); 1595 vlp2 = VP2VNODELOCK(dvp); 1596 cache_sort_vnodes(&vlp1, &vlp2); 1597 1598 if (vlp1 != NULL) { 1599 mtx_lock(vlp1); 1600 cel->vlp[0] = vlp1; 1601 } 1602 mtx_lock(vlp2); 1603 cel->vlp[1] = vlp2; 1604 } 1605 1606 static void 1607 cache_unlock_vnodes_cel(struct celockstate *cel) 1608 { 1609 1610 MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); 1611 1612 if (cel->vlp[0] != NULL) 1613 mtx_unlock(cel->vlp[0]); 1614 if (cel->vlp[1] != NULL) 1615 mtx_unlock(cel->vlp[1]); 1616 if (cel->vlp[2] != NULL) 1617 mtx_unlock(cel->vlp[2]); 1618 } 1619 1620 static bool 1621 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) 1622 { 1623 struct mtx *vlp; 1624 bool ret; 1625 1626 cache_assert_vlp_locked(cel->vlp[0]); 1627 cache_assert_vlp_locked(cel->vlp[1]); 1628 MPASS(cel->vlp[2] == NULL); 1629 1630 MPASS(vp != NULL); 1631 vlp = VP2VNODELOCK(vp); 1632 1633 ret = true; 1634 if (vlp >= cel->vlp[1]) { 1635 mtx_lock(vlp); 1636 } else { 1637 if (mtx_trylock(vlp)) 1638 goto out; 1639 cache_lock_vnodes_cel_3_failures++; 1640 cache_unlock_vnodes_cel(cel); 1641 if (vlp < cel->vlp[0]) { 1642 mtx_lock(vlp); 1643 mtx_lock(cel->vlp[0]); 1644 mtx_lock(cel->vlp[1]); 1645 } else { 1646 if (cel->vlp[0] != NULL) 1647 mtx_lock(cel->vlp[0]); 1648 mtx_lock(vlp); 1649 mtx_lock(cel->vlp[1]); 1650 } 1651 ret = false; 1652 } 1653 out: 1654 cel->vlp[2] = vlp; 1655 return (ret); 1656 } 1657 1658 static void 1659 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, 1660 struct rwlock *blp2) 1661 { 1662 1663 MPASS(cel->blp[0] == NULL); 1664 MPASS(cel->blp[1] == NULL); 1665 1666 cache_sort_vnodes(&blp1, &blp2); 1667 1668 if (blp1 != NULL) { 1669 rw_wlock(blp1); 1670 cel->blp[0] = blp1; 1671 } 1672 rw_wlock(blp2); 1673 cel->blp[1] = blp2; 1674 } 1675 1676 static void 1677 cache_unlock_buckets_cel(struct celockstate *cel) 1678 { 1679 1680 if (cel->blp[0] != NULL) 1681 rw_wunlock(cel->blp[0]); 1682 rw_wunlock(cel->blp[1]); 1683 } 1684 1685 /* 1686 * Lock part of the cache affected by the insertion. 1687 * 1688 * This means vnodelocks for dvp, vp and the relevant bucketlock. 1689 * However, insertion can result in removal of an old entry. In this 1690 * case we have an additional vnode and bucketlock pair to lock. If the 1691 * entry is negative, ncelock is locked instead of the vnode. 1692 * 1693 * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while 1694 * preserving the locking order (smaller address first). 1695 */ 1696 static void 1697 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1698 uint32_t hash) 1699 { 1700 struct namecache *ncp; 1701 struct rwlock *blps[2]; 1702 1703 blps[0] = HASH2BUCKETLOCK(hash); 1704 for (;;) { 1705 blps[1] = NULL; 1706 cache_lock_vnodes_cel(cel, dvp, vp); 1707 if (vp == NULL || vp->v_type != VDIR) 1708 break; 1709 ncp = vp->v_cache_dd; 1710 if (ncp == NULL) 1711 break; 1712 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1713 break; 1714 MPASS(ncp->nc_dvp == vp); 1715 blps[1] = NCP2BUCKETLOCK(ncp); 1716 if (ncp->nc_flag & NCF_NEGATIVE) 1717 break; 1718 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1719 break; 1720 /* 1721 * All vnodes got re-locked. Re-validate the state and if 1722 * nothing changed we are done. Otherwise restart. 1723 */ 1724 if (ncp == vp->v_cache_dd && 1725 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1726 blps[1] == NCP2BUCKETLOCK(ncp) && 1727 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1728 break; 1729 cache_unlock_vnodes_cel(cel); 1730 cel->vlp[0] = NULL; 1731 cel->vlp[1] = NULL; 1732 cel->vlp[2] = NULL; 1733 } 1734 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1735 } 1736 1737 static void 1738 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, 1739 uint32_t hash) 1740 { 1741 struct namecache *ncp; 1742 struct rwlock *blps[2]; 1743 1744 blps[0] = HASH2BUCKETLOCK(hash); 1745 for (;;) { 1746 blps[1] = NULL; 1747 cache_lock_vnodes_cel(cel, dvp, vp); 1748 ncp = dvp->v_cache_dd; 1749 if (ncp == NULL) 1750 break; 1751 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 1752 break; 1753 MPASS(ncp->nc_dvp == dvp); 1754 blps[1] = NCP2BUCKETLOCK(ncp); 1755 if (ncp->nc_flag & NCF_NEGATIVE) 1756 break; 1757 if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) 1758 break; 1759 if (ncp == dvp->v_cache_dd && 1760 (ncp->nc_flag & NCF_ISDOTDOT) != 0 && 1761 blps[1] == NCP2BUCKETLOCK(ncp) && 1762 VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) 1763 break; 1764 cache_unlock_vnodes_cel(cel); 1765 cel->vlp[0] = NULL; 1766 cel->vlp[1] = NULL; 1767 cel->vlp[2] = NULL; 1768 } 1769 cache_lock_buckets_cel(cel, blps[0], blps[1]); 1770 } 1771 1772 static void 1773 cache_enter_unlock(struct celockstate *cel) 1774 { 1775 1776 cache_unlock_buckets_cel(cel); 1777 cache_unlock_vnodes_cel(cel); 1778 } 1779 1780 static void __noinline 1781 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, 1782 struct componentname *cnp) 1783 { 1784 struct celockstate cel; 1785 struct namecache *ncp; 1786 uint32_t hash; 1787 int len; 1788 1789 if (dvp->v_cache_dd == NULL) 1790 return; 1791 len = cnp->cn_namelen; 1792 cache_celockstate_init(&cel); 1793 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1794 cache_enter_lock_dd(&cel, dvp, vp, hash); 1795 ncp = dvp->v_cache_dd; 1796 if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { 1797 KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); 1798 cache_zap_locked(ncp); 1799 } else { 1800 ncp = NULL; 1801 } 1802 dvp->v_cache_dd = NULL; 1803 cache_enter_unlock(&cel); 1804 cache_free(ncp); 1805 } 1806 1807 /* 1808 * Add an entry to the cache. 1809 */ 1810 void 1811 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, 1812 struct timespec *tsp, struct timespec *dtsp) 1813 { 1814 struct celockstate cel; 1815 struct namecache *ncp, *n2, *ndd; 1816 struct namecache_ts *ncp_ts, *n2_ts; 1817 struct nchashhead *ncpp; 1818 uint32_t hash; 1819 int flag; 1820 int len; 1821 u_long lnumcache; 1822 1823 CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); 1824 VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, 1825 ("cache_enter: Adding a doomed vnode")); 1826 VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, 1827 ("cache_enter: Doomed vnode used as src")); 1828 1829 #ifdef DEBUG_CACHE 1830 if (__predict_false(!doingcache)) 1831 return; 1832 #endif 1833 1834 flag = 0; 1835 if (__predict_false(cnp->cn_nameptr[0] == '.')) { 1836 if (cnp->cn_namelen == 1) 1837 return; 1838 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { 1839 cache_enter_dotdot_prep(dvp, vp, cnp); 1840 flag = NCF_ISDOTDOT; 1841 } 1842 } 1843 1844 /* 1845 * Avoid blowout in namecache entries. 1846 */ 1847 lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; 1848 if (__predict_false(lnumcache >= ncsize)) { 1849 atomic_add_long(&numcache, -1); 1850 counter_u64_add(numdrops, 1); 1851 return; 1852 } 1853 1854 cache_celockstate_init(&cel); 1855 ndd = NULL; 1856 ncp_ts = NULL; 1857 1858 /* 1859 * Calculate the hash key and setup as much of the new 1860 * namecache entry as possible before acquiring the lock. 1861 */ 1862 ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); 1863 ncp->nc_flag = flag; 1864 ncp->nc_vp = vp; 1865 if (vp == NULL) 1866 cache_negative_init(ncp); 1867 ncp->nc_dvp = dvp; 1868 if (tsp != NULL) { 1869 ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); 1870 ncp_ts->nc_time = *tsp; 1871 ncp_ts->nc_ticks = ticks; 1872 ncp_ts->nc_nc.nc_flag |= NCF_TS; 1873 if (dtsp != NULL) { 1874 ncp_ts->nc_dotdottime = *dtsp; 1875 ncp_ts->nc_nc.nc_flag |= NCF_DTS; 1876 } 1877 } 1878 len = ncp->nc_nlen = cnp->cn_namelen; 1879 hash = cache_get_hash(cnp->cn_nameptr, len, dvp); 1880 strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); 1881 cache_enter_lock(&cel, dvp, vp, hash); 1882 1883 /* 1884 * See if this vnode or negative entry is already in the cache 1885 * with this name. This can happen with concurrent lookups of 1886 * the same path name. 1887 */ 1888 ncpp = NCHHASH(hash); 1889 CK_LIST_FOREACH(n2, ncpp, nc_hash) { 1890 if (n2->nc_dvp == dvp && 1891 n2->nc_nlen == cnp->cn_namelen && 1892 !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { 1893 if (tsp != NULL) { 1894 KASSERT((n2->nc_flag & NCF_TS) != 0, 1895 ("no NCF_TS")); 1896 n2_ts = __containerof(n2, struct namecache_ts, nc_nc); 1897 n2_ts->nc_time = ncp_ts->nc_time; 1898 n2_ts->nc_ticks = ncp_ts->nc_ticks; 1899 if (dtsp != NULL) { 1900 n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; 1901 n2_ts->nc_nc.nc_flag |= NCF_DTS; 1902 } 1903 } 1904 goto out_unlock_free; 1905 } 1906 } 1907 1908 if (flag == NCF_ISDOTDOT) { 1909 /* 1910 * See if we are trying to add .. entry, but some other lookup 1911 * has populated v_cache_dd pointer already. 1912 */ 1913 if (dvp->v_cache_dd != NULL) 1914 goto out_unlock_free; 1915 KASSERT(vp == NULL || vp->v_type == VDIR, 1916 ("wrong vnode type %p", vp)); 1917 dvp->v_cache_dd = ncp; 1918 } 1919 1920 if (vp != NULL) { 1921 if (vp->v_type == VDIR) { 1922 if (flag != NCF_ISDOTDOT) { 1923 /* 1924 * For this case, the cache entry maps both the 1925 * directory name in it and the name ".." for the 1926 * directory's parent. 1927 */ 1928 if ((ndd = vp->v_cache_dd) != NULL) { 1929 if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) 1930 cache_zap_locked(ndd); 1931 else 1932 ndd = NULL; 1933 } 1934 vp->v_cache_dd = ncp; 1935 } 1936 } else { 1937 vp->v_cache_dd = NULL; 1938 } 1939 } 1940 1941 if (flag != NCF_ISDOTDOT) { 1942 if (LIST_EMPTY(&dvp->v_cache_src)) { 1943 vhold(dvp); 1944 counter_u64_add(numcachehv, 1); 1945 } 1946 LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); 1947 } 1948 1949 /* 1950 * If the entry is "negative", we place it into the 1951 * "negative" cache queue, otherwise, we place it into the 1952 * destination vnode's cache entries queue. 1953 */ 1954 if (vp != NULL) { 1955 TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); 1956 SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, 1957 vp); 1958 } else { 1959 if (cnp->cn_flags & ISWHITEOUT) 1960 ncp->nc_flag |= NCF_WHITE; 1961 cache_negative_insert(ncp); 1962 SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, 1963 ncp->nc_name); 1964 } 1965 1966 atomic_thread_fence_rel(); 1967 /* 1968 * Insert the new namecache entry into the appropriate chain 1969 * within the cache entries table. 1970 */ 1971 CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash); 1972 1973 cache_enter_unlock(&cel); 1974 if (numneg * ncnegfactor > lnumcache) 1975 cache_negative_zap_one(); 1976 cache_free(ndd); 1977 return; 1978 out_unlock_free: 1979 cache_enter_unlock(&cel); 1980 cache_free(ncp); 1981 return; 1982 } 1983 1984 static u_int 1985 cache_roundup_2(u_int val) 1986 { 1987 u_int res; 1988 1989 for (res = 1; res <= val; res <<= 1) 1990 continue; 1991 1992 return (res); 1993 } 1994 1995 /* 1996 * Name cache initialization, from vfs_init() when we are booting 1997 */ 1998 static void 1999 nchinit(void *dummy __unused) 2000 { 2001 u_int i; 2002 2003 cache_zone_small = uma_zcreate("S VFS Cache", 2004 sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, 2005 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 2006 UMA_ZONE_ZINIT); 2007 cache_zone_small_ts = uma_zcreate("STS VFS Cache", 2008 sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, 2009 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 2010 UMA_ZONE_ZINIT); 2011 cache_zone_large = uma_zcreate("L VFS Cache", 2012 sizeof(struct namecache) + NAME_MAX + 1, 2013 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), 2014 UMA_ZONE_ZINIT); 2015 cache_zone_large_ts = uma_zcreate("LTS VFS Cache", 2016 sizeof(struct namecache_ts) + NAME_MAX + 1, 2017 NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), 2018 UMA_ZONE_ZINIT); 2019 2020 VFS_SMR_ZONE_SET(cache_zone_small); 2021 VFS_SMR_ZONE_SET(cache_zone_small_ts); 2022 VFS_SMR_ZONE_SET(cache_zone_large); 2023 VFS_SMR_ZONE_SET(cache_zone_large_ts); 2024 2025 ncsize = desiredvnodes * ncsizefactor; 2026 nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); 2027 ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; 2028 if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ 2029 ncbuckethash = 7; 2030 if (ncbuckethash > nchash) 2031 ncbuckethash = nchash; 2032 bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, 2033 M_WAITOK | M_ZERO); 2034 for (i = 0; i < numbucketlocks; i++) 2035 rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); 2036 ncvnodehash = ncbuckethash; 2037 vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, 2038 M_WAITOK | M_ZERO); 2039 for (i = 0; i < numvnodelocks; i++) 2040 mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); 2041 ncpurgeminvnodes = numbucketlocks * 2; 2042 2043 ncneghash = 3; 2044 neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, 2045 M_WAITOK | M_ZERO); 2046 for (i = 0; i < numneglists; i++) { 2047 mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); 2048 TAILQ_INIT(&neglists[i].nl_list); 2049 } 2050 mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); 2051 TAILQ_INIT(&ncneg_hot.nl_list); 2052 2053 mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); 2054 } 2055 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); 2056 2057 void 2058 cache_changesize(u_long newmaxvnodes) 2059 { 2060 struct nchashhead *new_nchashtbl, *old_nchashtbl; 2061 u_long new_nchash, old_nchash; 2062 struct namecache *ncp; 2063 uint32_t hash; 2064 u_long newncsize; 2065 int i; 2066 2067 newncsize = newmaxvnodes * ncsizefactor; 2068 newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); 2069 if (newmaxvnodes < numbucketlocks) 2070 newmaxvnodes = numbucketlocks; 2071 2072 new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); 2073 /* If same hash table size, nothing to do */ 2074 if (nchash == new_nchash) { 2075 free(new_nchashtbl, M_VFSCACHE); 2076 return; 2077 } 2078 /* 2079 * Move everything from the old hash table to the new table. 2080 * None of the namecache entries in the table can be removed 2081 * because to do so, they have to be removed from the hash table. 2082 */ 2083 cache_lock_all_vnodes(); 2084 cache_lock_all_buckets(); 2085 old_nchashtbl = nchashtbl; 2086 old_nchash = nchash; 2087 nchashtbl = new_nchashtbl; 2088 nchash = new_nchash; 2089 for (i = 0; i <= old_nchash; i++) { 2090 while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) { 2091 hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, 2092 ncp->nc_dvp); 2093 CK_LIST_REMOVE(ncp, nc_hash); 2094 CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); 2095 } 2096 } 2097 ncsize = newncsize; 2098 cache_unlock_all_buckets(); 2099 cache_unlock_all_vnodes(); 2100 free(old_nchashtbl, M_VFSCACHE); 2101 } 2102 2103 /* 2104 * Invalidate all entries from and to a particular vnode. 2105 */ 2106 void 2107 cache_purge(struct vnode *vp) 2108 { 2109 TAILQ_HEAD(, namecache) ncps; 2110 struct namecache *ncp, *nnp; 2111 struct mtx *vlp, *vlp2; 2112 2113 CTR1(KTR_VFS, "cache_purge(%p)", vp); 2114 SDT_PROBE1(vfs, namecache, purge, done, vp); 2115 if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && 2116 vp->v_cache_dd == NULL) 2117 return; 2118 TAILQ_INIT(&ncps); 2119 vlp = VP2VNODELOCK(vp); 2120 vlp2 = NULL; 2121 mtx_lock(vlp); 2122 retry: 2123 while (!LIST_EMPTY(&vp->v_cache_src)) { 2124 ncp = LIST_FIRST(&vp->v_cache_src); 2125 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2126 goto retry; 2127 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2128 } 2129 while (!TAILQ_EMPTY(&vp->v_cache_dst)) { 2130 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2131 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2132 goto retry; 2133 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2134 } 2135 ncp = vp->v_cache_dd; 2136 if (ncp != NULL) { 2137 KASSERT(ncp->nc_flag & NCF_ISDOTDOT, 2138 ("lost dotdot link")); 2139 if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) 2140 goto retry; 2141 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2142 } 2143 KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); 2144 mtx_unlock(vlp); 2145 if (vlp2 != NULL) 2146 mtx_unlock(vlp2); 2147 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2148 cache_free(ncp); 2149 } 2150 } 2151 2152 /* 2153 * Invalidate all negative entries for a particular directory vnode. 2154 */ 2155 void 2156 cache_purge_negative(struct vnode *vp) 2157 { 2158 TAILQ_HEAD(, namecache) ncps; 2159 struct namecache *ncp, *nnp; 2160 struct mtx *vlp; 2161 2162 CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); 2163 SDT_PROBE1(vfs, namecache, purge_negative, done, vp); 2164 if (LIST_EMPTY(&vp->v_cache_src)) 2165 return; 2166 TAILQ_INIT(&ncps); 2167 vlp = VP2VNODELOCK(vp); 2168 mtx_lock(vlp); 2169 LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { 2170 if (!(ncp->nc_flag & NCF_NEGATIVE)) 2171 continue; 2172 cache_zap_negative_locked_vnode_kl(ncp, vp); 2173 TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); 2174 } 2175 mtx_unlock(vlp); 2176 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2177 cache_free(ncp); 2178 } 2179 } 2180 2181 /* 2182 * Flush all entries referencing a particular filesystem. 2183 */ 2184 void 2185 cache_purgevfs(struct mount *mp, bool force) 2186 { 2187 TAILQ_HEAD(, namecache) ncps; 2188 struct mtx *vlp1, *vlp2; 2189 struct rwlock *blp; 2190 struct nchashhead *bucket; 2191 struct namecache *ncp, *nnp; 2192 u_long i, j, n_nchash; 2193 int error; 2194 2195 /* Scan hash tables for applicable entries */ 2196 SDT_PROBE1(vfs, namecache, purgevfs, done, mp); 2197 if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) 2198 return; 2199 TAILQ_INIT(&ncps); 2200 n_nchash = nchash + 1; 2201 vlp1 = vlp2 = NULL; 2202 for (i = 0; i < numbucketlocks; i++) { 2203 blp = (struct rwlock *)&bucketlocks[i]; 2204 rw_wlock(blp); 2205 for (j = i; j < n_nchash; j += numbucketlocks) { 2206 retry: 2207 bucket = &nchashtbl[j]; 2208 CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { 2209 cache_assert_bucket_locked(ncp, RA_WLOCKED); 2210 if (ncp->nc_dvp->v_mount != mp) 2211 continue; 2212 error = cache_zap_wlocked_bucket_kl(ncp, blp, 2213 &vlp1, &vlp2); 2214 if (error != 0) 2215 goto retry; 2216 TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); 2217 } 2218 } 2219 rw_wunlock(blp); 2220 if (vlp1 == NULL && vlp2 == NULL) 2221 cache_maybe_yield(); 2222 } 2223 if (vlp1 != NULL) 2224 mtx_unlock(vlp1); 2225 if (vlp2 != NULL) 2226 mtx_unlock(vlp2); 2227 2228 TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { 2229 cache_free(ncp); 2230 } 2231 } 2232 2233 /* 2234 * Perform canonical checks and cache lookup and pass on to filesystem 2235 * through the vop_cachedlookup only if needed. 2236 */ 2237 2238 int 2239 vfs_cache_lookup(struct vop_lookup_args *ap) 2240 { 2241 struct vnode *dvp; 2242 int error; 2243 struct vnode **vpp = ap->a_vpp; 2244 struct componentname *cnp = ap->a_cnp; 2245 int flags = cnp->cn_flags; 2246 2247 *vpp = NULL; 2248 dvp = ap->a_dvp; 2249 2250 if (dvp->v_type != VDIR) 2251 return (ENOTDIR); 2252 2253 if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && 2254 (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) 2255 return (EROFS); 2256 2257 error = vn_dir_check_exec(dvp, cnp); 2258 if (error != 0) 2259 return (error); 2260 2261 error = cache_lookup(dvp, vpp, cnp, NULL, NULL); 2262 if (error == 0) 2263 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); 2264 if (error == -1) 2265 return (0); 2266 return (error); 2267 } 2268 2269 /* Implementation of the getcwd syscall. */ 2270 int 2271 sys___getcwd(struct thread *td, struct __getcwd_args *uap) 2272 { 2273 char *buf, *retbuf; 2274 size_t buflen; 2275 int error; 2276 2277 buflen = uap->buflen; 2278 if (__predict_false(buflen < 2)) 2279 return (EINVAL); 2280 if (buflen > MAXPATHLEN) 2281 buflen = MAXPATHLEN; 2282 2283 buf = malloc(buflen, M_TEMP, M_WAITOK); 2284 error = vn_getcwd(td, buf, &retbuf, &buflen); 2285 if (error == 0) 2286 error = copyout(retbuf, uap->buf, buflen); 2287 free(buf, M_TEMP); 2288 return (error); 2289 } 2290 2291 int 2292 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) 2293 { 2294 struct pwd *pwd; 2295 int error; 2296 2297 pwd = pwd_hold(td); 2298 error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); 2299 pwd_drop(pwd); 2300 2301 #ifdef KTRACE 2302 if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) 2303 ktrnamei(*retbuf); 2304 #endif 2305 return (error); 2306 } 2307 2308 static int 2309 kern___realpathat(struct thread *td, int fd, const char *path, char *buf, 2310 size_t size, int flags, enum uio_seg pathseg) 2311 { 2312 struct nameidata nd; 2313 char *retbuf, *freebuf; 2314 int error; 2315 2316 if (flags != 0) 2317 return (EINVAL); 2318 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, 2319 pathseg, path, fd, &cap_fstat_rights, td); 2320 if ((error = namei(&nd)) != 0) 2321 return (error); 2322 error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); 2323 if (error == 0) { 2324 error = copyout(retbuf, buf, size); 2325 free(freebuf, M_TEMP); 2326 } 2327 NDFREE(&nd, 0); 2328 return (error); 2329 } 2330 2331 int 2332 sys___realpathat(struct thread *td, struct __realpathat_args *uap) 2333 { 2334 2335 return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, 2336 uap->flags, UIO_USERSPACE)); 2337 } 2338 2339 /* 2340 * Retrieve the full filesystem path that correspond to a vnode from the name 2341 * cache (if available) 2342 */ 2343 int 2344 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) 2345 { 2346 struct pwd *pwd; 2347 char *buf; 2348 size_t buflen; 2349 int error; 2350 2351 if (__predict_false(vn == NULL)) 2352 return (EINVAL); 2353 2354 buflen = MAXPATHLEN; 2355 buf = malloc(buflen, M_TEMP, M_WAITOK); 2356 pwd = pwd_hold(td); 2357 error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); 2358 pwd_drop(pwd); 2359 2360 if (!error) 2361 *freebuf = buf; 2362 else 2363 free(buf, M_TEMP); 2364 return (error); 2365 } 2366 2367 /* 2368 * This function is similar to vn_fullpath, but it attempts to lookup the 2369 * pathname relative to the global root mount point. This is required for the 2370 * auditing sub-system, as audited pathnames must be absolute, relative to the 2371 * global root mount point. 2372 */ 2373 int 2374 vn_fullpath_global(struct thread *td, struct vnode *vn, 2375 char **retbuf, char **freebuf) 2376 { 2377 char *buf; 2378 size_t buflen; 2379 int error; 2380 2381 if (__predict_false(vn == NULL)) 2382 return (EINVAL); 2383 buflen = MAXPATHLEN; 2384 buf = malloc(buflen, M_TEMP, M_WAITOK); 2385 error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); 2386 if (!error) 2387 *freebuf = buf; 2388 else 2389 free(buf, M_TEMP); 2390 return (error); 2391 } 2392 2393 int 2394 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) 2395 { 2396 struct vnode *dvp; 2397 struct namecache *ncp; 2398 struct mtx *vlp; 2399 int error; 2400 2401 vlp = VP2VNODELOCK(*vp); 2402 mtx_lock(vlp); 2403 TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { 2404 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2405 break; 2406 } 2407 if (ncp != NULL) { 2408 if (*buflen < ncp->nc_nlen) { 2409 mtx_unlock(vlp); 2410 vrele(*vp); 2411 counter_u64_add(numfullpathfail4, 1); 2412 error = ENOMEM; 2413 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2414 vp, NULL); 2415 return (error); 2416 } 2417 *buflen -= ncp->nc_nlen; 2418 memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); 2419 SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, 2420 ncp->nc_name, vp); 2421 dvp = *vp; 2422 *vp = ncp->nc_dvp; 2423 vref(*vp); 2424 mtx_unlock(vlp); 2425 vrele(dvp); 2426 return (0); 2427 } 2428 SDT_PROBE1(vfs, namecache, fullpath, miss, vp); 2429 2430 mtx_unlock(vlp); 2431 vn_lock(*vp, LK_SHARED | LK_RETRY); 2432 error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); 2433 vput(*vp); 2434 if (error) { 2435 counter_u64_add(numfullpathfail2, 1); 2436 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2437 return (error); 2438 } 2439 2440 *vp = dvp; 2441 if (VN_IS_DOOMED(dvp)) { 2442 /* forced unmount */ 2443 vrele(dvp); 2444 error = ENOENT; 2445 SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); 2446 return (error); 2447 } 2448 /* 2449 * *vp has its use count incremented still. 2450 */ 2451 2452 return (0); 2453 } 2454 2455 /* 2456 * Resolve a directory to a pathname. 2457 * 2458 * The name of the directory can always be found in the namecache or fetched 2459 * from the filesystem. There is also guaranteed to be only one parent, meaning 2460 * we can just follow vnodes up until we find the root. 2461 * 2462 * The vnode must be referenced. 2463 */ 2464 static int 2465 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, 2466 char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) 2467 { 2468 #ifdef KDTRACE_HOOKS 2469 struct vnode *startvp = vp; 2470 #endif 2471 struct vnode *vp1; 2472 size_t buflen; 2473 int error; 2474 2475 VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); 2476 VNPASS(vp->v_usecount > 0, vp); 2477 2478 buflen = *len; 2479 2480 if (!slash_prefixed) { 2481 MPASS(*len >= 2); 2482 buflen--; 2483 buf[buflen] = '\0'; 2484 } 2485 2486 error = 0; 2487 2488 SDT_PROBE1(vfs, namecache, fullpath, entry, vp); 2489 counter_u64_add(numfullpathcalls, 1); 2490 while (vp != rdir && vp != rootvnode) { 2491 /* 2492 * The vp vnode must be already fully constructed, 2493 * since it is either found in namecache or obtained 2494 * from VOP_VPTOCNP(). We may test for VV_ROOT safely 2495 * without obtaining the vnode lock. 2496 */ 2497 if ((vp->v_vflag & VV_ROOT) != 0) { 2498 vn_lock(vp, LK_RETRY | LK_SHARED); 2499 2500 /* 2501 * With the vnode locked, check for races with 2502 * unmount, forced or not. Note that we 2503 * already verified that vp is not equal to 2504 * the root vnode, which means that 2505 * mnt_vnodecovered can be NULL only for the 2506 * case of unmount. 2507 */ 2508 if (VN_IS_DOOMED(vp) || 2509 (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || 2510 vp1->v_mountedhere != vp->v_mount) { 2511 vput(vp); 2512 error = ENOENT; 2513 SDT_PROBE3(vfs, namecache, fullpath, return, 2514 error, vp, NULL); 2515 break; 2516 } 2517 2518 vref(vp1); 2519 vput(vp); 2520 vp = vp1; 2521 continue; 2522 } 2523 if (vp->v_type != VDIR) { 2524 vrele(vp); 2525 counter_u64_add(numfullpathfail1, 1); 2526 error = ENOTDIR; 2527 SDT_PROBE3(vfs, namecache, fullpath, return, 2528 error, vp, NULL); 2529 break; 2530 } 2531 error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); 2532 if (error) 2533 break; 2534 if (buflen == 0) { 2535 vrele(vp); 2536 error = ENOMEM; 2537 SDT_PROBE3(vfs, namecache, fullpath, return, error, 2538 startvp, NULL); 2539 break; 2540 } 2541 buf[--buflen] = '/'; 2542 slash_prefixed = true; 2543 } 2544 if (error) 2545 return (error); 2546 if (!slash_prefixed) { 2547 if (buflen == 0) { 2548 vrele(vp); 2549 counter_u64_add(numfullpathfail4, 1); 2550 SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, 2551 startvp, NULL); 2552 return (ENOMEM); 2553 } 2554 buf[--buflen] = '/'; 2555 } 2556 counter_u64_add(numfullpathfound, 1); 2557 vrele(vp); 2558 2559 *retbuf = buf + buflen; 2560 SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); 2561 *len -= buflen; 2562 *len += addend; 2563 return (0); 2564 } 2565 2566 /* 2567 * Resolve an arbitrary vnode to a pathname. 2568 * 2569 * Note 2 caveats: 2570 * - hardlinks are not tracked, thus if the vnode is not a directory this can 2571 * resolve to a different path than the one used to find it 2572 * - namecache is not mandatory, meaning names are not guaranteed to be added 2573 * (in which case resolving fails) 2574 */ 2575 static int 2576 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, 2577 char *buf, char **retbuf, size_t *buflen) 2578 { 2579 size_t orig_buflen; 2580 bool slash_prefixed; 2581 int error; 2582 2583 if (*buflen < 2) 2584 return (EINVAL); 2585 2586 orig_buflen = *buflen; 2587 2588 vref(vp); 2589 slash_prefixed = false; 2590 if (vp->v_type != VDIR) { 2591 *buflen -= 1; 2592 buf[*buflen] = '\0'; 2593 error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); 2594 if (error) 2595 return (error); 2596 if (*buflen == 0) { 2597 vrele(vp); 2598 return (ENOMEM); 2599 } 2600 *buflen -= 1; 2601 buf[*buflen] = '/'; 2602 slash_prefixed = true; 2603 } 2604 2605 return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, 2606 orig_buflen - *buflen)); 2607 } 2608 2609 /* 2610 * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). 2611 * 2612 * Since the namecache does not track handlings, the caller is expected to first 2613 * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. 2614 * 2615 * Then we have 2 cases: 2616 * - if the found vnode is a directory, the path can be constructed just by 2617 * fullowing names up the chain 2618 * - otherwise we populate the buffer with the saved name and start resolving 2619 * from the parent 2620 */ 2621 static int 2622 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, 2623 char **freebuf, size_t *buflen) 2624 { 2625 char *buf, *tmpbuf; 2626 struct pwd *pwd; 2627 struct componentname *cnp; 2628 struct vnode *vp; 2629 size_t addend; 2630 int error; 2631 bool slash_prefixed; 2632 2633 if (*buflen < 2) 2634 return (EINVAL); 2635 if (*buflen > MAXPATHLEN) 2636 *buflen = MAXPATHLEN; 2637 2638 slash_prefixed = false; 2639 2640 buf = malloc(*buflen, M_TEMP, M_WAITOK); 2641 pwd = pwd_hold(td); 2642 2643 addend = 0; 2644 vp = ndp->ni_vp; 2645 if (vp->v_type != VDIR) { 2646 cnp = &ndp->ni_cnd; 2647 addend = cnp->cn_namelen + 2; 2648 if (*buflen < addend) { 2649 error = ENOMEM; 2650 goto out_bad; 2651 } 2652 *buflen -= addend; 2653 tmpbuf = buf + *buflen; 2654 tmpbuf[0] = '/'; 2655 memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); 2656 tmpbuf[addend - 1] = '\0'; 2657 slash_prefixed = true; 2658 vp = ndp->ni_dvp; 2659 } 2660 2661 vref(vp); 2662 error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, 2663 slash_prefixed, addend); 2664 if (error != 0) 2665 goto out_bad; 2666 2667 pwd_drop(pwd); 2668 *freebuf = buf; 2669 2670 return (0); 2671 out_bad: 2672 pwd_drop(pwd); 2673 free(buf, M_TEMP); 2674 return (error); 2675 } 2676 2677 struct vnode * 2678 vn_dir_dd_ino(struct vnode *vp) 2679 { 2680 struct namecache *ncp; 2681 struct vnode *ddvp; 2682 struct mtx *vlp; 2683 enum vgetstate vs; 2684 2685 ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); 2686 vlp = VP2VNODELOCK(vp); 2687 mtx_lock(vlp); 2688 TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { 2689 if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) 2690 continue; 2691 ddvp = ncp->nc_dvp; 2692 vs = vget_prep(ddvp); 2693 mtx_unlock(vlp); 2694 if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) 2695 return (NULL); 2696 return (ddvp); 2697 } 2698 mtx_unlock(vlp); 2699 return (NULL); 2700 } 2701 2702 int 2703 vn_commname(struct vnode *vp, char *buf, u_int buflen) 2704 { 2705 struct namecache *ncp; 2706 struct mtx *vlp; 2707 int l; 2708 2709 vlp = VP2VNODELOCK(vp); 2710 mtx_lock(vlp); 2711 TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) 2712 if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) 2713 break; 2714 if (ncp == NULL) { 2715 mtx_unlock(vlp); 2716 return (ENOENT); 2717 } 2718 l = min(ncp->nc_nlen, buflen - 1); 2719 memcpy(buf, ncp->nc_name, l); 2720 mtx_unlock(vlp); 2721 buf[l] = '\0'; 2722 return (0); 2723 } 2724 2725 /* 2726 * This function updates path string to vnode's full global path 2727 * and checks the size of the new path string against the pathlen argument. 2728 * 2729 * Requires a locked, referenced vnode. 2730 * Vnode is re-locked on success or ENODEV, otherwise unlocked. 2731 * 2732 * If vp is a directory, the call to vn_fullpath_global() always succeeds 2733 * because it falls back to the ".." lookup if the namecache lookup fails. 2734 */ 2735 int 2736 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, 2737 u_int pathlen) 2738 { 2739 struct nameidata nd; 2740 struct vnode *vp1; 2741 char *rpath, *fbuf; 2742 int error; 2743 2744 ASSERT_VOP_ELOCKED(vp, __func__); 2745 2746 /* Construct global filesystem path from vp. */ 2747 VOP_UNLOCK(vp); 2748 error = vn_fullpath_global(td, vp, &rpath, &fbuf); 2749 2750 if (error != 0) { 2751 vrele(vp); 2752 return (error); 2753 } 2754 2755 if (strlen(rpath) >= pathlen) { 2756 vrele(vp); 2757 error = ENAMETOOLONG; 2758 goto out; 2759 } 2760 2761 /* 2762 * Re-lookup the vnode by path to detect a possible rename. 2763 * As a side effect, the vnode is relocked. 2764 * If vnode was renamed, return ENOENT. 2765 */ 2766 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 2767 UIO_SYSSPACE, path, td); 2768 error = namei(&nd); 2769 if (error != 0) { 2770 vrele(vp); 2771 goto out; 2772 } 2773 NDFREE(&nd, NDF_ONLY_PNBUF); 2774 vp1 = nd.ni_vp; 2775 vrele(vp); 2776 if (vp1 == vp) 2777 strcpy(path, rpath); 2778 else { 2779 vput(vp1); 2780 error = ENOENT; 2781 } 2782 2783 out: 2784 free(fbuf, M_TEMP); 2785 return (error); 2786 } 2787 2788 #ifdef DDB 2789 static void 2790 db_print_vpath(struct vnode *vp) 2791 { 2792 2793 while (vp != NULL) { 2794 db_printf("%p: ", vp); 2795 if (vp == rootvnode) { 2796 db_printf("/"); 2797 vp = NULL; 2798 } else { 2799 if (vp->v_vflag & VV_ROOT) { 2800 db_printf("<mount point>"); 2801 vp = vp->v_mount->mnt_vnodecovered; 2802 } else { 2803 struct namecache *ncp; 2804 char *ncn; 2805 int i; 2806 2807 ncp = TAILQ_FIRST(&vp->v_cache_dst); 2808 if (ncp != NULL) { 2809 ncn = ncp->nc_name; 2810 for (i = 0; i < ncp->nc_nlen; i++) 2811 db_printf("%c", *ncn++); 2812 vp = ncp->nc_dvp; 2813 } else { 2814 vp = NULL; 2815 } 2816 } 2817 } 2818 db_printf("\n"); 2819 } 2820 2821 return; 2822 } 2823 2824 DB_SHOW_COMMAND(vpath, db_show_vpath) 2825 { 2826 struct vnode *vp; 2827 2828 if (!have_addr) { 2829 db_printf("usage: show vpath <struct vnode *>\n"); 2830 return; 2831 } 2832 2833 vp = (struct vnode *)addr; 2834 db_print_vpath(vp); 2835 } 2836 2837 #endif 2838