/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
63 */ 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/uio.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/mount.h> 71 #include <sys/vnode.h> 72 #include <sys/malloc.h> 73 #include <sys/sysmsg.h> 74 #include <sys/spinlock.h> 75 #include <sys/proc.h> 76 #include <sys/nlookup.h> 77 #include <sys/filedesc.h> 78 #include <sys/fnv_hash.h> 79 #include <sys/globaldata.h> 80 #include <sys/kern_syscall.h> 81 #include <sys/dirent.h> 82 #include <ddb/ddb.h> 83 84 #include <sys/spinlock2.h> 85 86 #define MAX_RECURSION_DEPTH 64 87 88 /* 89 * Random lookups in the cache are accomplished with a hash table using 90 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock, 91 * but we use the ncp->update counter trick to avoid acquiring any 92 * contestable spin-locks during a lookup. 93 * 94 * Negative entries may exist and correspond to resolved namecache 95 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT 96 * will be set if the entry corresponds to a whited-out directory entry 97 * (verses simply not finding the entry at all). pcpu_ncache[n].neg_list 98 * is locked via pcpu_ncache[n].neg_spin; 99 * 100 * MPSAFE RULES: 101 * 102 * (1) ncp's typically have at least a nc_refs of 1, and usually 2. One 103 * is applicable to direct lookups via the hash table nchpp or via 104 * nc_list (the two are added or removed together). Removal of the ncp 105 * from the hash table drops this reference. The second is applicable 106 * to vp->v_namecache linkages (or negative list linkages), and removal 107 * of the ncp from these lists drops this reference. 108 * 109 * On the 1->0 transition of nc_refs the ncp can no longer be referenced 110 * and must be destroyed. No other thread should have access to it at 111 * this point so it can be safely locked and freed without any deadlock 112 * fears. 113 * 114 * The 1->0 transition can occur at almost any juncture and so cache_drop() 115 * deals with it directly. 116 * 117 * (2) Once the 1->0 transition occurs, the entity that caused the transition 118 * will be responsible for destroying the ncp. The ncp cannot be on any 119 * list or hash at this time, or be held by anyone other than the caller 120 * responsible for the transition. 121 * 122 * (3) A ncp must be locked in order to modify it. 123 * 124 * (5) ncp locks are ordered, child-to-parent. Child first, then parent. 125 * This may seem backwards but forward-scans use the hash table and thus 126 * can hold the parent unlocked while traversing downward. Deletions, 127 * on the other-hand, tend to propagate bottom-up since the ref on the 128 * is dropped as the children go away. 129 * 130 * (6) Both parent and child must be locked in order to enter the child onto 131 * the parent's nc_list. 132 */ 133 134 /* 135 * Structures associated with name cacheing. 136 */ 137 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash]) 138 #define MINNEG 1024 139 #define MINPOS 1024 140 #define NCMOUNT_NUMCACHE (16384) /* power of 2 */ 141 #define NCMOUNT_SET (8) /* power of 2 */ 142 143 MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache), 144 "namecache", "namecache entries"); 145 MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings"); 146 147 TAILQ_HEAD(nchash_list, namecache); 148 149 /* 150 * Don't cachealign, but at least pad to 32 bytes so entries 151 * don't cross a cache line. 
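 *
 * A rough size check (illustrative, assuming 64-bit pointers): the TAILQ
 * head is two pointers (16 bytes) and the spinlock and pad are 8 bytes
 * each, so
 *
 *	sizeof(struct nchash_head) == 16 + 8 + 8 == 32
 *
 * and exactly two hash heads share each 64-byte cache line without any
 * entry straddling a line boundary.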
152 */ 153 struct nchash_head { 154 struct nchash_list list; /* 16 bytes */ 155 struct spinlock spin; /* 8 bytes */ 156 long pad01; /* 8 bytes */ 157 }; 158 159 struct ncmount_cache { 160 struct spinlock spin; 161 struct namecache *ncp; 162 struct mount *mp; 163 struct mount *mp_target; 164 int isneg; 165 int ticks; 166 int updating; 167 int unused01; 168 }; 169 170 struct pcpu_ncache { 171 struct spinlock umount_spin; /* cache_findmount/interlock */ 172 struct spinlock neg_spin; /* for neg_list and neg_count */ 173 struct namecache_list neg_list; 174 long neg_count; 175 long vfscache_negs; 176 long vfscache_count; 177 long vfscache_leafs; 178 long vfscache_unres; 179 long numdefered; 180 long inv_kid_quick_count; 181 long inv_ncp_quick_count; 182 long clean_pos_count; 183 long clean_neg_count; 184 } __cachealign; 185 186 __read_mostly static struct nchash_head *nchashtbl; 187 __read_mostly static struct pcpu_ncache *pcpu_ncache; 188 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 189 190 /* 191 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 192 * to create the namecache infrastructure leading to a dangling vnode. 193 * 194 * 0 Only errors are reported 195 * 1 Successes are reported 196 * 2 Successes + the whole directory scan is reported 197 * 3 Force the directory scan code run as if the parent vnode did not 198 * have a namecache record, even if it does have one. 199 */ 200 __read_mostly int ncvp_debug; 201 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 202 "Namecache debug level (0-3)"); 203 204 __read_mostly static u_long nchash; /* size of hash table */ 205 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 206 "Size of namecache hash table"); 207 208 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 209 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 210 "Batch flush negative entries"); 211 212 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 213 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 214 "Batch flush positive entries"); 215 216 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 217 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 218 "Ratio of negative namecache entries"); 219 220 __read_mostly static int ncposfactor = 16; /* ratio of unres+leaf entries */ 221 SYSCTL_INT(_debug, OID_AUTO, ncposfactor, CTLFLAG_RW, &ncposfactor, 0, 222 "Ratio of unresolved leaf namecache entries"); 223 224 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 225 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 226 "Warn on locked namecache entries in ticks"); 227 228 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 229 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 230 "Number of cache entries allocated"); 231 232 __read_mostly static int ncp_shared_lock_disable = 0; 233 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 234 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 235 236 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 237 "sizeof(struct vnode)"); 238 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 239 "sizeof(struct namecache)"); 240 241 __read_mostly static int ncmount_cache_enable = 1; 242 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 243 &ncmount_cache_enable, 0, "mount point cache"); 244 
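/*
 * Illustrative sketch (not part of this file): the knobs above are plain
 * sysctl variables, so they can be inspected from userland with
 * sysctlbyname(3), e.g.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int factor;
 *	size_t len = sizeof(factor);
 *
 *	if (sysctlbyname("debug.ncnegfactor", &factor, &len, NULL, 0) == 0)
 *		printf("negative entry ratio 1/%d\n", factor);
 *
 * which is equivalent to running "sysctl debug.ncnegfactor" from a shell.
 */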
245 static __inline void _cache_drop(struct namecache *ncp); 246 static int cache_resolve_mp(struct mount *mp, int adjgen); 247 static int cache_findmount_callback(struct mount *mp, void *data); 248 static void _cache_setunresolved(struct namecache *ncp, int adjgen); 249 static void _cache_cleanneg(long count); 250 static void _cache_cleanpos(long ucount, long xcount); 251 static void _cache_cleandefered(void); 252 static void _cache_unlink(struct namecache *ncp); 253 254 /* 255 * The new name cache statistics (these are rolled up globals and not 256 * modified in the critical path, see struct pcpu_ncache). 257 */ 258 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 259 static long vfscache_negs; 260 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0, 261 "Number of negative namecache entries"); 262 static long vfscache_count; 263 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0, 264 "Number of namecaches entries"); 265 static long vfscache_leafs; 266 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0, 267 "Number of leaf namecaches entries"); 268 static long vfscache_unres; 269 SYSCTL_LONG(_vfs_cache, OID_AUTO, numunres, CTLFLAG_RD, &vfscache_unres, 0, 270 "Number of unresolved leaf namecaches entries"); 271 272 static long inv_kid_quick_count; 273 SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_kid_quick_count, CTLFLAG_RD, 274 &inv_kid_quick_count, 0, 275 "quick kid invalidations"); 276 static long inv_ncp_quick_count; 277 SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_ncp_quick_count, CTLFLAG_RD, 278 &inv_ncp_quick_count, 0, 279 "quick ncp invalidations"); 280 static long clean_pos_count; 281 SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_pos_count, CTLFLAG_RD, 282 &clean_pos_count, 0, 283 "positive ncp cleanings"); 284 static long clean_neg_count; 285 SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_neg_count, CTLFLAG_RD, 286 &clean_neg_count, 0, 287 "negative ncp cleanings"); 288 289 static long numdefered; 290 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 291 "Number of cache entries allocated"); 292 293 /* 294 * Returns the number of basic references expected on the ncp, not 295 * including any children. 1 for the natural ref, and an addition ref 296 * if the ncp is resolved (representing a positive or negative hit). 297 */ 298 static __inline int 299 ncpbaserefs(struct namecache *ncp) 300 { 301 return (1 + ((ncp->nc_flag & NCF_UNRESOLVED) == 0)); 302 } 303 304 struct nchstats nchstats[SMP_MAXCPU]; 305 /* 306 * Export VFS cache effectiveness statistics to user-land. 307 * 308 * The statistics are left for aggregation to user-land so 309 * neat things can be achieved, like observing per-CPU cache 310 * distribution. 311 */ 312 static int 313 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 314 { 315 struct globaldata *gd; 316 int i, error; 317 318 error = 0; 319 for (i = 0; i < ncpus; ++i) { 320 gd = globaldata_find(i); 321 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 322 sizeof(struct nchstats)))) 323 break; 324 } 325 326 return (error); 327 } 328 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 329 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 330 331 static int cache_zap(struct namecache *ncp); 332 333 /* 334 * Cache mount points and namecache records in order to avoid unnecessary 335 * atomic ops on mnt_refs and ncp->refs. 
This improves concurrent SMP 336 * performance and is particularly important on multi-socket systems to 337 * reduce cache-line ping-ponging. 338 * 339 * Try to keep the pcpu structure within one cache line (~64 bytes). 340 */ 341 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 342 #define MNTCACHE_SET 8 /* set associativity */ 343 344 struct mntcache_elm { 345 struct namecache *ncp; 346 struct mount *mp; 347 int ticks; 348 int unused01; 349 }; 350 351 struct mntcache { 352 struct mntcache_elm array[MNTCACHE_COUNT]; 353 } __cachealign; 354 355 static struct mntcache pcpu_mntcache[MAXCPU]; 356 357 static __inline 358 void 359 _cache_ncp_gen_enter(struct namecache *ncp) 360 { 361 ncp->nc_generation += 2; 362 cpu_sfence(); 363 } 364 365 static __inline 366 void 367 _cache_ncp_gen_exit(struct namecache *ncp) 368 { 369 cpu_sfence(); 370 ncp->nc_generation += 2; 371 cpu_sfence(); 372 } 373 374 static __inline 375 struct mntcache_elm * 376 _cache_mntcache_hash(void *ptr) 377 { 378 struct mntcache_elm *elm; 379 int hv; 380 381 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 382 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 383 384 return elm; 385 } 386 387 static 388 void 389 _cache_mntref(struct mount *mp) 390 { 391 struct mntcache_elm *elm; 392 struct mount *mpr; 393 int i; 394 395 elm = _cache_mntcache_hash(mp); 396 for (i = 0; i < MNTCACHE_SET; ++i) { 397 if (elm->mp == mp) { 398 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 399 if (__predict_true(mpr == mp)) 400 return; 401 if (mpr) 402 atomic_add_int(&mpr->mnt_refs, -1); 403 } 404 ++elm; 405 } 406 atomic_add_int(&mp->mnt_refs, 1); 407 } 408 409 static 410 void 411 _cache_mntrel(struct mount *mp) 412 { 413 struct mntcache_elm *elm; 414 struct mntcache_elm *best; 415 struct mount *mpr; 416 int delta1; 417 int delta2; 418 int i; 419 420 elm = _cache_mntcache_hash(mp); 421 best = elm; 422 for (i = 0; i < MNTCACHE_SET; ++i) { 423 if (elm->mp == NULL) { 424 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 425 if (__predict_false(mpr != NULL)) { 426 atomic_add_int(&mpr->mnt_refs, -1); 427 } 428 elm->ticks = ticks; 429 return; 430 } 431 delta1 = ticks - best->ticks; 432 delta2 = ticks - elm->ticks; 433 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 434 best = elm; 435 ++elm; 436 } 437 mpr = atomic_swap_ptr((void *)&best->mp, mp); 438 best->ticks = ticks; 439 if (mpr) 440 atomic_add_int(&mpr->mnt_refs, -1); 441 } 442 443 /* 444 * Clears all cached mount points on all cpus. This routine should only 445 * be called when we are waiting for a mount to clear, e.g. so we can 446 * unmount. 447 */ 448 void 449 cache_clearmntcache(struct mount *target __unused) 450 { 451 int n; 452 453 for (n = 0; n < ncpus; ++n) { 454 struct mntcache *cache = &pcpu_mntcache[n]; 455 struct mntcache_elm *elm; 456 struct namecache *ncp; 457 struct mount *mp; 458 int i; 459 460 for (i = 0; i < MNTCACHE_COUNT; ++i) { 461 elm = &cache->array[i]; 462 if (elm->mp) { 463 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 464 if (mp) 465 atomic_add_int(&mp->mnt_refs, -1); 466 } 467 if (elm->ncp) { 468 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 469 if (ncp) 470 _cache_drop(ncp); 471 } 472 } 473 } 474 } 475 476 /* 477 * Namespace locking. The caller must already hold a reference to the 478 * namecache structure in order to lock/unlock it. The controlling entity 479 * in a 1->0 transition does not need to lock the ncp to dispose of it, 480 * as nobody else will have visibility to it at that point. 
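 *
 * As a sketch of the usual sequence (the cache_get()/cache_put() wrappers
 * further below bundle these steps), a caller with a stable ncp pointer
 * takes its reference before the lock and releases in the reverse order:
 *
 *	_cache_hold(ncp);		ref first, then lock
 *	_cache_lock(ncp);
 *	... examine or modify the entry ...
 *	_cache_unlock(ncp);
 *	_cache_drop(ncp);		may be the final 1->0 drop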
481 * 482 * Note that holding a locked namecache structure prevents other threads 483 * from making namespace changes (e.g. deleting or creating), prevents 484 * vnode association state changes by other threads, and prevents the 485 * namecache entry from being resolved or unresolved by other threads. 486 * 487 * An exclusive lock owner has full authority to associate/disassociate 488 * vnodes and resolve/unresolve the locked ncp. 489 * 490 * A shared lock owner only has authority to acquire the underlying vnode, 491 * if any. 492 * 493 * The primary lock field is nc_lockstatus. nc_locktd is set after the 494 * fact (when locking) or cleared prior to unlocking. 495 * 496 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 497 * or recycled, but it does NOT help you if the vnode had already 498 * initiated a recyclement. If this is important, use cache_get() 499 * rather then cache_lock() (and deal with the differences in the 500 * way the refs counter is handled). Or, alternatively, make an 501 * unconditional call to cache_validate() or cache_resolve() 502 * after cache_lock() returns. 503 */ 504 static __inline 505 void 506 _cache_lock(struct namecache *ncp) 507 { 508 int didwarn = 0; 509 int error; 510 511 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 512 while (__predict_false(error == EWOULDBLOCK)) { 513 if (didwarn == 0) { 514 didwarn = ticks - nclockwarn; 515 kprintf("[diagnostic] cache_lock: " 516 "%s blocked on %p " 517 "\"%*.*s\"\n", 518 curthread->td_comm, ncp, 519 ncp->nc_nlen, ncp->nc_nlen, 520 ncp->nc_name); 521 } 522 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK); 523 } 524 if (__predict_false(didwarn)) { 525 kprintf("[diagnostic] cache_lock: " 526 "%s unblocked %*.*s after %d secs\n", 527 curthread->td_comm, 528 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 529 (int)(ticks - didwarn) / hz); 530 } 531 } 532 533 /* 534 * Release a previously acquired lock. 535 * 536 * A concurrent shared-lock acquisition or acquisition/release can 537 * race bit 31 so only drop the ncp if bit 31 was set. 538 */ 539 static __inline 540 void 541 _cache_unlock(struct namecache *ncp) 542 { 543 lockmgr(&ncp->nc_lock, LK_RELEASE); 544 } 545 546 /* 547 * Lock ncp exclusively, non-blocking. Return 0 on success. 548 */ 549 static __inline 550 int 551 _cache_lock_nonblock(struct namecache *ncp) 552 { 553 int error; 554 555 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 556 if (__predict_false(error != 0)) { 557 return(EWOULDBLOCK); 558 } 559 return 0; 560 } 561 562 /* 563 * This is a special form of _cache_lock() which only succeeds if 564 * it can get a pristine, non-recursive lock. The caller must have 565 * already ref'd the ncp. 566 * 567 * On success the ncp will be locked, on failure it will not. The 568 * ref count does not change either way. 569 * 570 * We want _cache_lock_special() (on success) to return a definitively 571 * usable vnode or a definitively unresolved ncp. 572 */ 573 static __inline 574 int 575 _cache_lock_special(struct namecache *ncp) 576 { 577 if (_cache_lock_nonblock(ncp) == 0) { 578 if (lockmgr_oneexcl(&ncp->nc_lock)) { 579 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 580 _cache_setunresolved(ncp, 1); 581 return 0; 582 } 583 _cache_unlock(ncp); 584 } 585 return EWOULDBLOCK; 586 } 587 588 /* 589 * Shared lock, guarantees vp held 590 * 591 * The shared lock holds vp on the 0->1 transition. It is possible to race 592 * another shared lock release, preventing the other release from dropping 593 * the vnode and clearing bit 31. 
594 * 595 * If it is not set then we are responsible for setting it, and this 596 * responsibility does not race with anyone else. 597 */ 598 static __inline 599 void 600 _cache_lock_shared(struct namecache *ncp) 601 { 602 int didwarn = 0; 603 int error; 604 605 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 606 while (__predict_false(error == EWOULDBLOCK)) { 607 if (didwarn == 0) { 608 didwarn = ticks - nclockwarn; 609 kprintf("[diagnostic] cache_lock_shared: " 610 "%s blocked on %p " 611 "\"%*.*s\"\n", 612 curthread->td_comm, ncp, 613 ncp->nc_nlen, ncp->nc_nlen, 614 ncp->nc_name); 615 } 616 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 617 } 618 if (__predict_false(didwarn)) { 619 kprintf("[diagnostic] cache_lock_shared: " 620 "%s unblocked %*.*s after %d secs\n", 621 curthread->td_comm, 622 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 623 (int)(ticks - didwarn) / hz); 624 } 625 } 626 627 /* 628 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 629 */ 630 static __inline 631 int 632 _cache_lock_shared_nonblock(struct namecache *ncp) 633 { 634 int error; 635 636 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 637 if (__predict_false(error != 0)) { 638 return(EWOULDBLOCK); 639 } 640 return 0; 641 } 642 643 /* 644 * This function tries to get a shared lock but will back-off to an 645 * exclusive lock if: 646 * 647 * (1) Some other thread is trying to obtain an exclusive lock 648 * (to prevent the exclusive requester from getting livelocked out 649 * by many shared locks). 650 * 651 * (2) The current thread already owns an exclusive lock (to avoid 652 * deadlocking). 653 * 654 * WARNING! On machines with lots of cores we really want to try hard to 655 * get a shared lock or concurrent path lookups can chain-react 656 * into a very high-latency exclusive lock. 657 * 658 * This is very evident in dsynth's initial scans. 659 */ 660 static __inline 661 int 662 _cache_lock_shared_special(struct namecache *ncp) 663 { 664 /* 665 * Only honor a successful shared lock (returning 0) if there is 666 * no exclusive request pending and the vnode, if present, is not 667 * in a reclaimed state. 668 */ 669 if (_cache_lock_shared_nonblock(ncp) == 0) { 670 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 671 if (ncp->nc_vp == NULL || 672 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 673 return(0); 674 } 675 } 676 _cache_unlock(ncp); 677 return(EWOULDBLOCK); 678 } 679 680 /* 681 * Non-blocking shared lock failed. If we already own the exclusive 682 * lock just acquire another exclusive lock (instead of deadlocking). 683 * Otherwise acquire a shared lock. 684 */ 685 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 686 _cache_lock(ncp); 687 return(0); 688 } 689 _cache_lock_shared(ncp); 690 return(0); 691 } 692 693 /* 694 * Returns: 695 * -1 Locked by other 696 * 0 Not locked 697 * (v) LK_SHARED or LK_EXCLUSIVE 698 */ 699 static __inline 700 int 701 _cache_lockstatus(struct namecache *ncp) 702 { 703 int status; 704 705 status = lockstatus(&ncp->nc_lock, curthread); 706 if (status == LK_EXCLOTHER) 707 status = -1; 708 return status; 709 } 710 711 /* 712 * cache_hold() and cache_drop() prevent the premature deletion of a 713 * namecache entry but do not prevent operations (such as zapping) on 714 * that namecache entry. 715 * 716 * This routine may only be called from outside this source module if 717 * nc_refs is already deterministically at least 1, such as being 718 * associated with e.g. a process, file descriptor, or some other entity. 
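 *
 * For example (illustrative only, the descriptor-table field names here
 * are assumptions): a caller working from a process's current-directory
 * handle may take its own ref safely, because the descriptor table
 * already accounts for one:
 *
 *	struct nchandle nch;
 *
 *	nch = p->p_fd->fd_ncdir;	fd table holds a deterministic ref
 *	cache_hold(&nch);
 *	...
 *	cache_drop(&nch);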
719 * 720 * Only the above situations, similar situations within this module where 721 * the ref count is deterministically at least 1, or when the ncp is found 722 * via the nchpp (hash table) lookup, can bump nc_refs. 723 * 724 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs. It 725 * can still be removed from the nc_list, however, as long as the caller 726 * can acquire its lock (in the wrong order). 727 * 728 * This is a rare case where callers are allowed to hold a spinlock, 729 * so we can't ourselves. 730 */ 731 static __inline 732 struct namecache * 733 _cache_hold(struct namecache *ncp) 734 { 735 KKASSERT(ncp->nc_refs > 0); 736 atomic_add_int(&ncp->nc_refs, 1); 737 738 return(ncp); 739 } 740 741 /* 742 * Drop a cache entry. 743 * 744 * The 1->0 transition can only occur after or because the natural ref 745 * is being dropped. If another thread had a temporary ref during the 746 * ncp's destruction, then that other thread might wind up being the 747 * one to drop the last ref. 748 */ 749 static __inline 750 void 751 _cache_drop(struct namecache *ncp) 752 { 753 if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) { 754 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 755 756 /* 757 * Scrap it. 758 */ 759 ncp->nc_refs = -1; /* safety */ 760 if (ncp->nc_name) 761 kfree(ncp->nc_name, M_VFSCACHEAUX); 762 kfree_obj(ncp, M_VFSCACHE); 763 } 764 } 765 766 /* 767 * Link a new namecache entry to its parent and to the hash table. Be 768 * careful to avoid races if vhold() blocks in the future. 769 * 770 * Both ncp and par must be referenced and locked. The reference is 771 * transfered to the nchpp (and, most notably, NOT to the parent list). 772 * 773 * NOTE: The hash table spinlock is held across this call, we can't do 774 * anything fancy. 775 */ 776 static void 777 _cache_link_parent(struct namecache *ncp, struct namecache *par, 778 struct nchash_head *nchpp) 779 { 780 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 781 782 KKASSERT(ncp->nc_parent == NULL); 783 _cache_ncp_gen_enter(ncp); 784 ncp->nc_parent = par; 785 ncp->nc_head = nchpp; 786 787 /* 788 * Set inheritance flags. Note that the parent flags may be 789 * stale due to getattr potentially not having been run yet 790 * (it gets run during nlookup()'s). 791 */ 792 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE); 793 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) 794 ncp->nc_flag |= NCF_SF_PNOCACHE; 795 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) 796 ncp->nc_flag |= NCF_UF_PCACHE; 797 798 /* 799 * Add to hash table and parent, adjust accounting 800 */ 801 TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash); 802 atomic_add_long(&pn->vfscache_count, 1); 803 804 /* 805 * ncp is a new leaf being added to the tree 806 */ 807 if (TAILQ_EMPTY(&ncp->nc_list)) { 808 atomic_add_long(&pn->vfscache_leafs, 1); 809 if (ncp->nc_flag & NCF_UNRESOLVED) 810 atomic_add_long(&pn->vfscache_unres, 1); 811 } 812 813 if (TAILQ_EMPTY(&par->nc_list)) { 814 /* 815 * Parent was, but now is no longer a leaf 816 */ 817 /* 818 * XXX for now don't mess with par's gen, it causes 819 * unnecessary nlookup retries (though not many) 820 */ 821 /*_cache_ncp_gen_enter(par);*/ 822 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 823 if (par->nc_parent) { 824 if (par->nc_flag & NCF_UNRESOLVED) 825 atomic_add_long(&pn->vfscache_unres, -1); 826 atomic_add_long(&pn->vfscache_leafs, -1); 827 } 828 829 /* 830 * Any vp associated with an ncp which has children must 831 * be held to prevent it from being recycled. 
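 *
 * The matching vdrop() occurs when the parent loses its last child
 * in _cache_unlink_parent(), or when the parent itself is set
 * unresolved.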
832 */ 833 if (par->nc_vp) 834 vhold(par->nc_vp); 835 /*_cache_ncp_gen_exit(par);*/ 836 } else { 837 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 838 } 839 _cache_hold(par); /* add nc_parent ref */ 840 _cache_ncp_gen_exit(ncp); 841 } 842 843 /* 844 * Remove the parent and hash associations from a namecache structure. 845 * Drop the ref-count on the parent. The caller receives the ref 846 * from the ncp's nchpp linkage that was removed and may forward that 847 * ref to a new linkage. 848 849 * The caller usually holds an additional ref * on the ncp so the unlink 850 * cannot be the final drop. XXX should not be necessary now since the 851 * caller receives the ref from the nchpp linkage, assuming the ncp 852 * was linked in the first place. 853 * 854 * ncp must be locked, which means that there won't be any nc_parent 855 * removal races. This routine will acquire a temporary lock on 856 * the parent as well as the appropriate hash chain. 857 * 858 * par must be locked and will remain locked on return. 859 * 860 * nhcpp must be spin-locked. This routine eats the spin-lock. 861 */ 862 static __inline void 863 _cache_unlink_parent(struct namecache *par, struct namecache *ncp, 864 struct nchash_head *nchpp) 865 { 866 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 867 struct vnode *dropvp; 868 869 KKASSERT(ncp->nc_parent == par); 870 cpu_ccfence(); 871 _cache_ncp_gen_enter(ncp); 872 873 /* don't add a ref, we drop the nchpp ref later */ 874 875 /* 876 * Remove from hash table and parent, adjust accounting 877 */ 878 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 879 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 880 atomic_add_long(&pn->vfscache_count, -1); 881 882 /* 883 * Removing leaf from tree 884 */ 885 if (TAILQ_EMPTY(&ncp->nc_list)) { 886 if (ncp->nc_flag & NCF_UNRESOLVED) 887 atomic_add_long(&pn->vfscache_unres, -1); 888 atomic_add_long(&pn->vfscache_leafs, -1); 889 } 890 891 /* 892 * Parent is now a leaf? 893 */ 894 dropvp = NULL; 895 if (TAILQ_EMPTY(&par->nc_list)) { 896 /* 897 * XXX for now don't mess with par's gen, it causes 898 * unnecessary nlookup retries (though not many) 899 */ 900 /*_cache_ncp_gen_enter(par);*/ 901 if (par->nc_parent) { 902 if (par->nc_flag & NCF_UNRESOLVED) 903 atomic_add_long(&pn->vfscache_unres, 1); 904 atomic_add_long(&pn->vfscache_leafs, 1); 905 } 906 if (par->nc_vp) 907 dropvp = par->nc_vp; 908 /*_cache_ncp_gen_exit(par);*/ 909 } 910 ncp->nc_parent = NULL; 911 ncp->nc_head = NULL; 912 spin_unlock(&nchpp->spin); 913 _cache_drop(par); /* drop ncp's nc_parent ref from (par) */ 914 915 /* 916 * We can only safely vdrop with no spinlocks held. 917 */ 918 if (dropvp) 919 vdrop(dropvp); 920 _cache_ncp_gen_exit(ncp); 921 } 922 923 /* 924 * Allocate a new namecache structure. Most of the code does not require 925 * zero-termination of the string but it makes vop_compat_ncreate() easier. 926 * 927 * The returned ncp will be locked and referenced. The ref is generally meant 928 * to be transfered to the nchpp linkage. 
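 *
 * A minimal sketch of the intended flow (illustrative; the real consumers
 * elsewhere in this file also handle hash collisions, and the parent is
 * assumed to be held and locked here):
 *
 *	ncp = cache_alloc(nlen);		returned locked, nc_refs == 1
 *	bcopy(name, ncp->nc_name, nlen);
 *	ncp->nc_name[nlen] = 0;
 *	spin_lock(&nchpp->spin);
 *	_cache_link_parent(ncp, par, nchpp);	ref moves to the hash linkage
 *	spin_unlock(&nchpp->spin);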
929 */ 930 static struct namecache * 931 cache_alloc(int nlen) 932 { 933 struct namecache *ncp; 934 935 ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 936 if (nlen) 937 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK); 938 ncp->nc_nlen = nlen; 939 ncp->nc_flag = NCF_UNRESOLVED; 940 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 941 ncp->nc_refs = 1; /* natural ref */ 942 ncp->nc_generation = 0; /* link/unlink/res/unres op */ 943 TAILQ_INIT(&ncp->nc_list); 944 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 945 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 946 947 return(ncp); 948 } 949 950 /* 951 * Can only be called for the case where the ncp has never been 952 * associated with anything (so no spinlocks are needed). 953 */ 954 static void 955 _cache_free(struct namecache *ncp) 956 { 957 KKASSERT(ncp->nc_refs == 1); 958 if (ncp->nc_name) 959 kfree(ncp->nc_name, M_VFSCACHEAUX); 960 kfree_obj(ncp, M_VFSCACHE); 961 } 962 963 /* 964 * [re]initialize a nchandle. 965 */ 966 void 967 cache_zero(struct nchandle *nch) 968 { 969 nch->ncp = NULL; 970 nch->mount = NULL; 971 } 972 973 /* 974 * Ref and deref a nchandle structure (ncp + mp) 975 * 976 * The caller must specify a stable ncp pointer, typically meaning the 977 * ncp is already referenced but this can also occur indirectly through 978 * e.g. holding a lock on a direct child. 979 * 980 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 981 * use read spinlocks here. 982 */ 983 struct nchandle * 984 cache_hold(struct nchandle *nch) 985 { 986 _cache_hold(nch->ncp); 987 _cache_mntref(nch->mount); 988 return(nch); 989 } 990 991 /* 992 * Create a copy of a namecache handle for an already-referenced 993 * entry. 994 */ 995 void 996 cache_copy(struct nchandle *nch, struct nchandle *target) 997 { 998 struct namecache *ncp; 999 struct mount *mp; 1000 struct mntcache_elm *elm; 1001 struct namecache *ncpr; 1002 int i; 1003 1004 ncp = nch->ncp; 1005 mp = nch->mount; 1006 target->ncp = ncp; 1007 target->mount = mp; 1008 1009 elm = _cache_mntcache_hash(ncp); 1010 for (i = 0; i < MNTCACHE_SET; ++i) { 1011 if (elm->ncp == ncp) { 1012 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 1013 if (ncpr == ncp) { 1014 _cache_mntref(mp); 1015 return; 1016 } 1017 if (ncpr) 1018 _cache_drop(ncpr); 1019 } 1020 ++elm; 1021 } 1022 if (ncp) 1023 _cache_hold(ncp); 1024 _cache_mntref(mp); 1025 } 1026 1027 /* 1028 * Drop the nchandle, but try to cache the ref to avoid global atomic 1029 * ops. This is typically done on the system root and jail root nchandles. 
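 *
 * Illustrative pairing (a sketch; the root nchandle name is only an
 * example): take a cheap copy up front, then return the refs through the
 * per-cpu cache instead of with global atomic ops:
 *
 *	struct nchandle nch;
 *
 *	cache_copy(&rootnch, &nch);		may reuse a cached ref
 *	... use nch ...
 *	cache_drop_and_cache(&nch, 0);		refs parked for later reuse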
1030 */ 1031 void 1032 cache_drop_and_cache(struct nchandle *nch, int elmno) 1033 { 1034 struct mntcache_elm *elm; 1035 struct mntcache_elm *best; 1036 struct namecache *ncpr; 1037 int delta1; 1038 int delta2; 1039 int i; 1040 1041 if (elmno > 4) { 1042 if (nch->ncp) { 1043 _cache_drop(nch->ncp); 1044 nch->ncp = NULL; 1045 } 1046 if (nch->mount) { 1047 _cache_mntrel(nch->mount); 1048 nch->mount = NULL; 1049 } 1050 return; 1051 } 1052 1053 elm = _cache_mntcache_hash(nch->ncp); 1054 best = elm; 1055 for (i = 0; i < MNTCACHE_SET; ++i) { 1056 if (elm->ncp == NULL) { 1057 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 1058 _cache_mntrel(nch->mount); 1059 elm->ticks = ticks; 1060 nch->mount = NULL; 1061 nch->ncp = NULL; 1062 if (ncpr) 1063 _cache_drop(ncpr); 1064 return; 1065 } 1066 delta1 = ticks - best->ticks; 1067 delta2 = ticks - elm->ticks; 1068 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 1069 best = elm; 1070 ++elm; 1071 } 1072 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 1073 _cache_mntrel(nch->mount); 1074 best->ticks = ticks; 1075 nch->mount = NULL; 1076 nch->ncp = NULL; 1077 if (ncpr) 1078 _cache_drop(ncpr); 1079 } 1080 1081 void 1082 cache_changemount(struct nchandle *nch, struct mount *mp) 1083 { 1084 _cache_mntref(mp); 1085 _cache_mntrel(nch->mount); 1086 nch->mount = mp; 1087 } 1088 1089 void 1090 cache_drop(struct nchandle *nch) 1091 { 1092 _cache_mntrel(nch->mount); 1093 _cache_drop(nch->ncp); 1094 nch->ncp = NULL; 1095 nch->mount = NULL; 1096 } 1097 1098 /* 1099 * Returns: 1100 * -1 Locked by other 1101 * 0 Not locked 1102 * (v) LK_SHARED or LK_EXCLUSIVE 1103 */ 1104 int 1105 cache_lockstatus(struct nchandle *nch) 1106 { 1107 return(_cache_lockstatus(nch->ncp)); 1108 } 1109 1110 void 1111 cache_lock(struct nchandle *nch) 1112 { 1113 _cache_lock(nch->ncp); 1114 } 1115 1116 /* 1117 * Returns a shared or exclusive-locked ncp. The ncp will only be 1118 * shared-locked if it is already resolved. 1119 */ 1120 void 1121 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1122 { 1123 struct namecache *ncp = nch->ncp; 1124 1125 if (ncp_shared_lock_disable || excl || 1126 (ncp->nc_flag & NCF_UNRESOLVED)) { 1127 _cache_lock(ncp); 1128 } else { 1129 _cache_lock_shared(ncp); 1130 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1131 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1132 _cache_unlock(ncp); 1133 _cache_lock(ncp); 1134 } 1135 } else { 1136 _cache_unlock(ncp); 1137 _cache_lock(ncp); 1138 } 1139 } 1140 } 1141 1142 /* 1143 * Lock fncpd, fncp, tncpd, and tncp. tncp is already locked but may 1144 * have to be cycled to avoid deadlocks. Make sure all four are resolved. 1145 * 1146 * The caller is responsible for checking the validity upon return as 1147 * the records may have been flagged DESTROYED in the interim. 1148 * 1149 * Namecache lock ordering is leaf first, then parent. However, complex 1150 * interactions may occur between the source and target because there is 1151 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp). 1152 */ 1153 void 1154 cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp, 1155 struct nchandle *tncpd, struct nchandle *tncp, 1156 struct ucred *fcred, struct ucred *tcred) 1157 { 1158 int tlocked = 1; 1159 u_int dummy_gen = 0; 1160 1161 /* 1162 * Lock tncp and tncpd 1163 * 1164 * NOTE: Because these ncps are not locked to begin with, it is 1165 * possible for other rename races to cause the normal lock 1166 * order assumptions to fail. 
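 *
 * (The retry scheme below is a standard nonblocking back-off: when a
 * trylock fails, everything already held is released, the offending
 * lock is cycled once with a blocking lock/unlock to let its holder
 * finish, and the whole sequence restarts from the top.)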
1167 * 1168 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1169 * matches after the leaf has been locked. However, ordering 1170 * between the 'from' and the 'to' is not and an overlapping 1171 * lock order reversal is still possible. 1172 */ 1173 again: 1174 if (__predict_false(tlocked == 0)) { 1175 cache_lock(tncp); 1176 } 1177 if (__predict_false(cache_lock_nonblock(tncpd) != 0)) { 1178 cache_unlock(tncp); 1179 cache_lock(tncpd); /* cycle tncpd lock */ 1180 cache_unlock(tncpd); 1181 tlocked = 0; 1182 goto again; 1183 } 1184 1185 /* 1186 * Lock fncp and fncpd 1187 * 1188 * NOTE: Because these ncps are not locked to begin with, it is 1189 * possible for other rename races to cause the normal lock 1190 * order assumptions to fail. 1191 * 1192 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1193 * matches after the leaf has been locked. However, ordering 1194 * between the 'from' and the 'to' is not and an overlapping 1195 * lock order reversal is still possible. 1196 */ 1197 if (__predict_false(cache_lock_nonblock(fncp) != 0)) { 1198 cache_unlock(tncpd); 1199 cache_unlock(tncp); 1200 cache_lock(fncp); /* cycle fncp lock */ 1201 cache_unlock(fncp); 1202 tlocked = 0; 1203 goto again; 1204 } 1205 1206 if (__predict_false(cache_lock_nonblock(fncpd) != 0)) { 1207 cache_unlock(fncp); 1208 cache_unlock(tncpd); 1209 cache_unlock(tncp); 1210 cache_lock(fncpd); 1211 cache_unlock(fncpd); /* cycle fncpd lock */ 1212 tlocked = 0; 1213 goto again; 1214 } 1215 1216 if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1217 cache_resolve(fncpd, &dummy_gen, fcred); 1218 if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1219 cache_resolve(tncpd, &dummy_gen, tcred); 1220 if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1221 cache_resolve(fncp, &dummy_gen, fcred); 1222 if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1223 cache_resolve(tncp, &dummy_gen, tcred); 1224 } 1225 1226 int 1227 cache_lock_nonblock(struct nchandle *nch) 1228 { 1229 return(_cache_lock_nonblock(nch->ncp)); 1230 } 1231 1232 void 1233 cache_unlock(struct nchandle *nch) 1234 { 1235 _cache_unlock(nch->ncp); 1236 } 1237 1238 /* 1239 * ref-and-lock, unlock-and-deref functions. 1240 * 1241 * This function is primarily used by nlookup. Even though cache_lock 1242 * holds the vnode, it is possible that the vnode may have already 1243 * initiated a recyclement. 1244 * 1245 * We want cache_get() to return a definitively usable vnode or a 1246 * definitively unresolved ncp. 1247 */ 1248 static 1249 struct namecache * 1250 _cache_get(struct namecache *ncp) 1251 { 1252 _cache_hold(ncp); 1253 _cache_lock(ncp); 1254 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1255 _cache_setunresolved(ncp, 1); 1256 return(ncp); 1257 } 1258 1259 /* 1260 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1261 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1262 * valid. Otherwise an exclusive lock will be acquired instead. 
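 *
 * Sketch of the caller-side pattern (illustrative; cred and the excl
 * request are supplied by the caller):
 *
 *	u_int dummy_gen = 0;
 *
 *	cache_get_maybe_shared(&nch, &target, excl);
 *	if (target.ncp->nc_flag & NCF_UNRESOLVED)
 *		cache_resolve(&target, &dummy_gen, cred);  lock was exclusive
 *	... use target.ncp->nc_vp ...
 *	cache_put(&target);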
1263 */ 1264 static 1265 struct namecache * 1266 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1267 { 1268 if (ncp_shared_lock_disable || excl || 1269 (ncp->nc_flag & NCF_UNRESOLVED)) 1270 { 1271 return(_cache_get(ncp)); 1272 } 1273 _cache_hold(ncp); 1274 _cache_lock_shared(ncp); 1275 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1276 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1277 _cache_unlock(ncp); 1278 ncp = _cache_get(ncp); 1279 _cache_drop(ncp); 1280 } 1281 } else { 1282 _cache_unlock(ncp); 1283 ncp = _cache_get(ncp); 1284 _cache_drop(ncp); 1285 } 1286 return(ncp); 1287 } 1288 1289 /* 1290 * NOTE: The same nchandle can be passed for both arguments. 1291 */ 1292 void 1293 cache_get(struct nchandle *nch, struct nchandle *target) 1294 { 1295 KKASSERT(nch->ncp->nc_refs > 0); 1296 target->mount = nch->mount; 1297 target->ncp = _cache_get(nch->ncp); 1298 _cache_mntref(target->mount); 1299 } 1300 1301 void 1302 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1303 { 1304 KKASSERT(nch->ncp->nc_refs > 0); 1305 target->mount = nch->mount; 1306 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1307 _cache_mntref(target->mount); 1308 } 1309 1310 /* 1311 * Release a held and locked ncp 1312 */ 1313 static __inline 1314 void 1315 _cache_put(struct namecache *ncp) 1316 { 1317 _cache_unlock(ncp); 1318 _cache_drop(ncp); 1319 } 1320 1321 void 1322 cache_put(struct nchandle *nch) 1323 { 1324 _cache_mntrel(nch->mount); 1325 _cache_put(nch->ncp); 1326 nch->ncp = NULL; 1327 nch->mount = NULL; 1328 } 1329 1330 /* 1331 * Resolve an unresolved ncp by associating a vnode with it. If the 1332 * vnode is NULL, a negative cache entry is created. 1333 * 1334 * The ncp should be locked on entry and will remain locked on return. 1335 */ 1336 static 1337 void 1338 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp, 1339 int adjgen) 1340 { 1341 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1342 1343 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1344 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1345 ncp->nc_vp == NULL); 1346 1347 if (adjgen) 1348 _cache_ncp_gen_enter(ncp); 1349 1350 if (vp) { 1351 /* 1352 * Any vp associated with an ncp which has children must 1353 * be held. Any vp associated with a locked ncp must be held. 1354 */ 1355 if (!TAILQ_EMPTY(&ncp->nc_list)) 1356 vhold(vp); 1357 spin_lock(&vp->v_spin); 1358 ncp->nc_vp = vp; 1359 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1360 ++vp->v_namecache_count; 1361 _cache_hold(ncp); /* v_namecache assoc */ 1362 spin_unlock(&vp->v_spin); 1363 vhold(vp); /* nc_vp */ 1364 1365 /* 1366 * Set auxiliary flags 1367 */ 1368 switch(vp->v_type) { 1369 case VDIR: 1370 ncp->nc_flag |= NCF_ISDIR; 1371 break; 1372 case VLNK: 1373 ncp->nc_flag |= NCF_ISSYMLINK; 1374 /* XXX cache the contents of the symlink */ 1375 break; 1376 default: 1377 break; 1378 } 1379 1380 ncp->nc_error = 0; 1381 1382 /* 1383 * XXX: this is a hack to work-around the lack of a real pfs vfs 1384 * implementation 1385 */ 1386 if (mp) { 1387 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1388 vp->v_pfsmp = mp; 1389 } 1390 } else { 1391 /* 1392 * When creating a negative cache hit we set the 1393 * namecache_gen. A later resolve will clean out the 1394 * negative cache hit if the mount point's namecache_gen 1395 * has changed. Used by devfs, could also be used by 1396 * other remote FSs. 
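 *
 * (VFS_NCPGEN_SET() records the mount's current generation in the
 * ncp here; _cache_auto_unresolve_test() later compares it with
 * VFS_NCPGEN_TEST() and unresolves the negative hit if the mount
 * has bumped its generation in the meantime.)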
1397 */ 1398 ncp->nc_vp = NULL; 1399 ncp->nc_negcpu = mycpu->gd_cpuid; 1400 spin_lock(&pn->neg_spin); 1401 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1402 _cache_hold(ncp); /* neg_list assoc */ 1403 ++pn->neg_count; 1404 spin_unlock(&pn->neg_spin); 1405 atomic_add_long(&pn->vfscache_negs, 1); 1406 1407 ncp->nc_error = ENOENT; 1408 if (mp) 1409 VFS_NCPGEN_SET(mp, ncp); 1410 } 1411 1412 /* 1413 * Previously unresolved leaf is now resolved. 1414 * 1415 * Clear the NCF_UNRESOLVED flag last (see cache_nlookup_nonlocked()). 1416 * We only adjust vfscache_unres for ncp's that are in the tree. 1417 */ 1418 if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent) 1419 atomic_add_long(&pn->vfscache_unres, -1); 1420 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1421 if (adjgen) 1422 _cache_ncp_gen_exit(ncp); 1423 } 1424 1425 void 1426 cache_setvp(struct nchandle *nch, struct vnode *vp) 1427 { 1428 _cache_setvp(nch->mount, nch->ncp, vp, 1); 1429 } 1430 1431 /* 1432 * Used for NFS 1433 */ 1434 void 1435 cache_settimeout(struct nchandle *nch, int nticks) 1436 { 1437 struct namecache *ncp = nch->ncp; 1438 1439 if ((ncp->nc_timeout = ticks + nticks) == 0) 1440 ncp->nc_timeout = 1; 1441 } 1442 1443 /* 1444 * Disassociate the vnode or negative-cache association and mark a 1445 * namecache entry as unresolved again. Note that the ncp is still 1446 * left in the hash table and still linked to its parent. 1447 * 1448 * The ncp should be locked and refd on entry and will remain locked and refd 1449 * on return. 1450 * 1451 * This routine is normally never called on a directory containing children. 1452 * However, NFS often does just that in its rename() code as a cop-out to 1453 * avoid complex namespace operations. This disconnects a directory vnode 1454 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1455 * sync. 1456 * 1457 */ 1458 static 1459 void 1460 _cache_setunresolved(struct namecache *ncp, int adjgen) 1461 { 1462 struct vnode *vp; 1463 1464 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1465 struct pcpu_ncache *pn; 1466 1467 if (adjgen) 1468 _cache_ncp_gen_enter(ncp); 1469 1470 /* 1471 * Is a resolved or destroyed leaf now becoming unresolved? 1472 * Only adjust vfscache_unres for linked ncp's. 1473 */ 1474 if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent) { 1475 pn = &pcpu_ncache[mycpu->gd_cpuid]; 1476 atomic_add_long(&pn->vfscache_unres, 1); 1477 } 1478 1479 ncp->nc_flag |= NCF_UNRESOLVED; 1480 ncp->nc_timeout = 0; 1481 ncp->nc_error = ENOTCONN; 1482 if ((vp = ncp->nc_vp) != NULL) { 1483 spin_lock(&vp->v_spin); 1484 ncp->nc_vp = NULL; 1485 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1486 --vp->v_namecache_count; 1487 spin_unlock(&vp->v_spin); 1488 1489 /* 1490 * Any vp associated with an ncp with children is 1491 * held by that ncp. Any vp associated with ncp 1492 * is held by that ncp. These conditions must be 1493 * undone when the vp is cleared out from the ncp. 
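 *
 * (The two vdrop()s below mirror the nc_vp hold taken in
 * _cache_setvp() and the has-children hold taken in _cache_setvp()
 * or _cache_link_parent().)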
1494 */ 1495 if (!TAILQ_EMPTY(&ncp->nc_list)) 1496 vdrop(vp); 1497 vdrop(vp); 1498 } else { 1499 pn = &pcpu_ncache[ncp->nc_negcpu]; 1500 1501 atomic_add_long(&pn->vfscache_negs, -1); 1502 spin_lock(&pn->neg_spin); 1503 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1504 --pn->neg_count; 1505 spin_unlock(&pn->neg_spin); 1506 } 1507 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1508 1509 if (adjgen) 1510 _cache_ncp_gen_exit(ncp); 1511 _cache_drop(ncp); /* from v_namecache or neg_list */ 1512 } 1513 } 1514 1515 /* 1516 * The cache_nresolve() code calls this function to automatically 1517 * set a resolved cache element to unresolved if it has timed out 1518 * or if it is a negative cache hit and the mount point namecache_gen 1519 * has changed. 1520 */ 1521 static __inline int 1522 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1523 { 1524 /* 1525 * Try to zap entries that have timed out. We have 1526 * to be careful here because locked leafs may depend 1527 * on the vnode remaining intact in a parent, so only 1528 * do this under very specific conditions. 1529 */ 1530 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1531 TAILQ_EMPTY(&ncp->nc_list)) { 1532 return 1; 1533 } 1534 1535 /* 1536 * If a resolved negative cache hit is invalid due to 1537 * the mount's namecache generation being bumped, zap it. 1538 */ 1539 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1540 return 1; 1541 } 1542 1543 /* 1544 * Otherwise we are good 1545 */ 1546 return 0; 1547 } 1548 1549 static __inline void 1550 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1551 { 1552 /* 1553 * Already in an unresolved state, nothing to do. 1554 */ 1555 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1556 if (_cache_auto_unresolve_test(mp, ncp)) 1557 _cache_setunresolved(ncp, 1); 1558 } 1559 } 1560 1561 void 1562 cache_setunresolved(struct nchandle *nch) 1563 { 1564 _cache_setunresolved(nch->ncp, 1); 1565 } 1566 1567 /* 1568 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1569 * looking for matches. This flag tells the lookup code when it must 1570 * check for a mount linkage and also prevents the directories in question 1571 * from being deleted or renamed. 1572 */ 1573 static 1574 int 1575 cache_clrmountpt_callback(struct mount *mp, void *data) 1576 { 1577 struct nchandle *nch = data; 1578 1579 if (mp->mnt_ncmounton.ncp == nch->ncp) 1580 return(1); 1581 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1582 return(1); 1583 return(0); 1584 } 1585 1586 /* 1587 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1588 * with a mount point. 1589 */ 1590 void 1591 cache_clrmountpt(struct nchandle *nch) 1592 { 1593 int count; 1594 1595 count = mountlist_scan(cache_clrmountpt_callback, nch, 1596 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | 1597 MNTSCAN_NOUNLOCK); 1598 if (count == 0) 1599 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1600 } 1601 1602 /* 1603 * Invalidate portions of the namecache topology given a starting entry. 1604 * The passed ncp is set to an unresolved state and: 1605 * 1606 * The passed ncp must be referenced and locked. The routine may unlock 1607 * and relock ncp several times, and will recheck the children and loop 1608 * to catch races. When done the passed ncp will be returned with the 1609 * reference and lock intact. 1610 * 1611 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1612 * that the physical underlying nodes have been 1613 * destroyed... as in deleted. For example, when 1614 * a directory is removed. 
This will cause record 1615 * lookups on the name to no longer be able to find 1616 * the record and tells the resolver to return failure 1617 * rather then trying to resolve through the parent. 1618 * 1619 * The topology itself, including ncp->nc_name, 1620 * remains intact. 1621 * 1622 * This only applies to the passed ncp, if CINV_CHILDREN 1623 * is specified the children are not flagged. 1624 * 1625 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1626 * state as well. 1627 * 1628 * Note that this will also have the side effect of 1629 * cleaning out any unreferenced nodes in the topology 1630 * from the leaves up as the recursion backs out. 1631 * 1632 * Note that the topology for any referenced nodes remains intact, but 1633 * the nodes will be marked as having been destroyed and will be set 1634 * to an unresolved state. 1635 * 1636 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1637 * the namecache entry may not actually be invalidated on return if it was 1638 * revalidated while recursing down into its children. This code guarentees 1639 * that the node(s) will go through an invalidation cycle, but does not 1640 * guarentee that they will remain in an invalidated state. 1641 * 1642 * Returns non-zero if a revalidation was detected during the invalidation 1643 * recursion, zero otherwise. Note that since only the original ncp is 1644 * locked the revalidation ultimately can only indicate that the original ncp 1645 * *MIGHT* no have been reresolved. 1646 * 1647 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we 1648 * have to avoid blowing out the kernel stack. We do this by saving the 1649 * deep namecache node and aborting the recursion, then re-recursing at that 1650 * node using a depth-first algorithm in order to allow multiple deep 1651 * recursions to chain through each other, then we restart the invalidation 1652 * from scratch. 1653 */ 1654 1655 struct cinvtrack { 1656 struct namecache *resume_ncp; 1657 int depth; 1658 }; 1659 1660 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1661 1662 static 1663 int 1664 _cache_inval(struct namecache *ncp, int flags) 1665 { 1666 struct cinvtrack track; 1667 struct namecache *ncp2; 1668 int r; 1669 1670 track.depth = 0; 1671 track.resume_ncp = NULL; 1672 1673 for (;;) { 1674 r = _cache_inval_internal(ncp, flags, &track); 1675 if (track.resume_ncp == NULL) 1676 break; 1677 _cache_unlock(ncp); 1678 while ((ncp2 = track.resume_ncp) != NULL) { 1679 track.resume_ncp = NULL; 1680 _cache_lock(ncp2); 1681 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1682 &track); 1683 /*_cache_put(ncp2);*/ 1684 cache_zap(ncp2); 1685 } 1686 _cache_lock(ncp); 1687 } 1688 return(r); 1689 } 1690 1691 int 1692 cache_inval(struct nchandle *nch, int flags) 1693 { 1694 return(_cache_inval(nch->ncp, flags)); 1695 } 1696 1697 /* 1698 * Helper for _cache_inval(). The passed ncp is refd and locked and 1699 * remains that way on return, but may be unlocked/relocked multiple 1700 * times by the routine. 
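 *
 * For orientation (illustrative), the public wrapper is typically invoked
 * with both flags when a directory entry is being deleted,
 *
 *	cache_inval(&nch, CINV_DESTROY | CINV_CHILDREN);
 *
 * which flags the node destroyed and recursively unresolves anything
 * cached beneath it; the helper below does the per-node work.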
1701 */ 1702 static int 1703 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1704 { 1705 struct namecache *nextkid; 1706 int rcnt = 0; 1707 1708 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1709 1710 _cache_ncp_gen_enter(ncp); 1711 _cache_setunresolved(ncp, 0); 1712 if (flags & CINV_DESTROY) { 1713 ncp->nc_flag |= NCF_DESTROYED; 1714 cpu_sfence(); 1715 } 1716 1717 while ((flags & CINV_CHILDREN) && 1718 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1719 ) { 1720 struct namecache *kid; 1721 int restart; 1722 1723 restart = 0; 1724 _cache_hold(nextkid); 1725 if (++track->depth > MAX_RECURSION_DEPTH) { 1726 track->resume_ncp = ncp; 1727 _cache_hold(ncp); 1728 ++rcnt; 1729 } 1730 while ((kid = nextkid) != NULL) { 1731 /* 1732 * Parent (ncp) must be locked for the iteration. 1733 */ 1734 nextkid = NULL; 1735 if (kid->nc_parent != ncp) { 1736 _cache_drop(kid); 1737 kprintf("cache_inval_internal restartA %s\n", 1738 ncp->nc_name); 1739 restart = 1; 1740 break; 1741 } 1742 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1743 _cache_hold(nextkid); 1744 1745 /* 1746 * Parent unlocked for this section to avoid 1747 * deadlocks. Then lock the kid and check for 1748 * races. 1749 */ 1750 _cache_unlock(ncp); 1751 if (track->resume_ncp) { 1752 _cache_drop(kid); 1753 _cache_lock(ncp); 1754 break; 1755 } 1756 _cache_lock(kid); 1757 if (kid->nc_parent != ncp) { 1758 kprintf("cache_inval_internal " 1759 "restartB %s\n", 1760 ncp->nc_name); 1761 restart = 1; 1762 _cache_unlock(kid); 1763 _cache_drop(kid); 1764 _cache_lock(ncp); 1765 break; 1766 } 1767 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1768 TAILQ_FIRST(&kid->nc_list) 1769 ) { 1770 1771 rcnt += _cache_inval_internal(kid, 1772 flags & ~CINV_DESTROY, track); 1773 /*_cache_unlock(kid);*/ 1774 /*_cache_drop(kid);*/ 1775 cache_zap(kid); 1776 } else { 1777 cache_zap(kid); 1778 } 1779 1780 /* 1781 * Relock parent to continue scan 1782 */ 1783 _cache_lock(ncp); 1784 } 1785 if (nextkid) 1786 _cache_drop(nextkid); 1787 --track->depth; 1788 if (restart == 0) 1789 break; 1790 } 1791 1792 /* 1793 * Someone could have gotten in there while ncp was unlocked, 1794 * retry if so. 1795 */ 1796 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1797 ++rcnt; 1798 _cache_ncp_gen_exit(ncp); 1799 1800 return (rcnt); 1801 } 1802 1803 /* 1804 * Invalidate a vnode's namecache associations. To avoid races against 1805 * the resolver we do not invalidate a node which we previously invalidated 1806 * but which was then re-resolved while we were in the invalidation loop. 1807 * 1808 * Returns non-zero if any namecache entries remain after the invalidation 1809 * loop completed. 1810 * 1811 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1812 * be ripped out of the topology while held, the vnode's v_namecache 1813 * list has no such restriction. NCP's can be ripped out of the list 1814 * at virtually any time if not locked, even if held. 1815 * 1816 * In addition, the v_namecache list itself must be locked via 1817 * the vnode's spinlock. 
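 *
 * The loop below therefore follows a hold-next-before-unlock pattern,
 * sketched here:
 *
 *	spin_lock(&vp->v_spin);
 *	ncp = TAILQ_FIRST(&vp->v_namecache);	hold it
 *	next = TAILQ_NEXT(ncp, nc_vnode);	hold it too
 *	spin_unlock(&vp->v_spin);
 *	_cache_lock(ncp);			blocking lock, spinlock released
 *	if (ncp->nc_vp != vp)
 *		restart;			list changed underneath us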
1818 */ 1819 int 1820 cache_inval_vp(struct vnode *vp, int flags) 1821 { 1822 struct namecache *ncp; 1823 struct namecache *next; 1824 1825 restart: 1826 spin_lock(&vp->v_spin); 1827 ncp = TAILQ_FIRST(&vp->v_namecache); 1828 if (ncp) 1829 _cache_hold(ncp); 1830 while (ncp) { 1831 /* loop entered with ncp held and vp spin-locked */ 1832 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1833 _cache_hold(next); 1834 spin_unlock(&vp->v_spin); 1835 _cache_lock(ncp); 1836 if (ncp->nc_vp != vp) { 1837 kprintf("Warning: cache_inval_vp: race-A detected on " 1838 "%s\n", ncp->nc_name); 1839 _cache_put(ncp); 1840 if (next) 1841 _cache_drop(next); 1842 goto restart; 1843 } 1844 _cache_inval(ncp, flags); 1845 _cache_put(ncp); /* also releases reference */ 1846 ncp = next; 1847 spin_lock(&vp->v_spin); 1848 if (ncp && ncp->nc_vp != vp) { 1849 spin_unlock(&vp->v_spin); 1850 kprintf("Warning: cache_inval_vp: race-B detected on " 1851 "%s\n", ncp->nc_name); 1852 _cache_drop(ncp); 1853 goto restart; 1854 } 1855 } 1856 spin_unlock(&vp->v_spin); 1857 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1858 } 1859 1860 /* 1861 * This routine is used instead of the normal cache_inval_vp() when we 1862 * are trying to recycle otherwise good vnodes. 1863 * 1864 * Return 0 on success, non-zero if not all namecache records could be 1865 * disassociated from the vnode (for various reasons). 1866 */ 1867 int 1868 cache_inval_vp_nonblock(struct vnode *vp) 1869 { 1870 struct namecache *ncp; 1871 struct namecache *next; 1872 1873 spin_lock(&vp->v_spin); 1874 1875 ncp = TAILQ_FIRST(&vp->v_namecache); 1876 if (ncp) 1877 _cache_hold(ncp); 1878 1879 while (ncp) { 1880 /* loop entered with ncp held */ 1881 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1882 _cache_hold(next); 1883 spin_unlock(&vp->v_spin); 1884 if (_cache_lock_nonblock(ncp)) { 1885 _cache_drop(ncp); 1886 if (next) 1887 _cache_drop(next); 1888 goto done; 1889 } 1890 if (ncp->nc_vp != vp) { 1891 kprintf("Warning: cache_inval_vp: race-A detected on " 1892 "%s\n", ncp->nc_name); 1893 _cache_put(ncp); 1894 if (next) 1895 _cache_drop(next); 1896 goto done; 1897 } 1898 _cache_inval(ncp, 0); 1899 _cache_put(ncp); /* also releases reference */ 1900 ncp = next; 1901 spin_lock(&vp->v_spin); 1902 if (ncp && ncp->nc_vp != vp) { 1903 spin_unlock(&vp->v_spin); 1904 kprintf("Warning: cache_inval_vp: race-B detected on " 1905 "%s\n", ncp->nc_name); 1906 _cache_drop(ncp); 1907 goto done; 1908 } 1909 } 1910 spin_unlock(&vp->v_spin); 1911 done: 1912 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1913 } 1914 1915 /* 1916 * Attempt to quickly invalidate the vnode's namecache entry. This function 1917 * will also dive the ncp and free its children but only if they are trivial. 1918 * All locks are non-blocking and the function will fail if required locks 1919 * cannot be obtained. 1920 * 1921 * We want this sort of function to be able to guarantee progress when vnlru 1922 * wants to recycle a vnode. Directories could otherwise get stuck and not 1923 * be able to recycle due to destroyed or unresolved children in the 1924 * namecache. 
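 *
 * The 'trivial' tests below reduce to simple ref arithmetic; for a
 * candidate child (illustrative):
 *
 *	expected = 1 + ncpbaserefs(kid);	our temporary hold, plus the
 *						natural ref (+1 if resolved)
 *	if (kid->nc_refs != expected)
 *		give up;			someone else is using it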
1925 */ 1926 void 1927 cache_inval_vp_quick(struct vnode *vp) 1928 { 1929 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1930 struct namecache *ncp; 1931 struct namecache *kid; 1932 1933 spin_lock(&vp->v_spin); 1934 while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) { 1935 _cache_hold(ncp); 1936 spin_unlock(&vp->v_spin); 1937 if (_cache_lock_nonblock(ncp)) { 1938 _cache_drop(ncp); 1939 return; 1940 } 1941 1942 /* 1943 * Try to trivially destroy any children. 1944 */ 1945 while ((kid = TAILQ_FIRST(&ncp->nc_list)) != NULL) { 1946 struct nchash_head *nchpp; 1947 1948 /* 1949 * Early test without the lock. Give-up if the 1950 * child has children of its own, the child is 1951 * positively-resolved, or the ref-count is 1952 * unexpected. 1953 */ 1954 if (TAILQ_FIRST(&kid->nc_list) || 1955 kid->nc_vp || 1956 kid->nc_refs != ncpbaserefs(kid)) 1957 { 1958 _cache_put(ncp); 1959 return; 1960 } 1961 1962 _cache_hold(kid); 1963 if (_cache_lock_nonblock(kid)) { 1964 _cache_drop(kid); 1965 _cache_put(ncp); 1966 return; 1967 } 1968 1969 /* 1970 * A destruction/free test requires the parent, 1971 * the kid, and the hash table to be locked. Note 1972 * that the kid may still be on the negative cache 1973 * list. 1974 */ 1975 nchpp = kid->nc_head; 1976 spin_lock(&nchpp->spin); 1977 1978 /* 1979 * Give up if the child isn't trivial. It can be 1980 * resolved or unresolved but must not have a vp. 1981 */ 1982 if (kid->nc_parent != ncp || 1983 kid->nc_vp || 1984 TAILQ_FIRST(&kid->nc_list) || 1985 kid->nc_refs != 1 + ncpbaserefs(kid)) 1986 { 1987 spin_unlock(&nchpp->spin); 1988 _cache_put(kid); 1989 _cache_put(ncp); 1990 return; 1991 } 1992 1993 ++pn->inv_kid_quick_count; 1994 1995 /* 1996 * We can safely destroy the kid. It may still 1997 * have extra refs due to ncneglist races, but since 1998 * we checked above with the lock held those races 1999 * will self-resolve. 2000 * 2001 * With these actions the kid should nominally 2002 * have just its natural ref plus our ref. 2003 * 2004 * This is only safe because we hold locks on 2005 * the parent, the kid, and the nchpp. The only 2006 * lock we don't have is on the ncneglist and that 2007 * can race a ref, but as long as we unresolve the 2008 * kid before executing our final drop the ncneglist 2009 * code path(s) will just drop their own ref so all 2010 * is good. 2011 */ 2012 _cache_unlink_parent(ncp, kid, nchpp); 2013 _cache_setunresolved(kid, 1); 2014 if (kid->nc_refs != 2) { 2015 kprintf("Warning: kid %p unexpected refs=%d " 2016 "%08x %s\n", 2017 kid, kid->nc_refs, 2018 kid->nc_flag, kid->nc_name); 2019 } 2020 _cache_put(kid); /* drop our ref and lock */ 2021 _cache_drop(kid); /* drop natural ref to destroy */ 2022 } 2023 2024 /* 2025 * Now check ncp itself against our expectations. With 2026 * no children left we have our ref plus whether it is 2027 * resolved or not (which it has to be, actually, since it 2028 * is hanging off the vp->v_namecache). 2029 */ 2030 if (ncp->nc_refs != 1 + ncpbaserefs(ncp)) { 2031 _cache_put(ncp); 2032 spin_lock(&vp->v_spin); 2033 break; 2034 } 2035 2036 ++pn->inv_ncp_quick_count; 2037 2038 /* 2039 * Success, disassociate and release the ncp. Do not 2040 * try to zap it here. 2041 * 2042 * NOTE: Releasing the ncp here leaves it in the tree, 2043 * but since we have disassociated the vnode this 2044 * ncp entry becomes 'trivial' and successive calls 2045 * to cache_inval_vp_quick() will be able to continue 2046 * to make progress. 
2047 */ 2048 _cache_setunresolved(ncp, 1); 2049 _cache_put(ncp); 2050 spin_lock(&vp->v_spin); 2051 } 2052 spin_unlock(&vp->v_spin); 2053 } 2054 2055 /* 2056 * Clears the universal directory search 'ok' flag. This flag allows 2057 * nlookup() to bypass normal vnode checks. This flag is a cached flag 2058 * so clearing it simply forces revalidation. 2059 */ 2060 void 2061 cache_inval_wxok(struct vnode *vp) 2062 { 2063 struct namecache *ncp; 2064 2065 spin_lock(&vp->v_spin); 2066 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 2067 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 2068 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 2069 } 2070 spin_unlock(&vp->v_spin); 2071 } 2072 2073 /* 2074 * The source ncp has been renamed to the target ncp. All elements have been 2075 * locked, including the parent ncp's. 2076 * 2077 * The target ncp is destroyed (as a normal rename-over would destroy the 2078 * target file or directory). 2079 * 2080 * Because there may be references to the source ncp we cannot copy its 2081 * contents to the target. Instead the source ncp is relinked as the target 2082 * and the target ncp is removed from the namecache topology. 2083 */ 2084 void 2085 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 2086 { 2087 struct namecache *fncp = fnch->ncp; 2088 struct namecache *tncp = tnch->ncp; 2089 struct namecache *par; 2090 struct nchash_head *nchpp; 2091 u_int32_t hash; 2092 char *oname; 2093 char *nname; 2094 2095 if (tncp->nc_nlen) { 2096 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK); 2097 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 2098 nname[tncp->nc_nlen] = 0; 2099 } else { 2100 nname = NULL; 2101 } 2102 2103 /* 2104 * Rename fncp (unlink) 2105 */ 2106 if (fncp->nc_parent) { 2107 par = fncp->nc_parent; 2108 _cache_hold(par); 2109 _cache_lock(par); 2110 nchpp = fncp->nc_head; 2111 spin_lock(&nchpp->spin); 2112 _cache_unlink_parent(par, fncp, nchpp); /* eats nchpp */ 2113 _cache_put(par); 2114 } else { 2115 par = NULL; 2116 nchpp = NULL; 2117 } 2118 oname = fncp->nc_name; 2119 fncp->nc_name = nname; 2120 fncp->nc_nlen = tncp->nc_nlen; 2121 if (oname) 2122 kfree(oname, M_VFSCACHEAUX); 2123 2124 par = tncp->nc_parent; 2125 KKASSERT(par->nc_lock.lk_lockholder == curthread); 2126 2127 /* 2128 * Rename fncp (relink) 2129 */ 2130 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 2131 hash = fnv_32_buf(&par, sizeof(par), hash); 2132 nchpp = NCHHASH(hash); 2133 2134 spin_lock(&nchpp->spin); 2135 _cache_link_parent(fncp, par, nchpp); 2136 spin_unlock(&nchpp->spin); 2137 2138 /* 2139 * Get rid of the overwritten tncp (unlink) 2140 */ 2141 _cache_unlink(tncp); 2142 } 2143 2144 /* 2145 * Perform actions consistent with unlinking a file. The passed-in ncp 2146 * must be locked. 2147 * 2148 * The ncp is marked DESTROYED so it no longer shows up in searches, 2149 * and will be physically deleted when the vnode goes away. 2150 * 2151 * If the related vnode has no refs then we cycle it through vget()/vput() 2152 * to (possibly if we don't have a ref race) trigger a deactivation, 2153 * allowing the VFS to trivially detect and recycle the deleted vnode 2154 * via VOP_INACTIVE(). 2155 * 2156 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 2157 * target ncp. 
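 *
 * Illustrative sketch (hypothetical filesystem nremove tail, not from
 * this file; xxxfs_remove_object() is a made-up helper and 'ap' stands
 * for the usual nremove VOP argument structure):
 *
 *	error = xxxfs_remove_object(dvp, ncp->nc_name, ncp->nc_nlen);
 *	if (error == 0)
 *		cache_unlink(ap->a_nch);	<- mark the name destroyed
 *	return (error);
 *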
2158 */ 2159 void 2160 cache_unlink(struct nchandle *nch) 2161 { 2162 _cache_unlink(nch->ncp); 2163 } 2164 2165 static void 2166 _cache_unlink(struct namecache *ncp) 2167 { 2168 struct vnode *vp; 2169 2170 /* 2171 * Causes lookups to fail and allows another ncp with the same 2172 * name to be created under ncp->nc_parent. 2173 */ 2174 _cache_ncp_gen_enter(ncp); 2175 ncp->nc_flag |= NCF_DESTROYED; 2176 2177 /* 2178 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 2179 * force action on the 1->0 transition. Do not destroy the 2180 * vp association if a vp is present (leave the destroyed ncp 2181 * resolved through the vp finalization). 2182 * 2183 * Cleanup the refs in the resolved-not-found case by setting 2184 * the ncp to an unresolved state. This improves our ability 2185 * to get rid of dead ncp elements in other cache_*() routines. 2186 */ 2187 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 2188 vp = ncp->nc_vp; 2189 if (vp) { 2190 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 2191 if (VREFCNT(vp) <= 0) { 2192 if (vget(vp, LK_SHARED) == 0) 2193 vput(vp); 2194 } 2195 } else { 2196 _cache_setunresolved(ncp, 0); 2197 } 2198 } 2199 _cache_ncp_gen_exit(ncp); 2200 } 2201 2202 /* 2203 * Return non-zero if the nch might be associated with an open and/or mmap()'d 2204 * file. The easy solution is to just return non-zero if the vnode has refs. 2205 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 2206 * force the reclaim). 2207 */ 2208 int 2209 cache_isopen(struct nchandle *nch) 2210 { 2211 struct vnode *vp; 2212 struct namecache *ncp = nch->ncp; 2213 2214 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2215 (vp = ncp->nc_vp) != NULL && 2216 VREFCNT(vp)) { 2217 return 1; 2218 } 2219 return 0; 2220 } 2221 2222 2223 /* 2224 * vget the vnode associated with the namecache entry. Resolve the namecache 2225 * entry if necessary. The passed ncp must be referenced and locked. If 2226 * the ncp is resolved it might be locked shared. 2227 * 2228 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 2229 * (depending on the passed lk_type) will be returned in *vpp with an error 2230 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2231 * most typical error is ENOENT, meaning that the ncp represents a negative 2232 * cache hit and there is no vnode to retrieve, but other errors can occur 2233 * too. 2234 * 2235 * The vget() can race a reclaim. If this occurs we re-resolve the 2236 * namecache entry. 2237 * 2238 * There are numerous places in the kernel where vget() is called on a 2239 * vnode while one or more of its namecache entries is locked. Releasing 2240 * a vnode never deadlocks against locked namecache entries (the vnode 2241 * will not get recycled while referenced ncp's exist). This means we 2242 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2243 * lock when acquiring the vp lock or we might cause a deadlock. 2244 * 2245 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2246 * unresolved. If a reclaim race occurs the passed-in ncp will be 2247 * relocked exclusively before being re-resolved. 
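 *
 * Illustrative sketch (hypothetical caller, not from this file):
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = cache_vget(&nch, cred, LK_SHARED, &vp);
 *	if (error == 0) {
 *		... use the referenced, locked vnode ...
 *		vput(vp);		<- drops both the lock and the ref
 *	} else if (error == ENOENT) {
 *		... negative hit, no vnode to operate on ...
 *	}
 *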
2248 */ 2249 int 2250 cache_vget(struct nchandle *nch, struct ucred *cred, 2251 int lk_type, struct vnode **vpp) 2252 { 2253 struct namecache *ncp; 2254 struct vnode *vp; 2255 int error; 2256 u_int dummy_gen = 0; 2257 2258 ncp = nch->ncp; 2259 again: 2260 vp = NULL; 2261 if (ncp->nc_flag & NCF_UNRESOLVED) 2262 error = cache_resolve(nch, &dummy_gen, cred); 2263 else 2264 error = 0; 2265 2266 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2267 error = vget(vp, lk_type); 2268 if (error) { 2269 /* 2270 * VRECLAIM race 2271 * 2272 * The ncp may have been locked shared, we must relock 2273 * it exclusively before we can set it to unresolved. 2274 */ 2275 if (error == ENOENT) { 2276 kprintf("Warning: vnode reclaim race detected " 2277 "in cache_vget on %p (%s)\n", 2278 vp, ncp->nc_name); 2279 _cache_unlock(ncp); 2280 _cache_lock(ncp); 2281 _cache_setunresolved(ncp, 1); 2282 goto again; 2283 } 2284 2285 /* 2286 * Not a reclaim race, some other error. 2287 */ 2288 KKASSERT(ncp->nc_vp == vp); 2289 vp = NULL; 2290 } else { 2291 KKASSERT(ncp->nc_vp == vp); 2292 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2293 } 2294 } 2295 if (error == 0 && vp == NULL) 2296 error = ENOENT; 2297 *vpp = vp; 2298 return(error); 2299 } 2300 2301 /* 2302 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode 2303 * is already held by virtuue of the ncp being locked, but it might not be 2304 * referenced and while it is not referenced it can transition into the 2305 * VRECLAIMED state. 2306 * 2307 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2308 * unresolved. If a reclaim race occurs the passed-in ncp will be 2309 * relocked exclusively before being re-resolved. 2310 * 2311 * NOTE: At the moment we have to issue a vget() on the vnode, even though 2312 * we are going to immediately release the lock, in order to resolve 2313 * potential reclamation races. Once we have a solid vnode ref that 2314 * was (at some point) interlocked via a vget(), the vnode will not 2315 * be reclaimed. 2316 * 2317 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation. 2318 */ 2319 int 2320 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2321 { 2322 struct namecache *ncp; 2323 struct vnode *vp; 2324 int error; 2325 int v; 2326 u_int dummy_gen = 0; 2327 2328 ncp = nch->ncp; 2329 again: 2330 vp = NULL; 2331 if (ncp->nc_flag & NCF_UNRESOLVED) 2332 error = cache_resolve(nch, &dummy_gen, cred); 2333 else 2334 error = 0; 2335 2336 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2337 /* 2338 * Try a lockless ref of the vnode. VRECLAIMED transitions 2339 * use the vx_lock state and update-counter mechanism so we 2340 * can detect if one is in-progress or occurred. 2341 * 2342 * If we can successfully ref the vnode and interlock against 2343 * the update-counter mechanism, and VRECLAIMED is found to 2344 * not be set after that, we should be good. 
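 *
 * (The update-counter check is a seqlock-style protocol: a writer
 * marks the counter as in-progress before touching the protected
 * state and marks it quiescent again when done, so a reader that
 * observes the same quiescent value before and after its unlocked
 * accesses knows it saw a consistent snapshot.  A minimal, generic
 * sketch of the reader side, for illustration only:
 *
 *	v = read_counter();		must not be in-progress
 *	... unlocked reads ...
 *	ok = (read_counter() == v);	unchanged -> snapshot is valid
 *
 * The real primitives used below are spin_access_start_only(),
 * spin_access_check_inprog() and spin_access_end_only().)
 *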
2345 */ 2346 v = spin_access_start_only(&vp->v_spin); 2347 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2348 vref_special(vp); 2349 if (__predict_false( 2350 spin_access_end_only(&vp->v_spin, v))) { 2351 vrele(vp); 2352 continue; 2353 } 2354 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2355 break; 2356 } 2357 vrele(vp); 2358 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2359 } 2360 2361 /* 2362 * Do it the slow way 2363 */ 2364 error = vget(vp, LK_SHARED); 2365 if (error) { 2366 /* 2367 * VRECLAIM race 2368 */ 2369 if (error == ENOENT) { 2370 kprintf("Warning: vnode reclaim race detected " 2371 "in cache_vget on %p (%s)\n", 2372 vp, ncp->nc_name); 2373 _cache_unlock(ncp); 2374 _cache_lock(ncp); 2375 _cache_setunresolved(ncp, 1); 2376 goto again; 2377 } 2378 2379 /* 2380 * Not a reclaim race, some other error. 2381 */ 2382 KKASSERT(ncp->nc_vp == vp); 2383 vp = NULL; 2384 } else { 2385 KKASSERT(ncp->nc_vp == vp); 2386 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2387 /* caller does not want a lock */ 2388 vn_unlock(vp); 2389 } 2390 break; 2391 } 2392 if (error == 0 && vp == NULL) 2393 error = ENOENT; 2394 *vpp = vp; 2395 2396 return(error); 2397 } 2398 2399 /* 2400 * Return a referenced vnode representing the parent directory of 2401 * ncp. 2402 * 2403 * Because the caller has locked the ncp it should not be possible for 2404 * the parent ncp to go away. However, the parent can unresolve its 2405 * dvp at any time so we must be able to acquire a lock on the parent 2406 * to safely access nc_vp. 2407 * 2408 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2409 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2410 * getting destroyed. 2411 * 2412 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2413 * lock on the ncp in question.. 2414 */ 2415 struct vnode * 2416 cache_dvpref(struct namecache *ncp) 2417 { 2418 struct namecache *par; 2419 struct vnode *dvp; 2420 2421 dvp = NULL; 2422 if ((par = ncp->nc_parent) != NULL) { 2423 _cache_hold(par); 2424 _cache_lock(par); 2425 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2426 if ((dvp = par->nc_vp) != NULL) 2427 vhold(dvp); 2428 } 2429 _cache_unlock(par); 2430 if (dvp) { 2431 if (vget(dvp, LK_SHARED) == 0) { 2432 vn_unlock(dvp); 2433 vdrop(dvp); 2434 /* return refd, unlocked dvp */ 2435 } else { 2436 vdrop(dvp); 2437 dvp = NULL; 2438 } 2439 } 2440 _cache_drop(par); 2441 } 2442 return(dvp); 2443 } 2444 2445 /* 2446 * Convert a directory vnode to a namecache record without any other 2447 * knowledge of the topology. This ONLY works with directory vnodes and 2448 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2449 * returned ncp (if not NULL) will be held and unlocked. 2450 * 2451 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2452 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2453 * for dvp. This will fail only if the directory has been deleted out from 2454 * under the caller. 2455 * 2456 * Callers must always check for a NULL return no matter the value of 'makeit'. 2457 * 2458 * To avoid underflowing the kernel stack each recursive call increments 2459 * the makeit variable. 
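 *
 * Illustrative sketch (hypothetical NFS-server-style caller, not from
 * this file):
 *
 *	struct nchandle nch;
 *
 *	if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *		... nch.ncp is held but not locked ...
 *		cache_drop(&nch);
 *	} else {
 *		... directory was deleted out from under us ...
 *	}
 *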
2460 */ 2461 2462 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2463 struct vnode *dvp, char *fakename); 2464 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2465 struct vnode **saved_dvp); 2466 2467 int 2468 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2469 struct nchandle *nch) 2470 { 2471 struct vnode *saved_dvp; 2472 struct vnode *pvp; 2473 char *fakename; 2474 int error; 2475 2476 nch->ncp = NULL; 2477 nch->mount = dvp->v_mount; 2478 saved_dvp = NULL; 2479 fakename = NULL; 2480 2481 /* 2482 * Handle the makeit == 0 degenerate case 2483 */ 2484 if (makeit == 0) { 2485 spin_lock_shared(&dvp->v_spin); 2486 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2487 if (nch->ncp) 2488 cache_hold(nch); 2489 spin_unlock_shared(&dvp->v_spin); 2490 } 2491 2492 /* 2493 * Loop until resolution, inside code will break out on error. 2494 */ 2495 while (makeit) { 2496 /* 2497 * Break out if we successfully acquire a working ncp. 2498 */ 2499 spin_lock_shared(&dvp->v_spin); 2500 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2501 if (nch->ncp) { 2502 cache_hold(nch); 2503 spin_unlock_shared(&dvp->v_spin); 2504 break; 2505 } 2506 spin_unlock_shared(&dvp->v_spin); 2507 2508 /* 2509 * If dvp is the root of its filesystem it should already 2510 * have a namecache pointer associated with it as a side 2511 * effect of the mount, but it may have been disassociated. 2512 */ 2513 if (dvp->v_flag & VROOT) { 2514 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2515 error = cache_resolve_mp(nch->mount, 1); 2516 _cache_put(nch->ncp); 2517 if (ncvp_debug & 1) { 2518 kprintf("cache_fromdvp: resolve root of " 2519 "mount %p error %d", 2520 dvp->v_mount, error); 2521 } 2522 if (error) { 2523 if (ncvp_debug & 1) 2524 kprintf(" failed\n"); 2525 nch->ncp = NULL; 2526 break; 2527 } 2528 if (ncvp_debug & 1) 2529 kprintf(" succeeded\n"); 2530 continue; 2531 } 2532 2533 /* 2534 * If we are recursed too deeply resort to an O(n^2) 2535 * algorithm to resolve the namecache topology. The 2536 * resolved pvp is left referenced in saved_dvp to 2537 * prevent the tree from being destroyed while we loop. 2538 */ 2539 if (makeit > 20) { 2540 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2541 if (error) { 2542 kprintf("lookupdotdot(longpath) failed %d " 2543 "dvp %p\n", error, dvp); 2544 nch->ncp = NULL; 2545 break; 2546 } 2547 continue; 2548 } 2549 2550 /* 2551 * Get the parent directory and resolve its ncp. 2552 */ 2553 if (fakename) { 2554 kfree(fakename, M_TEMP); 2555 fakename = NULL; 2556 } 2557 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2558 &fakename); 2559 if (error) { 2560 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2561 break; 2562 } 2563 vn_unlock(pvp); 2564 2565 /* 2566 * Reuse makeit as a recursion depth counter. On success 2567 * nch will be fully referenced. 2568 */ 2569 cache_fromdvp(pvp, cred, makeit + 1, nch); 2570 vrele(pvp); 2571 if (nch->ncp == NULL) 2572 break; 2573 2574 /* 2575 * Do an inefficient scan of pvp (embodied by ncp) to look 2576 * for dvp. This will create a namecache record for dvp on 2577 * success. We loop up to recheck on success. 2578 * 2579 * ncp and dvp are both held but not locked. 
2580 */ 2581 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2582 if (error) { 2583 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2584 pvp, nch->ncp->nc_name, dvp); 2585 cache_drop(nch); 2586 /* nch was NULLed out, reload mount */ 2587 nch->mount = dvp->v_mount; 2588 break; 2589 } 2590 if (ncvp_debug & 1) { 2591 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2592 pvp, nch->ncp->nc_name); 2593 } 2594 cache_drop(nch); 2595 /* nch was NULLed out, reload mount */ 2596 nch->mount = dvp->v_mount; 2597 } 2598 2599 /* 2600 * If nch->ncp is non-NULL it will have been held already. 2601 */ 2602 if (fakename) 2603 kfree(fakename, M_TEMP); 2604 if (saved_dvp) 2605 vrele(saved_dvp); 2606 if (nch->ncp) 2607 return (0); 2608 return (EINVAL); 2609 } 2610 2611 /* 2612 * Go up the chain of parent directories until we find something 2613 * we can resolve into the namecache. This is very inefficient. 2614 */ 2615 static 2616 int 2617 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2618 struct vnode **saved_dvp) 2619 { 2620 struct nchandle nch; 2621 struct vnode *pvp; 2622 int error; 2623 static time_t last_fromdvp_report; 2624 char *fakename; 2625 2626 /* 2627 * Loop getting the parent directory vnode until we get something we 2628 * can resolve in the namecache. 2629 */ 2630 vref(dvp); 2631 nch.mount = dvp->v_mount; 2632 nch.ncp = NULL; 2633 fakename = NULL; 2634 2635 for (;;) { 2636 if (fakename) { 2637 kfree(fakename, M_TEMP); 2638 fakename = NULL; 2639 } 2640 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2641 &fakename); 2642 if (error) { 2643 vrele(dvp); 2644 break; 2645 } 2646 vn_unlock(pvp); 2647 spin_lock_shared(&pvp->v_spin); 2648 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2649 _cache_hold(nch.ncp); 2650 spin_unlock_shared(&pvp->v_spin); 2651 vrele(pvp); 2652 break; 2653 } 2654 spin_unlock_shared(&pvp->v_spin); 2655 if (pvp->v_flag & VROOT) { 2656 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2657 error = cache_resolve_mp(nch.mount, 1); 2658 _cache_unlock(nch.ncp); 2659 vrele(pvp); 2660 if (error) { 2661 _cache_drop(nch.ncp); 2662 nch.ncp = NULL; 2663 vrele(dvp); 2664 } 2665 break; 2666 } 2667 vrele(dvp); 2668 dvp = pvp; 2669 } 2670 if (error == 0) { 2671 if (last_fromdvp_report != time_uptime) { 2672 last_fromdvp_report = time_uptime; 2673 kprintf("Warning: extremely inefficient path " 2674 "resolution on %s\n", 2675 nch.ncp->nc_name); 2676 } 2677 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2678 2679 /* 2680 * Hopefully dvp now has a namecache record associated with 2681 * it. Leave it referenced to prevent the kernel from 2682 * recycling the vnode. Otherwise extremely long directory 2683 * paths could result in endless recycling. 2684 */ 2685 if (*saved_dvp) 2686 vrele(*saved_dvp); 2687 *saved_dvp = dvp; 2688 _cache_drop(nch.ncp); 2689 } 2690 if (fakename) 2691 kfree(fakename, M_TEMP); 2692 return (error); 2693 } 2694 2695 /* 2696 * Do an inefficient scan of the directory represented by ncp looking for 2697 * the directory vnode dvp. ncp must be held but not locked on entry and 2698 * will be held on return. dvp must be refd but not locked on entry and 2699 * will remain refd on return. 2700 * 2701 * Why do this at all? Well, due to its stateless nature the NFS server 2702 * converts file handles directly to vnodes without necessarily going through 2703 * the namecache ops that would otherwise create the namecache topology 2704 * leading to the vnode. 
We could either (1) Change the namecache algorithms 2705 * to allow disconnected namecache records that are re-merged opportunistically, 2706 * or (2) Make the NFS server backtrack and scan to recover a connected 2707 * namecache topology in order to then be able to issue new API lookups. 2708 * 2709 * It turns out that (1) is a huge mess. It takes a nice clean set of 2710 * namecache algorithms and introduces a lot of complication in every subsystem 2711 * that calls into the namecache to deal with the re-merge case, especially 2712 * since we are using the namecache to placehold negative lookups and the 2713 * vnode might not be immediately assigned. (2) is certainly far less 2714 * efficient than (1), but since we are only talking about directories here 2715 * (which are likely to remain cached), the case does not actually run all 2716 * that often and has the supreme advantage of not polluting the namecache 2717 * algorithms. 2718 * 2719 * If a fakename is supplied just construct a namecache entry using the 2720 * fake name. 2721 */ 2722 static int 2723 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2724 struct vnode *dvp, char *fakename) 2725 { 2726 struct nlcomponent nlc; 2727 struct nchandle rncp; 2728 struct dirent *den; 2729 struct vnode *pvp; 2730 struct vattr vat; 2731 struct iovec iov; 2732 struct uio uio; 2733 int blksize; 2734 int eofflag; 2735 int bytes; 2736 char *rbuf; 2737 int error; 2738 2739 vat.va_blocksize = 0; 2740 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2741 return (error); 2742 cache_lock(nch); 2743 error = cache_vref(nch, cred, &pvp); 2744 cache_unlock(nch); 2745 if (error) 2746 return (error); 2747 if (ncvp_debug & 1) { 2748 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2749 "vattr fileid = %lld\n", 2750 nch->ncp, nch->ncp->nc_name, 2751 vat.va_blocksize, 2752 (long long)vat.va_fileid); 2753 } 2754 2755 /* 2756 * Use the supplied fakename if not NULL. Fake names are typically 2757 * not in the actual filesystem hierarchy. This is used by HAMMER 2758 * to glue @@timestamp recursions together.
2759 */ 2760 if (fakename) { 2761 nlc.nlc_nameptr = fakename; 2762 nlc.nlc_namelen = strlen(fakename); 2763 rncp = cache_nlookup(nch, &nlc); 2764 goto done; 2765 } 2766 2767 if ((blksize = vat.va_blocksize) == 0) 2768 blksize = DEV_BSIZE; 2769 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2770 rncp.ncp = NULL; 2771 2772 eofflag = 0; 2773 uio.uio_offset = 0; 2774 again: 2775 iov.iov_base = rbuf; 2776 iov.iov_len = blksize; 2777 uio.uio_iov = &iov; 2778 uio.uio_iovcnt = 1; 2779 uio.uio_resid = blksize; 2780 uio.uio_segflg = UIO_SYSSPACE; 2781 uio.uio_rw = UIO_READ; 2782 uio.uio_td = curthread; 2783 2784 if (ncvp_debug & 2) 2785 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2786 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2787 if (error == 0) { 2788 den = (struct dirent *)rbuf; 2789 bytes = blksize - uio.uio_resid; 2790 2791 while (bytes > 0) { 2792 if (ncvp_debug & 2) { 2793 kprintf("cache_inefficient_scan: %*.*s\n", 2794 den->d_namlen, den->d_namlen, 2795 den->d_name); 2796 } 2797 if (den->d_type != DT_WHT && 2798 den->d_ino == vat.va_fileid) { 2799 if (ncvp_debug & 1) { 2800 kprintf("cache_inefficient_scan: " 2801 "MATCHED inode %lld path %s/%*.*s\n", 2802 (long long)vat.va_fileid, 2803 nch->ncp->nc_name, 2804 den->d_namlen, den->d_namlen, 2805 den->d_name); 2806 } 2807 nlc.nlc_nameptr = den->d_name; 2808 nlc.nlc_namelen = den->d_namlen; 2809 rncp = cache_nlookup(nch, &nlc); 2810 KKASSERT(rncp.ncp != NULL); 2811 break; 2812 } 2813 bytes -= _DIRENT_DIRSIZ(den); 2814 den = _DIRENT_NEXT(den); 2815 } 2816 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2817 goto again; 2818 } 2819 kfree(rbuf, M_TEMP); 2820 done: 2821 vrele(pvp); 2822 if (rncp.ncp) { 2823 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2824 _cache_setvp(rncp.mount, rncp.ncp, dvp, 1); 2825 if (ncvp_debug & 2) { 2826 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2827 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2828 } 2829 } else { 2830 if (ncvp_debug & 2) { 2831 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2832 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2833 rncp.ncp->nc_vp); 2834 } 2835 } 2836 if (rncp.ncp->nc_vp == NULL) 2837 error = rncp.ncp->nc_error; 2838 /* 2839 * Release rncp after a successful nlookup. rncp was fully 2840 * referenced. 2841 */ 2842 cache_put(&rncp); 2843 } else { 2844 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2845 dvp, nch->ncp->nc_name); 2846 error = ENOENT; 2847 } 2848 return (error); 2849 } 2850 2851 /* 2852 * This function must be called with the ncp held and locked and will unlock 2853 * and drop it during zapping. 2854 * 2855 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2856 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2857 * and removes the related reference. If the ncp can be removed, and the 2858 * parent can be zapped non-blocking, this function loops up. 2859 * 2860 * There will be one ref from the caller (which we now own). The only 2861 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2862 * so possibly 2 refs left. Taking this into account, if there are no 2863 * additional refs and no children, the ncp will be removed from the topology 2864 * and destroyed. 2865 * 2866 * References and/or children may exist if the ncp is in the middle of the 2867 * topology, preventing the ncp from being destroyed. 2868 * 2869 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
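 *
 * Worked ref-accounting example (illustrative): for a leaf ncp that is
 * still linked under a parent the expected count is 2, the caller's
 * ref plus the ref held by the parent's nc_list linkage; a parentless
 * ncp carries only the caller's ref, hence the refcmp of 2 vs 1 used
 * in the code below.  Any count above that means someone else also
 * holds the ncp and it cannot be destroyed.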
2870 * 2871 * This function may return a held (but NOT locked) parent node which the 2872 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2873 * due to deep namecache trees. 2874 * 2875 * WARNING! For MPSAFE operation this routine must acquire up to three 2876 * spin locks to be able to safely test nc_refs. Lock order is 2877 * very important. 2878 * 2879 * hash spinlock if on hash list 2880 * parent spinlock if child of parent 2881 * (the ncp is unresolved so there is no vnode association) 2882 */ 2883 static int 2884 cache_zap(struct namecache *ncp) 2885 { 2886 struct namecache *par; 2887 struct nchash_head *nchpp; 2888 int refcmp; 2889 int nonblock = 1; /* XXX cleanup */ 2890 int res = 0; 2891 2892 again: 2893 /* 2894 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2895 * This gets rid of any vp->v_namecache list or negative list and 2896 * the related ref. 2897 */ 2898 _cache_setunresolved(ncp, 1); 2899 2900 /* 2901 * Try to scrap the entry and possibly tail-recurse on its parent. 2902 * We only scrap unref'd (other then our ref) unresolved entries, 2903 * we do not scrap 'live' entries. 2904 * 2905 * If nc_parent is non NULL we expect 2 references, else just 1. 2906 * If there are more, someone else also holds the ncp and we cannot 2907 * destroy it. 2908 */ 2909 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2910 KKASSERT(ncp->nc_refs > 0); 2911 2912 /* 2913 * If the ncp is linked to its parent it will also be in the hash 2914 * table. We have to be able to lock the parent and the hash table. 2915 * 2916 * Acquire locks. Note that the parent can't go away while we hold 2917 * a child locked. If nc_parent is present, expect 2 refs instead 2918 * of 1. 2919 */ 2920 nchpp = NULL; 2921 if ((par = ncp->nc_parent) != NULL) { 2922 if (nonblock) { 2923 if (_cache_lock_nonblock(par)) { 2924 /* lock failed */ 2925 ncp->nc_flag |= NCF_DEFEREDZAP; 2926 atomic_add_long( 2927 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2928 1); 2929 _cache_unlock(ncp); 2930 _cache_drop(ncp); /* caller's ref */ 2931 return res; 2932 } 2933 _cache_hold(par); 2934 } else { 2935 _cache_hold(par); 2936 _cache_lock(par); 2937 } 2938 nchpp = ncp->nc_head; 2939 spin_lock(&nchpp->spin); 2940 } 2941 2942 /* 2943 * With the parent and nchpp locked, and the vnode removed 2944 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2945 * more someone else has a ref and we cannot zap the entry. 2946 * 2947 * one for our hold 2948 * one for our parent link (parent also has one from the linkage) 2949 */ 2950 if (par) 2951 refcmp = 2; 2952 else 2953 refcmp = 1; 2954 2955 /* 2956 * On failure undo the work we've done so far and drop the 2957 * caller's ref and ncp. 2958 */ 2959 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2960 if (par) { 2961 spin_unlock(&nchpp->spin); 2962 _cache_put(par); 2963 } 2964 _cache_unlock(ncp); 2965 _cache_drop(ncp); 2966 return res; 2967 } 2968 2969 /* 2970 * We own all the refs and with the spinlocks held no further 2971 * refs can be acquired by others. 2972 * 2973 * Remove us from the hash list and parent list. We have to 2974 * drop a ref on the parent's vp if the parent's list becomes 2975 * empty. 2976 */ 2977 if (par) { 2978 KKASSERT(nchpp == ncp->nc_head); 2979 _cache_unlink_parent(par, ncp, nchpp); /* eats nhcpp */ 2980 /*_cache_unlock(par);*/ 2981 /* &nchpp->spin is unlocked by call */ 2982 } else { 2983 KKASSERT(ncp->nc_head == NULL); 2984 } 2985 2986 /* 2987 * ncp should not have picked up any refs. 
Physically 2988 * destroy the ncp. 2989 */ 2990 if (ncp->nc_refs != refcmp) { 2991 panic("cache_zap: %p bad refs %d (expected %d)\n", 2992 ncp, ncp->nc_refs, refcmp); 2993 } 2994 /* _cache_unlock(ncp) not required */ 2995 ncp->nc_refs = -1; /* safety */ 2996 if (ncp->nc_name) 2997 kfree(ncp->nc_name, M_VFSCACHEAUX); 2998 kfree_obj(ncp, M_VFSCACHE); 2999 res = 1; 3000 3001 /* 3002 * Loop up if we can recursively clean out the parent. 3003 */ 3004 if (par) { 3005 refcmp = 1; /* ref on parent */ 3006 if (par->nc_parent) /* par->par */ 3007 ++refcmp; 3008 par->nc_flag &= ~NCF_DEFEREDZAP; 3009 if ((par->nc_flag & NCF_UNRESOLVED) && 3010 par->nc_refs == refcmp && 3011 TAILQ_EMPTY(&par->nc_list)) 3012 { 3013 ncp = par; 3014 goto again; 3015 } 3016 _cache_unlock(par); 3017 _cache_drop(par); 3018 } 3019 return 1; 3020 } 3021 3022 /* 3023 * Clean up dangling negative cache and defered-drop entries in the 3024 * namecache. 3025 * 3026 * This routine is called in the critical path and also called from 3027 * vnlru(). When called from vnlru we use a lower limit to try to 3028 * deal with the negative cache before the critical path has to start 3029 * dealing with it. 3030 */ 3031 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 3032 3033 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 3034 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 3035 static cache_hs_t exc_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 3036 3037 static int cache_hyst_run[2]; 3038 3039 void 3040 cache_hysteresis(int critpath) 3041 { 3042 long poslimit; 3043 long exclimit; 3044 long neglimit; 3045 long xnumunres; 3046 long xnumleafs; 3047 long clean_neg; 3048 long clean_unres; 3049 long clean_excess; 3050 3051 /* 3052 * Lets not compete for running a general garbage collection 3053 */ 3054 if (atomic_swap_int(&cache_hyst_run[critpath], 1) != 0) 3055 return; 3056 3057 /* 3058 * Calculate negative ncp limit 3059 */ 3060 neglimit = maxvnodes / ncnegfactor; 3061 if (critpath == 0) 3062 neglimit = neglimit * 8 / 10; 3063 3064 /* 3065 * Don't cache too many negative hits. We use hysteresis to reduce 3066 * the impact on the critical path. 3067 */ 3068 clean_neg = 0; 3069 3070 switch(neg_cache_hysteresis_state[critpath]) { 3071 case CHI_LOW: 3072 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 3073 if (critpath) 3074 clean_neg = ncnegflush; 3075 else 3076 clean_neg = ncnegflush + 3077 vfscache_negs - neglimit; 3078 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 3079 } 3080 break; 3081 case CHI_HIGH: 3082 if (vfscache_negs > MINNEG * 9 / 10 && 3083 vfscache_negs * 9 / 10 > neglimit 3084 ) { 3085 if (critpath) 3086 clean_neg = ncnegflush; 3087 else 3088 clean_neg = ncnegflush + 3089 vfscache_negs * 9 / 10 - 3090 neglimit; 3091 } else { 3092 neg_cache_hysteresis_state[critpath] = CHI_LOW; 3093 } 3094 break; 3095 } 3096 if (clean_neg) 3097 _cache_cleanneg(clean_neg); 3098 3099 /* 3100 * Don't cache too many unresolved elements. We use hysteresis to 3101 * reduce the impact on the critical path. 3102 */ 3103 if ((poslimit = ncposlimit) == 0) 3104 poslimit = maxvnodes / ncposfactor; 3105 if (critpath == 0) 3106 poslimit = poslimit * 8 / 10; 3107 3108 /* 3109 * Number of unresolved leaf elements in the namecache. 
These 3110 * can build-up for various reasons and may have to be disposed 3111 * of to allow the inactive list to be cleaned out by vnlru_proc() 3112 * 3113 * Collect count 3114 */ 3115 xnumunres = vfscache_unres; 3116 clean_unres = 0; 3117 3118 switch(pos_cache_hysteresis_state[critpath]) { 3119 case CHI_LOW: 3120 if (xnumunres > poslimit && xnumunres > MINPOS) { 3121 if (critpath) 3122 clean_unres = ncposflush; 3123 else 3124 clean_unres = ncposflush + xnumunres - 3125 poslimit; 3126 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 3127 } 3128 break; 3129 case CHI_HIGH: 3130 if (xnumunres > poslimit * 5 / 6 && xnumunres > MINPOS) { 3131 if (critpath) 3132 clean_unres = ncposflush; 3133 else 3134 clean_unres = ncposflush + xnumunres - 3135 poslimit * 5 / 6; 3136 } else { 3137 pos_cache_hysteresis_state[critpath] = CHI_LOW; 3138 } 3139 break; 3140 } 3141 3142 /* 3143 * Excessive positive hits can accumulate due to large numbers of 3144 * hardlinks (the vnode cache will not prevent ncps representing 3145 * hardlinks from growing into infinity). 3146 */ 3147 exclimit = maxvnodes * 2; 3148 if (critpath == 0) 3149 exclimit = exclimit * 8 / 10; 3150 xnumleafs = vfscache_leafs; 3151 clean_excess = 0; 3152 3153 switch(exc_cache_hysteresis_state[critpath]) { 3154 case CHI_LOW: 3155 if (xnumleafs > exclimit && xnumleafs > MINPOS) { 3156 if (critpath) 3157 clean_excess = ncposflush; 3158 else 3159 clean_excess = ncposflush + xnumleafs - 3160 exclimit; 3161 exc_cache_hysteresis_state[critpath] = CHI_HIGH; 3162 } 3163 break; 3164 case CHI_HIGH: 3165 if (xnumleafs > exclimit * 5 / 6 && xnumleafs > MINPOS) { 3166 if (critpath) 3167 clean_excess = ncposflush; 3168 else 3169 clean_excess = ncposflush + xnumleafs - 3170 exclimit * 5 / 6; 3171 } else { 3172 exc_cache_hysteresis_state[critpath] = CHI_LOW; 3173 } 3174 break; 3175 } 3176 3177 if (clean_unres || clean_excess) 3178 _cache_cleanpos(clean_unres, clean_excess); 3179 3180 /* 3181 * Clean out dangling defered-zap ncps which could not be cleanly 3182 * dropped if too many build up. Note that numdefered is 3183 * heuristical. Make sure we are real-time for the current cpu, 3184 * plus the global rollup. 3185 */ 3186 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 3187 _cache_cleandefered(); 3188 } 3189 3190 atomic_swap_int(&cache_hyst_run[critpath], 0); 3191 } 3192 3193 /* 3194 * NEW NAMECACHE LOOKUP API 3195 * 3196 * Lookup an entry in the namecache. The passed par_nch must be referenced 3197 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 3198 * is ALWAYS returned, eve if the supplied component is illegal. 3199 * 3200 * The resulting namecache entry should be returned to the system with 3201 * cache_put() or cache_unlock() + cache_drop(). 3202 * 3203 * namecache locks are recursive but care must be taken to avoid lock order 3204 * reversals (hence why the passed par_nch must be unlocked). Locking 3205 * rules are to order for parent traversals, not for child traversals. 3206 * 3207 * Nobody else will be able to manipulate the associated namespace (e.g. 3208 * create, delete, rename, rename-target) until the caller unlocks the 3209 * entry. 3210 * 3211 * The returned entry will be in one of three states: positive hit (non-null 3212 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 3213 * Unresolved entries must be resolved through the filesystem to associate the 3214 * vnode and/or determine whether a positive or negative hit has occured. 
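 *
 * Illustrative sketch (hypothetical caller, not from this file; 'name',
 * 'par_nch', 'cred' and 'dummy_gen' are assumed to exist):
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, &dummy_gen, cred);
 *	...
 *	cache_put(&nch);		<- unlock and drop when done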
3215 * 3216 * It is not necessary to lock a directory in order to lock namespace under 3217 * that directory. In fact, it is explicitly not allowed to do that. A 3218 * directory is typically only locked when being created, renamed, or 3219 * destroyed. 3220 * 3221 * The directory (par) may be unresolved, in which case any returned child 3222 * will likely also be marked unresolved. Likely but not guarenteed. Since 3223 * the filesystem lookup requires a resolved directory vnode the caller is 3224 * responsible for resolving the namecache chain top-down. This API 3225 * specifically allows whole chains to be created in an unresolved state. 3226 */ 3227 struct nchandle 3228 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 3229 { 3230 struct nchandle nch; 3231 struct namecache *ncp; 3232 struct namecache *new_ncp; 3233 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 3234 struct nchash_head *nchpp; 3235 struct mount *mp; 3236 u_int32_t hash; 3237 globaldata_t gd; 3238 int par_locked; 3239 int use_excl; 3240 3241 gd = mycpu; 3242 mp = par_nch->mount; 3243 par_locked = 0; 3244 3245 /* 3246 * This is a good time to call it, no ncp's are locked by 3247 * the caller or us. 3248 */ 3249 cache_hysteresis(1); 3250 3251 /* 3252 * Try to locate an existing entry 3253 */ 3254 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3255 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3256 new_ncp = NULL; 3257 use_excl = 0; 3258 nchpp = NCHHASH(hash); 3259 restart: 3260 rep_ncp = NULL; 3261 if (use_excl) 3262 spin_lock(&nchpp->spin); 3263 else 3264 spin_lock_shared(&nchpp->spin); 3265 3266 /* 3267 * Do a reverse scan to collect any DESTROYED ncps prior to matching 3268 * an existing entry. 3269 */ 3270 TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) { 3271 /* 3272 * Break out if we find a matching entry. Note that 3273 * UNRESOLVED entries may match, but DESTROYED entries 3274 * do not. 3275 * 3276 * We may be able to reuse DESTROYED entries that we come 3277 * across, even if the name does not match, as long as 3278 * nc_nlen is correct and the only hold ref is from the nchpp 3279 * list itself. 3280 */ 3281 if (ncp->nc_parent == par_nch->ncp && 3282 ncp->nc_nlen == nlc->nlc_namelen) { 3283 if (ncp->nc_flag & NCF_DESTROYED) { 3284 if (ncp->nc_refs == 1 && rep_ncp == NULL) 3285 rep_ncp = ncp; 3286 continue; 3287 } 3288 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 3289 continue; 3290 3291 /* 3292 * Matched ncp 3293 */ 3294 _cache_hold(ncp); 3295 if (rep_ncp) 3296 _cache_hold(rep_ncp); 3297 3298 if (use_excl) 3299 spin_unlock(&nchpp->spin); 3300 else 3301 spin_unlock_shared(&nchpp->spin); 3302 3303 if (par_locked) { 3304 _cache_unlock(par_nch->ncp); 3305 par_locked = 0; 3306 } 3307 3308 /* 3309 * Really try to destroy rep_ncp if encountered. 3310 * Various edge cases can build up more than one, 3311 * so loop if we succeed. This isn't perfect, but 3312 * we can't afford to have tons of entries build 3313 * up on a single nhcpp list due to rename-over 3314 * operations. If that were to happen, the system 3315 * would bog down quickly. 
3316 */ 3317 if (rep_ncp) { 3318 if (_cache_lock_nonblock(rep_ncp) == 0) { 3319 if (rep_ncp->nc_flag & NCF_DESTROYED) { 3320 if (cache_zap(rep_ncp)) { 3321 _cache_drop(ncp); 3322 goto restart; 3323 } 3324 } else { 3325 _cache_unlock(rep_ncp); 3326 _cache_drop(rep_ncp); 3327 } 3328 } else { 3329 _cache_drop(rep_ncp); 3330 } 3331 } 3332 3333 /* 3334 * Continue processing the matched entry 3335 */ 3336 if (_cache_lock_special(ncp) == 0) { 3337 /* 3338 * Successfully locked but we must re-test 3339 * conditions that might have changed since 3340 * we did not have the lock before. 3341 */ 3342 if (ncp->nc_parent != par_nch->ncp || 3343 ncp->nc_nlen != nlc->nlc_namelen || 3344 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3345 ncp->nc_nlen) || 3346 (ncp->nc_flag & NCF_DESTROYED)) { 3347 _cache_put(ncp); 3348 goto restart; 3349 } 3350 _cache_auto_unresolve(mp, ncp); 3351 if (new_ncp) { 3352 _cache_free(new_ncp); 3353 new_ncp = NULL; /* safety */ 3354 } 3355 goto found; 3356 } 3357 _cache_get(ncp); /* cycle the lock to block */ 3358 _cache_put(ncp); 3359 _cache_drop(ncp); 3360 goto restart; 3361 } 3362 } 3363 3364 /* 3365 * We failed to locate the entry, try to resurrect a destroyed 3366 * entry that we did find that is already correctly linked into 3367 * nchpp and the parent. We must re-test conditions after 3368 * successfully locking rep_ncp. 3369 * 3370 * This case can occur under heavy loads due to not being able 3371 * to safely lock the parent in cache_zap(). Nominally a repeated 3372 * create/unlink load, but only the namelen needs to match. 3373 * 3374 * An exclusive lock on the nchpp is required to process this case, 3375 * otherwise a race can cause duplicate entries to be created with 3376 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 3377 */ 3378 if (rep_ncp && use_excl) { 3379 if (_cache_lock_nonblock(rep_ncp) == 0) { 3380 _cache_hold(rep_ncp); 3381 if (rep_ncp->nc_parent == par_nch->ncp && 3382 rep_ncp->nc_nlen == nlc->nlc_namelen && 3383 (rep_ncp->nc_flag & NCF_DESTROYED) && 3384 rep_ncp->nc_refs == 2) 3385 { 3386 /* 3387 * Update nc_name. 3388 */ 3389 ncp = rep_ncp; 3390 3391 _cache_ncp_gen_enter(ncp); 3392 3393 bcopy(nlc->nlc_nameptr, ncp->nc_name, 3394 nlc->nlc_namelen); 3395 3396 /* 3397 * This takes some care. We must clear the 3398 * NCF_DESTROYED flag before unlocking the 3399 * hash chain so other concurrent searches 3400 * do not skip this element. 3401 * 3402 * We must also unlock the hash chain before 3403 * unresolving the ncp to avoid deadlocks. 3404 * We hold the lock on the ncp so we can safely 3405 * reinitialize nc_flag after that. 3406 */ 3407 ncp->nc_flag &= ~NCF_DESTROYED; 3408 spin_unlock(&nchpp->spin); /* use_excl */ 3409 3410 _cache_setunresolved(ncp, 0); 3411 ncp->nc_flag = NCF_UNRESOLVED; 3412 ncp->nc_error = ENOTCONN; 3413 3414 _cache_ncp_gen_exit(ncp); 3415 3416 if (par_locked) { 3417 _cache_unlock(par_nch->ncp); 3418 par_locked = 0; 3419 } 3420 if (new_ncp) { 3421 _cache_free(new_ncp); 3422 new_ncp = NULL; /* safety */ 3423 } 3424 goto found; 3425 } 3426 _cache_put(rep_ncp); 3427 } 3428 } 3429 3430 /* 3431 * Otherwise create a new entry and add it to the cache. The parent 3432 * ncp must also be locked so we can link into it. 3433 * 3434 * We have to relookup after possibly blocking in kmalloc or 3435 * when locking par_nch. 3436 * 3437 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3438 * mount case, in which case nc_name will be NULL. 
3439 * 3440 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3441 * a DESTROYED entry, but didn't have an exclusive lock. 3442 * In this situation we do not create a new_ncp. 3443 */ 3444 if (new_ncp == NULL) { 3445 if (use_excl) 3446 spin_unlock(&nchpp->spin); 3447 else 3448 spin_unlock_shared(&nchpp->spin); 3449 if (rep_ncp == NULL) { 3450 new_ncp = cache_alloc(nlc->nlc_namelen); 3451 if (nlc->nlc_namelen) { 3452 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3453 nlc->nlc_namelen); 3454 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3455 } 3456 } 3457 use_excl = 1; 3458 goto restart; 3459 } 3460 3461 /* 3462 * NOTE! The spinlock is held exclusively here because new_ncp 3463 * is non-NULL. 3464 */ 3465 if (par_locked == 0) { 3466 spin_unlock(&nchpp->spin); 3467 _cache_lock(par_nch->ncp); 3468 par_locked = 1; 3469 goto restart; 3470 } 3471 3472 /* 3473 * Link to parent (requires another ref, the one already in new_ncp 3474 * is what we wil lreturn). 3475 * 3476 * WARNING! We still hold the spinlock. We have to set the hash 3477 * table entry atomically. 3478 */ 3479 ncp = new_ncp; 3480 ++ncp->nc_refs; 3481 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3482 spin_unlock(&nchpp->spin); 3483 _cache_unlock(par_nch->ncp); 3484 /* par_locked = 0 - not used */ 3485 found: 3486 /* 3487 * stats and namecache size management 3488 */ 3489 if (ncp->nc_flag & NCF_UNRESOLVED) 3490 ++gd->gd_nchstats->ncs_miss; 3491 else if (ncp->nc_vp) 3492 ++gd->gd_nchstats->ncs_goodhits; 3493 else 3494 ++gd->gd_nchstats->ncs_neghits; 3495 nch.mount = mp; 3496 nch.ncp = ncp; 3497 _cache_mntref(nch.mount); 3498 3499 return(nch); 3500 } 3501 3502 /* 3503 * Attempt to lookup a namecache entry and return with a shared namecache 3504 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3505 * set or we are unable to lock. 3506 */ 3507 int 3508 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3509 struct nlcomponent *nlc, 3510 int excl, struct nchandle *res_nch) 3511 { 3512 struct namecache *ncp; 3513 struct nchash_head *nchpp; 3514 struct mount *mp; 3515 u_int32_t hash; 3516 globaldata_t gd; 3517 3518 /* 3519 * If exclusive requested or shared namecache locks are disabled, 3520 * return failure. 3521 */ 3522 if (ncp_shared_lock_disable || excl) 3523 return(EWOULDBLOCK); 3524 3525 gd = mycpu; 3526 mp = par_nch->mount; 3527 3528 /* 3529 * This is a good time to call it, no ncp's are locked by 3530 * the caller or us. 3531 */ 3532 cache_hysteresis(1); 3533 3534 /* 3535 * Try to locate an existing entry 3536 */ 3537 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3538 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3539 nchpp = NCHHASH(hash); 3540 3541 spin_lock_shared(&nchpp->spin); 3542 3543 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3544 /* 3545 * Break out if we find a matching entry. Note that 3546 * UNRESOLVED entries may match, but DESTROYED entries 3547 * do not. 
3548 */ 3549 if (ncp->nc_parent == par_nch->ncp && 3550 ncp->nc_nlen == nlc->nlc_namelen && 3551 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3552 (ncp->nc_flag & NCF_DESTROYED) == 0 3553 ) { 3554 _cache_hold(ncp); 3555 spin_unlock_shared(&nchpp->spin); 3556 3557 if (_cache_lock_shared_special(ncp) == 0) { 3558 if (ncp->nc_parent == par_nch->ncp && 3559 ncp->nc_nlen == nlc->nlc_namelen && 3560 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3561 ncp->nc_nlen) == 0 && 3562 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3563 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3564 _cache_auto_unresolve_test(mp, ncp) == 0) 3565 { 3566 goto found; 3567 } 3568 _cache_unlock(ncp); 3569 } 3570 _cache_drop(ncp); 3571 return(EWOULDBLOCK); 3572 } 3573 } 3574 3575 /* 3576 * Failure 3577 */ 3578 spin_unlock_shared(&nchpp->spin); 3579 return(EWOULDBLOCK); 3580 3581 /* 3582 * Success 3583 * 3584 * Note that nc_error might be non-zero (e.g ENOENT). 3585 */ 3586 found: 3587 res_nch->mount = mp; 3588 res_nch->ncp = ncp; 3589 ++gd->gd_nchstats->ncs_goodhits; 3590 _cache_mntref(res_nch->mount); 3591 3592 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3593 return(ncp->nc_error); 3594 } 3595 3596 /* 3597 * This is a non-blocking verison of cache_nlookup() used by 3598 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3599 * will return nch.ncp == NULL in that case. 3600 */ 3601 struct nchandle 3602 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3603 { 3604 struct nchandle nch; 3605 struct namecache *ncp; 3606 struct namecache *new_ncp; 3607 struct nchash_head *nchpp; 3608 struct mount *mp; 3609 u_int32_t hash; 3610 globaldata_t gd; 3611 int par_locked; 3612 3613 gd = mycpu; 3614 mp = par_nch->mount; 3615 par_locked = 0; 3616 3617 /* 3618 * Try to locate an existing entry 3619 */ 3620 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3621 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3622 new_ncp = NULL; 3623 nchpp = NCHHASH(hash); 3624 restart: 3625 spin_lock(&nchpp->spin); 3626 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3627 /* 3628 * Break out if we find a matching entry. Note that 3629 * UNRESOLVED entries may match, but DESTROYED entries 3630 * do not. 3631 */ 3632 if (ncp->nc_parent == par_nch->ncp && 3633 ncp->nc_nlen == nlc->nlc_namelen && 3634 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3635 (ncp->nc_flag & NCF_DESTROYED) == 0 3636 ) { 3637 _cache_hold(ncp); 3638 spin_unlock(&nchpp->spin); 3639 if (par_locked) { 3640 _cache_unlock(par_nch->ncp); 3641 par_locked = 0; 3642 } 3643 if (_cache_lock_special(ncp) == 0) { 3644 if (ncp->nc_parent != par_nch->ncp || 3645 ncp->nc_nlen != nlc->nlc_namelen || 3646 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3647 (ncp->nc_flag & NCF_DESTROYED)) { 3648 kprintf("cache_lookup_nonblock: " 3649 "ncp-race %p %*.*s\n", 3650 ncp, 3651 nlc->nlc_namelen, 3652 nlc->nlc_namelen, 3653 nlc->nlc_nameptr); 3654 _cache_unlock(ncp); 3655 _cache_drop(ncp); 3656 goto failed; 3657 } 3658 _cache_auto_unresolve(mp, ncp); 3659 if (new_ncp) { 3660 _cache_free(new_ncp); 3661 new_ncp = NULL; 3662 } 3663 goto found; 3664 } 3665 _cache_drop(ncp); 3666 goto failed; 3667 } 3668 } 3669 3670 /* 3671 * We failed to locate an entry, create a new entry and add it to 3672 * the cache. The parent ncp must also be locked so we 3673 * can link into it. 3674 * 3675 * We have to relookup after possibly blocking in kmalloc or 3676 * when locking par_nch. 
3677 * 3678 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3679 * mount case, in which case nc_name will be NULL. 3680 */ 3681 if (new_ncp == NULL) { 3682 spin_unlock(&nchpp->spin); 3683 new_ncp = cache_alloc(nlc->nlc_namelen); 3684 if (nlc->nlc_namelen) { 3685 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3686 nlc->nlc_namelen); 3687 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3688 } 3689 goto restart; 3690 } 3691 if (par_locked == 0) { 3692 spin_unlock(&nchpp->spin); 3693 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3694 par_locked = 1; 3695 goto restart; 3696 } 3697 goto failed; 3698 } 3699 3700 /* 3701 * Link to parent (requires another ref, the one already in new_ncp 3702 * is what we wil lreturn). 3703 * 3704 * WARNING! We still hold the spinlock. We have to set the hash 3705 * table entry atomically. 3706 */ 3707 ncp = new_ncp; 3708 ++ncp->nc_refs; 3709 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3710 spin_unlock(&nchpp->spin); 3711 _cache_unlock(par_nch->ncp); 3712 /* par_locked = 0 - not used */ 3713 found: 3714 /* 3715 * stats and namecache size management 3716 */ 3717 if (ncp->nc_flag & NCF_UNRESOLVED) 3718 ++gd->gd_nchstats->ncs_miss; 3719 else if (ncp->nc_vp) 3720 ++gd->gd_nchstats->ncs_goodhits; 3721 else 3722 ++gd->gd_nchstats->ncs_neghits; 3723 nch.mount = mp; 3724 nch.ncp = ncp; 3725 _cache_mntref(nch.mount); 3726 3727 return(nch); 3728 failed: 3729 if (new_ncp) { 3730 _cache_free(new_ncp); 3731 new_ncp = NULL; 3732 } 3733 nch.mount = NULL; 3734 nch.ncp = NULL; 3735 return(nch); 3736 } 3737 3738 /* 3739 * This is a non-locking optimized lookup that depends on adding a ref 3740 * to prevent normal eviction. nch.ncp can be returned as NULL for any 3741 * reason and the caller will retry with normal locking in that case. 3742 * 3743 * This function only returns resolved entries so callers do not accidentally 3744 * race doing out of order / unfenced field checks. 3745 * 3746 * The caller must validate the result for parent-to-child continuity. 3747 */ 3748 struct nchandle 3749 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3750 { 3751 struct nchandle nch; 3752 struct namecache *ncp; 3753 struct nchash_head *nchpp; 3754 struct mount *mp; 3755 u_int32_t hash; 3756 globaldata_t gd; 3757 3758 gd = mycpu; 3759 mp = par_nch->mount; 3760 3761 /* 3762 * Try to locate an existing entry 3763 */ 3764 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3765 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3766 nchpp = NCHHASH(hash); 3767 3768 spin_lock_shared(&nchpp->spin); 3769 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3770 /* 3771 * Break out if we find a matching entry. Note that 3772 * UNRESOLVED entries may match, but DESTROYED entries 3773 * do not. However, UNRESOLVED entries still return failure. 3774 */ 3775 if (ncp->nc_parent == par_nch->ncp && 3776 ncp->nc_nlen == nlc->nlc_namelen && 3777 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3778 (ncp->nc_flag & NCF_DESTROYED) == 0 3779 ) { 3780 /* 3781 * Test NFS timeout for auto-unresolve. Give up if 3782 * the entry is not resolved. 3783 * 3784 * Getting the ref with the nchpp locked prevents 3785 * any transition to NCF_DESTROYED. 3786 */ 3787 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3788 break; 3789 if (ncp->nc_flag & NCF_UNRESOLVED) 3790 break; 3791 _cache_hold(ncp); 3792 spin_unlock_shared(&nchpp->spin); 3793 3794 /* 3795 * We need an additional test to ensure that the ref 3796 * we got above prevents transitions to NCF_UNRESOLVED. 
3797 * This can occur if another thread is currently 3798 * holding the ncp exclusively locked or (if we raced 3799 * that and it unlocked before our test) the flag 3800 * has been set. 3801 * 3802 * XXX check if superceeded by nc_generation XXX 3803 */ 3804 if (_cache_lockstatus(ncp) < 0 || 3805 (ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED))) 3806 { 3807 if ((ncvp_debug & 4) && 3808 (ncp->nc_flag & 3809 (NCF_DESTROYED | NCF_UNRESOLVED))) 3810 { 3811 kprintf("ncp state change: %p %08x %d %s\n", 3812 ncp, ncp->nc_flag, ncp->nc_error, 3813 ncp->nc_name); 3814 } 3815 _cache_drop(ncp); 3816 spin_lock_shared(&nchpp->spin); 3817 break; 3818 } 3819 3820 /* 3821 * Return the ncp bundled into a nch on success. 3822 * The ref should passively prevent the ncp from 3823 * becoming unresolved without having to hold a lock. 3824 * (XXX this may not be entirely true) 3825 */ 3826 goto found; 3827 } 3828 } 3829 spin_unlock_shared(&nchpp->spin); 3830 nch.mount = NULL; 3831 nch.ncp = NULL; 3832 3833 return nch; 3834 found: 3835 /* 3836 * stats and namecache size management 3837 */ 3838 if (ncp->nc_flag & NCF_UNRESOLVED) 3839 ++gd->gd_nchstats->ncs_miss; 3840 else if (ncp->nc_vp) 3841 ++gd->gd_nchstats->ncs_goodhits; 3842 else 3843 ++gd->gd_nchstats->ncs_neghits; 3844 nch.mount = mp; 3845 nch.ncp = ncp; 3846 _cache_mntref(nch.mount); 3847 3848 return(nch); 3849 } 3850 3851 /* 3852 * The namecache entry is marked as being used as a mount point. 3853 * Locate the mount if it is visible to the caller. The DragonFly 3854 * mount system allows arbitrary loops in the topology and disentangles 3855 * those loops by matching against (mp, ncp) rather than just (ncp). 3856 * This means any given ncp can dive any number of mounts, depending 3857 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3858 * 3859 * We use a very simple frontend cache to reduce SMP conflicts, 3860 * which we have to do because the mountlist scan needs an exclusive 3861 * lock around its ripout info list. Not to mention that there might 3862 * be a lot of mounts. 3863 * 3864 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3865 * down a bit to allow some contention rather than making the cache 3866 * excessively huge. 3867 * 3868 * The hash table is split into per-cpu areas, is 4-way set-associative. 3869 */ 3870 struct findmount_info { 3871 struct mount *result; 3872 struct mount *nch_mount; 3873 struct namecache *nch_ncp; 3874 }; 3875 3876 static __inline 3877 struct ncmount_cache * 3878 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3879 { 3880 uint32_t hash; 3881 3882 hash = iscsi_crc32(&mp, sizeof(mp)); 3883 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3884 hash ^= hash >> 16; 3885 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3886 3887 return (&ncmount_cache[hash]); 3888 } 3889 3890 static 3891 struct ncmount_cache * 3892 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3893 { 3894 struct ncmount_cache *ncc; 3895 struct ncmount_cache *best; 3896 int delta; 3897 int best_delta; 3898 int i; 3899 3900 ncc = ncmount_cache_lookup4(mp, ncp); 3901 3902 /* 3903 * NOTE: When checking for a ticks overflow implement a slop of 3904 * 2 ticks just to be safe, because ticks is accessed 3905 * non-atomically one CPU can increment it while another 3906 * is still using the old value. 
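 *
 *	 Worked example (illustrative): with ticks == 1000, a slot whose
 *	 ncc->ticks was stored as 1001 by a racing cpu yields a delta of
 *	 -1, which is within the slop and left alone; a value that
 *	 appears to lie far in the future (stale by more than half the
 *	 wrap period, or simply garbage) yields a large negative delta
 *	 and the slot's timestamp is reset below.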
3907 */ 3908 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3909 return ncc; 3910 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3911 if (delta < -2) /* overflow reset */ 3912 ncc->ticks = ticks; 3913 best = ncc; 3914 best_delta = delta; 3915 3916 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1..NCMOUNT_SET-1 */ 3917 ++ncc; 3918 if (ncc->ncp == ncp && ncc->mp == mp) 3919 return ncc; 3920 delta = (int)(ticks - ncc->ticks); 3921 if (delta < -2) 3922 ncc->ticks = ticks; 3923 if (delta > best_delta) { 3924 best_delta = delta; 3925 best = ncc; 3926 } 3927 } 3928 return best; 3929 } 3930 3931 /* 3932 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3933 * doing an expensive mountlist_scan*() if possible. 3934 * 3935 * (mp, ncp) -> mountonpt.k 3936 * 3937 * Returns a referenced mount pointer or NULL. 3938 * 3939 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3940 * operations (that is, where the mp_target can be freed out from under us). 3941 * 3942 * Lookups use the ncc->updating counter to validate the contents in order 3943 * to avoid having to obtain the per cache-element spin-lock. In addition, 3944 * the ticks field is only updated when it changes. However, if our per-cpu 3945 * lock fails due to an unmount-in-progress, we fall back to the 3946 * cache-element's spin-lock. 3947 */ 3948 struct mount * 3949 cache_findmount(struct nchandle *nch) 3950 { 3951 struct findmount_info info; 3952 struct ncmount_cache *ncc; 3953 struct ncmount_cache ncc_copy; 3954 struct mount *target; 3955 struct pcpu_ncache *pcpu; 3956 struct spinlock *spinlk; 3957 int update; 3958 3959 pcpu = pcpu_ncache; 3960 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3961 ncc = NULL; 3962 goto skip; 3963 } 3964 pcpu += mycpu->gd_cpuid; 3965 3966 again: 3967 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3968 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3969 found: 3970 /* 3971 * This is a bit messy for now because we do not yet have 3972 * safe disposal of mount structures. We have to ref 3973 * ncc->mp_target but the 'updating' counter only tells us 3974 * whether the cache has changed after the fact. 3975 * 3976 * For now get a per-cpu spinlock that will only contend 3977 * against umounts. This is the best path. If it fails, 3978 * instead of waiting on the umount we fall back to a 3979 * shared ncc->spin lock, which will generally only cost a 3980 * cache ping-pong.
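 *
 *	 Roughly, the validation sequence used below is:
 *
 *		update = ncc->updating;		/* even = stable	*/
 *		(take the pcpu umount_spin, else shared ncc->spin)
 *		if (update & 1)			/* writer active	*/
 *			take the slow path;
 *		ncc_copy = *ncc;
 *		cpu_lfence();
 *		if (ncc->updating != update)	/* entry changed	*/
 *			retry;
 *
 *	 Only a copy that survives both checks is trusted and ref'd.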
3981 */ 3982 update = ncc->updating; 3983 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3984 spinlk = &pcpu->umount_spin; 3985 } else { 3986 spinlk = &ncc->spin; 3987 spin_lock_shared(spinlk); 3988 } 3989 if (update & 1) { /* update in progress */ 3990 spin_unlock_any(spinlk); 3991 goto skip; 3992 } 3993 ncc_copy = *ncc; 3994 cpu_lfence(); 3995 if (ncc->updating != update) { /* content changed */ 3996 spin_unlock_any(spinlk); 3997 goto again; 3998 } 3999 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 4000 spin_unlock_any(spinlk); 4001 goto again; 4002 } 4003 if (ncc_copy.isneg == 0) { 4004 target = ncc_copy.mp_target; 4005 if (target->mnt_ncmounton.mount == nch->mount && 4006 target->mnt_ncmounton.ncp == nch->ncp) { 4007 /* 4008 * Cache hit (positive) (avoid dirtying 4009 * the cache line if possible) 4010 */ 4011 if (ncc->ticks != (int)ticks) 4012 ncc->ticks = (int)ticks; 4013 _cache_mntref(target); 4014 } 4015 } else { 4016 /* 4017 * Cache hit (negative) (avoid dirtying 4018 * the cache line if possible) 4019 */ 4020 if (ncc->ticks != (int)ticks) 4021 ncc->ticks = (int)ticks; 4022 target = NULL; 4023 } 4024 spin_unlock_any(spinlk); 4025 4026 return target; 4027 } 4028 skip: 4029 4030 /* 4031 * Slow 4032 */ 4033 info.result = NULL; 4034 info.nch_mount = nch->mount; 4035 info.nch_ncp = nch->ncp; 4036 mountlist_scan(cache_findmount_callback, &info, 4037 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 4038 4039 /* 4040 * To reduce multi-re-entry on the cache, relookup in the cache. 4041 * This can still race, obviously, but that's ok. 4042 */ 4043 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 4044 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 4045 if (info.result) 4046 atomic_add_int(&info.result->mnt_refs, -1); 4047 goto found; 4048 } 4049 4050 /* 4051 * Cache the result. 4052 */ 4053 if ((info.result == NULL || 4054 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 4055 spin_lock(&ncc->spin); 4056 atomic_add_int_nonlocked(&ncc->updating, 1); 4057 cpu_sfence(); 4058 KKASSERT(ncc->updating & 1); 4059 if (ncc->mp != nch->mount) { 4060 if (ncc->mp) 4061 atomic_add_int(&ncc->mp->mnt_refs, -1); 4062 atomic_add_int(&nch->mount->mnt_refs, 1); 4063 ncc->mp = nch->mount; 4064 } 4065 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 4066 ncc->ticks = (int)ticks; 4067 4068 if (info.result) { 4069 ncc->isneg = 0; 4070 if (ncc->mp_target != info.result) { 4071 if (ncc->mp_target) 4072 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4073 ncc->mp_target = info.result; 4074 atomic_add_int(&info.result->mnt_refs, 1); 4075 } 4076 } else { 4077 ncc->isneg = 1; 4078 if (ncc->mp_target) { 4079 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4080 ncc->mp_target = NULL; 4081 } 4082 } 4083 cpu_sfence(); 4084 atomic_add_int_nonlocked(&ncc->updating, 1); 4085 spin_unlock(&ncc->spin); 4086 } 4087 return(info.result); 4088 } 4089 4090 static 4091 int 4092 cache_findmount_callback(struct mount *mp, void *data) 4093 { 4094 struct findmount_info *info = data; 4095 4096 /* 4097 * Check the mount's mounted-on point against the passed nch. 4098 */ 4099 if (mp->mnt_ncmounton.mount == info->nch_mount && 4100 mp->mnt_ncmounton.ncp == info->nch_ncp 4101 ) { 4102 info->result = mp; 4103 _cache_mntref(mp); 4104 return(-1); 4105 } 4106 return(0); 4107 } 4108 4109 void 4110 cache_dropmount(struct mount *mp) 4111 { 4112 _cache_mntrel(mp); 4113 } 4114 4115 /* 4116 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 4117 * or negative). 
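 *
 * Entries are ripped out under ncc->spin using the same odd/even
 * ncc->updating protocol that cache_findmount() validates against: the
 * counter is bumped to an odd value before the entry is modified and
 * bumped again afterwards, so lockless readers either see a consistent
 * entry or retry.  The mount point is then pre-cached so the first
 * cache_findmount() on it hits the fast path.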
4118 * 4119 * A full scan is not required, but for now just do it anyway. 4120 */ 4121 void 4122 cache_ismounting(struct mount *mp) 4123 { 4124 struct ncmount_cache *ncc; 4125 struct mount *ncc_mp; 4126 int i; 4127 4128 if (pcpu_ncache == NULL) 4129 return; 4130 4131 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 4132 ncc = &ncmount_cache[i]; 4133 if (ncc->mp != mp->mnt_ncmounton.mount || 4134 ncc->ncp != mp->mnt_ncmounton.ncp) { 4135 continue; 4136 } 4137 spin_lock(&ncc->spin); 4138 atomic_add_int_nonlocked(&ncc->updating, 1); 4139 cpu_sfence(); 4140 KKASSERT(ncc->updating & 1); 4141 if (ncc->mp != mp->mnt_ncmounton.mount || 4142 ncc->ncp != mp->mnt_ncmounton.ncp) { 4143 cpu_sfence(); 4144 ++ncc->updating; 4145 spin_unlock(&ncc->spin); 4146 continue; 4147 } 4148 ncc_mp = ncc->mp; 4149 ncc->ncp = NULL; 4150 ncc->mp = NULL; 4151 if (ncc_mp) 4152 atomic_add_int(&ncc_mp->mnt_refs, -1); 4153 ncc_mp = ncc->mp_target; 4154 ncc->mp_target = NULL; 4155 if (ncc_mp) 4156 atomic_add_int(&ncc_mp->mnt_refs, -1); 4157 ncc->ticks = (int)ticks - hz * 120; 4158 4159 cpu_sfence(); 4160 atomic_add_int_nonlocked(&ncc->updating, 1); 4161 spin_unlock(&ncc->spin); 4162 } 4163 4164 /* 4165 * Pre-cache the mount point 4166 */ 4167 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 4168 mp->mnt_ncmounton.ncp); 4169 4170 spin_lock(&ncc->spin); 4171 atomic_add_int_nonlocked(&ncc->updating, 1); 4172 cpu_sfence(); 4173 KKASSERT(ncc->updating & 1); 4174 4175 if (ncc->mp) 4176 atomic_add_int(&ncc->mp->mnt_refs, -1); 4177 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 4178 ncc->mp = mp->mnt_ncmounton.mount; 4179 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 4180 ncc->ticks = (int)ticks; 4181 4182 ncc->isneg = 0; 4183 if (ncc->mp_target != mp) { 4184 if (ncc->mp_target) 4185 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4186 ncc->mp_target = mp; 4187 atomic_add_int(&mp->mnt_refs, 1); 4188 } 4189 cpu_sfence(); 4190 atomic_add_int_nonlocked(&ncc->updating, 1); 4191 spin_unlock(&ncc->spin); 4192 } 4193 4194 /* 4195 * Scrap any ncmount_cache entries related to mp. Not only do we need to 4196 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 4197 * negative hits involving (mp, <any>). 4198 * 4199 * A full scan is required. 
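 *
 * All per-cpu umount_spin locks are held across the scan, which
 * interlocks against cache_findmount()'s fast path so that a cached
 * mp_target cannot be handed out while its entries are being ripped
 * out here.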
4200 */ 4201 void 4202 cache_unmounting(struct mount *mp) 4203 { 4204 struct ncmount_cache *ncc; 4205 struct pcpu_ncache *pcpu; 4206 struct mount *ncc_mp; 4207 int i; 4208 4209 pcpu = pcpu_ncache; 4210 if (pcpu == NULL) 4211 return; 4212 4213 for (i = 0; i < ncpus; ++i) 4214 spin_lock(&pcpu[i].umount_spin); 4215 4216 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 4217 ncc = &ncmount_cache[i]; 4218 if (ncc->mp != mp && ncc->mp_target != mp) 4219 continue; 4220 spin_lock(&ncc->spin); 4221 atomic_add_int_nonlocked(&ncc->updating, 1); 4222 cpu_sfence(); 4223 4224 if (ncc->mp != mp && ncc->mp_target != mp) { 4225 atomic_add_int_nonlocked(&ncc->updating, 1); 4226 cpu_sfence(); 4227 spin_unlock(&ncc->spin); 4228 continue; 4229 } 4230 ncc_mp = ncc->mp; 4231 ncc->ncp = NULL; 4232 ncc->mp = NULL; 4233 if (ncc_mp) 4234 atomic_add_int(&ncc_mp->mnt_refs, -1); 4235 ncc_mp = ncc->mp_target; 4236 ncc->mp_target = NULL; 4237 if (ncc_mp) 4238 atomic_add_int(&ncc_mp->mnt_refs, -1); 4239 ncc->ticks = (int)ticks - hz * 120; 4240 4241 cpu_sfence(); 4242 atomic_add_int_nonlocked(&ncc->updating, 1); 4243 spin_unlock(&ncc->spin); 4244 } 4245 4246 for (i = 0; i < ncpus; ++i) 4247 spin_unlock(&pcpu[i].umount_spin); 4248 } 4249 4250 /* 4251 * Resolve an unresolved namecache entry, generally by looking it up. 4252 * The passed ncp must be locked and refd. 4253 * 4254 * Theoretically since a vnode cannot be recycled while held, and since 4255 * the nc_parent chain holds its vnode as long as children exist, the 4256 * direct parent of the cache entry we are trying to resolve should 4257 * have a valid vnode. If not then generate an error that we can 4258 * determine is related to a resolver bug. 4259 * 4260 * However, if a vnode was in the middle of a recyclement when the NCP 4261 * got locked, ncp->nc_vp might point to a vnode that is about to become 4262 * invalid. cache_resolve() handles this case by unresolving the entry 4263 * and then re-resolving it. 4264 * 4265 * Note that successful resolution does not necessarily return an error 4266 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 4267 * will be returned. 4268 * 4269 * (*genp) is adjusted based on our resolution operation. If it is already 4270 * wrong, that's ok... it will still be wrong on return. 4271 */ 4272 int 4273 cache_resolve(struct nchandle *nch, u_int *genp, struct ucred *cred) 4274 { 4275 struct namecache *par_tmp; 4276 struct namecache *par; 4277 struct namecache *ncp; 4278 struct nchandle nctmp; 4279 struct mount *mp; 4280 struct vnode *dvp; 4281 int error; 4282 4283 ncp = nch->ncp; 4284 mp = nch->mount; 4285 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4286 4287 restart: 4288 /* 4289 * If the ncp is already resolved we have nothing to do. However, 4290 * we do want to guarentee that a usable vnode is returned when 4291 * a vnode is present, so make sure it hasn't been reclaimed. 4292 */ 4293 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4294 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 4295 _cache_ncp_gen_enter(ncp); 4296 _cache_setunresolved(ncp, 0); 4297 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4298 _cache_ncp_gen_exit(ncp); 4299 *genp += 4; 4300 return (ncp->nc_error); 4301 } 4302 } else if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4303 return (ncp->nc_error); 4304 } else { 4305 _cache_ncp_gen_enter(ncp); 4306 } 4307 } else { 4308 _cache_ncp_gen_enter(ncp); 4309 } 4310 /* in gen_enter state */ 4311 *genp += 4; 4312 4313 /* 4314 * If the ncp was destroyed it will never resolve again. 
This 4315 * can basically only happen when someone is chdir'd into an 4316 * empty directory which is then rmdir'd. We want to catch this 4317 * here and not dive the VFS because the VFS might actually 4318 * have a way to re-resolve the disconnected ncp, which will 4319 * result in inconsistencies in the cdir/nch for proc->p_fd. 4320 */ 4321 if (ncp->nc_flag & NCF_DESTROYED) { 4322 _cache_ncp_gen_exit(ncp); 4323 return(EINVAL); 4324 } 4325 4326 /* 4327 * Mount points need special handling because the parent does not 4328 * belong to the same filesystem as the ncp. 4329 */ 4330 if (ncp == mp->mnt_ncmountpt.ncp) { 4331 error = cache_resolve_mp(mp, 0); 4332 _cache_ncp_gen_exit(ncp); 4333 return error; 4334 } 4335 4336 /* 4337 * We expect an unbroken chain of ncps to at least the mount point, 4338 * and even all the way to root (but this code doesn't have to go 4339 * past the mount point). 4340 */ 4341 if (ncp->nc_parent == NULL) { 4342 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 4343 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 4344 ncp->nc_error = EXDEV; 4345 _cache_ncp_gen_exit(ncp); 4346 return(ncp->nc_error); 4347 } 4348 4349 /* 4350 * The vp's of the parent directories in the chain are held via vhold() 4351 * due to the existence of the child, and should not disappear. 4352 * However, there are cases where they can disappear: 4353 * 4354 * - due to filesystem I/O errors. 4355 * - due to NFS being stupid about tracking the namespace, 4356 * frequently destroying the namespace for entire directories. 4357 * - due to forced unmounts. 4358 * - due to an rmdir (parent will be marked DESTROYED) 4359 * 4360 * When this occurs we have to track the chain backwards and resolve 4361 * it, looping until the resolver catches up to the current node. We 4362 * could recurse here but we might run ourselves out of kernel stack 4363 * so we do it in a more painful manner. This situation really should 4364 * not occur very often, and when it does it usually does not have to 4365 * go back very many nodes to resolve the ncp. 4366 */ 4367 while ((dvp = cache_dvpref(ncp)) == NULL) { 4368 /* 4369 * This case can occur if a process is CD'd into a 4370 * directory which is then rmdir'd. If the parent is marked 4371 * destroyed there is no point trying to resolve it. 4372 */ 4373 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) { 4374 if (ncvp_debug & 8) { 4375 kprintf("nc_parent destroyed: %s/%s\n", 4376 ncp->nc_parent->nc_name, ncp->nc_name); 4377 } 4378 _cache_ncp_gen_exit(ncp); 4379 return(ENOENT); 4380 } 4381 par = ncp->nc_parent; 4382 _cache_hold(par); 4383 _cache_lock(par); 4384 while ((par_tmp = par->nc_parent) != NULL && 4385 par_tmp->nc_vp == NULL) { 4386 _cache_hold(par_tmp); 4387 _cache_lock(par_tmp); 4388 _cache_put(par); 4389 par = par_tmp; 4390 } 4391 if (par->nc_parent == NULL) { 4392 kprintf("EXDEV case 2 %*.*s\n", 4393 par->nc_nlen, par->nc_nlen, par->nc_name); 4394 _cache_put(par); 4395 _cache_ncp_gen_exit(ncp); 4396 return (EXDEV); 4397 } 4398 /* 4399 * The parent is not set in stone, ref and lock it to prevent 4400 * it from disappearing. Also note that due to renames it 4401 * is possible for our ncp to move and for par to no longer 4402 * be one of its parents. We resolve it anyway; the loop 4403 * will handle any moves.
4404 */ 4405 _cache_get(par); /* additional hold/lock */ 4406 _cache_put(par); /* from earlier hold/lock */ 4407 if (par == nch->mount->mnt_ncmountpt.ncp) { 4408 cache_resolve_mp(nch->mount, 0); 4409 } else if ((dvp = cache_dvpref(par)) == NULL) { 4410 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 4411 par->nc_nlen, par->nc_nlen, par->nc_name); 4412 _cache_put(par); 4413 continue; 4414 } else { 4415 if (par->nc_flag & NCF_UNRESOLVED) { 4416 nctmp.mount = mp; 4417 nctmp.ncp = par; 4418 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4419 } 4420 vrele(dvp); 4421 } 4422 if ((error = par->nc_error) != 0) { 4423 if (par->nc_error != EAGAIN) { 4424 kprintf("EXDEV case 3 %*.*s error %d\n", 4425 par->nc_nlen, par->nc_nlen, par->nc_name, 4426 par->nc_error); 4427 _cache_put(par); 4428 _cache_ncp_gen_exit(ncp); 4429 return(error); 4430 } 4431 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 4432 par, par->nc_nlen, par->nc_nlen, par->nc_name); 4433 } 4434 _cache_put(par); 4435 /* loop */ 4436 } 4437 4438 /* 4439 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 4440 * ncp's and reattach them. If this occurs the original ncp is marked 4441 * EAGAIN to force a relookup. 4442 * 4443 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 4444 * ncp must already be resolved. 4445 */ 4446 if (dvp) { 4447 nctmp.mount = mp; 4448 nctmp.ncp = ncp; 4449 *genp += 4; /* setvp bumps the generation */ 4450 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4451 vrele(dvp); 4452 } else { 4453 ncp->nc_error = EPERM; 4454 } 4455 4456 if (ncp->nc_error == EAGAIN) { 4457 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 4458 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 4459 goto restart; 4460 } 4461 _cache_ncp_gen_exit(ncp); 4462 4463 return(ncp->nc_error); 4464 } 4465 4466 /* 4467 * Resolve the ncp associated with a mount point. Such ncp's almost always 4468 * remain resolved and this routine is rarely called. NFS MPs tend to force 4469 * re-resolution more often due to NFS's mack-truck-smash-the-namecache 4470 * method of tracking namespace changes. 4471 * 4472 * The semantics for this call are that the passed ncp must be locked on 4473 * entry and will be locked on return. However, if we actually have to 4474 * resolve the mount point we temporarily unlock the entry in order to 4475 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 4476 * the unlock we have to recheck the flags after we relock. 4477 */ 4478 static int 4479 cache_resolve_mp(struct mount *mp, int adjgen) 4480 { 4481 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 4482 struct vnode *vp; 4483 int error; 4484 4485 KKASSERT(mp != NULL); 4486 4487 /* 4488 * If the ncp is already resolved we have nothing to do. However, 4489 * we do want to guarantee that a usable vnode is returned when 4490 * a vnode is present, so make sure it hasn't been reclaimed. 4491 */ 4492 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4493 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 4494 _cache_setunresolved(ncp, adjgen); 4495 } 4496 4497 if (ncp->nc_flag & NCF_UNRESOLVED) { 4498 /* 4499 * ncp must be unlocked across the vfs_busy(), but 4500 * once busied lock ordering is ncp(s), then vnodes, 4501 * so we must relock the ncp before issuing the VFS_ROOT(). 4502 */ 4503 _cache_unlock(ncp); 4504 while (vfs_busy(mp, 0)) 4505 ; 4506 _cache_lock(ncp); 4507 error = VFS_ROOT(mp, &vp); 4508 4509 /* 4510 * Recheck the ncp state after relocking.
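 *
 *		 Another thread may have resolved the ncp while it was
 *		 unlocked across the vfs_busy(); in that case the
 *		 VFS_ROOT() result is simply released again below
 *		 instead of being installed.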
4511 */ 4512 if (ncp->nc_flag & NCF_UNRESOLVED) { 4513 ncp->nc_error = error; 4514 if (error == 0) { 4515 _cache_setvp(mp, ncp, vp, adjgen); 4516 vput(vp); 4517 } else { 4518 kprintf("[diagnostic] cache_resolve_mp: failed" 4519 " to resolve mount %p err=%d ncp=%p\n", 4520 mp, error, ncp); 4521 _cache_setvp(mp, ncp, NULL, adjgen); 4522 } 4523 } else if (error == 0) { 4524 vput(vp); 4525 } 4526 vfs_unbusy(mp); 4527 } 4528 return(ncp->nc_error); 4529 } 4530 4531 /* 4532 * Resolve the parent vnode 4533 */ 4534 int 4535 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp) 4536 { 4537 struct namecache *par_tmp; 4538 struct namecache *par; 4539 struct namecache *ncp; 4540 struct nchandle nctmp; 4541 struct mount *mp; 4542 struct vnode *dvp; 4543 int error; 4544 4545 *dvpp = NULL; 4546 ncp = nch->ncp; 4547 mp = nch->mount; 4548 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4549 4550 /* 4551 * Treat this as a mount point even if it has a parent (e.g. 4552 * null-mount). Return a NULL dvp and no error. 4553 */ 4554 if (ncp == mp->mnt_ncmountpt.ncp) 4555 return 0; 4556 4557 /* 4558 * If the ncp was destroyed there is no parent directory, return 4559 * EINVAL. 4560 */ 4561 if (ncp->nc_flag & NCF_DESTROYED) 4562 return(EINVAL); 4563 4564 /* 4565 * No parent if at the root of a filesystem, no error. Typically 4566 * not applicable to null-mounts. This case should have been caught 4567 * in the above ncmountpt check. 4568 */ 4569 if (ncp->nc_parent == NULL) 4570 return 0; 4571 4572 /* 4573 * Resolve the parent dvp. 4574 * 4575 * The vp's of the parent directories in the chain are held via vhold() 4576 * due to the existance of the child, and should not disappear. 4577 * However, there are cases where they can disappear: 4578 * 4579 * - due to filesystem I/O errors. 4580 * - due to NFS being stupid about tracking the namespace and 4581 * destroys the namespace for entire directories quite often. 4582 * - due to forced unmounts. 4583 * - due to an rmdir (parent will be marked DESTROYED) 4584 * 4585 * When this occurs we have to track the chain backwards and resolve 4586 * it, looping until the resolver catches up to the current node. We 4587 * could recurse here but we might run ourselves out of kernel stack 4588 * so we do it in a more painful manner. This situation really should 4589 * not occur all that often, or if it does not have to go back too 4590 * many nodes to resolve the ncp. 4591 */ 4592 while ((dvp = cache_dvpref(ncp)) == NULL) { 4593 /* 4594 * This case can occur if a process is CD'd into a 4595 * directory which is then rmdir'd. If the parent is marked 4596 * destroyed there is no point trying to resolve it. 4597 */ 4598 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 4599 return(ENOENT); 4600 par = ncp->nc_parent; 4601 _cache_hold(par); 4602 _cache_lock(par); 4603 while ((par_tmp = par->nc_parent) != NULL && 4604 par_tmp->nc_vp == NULL) { 4605 _cache_hold(par_tmp); 4606 _cache_lock(par_tmp); 4607 _cache_put(par); 4608 par = par_tmp; 4609 } 4610 if (par->nc_parent == NULL) { 4611 kprintf("EXDEV case 2 %*.*s\n", 4612 par->nc_nlen, par->nc_nlen, par->nc_name); 4613 _cache_put(par); 4614 return (EXDEV); 4615 } 4616 4617 /* 4618 * The parent is not set in stone, ref and lock it to prevent 4619 * it from disappearing. Also note that due to renames it 4620 * is possible for our ncp to move and for par to no longer 4621 * be one of its parents. We resolve it anyway, the loop 4622 * will handle any moves. 
4623 */ 4624 _cache_get(par); /* additional hold/lock */ 4625 _cache_put(par); /* from earlier hold/lock */ 4626 if (par == nch->mount->mnt_ncmountpt.ncp) { 4627 cache_resolve_mp(nch->mount, 1); 4628 } else if ((dvp = cache_dvpref(par)) == NULL) { 4629 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 4630 par->nc_nlen, par->nc_nlen, par->nc_name); 4631 _cache_put(par); 4632 continue; 4633 } else { 4634 if (par->nc_flag & NCF_UNRESOLVED) { 4635 nctmp.mount = mp; 4636 nctmp.ncp = par; 4637 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4638 } 4639 vrele(dvp); 4640 } 4641 if ((error = par->nc_error) != 0) { 4642 if (par->nc_error != EAGAIN) { 4643 kprintf("EXDEV case 3 %*.*s error %d\n", 4644 par->nc_nlen, par->nc_nlen, par->nc_name, 4645 par->nc_error); 4646 _cache_put(par); 4647 return(error); 4648 } 4649 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 4650 par, par->nc_nlen, par->nc_nlen, par->nc_name); 4651 } 4652 _cache_put(par); 4653 /* loop */ 4654 } 4655 4656 /* 4657 * We have a referenced dvp 4658 */ 4659 *dvpp = dvp; 4660 return 0; 4661 } 4662 4663 /* 4664 * Clean out negative cache entries when too many have accumulated. 4665 */ 4666 static void 4667 _cache_cleanneg(long count) 4668 { 4669 struct pcpu_ncache *pn; 4670 struct namecache *ncp; 4671 static uint32_t neg_rover; 4672 uint32_t n; 4673 long vnegs; 4674 4675 n = neg_rover++; /* SMP heuristical, race ok */ 4676 cpu_ccfence(); 4677 n = n % (uint32_t)ncpus; 4678 4679 /* 4680 * Normalize vfscache_negs and count. count is sometimes based 4681 * on vfscache_negs. vfscache_negs is heuristical and can sometimes 4682 * have crazy values. 4683 */ 4684 vnegs = vfscache_negs; 4685 cpu_ccfence(); 4686 if (vnegs <= MINNEG) 4687 vnegs = MINNEG; 4688 if (count < 1) 4689 count = 1; 4690 4691 pn = &pcpu_ncache[n]; 4692 spin_lock(&pn->neg_spin); 4693 count = pn->neg_count * count / vnegs + 1; 4694 spin_unlock(&pn->neg_spin); 4695 4696 /* 4697 * Attempt to clean out the specified number of negative cache 4698 * entries. 4699 */ 4700 while (count > 0) { 4701 spin_lock(&pn->neg_spin); 4702 ncp = TAILQ_FIRST(&pn->neg_list); 4703 if (ncp == NULL) { 4704 spin_unlock(&pn->neg_spin); 4705 break; 4706 } 4707 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 4708 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 4709 _cache_hold(ncp); 4710 spin_unlock(&pn->neg_spin); 4711 4712 /* 4713 * This can race, so we must re-check that the ncp 4714 * is on the ncneg.list after successfully locking it. 4715 * 4716 * Don't scrap actively referenced ncps. There should be 4717 * 3 refs. The natural ref, one from being on the neg list, 4718 * and one from us. 4719 * 4720 * Recheck fields after successfully locking to ensure 4721 * that it is in-fact still on the negative list with no 4722 * extra refs. 4723 * 4724 * WARNING! On the ncneglist scan any race against other 4725 * destructors (zaps or cache_inval_vp_quick() calls) 4726 * will have already unresolved the ncp and cause 4727 * us to drop instead of zap. This fine, if 4728 * our drop winds up being the last one it will 4729 * kfree() the ncp. 4730 */ 4731 if (_cache_lock_special(ncp) == 0) { 4732 if (ncp->nc_vp == NULL && 4733 ncp->nc_refs == 3 && 4734 (ncp->nc_flag & NCF_UNRESOLVED) == 0) 4735 { 4736 ++pcpu_ncache[mycpu->gd_cpuid].clean_neg_count; 4737 cache_zap(ncp); 4738 } else { 4739 _cache_unlock(ncp); 4740 _cache_drop(ncp); 4741 } 4742 } else { 4743 _cache_drop(ncp); 4744 } 4745 --count; 4746 } 4747 } 4748 4749 /* 4750 * Clean out unresolved cache entries when too many have accumulated. 
4751 * Resolved cache entries are cleaned out via the vnode reclamation 4752 * mechanism and by _cache_cleanneg(). 4753 */ 4754 static void 4755 _cache_cleanpos(long ucount, long xcount) 4756 { 4757 static volatile int rover; 4758 struct nchash_head *nchpp; 4759 struct namecache *ncp; 4760 long count; 4761 int rover_copy; 4762 4763 /* 4764 * Don't burn too much cpu looking for stuff 4765 */ 4766 count = (ucount > xcount) ? ucount : xcount; 4767 count = count * 4; 4768 4769 /* 4770 * Attempt to clean out the specified number of cache entries. 4771 */ 4772 while (count > 0 && (ucount > 0 || xcount > 0)) { 4773 rover_copy = atomic_fetchadd_int(&rover, 1); 4774 cpu_ccfence(); 4775 nchpp = NCHHASH(rover_copy); 4776 4777 if (TAILQ_FIRST(&nchpp->list) == NULL) { 4778 --count; 4779 continue; 4780 } 4781 4782 /* 4783 * Get the next ncp 4784 */ 4785 spin_lock(&nchpp->spin); 4786 ncp = TAILQ_FIRST(&nchpp->list); 4787 4788 /* 4789 * Skip placeholder ncp's. Do not shift their 4790 * position in the list. 4791 */ 4792 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 4793 ncp = TAILQ_NEXT(ncp, nc_hash); 4794 4795 if (ncp) { 4796 /* 4797 * Move to end of list 4798 */ 4799 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 4800 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 4801 4802 if (ncp->nc_refs != ncpbaserefs(ncp)) { 4803 /* 4804 * Do not destroy internal nodes that have 4805 * children or nodes which have thread 4806 * references. 4807 */ 4808 ncp = NULL; 4809 } else if (ucount > 0 && 4810 (ncp->nc_flag & NCF_UNRESOLVED)) 4811 { 4812 /* 4813 * Destroy unresolved nodes if asked. 4814 */ 4815 --ucount; 4816 --xcount; 4817 _cache_hold(ncp); 4818 } else if (xcount > 0) { 4819 /* 4820 * Destroy any other node if asked. 4821 */ 4822 --xcount; 4823 _cache_hold(ncp); 4824 } else { 4825 /* 4826 * Otherwise don't 4827 */ 4828 ncp = NULL; 4829 } 4830 } 4831 spin_unlock(&nchpp->spin); 4832 4833 /* 4834 * Try to scap the ncp if we can do so non-blocking. 4835 * We must re-check nc_refs after locking, and it will 4836 * have one additional ref from above. 4837 */ 4838 if (ncp) { 4839 if (_cache_lock_special(ncp) == 0) { 4840 if (ncp->nc_refs == 1 + ncpbaserefs(ncp)) { 4841 ++pcpu_ncache[mycpu->gd_cpuid]. 4842 clean_pos_count; 4843 cache_zap(ncp); 4844 } else { 4845 _cache_unlock(ncp); 4846 _cache_drop(ncp); 4847 } 4848 } else { 4849 _cache_drop(ncp); 4850 } 4851 } 4852 --count; 4853 } 4854 } 4855 4856 /* 4857 * This is a kitchen sink function to clean out ncps which we 4858 * tried to zap from cache_drop() but failed because we were 4859 * unable to acquire the parent lock. 4860 * 4861 * Such entries can also be removed via cache_inval_vp(), such 4862 * as when unmounting. 4863 */ 4864 static void 4865 _cache_cleandefered(void) 4866 { 4867 struct nchash_head *nchpp; 4868 struct namecache *ncp; 4869 struct namecache dummy; 4870 int i; 4871 4872 /* 4873 * Create a list iterator. DUMMY indicates that this is a list 4874 * iterator, DESTROYED prevents matches by lookup functions. 
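 *
 * The dummy entry is inserted at the head of each hash chain and then
 * leap-frogged to just past each candidate ncp, allowing the scan to
 * drop nchpp->spin, operate on the ncp, and then resume from the
 * marker without losing its place in the chain.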
4875 */ 4876 numdefered = 0; 4877 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4878 bzero(&dummy, sizeof(dummy)); 4879 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4880 dummy.nc_refs = 1; 4881 4882 for (i = 0; i <= nchash; ++i) { 4883 nchpp = &nchashtbl[i]; 4884 4885 spin_lock(&nchpp->spin); 4886 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4887 ncp = &dummy; 4888 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4889 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4890 continue; 4891 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4892 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4893 _cache_hold(ncp); 4894 spin_unlock(&nchpp->spin); 4895 if (_cache_lock_nonblock(ncp) == 0) { 4896 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4897 _cache_unlock(ncp); 4898 } 4899 _cache_drop(ncp); 4900 spin_lock(&nchpp->spin); 4901 ncp = &dummy; 4902 } 4903 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4904 spin_unlock(&nchpp->spin); 4905 } 4906 } 4907 4908 /* 4909 * Name cache initialization, from vfsinit() when we are booting 4910 */ 4911 void 4912 nchinit(void) 4913 { 4914 struct pcpu_ncache *pn; 4915 globaldata_t gd; 4916 int i; 4917 4918 /* 4919 * Per-cpu accounting and negative hit list 4920 */ 4921 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4922 M_VFSCACHEAUX, M_WAITOK|M_ZERO); 4923 for (i = 0; i < ncpus; ++i) { 4924 pn = &pcpu_ncache[i]; 4925 TAILQ_INIT(&pn->neg_list); 4926 spin_init(&pn->neg_spin, "ncneg"); 4927 spin_init(&pn->umount_spin, "ncumm"); 4928 } 4929 4930 /* 4931 * Initialise per-cpu namecache effectiveness statistics. 4932 */ 4933 for (i = 0; i < ncpus; ++i) { 4934 gd = globaldata_find(i); 4935 gd->gd_nchstats = &nchstats[i]; 4936 } 4937 4938 /* 4939 * Create a generous namecache hash table 4940 */ 4941 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4942 sizeof(struct nchash_head), 4943 M_VFSCACHEAUX, &nchash); 4944 for (i = 0; i <= (int)nchash; ++i) { 4945 TAILQ_INIT(&nchashtbl[i].list); 4946 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4947 } 4948 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4949 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4950 nclockwarn = 5 * hz; 4951 } 4952 4953 /* 4954 * Called from start_init() to bootstrap the root filesystem. Returns 4955 * a referenced, unlocked namecache record to serve as a root or the 4956 * root of the system. 4957 * 4958 * Adjust our namecache counts 4959 */ 4960 void 4961 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 4962 { 4963 /*struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];*/ 4964 4965 /* nc_parent is NULL, doesn't count as a leaf or unresolved */ 4966 /*atomic_add_long(&pn->vfscache_leafs, 1);*/ 4967 /*atomic_add_long(&pn->vfscache_unres, 1);*/ 4968 4969 nch->ncp = cache_alloc(0); 4970 nch->mount = mp; 4971 _cache_mntref(mp); 4972 if (vp) 4973 _cache_setvp(nch->mount, nch->ncp, vp, 1); 4974 } 4975 4976 /* 4977 * vfs_cache_setroot() 4978 * 4979 * Create an association between the root of our namecache and 4980 * the root vnode. This routine may be called several times during 4981 * booting. 4982 * 4983 * If the caller intends to save the returned namecache pointer somewhere 4984 * it must cache_hold() it. 
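 *
 * The old root references are released here (vrele()/cache_drop());
 * the new root is installed without acquiring additional references,
 * so the caller's references on nvp/nch are in effect transferred to
 * rootvnode/rootnch.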
4985 */ 4986 void 4987 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 4988 { 4989 struct vnode *ovp; 4990 struct nchandle onch; 4991 4992 ovp = rootvnode; 4993 onch = rootnch; 4994 rootvnode = nvp; 4995 if (nch) 4996 rootnch = *nch; 4997 else 4998 cache_zero(&rootnch); 4999 if (ovp) 5000 vrele(ovp); 5001 if (onch.ncp) 5002 cache_drop(&onch); 5003 } 5004 5005 /* 5006 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 5007 * topology and is being removed as quickly as possible. The new VOP_N*() 5008 * API calls are required to make specific adjustments using the supplied 5009 * ncp pointers rather then just bogusly purging random vnodes. 5010 * 5011 * Invalidate all namecache entries to a particular vnode as well as 5012 * any direct children of that vnode in the namecache. This is a 5013 * 'catch all' purge used by filesystems that do not know any better. 5014 * 5015 * Note that the linkage between the vnode and its namecache entries will 5016 * be removed, but the namecache entries themselves might stay put due to 5017 * active references from elsewhere in the system or due to the existance of 5018 * the children. The namecache topology is left intact even if we do not 5019 * know what the vnode association is. Such entries will be marked 5020 * NCF_UNRESOLVED. 5021 */ 5022 void 5023 cache_purge(struct vnode *vp) 5024 { 5025 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 5026 } 5027 5028 __read_mostly static int disablecwd; 5029 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 5030 "Disable getcwd"); 5031 5032 /* 5033 * MPALMOSTSAFE 5034 */ 5035 int 5036 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap) 5037 { 5038 u_int buflen; 5039 int error; 5040 char *buf; 5041 char *bp; 5042 5043 if (disablecwd) 5044 return (ENODEV); 5045 5046 buflen = uap->buflen; 5047 if (buflen == 0) 5048 return (EINVAL); 5049 if (buflen > MAXPATHLEN) 5050 buflen = MAXPATHLEN; 5051 5052 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 5053 bp = kern_getcwd(buf, buflen, &error); 5054 if (error == 0) 5055 error = copyout(bp, uap->buf, strlen(bp) + 1); 5056 kfree(buf, M_TEMP); 5057 return (error); 5058 } 5059 5060 char * 5061 kern_getcwd(char *buf, size_t buflen, int *error) 5062 { 5063 struct proc *p = curproc; 5064 char *bp; 5065 int i, slash_prefixed; 5066 struct filedesc *fdp; 5067 struct nchandle nch; 5068 struct namecache *ncp; 5069 5070 bp = buf; 5071 bp += buflen - 1; 5072 *bp = '\0'; 5073 fdp = p->p_fd; 5074 slash_prefixed = 0; 5075 5076 nch = fdp->fd_ncdir; 5077 ncp = nch.ncp; 5078 if (ncp) 5079 _cache_hold(ncp); 5080 5081 while (ncp && (ncp != fdp->fd_nrdir.ncp || 5082 nch.mount != fdp->fd_nrdir.mount) 5083 ) { 5084 if (ncp->nc_flag & NCF_DESTROYED) { 5085 _cache_drop(ncp); 5086 ncp = NULL; 5087 break; 5088 } 5089 /* 5090 * While traversing upwards if we encounter the root 5091 * of the current mount we have to skip to the mount point 5092 * in the underlying filesystem. 5093 */ 5094 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 5095 nch = nch.mount->mnt_ncmounton; 5096 _cache_drop(ncp); 5097 ncp = nch.ncp; 5098 if (ncp) 5099 _cache_hold(ncp); 5100 continue; 5101 } 5102 5103 /* 5104 * Prepend the path segment 5105 */ 5106 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 5107 if (bp == buf) { 5108 *error = ERANGE; 5109 bp = NULL; 5110 goto done; 5111 } 5112 *--bp = ncp->nc_name[i]; 5113 } 5114 if (bp == buf) { 5115 *error = ERANGE; 5116 bp = NULL; 5117 goto done; 5118 } 5119 *--bp = '/'; 5120 slash_prefixed = 1; 5121 5122 /* 5123 * Go up a directory. 
This isn't a mount point so we don't 5124 * have to check again. 5125 */ 5126 while ((nch.ncp = ncp->nc_parent) != NULL) { 5127 if (ncp_shared_lock_disable) 5128 _cache_lock(ncp); 5129 else 5130 _cache_lock_shared(ncp); 5131 if (nch.ncp != ncp->nc_parent) { 5132 _cache_unlock(ncp); 5133 continue; 5134 } 5135 _cache_hold(nch.ncp); 5136 _cache_unlock(ncp); 5137 break; 5138 } 5139 _cache_drop(ncp); 5140 ncp = nch.ncp; 5141 } 5142 if (ncp == NULL) { 5143 *error = ENOENT; 5144 bp = NULL; 5145 goto done; 5146 } 5147 if (!slash_prefixed) { 5148 if (bp == buf) { 5149 *error = ERANGE; 5150 bp = NULL; 5151 goto done; 5152 } 5153 *--bp = '/'; 5154 } 5155 *error = 0; 5156 done: 5157 if (ncp) 5158 _cache_drop(ncp); 5159 return (bp); 5160 } 5161 5162 /* 5163 * Thus begins the fullpath magic. 5164 * 5165 * The passed nchp is referenced but not locked. 5166 */ 5167 __read_mostly static int disablefullpath; 5168 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 5169 &disablefullpath, 0, 5170 "Disable fullpath lookups"); 5171 5172 int 5173 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 5174 char **retbuf, char **freebuf, int guess) 5175 { 5176 struct nchandle fd_nrdir; 5177 struct nchandle nch; 5178 struct namecache *ncp; 5179 struct mount *mp, *new_mp; 5180 char *bp, *buf; 5181 int slash_prefixed; 5182 int error = 0; 5183 int i; 5184 5185 *retbuf = NULL; 5186 *freebuf = NULL; 5187 5188 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 5189 bp = buf + MAXPATHLEN - 1; 5190 *bp = '\0'; 5191 if (nchbase) 5192 fd_nrdir = *nchbase; 5193 else if (p != NULL) 5194 fd_nrdir = p->p_fd->fd_nrdir; 5195 else 5196 fd_nrdir = rootnch; 5197 slash_prefixed = 0; 5198 nch = *nchp; 5199 ncp = nch.ncp; 5200 if (ncp) 5201 _cache_hold(ncp); 5202 mp = nch.mount; 5203 5204 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 5205 new_mp = NULL; 5206 5207 /* 5208 * If we are asked to guess the upwards path, we do so whenever 5209 * we encounter an ncp marked as a mountpoint. We try to find 5210 * the actual mountpoint by finding the mountpoint with this 5211 * ncp. 5212 */ 5213 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 5214 new_mp = mount_get_by_nc(ncp); 5215 } 5216 /* 5217 * While traversing upwards if we encounter the root 5218 * of the current mount we have to skip to the mount point. 5219 */ 5220 if (ncp == mp->mnt_ncmountpt.ncp) { 5221 new_mp = mp; 5222 } 5223 if (new_mp) { 5224 nch = new_mp->mnt_ncmounton; 5225 _cache_drop(ncp); 5226 ncp = nch.ncp; 5227 if (ncp) 5228 _cache_hold(ncp); 5229 mp = nch.mount; 5230 continue; 5231 } 5232 5233 /* 5234 * Prepend the path segment 5235 */ 5236 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 5237 if (bp == buf) { 5238 kfree(buf, M_TEMP); 5239 error = ENOMEM; 5240 goto done; 5241 } 5242 *--bp = ncp->nc_name[i]; 5243 } 5244 if (bp == buf) { 5245 kfree(buf, M_TEMP); 5246 error = ENOMEM; 5247 goto done; 5248 } 5249 *--bp = '/'; 5250 slash_prefixed = 1; 5251 5252 /* 5253 * Go up a directory. This isn't a mount point so we don't 5254 * have to check again. 5255 * 5256 * We can only safely access nc_parent with ncp held locked. 
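 *
 *		 The loop below locks ncp (shared), re-checks that
 *		 nc_parent still matches the unlocked snapshot (a
 *		 concurrent rename may have moved ncp), and only then
 *		 acquires a hold on the parent before unlocking and
 *		 continuing upwards.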
5257 */ 5258 while ((nch.ncp = ncp->nc_parent) != NULL) { 5259 _cache_lock_shared(ncp); 5260 if (nch.ncp != ncp->nc_parent) { 5261 _cache_unlock(ncp); 5262 continue; 5263 } 5264 _cache_hold(nch.ncp); 5265 _cache_unlock(ncp); 5266 break; 5267 } 5268 _cache_drop(ncp); 5269 ncp = nch.ncp; 5270 } 5271 if (ncp == NULL) { 5272 kfree(buf, M_TEMP); 5273 error = ENOENT; 5274 goto done; 5275 } 5276 5277 if (!slash_prefixed) { 5278 if (bp == buf) { 5279 kfree(buf, M_TEMP); 5280 error = ENOMEM; 5281 goto done; 5282 } 5283 *--bp = '/'; 5284 } 5285 *retbuf = bp; 5286 *freebuf = buf; 5287 error = 0; 5288 done: 5289 if (ncp) 5290 _cache_drop(ncp); 5291 return(error); 5292 } 5293 5294 int 5295 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 5296 char **freebuf, int guess) 5297 { 5298 struct namecache *ncp; 5299 struct nchandle nch; 5300 int error; 5301 5302 *freebuf = NULL; 5303 if (disablefullpath) 5304 return (ENODEV); 5305 5306 if (p == NULL) 5307 return (EINVAL); 5308 5309 /* vn is NULL, client wants us to use p->p_textvp */ 5310 if (vn == NULL) { 5311 if ((vn = p->p_textvp) == NULL) 5312 return (EINVAL); 5313 } 5314 spin_lock_shared(&vn->v_spin); 5315 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 5316 if (ncp->nc_nlen) 5317 break; 5318 } 5319 if (ncp == NULL) { 5320 spin_unlock_shared(&vn->v_spin); 5321 return (EINVAL); 5322 } 5323 _cache_hold(ncp); 5324 spin_unlock_shared(&vn->v_spin); 5325 5326 nch.ncp = ncp; 5327 nch.mount = vn->v_mount; 5328 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 5329 _cache_drop(ncp); 5330 return (error); 5331 } 5332 5333 void 5334 vfscache_rollup_cpu(struct globaldata *gd) 5335 { 5336 struct pcpu_ncache *pn; 5337 long count; 5338 5339 if (pcpu_ncache == NULL) 5340 return; 5341 pn = &pcpu_ncache[gd->gd_cpuid]; 5342 5343 /* 5344 * namecache statistics 5345 */ 5346 if (pn->vfscache_count) { 5347 count = atomic_swap_long(&pn->vfscache_count, 0); 5348 atomic_add_long(&vfscache_count, count); 5349 } 5350 if (pn->vfscache_leafs) { 5351 count = atomic_swap_long(&pn->vfscache_leafs, 0); 5352 atomic_add_long(&vfscache_leafs, count); 5353 } 5354 if (pn->vfscache_unres) { 5355 count = atomic_swap_long(&pn->vfscache_unres, 0); 5356 atomic_add_long(&vfscache_unres, count); 5357 } 5358 if (pn->vfscache_negs) { 5359 count = atomic_swap_long(&pn->vfscache_negs, 0); 5360 atomic_add_long(&vfscache_negs, count); 5361 } 5362 5363 /* 5364 * hysteresis based cleanings 5365 */ 5366 if (pn->inv_kid_quick_count) { 5367 count = atomic_swap_long(&pn->inv_kid_quick_count, 0); 5368 atomic_add_long(&inv_kid_quick_count, count); 5369 } 5370 if (pn->inv_ncp_quick_count) { 5371 count = atomic_swap_long(&pn->inv_ncp_quick_count, 0); 5372 atomic_add_long(&inv_ncp_quick_count, count); 5373 } 5374 if (pn->clean_pos_count) { 5375 count = atomic_swap_long(&pn->clean_pos_count, 0); 5376 atomic_add_long(&clean_pos_count, count); 5377 } 5378 if (pn->clean_neg_count) { 5379 count = atomic_swap_long(&pn->clean_neg_count, 0); 5380 atomic_add_long(&clean_neg_count, count); 5381 } 5382 5383 if (pn->numdefered) { 5384 count = atomic_swap_long(&pn->numdefered, 0); 5385 atomic_add_long(&numdefered, count); 5386 } 5387 } 5388
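/*
 * Illustrative use of the fullpath interface (a sketch, not an existing
 * caller in this file): cache_fullpath()/vn_fullpath() return the path in
 * *retbuf, which points into the M_TEMP buffer returned via *freebuf, so
 * the caller is responsible for kfree()ing *freebuf when done, e.g.
 *
 *	char *path, *freepath;
 *
 *	if (vn_fullpath(p, vp, &path, &freepath, 0) == 0) {
 *		kprintf("path: %s\n", path);
 *		kfree(freepath, M_TEMP);
 *	}
 */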