/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysmsg.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin.
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (6) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;		/* 8 bytes */
	long	pad01;			/* 8 bytes */
};

struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	struct mount *mp_target;
	int	isneg;
	int	ticks;
	int	updating;
	int	unused01;
};

struct pcpu_ncache {
	struct spinlock		umount_spin;	/* cache_findmount/interlock */
	struct spinlock		neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long			neg_count;
	long			vfscache_negs;
	long			vfscache_count;
	long			vfscache_leafs;
	long			numdefered;
} __cachealign;

__read_mostly static struct nchash_head	*nchashtbl;
__read_mostly static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
 */
__read_mostly static int ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

__read_mostly static u_long nchash;		/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

__read_mostly static int ncnegflush = 10;	/* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

__read_mostly static int ncposflush = 10;	/* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

__read_mostly static int ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

__read_mostly static int nclockwarn;	/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

__read_mostly static int ncposlimit;	/* number of cache entries allocated */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Number of cache entries allocated");

__read_mostly static int ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
    &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

__read_mostly static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
    &ncmount_cache_enable, 0, "mount point cache");

static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of namecache leaf entries");
static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of cache entries allocated");


struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats",
    "VFS cache effectiveness statistics");

static void cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
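 *
 * Illustrative note (an addition for clarity, not taken from the original
 * source): the point of this per-cpu, set-associative cache is that
 * _cache_mntrel() can park a mount reference in a local slot and a later
 * _cache_mntref() on the same cpu can reclaim it with a single
 * atomic_swap_ptr(), instead of hitting the shared mp->mnt_refs counter
 * with an atomic op on every handle copy and drop.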
 */
#define MNTCACHE_COUNT	32	/* power of 2, multiple of SET */
#define MNTCACHE_SET	8	/* set associativity */

struct mntcache_elm {
	struct namecache *ncp;
	struct mount	 *mp;
	int	ticks;
	int	unused01;
};

struct mntcache {
	struct mntcache_elm array[MNTCACHE_COUNT];
} __cachealign;

static struct mntcache	pcpu_mntcache[MAXCPU];

static __inline
struct mntcache_elm *
_cache_mntcache_hash(void *ptr)
{
	struct mntcache_elm *elm;
	int hv;

	hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
	elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];

	return elm;
}

static
void
_cache_mntref(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mount *mpr;
	int i;

	elm = _cache_mntcache_hash(mp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == mp) {
			mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
			if (__predict_true(mpr == mp))
				return;
			if (mpr)
				atomic_add_int(&mpr->mnt_refs, -1);
		}
		++elm;
	}
	atomic_add_int(&mp->mnt_refs, 1);
}

static
void
_cache_mntrel(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct mount *mpr;
	int delta1;
	int delta2;
	int i;

	elm = _cache_mntcache_hash(mp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == NULL) {
			mpr = atomic_swap_ptr((void *)&elm->mp, mp);
			if (__predict_false(mpr != NULL)) {
				atomic_add_int(&mpr->mnt_refs, -1);
			}
			elm->ticks = ticks;
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	mpr = atomic_swap_ptr((void *)&best->mp, mp);
	best->ticks = ticks;
	if (mpr)
		atomic_add_int(&mpr->mnt_refs, -1);
}

/*
 * Clears all cached mount points on all cpus.  This routine should only
 * be called when we are waiting for a mount to clear, e.g. so we can
 * unmount.
 */
void
cache_clearmntcache(struct mount *target __unused)
{
	int n;

	for (n = 0; n < ncpus; ++n) {
		struct mntcache *cache = &pcpu_mntcache[n];
		struct mntcache_elm *elm;
		struct namecache *ncp;
		struct mount *mp;
		int i;

		for (i = 0; i < MNTCACHE_COUNT; ++i) {
			elm = &cache->array[i];
			if (elm->mp) {
				mp = atomic_swap_ptr((void *)&elm->mp, NULL);
				if (mp)
					atomic_add_int(&mp->mnt_refs, -1);
			}
			if (elm->ncp) {
				ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
				if (ncp)
					_cache_drop(ncp);
			}
		}
	}
}

/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  The controlling entity
 * in a 1->0 transition does not need to lock the ncp to dispose of it,
 * as nobody else will have visibility to it at that point.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
static __inline
void
_cache_lock(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Release a previously acquired lock.
 *
 * A concurrent shared-lock acquisition or acquisition/release can
 * race bit 31 so only drop the ncp if bit 31 was set.
 */
static __inline
void
_cache_unlock(struct namecache *ncp)
{
	lockmgr(&ncp->nc_lock, LK_RELEASE);
}

/*
 * Lock ncp exclusively, non-blocking.  Return 0 on success.
 */
static __inline
int
_cache_lock_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static __inline
int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if (lockmgr_oneexcl(&ncp->nc_lock)) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return 0;
		}
		_cache_unlock(ncp);
	}
	return EWOULDBLOCK;
}

/*
 * Shared lock, guarantees vp held
 *
 * The shared lock holds vp on the 0->1 transition.  It is possible to race
 * another shared lock release, preventing the other release from dropping
 * the vnode and clearing bit 31.
 *
 * If it is not set then we are responsible for setting it, and this
 * responsibility does not race with anyone else.
 */
static __inline
void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock_shared: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 */
static __inline
int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This function tries to get a shared lock but will back-off to an
 * exclusive lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 *
 *	    This is very evident in dsynth's initial scans.
 */
static __inline
int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return(0);
			}
		}
		_cache_unlock(ncp);
		return(EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
		_cache_lock(ncp);
		return(0);
	}
	_cache_lock_shared(ncp);
	return(0);
}

static __inline
int
_cache_lockstatus(struct namecache *ncp)
{
	int status;

	status = lockstatus(&ncp->nc_lock, curthread);
	if (status == 0 || status == LK_EXCLOTHER)
		status = -1;
	return status;
}

/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already deterministically at least 1, such as being
 * associated with e.g. a process, file descriptor, or some other entity.
 *
 * Only the above situations, similar situations within this module where
 * the ref count is deterministically at least 1, or when the ncp is found
 * via the nchpp (hash table) lookup, can bump nc_refs.
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any child of its own, again because nc_refs is now 0
 * and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHE);
		kfree(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);

		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);			/* add nc_parent ref */
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.
 * XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct namecache *par;
	struct vnode *dropvp;
	struct nchash_head *nchpp;

	if ((par = ncp->nc_parent) != NULL) {
		cpu_ccfence();
		KKASSERT(ncp->nc_parent == par);

		/* don't add a ref, we drop the nchpp ref later */
		_cache_lock(par);
		nchpp = ncp->nc_head;
		spin_lock(&nchpp->spin);

		/*
		 * Remove from hash table and parent, adjust accounting
		 */
		TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_count, -1);
		if (TAILQ_EMPTY(&ncp->nc_list))
			atomic_add_long(&pn->vfscache_leafs, -1);

		dropvp = NULL;
		if (TAILQ_EMPTY(&par->nc_list)) {
			atomic_add_long(&pn->vfscache_leafs, 1);
			if (par->nc_vp)
				dropvp = par->nc_vp;
		}
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		spin_unlock(&nchpp->spin);
		_cache_unlock(par);
		_cache_drop(par);		/* drop nc_parent ref */

		/*
		 * We can only safely vdrop with no spinlocks held.
		 */
		if (dropvp)
			vdrop(dropvp);
	}
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The returned ncp will be locked and referenced.  The ref is generally
 * meant to be transferred to the nchpp linkage.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;
	TAILQ_INIT(&ncp->nc_list);
	lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
	lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);

	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 */
static void
_cache_free(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs == 1);
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);
}

/*
 * [re]initialize a nchandle.
 */
void
cache_zero(struct nchandle *nch)
{
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Ref and deref a nchandle structure (ncp + mp)
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
	_cache_hold(nch->ncp);
	_cache_mntref(nch->mount);
	return(nch);
}

/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
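 *
 * Illustrative usage sketch (an assumption for clarity, not taken from the
 * original source); a caller holding a referenced nchandle 'nch' might do:
 *
 *	struct nchandle copy;
 *
 *	cache_copy(&nch, &copy);	(copy gains its own ncp + mount refs)
 *	... use copy, possibly across a blocking operation ...
 *	cache_drop(&copy);		(releases both refs, zeroes copy)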
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
	struct namecache *ncp;
	struct mount *mp;
	struct mntcache_elm *elm;
	struct namecache *ncpr;
	int i;

	ncp = nch->ncp;
	mp = nch->mount;
	target->ncp = ncp;
	target->mount = mp;

	elm = _cache_mntcache_hash(ncp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == ncp) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
			if (ncpr == ncp) {
				_cache_mntref(mp);
				return;
			}
			if (ncpr)
				_cache_drop(ncpr);
		}
		++elm;
	}
	if (ncp)
		_cache_hold(ncp);
	_cache_mntref(mp);
}

/*
 * Drop the nchandle, but try to cache the ref to avoid global atomic
 * ops.  This is typically done on the system root and jail root nchandles.
 */
void
cache_drop_and_cache(struct nchandle *nch, int elmno)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct namecache *ncpr;
	int delta1;
	int delta2;
	int i;

	if (elmno > 4) {
		if (nch->ncp) {
			_cache_drop(nch->ncp);
			nch->ncp = NULL;
		}
		if (nch->mount) {
			_cache_mntrel(nch->mount);
			nch->mount = NULL;
		}
		return;
	}

	elm = _cache_mntcache_hash(nch->ncp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == NULL) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
			_cache_mntrel(nch->mount);
			elm->ticks = ticks;
			nch->mount = NULL;
			nch->ncp = NULL;
			if (ncpr)
				_cache_drop(ncpr);
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
	_cache_mntrel(nch->mount);
	best->ticks = ticks;
	nch->mount = NULL;
	nch->ncp = NULL;
	if (ncpr)
		_cache_drop(ncpr);
}

void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	_cache_mntref(mp);
	_cache_mntrel(nch->mount);
	nch->mount = mp;
}

void
cache_drop(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}

void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}

/*
 * Lock fncpd, fncp, tncpd, and tncp.  tncp is already locked but may
 * have to be cycled to avoid deadlocks.  Make sure all four are resolved.
 *
 * The caller is responsible for checking the validity upon return as
 * the records may have been flagged DESTROYED in the interim.
 *
 * Namecache lock ordering is leaf first, then parent.  However, complex
 * interactions may occur between the source and target because there is
 * no ordering guarantee between (fncpd, fncp) and (tncpd, tncp).
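 *
 * A minimal sketch of the deadlock-avoidance idiom used below (illustrative
 * only, simplified from the actual code):
 *
 *	if (cache_lock_nonblock(contended) != 0) {
 *		cache_unlock(everything_already_held);
 *		cache_lock(contended);		(block until it is free)
 *		cache_unlock(contended);	(then cycle it ...)
 *		goto again;			(... and retry from scratch)
 *	}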
 */
void
cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp,
		       struct nchandle *tncpd, struct nchandle *tncp,
		       struct ucred *fcred, struct ucred *tcred)
{
	int tlocked = 1;

	/*
	 * Lock tncp and tncpd
	 *
	 * NOTE: Because these ncps are not locked to begin with, it is
	 *	 possible for other rename races to cause the normal lock
	 *	 order assumptions to fail.
	 *
	 * NOTE: Lock ordering assumptions are valid if a leaf's parent
	 *	 matches after the leaf has been locked.  However, ordering
	 *	 between the 'from' and the 'to' is not and an overlapping
	 *	 lock order reversal is still possible.
	 */
again:
	if (__predict_false(tlocked == 0)) {
		cache_lock(tncp);
	}
	if (__predict_false(cache_lock_nonblock(tncpd) != 0)) {
		cache_unlock(tncp);
		cache_lock(tncpd); cache_unlock(tncpd);	/* cycle */
		tlocked = 0;
		goto again;
	}

	/*
	 * Lock fncp and fncpd
	 *
	 * NOTE: Because these ncps are not locked to begin with, it is
	 *	 possible for other rename races to cause the normal lock
	 *	 order assumptions to fail.
	 *
	 * NOTE: Lock ordering assumptions are valid if a leaf's parent
	 *	 matches after the leaf has been locked.  However, ordering
	 *	 between the 'from' and the 'to' is not and an overlapping
	 *	 lock order reversal is still possible.
	 */
	if (__predict_false(cache_lock_nonblock(fncp) != 0)) {
		cache_unlock(tncpd);
		cache_unlock(tncp);
		cache_lock(fncp); cache_unlock(fncp);	/* cycle */
		tlocked = 0;
		goto again;
	}
	if (__predict_false(cache_lock_nonblock(fncpd) != 0)) {
		cache_unlock(fncp);
		cache_unlock(tncpd);
		cache_unlock(tncp);
		cache_lock(fncpd); cache_unlock(fncpd);	/* cycle */
		tlocked = 0;
		goto again;
	}
	if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(fncpd, fcred);
	if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(tncpd, tcred);
	if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(fncp, fcred);
	if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(tncp, tcred);
}

int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return(ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
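 *
 * Illustrative note (an addition, not from the original source): callers
 * that intend to resolve/unresolve the entry or change its vnode
 * association should pass a non-zero 'excl' so the exclusive _cache_get()
 * path is taken; excl == 0 merely permits a shared lock, it does not
 * guarantee one.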
 */
static
struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			_cache_unlock(ncp);
			ncp = _cache_get(ncp);
			_cache_drop(ncp);
		}
	} else {
		_cache_unlock(ncp);
		ncp = _cache_get(ncp);
		_cache_drop(ncp);
	}
	return(ncp);
}

/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	_cache_mntref(target->mount);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	_cache_mntref(target->mount);
}

/*
 * Release a held and locked ncp
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

void
cache_put(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
		 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
		 ncp->nc_vp == NULL);

	if (vp) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		++vp->v_namecache_count;
		_cache_hold(ncp);		/* v_namecache assoc */
		spin_unlock(&vp->v_spin);
		vhold(vp);			/* nc_vp */

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}

		ncp->nc_error = 0;

		/*
		 * XXX: this is a hack to work-around the lack of a real
		 * pfs vfs implementation
		 */
		if (mp) {
			if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
				vp->v_pfsmp = mp;
		}
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

		ncp->nc_vp = NULL;
		ncp->nc_negcpu = mycpu->gd_cpuid;
		spin_lock(&pn->neg_spin);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);		/* neg_list assoc */
		++pn->neg_count;
		spin_unlock(&pn->neg_spin);
		atomic_add_long(&pn->vfscache_negs, 1);

		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * Used for NFS
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			--vp->v_namecache_count;
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with ncp
			 * is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			vdrop(vp);
		} else {
			struct pcpu_ncache *pn;

			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
		_cache_drop(ncp);	/* from v_namecache or neg_list */
	}
}

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
			       MNTSCAN_NOUNLOCK);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.
 * This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			cache_zap(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags,
		      struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}

	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.  Then lock the kid and check for
			 * races.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			_cache_lock(kid);
			if (kid->nc_parent != ncp) {
				kprintf("cache_inval_internal "
					"restartB %s\n",
					ncp->nc_name);
				restart = 1;
				_cache_unlock(kid);
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {

				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				/*_cache_unlock(kid);*/
				/*_cache_drop(kid);*/
				cache_zap(kid);
			} else {
				cache_zap(kid);
			}

			/*
			 * Relock parent to continue scan
			 */
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
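 *
 * Illustrative note (an addition, not from the original source): unlike
 * cache_inval_vp(), a contended ncp lock here aborts the scan via
 * _cache_lock_nonblock() instead of blocking, so a non-zero return can
 * simply mean a lock was busy rather than that the vnode is in active use.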
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * Clears the universal directory search 'ok' flag.  This flag allows
 * nlookup() to bypass normal vnode checks.  This flag is a cached flag
 * so clearing it simply forces revalidation.
 */
void
cache_inval_wxok(struct vnode *vp)
{
	struct namecache *ncp;

	spin_lock(&vp->v_spin);
	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
			atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
	}
	spin_unlock(&vp->v_spin);
}

/*
 * The source ncp has been renamed to the target ncp.  All elements have been
 * locked, including the parent ncp's.
 *
 * The target ncp is destroyed (as a normal rename-over would destroy the
 * target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
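 *
 * Illustrative summary (an addition, not from the original source): after
 * this call fnch->ncp carries the target's name and is linked under the
 * target's parent, while tnch->ncp has been _cache_unlink()ed and is
 * flagged NCF_DESTROYED.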
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
	struct namecache *fncp = fnch->ncp;
	struct namecache *tncp = tnch->ncp;
	struct namecache *tncp_par;
	struct nchash_head *nchpp;
	u_int32_t hash;
	char *oname;
	char *nname;

	++fncp->nc_generation;
	++tncp->nc_generation;
	if (tncp->nc_nlen) {
		nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
		bcopy(tncp->nc_name, nname, tncp->nc_nlen);
		nname[tncp->nc_nlen] = 0;
	} else {
		nname = NULL;
	}

	/*
	 * Rename fncp (unlink)
	 */
	_cache_unlink_parent(fncp);
	oname = fncp->nc_name;
	fncp->nc_name = nname;
	fncp->nc_nlen = tncp->nc_nlen;
	if (oname)
		kfree(oname, M_VFSCACHE);

	tncp_par = tncp->nc_parent;
	KKASSERT(tncp_par->nc_lock.lk_lockholder == curthread);

	/*
	 * Rename fncp (relink)
	 */
	hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
	nchpp = NCHHASH(hash);

	spin_lock(&nchpp->spin);
	_cache_link_parent(fncp, tncp_par, nchpp);
	spin_unlock(&nchpp->spin);

	/*
	 * Get rid of the overwritten tncp (unlink)
	 */
	_cache_unlink(tncp);
}

/*
 * Perform actions consistent with unlinking a file.  The passed-in ncp
 * must be locked.
 *
 * The ncp is marked DESTROYED so it no longer shows up in searches,
 * and will be physically deleted when the vnode goes away.
 *
 * If the related vnode has no refs then we cycle it through vget()/vput()
 * to (possibly if we don't have a ref race) trigger a deactivation,
 * allowing the VFS to trivially detect and recycle the deleted vnode
 * via VOP_INACTIVE().
 *
 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
 *	 target ncp.
 */
void
cache_unlink(struct nchandle *nch)
{
	_cache_unlink(nch->ncp);
}

static void
_cache_unlink(struct namecache *ncp)
{
	struct vnode *vp;

	/*
	 * Causes lookups to fail and allows another ncp with the same
	 * name to be created under ncp->nc_parent.
	 */
	ncp->nc_flag |= NCF_DESTROYED;
	++ncp->nc_generation;

	/*
	 * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
	 * force action on the 1->0 transition.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL) {
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		if (VREFCNT(vp) <= 0) {
			if (vget(vp, LK_SHARED) == 0)
				vput(vp);
		}
	}
}

/*
 * Return non-zero if the nch might be associated with an open and/or mmap()'d
 * file.  The easy solution is to just return non-zero if the vnode has refs.
 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
 * force the reclaim).
 */
int
cache_isopen(struct nchandle *nch)
{
	struct vnode *vp;
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL &&
	    VREFCNT(vp)) {
		return 1;
	}
	return 0;
}


/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.  If
 * the ncp is resolved it might be locked shared.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.
A ref'd, possibly locked 1915 * (depending on the passed lk_type) will be returned in *vpp with an error 1916 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 1917 * most typical error is ENOENT, meaning that the ncp represents a negative 1918 * cache hit and there is no vnode to retrieve, but other errors can occur 1919 * too. 1920 * 1921 * The vget() can race a reclaim. If this occurs we re-resolve the 1922 * namecache entry. 1923 * 1924 * There are numerous places in the kernel where vget() is called on a 1925 * vnode while one or more of its namecache entries is locked. Releasing 1926 * a vnode never deadlocks against locked namecache entries (the vnode 1927 * will not get recycled while referenced ncp's exist). This means we 1928 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 1929 * lock when acquiring the vp lock or we might cause a deadlock. 1930 * 1931 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1932 * unresolved. If a reclaim race occurs the passed-in ncp will be 1933 * relocked exclusively before being re-resolved. 1934 */ 1935 int 1936 cache_vget(struct nchandle *nch, struct ucred *cred, 1937 int lk_type, struct vnode **vpp) 1938 { 1939 struct namecache *ncp; 1940 struct vnode *vp; 1941 int error; 1942 1943 ncp = nch->ncp; 1944 again: 1945 vp = NULL; 1946 if (ncp->nc_flag & NCF_UNRESOLVED) 1947 error = cache_resolve(nch, cred); 1948 else 1949 error = 0; 1950 1951 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1952 error = vget(vp, lk_type); 1953 if (error) { 1954 /* 1955 * VRECLAIM race 1956 * 1957 * The ncp may have been locked shared, we must relock 1958 * it exclusively before we can set it to unresolved. 1959 */ 1960 if (error == ENOENT) { 1961 kprintf("Warning: vnode reclaim race detected " 1962 "in cache_vget on %p (%s)\n", 1963 vp, ncp->nc_name); 1964 _cache_unlock(ncp); 1965 _cache_lock(ncp); 1966 _cache_setunresolved(ncp); 1967 goto again; 1968 } 1969 1970 /* 1971 * Not a reclaim race, some other error. 1972 */ 1973 KKASSERT(ncp->nc_vp == vp); 1974 vp = NULL; 1975 } else { 1976 KKASSERT(ncp->nc_vp == vp); 1977 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1978 } 1979 } 1980 if (error == 0 && vp == NULL) 1981 error = ENOENT; 1982 *vpp = vp; 1983 return(error); 1984 } 1985 1986 /* 1987 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode 1988 * is already held by virtuue of the ncp being locked, but it might not be 1989 * referenced and while it is not referenced it can transition into the 1990 * VRECLAIMED state. 1991 * 1992 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1993 * unresolved. If a reclaim race occurs the passed-in ncp will be 1994 * relocked exclusively before being re-resolved. 1995 * 1996 * NOTE: At the moment we have to issue a vget() on the vnode, even though 1997 * we are going to immediately release the lock, in order to resolve 1998 * potential reclamation races. Once we have a solid vnode ref that 1999 * was (at some point) interlocked via a vget(), the vnode will not 2000 * be reclaimed. 2001 * 2002 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation. 
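 *
 * Illustrative sketch (hypothetical caller): obtaining a referenced,
 * unlocked vnode from a referenced and locked nchandle might look like
 *
 *	struct vnode *vp;
 *
 *	if (cache_vref(nch, cred, &vp) == 0) {
 *		... use vp, which is referenced but not locked ...
 *		vrele(vp);
 *	}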
2003 */ 2004 int 2005 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2006 { 2007 struct namecache *ncp; 2008 struct vnode *vp; 2009 int error; 2010 int v; 2011 2012 ncp = nch->ncp; 2013 again: 2014 vp = NULL; 2015 if (ncp->nc_flag & NCF_UNRESOLVED) 2016 error = cache_resolve(nch, cred); 2017 else 2018 error = 0; 2019 2020 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2021 /* 2022 * Try a lockless ref of the vnode. VRECLAIMED transitions 2023 * use the vx_lock state and update-counter mechanism so we 2024 * can detect if one is in-progress or occurred. 2025 * 2026 * If we can successfully ref the vnode and interlock against 2027 * the update-counter mechanism, and VRECLAIMED is found to 2028 * not be set after that, we should be good. 2029 */ 2030 v = spin_access_start_only(&vp->v_spin); 2031 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2032 vref_special(vp); 2033 if (__predict_false( 2034 spin_access_end_only(&vp->v_spin, v))) { 2035 vrele(vp); 2036 continue; 2037 } 2038 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2039 break; 2040 } 2041 vrele(vp); 2042 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2043 } 2044 2045 /* 2046 * Do it the slow way 2047 */ 2048 error = vget(vp, LK_SHARED); 2049 if (error) { 2050 /* 2051 * VRECLAIM race 2052 */ 2053 if (error == ENOENT) { 2054 kprintf("Warning: vnode reclaim race detected " 2055 "in cache_vget on %p (%s)\n", 2056 vp, ncp->nc_name); 2057 _cache_unlock(ncp); 2058 _cache_lock(ncp); 2059 _cache_setunresolved(ncp); 2060 goto again; 2061 } 2062 2063 /* 2064 * Not a reclaim race, some other error. 2065 */ 2066 KKASSERT(ncp->nc_vp == vp); 2067 vp = NULL; 2068 } else { 2069 KKASSERT(ncp->nc_vp == vp); 2070 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2071 /* caller does not want a lock */ 2072 vn_unlock(vp); 2073 } 2074 break; 2075 } 2076 if (error == 0 && vp == NULL) 2077 error = ENOENT; 2078 *vpp = vp; 2079 2080 return(error); 2081 } 2082 2083 /* 2084 * Return a referenced vnode representing the parent directory of 2085 * ncp. 2086 * 2087 * Because the caller has locked the ncp it should not be possible for 2088 * the parent ncp to go away. However, the parent can unresolve its 2089 * dvp at any time so we must be able to acquire a lock on the parent 2090 * to safely access nc_vp. 2091 * 2092 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2093 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2094 * getting destroyed. 2095 * 2096 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2097 * lock on the ncp in question.. 2098 */ 2099 struct vnode * 2100 cache_dvpref(struct namecache *ncp) 2101 { 2102 struct namecache *par; 2103 struct vnode *dvp; 2104 2105 dvp = NULL; 2106 if ((par = ncp->nc_parent) != NULL) { 2107 _cache_hold(par); 2108 _cache_lock(par); 2109 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2110 if ((dvp = par->nc_vp) != NULL) 2111 vhold(dvp); 2112 } 2113 _cache_unlock(par); 2114 if (dvp) { 2115 if (vget(dvp, LK_SHARED) == 0) { 2116 vn_unlock(dvp); 2117 vdrop(dvp); 2118 /* return refd, unlocked dvp */ 2119 } else { 2120 vdrop(dvp); 2121 dvp = NULL; 2122 } 2123 } 2124 _cache_drop(par); 2125 } 2126 return(dvp); 2127 } 2128 2129 /* 2130 * Convert a directory vnode to a namecache record without any other 2131 * knowledge of the topology. This ONLY works with directory vnodes and 2132 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2133 * returned ncp (if not NULL) will be held and unlocked. 
2134 * 2135 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2136 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2137 * for dvp. This will fail only if the directory has been deleted out from 2138 * under the caller. 2139 * 2140 * Callers must always check for a NULL return no matter the value of 'makeit'. 2141 * 2142 * To avoid underflowing the kernel stack each recursive call increments 2143 * the makeit variable. 2144 */ 2145 2146 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2147 struct vnode *dvp, char *fakename); 2148 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2149 struct vnode **saved_dvp); 2150 2151 int 2152 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2153 struct nchandle *nch) 2154 { 2155 struct vnode *saved_dvp; 2156 struct vnode *pvp; 2157 char *fakename; 2158 int error; 2159 2160 nch->ncp = NULL; 2161 nch->mount = dvp->v_mount; 2162 saved_dvp = NULL; 2163 fakename = NULL; 2164 2165 /* 2166 * Handle the makeit == 0 degenerate case 2167 */ 2168 if (makeit == 0) { 2169 spin_lock_shared(&dvp->v_spin); 2170 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2171 if (nch->ncp) 2172 cache_hold(nch); 2173 spin_unlock_shared(&dvp->v_spin); 2174 } 2175 2176 /* 2177 * Loop until resolution, inside code will break out on error. 2178 */ 2179 while (makeit) { 2180 /* 2181 * Break out if we successfully acquire a working ncp. 2182 */ 2183 spin_lock_shared(&dvp->v_spin); 2184 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2185 if (nch->ncp) { 2186 cache_hold(nch); 2187 spin_unlock_shared(&dvp->v_spin); 2188 break; 2189 } 2190 spin_unlock_shared(&dvp->v_spin); 2191 2192 /* 2193 * If dvp is the root of its filesystem it should already 2194 * have a namecache pointer associated with it as a side 2195 * effect of the mount, but it may have been disassociated. 2196 */ 2197 if (dvp->v_flag & VROOT) { 2198 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2199 error = cache_resolve_mp(nch->mount); 2200 _cache_put(nch->ncp); 2201 if (ncvp_debug) { 2202 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2203 dvp->v_mount, error); 2204 } 2205 if (error) { 2206 if (ncvp_debug) 2207 kprintf(" failed\n"); 2208 nch->ncp = NULL; 2209 break; 2210 } 2211 if (ncvp_debug) 2212 kprintf(" succeeded\n"); 2213 continue; 2214 } 2215 2216 /* 2217 * If we are recursed too deeply resort to an O(n^2) 2218 * algorithm to resolve the namecache topology. The 2219 * resolved pvp is left referenced in saved_dvp to 2220 * prevent the tree from being destroyed while we loop. 2221 */ 2222 if (makeit > 20) { 2223 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2224 if (error) { 2225 kprintf("lookupdotdot(longpath) failed %d " 2226 "dvp %p\n", error, dvp); 2227 nch->ncp = NULL; 2228 break; 2229 } 2230 continue; 2231 } 2232 2233 /* 2234 * Get the parent directory and resolve its ncp. 2235 */ 2236 if (fakename) { 2237 kfree(fakename, M_TEMP); 2238 fakename = NULL; 2239 } 2240 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2241 &fakename); 2242 if (error) { 2243 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2244 break; 2245 } 2246 vn_unlock(pvp); 2247 2248 /* 2249 * Reuse makeit as a recursion depth counter. On success 2250 * nch will be fully referenced. 2251 */ 2252 cache_fromdvp(pvp, cred, makeit + 1, nch); 2253 vrele(pvp); 2254 if (nch->ncp == NULL) 2255 break; 2256 2257 /* 2258 * Do an inefficient scan of pvp (embodied by ncp) to look 2259 * for dvp. 
This will create a namecache record for dvp on 2260 * success. We loop up to recheck on success. 2261 * 2262 * ncp and dvp are both held but not locked. 2263 */ 2264 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2265 if (error) { 2266 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2267 pvp, nch->ncp->nc_name, dvp); 2268 cache_drop(nch); 2269 /* nch was NULLed out, reload mount */ 2270 nch->mount = dvp->v_mount; 2271 break; 2272 } 2273 if (ncvp_debug) { 2274 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2275 pvp, nch->ncp->nc_name); 2276 } 2277 cache_drop(nch); 2278 /* nch was NULLed out, reload mount */ 2279 nch->mount = dvp->v_mount; 2280 } 2281 2282 /* 2283 * If nch->ncp is non-NULL it will have been held already. 2284 */ 2285 if (fakename) 2286 kfree(fakename, M_TEMP); 2287 if (saved_dvp) 2288 vrele(saved_dvp); 2289 if (nch->ncp) 2290 return (0); 2291 return (EINVAL); 2292 } 2293 2294 /* 2295 * Go up the chain of parent directories until we find something 2296 * we can resolve into the namecache. This is very inefficient. 2297 */ 2298 static 2299 int 2300 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2301 struct vnode **saved_dvp) 2302 { 2303 struct nchandle nch; 2304 struct vnode *pvp; 2305 int error; 2306 static time_t last_fromdvp_report; 2307 char *fakename; 2308 2309 /* 2310 * Loop getting the parent directory vnode until we get something we 2311 * can resolve in the namecache. 2312 */ 2313 vref(dvp); 2314 nch.mount = dvp->v_mount; 2315 nch.ncp = NULL; 2316 fakename = NULL; 2317 2318 for (;;) { 2319 if (fakename) { 2320 kfree(fakename, M_TEMP); 2321 fakename = NULL; 2322 } 2323 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2324 &fakename); 2325 if (error) { 2326 vrele(dvp); 2327 break; 2328 } 2329 vn_unlock(pvp); 2330 spin_lock_shared(&pvp->v_spin); 2331 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2332 _cache_hold(nch.ncp); 2333 spin_unlock_shared(&pvp->v_spin); 2334 vrele(pvp); 2335 break; 2336 } 2337 spin_unlock_shared(&pvp->v_spin); 2338 if (pvp->v_flag & VROOT) { 2339 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2340 error = cache_resolve_mp(nch.mount); 2341 _cache_unlock(nch.ncp); 2342 vrele(pvp); 2343 if (error) { 2344 _cache_drop(nch.ncp); 2345 nch.ncp = NULL; 2346 vrele(dvp); 2347 } 2348 break; 2349 } 2350 vrele(dvp); 2351 dvp = pvp; 2352 } 2353 if (error == 0) { 2354 if (last_fromdvp_report != time_uptime) { 2355 last_fromdvp_report = time_uptime; 2356 kprintf("Warning: extremely inefficient path " 2357 "resolution on %s\n", 2358 nch.ncp->nc_name); 2359 } 2360 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2361 2362 /* 2363 * Hopefully dvp now has a namecache record associated with 2364 * it. Leave it referenced to prevent the kernel from 2365 * recycling the vnode. Otherwise extremely long directory 2366 * paths could result in endless recycling. 2367 */ 2368 if (*saved_dvp) 2369 vrele(*saved_dvp); 2370 *saved_dvp = dvp; 2371 _cache_drop(nch.ncp); 2372 } 2373 if (fakename) 2374 kfree(fakename, M_TEMP); 2375 return (error); 2376 } 2377 2378 /* 2379 * Do an inefficient scan of the directory represented by ncp looking for 2380 * the directory vnode dvp. ncp must be held but not locked on entry and 2381 * will be held on return. dvp must be refd but not locked on entry and 2382 * will remain refd on return. 2383 * 2384 * Why do this at all? 
Well, due to its stateless nature the NFS server 2385 * converts file handles directly to vnodes without necessarily going through 2386 * the namecache ops that would otherwise create the namecache topology 2387 * leading to the vnode. We could either (1) Change the namecache algorithms 2388 * to allow disconnected namecache records that are re-merged opportunistically, 2389 * or (2) Make the NFS server backtrack and scan to recover a connected 2390 * namecache topology in order to then be able to issue new API lookups. 2391 * 2392 * It turns out that (1) is a huge mess. It takes a nice clean set of 2393 * namecache algorithms and introduces a lot of complication in every subsystem 2394 * that calls into the namecache to deal with the re-merge case, especially 2395 * since we are using the namecache to placehold negative lookups and the 2396 * vnode might not be immediately assigned. (2) is certainly far less 2397 * efficient than (1), but since we are only talking about directories here 2398 * (which are likely to remain cached), the case does not actually run all 2399 * that often and has the supreme advantage of not polluting the namecache 2400 * algorithms. 2401 * 2402 * If a fakename is supplied, just construct a namecache entry using the 2403 * fake name. 2404 */ 2405 static int 2406 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2407 struct vnode *dvp, char *fakename) 2408 { 2409 struct nlcomponent nlc; 2410 struct nchandle rncp; 2411 struct dirent *den; 2412 struct vnode *pvp; 2413 struct vattr vat; 2414 struct iovec iov; 2415 struct uio uio; 2416 int blksize; 2417 int eofflag; 2418 int bytes; 2419 char *rbuf; 2420 int error; 2421 2422 vat.va_blocksize = 0; 2423 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2424 return (error); 2425 cache_lock(nch); 2426 error = cache_vref(nch, cred, &pvp); 2427 cache_unlock(nch); 2428 if (error) 2429 return (error); 2430 if (ncvp_debug) { 2431 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2432 "vattr fileid = %lld\n", 2433 nch->ncp, nch->ncp->nc_name, 2434 vat.va_blocksize, 2435 (long long)vat.va_fileid); 2436 } 2437 2438 /* 2439 * Use the supplied fakename if not NULL. Fake names are typically 2440 * not in the actual filesystem hierarchy. This is used by HAMMER 2441 * to glue @@timestamp recursions together.
2442 */ 2443 if (fakename) { 2444 nlc.nlc_nameptr = fakename; 2445 nlc.nlc_namelen = strlen(fakename); 2446 rncp = cache_nlookup(nch, &nlc); 2447 goto done; 2448 } 2449 2450 if ((blksize = vat.va_blocksize) == 0) 2451 blksize = DEV_BSIZE; 2452 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2453 rncp.ncp = NULL; 2454 2455 eofflag = 0; 2456 uio.uio_offset = 0; 2457 again: 2458 iov.iov_base = rbuf; 2459 iov.iov_len = blksize; 2460 uio.uio_iov = &iov; 2461 uio.uio_iovcnt = 1; 2462 uio.uio_resid = blksize; 2463 uio.uio_segflg = UIO_SYSSPACE; 2464 uio.uio_rw = UIO_READ; 2465 uio.uio_td = curthread; 2466 2467 if (ncvp_debug >= 2) 2468 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2469 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2470 if (error == 0) { 2471 den = (struct dirent *)rbuf; 2472 bytes = blksize - uio.uio_resid; 2473 2474 while (bytes > 0) { 2475 if (ncvp_debug >= 2) { 2476 kprintf("cache_inefficient_scan: %*.*s\n", 2477 den->d_namlen, den->d_namlen, 2478 den->d_name); 2479 } 2480 if (den->d_type != DT_WHT && 2481 den->d_ino == vat.va_fileid) { 2482 if (ncvp_debug) { 2483 kprintf("cache_inefficient_scan: " 2484 "MATCHED inode %lld path %s/%*.*s\n", 2485 (long long)vat.va_fileid, 2486 nch->ncp->nc_name, 2487 den->d_namlen, den->d_namlen, 2488 den->d_name); 2489 } 2490 nlc.nlc_nameptr = den->d_name; 2491 nlc.nlc_namelen = den->d_namlen; 2492 rncp = cache_nlookup(nch, &nlc); 2493 KKASSERT(rncp.ncp != NULL); 2494 break; 2495 } 2496 bytes -= _DIRENT_DIRSIZ(den); 2497 den = _DIRENT_NEXT(den); 2498 } 2499 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2500 goto again; 2501 } 2502 kfree(rbuf, M_TEMP); 2503 done: 2504 vrele(pvp); 2505 if (rncp.ncp) { 2506 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2507 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2508 if (ncvp_debug >= 2) { 2509 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2510 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2511 } 2512 } else { 2513 if (ncvp_debug >= 2) { 2514 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2515 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2516 rncp.ncp->nc_vp); 2517 } 2518 } 2519 if (rncp.ncp->nc_vp == NULL) 2520 error = rncp.ncp->nc_error; 2521 /* 2522 * Release rncp after a successful nlookup. rncp was fully 2523 * referenced. 2524 */ 2525 cache_put(&rncp); 2526 } else { 2527 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2528 dvp, nch->ncp->nc_name); 2529 error = ENOENT; 2530 } 2531 return (error); 2532 } 2533 2534 /* 2535 * This function must be called with the ncp held and locked and will unlock 2536 * and drop it during zapping. 2537 * 2538 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2539 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2540 * and removes the related reference. If the ncp can be removed, and the 2541 * parent can be zapped non-blocking, this function loops up. 2542 * 2543 * There will be one ref from the caller (which we now own). The only 2544 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2545 * so possibly 2 refs left. Taking this into account, if there are no 2546 * additional refs and no children, the ncp will be removed from the topology 2547 * and destroyed. 2548 * 2549 * References and/or children may exist if the ncp is in the middle of the 2550 * topology, preventing the ncp from being destroyed. 2551 * 2552 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
2553 * 2554 * This function may return a held (but NOT locked) parent node which the 2555 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2556 * due to deep namecache trees. 2557 * 2558 * WARNING! For MPSAFE operation this routine must acquire up to three 2559 * spin locks to be able to safely test nc_refs. Lock order is 2560 * very important. 2561 * 2562 * hash spinlock if on hash list 2563 * parent spinlock if child of parent 2564 * (the ncp is unresolved so there is no vnode association) 2565 */ 2566 static void 2567 cache_zap(struct namecache *ncp) 2568 { 2569 struct namecache *par; 2570 struct vnode *dropvp; 2571 struct nchash_head *nchpp; 2572 int refcmp; 2573 int nonblock = 1; /* XXX cleanup */ 2574 2575 again: 2576 /* 2577 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2578 * This gets rid of any vp->v_namecache list or negative list and 2579 * the related ref. 2580 */ 2581 _cache_setunresolved(ncp); 2582 2583 /* 2584 * Try to scrap the entry and possibly tail-recurse on its parent. 2585 * We only scrap unref'd (other then our ref) unresolved entries, 2586 * we do not scrap 'live' entries. 2587 * 2588 * If nc_parent is non NULL we expect 2 references, else just 1. 2589 * If there are more, someone else also holds the ncp and we cannot 2590 * destroy it. 2591 */ 2592 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2593 KKASSERT(ncp->nc_refs > 0); 2594 2595 /* 2596 * If the ncp is linked to its parent it will also be in the hash 2597 * table. We have to be able to lock the parent and the hash table. 2598 * 2599 * Acquire locks. Note that the parent can't go away while we hold 2600 * a child locked. If nc_parent is present, expect 2 refs instead 2601 * of 1. 2602 */ 2603 nchpp = NULL; 2604 if ((par = ncp->nc_parent) != NULL) { 2605 if (nonblock) { 2606 if (_cache_lock_nonblock(par)) { 2607 /* lock failed */ 2608 ncp->nc_flag |= NCF_DEFEREDZAP; 2609 atomic_add_long( 2610 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2611 1); 2612 _cache_unlock(ncp); 2613 _cache_drop(ncp); /* caller's ref */ 2614 return; 2615 } 2616 _cache_hold(par); 2617 } else { 2618 _cache_hold(par); 2619 _cache_lock(par); 2620 } 2621 nchpp = ncp->nc_head; 2622 spin_lock(&nchpp->spin); 2623 } 2624 2625 /* 2626 * With the parent and nchpp locked, and the vnode removed 2627 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2628 * more someone else has a ref and we cannot zap the entry. 2629 * 2630 * one for our hold 2631 * one for our parent link (parent also has one from the linkage) 2632 */ 2633 if (par) 2634 refcmp = 2; 2635 else 2636 refcmp = 1; 2637 2638 /* 2639 * On failure undo the work we've done so far and drop the 2640 * caller's ref and ncp. 2641 */ 2642 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2643 if (par) { 2644 spin_unlock(&nchpp->spin); 2645 _cache_put(par); 2646 } 2647 _cache_unlock(ncp); 2648 _cache_drop(ncp); 2649 return; 2650 } 2651 2652 /* 2653 * We own all the refs and with the spinlocks held no further 2654 * refs can be acquired by others. 2655 * 2656 * Remove us from the hash list and parent list. We have to 2657 * drop a ref on the parent's vp if the parent's list becomes 2658 * empty. 
2659 */ 2660 dropvp = NULL; 2661 if (par) { 2662 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2663 2664 KKASSERT(nchpp == ncp->nc_head); 2665 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2666 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2667 atomic_add_long(&pn->vfscache_count, -1); 2668 if (TAILQ_EMPTY(&ncp->nc_list)) 2669 atomic_add_long(&pn->vfscache_leafs, -1); 2670 2671 if (TAILQ_EMPTY(&par->nc_list)) { 2672 atomic_add_long(&pn->vfscache_leafs, 1); 2673 if (par->nc_vp) 2674 dropvp = par->nc_vp; 2675 } 2676 ncp->nc_parent = NULL; 2677 ncp->nc_head = NULL; 2678 spin_unlock(&nchpp->spin); 2679 _cache_drop(par); /* removal of ncp from par->nc_list */ 2680 /*_cache_unlock(par);*/ 2681 } else { 2682 KKASSERT(ncp->nc_head == NULL); 2683 } 2684 2685 /* 2686 * ncp should not have picked up any refs. Physically 2687 * destroy the ncp. 2688 */ 2689 if (ncp->nc_refs != refcmp) { 2690 panic("cache_zap: %p bad refs %d (expected %d)\n", 2691 ncp, ncp->nc_refs, refcmp); 2692 } 2693 /* _cache_unlock(ncp) not required */ 2694 ncp->nc_refs = -1; /* safety */ 2695 if (ncp->nc_name) 2696 kfree(ncp->nc_name, M_VFSCACHE); 2697 kfree(ncp, M_VFSCACHE); 2698 2699 /* 2700 * Delayed drop (we had to release our spinlocks) 2701 */ 2702 if (dropvp) 2703 vdrop(dropvp); 2704 2705 /* 2706 * Loop up if we can recursively clean out the parent. 2707 */ 2708 if (par) { 2709 refcmp = 1; /* ref on parent */ 2710 if (par->nc_parent) /* par->par */ 2711 ++refcmp; 2712 par->nc_flag &= ~NCF_DEFEREDZAP; 2713 if ((par->nc_flag & NCF_UNRESOLVED) && 2714 par->nc_refs == refcmp && 2715 TAILQ_EMPTY(&par->nc_list)) { 2716 ncp = par; 2717 goto again; 2718 } 2719 _cache_unlock(par); 2720 _cache_drop(par); 2721 } 2722 } 2723 2724 /* 2725 * Clean up dangling negative cache and defered-drop entries in the 2726 * namecache. 2727 * 2728 * This routine is called in the critical path and also called from 2729 * vnlru(). When called from vnlru we use a lower limit to try to 2730 * deal with the negative cache before the critical path has to start 2731 * dealing with it. 2732 */ 2733 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2734 2735 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2736 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2737 2738 void 2739 cache_hysteresis(int critpath) 2740 { 2741 long poslimit; 2742 long neglimit = maxvnodes / ncnegfactor; 2743 long xnumcache = vfscache_leafs; 2744 2745 if (critpath == 0) 2746 neglimit = neglimit * 8 / 10; 2747 2748 /* 2749 * Don't cache too many negative hits. We use hysteresis to reduce 2750 * the impact on the critical path. 2751 */ 2752 switch(neg_cache_hysteresis_state[critpath]) { 2753 case CHI_LOW: 2754 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2755 if (critpath) 2756 _cache_cleanneg(ncnegflush); 2757 else 2758 _cache_cleanneg(ncnegflush + 2759 vfscache_negs - neglimit); 2760 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2761 } 2762 break; 2763 case CHI_HIGH: 2764 if (vfscache_negs > MINNEG * 9 / 10 && 2765 vfscache_negs * 9 / 10 > neglimit 2766 ) { 2767 if (critpath) 2768 _cache_cleanneg(ncnegflush); 2769 else 2770 _cache_cleanneg(ncnegflush + 2771 vfscache_negs * 9 / 10 - 2772 neglimit); 2773 } else { 2774 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2775 } 2776 break; 2777 } 2778 2779 /* 2780 * Don't cache too many positive hits. We use hysteresis to reduce 2781 * the impact on the critical path. 
2782 * 2783 * Excessive positive hits can accumulate due to large numbers of 2784 * hardlinks (the vnode cache will not prevent hl ncps from growing 2785 * into infinity). 2786 */ 2787 if ((poslimit = ncposlimit) == 0) 2788 poslimit = maxvnodes * 2; 2789 if (critpath == 0) 2790 poslimit = poslimit * 8 / 10; 2791 2792 switch(pos_cache_hysteresis_state[critpath]) { 2793 case CHI_LOW: 2794 if (xnumcache > poslimit && xnumcache > MINPOS) { 2795 if (critpath) 2796 _cache_cleanpos(ncposflush); 2797 else 2798 _cache_cleanpos(ncposflush + 2799 xnumcache - poslimit); 2800 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2801 } 2802 break; 2803 case CHI_HIGH: 2804 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2805 if (critpath) 2806 _cache_cleanpos(ncposflush); 2807 else 2808 _cache_cleanpos(ncposflush + 2809 xnumcache - poslimit * 5 / 6); 2810 } else { 2811 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2812 } 2813 break; 2814 } 2815 2816 /* 2817 * Clean out dangling deferred-zap ncps which could not be cleanly 2818 * dropped if too many build up. Note that numdefered is 2819 * heuristic. Make sure we are real-time for the current cpu, 2820 * plus the global rollup. 2821 */ 2822 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 2823 _cache_cleandefered(); 2824 } 2825 } 2826 2827 /* 2828 * NEW NAMECACHE LOOKUP API 2829 * 2830 * Lookup an entry in the namecache. The passed par_nch must be referenced 2831 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2832 * is ALWAYS returned, even if the supplied component is illegal. 2833 * 2834 * The resulting namecache entry should be returned to the system with 2835 * cache_put() or cache_unlock() + cache_drop(). 2836 * 2837 * namecache locks are recursive but care must be taken to avoid lock order 2838 * reversals (hence why the passed par_nch must be unlocked). Locking 2839 * rules are ordered for parent traversals, not for child traversals. 2840 * 2841 * Nobody else will be able to manipulate the associated namespace (e.g. 2842 * create, delete, rename, rename-target) until the caller unlocks the 2843 * entry. 2844 * 2845 * The returned entry will be in one of three states: positive hit (non-null 2846 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2847 * Unresolved entries must be resolved through the filesystem to associate the 2848 * vnode and/or determine whether a positive or negative hit has occurred. 2849 * 2850 * It is not necessary to lock a directory in order to lock namespace under 2851 * that directory. In fact, it is explicitly not allowed to do that. A 2852 * directory is typically only locked when being created, renamed, or 2853 * destroyed. 2854 * 2855 * The directory (par) may be unresolved, in which case any returned child 2856 * will likely also be marked unresolved. Likely but not guaranteed. Since 2857 * the filesystem lookup requires a resolved directory vnode the caller is 2858 * responsible for resolving the namecache chain top-down. This API 2859 * specifically allows whole chains to be created in an unresolved state.
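 *
 * Illustrative sketch (hypothetical caller; error and lock-mode handling
 * omitted): resolving a single component under a referenced, unlocked
 * parent might look like
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		cache_resolve(&nch, cred);
 *	... nch.ncp->nc_vp is the vnode on a positive hit, NULL on a
 *	    negative hit ...
 *	cache_put(&nch);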
2860 */ 2861 struct nchandle 2862 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2863 { 2864 struct nchandle nch; 2865 struct namecache *ncp; 2866 struct namecache *new_ncp; 2867 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 2868 struct nchash_head *nchpp; 2869 struct mount *mp; 2870 u_int32_t hash; 2871 globaldata_t gd; 2872 int par_locked; 2873 int use_excl; 2874 2875 gd = mycpu; 2876 mp = par_nch->mount; 2877 par_locked = 0; 2878 2879 /* 2880 * This is a good time to call it, no ncp's are locked by 2881 * the caller or us. 2882 */ 2883 cache_hysteresis(1); 2884 2885 /* 2886 * Try to locate an existing entry 2887 */ 2888 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2889 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2890 new_ncp = NULL; 2891 use_excl = 0; 2892 nchpp = NCHHASH(hash); 2893 restart: 2894 rep_ncp = NULL; 2895 if (use_excl) 2896 spin_lock(&nchpp->spin); 2897 else 2898 spin_lock_shared(&nchpp->spin); 2899 2900 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 2901 /* 2902 * Break out if we find a matching entry. Note that 2903 * UNRESOLVED entries may match, but DESTROYED entries 2904 * do not. 2905 * 2906 * We may be able to reuse DESTROYED entries that we come 2907 * across, even if the name does not match, as long as 2908 * nc_nlen is correct and the only hold ref is from the nchpp 2909 * list itself. 2910 */ 2911 if (ncp->nc_parent == par_nch->ncp && 2912 ncp->nc_nlen == nlc->nlc_namelen) { 2913 if (ncp->nc_flag & NCF_DESTROYED) { 2914 if (ncp->nc_refs == 1 && rep_ncp == NULL) 2915 rep_ncp = ncp; 2916 continue; 2917 } 2918 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 2919 continue; 2920 _cache_hold(ncp); 2921 if (use_excl) 2922 spin_unlock(&nchpp->spin); 2923 else 2924 spin_unlock_shared(&nchpp->spin); 2925 if (par_locked) { 2926 _cache_unlock(par_nch->ncp); 2927 par_locked = 0; 2928 } 2929 if (_cache_lock_special(ncp) == 0) { 2930 /* 2931 * Successfully locked but we must re-test 2932 * conditions that might have changed since 2933 * we did not have the lock before. 2934 */ 2935 if (ncp->nc_parent != par_nch->ncp || 2936 ncp->nc_nlen != nlc->nlc_namelen || 2937 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2938 ncp->nc_nlen) || 2939 (ncp->nc_flag & NCF_DESTROYED)) { 2940 _cache_put(ncp); 2941 goto restart; 2942 } 2943 _cache_auto_unresolve(mp, ncp); 2944 if (new_ncp) { 2945 _cache_free(new_ncp); 2946 new_ncp = NULL; /* safety */ 2947 } 2948 goto found; 2949 } 2950 _cache_get(ncp); /* cycle the lock to block */ 2951 _cache_put(ncp); 2952 _cache_drop(ncp); 2953 goto restart; 2954 } 2955 } 2956 2957 /* 2958 * We failed to locate the entry, try to resurrect a destroyed 2959 * entry that we did find that is already correctly linked into 2960 * nchpp and the parent. We must re-test conditions after 2961 * successfully locking rep_ncp. 2962 * 2963 * This case can occur under heavy loads due to not being able 2964 * to safely lock the parent in cache_zap(). Nominally a repeated 2965 * create/unlink load, but only the namelen needs to match. 2966 * 2967 * An exclusive lock on the nchpp is required to process this case, 2968 * otherwise a race can cause duplicate entries to be created with 2969 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 
2970 */ 2971 if (rep_ncp && use_excl) { 2972 if (_cache_lock_nonblock(rep_ncp) == 0) { 2973 _cache_hold(rep_ncp); 2974 if (rep_ncp->nc_parent == par_nch->ncp && 2975 rep_ncp->nc_nlen == nlc->nlc_namelen && 2976 (rep_ncp->nc_flag & NCF_DESTROYED) && 2977 rep_ncp->nc_refs == 2) { 2978 /* 2979 * Update nc_name. 2980 */ 2981 ncp = rep_ncp; 2982 bcopy(nlc->nlc_nameptr, ncp->nc_name, 2983 nlc->nlc_namelen); 2984 2985 /* 2986 * This takes some care. We must clear the 2987 * NCF_DESTROYED flag before unlocking the 2988 * hash chain so other concurrent searches 2989 * do not skip this element. 2990 * 2991 * We must also unlock the hash chain before 2992 * unresolving the ncp to avoid deadlocks. 2993 * We hold the lock on the ncp so we can safely 2994 * reinitialize nc_flag after that. 2995 */ 2996 ncp->nc_flag &= ~NCF_DESTROYED; 2997 spin_unlock(&nchpp->spin); /* use_excl */ 2998 2999 _cache_setunresolved(ncp); 3000 ncp->nc_flag = NCF_UNRESOLVED; 3001 ncp->nc_error = ENOTCONN; 3002 if (par_locked) { 3003 _cache_unlock(par_nch->ncp); 3004 par_locked = 0; 3005 } 3006 if (new_ncp) { 3007 _cache_free(new_ncp); 3008 new_ncp = NULL; /* safety */ 3009 } 3010 goto found; 3011 } 3012 _cache_put(rep_ncp); 3013 } 3014 } 3015 3016 /* 3017 * Otherwise create a new entry and add it to the cache. The parent 3018 * ncp must also be locked so we can link into it. 3019 * 3020 * We have to relookup after possibly blocking in kmalloc or 3021 * when locking par_nch. 3022 * 3023 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3024 * mount case, in which case nc_name will be NULL. 3025 * 3026 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3027 * a DESTROYED entry, but didn't have an exclusive lock. 3028 * In this situation we do not create a new_ncp. 3029 */ 3030 if (new_ncp == NULL) { 3031 if (use_excl) 3032 spin_unlock(&nchpp->spin); 3033 else 3034 spin_unlock_shared(&nchpp->spin); 3035 if (rep_ncp == NULL) { 3036 new_ncp = cache_alloc(nlc->nlc_namelen); 3037 if (nlc->nlc_namelen) { 3038 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3039 nlc->nlc_namelen); 3040 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3041 } 3042 } 3043 use_excl = 1; 3044 goto restart; 3045 } 3046 3047 /* 3048 * NOTE! The spinlock is held exclusively here because new_ncp 3049 * is non-NULL. 3050 */ 3051 if (par_locked == 0) { 3052 spin_unlock(&nchpp->spin); 3053 _cache_lock(par_nch->ncp); 3054 par_locked = 1; 3055 goto restart; 3056 } 3057 3058 /* 3059 * Link to parent (requires another ref, the one already in new_ncp 3060 * is what we wil lreturn). 3061 * 3062 * WARNING! We still hold the spinlock. We have to set the hash 3063 * table entry atomically. 3064 */ 3065 ncp = new_ncp; 3066 ++ncp->nc_refs; 3067 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3068 spin_unlock(&nchpp->spin); 3069 _cache_unlock(par_nch->ncp); 3070 /* par_locked = 0 - not used */ 3071 found: 3072 /* 3073 * stats and namecache size management 3074 */ 3075 if (ncp->nc_flag & NCF_UNRESOLVED) 3076 ++gd->gd_nchstats->ncs_miss; 3077 else if (ncp->nc_vp) 3078 ++gd->gd_nchstats->ncs_goodhits; 3079 else 3080 ++gd->gd_nchstats->ncs_neghits; 3081 nch.mount = mp; 3082 nch.ncp = ncp; 3083 _cache_mntref(nch.mount); 3084 3085 return(nch); 3086 } 3087 3088 /* 3089 * Attempt to lookup a namecache entry and return with a shared namecache 3090 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3091 * set or we are unable to lock. 
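 *
 * Illustrative sketch (hypothetical caller): the intended pattern is to try
 * the shared-lock lookup first and fall back to the blocking API on
 * EWOULDBLOCK, e.g.
 *
 *	error = cache_nlookup_maybe_shared(par_nch, &nlc, 0, &nch);
 *	if (error == EWOULDBLOCK)
 *		nch = cache_nlookup(par_nch, &nlc);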
3092 */ 3093 int 3094 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3095 struct nlcomponent *nlc, 3096 int excl, struct nchandle *res_nch) 3097 { 3098 struct namecache *ncp; 3099 struct nchash_head *nchpp; 3100 struct mount *mp; 3101 u_int32_t hash; 3102 globaldata_t gd; 3103 3104 /* 3105 * If exclusive requested or shared namecache locks are disabled, 3106 * return failure. 3107 */ 3108 if (ncp_shared_lock_disable || excl) 3109 return(EWOULDBLOCK); 3110 3111 gd = mycpu; 3112 mp = par_nch->mount; 3113 3114 /* 3115 * This is a good time to call it, no ncp's are locked by 3116 * the caller or us. 3117 */ 3118 cache_hysteresis(1); 3119 3120 /* 3121 * Try to locate an existing entry 3122 */ 3123 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3124 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3125 nchpp = NCHHASH(hash); 3126 3127 spin_lock_shared(&nchpp->spin); 3128 3129 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3130 /* 3131 * Break out if we find a matching entry. Note that 3132 * UNRESOLVED entries may match, but DESTROYED entries 3133 * do not. 3134 */ 3135 if (ncp->nc_parent == par_nch->ncp && 3136 ncp->nc_nlen == nlc->nlc_namelen && 3137 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3138 (ncp->nc_flag & NCF_DESTROYED) == 0 3139 ) { 3140 _cache_hold(ncp); 3141 spin_unlock_shared(&nchpp->spin); 3142 3143 if (_cache_lock_shared_special(ncp) == 0) { 3144 if (ncp->nc_parent == par_nch->ncp && 3145 ncp->nc_nlen == nlc->nlc_namelen && 3146 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3147 ncp->nc_nlen) == 0 && 3148 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3149 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3150 _cache_auto_unresolve_test(mp, ncp) == 0) { 3151 goto found; 3152 } 3153 _cache_unlock(ncp); 3154 } 3155 _cache_drop(ncp); 3156 return(EWOULDBLOCK); 3157 } 3158 } 3159 3160 /* 3161 * Failure 3162 */ 3163 spin_unlock_shared(&nchpp->spin); 3164 return(EWOULDBLOCK); 3165 3166 /* 3167 * Success 3168 * 3169 * Note that nc_error might be non-zero (e.g ENOENT). 3170 */ 3171 found: 3172 res_nch->mount = mp; 3173 res_nch->ncp = ncp; 3174 ++gd->gd_nchstats->ncs_goodhits; 3175 _cache_mntref(res_nch->mount); 3176 3177 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3178 return(ncp->nc_error); 3179 } 3180 3181 /* 3182 * This is a non-blocking verison of cache_nlookup() used by 3183 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3184 * will return nch.ncp == NULL in that case. 3185 */ 3186 struct nchandle 3187 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3188 { 3189 struct nchandle nch; 3190 struct namecache *ncp; 3191 struct namecache *new_ncp; 3192 struct nchash_head *nchpp; 3193 struct mount *mp; 3194 u_int32_t hash; 3195 globaldata_t gd; 3196 int par_locked; 3197 3198 gd = mycpu; 3199 mp = par_nch->mount; 3200 par_locked = 0; 3201 3202 /* 3203 * Try to locate an existing entry 3204 */ 3205 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3206 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3207 new_ncp = NULL; 3208 nchpp = NCHHASH(hash); 3209 restart: 3210 spin_lock(&nchpp->spin); 3211 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3212 /* 3213 * Break out if we find a matching entry. Note that 3214 * UNRESOLVED entries may match, but DESTROYED entries 3215 * do not. 
3216 */ 3217 if (ncp->nc_parent == par_nch->ncp && 3218 ncp->nc_nlen == nlc->nlc_namelen && 3219 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3220 (ncp->nc_flag & NCF_DESTROYED) == 0 3221 ) { 3222 _cache_hold(ncp); 3223 spin_unlock(&nchpp->spin); 3224 if (par_locked) { 3225 _cache_unlock(par_nch->ncp); 3226 par_locked = 0; 3227 } 3228 if (_cache_lock_special(ncp) == 0) { 3229 if (ncp->nc_parent != par_nch->ncp || 3230 ncp->nc_nlen != nlc->nlc_namelen || 3231 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3232 (ncp->nc_flag & NCF_DESTROYED)) { 3233 kprintf("cache_lookup_nonblock: " 3234 "ncp-race %p %*.*s\n", 3235 ncp, 3236 nlc->nlc_namelen, 3237 nlc->nlc_namelen, 3238 nlc->nlc_nameptr); 3239 _cache_unlock(ncp); 3240 _cache_drop(ncp); 3241 goto failed; 3242 } 3243 _cache_auto_unresolve(mp, ncp); 3244 if (new_ncp) { 3245 _cache_free(new_ncp); 3246 new_ncp = NULL; 3247 } 3248 goto found; 3249 } 3250 _cache_drop(ncp); 3251 goto failed; 3252 } 3253 } 3254 3255 /* 3256 * We failed to locate an entry, create a new entry and add it to 3257 * the cache. The parent ncp must also be locked so we 3258 * can link into it. 3259 * 3260 * We have to relookup after possibly blocking in kmalloc or 3261 * when locking par_nch. 3262 * 3263 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3264 * mount case, in which case nc_name will be NULL. 3265 */ 3266 if (new_ncp == NULL) { 3267 spin_unlock(&nchpp->spin); 3268 new_ncp = cache_alloc(nlc->nlc_namelen); 3269 if (nlc->nlc_namelen) { 3270 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3271 nlc->nlc_namelen); 3272 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3273 } 3274 goto restart; 3275 } 3276 if (par_locked == 0) { 3277 spin_unlock(&nchpp->spin); 3278 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3279 par_locked = 1; 3280 goto restart; 3281 } 3282 goto failed; 3283 } 3284 3285 /* 3286 * Link to parent (requires another ref, the one already in new_ncp 3287 * is what we wil lreturn). 3288 * 3289 * WARNING! We still hold the spinlock. We have to set the hash 3290 * table entry atomically. 3291 */ 3292 ncp = new_ncp; 3293 ++ncp->nc_refs; 3294 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3295 spin_unlock(&nchpp->spin); 3296 _cache_unlock(par_nch->ncp); 3297 /* par_locked = 0 - not used */ 3298 found: 3299 /* 3300 * stats and namecache size management 3301 */ 3302 if (ncp->nc_flag & NCF_UNRESOLVED) 3303 ++gd->gd_nchstats->ncs_miss; 3304 else if (ncp->nc_vp) 3305 ++gd->gd_nchstats->ncs_goodhits; 3306 else 3307 ++gd->gd_nchstats->ncs_neghits; 3308 nch.mount = mp; 3309 nch.ncp = ncp; 3310 _cache_mntref(nch.mount); 3311 3312 return(nch); 3313 failed: 3314 if (new_ncp) { 3315 _cache_free(new_ncp); 3316 new_ncp = NULL; 3317 } 3318 nch.mount = NULL; 3319 nch.ncp = NULL; 3320 return(nch); 3321 } 3322 3323 /* 3324 * This version is non-locking. The caller must validate the result 3325 * for parent-to-child continuity. 3326 * 3327 * It can fail for any reason and will return nch.ncp == NULL in that case. 
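 *
 * Illustrative sketch (hypothetical caller): the result is only a hint,
 * e.g.
 *
 *	nch = cache_nlookup_nonlocked(par_nch, &nlc);
 *	if (nch.ncp == NULL) {
 *		(fall back to cache_nlookup() or
 *		 cache_nlookup_maybe_shared())
 *	} else {
 *		(validate parent-to-child continuity, use the entry, then
 *		 release it with cache_drop(&nch))
 *	}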
3328 */ 3329 struct nchandle 3330 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3331 { 3332 struct nchandle nch; 3333 struct namecache *ncp; 3334 struct nchash_head *nchpp; 3335 struct mount *mp; 3336 u_int32_t hash; 3337 globaldata_t gd; 3338 3339 gd = mycpu; 3340 mp = par_nch->mount; 3341 3342 /* 3343 * Try to locate an existing entry 3344 */ 3345 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3346 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3347 nchpp = NCHHASH(hash); 3348 3349 spin_lock_shared(&nchpp->spin); 3350 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3351 /* 3352 * Break out if we find a matching entry. Note that 3353 * UNRESOLVED entries may match, but DESTROYED entries 3354 * do not. 3355 * 3356 * Resolved NFS entries which have timed out fail so the 3357 * caller can rerun with normal locking. 3358 */ 3359 if (ncp->nc_parent == par_nch->ncp && 3360 ncp->nc_nlen == nlc->nlc_namelen && 3361 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3362 (ncp->nc_flag & NCF_DESTROYED) == 0 3363 ) { 3364 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3365 break; 3366 _cache_hold(ncp); 3367 spin_unlock_shared(&nchpp->spin); 3368 goto found; 3369 } 3370 } 3371 spin_unlock_shared(&nchpp->spin); 3372 nch.mount = NULL; 3373 nch.ncp = NULL; 3374 return nch; 3375 found: 3376 /* 3377 * stats and namecache size management 3378 */ 3379 if (ncp->nc_flag & NCF_UNRESOLVED) 3380 ++gd->gd_nchstats->ncs_miss; 3381 else if (ncp->nc_vp) 3382 ++gd->gd_nchstats->ncs_goodhits; 3383 else 3384 ++gd->gd_nchstats->ncs_neghits; 3385 nch.mount = mp; 3386 nch.ncp = ncp; 3387 _cache_mntref(nch.mount); 3388 3389 return(nch); 3390 } 3391 3392 /* 3393 * The namecache entry is marked as being used as a mount point. 3394 * Locate the mount if it is visible to the caller. The DragonFly 3395 * mount system allows arbitrary loops in the topology and disentangles 3396 * those loops by matching against (mp, ncp) rather than just (ncp). 3397 * This means any given ncp can dive any number of mounts, depending 3398 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3399 * 3400 * We use a very simple frontend cache to reduce SMP conflicts, 3401 * which we have to do because the mountlist scan needs an exclusive 3402 * lock around its ripout info list. Not to mention that there might 3403 * be a lot of mounts. 3404 * 3405 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3406 * down a bit to allow some contention rather than making the cache 3407 * excessively huge. 3408 * 3409 * The hash table is split into per-cpu areas, is 4-way set-associative. 
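 *
 * Illustrative note: the set selection in ncmount_cache_lookup4() below
 * masks the hash down to a set-aligned index,
 *
 *	hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1))
 *
 * so a lookup probes the NCMOUNT_SET consecutive ncmount_cache[] entries
 * starting at that index, returning a matching entry if one exists or the
 * entry with the oldest ticks value as the replacement victim.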
3410 */ 3411 struct findmount_info { 3412 struct mount *result; 3413 struct mount *nch_mount; 3414 struct namecache *nch_ncp; 3415 }; 3416 3417 static __inline 3418 struct ncmount_cache * 3419 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3420 { 3421 uint32_t hash; 3422 3423 hash = iscsi_crc32(&mp, sizeof(mp)); 3424 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3425 hash ^= hash >> 16; 3426 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3427 3428 return (&ncmount_cache[hash]); 3429 } 3430 3431 static 3432 struct ncmount_cache * 3433 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3434 { 3435 struct ncmount_cache *ncc; 3436 struct ncmount_cache *best; 3437 int delta; 3438 int best_delta; 3439 int i; 3440 3441 ncc = ncmount_cache_lookup4(mp, ncp); 3442 3443 /* 3444 * NOTE: When checking for a ticks overflow implement a slop of 3445 * 2 ticks just to be safe, because ticks is accessed 3446 * non-atomically one CPU can increment it while another 3447 * is still using the old value. 3448 */ 3449 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3450 return ncc; 3451 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3452 if (delta < -2) /* overflow reset */ 3453 ncc->ticks = ticks; 3454 best = ncc; 3455 best_delta = delta; 3456 3457 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3458 ++ncc; 3459 if (ncc->ncp == ncp && ncc->mp == mp) 3460 return ncc; 3461 delta = (int)(ticks - ncc->ticks); 3462 if (delta < -2) 3463 ncc->ticks = ticks; 3464 if (delta > best_delta) { 3465 best_delta = delta; 3466 best = ncc; 3467 } 3468 } 3469 return best; 3470 } 3471 3472 /* 3473 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3474 * doing an expensive mountlist_scan*() if possible. 3475 * 3476 * (mp, ncp) -> mountonpt.k 3477 * 3478 * Returns a referenced mount pointer or NULL 3479 * 3480 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3481 * operations (that is, where the mp_target can be freed out from under us). 3482 * 3483 * Lookups use the ncc->updating counter to validate the contents in order 3484 * to avoid having to obtain the per cache-element spin-lock. In addition, 3485 * the ticks field is only updated when it changes. However, if our per-cpu 3486 * lock fails due to an unmount-in-progress, we fall-back to the 3487 * cache-element's spin-lock. 3488 */ 3489 struct mount * 3490 cache_findmount(struct nchandle *nch) 3491 { 3492 struct findmount_info info; 3493 struct ncmount_cache *ncc; 3494 struct ncmount_cache ncc_copy; 3495 struct mount *target; 3496 struct pcpu_ncache *pcpu; 3497 struct spinlock *spinlk; 3498 int update; 3499 3500 pcpu = pcpu_ncache; 3501 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3502 ncc = NULL; 3503 goto skip; 3504 } 3505 pcpu += mycpu->gd_cpuid; 3506 3507 again: 3508 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3509 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3510 found: 3511 /* 3512 * This is a bit messy for now because we do not yet have 3513 * safe disposal of mount structures. We have to ref 3514 * ncc->mp_target but the 'update' counter only tell us 3515 * whether the cache has changed after the fact. 3516 * 3517 * For now get a per-cpu spinlock that will only contend 3518 * against umount's. This is the best path. If it fails, 3519 * instead of waiting on the umount we fall-back to a 3520 * shared ncc->spin lock, which will generally only cost a 3521 * cache ping-pong. 
3522 */ 3523 update = ncc->updating; 3524 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3525 spinlk = &pcpu->umount_spin; 3526 } else { 3527 spinlk = &ncc->spin; 3528 spin_lock_shared(spinlk); 3529 } 3530 if (update & 1) { /* update in progress */ 3531 spin_unlock_any(spinlk); 3532 goto skip; 3533 } 3534 ncc_copy = *ncc; 3535 cpu_lfence(); 3536 if (ncc->updating != update) { /* content changed */ 3537 spin_unlock_any(spinlk); 3538 goto again; 3539 } 3540 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3541 spin_unlock_any(spinlk); 3542 goto again; 3543 } 3544 if (ncc_copy.isneg == 0) { 3545 target = ncc_copy.mp_target; 3546 if (target->mnt_ncmounton.mount == nch->mount && 3547 target->mnt_ncmounton.ncp == nch->ncp) { 3548 /* 3549 * Cache hit (positive) (avoid dirtying 3550 * the cache line if possible) 3551 */ 3552 if (ncc->ticks != (int)ticks) 3553 ncc->ticks = (int)ticks; 3554 _cache_mntref(target); 3555 } 3556 } else { 3557 /* 3558 * Cache hit (negative) (avoid dirtying 3559 * the cache line if possible) 3560 */ 3561 if (ncc->ticks != (int)ticks) 3562 ncc->ticks = (int)ticks; 3563 target = NULL; 3564 } 3565 spin_unlock_any(spinlk); 3566 3567 return target; 3568 } 3569 skip: 3570 3571 /* 3572 * Slow 3573 */ 3574 info.result = NULL; 3575 info.nch_mount = nch->mount; 3576 info.nch_ncp = nch->ncp; 3577 mountlist_scan(cache_findmount_callback, &info, 3578 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3579 3580 /* 3581 * To reduce multi-re-entry on the cache, relookup in the cache. 3582 * This can still race, obviously, but that's ok. 3583 */ 3584 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3585 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3586 if (info.result) 3587 atomic_add_int(&info.result->mnt_refs, -1); 3588 goto found; 3589 } 3590 3591 /* 3592 * Cache the result. 3593 */ 3594 if ((info.result == NULL || 3595 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3596 spin_lock(&ncc->spin); 3597 atomic_add_int_nonlocked(&ncc->updating, 1); 3598 cpu_sfence(); 3599 KKASSERT(ncc->updating & 1); 3600 if (ncc->mp != nch->mount) { 3601 if (ncc->mp) 3602 atomic_add_int(&ncc->mp->mnt_refs, -1); 3603 atomic_add_int(&nch->mount->mnt_refs, 1); 3604 ncc->mp = nch->mount; 3605 } 3606 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3607 ncc->ticks = (int)ticks; 3608 3609 if (info.result) { 3610 ncc->isneg = 0; 3611 if (ncc->mp_target != info.result) { 3612 if (ncc->mp_target) 3613 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3614 ncc->mp_target = info.result; 3615 atomic_add_int(&info.result->mnt_refs, 1); 3616 } 3617 } else { 3618 ncc->isneg = 1; 3619 if (ncc->mp_target) { 3620 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3621 ncc->mp_target = NULL; 3622 } 3623 } 3624 cpu_sfence(); 3625 atomic_add_int_nonlocked(&ncc->updating, 1); 3626 spin_unlock(&ncc->spin); 3627 } 3628 return(info.result); 3629 } 3630 3631 static 3632 int 3633 cache_findmount_callback(struct mount *mp, void *data) 3634 { 3635 struct findmount_info *info = data; 3636 3637 /* 3638 * Check the mount's mounted-on point against the passed nch. 3639 */ 3640 if (mp->mnt_ncmounton.mount == info->nch_mount && 3641 mp->mnt_ncmounton.ncp == info->nch_ncp 3642 ) { 3643 info->result = mp; 3644 _cache_mntref(mp); 3645 return(-1); 3646 } 3647 return(0); 3648 } 3649 3650 void 3651 cache_dropmount(struct mount *mp) 3652 { 3653 _cache_mntrel(mp); 3654 } 3655 3656 /* 3657 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3658 * or negative). 
3659 * 3660 * A full scan is not required, but for now just do it anyway. 3661 */ 3662 void 3663 cache_ismounting(struct mount *mp) 3664 { 3665 struct ncmount_cache *ncc; 3666 struct mount *ncc_mp; 3667 int i; 3668 3669 if (pcpu_ncache == NULL) 3670 return; 3671 3672 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3673 ncc = &ncmount_cache[i]; 3674 if (ncc->mp != mp->mnt_ncmounton.mount || 3675 ncc->ncp != mp->mnt_ncmounton.ncp) { 3676 continue; 3677 } 3678 spin_lock(&ncc->spin); 3679 atomic_add_int_nonlocked(&ncc->updating, 1); 3680 cpu_sfence(); 3681 KKASSERT(ncc->updating & 1); 3682 if (ncc->mp != mp->mnt_ncmounton.mount || 3683 ncc->ncp != mp->mnt_ncmounton.ncp) { 3684 cpu_sfence(); 3685 ++ncc->updating; 3686 spin_unlock(&ncc->spin); 3687 continue; 3688 } 3689 ncc_mp = ncc->mp; 3690 ncc->ncp = NULL; 3691 ncc->mp = NULL; 3692 if (ncc_mp) 3693 atomic_add_int(&ncc_mp->mnt_refs, -1); 3694 ncc_mp = ncc->mp_target; 3695 ncc->mp_target = NULL; 3696 if (ncc_mp) 3697 atomic_add_int(&ncc_mp->mnt_refs, -1); 3698 ncc->ticks = (int)ticks - hz * 120; 3699 3700 cpu_sfence(); 3701 atomic_add_int_nonlocked(&ncc->updating, 1); 3702 spin_unlock(&ncc->spin); 3703 } 3704 3705 /* 3706 * Pre-cache the mount point 3707 */ 3708 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3709 mp->mnt_ncmounton.ncp); 3710 3711 spin_lock(&ncc->spin); 3712 atomic_add_int_nonlocked(&ncc->updating, 1); 3713 cpu_sfence(); 3714 KKASSERT(ncc->updating & 1); 3715 3716 if (ncc->mp) 3717 atomic_add_int(&ncc->mp->mnt_refs, -1); 3718 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3719 ncc->mp = mp->mnt_ncmounton.mount; 3720 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3721 ncc->ticks = (int)ticks; 3722 3723 ncc->isneg = 0; 3724 if (ncc->mp_target != mp) { 3725 if (ncc->mp_target) 3726 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3727 ncc->mp_target = mp; 3728 atomic_add_int(&mp->mnt_refs, 1); 3729 } 3730 cpu_sfence(); 3731 atomic_add_int_nonlocked(&ncc->updating, 1); 3732 spin_unlock(&ncc->spin); 3733 } 3734 3735 /* 3736 * Scrap any ncmount_cache entries related to mp. Not only do we need to 3737 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3738 * negative hits involving (mp, <any>). 3739 * 3740 * A full scan is required. 
3741 */ 3742 void 3743 cache_unmounting(struct mount *mp) 3744 { 3745 struct ncmount_cache *ncc; 3746 struct pcpu_ncache *pcpu; 3747 struct mount *ncc_mp; 3748 int i; 3749 3750 pcpu = pcpu_ncache; 3751 if (pcpu == NULL) 3752 return; 3753 3754 for (i = 0; i < ncpus; ++i) 3755 spin_lock(&pcpu[i].umount_spin); 3756 3757 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3758 ncc = &ncmount_cache[i]; 3759 if (ncc->mp != mp && ncc->mp_target != mp) 3760 continue; 3761 spin_lock(&ncc->spin); 3762 atomic_add_int_nonlocked(&ncc->updating, 1); 3763 cpu_sfence(); 3764 3765 if (ncc->mp != mp && ncc->mp_target != mp) { 3766 atomic_add_int_nonlocked(&ncc->updating, 1); 3767 cpu_sfence(); 3768 spin_unlock(&ncc->spin); 3769 continue; 3770 } 3771 ncc_mp = ncc->mp; 3772 ncc->ncp = NULL; 3773 ncc->mp = NULL; 3774 if (ncc_mp) 3775 atomic_add_int(&ncc_mp->mnt_refs, -1); 3776 ncc_mp = ncc->mp_target; 3777 ncc->mp_target = NULL; 3778 if (ncc_mp) 3779 atomic_add_int(&ncc_mp->mnt_refs, -1); 3780 ncc->ticks = (int)ticks - hz * 120; 3781 3782 cpu_sfence(); 3783 atomic_add_int_nonlocked(&ncc->updating, 1); 3784 spin_unlock(&ncc->spin); 3785 } 3786 3787 for (i = 0; i < ncpus; ++i) 3788 spin_unlock(&pcpu[i].umount_spin); 3789 } 3790 3791 /* 3792 * Resolve an unresolved namecache entry, generally by looking it up. 3793 * The passed ncp must be locked and refd. 3794 * 3795 * Theoretically since a vnode cannot be recycled while held, and since 3796 * the nc_parent chain holds its vnode as long as children exist, the 3797 * direct parent of the cache entry we are trying to resolve should 3798 * have a valid vnode. If not then generate an error that we can 3799 * determine is related to a resolver bug. 3800 * 3801 * However, if a vnode was in the middle of a recyclement when the NCP 3802 * got locked, ncp->nc_vp might point to a vnode that is about to become 3803 * invalid. cache_resolve() handles this case by unresolving the entry 3804 * and then re-resolving it. 3805 * 3806 * Note that successful resolution does not necessarily return an error 3807 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3808 * will be returned. 3809 */ 3810 int 3811 cache_resolve(struct nchandle *nch, struct ucred *cred) 3812 { 3813 struct namecache *par_tmp; 3814 struct namecache *par; 3815 struct namecache *ncp; 3816 struct nchandle nctmp; 3817 struct mount *mp; 3818 struct vnode *dvp; 3819 int error; 3820 3821 ncp = nch->ncp; 3822 mp = nch->mount; 3823 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3824 restart: 3825 /* 3826 * If the ncp is already resolved we have nothing to do. However, 3827 * we do want to guarentee that a usable vnode is returned when 3828 * a vnode is present, so make sure it hasn't been reclaimed. 3829 */ 3830 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3831 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3832 _cache_setunresolved(ncp); 3833 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3834 return (ncp->nc_error); 3835 } 3836 3837 /* 3838 * If the ncp was destroyed it will never resolve again. This 3839 * can basically only happen when someone is chdir'd into an 3840 * empty directory which is then rmdir'd. We want to catch this 3841 * here and not dive the VFS because the VFS might actually 3842 * have a way to re-resolve the disconnected ncp, which will 3843 * result in inconsistencies in the cdir/nch for proc->p_fd. 
	 */
	if (ncp->nc_flag & NCF_DESTROYED)
		return(EINVAL);

	/*
	 * Mount points need special handling because the parent does not
	 * belong to the same filesystem as the ncp.
	 */
	if (ncp == mp->mnt_ncmountpt.ncp)
		return (cache_resolve_mp(mp));

	/*
	 * We expect an unbroken chain of ncps to at least the mount point,
	 * and even all the way to root (but this code doesn't have to go
	 * past the mount point).
	 */
	if (ncp->nc_parent == NULL) {
		kprintf("EXDEV case 1 %p %*.*s\n", ncp,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		ncp->nc_error = EXDEV;
		return(ncp->nc_error);
	}

	/*
	 * The vp's of the parent directories in the chain are held via vhold()
	 * due to the existence of the child, and should not disappear.
	 * However, there are cases where they can disappear:
	 *
	 * - due to filesystem I/O errors.
	 * - due to NFS being stupid about tracking the namespace and
	 *   destroying the namespace for entire directories quite often.
	 * - due to forced unmounts.
	 * - due to an rmdir (parent will be marked DESTROYED)
	 *
	 * When this occurs we have to track the chain backwards and resolve
	 * it, looping until the resolver catches up to the current node. We
	 * could recurse here but we might run ourselves out of kernel stack
	 * so we do it in a more painful manner. This situation really should
	 * not occur all that often, and when it does it should not have to go
	 * back too many nodes to resolve the ncp.
	 */
	while ((dvp = cache_dvpref(ncp)) == NULL) {
		/*
		 * This case can occur if a process is CD'd into a
		 * directory which is then rmdir'd. If the parent is marked
		 * destroyed there is no point trying to resolve it.
		 */
		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
			return(ENOENT);
		par = ncp->nc_parent;
		_cache_hold(par);
		_cache_lock(par);
		while ((par_tmp = par->nc_parent) != NULL &&
		       par_tmp->nc_vp == NULL) {
			_cache_hold(par_tmp);
			_cache_lock(par_tmp);
			_cache_put(par);
			par = par_tmp;
		}
		if (par->nc_parent == NULL) {
			kprintf("EXDEV case 2 %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			return (EXDEV);
		}
		/*
		 * The parent is not set in stone, so ref and lock it to
		 * prevent it from disappearing. Also note that due to
		 * renames it is possible for our ncp to move and for par
		 * to no longer be one of its parents. We resolve it
		 * anyway; the loop will handle any moves.
		 */
		_cache_get(par);	/* additional hold/lock */
		_cache_put(par);	/* from earlier hold/lock */
		if (par == nch->mount->mnt_ncmountpt.ncp) {
			cache_resolve_mp(nch->mount);
		} else if ((dvp = cache_dvpref(par)) == NULL) {
			kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			continue;
		} else {
			if (par->nc_flag & NCF_UNRESOLVED) {
				nctmp.mount = mp;
				nctmp.ncp = par;
				par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
			}
			vrele(dvp);
		}
		if ((error = par->nc_error) != 0) {
			if (par->nc_error != EAGAIN) {
				kprintf("EXDEV case 3 %*.*s error %d\n",
					par->nc_nlen, par->nc_nlen, par->nc_name,
					par->nc_error);
				_cache_put(par);
				return(error);
			}
			kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
				par, par->nc_nlen, par->nc_nlen, par->nc_name);
		}
		_cache_put(par);
		/* loop */
	}

	/*
	 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
	 * ncp's and reattach them. If this occurs the original ncp is marked
	 * EAGAIN to force a relookup.
	 *
	 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
	 *	 ncp must already be resolved.
	 */
	if (dvp) {
		nctmp.mount = mp;
		nctmp.ncp = ncp;
		ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
		vrele(dvp);
	} else {
		ncp->nc_error = EPERM;
	}
	if (ncp->nc_error == EAGAIN) {
		kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
			ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		goto restart;
	}
	return(ncp->nc_error);
}

/*
 * Resolve the ncp associated with a mount point. Such ncp's almost always
 * remain resolved and this routine is rarely called. NFS MPs tend to force
 * re-resolution more often due to NFS's Mack-truck-smash-the-namecache
 * method of tracking namespace changes.
 *
 * The semantics of this call are that the passed ncp must be locked on
 * entry and will be locked on return. However, if we actually have to
 * resolve the mount point we temporarily unlock the entry in order to
 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
 * the unlock we have to recheck the flags after we relock.
 */
static int
cache_resolve_mp(struct mount *mp)
{
	struct namecache *ncp = mp->mnt_ncmountpt.ncp;
	struct vnode *vp;
	int error;

	KKASSERT(mp != NULL);

	/*
	 * If the ncp is already resolved we have nothing to do. However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
	}

	if (ncp->nc_flag & NCF_UNRESOLVED) {
		_cache_unlock(ncp);
		while (vfs_busy(mp, 0))
			;
		error = VFS_ROOT(mp, &vp);
		_cache_lock(ncp);

		/*
		 * recheck the ncp state after relocking.
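		 *
		 * Another thread may have resolved (or re-resolved) the ncp
		 * while it was unlocked across VFS_ROOT(). Schematic
		 * interleaving (illustrative only):
		 *
		 *	this cpu			other cpu
		 *	_cache_unlock(ncp);
		 *	VFS_ROOT(mp, &vp);		resolves ncp itself
		 *	_cache_lock(ncp);
		 *	(NCF_UNRESOLVED now clear, so we only vput(vp) below)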
		 */
		if (ncp->nc_flag & NCF_UNRESOLVED) {
			ncp->nc_error = error;
			if (error == 0) {
				_cache_setvp(mp, ncp, vp);
				vput(vp);
			} else {
				kprintf("[diagnostic] cache_resolve_mp: failed"
					" to resolve mount %p err=%d ncp=%p\n",
					mp, error, ncp);
				_cache_setvp(mp, ncp, NULL);
			}
		} else if (error == 0) {
			vput(vp);
		}
		vfs_unbusy(mp);
	}
	return(ncp->nc_error);
}

/*
 * Clean out negative cache entries when too many have accumulated.
 */
static void
_cache_cleanneg(long count)
{
	struct pcpu_ncache *pn;
	struct namecache *ncp;
	static uint32_t neg_rover;
	uint32_t n;
	long vnegs;

	n = neg_rover++;	/* SMP heuristic, race ok */
	cpu_ccfence();
	n = n % (uint32_t)ncpus;

	/*
	 * Normalize vfscache_negs and count. count is sometimes based
	 * on vfscache_negs. vfscache_negs is a heuristic and can sometimes
	 * have crazy values.
	 */
	vnegs = vfscache_negs;
	cpu_ccfence();
	if (vnegs <= MINNEG)
		vnegs = MINNEG;
	if (count < 1)
		count = 1;

	pn = &pcpu_ncache[n];
	spin_lock(&pn->neg_spin);
	count = pn->neg_count * count / vnegs + 1;
	spin_unlock(&pn->neg_spin);

	/*
	 * Attempt to clean out the specified number of negative cache
	 * entries.
	 */
	while (count > 0) {
		spin_lock(&pn->neg_spin);
		ncp = TAILQ_FIRST(&pn->neg_list);
		if (ncp == NULL) {
			spin_unlock(&pn->neg_spin);
			break;
		}
		TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);
		spin_unlock(&pn->neg_spin);

		/*
		 * This can race, so we must re-check that the ncp
		 * is still on the neg_list after successfully locking it.
		 */
		if (_cache_lock_special(ncp) == 0) {
			if (ncp->nc_vp == NULL &&
			    (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				cache_zap(ncp);
			} else {
				_cache_unlock(ncp);
				_cache_drop(ncp);
			}
		} else {
			_cache_drop(ncp);
		}
		--count;
	}
}

/*
 * Clean out positive cache entries when too many have accumulated.
 */
static void
_cache_cleanpos(long count)
{
	static volatile int rover;
	struct nchash_head *nchpp;
	struct namecache *ncp;
	int rover_copy;

	/*
	 * Attempt to clean out the specified number of positive cache
	 * entries.
	 */
	while (count > 0) {
		rover_copy = ++rover;	/* MPSAFEENOUGH */
		cpu_ccfence();
		nchpp = NCHHASH(rover_copy);

		if (TAILQ_FIRST(&nchpp->list) == NULL) {
			--count;
			continue;
		}

		/*
		 * Cycle ncp on list, ignore and do not move DUMMY
		 * ncps. These are temporary list iterators.
		 *
		 * We must cycle the ncp to the end of the list to
		 * ensure that all ncp's have an equal chance of
		 * being removed.
4132 */ 4133 spin_lock(&nchpp->spin); 4134 ncp = TAILQ_FIRST(&nchpp->list); 4135 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 4136 ncp = TAILQ_NEXT(ncp, nc_hash); 4137 if (ncp) { 4138 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 4139 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 4140 _cache_hold(ncp); 4141 } 4142 spin_unlock(&nchpp->spin); 4143 4144 if (ncp) { 4145 if (_cache_lock_special(ncp) == 0) { 4146 cache_zap(ncp); 4147 } else { 4148 _cache_drop(ncp); 4149 } 4150 } 4151 --count; 4152 } 4153 } 4154 4155 /* 4156 * This is a kitchen sink function to clean out ncps which we 4157 * tried to zap from cache_drop() but failed because we were 4158 * unable to acquire the parent lock. 4159 * 4160 * Such entries can also be removed via cache_inval_vp(), such 4161 * as when unmounting. 4162 */ 4163 static void 4164 _cache_cleandefered(void) 4165 { 4166 struct nchash_head *nchpp; 4167 struct namecache *ncp; 4168 struct namecache dummy; 4169 int i; 4170 4171 /* 4172 * Create a list iterator. DUMMY indicates that this is a list 4173 * iterator, DESTROYED prevents matches by lookup functions. 4174 */ 4175 numdefered = 0; 4176 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4177 bzero(&dummy, sizeof(dummy)); 4178 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4179 dummy.nc_refs = 1; 4180 4181 for (i = 0; i <= nchash; ++i) { 4182 nchpp = &nchashtbl[i]; 4183 4184 spin_lock(&nchpp->spin); 4185 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4186 ncp = &dummy; 4187 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4188 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4189 continue; 4190 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4191 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4192 _cache_hold(ncp); 4193 spin_unlock(&nchpp->spin); 4194 if (_cache_lock_nonblock(ncp) == 0) { 4195 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4196 _cache_unlock(ncp); 4197 } 4198 _cache_drop(ncp); 4199 spin_lock(&nchpp->spin); 4200 ncp = &dummy; 4201 } 4202 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4203 spin_unlock(&nchpp->spin); 4204 } 4205 } 4206 4207 /* 4208 * Name cache initialization, from vfsinit() when we are booting 4209 */ 4210 void 4211 nchinit(void) 4212 { 4213 struct pcpu_ncache *pn; 4214 globaldata_t gd; 4215 int i; 4216 4217 /* 4218 * Per-cpu accounting and negative hit list 4219 */ 4220 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4221 M_VFSCACHE, M_WAITOK|M_ZERO); 4222 for (i = 0; i < ncpus; ++i) { 4223 pn = &pcpu_ncache[i]; 4224 TAILQ_INIT(&pn->neg_list); 4225 spin_init(&pn->neg_spin, "ncneg"); 4226 spin_init(&pn->umount_spin, "ncumm"); 4227 } 4228 4229 /* 4230 * Initialise per-cpu namecache effectiveness statistics. 4231 */ 4232 for (i = 0; i < ncpus; ++i) { 4233 gd = globaldata_find(i); 4234 gd->gd_nchstats = &nchstats[i]; 4235 } 4236 4237 /* 4238 * Create a generous namecache hash table 4239 */ 4240 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4241 sizeof(struct nchash_head), 4242 M_VFSCACHE, &nchash); 4243 for (i = 0; i <= (int)nchash; ++i) { 4244 TAILQ_INIT(&nchashtbl[i].list); 4245 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4246 } 4247 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4248 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4249 nclockwarn = 5 * hz; 4250 } 4251 4252 /* 4253 * Called from start_init() to bootstrap the root filesystem. Returns 4254 * a referenced, unlocked namecache record. 
4255 */ 4256 void 4257 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 4258 { 4259 nch->ncp = cache_alloc(0); 4260 nch->mount = mp; 4261 _cache_mntref(mp); 4262 if (vp) 4263 _cache_setvp(nch->mount, nch->ncp, vp); 4264 } 4265 4266 /* 4267 * vfs_cache_setroot() 4268 * 4269 * Create an association between the root of our namecache and 4270 * the root vnode. This routine may be called several times during 4271 * booting. 4272 * 4273 * If the caller intends to save the returned namecache pointer somewhere 4274 * it must cache_hold() it. 4275 */ 4276 void 4277 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 4278 { 4279 struct vnode *ovp; 4280 struct nchandle onch; 4281 4282 ovp = rootvnode; 4283 onch = rootnch; 4284 rootvnode = nvp; 4285 if (nch) 4286 rootnch = *nch; 4287 else 4288 cache_zero(&rootnch); 4289 if (ovp) 4290 vrele(ovp); 4291 if (onch.ncp) 4292 cache_drop(&onch); 4293 } 4294 4295 /* 4296 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 4297 * topology and is being removed as quickly as possible. The new VOP_N*() 4298 * API calls are required to make specific adjustments using the supplied 4299 * ncp pointers rather then just bogusly purging random vnodes. 4300 * 4301 * Invalidate all namecache entries to a particular vnode as well as 4302 * any direct children of that vnode in the namecache. This is a 4303 * 'catch all' purge used by filesystems that do not know any better. 4304 * 4305 * Note that the linkage between the vnode and its namecache entries will 4306 * be removed, but the namecache entries themselves might stay put due to 4307 * active references from elsewhere in the system or due to the existance of 4308 * the children. The namecache topology is left intact even if we do not 4309 * know what the vnode association is. Such entries will be marked 4310 * NCF_UNRESOLVED. 4311 */ 4312 void 4313 cache_purge(struct vnode *vp) 4314 { 4315 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 4316 } 4317 4318 __read_mostly static int disablecwd; 4319 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 4320 "Disable getcwd"); 4321 4322 /* 4323 * MPALMOSTSAFE 4324 */ 4325 int 4326 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap) 4327 { 4328 u_int buflen; 4329 int error; 4330 char *buf; 4331 char *bp; 4332 4333 if (disablecwd) 4334 return (ENODEV); 4335 4336 buflen = uap->buflen; 4337 if (buflen == 0) 4338 return (EINVAL); 4339 if (buflen > MAXPATHLEN) 4340 buflen = MAXPATHLEN; 4341 4342 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4343 bp = kern_getcwd(buf, buflen, &error); 4344 if (error == 0) 4345 error = copyout(bp, uap->buf, strlen(bp) + 1); 4346 kfree(buf, M_TEMP); 4347 return (error); 4348 } 4349 4350 char * 4351 kern_getcwd(char *buf, size_t buflen, int *error) 4352 { 4353 struct proc *p = curproc; 4354 char *bp; 4355 int i, slash_prefixed; 4356 struct filedesc *fdp; 4357 struct nchandle nch; 4358 struct namecache *ncp; 4359 4360 bp = buf; 4361 bp += buflen - 1; 4362 *bp = '\0'; 4363 fdp = p->p_fd; 4364 slash_prefixed = 0; 4365 4366 nch = fdp->fd_ncdir; 4367 ncp = nch.ncp; 4368 if (ncp) 4369 _cache_hold(ncp); 4370 4371 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4372 nch.mount != fdp->fd_nrdir.mount) 4373 ) { 4374 if (ncp->nc_flag & NCF_DESTROYED) { 4375 _cache_drop(ncp); 4376 ncp = NULL; 4377 break; 4378 } 4379 /* 4380 * While traversing upwards if we encounter the root 4381 * of the current mount we have to skip to the mount point 4382 * in the underlying filesystem. 
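		 *
		 * For example (hypothetical layout): with a filesystem
		 * mounted on /usr and the cwd at /usr/local/bin, the walk
		 * reaches the root ncp of the /usr filesystem after
		 * prepending "/local/bin" and then hops across the mount:
		 *
		 *	root ncp of /usr fs  --mnt_ncmounton-->  "usr" ncp
		 *						 (parent fs)
		 *
		 * and continues upward from there.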
4383 */ 4384 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4385 nch = nch.mount->mnt_ncmounton; 4386 _cache_drop(ncp); 4387 ncp = nch.ncp; 4388 if (ncp) 4389 _cache_hold(ncp); 4390 continue; 4391 } 4392 4393 /* 4394 * Prepend the path segment 4395 */ 4396 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4397 if (bp == buf) { 4398 *error = ERANGE; 4399 bp = NULL; 4400 goto done; 4401 } 4402 *--bp = ncp->nc_name[i]; 4403 } 4404 if (bp == buf) { 4405 *error = ERANGE; 4406 bp = NULL; 4407 goto done; 4408 } 4409 *--bp = '/'; 4410 slash_prefixed = 1; 4411 4412 /* 4413 * Go up a directory. This isn't a mount point so we don't 4414 * have to check again. 4415 */ 4416 while ((nch.ncp = ncp->nc_parent) != NULL) { 4417 if (ncp_shared_lock_disable) 4418 _cache_lock(ncp); 4419 else 4420 _cache_lock_shared(ncp); 4421 if (nch.ncp != ncp->nc_parent) { 4422 _cache_unlock(ncp); 4423 continue; 4424 } 4425 _cache_hold(nch.ncp); 4426 _cache_unlock(ncp); 4427 break; 4428 } 4429 _cache_drop(ncp); 4430 ncp = nch.ncp; 4431 } 4432 if (ncp == NULL) { 4433 *error = ENOENT; 4434 bp = NULL; 4435 goto done; 4436 } 4437 if (!slash_prefixed) { 4438 if (bp == buf) { 4439 *error = ERANGE; 4440 bp = NULL; 4441 goto done; 4442 } 4443 *--bp = '/'; 4444 } 4445 *error = 0; 4446 done: 4447 if (ncp) 4448 _cache_drop(ncp); 4449 return (bp); 4450 } 4451 4452 /* 4453 * Thus begins the fullpath magic. 4454 * 4455 * The passed nchp is referenced but not locked. 4456 */ 4457 __read_mostly static int disablefullpath; 4458 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 4459 &disablefullpath, 0, 4460 "Disable fullpath lookups"); 4461 4462 int 4463 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 4464 char **retbuf, char **freebuf, int guess) 4465 { 4466 struct nchandle fd_nrdir; 4467 struct nchandle nch; 4468 struct namecache *ncp; 4469 struct mount *mp, *new_mp; 4470 char *bp, *buf; 4471 int slash_prefixed; 4472 int error = 0; 4473 int i; 4474 4475 *retbuf = NULL; 4476 *freebuf = NULL; 4477 4478 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 4479 bp = buf + MAXPATHLEN - 1; 4480 *bp = '\0'; 4481 if (nchbase) 4482 fd_nrdir = *nchbase; 4483 else if (p != NULL) 4484 fd_nrdir = p->p_fd->fd_nrdir; 4485 else 4486 fd_nrdir = rootnch; 4487 slash_prefixed = 0; 4488 nch = *nchp; 4489 ncp = nch.ncp; 4490 if (ncp) 4491 _cache_hold(ncp); 4492 mp = nch.mount; 4493 4494 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 4495 new_mp = NULL; 4496 4497 /* 4498 * If we are asked to guess the upwards path, we do so whenever 4499 * we encounter an ncp marked as a mountpoint. We try to find 4500 * the actual mountpoint by finding the mountpoint with this 4501 * ncp. 4502 */ 4503 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 4504 new_mp = mount_get_by_nc(ncp); 4505 } 4506 /* 4507 * While traversing upwards if we encounter the root 4508 * of the current mount we have to skip to the mount point. 
4509 */ 4510 if (ncp == mp->mnt_ncmountpt.ncp) { 4511 new_mp = mp; 4512 } 4513 if (new_mp) { 4514 nch = new_mp->mnt_ncmounton; 4515 _cache_drop(ncp); 4516 ncp = nch.ncp; 4517 if (ncp) 4518 _cache_hold(ncp); 4519 mp = nch.mount; 4520 continue; 4521 } 4522 4523 /* 4524 * Prepend the path segment 4525 */ 4526 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4527 if (bp == buf) { 4528 kfree(buf, M_TEMP); 4529 error = ENOMEM; 4530 goto done; 4531 } 4532 *--bp = ncp->nc_name[i]; 4533 } 4534 if (bp == buf) { 4535 kfree(buf, M_TEMP); 4536 error = ENOMEM; 4537 goto done; 4538 } 4539 *--bp = '/'; 4540 slash_prefixed = 1; 4541 4542 /* 4543 * Go up a directory. This isn't a mount point so we don't 4544 * have to check again. 4545 * 4546 * We can only safely access nc_parent with ncp held locked. 4547 */ 4548 while ((nch.ncp = ncp->nc_parent) != NULL) { 4549 _cache_lock_shared(ncp); 4550 if (nch.ncp != ncp->nc_parent) { 4551 _cache_unlock(ncp); 4552 continue; 4553 } 4554 _cache_hold(nch.ncp); 4555 _cache_unlock(ncp); 4556 break; 4557 } 4558 _cache_drop(ncp); 4559 ncp = nch.ncp; 4560 } 4561 if (ncp == NULL) { 4562 kfree(buf, M_TEMP); 4563 error = ENOENT; 4564 goto done; 4565 } 4566 4567 if (!slash_prefixed) { 4568 if (bp == buf) { 4569 kfree(buf, M_TEMP); 4570 error = ENOMEM; 4571 goto done; 4572 } 4573 *--bp = '/'; 4574 } 4575 *retbuf = bp; 4576 *freebuf = buf; 4577 error = 0; 4578 done: 4579 if (ncp) 4580 _cache_drop(ncp); 4581 return(error); 4582 } 4583 4584 int 4585 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4586 char **freebuf, int guess) 4587 { 4588 struct namecache *ncp; 4589 struct nchandle nch; 4590 int error; 4591 4592 *freebuf = NULL; 4593 if (disablefullpath) 4594 return (ENODEV); 4595 4596 if (p == NULL) 4597 return (EINVAL); 4598 4599 /* vn is NULL, client wants us to use p->p_textvp */ 4600 if (vn == NULL) { 4601 if ((vn = p->p_textvp) == NULL) 4602 return (EINVAL); 4603 } 4604 spin_lock_shared(&vn->v_spin); 4605 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4606 if (ncp->nc_nlen) 4607 break; 4608 } 4609 if (ncp == NULL) { 4610 spin_unlock_shared(&vn->v_spin); 4611 return (EINVAL); 4612 } 4613 _cache_hold(ncp); 4614 spin_unlock_shared(&vn->v_spin); 4615 4616 nch.ncp = ncp; 4617 nch.mount = vn->v_mount; 4618 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4619 _cache_drop(ncp); 4620 return (error); 4621 } 4622 4623 void 4624 vfscache_rollup_cpu(struct globaldata *gd) 4625 { 4626 struct pcpu_ncache *pn; 4627 long count; 4628 4629 if (pcpu_ncache == NULL) 4630 return; 4631 pn = &pcpu_ncache[gd->gd_cpuid]; 4632 4633 if (pn->vfscache_count) { 4634 count = atomic_swap_long(&pn->vfscache_count, 0); 4635 atomic_add_long(&vfscache_count, count); 4636 } 4637 if (pn->vfscache_leafs) { 4638 count = atomic_swap_long(&pn->vfscache_leafs, 0); 4639 atomic_add_long(&vfscache_leafs, count); 4640 } 4641 if (pn->vfscache_negs) { 4642 count = atomic_swap_long(&pn->vfscache_negs, 0); 4643 atomic_add_long(&vfscache_negs, count); 4644 } 4645 if (pn->numdefered) { 4646 count = atomic_swap_long(&pn->numdefered, 0); 4647 atomic_add_long(&numdefered, count); 4648 } 4649 } 4650
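
/*
 * Example (sketch only, not part of the cache proper): a caller resolving
 * a vnode back to a path via vn_fullpath(). The variable names are
 * hypothetical; on success *freebuf must be released with kfree(..., M_TEMP),
 * matching the allocation made by cache_fullpath() above.
 *
 *	char *path, *fbuf;
 *
 *	if (vn_fullpath(curproc, vp, &path, &fbuf, 0) == 0) {
 *		kprintf("vnode %p resolves to %s\n", vp, path);
 *		kfree(fbuf, M_TEMP);
 *	}
 */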