1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysmsg.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin;
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (6) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
		  "namecache", "namecache entries");
MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
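 * (With the layout below, list 16 + spin 8 + pad01 8 = 32 bytes, so two
 * entries can share a 64-byte cache line but no single entry straddles
 * one, assuming the common 64-byte line size.)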
152 */ 153 struct nchash_head { 154 struct nchash_list list; /* 16 bytes */ 155 struct spinlock spin; /* 8 bytes */ 156 long pad01; /* 8 bytes */ 157 }; 158 159 struct ncmount_cache { 160 struct spinlock spin; 161 struct namecache *ncp; 162 struct mount *mp; 163 struct mount *mp_target; 164 int isneg; 165 int ticks; 166 int updating; 167 int unused01; 168 }; 169 170 struct pcpu_ncache { 171 struct spinlock umount_spin; /* cache_findmount/interlock */ 172 struct spinlock neg_spin; /* for neg_list and neg_count */ 173 struct namecache_list neg_list; 174 long neg_count; 175 long vfscache_negs; 176 long vfscache_count; 177 long vfscache_leafs; 178 long vfscache_unres; 179 long numdefered; 180 long inv_kid_quick_count; 181 long inv_ncp_quick_count; 182 long clean_pos_count; 183 long clean_neg_count; 184 } __cachealign; 185 186 __read_mostly static struct nchash_head *nchashtbl; 187 __read_mostly static struct pcpu_ncache *pcpu_ncache; 188 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 189 190 /* 191 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 192 * to create the namecache infrastructure leading to a dangling vnode. 193 * 194 * 0 Only errors are reported 195 * 1 Successes are reported 196 * 2 Successes + the whole directory scan is reported 197 * 3 Force the directory scan code run as if the parent vnode did not 198 * have a namecache record, even if it does have one. 199 */ 200 __read_mostly int ncvp_debug; 201 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 202 "Namecache debug level (0-3)"); 203 204 __read_mostly static u_long nchash; /* size of hash table */ 205 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 206 "Size of namecache hash table"); 207 208 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 209 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 210 "Batch flush negative entries"); 211 212 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 213 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 214 "Batch flush positive entries"); 215 216 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 217 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 218 "Ratio of negative namecache entries"); 219 220 __read_mostly static int ncposfactor = 16; /* ratio of unres+leaf entries */ 221 SYSCTL_INT(_debug, OID_AUTO, ncposfactor, CTLFLAG_RW, &ncposfactor, 0, 222 "Ratio of unresolved leaf namecache entries"); 223 224 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 225 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 226 "Warn on locked namecache entries in ticks"); 227 228 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 229 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 230 "Number of cache entries allocated"); 231 232 __read_mostly static int ncp_shared_lock_disable = 0; 233 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 234 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 235 236 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 237 "sizeof(struct vnode)"); 238 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 239 "sizeof(struct namecache)"); 240 241 __read_mostly static int ncmount_cache_enable = 1; 242 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 243 &ncmount_cache_enable, 0, "mount point cache"); 244 
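/*
 * The debug knobs above are ordinary sysctls and can be inspected or
 * tuned at runtime, e.g.:
 *
 *	sysctl debug.ncvp_debug=2	(report successes + directory scans)
 *	sysctl debug.ncnegfactor	(read the negative-entry ratio)
 */
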
static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp, int adjgen);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp, int adjgen);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long ucount, long xcount);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of leaf namecache entries");
static long vfscache_unres;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numunres, CTLFLAG_RD, &vfscache_unres, 0,
    "Number of unresolved leaf namecache entries");

static long inv_kid_quick_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_kid_quick_count, CTLFLAG_RD,
    &inv_kid_quick_count, 0,
    "quick kid invalidations");
static long inv_ncp_quick_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_ncp_quick_count, CTLFLAG_RD,
    &inv_ncp_quick_count, 0,
    "quick ncp invalidations");
static long clean_pos_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_pos_count, CTLFLAG_RD,
    &clean_pos_count, 0,
    "positive ncp cleanings");
static long clean_neg_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_neg_count, CTLFLAG_RD,
    &clean_neg_count, 0,
    "negative ncp cleanings");

static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred namecache zaps");

/*
 * Returns the number of basic references expected on the ncp, not
 * including any children.  1 for the natural ref, and an additional ref
 * if the ncp is resolved (representing a positive or negative hit).
 */
static __inline int
ncpbaserefs(struct namecache *ncp)
{
	return (1 + ((ncp->nc_flag & NCF_UNRESOLVED) == 0));
}

struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static int cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.
This improves concurrent SMP 336 * performance and is particularly important on multi-socket systems to 337 * reduce cache-line ping-ponging. 338 * 339 * Try to keep the pcpu structure within one cache line (~64 bytes). 340 */ 341 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 342 #define MNTCACHE_SET 8 /* set associativity */ 343 344 struct mntcache_elm { 345 struct namecache *ncp; 346 struct mount *mp; 347 int ticks; 348 int unused01; 349 }; 350 351 struct mntcache { 352 struct mntcache_elm array[MNTCACHE_COUNT]; 353 } __cachealign; 354 355 static struct mntcache pcpu_mntcache[MAXCPU]; 356 357 static __inline 358 void 359 _cache_ncp_gen_enter(struct namecache *ncp) 360 { 361 ncp->nc_generation += 2; 362 cpu_sfence(); 363 } 364 365 static __inline 366 void 367 _cache_ncp_gen_exit(struct namecache *ncp) 368 { 369 cpu_sfence(); 370 ncp->nc_generation += 2; 371 cpu_sfence(); 372 } 373 374 static __inline 375 struct mntcache_elm * 376 _cache_mntcache_hash(void *ptr) 377 { 378 struct mntcache_elm *elm; 379 int hv; 380 381 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 382 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 383 384 return elm; 385 } 386 387 static 388 void 389 _cache_mntref(struct mount *mp) 390 { 391 struct mntcache_elm *elm; 392 struct mount *mpr; 393 int i; 394 395 elm = _cache_mntcache_hash(mp); 396 for (i = 0; i < MNTCACHE_SET; ++i) { 397 if (elm->mp == mp) { 398 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 399 if (__predict_true(mpr == mp)) 400 return; 401 if (mpr) 402 atomic_add_int(&mpr->mnt_refs, -1); 403 } 404 ++elm; 405 } 406 atomic_add_int(&mp->mnt_refs, 1); 407 } 408 409 static 410 void 411 _cache_mntrel(struct mount *mp) 412 { 413 struct mntcache_elm *elm; 414 struct mntcache_elm *best; 415 struct mount *mpr; 416 int delta1; 417 int delta2; 418 int i; 419 420 elm = _cache_mntcache_hash(mp); 421 best = elm; 422 for (i = 0; i < MNTCACHE_SET; ++i) { 423 if (elm->mp == NULL) { 424 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 425 if (__predict_false(mpr != NULL)) { 426 atomic_add_int(&mpr->mnt_refs, -1); 427 } 428 elm->ticks = ticks; 429 return; 430 } 431 delta1 = ticks - best->ticks; 432 delta2 = ticks - elm->ticks; 433 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 434 best = elm; 435 ++elm; 436 } 437 mpr = atomic_swap_ptr((void *)&best->mp, mp); 438 best->ticks = ticks; 439 if (mpr) 440 atomic_add_int(&mpr->mnt_refs, -1); 441 } 442 443 /* 444 * Clears all cached mount points on all cpus. This routine should only 445 * be called when we are waiting for a mount to clear, e.g. so we can 446 * unmount. 447 */ 448 void 449 cache_clearmntcache(struct mount *target __unused) 450 { 451 int n; 452 453 for (n = 0; n < ncpus; ++n) { 454 struct mntcache *cache = &pcpu_mntcache[n]; 455 struct mntcache_elm *elm; 456 struct namecache *ncp; 457 struct mount *mp; 458 int i; 459 460 for (i = 0; i < MNTCACHE_COUNT; ++i) { 461 elm = &cache->array[i]; 462 if (elm->mp) { 463 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 464 if (mp) 465 atomic_add_int(&mp->mnt_refs, -1); 466 } 467 if (elm->ncp) { 468 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 469 if (ncp) 470 _cache_drop(ncp); 471 } 472 } 473 } 474 } 475 476 /* 477 * Namespace locking. The caller must already hold a reference to the 478 * namecache structure in order to lock/unlock it. The controlling entity 479 * in a 1->0 transition does not need to lock the ncp to dispose of it, 480 * as nobody else will have visibility to it at that point. 
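 * (The disposal on the 1->0 transition itself is handled by _cache_drop(),
 * below.)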
481 * 482 * Note that holding a locked namecache structure prevents other threads 483 * from making namespace changes (e.g. deleting or creating), prevents 484 * vnode association state changes by other threads, and prevents the 485 * namecache entry from being resolved or unresolved by other threads. 486 * 487 * An exclusive lock owner has full authority to associate/disassociate 488 * vnodes and resolve/unresolve the locked ncp. 489 * 490 * A shared lock owner only has authority to acquire the underlying vnode, 491 * if any. 492 * 493 * The primary lock field is nc_lockstatus. nc_locktd is set after the 494 * fact (when locking) or cleared prior to unlocking. 495 * 496 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 497 * or recycled, but it does NOT help you if the vnode had already 498 * initiated a recyclement. If this is important, use cache_get() 499 * rather then cache_lock() (and deal with the differences in the 500 * way the refs counter is handled). Or, alternatively, make an 501 * unconditional call to cache_validate() or cache_resolve() 502 * after cache_lock() returns. 503 */ 504 static __inline 505 void 506 _cache_lock(struct namecache *ncp) 507 { 508 int didwarn = 0; 509 int error; 510 511 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 512 while (__predict_false(error == EWOULDBLOCK)) { 513 if (didwarn == 0) { 514 didwarn = ticks - nclockwarn; 515 kprintf("[diagnostic] cache_lock: " 516 "%s blocked on %p " 517 "\"%*.*s\"\n", 518 curthread->td_comm, ncp, 519 ncp->nc_nlen, ncp->nc_nlen, 520 ncp->nc_name); 521 } 522 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK); 523 } 524 if (__predict_false(didwarn)) { 525 kprintf("[diagnostic] cache_lock: " 526 "%s unblocked %*.*s after %d secs\n", 527 curthread->td_comm, 528 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 529 (int)(ticks - didwarn) / hz); 530 } 531 } 532 533 /* 534 * Release a previously acquired lock. 535 * 536 * A concurrent shared-lock acquisition or acquisition/release can 537 * race bit 31 so only drop the ncp if bit 31 was set. 538 */ 539 static __inline 540 void 541 _cache_unlock(struct namecache *ncp) 542 { 543 lockmgr(&ncp->nc_lock, LK_RELEASE); 544 } 545 546 /* 547 * Lock ncp exclusively, non-blocking. Return 0 on success. 548 */ 549 static __inline 550 int 551 _cache_lock_nonblock(struct namecache *ncp) 552 { 553 int error; 554 555 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 556 if (__predict_false(error != 0)) { 557 return(EWOULDBLOCK); 558 } 559 return 0; 560 } 561 562 /* 563 * This is a special form of _cache_lock() which only succeeds if 564 * it can get a pristine, non-recursive lock. The caller must have 565 * already ref'd the ncp. 566 * 567 * On success the ncp will be locked, on failure it will not. The 568 * ref count does not change either way. 569 * 570 * We want _cache_lock_special() (on success) to return a definitively 571 * usable vnode or a definitively unresolved ncp. 572 */ 573 static __inline 574 int 575 _cache_lock_special(struct namecache *ncp) 576 { 577 if (_cache_lock_nonblock(ncp) == 0) { 578 if (lockmgr_oneexcl(&ncp->nc_lock)) { 579 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 580 _cache_setunresolved(ncp, 1); 581 return 0; 582 } 583 _cache_unlock(ncp); 584 } 585 return EWOULDBLOCK; 586 } 587 588 /* 589 * Shared lock, guarantees vp held 590 * 591 * The shared lock holds vp on the 0->1 transition. It is possible to race 592 * another shared lock release, preventing the other release from dropping 593 * the vnode and clearing bit 31. 
594 * 595 * If it is not set then we are responsible for setting it, and this 596 * responsibility does not race with anyone else. 597 */ 598 static __inline 599 void 600 _cache_lock_shared(struct namecache *ncp) 601 { 602 int didwarn = 0; 603 int error; 604 605 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 606 while (__predict_false(error == EWOULDBLOCK)) { 607 if (didwarn == 0) { 608 didwarn = ticks - nclockwarn; 609 kprintf("[diagnostic] cache_lock_shared: " 610 "%s blocked on %p " 611 "\"%*.*s\"\n", 612 curthread->td_comm, ncp, 613 ncp->nc_nlen, ncp->nc_nlen, 614 ncp->nc_name); 615 } 616 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 617 } 618 if (__predict_false(didwarn)) { 619 kprintf("[diagnostic] cache_lock_shared: " 620 "%s unblocked %*.*s after %d secs\n", 621 curthread->td_comm, 622 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 623 (int)(ticks - didwarn) / hz); 624 } 625 } 626 627 /* 628 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 629 */ 630 static __inline 631 int 632 _cache_lock_shared_nonblock(struct namecache *ncp) 633 { 634 int error; 635 636 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 637 if (__predict_false(error != 0)) { 638 return(EWOULDBLOCK); 639 } 640 return 0; 641 } 642 643 /* 644 * This function tries to get a shared lock but will back-off to an 645 * exclusive lock if: 646 * 647 * (1) Some other thread is trying to obtain an exclusive lock 648 * (to prevent the exclusive requester from getting livelocked out 649 * by many shared locks). 650 * 651 * (2) The current thread already owns an exclusive lock (to avoid 652 * deadlocking). 653 * 654 * WARNING! On machines with lots of cores we really want to try hard to 655 * get a shared lock or concurrent path lookups can chain-react 656 * into a very high-latency exclusive lock. 657 * 658 * This is very evident in dsynth's initial scans. 659 */ 660 static __inline 661 int 662 _cache_lock_shared_special(struct namecache *ncp) 663 { 664 /* 665 * Only honor a successful shared lock (returning 0) if there is 666 * no exclusive request pending and the vnode, if present, is not 667 * in a reclaimed state. 668 */ 669 if (_cache_lock_shared_nonblock(ncp) == 0) { 670 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 671 if (ncp->nc_vp == NULL || 672 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 673 return(0); 674 } 675 } 676 _cache_unlock(ncp); 677 return(EWOULDBLOCK); 678 } 679 680 /* 681 * Non-blocking shared lock failed. If we already own the exclusive 682 * lock just acquire another exclusive lock (instead of deadlocking). 683 * Otherwise acquire a shared lock. 684 */ 685 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 686 _cache_lock(ncp); 687 return(0); 688 } 689 _cache_lock_shared(ncp); 690 return(0); 691 } 692 693 /* 694 * Returns: 695 * -1 Locked by other 696 * 0 Not locked 697 * (v) LK_SHARED or LK_EXCLUSIVE 698 */ 699 static __inline 700 int 701 _cache_lockstatus(struct namecache *ncp) 702 { 703 int status; 704 705 status = lockstatus(&ncp->nc_lock, curthread); 706 if (status == LK_EXCLOTHER) 707 status = -1; 708 return status; 709 } 710 711 /* 712 * cache_hold() and cache_drop() prevent the premature deletion of a 713 * namecache entry but do not prevent operations (such as zapping) on 714 * that namecache entry. 715 * 716 * This routine may only be called from outside this source module if 717 * nc_refs is already deterministically at least 1, such as being 718 * associated with e.g. a process, file descriptor, or some other entity. 
719 * 720 * Only the above situations, similar situations within this module where 721 * the ref count is deterministically at least 1, or when the ncp is found 722 * via the nchpp (hash table) lookup, can bump nc_refs. 723 * 724 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs. It 725 * can still be removed from the nc_list, however, as long as the caller 726 * can acquire its lock (in the wrong order). 727 * 728 * This is a rare case where callers are allowed to hold a spinlock, 729 * so we can't ourselves. 730 */ 731 static __inline 732 struct namecache * 733 _cache_hold(struct namecache *ncp) 734 { 735 KKASSERT(ncp->nc_refs > 0); 736 atomic_add_int(&ncp->nc_refs, 1); 737 738 return(ncp); 739 } 740 741 /* 742 * Drop a cache entry. 743 * 744 * The 1->0 transition can only occur after or because the natural ref 745 * is being dropped. If another thread had a temporary ref during the 746 * ncp's destruction, then that other thread might wind up being the 747 * one to drop the last ref. 748 */ 749 static __inline 750 void 751 _cache_drop(struct namecache *ncp) 752 { 753 if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) { 754 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 755 756 /* 757 * Scrap it. 758 */ 759 ncp->nc_refs = -1; /* safety */ 760 if (ncp->nc_name) 761 kfree(ncp->nc_name, M_VFSCACHEAUX); 762 kfree_obj(ncp, M_VFSCACHE); 763 } 764 } 765 766 /* 767 * Link a new namecache entry to its parent and to the hash table. Be 768 * careful to avoid races if vhold() blocks in the future. 769 * 770 * Both ncp and par must be referenced and locked. The reference is 771 * transfered to the nchpp (and, most notably, NOT to the parent list). 772 * 773 * NOTE: The hash table spinlock is held across this call, we can't do 774 * anything fancy. 775 */ 776 static void 777 _cache_link_parent(struct namecache *ncp, struct namecache *par, 778 struct nchash_head *nchpp) 779 { 780 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 781 782 KKASSERT(ncp->nc_parent == NULL); 783 _cache_ncp_gen_enter(ncp); 784 ncp->nc_parent = par; 785 ncp->nc_head = nchpp; 786 787 /* 788 * Set inheritance flags. Note that the parent flags may be 789 * stale due to getattr potentially not having been run yet 790 * (it gets run during nlookup()'s). 791 */ 792 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE); 793 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) 794 ncp->nc_flag |= NCF_SF_PNOCACHE; 795 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) 796 ncp->nc_flag |= NCF_UF_PCACHE; 797 798 /* 799 * Add to hash table and parent, adjust accounting 800 */ 801 TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash); 802 atomic_add_long(&pn->vfscache_count, 1); 803 804 /* 805 * ncp is a new leaf being added to the tree 806 */ 807 if (TAILQ_EMPTY(&ncp->nc_list)) { 808 atomic_add_long(&pn->vfscache_leafs, 1); 809 if (ncp->nc_flag & NCF_UNRESOLVED) 810 atomic_add_long(&pn->vfscache_unres, 1); 811 } 812 813 if (TAILQ_EMPTY(&par->nc_list)) { 814 /* 815 * Parent was, but now is no longer a leaf 816 */ 817 /* 818 * XXX for now don't mess with par's gen, it causes 819 * unnecessary nlookup retries (though not many) 820 */ 821 /*_cache_ncp_gen_enter(par);*/ 822 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 823 if (par->nc_flag & NCF_UNRESOLVED) 824 atomic_add_long(&pn->vfscache_unres, -1); 825 atomic_add_long(&pn->vfscache_leafs, -1); 826 827 /* 828 * Any vp associated with an ncp which has children must 829 * be held to prevent it from being recycled. 
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
		/*_cache_ncp_gen_exit(par);*/
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);		/* add nc_parent ref */
	_cache_ncp_gen_exit(ncp);
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.  XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 *
 * par must be locked and will remain locked on return.
 *
 * nchpp must be spin-locked.  This routine eats the spin-lock.
 */
static __inline void
_cache_unlink_parent(struct namecache *par, struct namecache *ncp,
		     struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct vnode *dropvp;

	KKASSERT(ncp->nc_parent == par);
	cpu_ccfence();
	_cache_ncp_gen_enter(ncp);

	/* don't add a ref, we drop the nchpp ref later */

	/*
	 * Remove from hash table and parent, adjust accounting
	 */
	TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
	TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
	atomic_add_long(&pn->vfscache_count, -1);

	/*
	 * Removing leaf from tree
	 */
	if (TAILQ_EMPTY(&ncp->nc_list)) {
		if (ncp->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, -1);
		atomic_add_long(&pn->vfscache_leafs, -1);
	}

	/*
	 * Parent is now a leaf?
	 */
	dropvp = NULL;
	if (TAILQ_EMPTY(&par->nc_list)) {
		/*
		 * XXX for now don't mess with par's gen, it causes
		 * unnecessary nlookup retries (though not many)
		 */
		/*_cache_ncp_gen_enter(par);*/
		if (par->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, 1);
		atomic_add_long(&pn->vfscache_leafs, 1);
		if (par->nc_vp)
			dropvp = par->nc_vp;
		/*_cache_ncp_gen_exit(par);*/
	}
	ncp->nc_parent = NULL;
	ncp->nc_head = NULL;
	spin_unlock(&nchpp->spin);
	_cache_drop(par);	/* drop ncp's nc_parent ref from (par) */

	/*
	 * We can only safely vdrop with no spinlocks held.
	 */
	if (dropvp)
		vdrop(dropvp);
	_cache_ncp_gen_exit(ncp);
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The returned ncp will be locked and referenced.  The ref is generally meant
 * to be transferred to the nchpp linkage.
925 */ 926 static struct namecache * 927 cache_alloc(int nlen) 928 { 929 struct namecache *ncp; 930 931 ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 932 if (nlen) 933 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK); 934 ncp->nc_nlen = nlen; 935 ncp->nc_flag = NCF_UNRESOLVED; 936 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 937 ncp->nc_refs = 1; /* natural ref */ 938 ncp->nc_generation = 0; /* link/unlink/res/unres op */ 939 TAILQ_INIT(&ncp->nc_list); 940 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 941 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 942 943 return(ncp); 944 } 945 946 /* 947 * Can only be called for the case where the ncp has never been 948 * associated with anything (so no spinlocks are needed). 949 */ 950 static void 951 _cache_free(struct namecache *ncp) 952 { 953 KKASSERT(ncp->nc_refs == 1); 954 if (ncp->nc_name) 955 kfree(ncp->nc_name, M_VFSCACHEAUX); 956 kfree_obj(ncp, M_VFSCACHE); 957 } 958 959 /* 960 * [re]initialize a nchandle. 961 */ 962 void 963 cache_zero(struct nchandle *nch) 964 { 965 nch->ncp = NULL; 966 nch->mount = NULL; 967 } 968 969 /* 970 * Ref and deref a nchandle structure (ncp + mp) 971 * 972 * The caller must specify a stable ncp pointer, typically meaning the 973 * ncp is already referenced but this can also occur indirectly through 974 * e.g. holding a lock on a direct child. 975 * 976 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 977 * use read spinlocks here. 978 */ 979 struct nchandle * 980 cache_hold(struct nchandle *nch) 981 { 982 _cache_hold(nch->ncp); 983 _cache_mntref(nch->mount); 984 return(nch); 985 } 986 987 /* 988 * Create a copy of a namecache handle for an already-referenced 989 * entry. 990 */ 991 void 992 cache_copy(struct nchandle *nch, struct nchandle *target) 993 { 994 struct namecache *ncp; 995 struct mount *mp; 996 struct mntcache_elm *elm; 997 struct namecache *ncpr; 998 int i; 999 1000 ncp = nch->ncp; 1001 mp = nch->mount; 1002 target->ncp = ncp; 1003 target->mount = mp; 1004 1005 elm = _cache_mntcache_hash(ncp); 1006 for (i = 0; i < MNTCACHE_SET; ++i) { 1007 if (elm->ncp == ncp) { 1008 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 1009 if (ncpr == ncp) { 1010 _cache_mntref(mp); 1011 return; 1012 } 1013 if (ncpr) 1014 _cache_drop(ncpr); 1015 } 1016 ++elm; 1017 } 1018 if (ncp) 1019 _cache_hold(ncp); 1020 _cache_mntref(mp); 1021 } 1022 1023 /* 1024 * Drop the nchandle, but try to cache the ref to avoid global atomic 1025 * ops. This is typically done on the system root and jail root nchandles. 
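 * (Values of the elmno argument above 4 bypass the per-cpu cache entirely
 * and the references are dropped immediately, as the code below shows.)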
1026 */ 1027 void 1028 cache_drop_and_cache(struct nchandle *nch, int elmno) 1029 { 1030 struct mntcache_elm *elm; 1031 struct mntcache_elm *best; 1032 struct namecache *ncpr; 1033 int delta1; 1034 int delta2; 1035 int i; 1036 1037 if (elmno > 4) { 1038 if (nch->ncp) { 1039 _cache_drop(nch->ncp); 1040 nch->ncp = NULL; 1041 } 1042 if (nch->mount) { 1043 _cache_mntrel(nch->mount); 1044 nch->mount = NULL; 1045 } 1046 return; 1047 } 1048 1049 elm = _cache_mntcache_hash(nch->ncp); 1050 best = elm; 1051 for (i = 0; i < MNTCACHE_SET; ++i) { 1052 if (elm->ncp == NULL) { 1053 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 1054 _cache_mntrel(nch->mount); 1055 elm->ticks = ticks; 1056 nch->mount = NULL; 1057 nch->ncp = NULL; 1058 if (ncpr) 1059 _cache_drop(ncpr); 1060 return; 1061 } 1062 delta1 = ticks - best->ticks; 1063 delta2 = ticks - elm->ticks; 1064 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 1065 best = elm; 1066 ++elm; 1067 } 1068 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 1069 _cache_mntrel(nch->mount); 1070 best->ticks = ticks; 1071 nch->mount = NULL; 1072 nch->ncp = NULL; 1073 if (ncpr) 1074 _cache_drop(ncpr); 1075 } 1076 1077 void 1078 cache_changemount(struct nchandle *nch, struct mount *mp) 1079 { 1080 _cache_mntref(mp); 1081 _cache_mntrel(nch->mount); 1082 nch->mount = mp; 1083 } 1084 1085 void 1086 cache_drop(struct nchandle *nch) 1087 { 1088 _cache_mntrel(nch->mount); 1089 _cache_drop(nch->ncp); 1090 nch->ncp = NULL; 1091 nch->mount = NULL; 1092 } 1093 1094 /* 1095 * Returns: 1096 * -1 Locked by other 1097 * 0 Not locked 1098 * (v) LK_SHARED or LK_EXCLUSIVE 1099 */ 1100 int 1101 cache_lockstatus(struct nchandle *nch) 1102 { 1103 return(_cache_lockstatus(nch->ncp)); 1104 } 1105 1106 void 1107 cache_lock(struct nchandle *nch) 1108 { 1109 _cache_lock(nch->ncp); 1110 } 1111 1112 /* 1113 * Returns a shared or exclusive-locked ncp. The ncp will only be 1114 * shared-locked if it is already resolved. 1115 */ 1116 void 1117 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1118 { 1119 struct namecache *ncp = nch->ncp; 1120 1121 if (ncp_shared_lock_disable || excl || 1122 (ncp->nc_flag & NCF_UNRESOLVED)) { 1123 _cache_lock(ncp); 1124 } else { 1125 _cache_lock_shared(ncp); 1126 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1127 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1128 _cache_unlock(ncp); 1129 _cache_lock(ncp); 1130 } 1131 } else { 1132 _cache_unlock(ncp); 1133 _cache_lock(ncp); 1134 } 1135 } 1136 } 1137 1138 /* 1139 * Lock fncpd, fncp, tncpd, and tncp. tncp is already locked but may 1140 * have to be cycled to avoid deadlocks. Make sure all four are resolved. 1141 * 1142 * The caller is responsible for checking the validity upon return as 1143 * the records may have been flagged DESTROYED in the interim. 1144 * 1145 * Namecache lock ordering is leaf first, then parent. However, complex 1146 * interactions may occur between the source and target because there is 1147 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp). 1148 */ 1149 void 1150 cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp, 1151 struct nchandle *tncpd, struct nchandle *tncp, 1152 struct ucred *fcred, struct ucred *tcred) 1153 { 1154 int tlocked = 1; 1155 u_int dummy_gen = 0; 1156 1157 /* 1158 * Lock tncp and tncpd 1159 * 1160 * NOTE: Because these ncps are not locked to begin with, it is 1161 * possible for other rename races to cause the normal lock 1162 * order assumptions to fail. 
1163 * 1164 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1165 * matches after the leaf has been locked. However, ordering 1166 * between the 'from' and the 'to' is not and an overlapping 1167 * lock order reversal is still possible. 1168 */ 1169 again: 1170 if (__predict_false(tlocked == 0)) { 1171 cache_lock(tncp); 1172 } 1173 if (__predict_false(cache_lock_nonblock(tncpd) != 0)) { 1174 cache_unlock(tncp); 1175 cache_lock(tncpd); /* cycle tncpd lock */ 1176 cache_unlock(tncpd); 1177 tlocked = 0; 1178 goto again; 1179 } 1180 1181 /* 1182 * Lock fncp and fncpd 1183 * 1184 * NOTE: Because these ncps are not locked to begin with, it is 1185 * possible for other rename races to cause the normal lock 1186 * order assumptions to fail. 1187 * 1188 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1189 * matches after the leaf has been locked. However, ordering 1190 * between the 'from' and the 'to' is not and an overlapping 1191 * lock order reversal is still possible. 1192 */ 1193 if (__predict_false(cache_lock_nonblock(fncp) != 0)) { 1194 cache_unlock(tncpd); 1195 cache_unlock(tncp); 1196 cache_lock(fncp); /* cycle fncp lock */ 1197 cache_unlock(fncp); 1198 tlocked = 0; 1199 goto again; 1200 } 1201 1202 if (__predict_false(cache_lock_nonblock(fncpd) != 0)) { 1203 cache_unlock(fncp); 1204 cache_unlock(tncpd); 1205 cache_unlock(tncp); 1206 cache_lock(fncpd); 1207 cache_unlock(fncpd); /* cycle fncpd lock */ 1208 tlocked = 0; 1209 goto again; 1210 } 1211 1212 if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1213 cache_resolve(fncpd, &dummy_gen, fcred); 1214 if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1215 cache_resolve(tncpd, &dummy_gen, tcred); 1216 if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1217 cache_resolve(fncp, &dummy_gen, fcred); 1218 if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1219 cache_resolve(tncp, &dummy_gen, tcred); 1220 } 1221 1222 int 1223 cache_lock_nonblock(struct nchandle *nch) 1224 { 1225 return(_cache_lock_nonblock(nch->ncp)); 1226 } 1227 1228 void 1229 cache_unlock(struct nchandle *nch) 1230 { 1231 _cache_unlock(nch->ncp); 1232 } 1233 1234 /* 1235 * ref-and-lock, unlock-and-deref functions. 1236 * 1237 * This function is primarily used by nlookup. Even though cache_lock 1238 * holds the vnode, it is possible that the vnode may have already 1239 * initiated a recyclement. 1240 * 1241 * We want cache_get() to return a definitively usable vnode or a 1242 * definitively unresolved ncp. 1243 */ 1244 static 1245 struct namecache * 1246 _cache_get(struct namecache *ncp) 1247 { 1248 _cache_hold(ncp); 1249 _cache_lock(ncp); 1250 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1251 _cache_setunresolved(ncp, 1); 1252 return(ncp); 1253 } 1254 1255 /* 1256 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1257 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1258 * valid. Otherwise an exclusive lock will be acquired instead. 
1259 */ 1260 static 1261 struct namecache * 1262 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1263 { 1264 if (ncp_shared_lock_disable || excl || 1265 (ncp->nc_flag & NCF_UNRESOLVED)) 1266 { 1267 return(_cache_get(ncp)); 1268 } 1269 _cache_hold(ncp); 1270 _cache_lock_shared(ncp); 1271 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1272 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1273 _cache_unlock(ncp); 1274 ncp = _cache_get(ncp); 1275 _cache_drop(ncp); 1276 } 1277 } else { 1278 _cache_unlock(ncp); 1279 ncp = _cache_get(ncp); 1280 _cache_drop(ncp); 1281 } 1282 return(ncp); 1283 } 1284 1285 /* 1286 * NOTE: The same nchandle can be passed for both arguments. 1287 */ 1288 void 1289 cache_get(struct nchandle *nch, struct nchandle *target) 1290 { 1291 KKASSERT(nch->ncp->nc_refs > 0); 1292 target->mount = nch->mount; 1293 target->ncp = _cache_get(nch->ncp); 1294 _cache_mntref(target->mount); 1295 } 1296 1297 void 1298 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1299 { 1300 KKASSERT(nch->ncp->nc_refs > 0); 1301 target->mount = nch->mount; 1302 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1303 _cache_mntref(target->mount); 1304 } 1305 1306 /* 1307 * Release a held and locked ncp 1308 */ 1309 static __inline 1310 void 1311 _cache_put(struct namecache *ncp) 1312 { 1313 _cache_unlock(ncp); 1314 _cache_drop(ncp); 1315 } 1316 1317 void 1318 cache_put(struct nchandle *nch) 1319 { 1320 _cache_mntrel(nch->mount); 1321 _cache_put(nch->ncp); 1322 nch->ncp = NULL; 1323 nch->mount = NULL; 1324 } 1325 1326 /* 1327 * Resolve an unresolved ncp by associating a vnode with it. If the 1328 * vnode is NULL, a negative cache entry is created. 1329 * 1330 * The ncp should be locked on entry and will remain locked on return. 1331 */ 1332 static 1333 void 1334 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp, 1335 int adjgen) 1336 { 1337 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1338 1339 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1340 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1341 ncp->nc_vp == NULL); 1342 1343 if (adjgen) 1344 _cache_ncp_gen_enter(ncp); 1345 1346 if (vp) { 1347 /* 1348 * Any vp associated with an ncp which has children must 1349 * be held. Any vp associated with a locked ncp must be held. 1350 */ 1351 if (!TAILQ_EMPTY(&ncp->nc_list)) 1352 vhold(vp); 1353 spin_lock(&vp->v_spin); 1354 ncp->nc_vp = vp; 1355 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1356 ++vp->v_namecache_count; 1357 _cache_hold(ncp); /* v_namecache assoc */ 1358 spin_unlock(&vp->v_spin); 1359 vhold(vp); /* nc_vp */ 1360 1361 /* 1362 * Set auxiliary flags 1363 */ 1364 switch(vp->v_type) { 1365 case VDIR: 1366 ncp->nc_flag |= NCF_ISDIR; 1367 break; 1368 case VLNK: 1369 ncp->nc_flag |= NCF_ISSYMLINK; 1370 /* XXX cache the contents of the symlink */ 1371 break; 1372 default: 1373 break; 1374 } 1375 1376 ncp->nc_error = 0; 1377 1378 /* 1379 * XXX: this is a hack to work-around the lack of a real pfs vfs 1380 * implementation 1381 */ 1382 if (mp) { 1383 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1384 vp->v_pfsmp = mp; 1385 } 1386 } else { 1387 /* 1388 * When creating a negative cache hit we set the 1389 * namecache_gen. A later resolve will clean out the 1390 * negative cache hit if the mount point's namecache_gen 1391 * has changed. Used by devfs, could also be used by 1392 * other remote FSs. 
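 * (VFS_NCPGEN_SET() below stamps the ncp with the mount's current
 * generation; _cache_auto_unresolve_test() later compares it via
 * VFS_NCPGEN_TEST() so stale negative hits can be unresolved.)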
1393 */ 1394 ncp->nc_vp = NULL; 1395 ncp->nc_negcpu = mycpu->gd_cpuid; 1396 spin_lock(&pn->neg_spin); 1397 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1398 _cache_hold(ncp); /* neg_list assoc */ 1399 ++pn->neg_count; 1400 spin_unlock(&pn->neg_spin); 1401 atomic_add_long(&pn->vfscache_negs, 1); 1402 1403 ncp->nc_error = ENOENT; 1404 if (mp) 1405 VFS_NCPGEN_SET(mp, ncp); 1406 } 1407 1408 /* 1409 * Previously unresolved leaf is now resolved. 1410 * 1411 * Clear the NCF_UNRESOLVED flag last (see cache_nlookup_nonlocked()). 1412 * We only adjust vfscache_unres for ncp's that are in the tree. 1413 */ 1414 if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent) 1415 atomic_add_long(&pn->vfscache_unres, -1); 1416 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1417 if (adjgen) 1418 _cache_ncp_gen_exit(ncp); 1419 } 1420 1421 void 1422 cache_setvp(struct nchandle *nch, struct vnode *vp) 1423 { 1424 _cache_setvp(nch->mount, nch->ncp, vp, 1); 1425 } 1426 1427 /* 1428 * Used for NFS 1429 */ 1430 void 1431 cache_settimeout(struct nchandle *nch, int nticks) 1432 { 1433 struct namecache *ncp = nch->ncp; 1434 1435 if ((ncp->nc_timeout = ticks + nticks) == 0) 1436 ncp->nc_timeout = 1; 1437 } 1438 1439 /* 1440 * Disassociate the vnode or negative-cache association and mark a 1441 * namecache entry as unresolved again. Note that the ncp is still 1442 * left in the hash table and still linked to its parent. 1443 * 1444 * The ncp should be locked and refd on entry and will remain locked and refd 1445 * on return. 1446 * 1447 * This routine is normally never called on a directory containing children. 1448 * However, NFS often does just that in its rename() code as a cop-out to 1449 * avoid complex namespace operations. This disconnects a directory vnode 1450 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1451 * sync. 1452 * 1453 */ 1454 static 1455 void 1456 _cache_setunresolved(struct namecache *ncp, int adjgen) 1457 { 1458 struct vnode *vp; 1459 1460 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1461 struct pcpu_ncache *pn; 1462 1463 if (adjgen) 1464 _cache_ncp_gen_enter(ncp); 1465 1466 /* 1467 * Is a resolved or destroyed leaf now becoming unresolved? 1468 * Only adjust vfscache_unres for linked ncp's. 1469 */ 1470 if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent) { 1471 pn = &pcpu_ncache[mycpu->gd_cpuid]; 1472 atomic_add_long(&pn->vfscache_unres, 1); 1473 } 1474 1475 ncp->nc_flag |= NCF_UNRESOLVED; 1476 ncp->nc_timeout = 0; 1477 ncp->nc_error = ENOTCONN; 1478 if ((vp = ncp->nc_vp) != NULL) { 1479 spin_lock(&vp->v_spin); 1480 ncp->nc_vp = NULL; 1481 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1482 --vp->v_namecache_count; 1483 spin_unlock(&vp->v_spin); 1484 1485 /* 1486 * Any vp associated with an ncp with children is 1487 * held by that ncp. Any vp associated with ncp 1488 * is held by that ncp. These conditions must be 1489 * undone when the vp is cleared out from the ncp. 
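			 * (The conditional vdrop() below balances the hold
			 * placed on the vnode while the ncp has children;
			 * the unconditional vdrop() balances the nc_vp hold
			 * from _cache_setvp().)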
1490 */ 1491 if (!TAILQ_EMPTY(&ncp->nc_list)) 1492 vdrop(vp); 1493 vdrop(vp); 1494 } else { 1495 pn = &pcpu_ncache[ncp->nc_negcpu]; 1496 1497 atomic_add_long(&pn->vfscache_negs, -1); 1498 spin_lock(&pn->neg_spin); 1499 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1500 --pn->neg_count; 1501 spin_unlock(&pn->neg_spin); 1502 } 1503 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1504 1505 if (adjgen) 1506 _cache_ncp_gen_exit(ncp); 1507 _cache_drop(ncp); /* from v_namecache or neg_list */ 1508 } 1509 } 1510 1511 /* 1512 * The cache_nresolve() code calls this function to automatically 1513 * set a resolved cache element to unresolved if it has timed out 1514 * or if it is a negative cache hit and the mount point namecache_gen 1515 * has changed. 1516 */ 1517 static __inline int 1518 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1519 { 1520 /* 1521 * Try to zap entries that have timed out. We have 1522 * to be careful here because locked leafs may depend 1523 * on the vnode remaining intact in a parent, so only 1524 * do this under very specific conditions. 1525 */ 1526 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1527 TAILQ_EMPTY(&ncp->nc_list)) { 1528 return 1; 1529 } 1530 1531 /* 1532 * If a resolved negative cache hit is invalid due to 1533 * the mount's namecache generation being bumped, zap it. 1534 */ 1535 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1536 return 1; 1537 } 1538 1539 /* 1540 * Otherwise we are good 1541 */ 1542 return 0; 1543 } 1544 1545 static __inline void 1546 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1547 { 1548 /* 1549 * Already in an unresolved state, nothing to do. 1550 */ 1551 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1552 if (_cache_auto_unresolve_test(mp, ncp)) 1553 _cache_setunresolved(ncp, 1); 1554 } 1555 } 1556 1557 void 1558 cache_setunresolved(struct nchandle *nch) 1559 { 1560 _cache_setunresolved(nch->ncp, 1); 1561 } 1562 1563 /* 1564 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1565 * looking for matches. This flag tells the lookup code when it must 1566 * check for a mount linkage and also prevents the directories in question 1567 * from being deleted or renamed. 1568 */ 1569 static 1570 int 1571 cache_clrmountpt_callback(struct mount *mp, void *data) 1572 { 1573 struct nchandle *nch = data; 1574 1575 if (mp->mnt_ncmounton.ncp == nch->ncp) 1576 return(1); 1577 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1578 return(1); 1579 return(0); 1580 } 1581 1582 /* 1583 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1584 * with a mount point. 1585 */ 1586 void 1587 cache_clrmountpt(struct nchandle *nch) 1588 { 1589 int count; 1590 1591 count = mountlist_scan(cache_clrmountpt_callback, nch, 1592 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | 1593 MNTSCAN_NOUNLOCK); 1594 if (count == 0) 1595 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1596 } 1597 1598 /* 1599 * Invalidate portions of the namecache topology given a starting entry. 1600 * The passed ncp is set to an unresolved state and: 1601 * 1602 * The passed ncp must be referenced and locked. The routine may unlock 1603 * and relock ncp several times, and will recheck the children and loop 1604 * to catch races. When done the passed ncp will be returned with the 1605 * reference and lock intact. 1606 * 1607 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1608 * that the physical underlying nodes have been 1609 * destroyed... as in deleted. For example, when 1610 * a directory is removed. 
 *			  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp; even if
 *			  CINV_CHILDREN is specified the children are not
 *			  flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT NOT* have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			cache_zap(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
1697 */ 1698 static int 1699 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1700 { 1701 struct namecache *nextkid; 1702 int rcnt = 0; 1703 1704 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1705 1706 _cache_ncp_gen_enter(ncp); 1707 _cache_setunresolved(ncp, 0); 1708 if (flags & CINV_DESTROY) { 1709 ncp->nc_flag |= NCF_DESTROYED; 1710 cpu_sfence(); 1711 } 1712 1713 while ((flags & CINV_CHILDREN) && 1714 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1715 ) { 1716 struct namecache *kid; 1717 int restart; 1718 1719 restart = 0; 1720 _cache_hold(nextkid); 1721 if (++track->depth > MAX_RECURSION_DEPTH) { 1722 track->resume_ncp = ncp; 1723 _cache_hold(ncp); 1724 ++rcnt; 1725 } 1726 while ((kid = nextkid) != NULL) { 1727 /* 1728 * Parent (ncp) must be locked for the iteration. 1729 */ 1730 nextkid = NULL; 1731 if (kid->nc_parent != ncp) { 1732 _cache_drop(kid); 1733 kprintf("cache_inval_internal restartA %s\n", 1734 ncp->nc_name); 1735 restart = 1; 1736 break; 1737 } 1738 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1739 _cache_hold(nextkid); 1740 1741 /* 1742 * Parent unlocked for this section to avoid 1743 * deadlocks. Then lock the kid and check for 1744 * races. 1745 */ 1746 _cache_unlock(ncp); 1747 if (track->resume_ncp) { 1748 _cache_drop(kid); 1749 _cache_lock(ncp); 1750 break; 1751 } 1752 _cache_lock(kid); 1753 if (kid->nc_parent != ncp) { 1754 kprintf("cache_inval_internal " 1755 "restartB %s\n", 1756 ncp->nc_name); 1757 restart = 1; 1758 _cache_unlock(kid); 1759 _cache_drop(kid); 1760 _cache_lock(ncp); 1761 break; 1762 } 1763 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1764 TAILQ_FIRST(&kid->nc_list) 1765 ) { 1766 1767 rcnt += _cache_inval_internal(kid, 1768 flags & ~CINV_DESTROY, track); 1769 /*_cache_unlock(kid);*/ 1770 /*_cache_drop(kid);*/ 1771 cache_zap(kid); 1772 } else { 1773 cache_zap(kid); 1774 } 1775 1776 /* 1777 * Relock parent to continue scan 1778 */ 1779 _cache_lock(ncp); 1780 } 1781 if (nextkid) 1782 _cache_drop(nextkid); 1783 --track->depth; 1784 if (restart == 0) 1785 break; 1786 } 1787 1788 /* 1789 * Someone could have gotten in there while ncp was unlocked, 1790 * retry if so. 1791 */ 1792 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1793 ++rcnt; 1794 _cache_ncp_gen_exit(ncp); 1795 1796 return (rcnt); 1797 } 1798 1799 /* 1800 * Invalidate a vnode's namecache associations. To avoid races against 1801 * the resolver we do not invalidate a node which we previously invalidated 1802 * but which was then re-resolved while we were in the invalidation loop. 1803 * 1804 * Returns non-zero if any namecache entries remain after the invalidation 1805 * loop completed. 1806 * 1807 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1808 * be ripped out of the topology while held, the vnode's v_namecache 1809 * list has no such restriction. NCP's can be ripped out of the list 1810 * at virtually any time if not locked, even if held. 1811 * 1812 * In addition, the v_namecache list itself must be locked via 1813 * the vnode's spinlock. 
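 * (The loops below hold the next list entry before releasing v_spin so
 * that the scan position survives the blocking _cache_lock() on the
 * current entry.)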
1814 */ 1815 int 1816 cache_inval_vp(struct vnode *vp, int flags) 1817 { 1818 struct namecache *ncp; 1819 struct namecache *next; 1820 1821 restart: 1822 spin_lock(&vp->v_spin); 1823 ncp = TAILQ_FIRST(&vp->v_namecache); 1824 if (ncp) 1825 _cache_hold(ncp); 1826 while (ncp) { 1827 /* loop entered with ncp held and vp spin-locked */ 1828 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1829 _cache_hold(next); 1830 spin_unlock(&vp->v_spin); 1831 _cache_lock(ncp); 1832 if (ncp->nc_vp != vp) { 1833 kprintf("Warning: cache_inval_vp: race-A detected on " 1834 "%s\n", ncp->nc_name); 1835 _cache_put(ncp); 1836 if (next) 1837 _cache_drop(next); 1838 goto restart; 1839 } 1840 _cache_inval(ncp, flags); 1841 _cache_put(ncp); /* also releases reference */ 1842 ncp = next; 1843 spin_lock(&vp->v_spin); 1844 if (ncp && ncp->nc_vp != vp) { 1845 spin_unlock(&vp->v_spin); 1846 kprintf("Warning: cache_inval_vp: race-B detected on " 1847 "%s\n", ncp->nc_name); 1848 _cache_drop(ncp); 1849 goto restart; 1850 } 1851 } 1852 spin_unlock(&vp->v_spin); 1853 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1854 } 1855 1856 /* 1857 * This routine is used instead of the normal cache_inval_vp() when we 1858 * are trying to recycle otherwise good vnodes. 1859 * 1860 * Return 0 on success, non-zero if not all namecache records could be 1861 * disassociated from the vnode (for various reasons). 1862 */ 1863 int 1864 cache_inval_vp_nonblock(struct vnode *vp) 1865 { 1866 struct namecache *ncp; 1867 struct namecache *next; 1868 1869 spin_lock(&vp->v_spin); 1870 1871 ncp = TAILQ_FIRST(&vp->v_namecache); 1872 if (ncp) 1873 _cache_hold(ncp); 1874 1875 while (ncp) { 1876 /* loop entered with ncp held */ 1877 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1878 _cache_hold(next); 1879 spin_unlock(&vp->v_spin); 1880 if (_cache_lock_nonblock(ncp)) { 1881 _cache_drop(ncp); 1882 if (next) 1883 _cache_drop(next); 1884 goto done; 1885 } 1886 if (ncp->nc_vp != vp) { 1887 kprintf("Warning: cache_inval_vp: race-A detected on " 1888 "%s\n", ncp->nc_name); 1889 _cache_put(ncp); 1890 if (next) 1891 _cache_drop(next); 1892 goto done; 1893 } 1894 _cache_inval(ncp, 0); 1895 _cache_put(ncp); /* also releases reference */ 1896 ncp = next; 1897 spin_lock(&vp->v_spin); 1898 if (ncp && ncp->nc_vp != vp) { 1899 spin_unlock(&vp->v_spin); 1900 kprintf("Warning: cache_inval_vp: race-B detected on " 1901 "%s\n", ncp->nc_name); 1902 _cache_drop(ncp); 1903 goto done; 1904 } 1905 } 1906 spin_unlock(&vp->v_spin); 1907 done: 1908 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1909 } 1910 1911 /* 1912 * Attempt to quickly invalidate the vnode's namecache entry. This function 1913 * will also dive the ncp and free its children but only if they are trivial. 1914 * All locks are non-blocking and the function will fail if required locks 1915 * cannot be obtained. 1916 * 1917 * We want this sort of function to be able to guarantee progress when vnlru 1918 * wants to recycle a vnode. Directories could otherwise get stuck and not 1919 * be able to recycle due to destroyed or unresolved children in the 1920 * namecache. 
1921 */ 1922 void 1923 cache_inval_vp_quick(struct vnode *vp) 1924 { 1925 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1926 struct namecache *ncp; 1927 struct namecache *kid; 1928 1929 spin_lock(&vp->v_spin); 1930 while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) { 1931 _cache_hold(ncp); 1932 spin_unlock(&vp->v_spin); 1933 if (_cache_lock_nonblock(ncp)) { 1934 _cache_drop(ncp); 1935 return; 1936 } 1937 1938 /* 1939 * Try to trivially destroy any children. 1940 */ 1941 while ((kid = TAILQ_FIRST(&ncp->nc_list)) != NULL) { 1942 struct nchash_head *nchpp; 1943 1944 /* 1945 * Early test without the lock. Give-up if the 1946 * child has children of its own, the child is 1947 * positively-resolved, or the ref-count is 1948 * unexpected. 1949 */ 1950 if (TAILQ_FIRST(&kid->nc_list) || 1951 kid->nc_vp || 1952 kid->nc_refs != ncpbaserefs(kid)) 1953 { 1954 _cache_put(ncp); 1955 return; 1956 } 1957 1958 _cache_hold(kid); 1959 if (_cache_lock_nonblock(kid)) { 1960 _cache_drop(kid); 1961 _cache_put(ncp); 1962 return; 1963 } 1964 1965 /* 1966 * A destruction/free test requires the parent, 1967 * the kid, and the hash table to be locked. Note 1968 * that the kid may still be on the negative cache 1969 * list. 1970 */ 1971 nchpp = kid->nc_head; 1972 spin_lock(&nchpp->spin); 1973 1974 /* 1975 * Give up if the child isn't trivial. It can be 1976 * resolved or unresolved but must not have a vp. 1977 */ 1978 if (kid->nc_parent != ncp || 1979 kid->nc_vp || 1980 TAILQ_FIRST(&kid->nc_list) || 1981 kid->nc_refs != 1 + ncpbaserefs(kid)) 1982 { 1983 spin_unlock(&nchpp->spin); 1984 _cache_put(kid); 1985 _cache_put(ncp); 1986 return; 1987 } 1988 1989 ++pn->inv_kid_quick_count; 1990 1991 /* 1992 * We can safely destroy the kid. It may still 1993 * have extra refs due to ncneglist races, but since 1994 * we checked above with the lock held those races 1995 * will self-resolve. 1996 * 1997 * With these actions the kid should nominally 1998 * have just its natural ref plus our ref. 1999 * 2000 * This is only safe because we hold locks on 2001 * the parent, the kid, and the nchpp. The only 2002 * lock we don't have is on the ncneglist and that 2003 * can race a ref, but as long as we unresolve the 2004 * kid before executing our final drop the ncneglist 2005 * code path(s) will just drop their own ref so all 2006 * is good. 2007 */ 2008 _cache_unlink_parent(ncp, kid, nchpp); 2009 _cache_setunresolved(kid, 1); 2010 if (kid->nc_refs != 2) { 2011 kprintf("Warning: kid %p unexpected refs=%d " 2012 "%08x %s\n", 2013 kid, kid->nc_refs, 2014 kid->nc_flag, kid->nc_name); 2015 } 2016 _cache_put(kid); /* drop our ref and lock */ 2017 _cache_drop(kid); /* drop natural ref to destroy */ 2018 } 2019 2020 /* 2021 * Now check ncp itself against our expectations. With 2022 * no children left we have our ref plus whether it is 2023 * resolved or not (which it has to be, actually, since it 2024 * is hanging off the vp->v_namecache). 2025 */ 2026 if (ncp->nc_refs != 1 + ncpbaserefs(ncp)) { 2027 _cache_put(ncp); 2028 spin_lock(&vp->v_spin); 2029 break; 2030 } 2031 2032 ++pn->inv_ncp_quick_count; 2033 2034 /* 2035 * Success, disassociate and release the ncp. Do not 2036 * try to zap it here. 2037 * 2038 * NOTE: Releasing the ncp here leaves it in the tree, 2039 * but since we have disassociated the vnode this 2040 * ncp entry becomes 'trivial' and successive calls 2041 * to cache_inval_vp_quick() will be able to continue 2042 * to make progress. 
2043 */ 2044 _cache_setunresolved(ncp, 1); 2045 _cache_put(ncp); 2046 spin_lock(&vp->v_spin); 2047 } 2048 spin_unlock(&vp->v_spin); 2049 } 2050 2051 /* 2052 * Clears the universal directory search 'ok' flag. This flag allows 2053 * nlookup() to bypass normal vnode checks. This flag is a cached flag 2054 * so clearing it simply forces revalidation. 2055 */ 2056 void 2057 cache_inval_wxok(struct vnode *vp) 2058 { 2059 struct namecache *ncp; 2060 2061 spin_lock(&vp->v_spin); 2062 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 2063 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 2064 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 2065 } 2066 spin_unlock(&vp->v_spin); 2067 } 2068 2069 /* 2070 * The source ncp has been renamed to the target ncp. All elements have been 2071 * locked, including the parent ncp's. 2072 * 2073 * The target ncp is destroyed (as a normal rename-over would destroy the 2074 * target file or directory). 2075 * 2076 * Because there may be references to the source ncp we cannot copy its 2077 * contents to the target. Instead the source ncp is relinked as the target 2078 * and the target ncp is removed from the namecache topology. 2079 */ 2080 void 2081 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 2082 { 2083 struct namecache *fncp = fnch->ncp; 2084 struct namecache *tncp = tnch->ncp; 2085 struct namecache *par; 2086 struct nchash_head *nchpp; 2087 u_int32_t hash; 2088 char *oname; 2089 char *nname; 2090 2091 if (tncp->nc_nlen) { 2092 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK); 2093 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 2094 nname[tncp->nc_nlen] = 0; 2095 } else { 2096 nname = NULL; 2097 } 2098 2099 /* 2100 * Rename fncp (unlink) 2101 */ 2102 if (fncp->nc_parent) { 2103 par = fncp->nc_parent; 2104 _cache_hold(par); 2105 _cache_lock(par); 2106 nchpp = fncp->nc_head; 2107 spin_lock(&nchpp->spin); 2108 _cache_unlink_parent(par, fncp, nchpp); /* eats nchpp */ 2109 _cache_put(par); 2110 } else { 2111 par = NULL; 2112 nchpp = NULL; 2113 } 2114 oname = fncp->nc_name; 2115 fncp->nc_name = nname; 2116 fncp->nc_nlen = tncp->nc_nlen; 2117 if (oname) 2118 kfree(oname, M_VFSCACHEAUX); 2119 2120 par = tncp->nc_parent; 2121 KKASSERT(par->nc_lock.lk_lockholder == curthread); 2122 2123 /* 2124 * Rename fncp (relink) 2125 */ 2126 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 2127 hash = fnv_32_buf(&par, sizeof(par), hash); 2128 nchpp = NCHHASH(hash); 2129 2130 spin_lock(&nchpp->spin); 2131 _cache_link_parent(fncp, par, nchpp); 2132 spin_unlock(&nchpp->spin); 2133 2134 /* 2135 * Get rid of the overwritten tncp (unlink) 2136 */ 2137 _cache_unlink(tncp); 2138 } 2139 2140 /* 2141 * Perform actions consistent with unlinking a file. The passed-in ncp 2142 * must be locked. 2143 * 2144 * The ncp is marked DESTROYED so it no longer shows up in searches, 2145 * and will be physically deleted when the vnode goes away. 2146 * 2147 * If the related vnode has no refs then we cycle it through vget()/vput() 2148 * to (possibly if we don't have a ref race) trigger a deactivation, 2149 * allowing the VFS to trivially detect and recycle the deleted vnode 2150 * via VOP_INACTIVE(). 2151 * 2152 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 2153 * target ncp. 
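 *
 * Illustrative sketch (assumed VOP_NREMOVE-style caller, with 'nch'
 * the caller's locked nchandle for the victim name; the media removal
 * step is hypothetical and abbreviated):
 *
 *	error = ... filesystem removes the name on media ...;
 *	if (error == 0)
 *		cache_unlink(nch);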
2154 */ 2155 void 2156 cache_unlink(struct nchandle *nch) 2157 { 2158 _cache_unlink(nch->ncp); 2159 } 2160 2161 static void 2162 _cache_unlink(struct namecache *ncp) 2163 { 2164 struct vnode *vp; 2165 2166 /* 2167 * Causes lookups to fail and allows another ncp with the same 2168 * name to be created under ncp->nc_parent. 2169 */ 2170 _cache_ncp_gen_enter(ncp); 2171 ncp->nc_flag |= NCF_DESTROYED; 2172 2173 /* 2174 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 2175 * force action on the 1->0 transition. Do not destroy the 2176 * vp association if a vp is present (leave the destroyed ncp 2177 * resolved through the vp finalization). 2178 * 2179 * Cleanup the refs in the resolved-not-found case by setting 2180 * the ncp to an unresolved state. This improves our ability 2181 * to get rid of dead ncp elements in other cache_*() routines. 2182 */ 2183 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 2184 vp = ncp->nc_vp; 2185 if (vp) { 2186 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 2187 if (VREFCNT(vp) <= 0) { 2188 if (vget(vp, LK_SHARED) == 0) 2189 vput(vp); 2190 } 2191 } else { 2192 _cache_setunresolved(ncp, 0); 2193 } 2194 } 2195 _cache_ncp_gen_exit(ncp); 2196 } 2197 2198 /* 2199 * Return non-zero if the nch might be associated with an open and/or mmap()'d 2200 * file. The easy solution is to just return non-zero if the vnode has refs. 2201 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 2202 * force the reclaim). 2203 */ 2204 int 2205 cache_isopen(struct nchandle *nch) 2206 { 2207 struct vnode *vp; 2208 struct namecache *ncp = nch->ncp; 2209 2210 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2211 (vp = ncp->nc_vp) != NULL && 2212 VREFCNT(vp)) { 2213 return 1; 2214 } 2215 return 0; 2216 } 2217 2218 2219 /* 2220 * vget the vnode associated with the namecache entry. Resolve the namecache 2221 * entry if necessary. The passed ncp must be referenced and locked. If 2222 * the ncp is resolved it might be locked shared. 2223 * 2224 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 2225 * (depending on the passed lk_type) will be returned in *vpp with an error 2226 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2227 * most typical error is ENOENT, meaning that the ncp represents a negative 2228 * cache hit and there is no vnode to retrieve, but other errors can occur 2229 * too. 2230 * 2231 * The vget() can race a reclaim. If this occurs we re-resolve the 2232 * namecache entry. 2233 * 2234 * There are numerous places in the kernel where vget() is called on a 2235 * vnode while one or more of its namecache entries is locked. Releasing 2236 * a vnode never deadlocks against locked namecache entries (the vnode 2237 * will not get recycled while referenced ncp's exist). This means we 2238 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2239 * lock when acquiring the vp lock or we might cause a deadlock. 2240 * 2241 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2242 * unresolved. If a reclaim race occurs the passed-in ncp will be 2243 * relocked exclusively before being re-resolved. 
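 *
 * Typical usage, as an illustrative sketch only (assumes 'nch' is an
 * nchandle already referenced and locked by the caller, e.g. from a
 * prior cache_nlookup(); error handling trimmed):
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = cache_vget(&nch, cred, LK_SHARED, &vp);
 *	if (error == 0) {
 *		... operate on the ref'd, locked vp ...
 *		vput(vp);
 *	}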
2244 */ 2245 int 2246 cache_vget(struct nchandle *nch, struct ucred *cred, 2247 int lk_type, struct vnode **vpp) 2248 { 2249 struct namecache *ncp; 2250 struct vnode *vp; 2251 int error; 2252 u_int dummy_gen = 0; 2253 2254 ncp = nch->ncp; 2255 again: 2256 vp = NULL; 2257 if (ncp->nc_flag & NCF_UNRESOLVED) 2258 error = cache_resolve(nch, &dummy_gen, cred); 2259 else 2260 error = 0; 2261 2262 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2263 error = vget(vp, lk_type); 2264 if (error) { 2265 /* 2266 * VRECLAIM race 2267 * 2268 * The ncp may have been locked shared, we must relock 2269 * it exclusively before we can set it to unresolved. 2270 */ 2271 if (error == ENOENT) { 2272 kprintf("Warning: vnode reclaim race detected " 2273 "in cache_vget on %p (%s)\n", 2274 vp, ncp->nc_name); 2275 _cache_unlock(ncp); 2276 _cache_lock(ncp); 2277 _cache_setunresolved(ncp, 1); 2278 goto again; 2279 } 2280 2281 /* 2282 * Not a reclaim race, some other error. 2283 */ 2284 KKASSERT(ncp->nc_vp == vp); 2285 vp = NULL; 2286 } else { 2287 KKASSERT(ncp->nc_vp == vp); 2288 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2289 } 2290 } 2291 if (error == 0 && vp == NULL) 2292 error = ENOENT; 2293 *vpp = vp; 2294 return(error); 2295 } 2296 2297 /* 2298 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode 2299 * is already held by virtuue of the ncp being locked, but it might not be 2300 * referenced and while it is not referenced it can transition into the 2301 * VRECLAIMED state. 2302 * 2303 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2304 * unresolved. If a reclaim race occurs the passed-in ncp will be 2305 * relocked exclusively before being re-resolved. 2306 * 2307 * NOTE: At the moment we have to issue a vget() on the vnode, even though 2308 * we are going to immediately release the lock, in order to resolve 2309 * potential reclamation races. Once we have a solid vnode ref that 2310 * was (at some point) interlocked via a vget(), the vnode will not 2311 * be reclaimed. 2312 * 2313 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation. 2314 */ 2315 int 2316 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2317 { 2318 struct namecache *ncp; 2319 struct vnode *vp; 2320 int error; 2321 int v; 2322 u_int dummy_gen = 0; 2323 2324 ncp = nch->ncp; 2325 again: 2326 vp = NULL; 2327 if (ncp->nc_flag & NCF_UNRESOLVED) 2328 error = cache_resolve(nch, &dummy_gen, cred); 2329 else 2330 error = 0; 2331 2332 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2333 /* 2334 * Try a lockless ref of the vnode. VRECLAIMED transitions 2335 * use the vx_lock state and update-counter mechanism so we 2336 * can detect if one is in-progress or occurred. 2337 * 2338 * If we can successfully ref the vnode and interlock against 2339 * the update-counter mechanism, and VRECLAIMED is found to 2340 * not be set after that, we should be good. 
2341 */ 2342 v = spin_access_start_only(&vp->v_spin); 2343 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2344 vref_special(vp); 2345 if (__predict_false( 2346 spin_access_end_only(&vp->v_spin, v))) { 2347 vrele(vp); 2348 continue; 2349 } 2350 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2351 break; 2352 } 2353 vrele(vp); 2354 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2355 } 2356 2357 /* 2358 * Do it the slow way 2359 */ 2360 error = vget(vp, LK_SHARED); 2361 if (error) { 2362 /* 2363 * VRECLAIM race 2364 */ 2365 if (error == ENOENT) { 2366 kprintf("Warning: vnode reclaim race detected " 2367 "in cache_vget on %p (%s)\n", 2368 vp, ncp->nc_name); 2369 _cache_unlock(ncp); 2370 _cache_lock(ncp); 2371 _cache_setunresolved(ncp, 1); 2372 goto again; 2373 } 2374 2375 /* 2376 * Not a reclaim race, some other error. 2377 */ 2378 KKASSERT(ncp->nc_vp == vp); 2379 vp = NULL; 2380 } else { 2381 KKASSERT(ncp->nc_vp == vp); 2382 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2383 /* caller does not want a lock */ 2384 vn_unlock(vp); 2385 } 2386 break; 2387 } 2388 if (error == 0 && vp == NULL) 2389 error = ENOENT; 2390 *vpp = vp; 2391 2392 return(error); 2393 } 2394 2395 /* 2396 * Return a referenced vnode representing the parent directory of 2397 * ncp. 2398 * 2399 * Because the caller has locked the ncp it should not be possible for 2400 * the parent ncp to go away. However, the parent can unresolve its 2401 * dvp at any time so we must be able to acquire a lock on the parent 2402 * to safely access nc_vp. 2403 * 2404 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2405 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2406 * getting destroyed. 2407 * 2408 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2409 * lock on the ncp in question.. 2410 */ 2411 struct vnode * 2412 cache_dvpref(struct namecache *ncp) 2413 { 2414 struct namecache *par; 2415 struct vnode *dvp; 2416 2417 dvp = NULL; 2418 if ((par = ncp->nc_parent) != NULL) { 2419 _cache_hold(par); 2420 _cache_lock(par); 2421 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2422 if ((dvp = par->nc_vp) != NULL) 2423 vhold(dvp); 2424 } 2425 _cache_unlock(par); 2426 if (dvp) { 2427 if (vget(dvp, LK_SHARED) == 0) { 2428 vn_unlock(dvp); 2429 vdrop(dvp); 2430 /* return refd, unlocked dvp */ 2431 } else { 2432 vdrop(dvp); 2433 dvp = NULL; 2434 } 2435 } 2436 _cache_drop(par); 2437 } 2438 return(dvp); 2439 } 2440 2441 /* 2442 * Convert a directory vnode to a namecache record without any other 2443 * knowledge of the topology. This ONLY works with directory vnodes and 2444 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2445 * returned ncp (if not NULL) will be held and unlocked. 2446 * 2447 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2448 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2449 * for dvp. This will fail only if the directory has been deleted out from 2450 * under the caller. 2451 * 2452 * Callers must always check for a NULL return no matter the value of 'makeit'. 2453 * 2454 * To avoid underflowing the kernel stack each recursive call increments 2455 * the makeit variable. 
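 *
 * Illustrative sketch (assumed NFS-server-style caller turning a
 * file-handle-derived directory vnode back into a namecache handle):
 *
 *	struct nchandle nch;
 *
 *	if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *		... use the held, unlocked nch ...
 *		cache_drop(&nch);
 *	}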
2456 */ 2457 2458 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2459 struct vnode *dvp, char *fakename); 2460 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2461 struct vnode **saved_dvp); 2462 2463 int 2464 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2465 struct nchandle *nch) 2466 { 2467 struct vnode *saved_dvp; 2468 struct vnode *pvp; 2469 char *fakename; 2470 int error; 2471 2472 nch->ncp = NULL; 2473 nch->mount = dvp->v_mount; 2474 saved_dvp = NULL; 2475 fakename = NULL; 2476 2477 /* 2478 * Handle the makeit == 0 degenerate case 2479 */ 2480 if (makeit == 0) { 2481 spin_lock_shared(&dvp->v_spin); 2482 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2483 if (nch->ncp) 2484 cache_hold(nch); 2485 spin_unlock_shared(&dvp->v_spin); 2486 } 2487 2488 /* 2489 * Loop until resolution, inside code will break out on error. 2490 */ 2491 while (makeit) { 2492 /* 2493 * Break out if we successfully acquire a working ncp. 2494 */ 2495 spin_lock_shared(&dvp->v_spin); 2496 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2497 if (nch->ncp) { 2498 cache_hold(nch); 2499 spin_unlock_shared(&dvp->v_spin); 2500 break; 2501 } 2502 spin_unlock_shared(&dvp->v_spin); 2503 2504 /* 2505 * If dvp is the root of its filesystem it should already 2506 * have a namecache pointer associated with it as a side 2507 * effect of the mount, but it may have been disassociated. 2508 */ 2509 if (dvp->v_flag & VROOT) { 2510 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2511 error = cache_resolve_mp(nch->mount, 1); 2512 _cache_put(nch->ncp); 2513 if (ncvp_debug & 1) { 2514 kprintf("cache_fromdvp: resolve root of " 2515 "mount %p error %d", 2516 dvp->v_mount, error); 2517 } 2518 if (error) { 2519 if (ncvp_debug & 1) 2520 kprintf(" failed\n"); 2521 nch->ncp = NULL; 2522 break; 2523 } 2524 if (ncvp_debug & 1) 2525 kprintf(" succeeded\n"); 2526 continue; 2527 } 2528 2529 /* 2530 * If we are recursed too deeply resort to an O(n^2) 2531 * algorithm to resolve the namecache topology. The 2532 * resolved pvp is left referenced in saved_dvp to 2533 * prevent the tree from being destroyed while we loop. 2534 */ 2535 if (makeit > 20) { 2536 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2537 if (error) { 2538 kprintf("lookupdotdot(longpath) failed %d " 2539 "dvp %p\n", error, dvp); 2540 nch->ncp = NULL; 2541 break; 2542 } 2543 continue; 2544 } 2545 2546 /* 2547 * Get the parent directory and resolve its ncp. 2548 */ 2549 if (fakename) { 2550 kfree(fakename, M_TEMP); 2551 fakename = NULL; 2552 } 2553 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2554 &fakename); 2555 if (error) { 2556 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2557 break; 2558 } 2559 vn_unlock(pvp); 2560 2561 /* 2562 * Reuse makeit as a recursion depth counter. On success 2563 * nch will be fully referenced. 2564 */ 2565 cache_fromdvp(pvp, cred, makeit + 1, nch); 2566 vrele(pvp); 2567 if (nch->ncp == NULL) 2568 break; 2569 2570 /* 2571 * Do an inefficient scan of pvp (embodied by ncp) to look 2572 * for dvp. This will create a namecache record for dvp on 2573 * success. We loop up to recheck on success. 2574 * 2575 * ncp and dvp are both held but not locked. 
2576 */ 2577 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2578 if (error) { 2579 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2580 pvp, nch->ncp->nc_name, dvp); 2581 cache_drop(nch); 2582 /* nch was NULLed out, reload mount */ 2583 nch->mount = dvp->v_mount; 2584 break; 2585 } 2586 if (ncvp_debug & 1) { 2587 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2588 pvp, nch->ncp->nc_name); 2589 } 2590 cache_drop(nch); 2591 /* nch was NULLed out, reload mount */ 2592 nch->mount = dvp->v_mount; 2593 } 2594 2595 /* 2596 * If nch->ncp is non-NULL it will have been held already. 2597 */ 2598 if (fakename) 2599 kfree(fakename, M_TEMP); 2600 if (saved_dvp) 2601 vrele(saved_dvp); 2602 if (nch->ncp) 2603 return (0); 2604 return (EINVAL); 2605 } 2606 2607 /* 2608 * Go up the chain of parent directories until we find something 2609 * we can resolve into the namecache. This is very inefficient. 2610 */ 2611 static 2612 int 2613 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2614 struct vnode **saved_dvp) 2615 { 2616 struct nchandle nch; 2617 struct vnode *pvp; 2618 int error; 2619 static time_t last_fromdvp_report; 2620 char *fakename; 2621 2622 /* 2623 * Loop getting the parent directory vnode until we get something we 2624 * can resolve in the namecache. 2625 */ 2626 vref(dvp); 2627 nch.mount = dvp->v_mount; 2628 nch.ncp = NULL; 2629 fakename = NULL; 2630 2631 for (;;) { 2632 if (fakename) { 2633 kfree(fakename, M_TEMP); 2634 fakename = NULL; 2635 } 2636 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2637 &fakename); 2638 if (error) { 2639 vrele(dvp); 2640 break; 2641 } 2642 vn_unlock(pvp); 2643 spin_lock_shared(&pvp->v_spin); 2644 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2645 _cache_hold(nch.ncp); 2646 spin_unlock_shared(&pvp->v_spin); 2647 vrele(pvp); 2648 break; 2649 } 2650 spin_unlock_shared(&pvp->v_spin); 2651 if (pvp->v_flag & VROOT) { 2652 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2653 error = cache_resolve_mp(nch.mount, 1); 2654 _cache_unlock(nch.ncp); 2655 vrele(pvp); 2656 if (error) { 2657 _cache_drop(nch.ncp); 2658 nch.ncp = NULL; 2659 vrele(dvp); 2660 } 2661 break; 2662 } 2663 vrele(dvp); 2664 dvp = pvp; 2665 } 2666 if (error == 0) { 2667 if (last_fromdvp_report != time_uptime) { 2668 last_fromdvp_report = time_uptime; 2669 kprintf("Warning: extremely inefficient path " 2670 "resolution on %s\n", 2671 nch.ncp->nc_name); 2672 } 2673 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2674 2675 /* 2676 * Hopefully dvp now has a namecache record associated with 2677 * it. Leave it referenced to prevent the kernel from 2678 * recycling the vnode. Otherwise extremely long directory 2679 * paths could result in endless recycling. 2680 */ 2681 if (*saved_dvp) 2682 vrele(*saved_dvp); 2683 *saved_dvp = dvp; 2684 _cache_drop(nch.ncp); 2685 } 2686 if (fakename) 2687 kfree(fakename, M_TEMP); 2688 return (error); 2689 } 2690 2691 /* 2692 * Do an inefficient scan of the directory represented by ncp looking for 2693 * the directory vnode dvp. ncp must be held but not locked on entry and 2694 * will be held on return. dvp must be refd but not locked on entry and 2695 * will remain refd on return. 2696 * 2697 * Why do this at all? Well, due to its stateless nature the NFS server 2698 * converts file handles directly to vnodes without necessarily going through 2699 * the namecache ops that would otherwise create the namecache topology 2700 * leading to the vnode. 
We could either (1) Change the namecache algorithms 2701 * to allow disconnected namecache records that are re-merged opportunistically, 2702 * or (2) Make the NFS server backtrack and scan to recover a connected 2703 * namecache topology in order to then be able to issue new API lookups. 2704 * 2705 * It turns out that (1) is a huge mess. It takes a nice clean set of 2706 * namecache algorithms and introduces a lot of complication in every subsystem 2707 * that calls into the namecache to deal with the re-merge case, especially 2708 * since we are using the namecache to placehold negative lookups and the 2709 * vnode might not be immediately assigned. (2) is certainly far less 2710 * efficient than (1), but since we are only talking about directories here 2711 * (which are likely to remain cached), the case does not actually run all 2712 * that often and has the supreme advantage of not polluting the namecache 2713 * algorithms. 2714 * 2715 * If a fakename is supplied just construct a namecache entry using the 2716 * fake name. 2717 */ 2718 static int 2719 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2720 struct vnode *dvp, char *fakename) 2721 { 2722 struct nlcomponent nlc; 2723 struct nchandle rncp; 2724 struct dirent *den; 2725 struct vnode *pvp; 2726 struct vattr vat; 2727 struct iovec iov; 2728 struct uio uio; 2729 int blksize; 2730 int eofflag; 2731 int bytes; 2732 char *rbuf; 2733 int error; 2734 2735 vat.va_blocksize = 0; 2736 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2737 return (error); 2738 cache_lock(nch); 2739 error = cache_vref(nch, cred, &pvp); 2740 cache_unlock(nch); 2741 if (error) 2742 return (error); 2743 if (ncvp_debug & 1) { 2744 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2745 "vattr fileid = %lld\n", 2746 nch->ncp, nch->ncp->nc_name, 2747 vat.va_blocksize, 2748 (long long)vat.va_fileid); 2749 } 2750 2751 /* 2752 * Use the supplied fakename if not NULL. Fake names are typically 2753 * not in the actual filesystem hierarchy. This is used by HAMMER 2754 * to glue @@timestamp recursions together.
2755 */ 2756 if (fakename) { 2757 nlc.nlc_nameptr = fakename; 2758 nlc.nlc_namelen = strlen(fakename); 2759 rncp = cache_nlookup(nch, &nlc); 2760 goto done; 2761 } 2762 2763 if ((blksize = vat.va_blocksize) == 0) 2764 blksize = DEV_BSIZE; 2765 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2766 rncp.ncp = NULL; 2767 2768 eofflag = 0; 2769 uio.uio_offset = 0; 2770 again: 2771 iov.iov_base = rbuf; 2772 iov.iov_len = blksize; 2773 uio.uio_iov = &iov; 2774 uio.uio_iovcnt = 1; 2775 uio.uio_resid = blksize; 2776 uio.uio_segflg = UIO_SYSSPACE; 2777 uio.uio_rw = UIO_READ; 2778 uio.uio_td = curthread; 2779 2780 if (ncvp_debug & 2) 2781 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2782 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2783 if (error == 0) { 2784 den = (struct dirent *)rbuf; 2785 bytes = blksize - uio.uio_resid; 2786 2787 while (bytes > 0) { 2788 if (ncvp_debug & 2) { 2789 kprintf("cache_inefficient_scan: %*.*s\n", 2790 den->d_namlen, den->d_namlen, 2791 den->d_name); 2792 } 2793 if (den->d_type != DT_WHT && 2794 den->d_ino == vat.va_fileid) { 2795 if (ncvp_debug & 1) { 2796 kprintf("cache_inefficient_scan: " 2797 "MATCHED inode %lld path %s/%*.*s\n", 2798 (long long)vat.va_fileid, 2799 nch->ncp->nc_name, 2800 den->d_namlen, den->d_namlen, 2801 den->d_name); 2802 } 2803 nlc.nlc_nameptr = den->d_name; 2804 nlc.nlc_namelen = den->d_namlen; 2805 rncp = cache_nlookup(nch, &nlc); 2806 KKASSERT(rncp.ncp != NULL); 2807 break; 2808 } 2809 bytes -= _DIRENT_DIRSIZ(den); 2810 den = _DIRENT_NEXT(den); 2811 } 2812 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2813 goto again; 2814 } 2815 kfree(rbuf, M_TEMP); 2816 done: 2817 vrele(pvp); 2818 if (rncp.ncp) { 2819 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2820 _cache_setvp(rncp.mount, rncp.ncp, dvp, 1); 2821 if (ncvp_debug & 2) { 2822 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2823 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2824 } 2825 } else { 2826 if (ncvp_debug & 2) { 2827 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2828 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2829 rncp.ncp->nc_vp); 2830 } 2831 } 2832 if (rncp.ncp->nc_vp == NULL) 2833 error = rncp.ncp->nc_error; 2834 /* 2835 * Release rncp after a successful nlookup. rncp was fully 2836 * referenced. 2837 */ 2838 cache_put(&rncp); 2839 } else { 2840 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2841 dvp, nch->ncp->nc_name); 2842 error = ENOENT; 2843 } 2844 return (error); 2845 } 2846 2847 /* 2848 * This function must be called with the ncp held and locked and will unlock 2849 * and drop it during zapping. 2850 * 2851 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2852 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2853 * and removes the related reference. If the ncp can be removed, and the 2854 * parent can be zapped non-blocking, this function loops up. 2855 * 2856 * There will be one ref from the caller (which we now own). The only 2857 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2858 * so possibly 2 refs left. Taking this into account, if there are no 2859 * additional refs and no children, the ncp will be removed from the topology 2860 * and destroyed. 2861 * 2862 * References and/or children may exist if the ncp is in the middle of the 2863 * topology, preventing the ncp from being destroyed. 2864 * 2865 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
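 *
 * Worked example of the ref accounting (illustrative): for a leaf ncp
 * that is still linked to a parent the destroyable case is nc_refs == 2
 * (the caller's ref plus the ref owned by the nc_parent->nc_list
 * linkage); for an already-unlinked ncp it is nc_refs == 1 (just the
 * caller's ref).  Any higher count means another holder exists and the
 * entry is left intact.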
2866 * 2867 * This function may return a held (but NOT locked) parent node which the 2868 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2869 * due to deep namecache trees. 2870 * 2871 * WARNING! For MPSAFE operation this routine must acquire up to three 2872 * spin locks to be able to safely test nc_refs. Lock order is 2873 * very important. 2874 * 2875 * hash spinlock if on hash list 2876 * parent spinlock if child of parent 2877 * (the ncp is unresolved so there is no vnode association) 2878 */ 2879 static int 2880 cache_zap(struct namecache *ncp) 2881 { 2882 struct namecache *par; 2883 struct nchash_head *nchpp; 2884 int refcmp; 2885 int nonblock = 1; /* XXX cleanup */ 2886 int res = 0; 2887 2888 again: 2889 /* 2890 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2891 * This gets rid of any vp->v_namecache list or negative list and 2892 * the related ref. 2893 */ 2894 _cache_setunresolved(ncp, 1); 2895 2896 /* 2897 * Try to scrap the entry and possibly tail-recurse on its parent. 2898 * We only scrap unref'd (other then our ref) unresolved entries, 2899 * we do not scrap 'live' entries. 2900 * 2901 * If nc_parent is non NULL we expect 2 references, else just 1. 2902 * If there are more, someone else also holds the ncp and we cannot 2903 * destroy it. 2904 */ 2905 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2906 KKASSERT(ncp->nc_refs > 0); 2907 2908 /* 2909 * If the ncp is linked to its parent it will also be in the hash 2910 * table. We have to be able to lock the parent and the hash table. 2911 * 2912 * Acquire locks. Note that the parent can't go away while we hold 2913 * a child locked. If nc_parent is present, expect 2 refs instead 2914 * of 1. 2915 */ 2916 nchpp = NULL; 2917 if ((par = ncp->nc_parent) != NULL) { 2918 if (nonblock) { 2919 if (_cache_lock_nonblock(par)) { 2920 /* lock failed */ 2921 ncp->nc_flag |= NCF_DEFEREDZAP; 2922 atomic_add_long( 2923 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2924 1); 2925 _cache_unlock(ncp); 2926 _cache_drop(ncp); /* caller's ref */ 2927 return res; 2928 } 2929 _cache_hold(par); 2930 } else { 2931 _cache_hold(par); 2932 _cache_lock(par); 2933 } 2934 nchpp = ncp->nc_head; 2935 spin_lock(&nchpp->spin); 2936 } 2937 2938 /* 2939 * With the parent and nchpp locked, and the vnode removed 2940 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2941 * more someone else has a ref and we cannot zap the entry. 2942 * 2943 * one for our hold 2944 * one for our parent link (parent also has one from the linkage) 2945 */ 2946 if (par) 2947 refcmp = 2; 2948 else 2949 refcmp = 1; 2950 2951 /* 2952 * On failure undo the work we've done so far and drop the 2953 * caller's ref and ncp. 2954 */ 2955 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2956 if (par) { 2957 spin_unlock(&nchpp->spin); 2958 _cache_put(par); 2959 } 2960 _cache_unlock(ncp); 2961 _cache_drop(ncp); 2962 return res; 2963 } 2964 2965 /* 2966 * We own all the refs and with the spinlocks held no further 2967 * refs can be acquired by others. 2968 * 2969 * Remove us from the hash list and parent list. We have to 2970 * drop a ref on the parent's vp if the parent's list becomes 2971 * empty. 2972 */ 2973 if (par) { 2974 KKASSERT(nchpp == ncp->nc_head); 2975 _cache_unlink_parent(par, ncp, nchpp); /* eats nhcpp */ 2976 /*_cache_unlock(par);*/ 2977 /* &nchpp->spin is unlocked by call */ 2978 } else { 2979 KKASSERT(ncp->nc_head == NULL); 2980 } 2981 2982 /* 2983 * ncp should not have picked up any refs. 
Physically 2984 * destroy the ncp. 2985 */ 2986 if (ncp->nc_refs != refcmp) { 2987 panic("cache_zap: %p bad refs %d (expected %d)\n", 2988 ncp, ncp->nc_refs, refcmp); 2989 } 2990 /* _cache_unlock(ncp) not required */ 2991 ncp->nc_refs = -1; /* safety */ 2992 if (ncp->nc_name) 2993 kfree(ncp->nc_name, M_VFSCACHEAUX); 2994 kfree_obj(ncp, M_VFSCACHE); 2995 res = 1; 2996 2997 /* 2998 * Loop up if we can recursively clean out the parent. 2999 */ 3000 if (par) { 3001 refcmp = 1; /* ref on parent */ 3002 if (par->nc_parent) /* par->par */ 3003 ++refcmp; 3004 par->nc_flag &= ~NCF_DEFEREDZAP; 3005 if ((par->nc_flag & NCF_UNRESOLVED) && 3006 par->nc_refs == refcmp && 3007 TAILQ_EMPTY(&par->nc_list)) 3008 { 3009 ncp = par; 3010 goto again; 3011 } 3012 _cache_unlock(par); 3013 _cache_drop(par); 3014 } 3015 return 1; 3016 } 3017 3018 /* 3019 * Clean up dangling negative cache and defered-drop entries in the 3020 * namecache. 3021 * 3022 * This routine is called in the critical path and also called from 3023 * vnlru(). When called from vnlru we use a lower limit to try to 3024 * deal with the negative cache before the critical path has to start 3025 * dealing with it. 3026 */ 3027 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 3028 3029 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 3030 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 3031 static cache_hs_t exc_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 3032 3033 void 3034 cache_hysteresis(int critpath) 3035 { 3036 long poslimit; 3037 long exclimit; 3038 long neglimit; 3039 long xnumunres; 3040 long xnumleafs; 3041 long clean_neg; 3042 long clean_unres; 3043 long clean_excess; 3044 3045 /* 3046 * Calculate negative ncp limit 3047 */ 3048 neglimit = maxvnodes / ncnegfactor; 3049 if (critpath == 0) 3050 neglimit = neglimit * 8 / 10; 3051 3052 /* 3053 * Don't cache too many negative hits. We use hysteresis to reduce 3054 * the impact on the critical path. 3055 */ 3056 clean_neg = 0; 3057 3058 switch(neg_cache_hysteresis_state[critpath]) { 3059 case CHI_LOW: 3060 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 3061 if (critpath) 3062 clean_neg = ncnegflush; 3063 else 3064 clean_neg = ncnegflush + 3065 vfscache_negs - neglimit; 3066 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 3067 } 3068 break; 3069 case CHI_HIGH: 3070 if (vfscache_negs > MINNEG * 9 / 10 && 3071 vfscache_negs * 9 / 10 > neglimit 3072 ) { 3073 if (critpath) 3074 clean_neg = ncnegflush; 3075 else 3076 clean_neg = ncnegflush + 3077 vfscache_negs * 9 / 10 - 3078 neglimit; 3079 } else { 3080 neg_cache_hysteresis_state[critpath] = CHI_LOW; 3081 } 3082 break; 3083 } 3084 if (clean_neg) 3085 _cache_cleanneg(clean_neg); 3086 3087 /* 3088 * Don't cache too many unresolved elements. We use hysteresis to 3089 * reduce the impact on the critical path. 3090 */ 3091 if ((poslimit = ncposlimit) == 0) 3092 poslimit = maxvnodes / ncposfactor; 3093 if (critpath == 0) 3094 poslimit = poslimit * 8 / 10; 3095 3096 /* 3097 * Number of unresolved leaf elements in the namecache. 
These 3098 * can build up for various reasons and may have to be disposed 3099 * of to allow the inactive list to be cleaned out by vnlru_proc(). 3100 * 3101 * Collect count 3102 */ 3103 xnumunres = vfscache_unres; 3104 clean_unres = 0; 3105 3106 switch(pos_cache_hysteresis_state[critpath]) { 3107 case CHI_LOW: 3108 if (xnumunres > poslimit && xnumunres > MINPOS) { 3109 if (critpath) 3110 clean_unres = ncposflush; 3111 else 3112 clean_unres = ncposflush + xnumunres - 3113 poslimit; 3114 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 3115 } 3116 break; 3117 case CHI_HIGH: 3118 if (xnumunres > poslimit * 5 / 6 && xnumunres > MINPOS) { 3119 if (critpath) 3120 clean_unres = ncposflush; 3121 else 3122 clean_unres = ncposflush + xnumunres - 3123 poslimit * 5 / 6; 3124 } else { 3125 pos_cache_hysteresis_state[critpath] = CHI_LOW; 3126 } 3127 break; 3128 } 3129 3130 /* 3131 * Excessive positive hits can accumulate due to large numbers of 3132 * hardlinks (the vnode cache will not prevent ncps representing 3133 * hardlinks from growing into infinity). 3134 */ 3135 exclimit = maxvnodes * 2; 3136 if (critpath == 0) 3137 exclimit = exclimit * 8 / 10; 3138 xnumleafs = vfscache_leafs; 3139 clean_excess = 0; 3140 3141 switch(exc_cache_hysteresis_state[critpath]) { 3142 case CHI_LOW: 3143 if (xnumleafs > exclimit && xnumleafs > MINPOS) { 3144 if (critpath) 3145 clean_excess = ncposflush; 3146 else 3147 clean_excess = ncposflush + xnumleafs - 3148 exclimit; 3149 exc_cache_hysteresis_state[critpath] = CHI_HIGH; 3150 } 3151 break; 3152 case CHI_HIGH: 3153 if (xnumleafs > exclimit * 5 / 6 && xnumleafs > MINPOS) { 3154 if (critpath) 3155 clean_excess = ncposflush; 3156 else 3157 clean_excess = ncposflush + xnumleafs - 3158 exclimit * 5 / 6; 3159 } else { 3160 exc_cache_hysteresis_state[critpath] = CHI_LOW; 3161 } 3162 break; 3163 } 3164 3165 if (clean_unres || clean_excess) 3166 _cache_cleanpos(clean_unres, clean_excess); 3167 3168 /* 3169 * Clean out dangling deferred-zap ncps which could not be cleanly 3170 * dropped if too many build up. Note that numdefered is 3171 * heuristic. Make sure we are real-time for the current cpu, 3172 * plus the global rollup. 3173 */ 3174 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 3175 _cache_cleandefered(); 3176 } 3177 } 3178 3179 /* 3180 * NEW NAMECACHE LOOKUP API 3181 * 3182 * Lookup an entry in the namecache. The passed par_nch must be referenced 3183 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 3184 * is ALWAYS returned, even if the supplied component is illegal. 3185 * 3186 * The resulting namecache entry should be returned to the system with 3187 * cache_put() or cache_unlock() + cache_drop(). 3188 * 3189 * namecache locks are recursive but care must be taken to avoid lock order 3190 * reversals (hence why the passed par_nch must be unlocked). Locking 3191 * rules are to order for parent traversals, not for child traversals. 3192 * 3193 * Nobody else will be able to manipulate the associated namespace (e.g. 3194 * create, delete, rename, rename-target) until the caller unlocks the 3195 * entry. 3196 * 3197 * The returned entry will be in one of three states: positive hit (non-null 3198 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 3199 * Unresolved entries must be resolved through the filesystem to associate the 3200 * vnode and/or determine whether a positive or negative hit has occurred.
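 *
 * Typical resolve-then-use sequence (illustrative sketch only; 'par_nch'
 * and 'nlc' are assumed to be set up by the caller and error handling
 * is abbreviated):
 *
 *	struct nchandle nch;
 *	u_int dummy_gen = 0;
 *	int error = 0;
 *
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, &dummy_gen, cred);
 *	if (error == 0 && nch.ncp->nc_vp != NULL) {
 *		... positive hit, use nch.ncp->nc_vp ...
 *	}
 *	cache_put(&nch);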
3201 * 3202 * It is not necessary to lock a directory in order to lock namespace under 3203 * that directory. In fact, it is explicitly not allowed to do that. A 3204 * directory is typically only locked when being created, renamed, or 3205 * destroyed. 3206 * 3207 * The directory (par) may be unresolved, in which case any returned child 3208 * will likely also be marked unresolved. Likely but not guarenteed. Since 3209 * the filesystem lookup requires a resolved directory vnode the caller is 3210 * responsible for resolving the namecache chain top-down. This API 3211 * specifically allows whole chains to be created in an unresolved state. 3212 */ 3213 struct nchandle 3214 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 3215 { 3216 struct nchandle nch; 3217 struct namecache *ncp; 3218 struct namecache *new_ncp; 3219 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 3220 struct nchash_head *nchpp; 3221 struct mount *mp; 3222 u_int32_t hash; 3223 globaldata_t gd; 3224 int par_locked; 3225 int use_excl; 3226 3227 gd = mycpu; 3228 mp = par_nch->mount; 3229 par_locked = 0; 3230 3231 /* 3232 * This is a good time to call it, no ncp's are locked by 3233 * the caller or us. 3234 */ 3235 cache_hysteresis(1); 3236 3237 /* 3238 * Try to locate an existing entry 3239 */ 3240 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3241 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3242 new_ncp = NULL; 3243 use_excl = 0; 3244 nchpp = NCHHASH(hash); 3245 restart: 3246 rep_ncp = NULL; 3247 if (use_excl) 3248 spin_lock(&nchpp->spin); 3249 else 3250 spin_lock_shared(&nchpp->spin); 3251 3252 /* 3253 * Do a reverse scan to collect any DESTROYED ncps prior to matching 3254 * an existing entry. 3255 */ 3256 TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) { 3257 /* 3258 * Break out if we find a matching entry. Note that 3259 * UNRESOLVED entries may match, but DESTROYED entries 3260 * do not. 3261 * 3262 * We may be able to reuse DESTROYED entries that we come 3263 * across, even if the name does not match, as long as 3264 * nc_nlen is correct and the only hold ref is from the nchpp 3265 * list itself. 3266 */ 3267 if (ncp->nc_parent == par_nch->ncp && 3268 ncp->nc_nlen == nlc->nlc_namelen) { 3269 if (ncp->nc_flag & NCF_DESTROYED) { 3270 if (ncp->nc_refs == 1 && rep_ncp == NULL) 3271 rep_ncp = ncp; 3272 continue; 3273 } 3274 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 3275 continue; 3276 3277 /* 3278 * Matched ncp 3279 */ 3280 _cache_hold(ncp); 3281 if (rep_ncp) 3282 _cache_hold(rep_ncp); 3283 3284 if (use_excl) 3285 spin_unlock(&nchpp->spin); 3286 else 3287 spin_unlock_shared(&nchpp->spin); 3288 3289 if (par_locked) { 3290 _cache_unlock(par_nch->ncp); 3291 par_locked = 0; 3292 } 3293 3294 /* 3295 * Really try to destroy rep_ncp if encountered. 3296 * Various edge cases can build up more than one, 3297 * so loop if we succeed. This isn't perfect, but 3298 * we can't afford to have tons of entries build 3299 * up on a single nhcpp list due to rename-over 3300 * operations. If that were to happen, the system 3301 * would bog down quickly. 
3302 */ 3303 if (rep_ncp) { 3304 if (_cache_lock_nonblock(rep_ncp) == 0) { 3305 if (rep_ncp->nc_flag & NCF_DESTROYED) { 3306 if (cache_zap(rep_ncp)) { 3307 _cache_drop(ncp); 3308 goto restart; 3309 } 3310 } else { 3311 _cache_unlock(rep_ncp); 3312 _cache_drop(rep_ncp); 3313 } 3314 } else { 3315 _cache_drop(rep_ncp); 3316 } 3317 } 3318 3319 /* 3320 * Continue processing the matched entry 3321 */ 3322 if (_cache_lock_special(ncp) == 0) { 3323 /* 3324 * Successfully locked but we must re-test 3325 * conditions that might have changed since 3326 * we did not have the lock before. 3327 */ 3328 if (ncp->nc_parent != par_nch->ncp || 3329 ncp->nc_nlen != nlc->nlc_namelen || 3330 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3331 ncp->nc_nlen) || 3332 (ncp->nc_flag & NCF_DESTROYED)) { 3333 _cache_put(ncp); 3334 goto restart; 3335 } 3336 _cache_auto_unresolve(mp, ncp); 3337 if (new_ncp) { 3338 _cache_free(new_ncp); 3339 new_ncp = NULL; /* safety */ 3340 } 3341 goto found; 3342 } 3343 _cache_get(ncp); /* cycle the lock to block */ 3344 _cache_put(ncp); 3345 _cache_drop(ncp); 3346 goto restart; 3347 } 3348 } 3349 3350 /* 3351 * We failed to locate the entry, try to resurrect a destroyed 3352 * entry that we did find that is already correctly linked into 3353 * nchpp and the parent. We must re-test conditions after 3354 * successfully locking rep_ncp. 3355 * 3356 * This case can occur under heavy loads due to not being able 3357 * to safely lock the parent in cache_zap(). Nominally a repeated 3358 * create/unlink load, but only the namelen needs to match. 3359 * 3360 * An exclusive lock on the nchpp is required to process this case, 3361 * otherwise a race can cause duplicate entries to be created with 3362 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 3363 */ 3364 if (rep_ncp && use_excl) { 3365 if (_cache_lock_nonblock(rep_ncp) == 0) { 3366 _cache_hold(rep_ncp); 3367 if (rep_ncp->nc_parent == par_nch->ncp && 3368 rep_ncp->nc_nlen == nlc->nlc_namelen && 3369 (rep_ncp->nc_flag & NCF_DESTROYED) && 3370 rep_ncp->nc_refs == 2) 3371 { 3372 /* 3373 * Update nc_name. 3374 */ 3375 ncp = rep_ncp; 3376 3377 _cache_ncp_gen_enter(ncp); 3378 3379 bcopy(nlc->nlc_nameptr, ncp->nc_name, 3380 nlc->nlc_namelen); 3381 3382 /* 3383 * This takes some care. We must clear the 3384 * NCF_DESTROYED flag before unlocking the 3385 * hash chain so other concurrent searches 3386 * do not skip this element. 3387 * 3388 * We must also unlock the hash chain before 3389 * unresolving the ncp to avoid deadlocks. 3390 * We hold the lock on the ncp so we can safely 3391 * reinitialize nc_flag after that. 3392 */ 3393 ncp->nc_flag &= ~NCF_DESTROYED; 3394 spin_unlock(&nchpp->spin); /* use_excl */ 3395 3396 _cache_setunresolved(ncp, 0); 3397 ncp->nc_flag = NCF_UNRESOLVED; 3398 ncp->nc_error = ENOTCONN; 3399 3400 _cache_ncp_gen_exit(ncp); 3401 3402 if (par_locked) { 3403 _cache_unlock(par_nch->ncp); 3404 par_locked = 0; 3405 } 3406 if (new_ncp) { 3407 _cache_free(new_ncp); 3408 new_ncp = NULL; /* safety */ 3409 } 3410 goto found; 3411 } 3412 _cache_put(rep_ncp); 3413 } 3414 } 3415 3416 /* 3417 * Otherwise create a new entry and add it to the cache. The parent 3418 * ncp must also be locked so we can link into it. 3419 * 3420 * We have to relookup after possibly blocking in kmalloc or 3421 * when locking par_nch. 3422 * 3423 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3424 * mount case, in which case nc_name will be NULL. 
3425 * 3426 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3427 * a DESTROYED entry, but didn't have an exclusive lock. 3428 * In this situation we do not create a new_ncp. 3429 */ 3430 if (new_ncp == NULL) { 3431 if (use_excl) 3432 spin_unlock(&nchpp->spin); 3433 else 3434 spin_unlock_shared(&nchpp->spin); 3435 if (rep_ncp == NULL) { 3436 new_ncp = cache_alloc(nlc->nlc_namelen); 3437 if (nlc->nlc_namelen) { 3438 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3439 nlc->nlc_namelen); 3440 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3441 } 3442 } 3443 use_excl = 1; 3444 goto restart; 3445 } 3446 3447 /* 3448 * NOTE! The spinlock is held exclusively here because new_ncp 3449 * is non-NULL. 3450 */ 3451 if (par_locked == 0) { 3452 spin_unlock(&nchpp->spin); 3453 _cache_lock(par_nch->ncp); 3454 par_locked = 1; 3455 goto restart; 3456 } 3457 3458 /* 3459 * Link to parent (requires another ref, the one already in new_ncp 3460 * is what we wil lreturn). 3461 * 3462 * WARNING! We still hold the spinlock. We have to set the hash 3463 * table entry atomically. 3464 */ 3465 ncp = new_ncp; 3466 ++ncp->nc_refs; 3467 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3468 spin_unlock(&nchpp->spin); 3469 _cache_unlock(par_nch->ncp); 3470 /* par_locked = 0 - not used */ 3471 found: 3472 /* 3473 * stats and namecache size management 3474 */ 3475 if (ncp->nc_flag & NCF_UNRESOLVED) 3476 ++gd->gd_nchstats->ncs_miss; 3477 else if (ncp->nc_vp) 3478 ++gd->gd_nchstats->ncs_goodhits; 3479 else 3480 ++gd->gd_nchstats->ncs_neghits; 3481 nch.mount = mp; 3482 nch.ncp = ncp; 3483 _cache_mntref(nch.mount); 3484 3485 return(nch); 3486 } 3487 3488 /* 3489 * Attempt to lookup a namecache entry and return with a shared namecache 3490 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3491 * set or we are unable to lock. 3492 */ 3493 int 3494 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3495 struct nlcomponent *nlc, 3496 int excl, struct nchandle *res_nch) 3497 { 3498 struct namecache *ncp; 3499 struct nchash_head *nchpp; 3500 struct mount *mp; 3501 u_int32_t hash; 3502 globaldata_t gd; 3503 3504 /* 3505 * If exclusive requested or shared namecache locks are disabled, 3506 * return failure. 3507 */ 3508 if (ncp_shared_lock_disable || excl) 3509 return(EWOULDBLOCK); 3510 3511 gd = mycpu; 3512 mp = par_nch->mount; 3513 3514 /* 3515 * This is a good time to call it, no ncp's are locked by 3516 * the caller or us. 3517 */ 3518 cache_hysteresis(1); 3519 3520 /* 3521 * Try to locate an existing entry 3522 */ 3523 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3524 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3525 nchpp = NCHHASH(hash); 3526 3527 spin_lock_shared(&nchpp->spin); 3528 3529 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3530 /* 3531 * Break out if we find a matching entry. Note that 3532 * UNRESOLVED entries may match, but DESTROYED entries 3533 * do not. 
3534 */ 3535 if (ncp->nc_parent == par_nch->ncp && 3536 ncp->nc_nlen == nlc->nlc_namelen && 3537 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3538 (ncp->nc_flag & NCF_DESTROYED) == 0 3539 ) { 3540 _cache_hold(ncp); 3541 spin_unlock_shared(&nchpp->spin); 3542 3543 if (_cache_lock_shared_special(ncp) == 0) { 3544 if (ncp->nc_parent == par_nch->ncp && 3545 ncp->nc_nlen == nlc->nlc_namelen && 3546 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3547 ncp->nc_nlen) == 0 && 3548 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3549 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3550 _cache_auto_unresolve_test(mp, ncp) == 0) 3551 { 3552 goto found; 3553 } 3554 _cache_unlock(ncp); 3555 } 3556 _cache_drop(ncp); 3557 return(EWOULDBLOCK); 3558 } 3559 } 3560 3561 /* 3562 * Failure 3563 */ 3564 spin_unlock_shared(&nchpp->spin); 3565 return(EWOULDBLOCK); 3566 3567 /* 3568 * Success 3569 * 3570 * Note that nc_error might be non-zero (e.g ENOENT). 3571 */ 3572 found: 3573 res_nch->mount = mp; 3574 res_nch->ncp = ncp; 3575 ++gd->gd_nchstats->ncs_goodhits; 3576 _cache_mntref(res_nch->mount); 3577 3578 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3579 return(ncp->nc_error); 3580 } 3581 3582 /* 3583 * This is a non-blocking verison of cache_nlookup() used by 3584 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3585 * will return nch.ncp == NULL in that case. 3586 */ 3587 struct nchandle 3588 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3589 { 3590 struct nchandle nch; 3591 struct namecache *ncp; 3592 struct namecache *new_ncp; 3593 struct nchash_head *nchpp; 3594 struct mount *mp; 3595 u_int32_t hash; 3596 globaldata_t gd; 3597 int par_locked; 3598 3599 gd = mycpu; 3600 mp = par_nch->mount; 3601 par_locked = 0; 3602 3603 /* 3604 * Try to locate an existing entry 3605 */ 3606 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3607 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3608 new_ncp = NULL; 3609 nchpp = NCHHASH(hash); 3610 restart: 3611 spin_lock(&nchpp->spin); 3612 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3613 /* 3614 * Break out if we find a matching entry. Note that 3615 * UNRESOLVED entries may match, but DESTROYED entries 3616 * do not. 3617 */ 3618 if (ncp->nc_parent == par_nch->ncp && 3619 ncp->nc_nlen == nlc->nlc_namelen && 3620 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3621 (ncp->nc_flag & NCF_DESTROYED) == 0 3622 ) { 3623 _cache_hold(ncp); 3624 spin_unlock(&nchpp->spin); 3625 if (par_locked) { 3626 _cache_unlock(par_nch->ncp); 3627 par_locked = 0; 3628 } 3629 if (_cache_lock_special(ncp) == 0) { 3630 if (ncp->nc_parent != par_nch->ncp || 3631 ncp->nc_nlen != nlc->nlc_namelen || 3632 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3633 (ncp->nc_flag & NCF_DESTROYED)) { 3634 kprintf("cache_lookup_nonblock: " 3635 "ncp-race %p %*.*s\n", 3636 ncp, 3637 nlc->nlc_namelen, 3638 nlc->nlc_namelen, 3639 nlc->nlc_nameptr); 3640 _cache_unlock(ncp); 3641 _cache_drop(ncp); 3642 goto failed; 3643 } 3644 _cache_auto_unresolve(mp, ncp); 3645 if (new_ncp) { 3646 _cache_free(new_ncp); 3647 new_ncp = NULL; 3648 } 3649 goto found; 3650 } 3651 _cache_drop(ncp); 3652 goto failed; 3653 } 3654 } 3655 3656 /* 3657 * We failed to locate an entry, create a new entry and add it to 3658 * the cache. The parent ncp must also be locked so we 3659 * can link into it. 3660 * 3661 * We have to relookup after possibly blocking in kmalloc or 3662 * when locking par_nch. 
3663 * 3664 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3665 * mount case, in which case nc_name will be NULL. 3666 */ 3667 if (new_ncp == NULL) { 3668 spin_unlock(&nchpp->spin); 3669 new_ncp = cache_alloc(nlc->nlc_namelen); 3670 if (nlc->nlc_namelen) { 3671 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3672 nlc->nlc_namelen); 3673 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3674 } 3675 goto restart; 3676 } 3677 if (par_locked == 0) { 3678 spin_unlock(&nchpp->spin); 3679 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3680 par_locked = 1; 3681 goto restart; 3682 } 3683 goto failed; 3684 } 3685 3686 /* 3687 * Link to parent (requires another ref, the one already in new_ncp 3688 * is what we wil lreturn). 3689 * 3690 * WARNING! We still hold the spinlock. We have to set the hash 3691 * table entry atomically. 3692 */ 3693 ncp = new_ncp; 3694 ++ncp->nc_refs; 3695 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3696 spin_unlock(&nchpp->spin); 3697 _cache_unlock(par_nch->ncp); 3698 /* par_locked = 0 - not used */ 3699 found: 3700 /* 3701 * stats and namecache size management 3702 */ 3703 if (ncp->nc_flag & NCF_UNRESOLVED) 3704 ++gd->gd_nchstats->ncs_miss; 3705 else if (ncp->nc_vp) 3706 ++gd->gd_nchstats->ncs_goodhits; 3707 else 3708 ++gd->gd_nchstats->ncs_neghits; 3709 nch.mount = mp; 3710 nch.ncp = ncp; 3711 _cache_mntref(nch.mount); 3712 3713 return(nch); 3714 failed: 3715 if (new_ncp) { 3716 _cache_free(new_ncp); 3717 new_ncp = NULL; 3718 } 3719 nch.mount = NULL; 3720 nch.ncp = NULL; 3721 return(nch); 3722 } 3723 3724 /* 3725 * This is a non-locking optimized lookup that depends on adding a ref 3726 * to prevent normal eviction. nch.ncp can be returned as NULL for any 3727 * reason and the caller will retry with normal locking in that case. 3728 * 3729 * This function only returns resolved entries so callers do not accidentally 3730 * race doing out of order / unfenced field checks. 3731 * 3732 * The caller must validate the result for parent-to-child continuity. 3733 */ 3734 struct nchandle 3735 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3736 { 3737 struct nchandle nch; 3738 struct namecache *ncp; 3739 struct nchash_head *nchpp; 3740 struct mount *mp; 3741 u_int32_t hash; 3742 globaldata_t gd; 3743 3744 gd = mycpu; 3745 mp = par_nch->mount; 3746 3747 /* 3748 * Try to locate an existing entry 3749 */ 3750 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3751 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3752 nchpp = NCHHASH(hash); 3753 3754 spin_lock_shared(&nchpp->spin); 3755 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3756 /* 3757 * Break out if we find a matching entry. Note that 3758 * UNRESOLVED entries may match, but DESTROYED entries 3759 * do not. However, UNRESOLVED entries still return failure. 3760 */ 3761 if (ncp->nc_parent == par_nch->ncp && 3762 ncp->nc_nlen == nlc->nlc_namelen && 3763 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3764 (ncp->nc_flag & NCF_DESTROYED) == 0 3765 ) { 3766 /* 3767 * Test NFS timeout for auto-unresolve. Give up if 3768 * the entry is not resolved. 3769 * 3770 * Getting the ref with the nchpp locked prevents 3771 * any transition to NCF_DESTROYED. 3772 */ 3773 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3774 break; 3775 if (ncp->nc_flag & NCF_UNRESOLVED) 3776 break; 3777 _cache_hold(ncp); 3778 spin_unlock_shared(&nchpp->spin); 3779 3780 /* 3781 * We need an additional test to ensure that the ref 3782 * we got above prevents transitions to NCF_UNRESOLVED. 
3783 * This can occur if another thread is currently 3784 * holding the ncp exclusively locked or (if we raced 3785 * that and it unlocked before our test) the flag 3786 * has been set. 3787 * 3788 * XXX check if superseded by nc_generation XXX 3789 */ 3790 if (_cache_lockstatus(ncp) < 0 || 3791 (ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED))) 3792 { 3793 if ((ncvp_debug & 4) && 3794 (ncp->nc_flag & 3795 (NCF_DESTROYED | NCF_UNRESOLVED))) 3796 { 3797 kprintf("ncp state change: %p %08x %d %s\n", 3798 ncp, ncp->nc_flag, ncp->nc_error, 3799 ncp->nc_name); 3800 } 3801 _cache_drop(ncp); 3802 spin_lock_shared(&nchpp->spin); 3803 break; 3804 } 3805 3806 /* 3807 * Return the ncp bundled into a nch on success. 3808 * The ref should passively prevent the ncp from 3809 * becoming unresolved without having to hold a lock. 3810 * (XXX this may not be entirely true) 3811 */ 3812 goto found; 3813 } 3814 } 3815 spin_unlock_shared(&nchpp->spin); 3816 nch.mount = NULL; 3817 nch.ncp = NULL; 3818 3819 return nch; 3820 found: 3821 /* 3822 * stats and namecache size management 3823 */ 3824 if (ncp->nc_flag & NCF_UNRESOLVED) 3825 ++gd->gd_nchstats->ncs_miss; 3826 else if (ncp->nc_vp) 3827 ++gd->gd_nchstats->ncs_goodhits; 3828 else 3829 ++gd->gd_nchstats->ncs_neghits; 3830 nch.mount = mp; 3831 nch.ncp = ncp; 3832 _cache_mntref(nch.mount); 3833 3834 return(nch); 3835 } 3836 3837 /* 3838 * The namecache entry is marked as being used as a mount point. 3839 * Locate the mount if it is visible to the caller. The DragonFly 3840 * mount system allows arbitrary loops in the topology and disentangles 3841 * those loops by matching against (mp, ncp) rather than just (ncp). 3842 * This means any given ncp can dive any number of mounts, depending 3843 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3844 * 3845 * We use a very simple frontend cache to reduce SMP conflicts, 3846 * which we have to do because the mountlist scan needs an exclusive 3847 * lock around its ripout info list. Not to mention that there might 3848 * be a lot of mounts. 3849 * 3850 * Because all mounts can potentially be accessed by all cpus, break the cpus 3851 * down a bit to allow some contention rather than making the cache 3852 * excessively huge. 3853 * 3854 * The hash table is split into per-cpu areas and is 4-way set-associative. 3855 */ 3856 struct findmount_info { 3857 struct mount *result; 3858 struct mount *nch_mount; 3859 struct namecache *nch_ncp; 3860 }; 3861 3862 static __inline 3863 struct ncmount_cache * 3864 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3865 { 3866 uint32_t hash; 3867 3868 hash = iscsi_crc32(&mp, sizeof(mp)); 3869 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3870 hash ^= hash >> 16; 3871 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3872 3873 return (&ncmount_cache[hash]); 3874 } 3875 3876 static 3877 struct ncmount_cache * 3878 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3879 { 3880 struct ncmount_cache *ncc; 3881 struct ncmount_cache *best; 3882 int delta; 3883 int best_delta; 3884 int i; 3885 3886 ncc = ncmount_cache_lookup4(mp, ncp); 3887 3888 /* 3889 * NOTE: When checking for a ticks overflow implement a slop of 3890 * 2 ticks just to be safe, because ticks is accessed 3891 * non-atomically; one CPU can increment it while another 3892 * is still using the old value.
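 *
 * Worked example (illustrative): if a racing cpu stored a slightly
 * newer ncc->ticks than the 'ticks' value sampled here, the computed
 * delta may legitimately be -1 or -2, which falls inside the slop.  A
 * hugely negative delta can only come from an entry so stale that the
 * tick counter wrapped, so its ticks field is simply reset.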
3893 */ 3894 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3895 return ncc; 3896 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3897 if (delta < -2) /* overflow reset */ 3898 ncc->ticks = ticks; 3899 best = ncc; 3900 best_delta = delta; 3901 3902 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3903 ++ncc; 3904 if (ncc->ncp == ncp && ncc->mp == mp) 3905 return ncc; 3906 delta = (int)(ticks - ncc->ticks); 3907 if (delta < -2) 3908 ncc->ticks = ticks; 3909 if (delta > best_delta) { 3910 best_delta = delta; 3911 best = ncc; 3912 } 3913 } 3914 return best; 3915 } 3916 3917 /* 3918 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3919 * doing an expensive mountlist_scan*() if possible. 3920 * 3921 * (mp, ncp) -> mountonpt.k 3922 * 3923 * Returns a referenced mount pointer or NULL 3924 * 3925 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3926 * operations (that is, where the mp_target can be freed out from under us). 3927 * 3928 * Lookups use the ncc->updating counter to validate the contents in order 3929 * to avoid having to obtain the per cache-element spin-lock. In addition, 3930 * the ticks field is only updated when it changes. However, if our per-cpu 3931 * lock fails due to an unmount-in-progress, we fall-back to the 3932 * cache-element's spin-lock. 3933 */ 3934 struct mount * 3935 cache_findmount(struct nchandle *nch) 3936 { 3937 struct findmount_info info; 3938 struct ncmount_cache *ncc; 3939 struct ncmount_cache ncc_copy; 3940 struct mount *target; 3941 struct pcpu_ncache *pcpu; 3942 struct spinlock *spinlk; 3943 int update; 3944 3945 pcpu = pcpu_ncache; 3946 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3947 ncc = NULL; 3948 goto skip; 3949 } 3950 pcpu += mycpu->gd_cpuid; 3951 3952 again: 3953 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3954 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3955 found: 3956 /* 3957 * This is a bit messy for now because we do not yet have 3958 * safe disposal of mount structures. We have to ref 3959 * ncc->mp_target but the 'update' counter only tell us 3960 * whether the cache has changed after the fact. 3961 * 3962 * For now get a per-cpu spinlock that will only contend 3963 * against umount's. This is the best path. If it fails, 3964 * instead of waiting on the umount we fall-back to a 3965 * shared ncc->spin lock, which will generally only cost a 3966 * cache ping-pong. 
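 *
 * Reader-side summary (a descriptive sketch of the logic that follows,
 * not additional functionality): ncc->updating acts like a sequence
 * counter.  It is even while the element is stable and odd while a
 * writer is modifying it, so the lookup snapshots the counter, copies
 * the element, issues cpu_lfence(), and then re-reads the counter; an
 * odd value or any change forces the slow path or a retry.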
3967 */ 3968 update = ncc->updating; 3969 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3970 spinlk = &pcpu->umount_spin; 3971 } else { 3972 spinlk = &ncc->spin; 3973 spin_lock_shared(spinlk); 3974 } 3975 if (update & 1) { /* update in progress */ 3976 spin_unlock_any(spinlk); 3977 goto skip; 3978 } 3979 ncc_copy = *ncc; 3980 cpu_lfence(); 3981 if (ncc->updating != update) { /* content changed */ 3982 spin_unlock_any(spinlk); 3983 goto again; 3984 } 3985 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3986 spin_unlock_any(spinlk); 3987 goto again; 3988 } 3989 if (ncc_copy.isneg == 0) { 3990 target = ncc_copy.mp_target; 3991 if (target->mnt_ncmounton.mount == nch->mount && 3992 target->mnt_ncmounton.ncp == nch->ncp) { 3993 /* 3994 * Cache hit (positive) (avoid dirtying 3995 * the cache line if possible) 3996 */ 3997 if (ncc->ticks != (int)ticks) 3998 ncc->ticks = (int)ticks; 3999 _cache_mntref(target); 4000 } 4001 } else { 4002 /* 4003 * Cache hit (negative) (avoid dirtying 4004 * the cache line if possible) 4005 */ 4006 if (ncc->ticks != (int)ticks) 4007 ncc->ticks = (int)ticks; 4008 target = NULL; 4009 } 4010 spin_unlock_any(spinlk); 4011 4012 return target; 4013 } 4014 skip: 4015 4016 /* 4017 * Slow 4018 */ 4019 info.result = NULL; 4020 info.nch_mount = nch->mount; 4021 info.nch_ncp = nch->ncp; 4022 mountlist_scan(cache_findmount_callback, &info, 4023 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 4024 4025 /* 4026 * To reduce multi-re-entry on the cache, relookup in the cache. 4027 * This can still race, obviously, but that's ok. 4028 */ 4029 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 4030 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 4031 if (info.result) 4032 atomic_add_int(&info.result->mnt_refs, -1); 4033 goto found; 4034 } 4035 4036 /* 4037 * Cache the result. 4038 */ 4039 if ((info.result == NULL || 4040 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 4041 spin_lock(&ncc->spin); 4042 atomic_add_int_nonlocked(&ncc->updating, 1); 4043 cpu_sfence(); 4044 KKASSERT(ncc->updating & 1); 4045 if (ncc->mp != nch->mount) { 4046 if (ncc->mp) 4047 atomic_add_int(&ncc->mp->mnt_refs, -1); 4048 atomic_add_int(&nch->mount->mnt_refs, 1); 4049 ncc->mp = nch->mount; 4050 } 4051 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 4052 ncc->ticks = (int)ticks; 4053 4054 if (info.result) { 4055 ncc->isneg = 0; 4056 if (ncc->mp_target != info.result) { 4057 if (ncc->mp_target) 4058 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4059 ncc->mp_target = info.result; 4060 atomic_add_int(&info.result->mnt_refs, 1); 4061 } 4062 } else { 4063 ncc->isneg = 1; 4064 if (ncc->mp_target) { 4065 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4066 ncc->mp_target = NULL; 4067 } 4068 } 4069 cpu_sfence(); 4070 atomic_add_int_nonlocked(&ncc->updating, 1); 4071 spin_unlock(&ncc->spin); 4072 } 4073 return(info.result); 4074 } 4075 4076 static 4077 int 4078 cache_findmount_callback(struct mount *mp, void *data) 4079 { 4080 struct findmount_info *info = data; 4081 4082 /* 4083 * Check the mount's mounted-on point against the passed nch. 4084 */ 4085 if (mp->mnt_ncmounton.mount == info->nch_mount && 4086 mp->mnt_ncmounton.ncp == info->nch_ncp 4087 ) { 4088 info->result = mp; 4089 _cache_mntref(mp); 4090 return(-1); 4091 } 4092 return(0); 4093 } 4094 4095 void 4096 cache_dropmount(struct mount *mp) 4097 { 4098 _cache_mntrel(mp); 4099 } 4100 4101 /* 4102 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 4103 * or negative). 
4104 * 4105 * A full scan is not required, but for now just do it anyway. 4106 */ 4107 void 4108 cache_ismounting(struct mount *mp) 4109 { 4110 struct ncmount_cache *ncc; 4111 struct mount *ncc_mp; 4112 int i; 4113 4114 if (pcpu_ncache == NULL) 4115 return; 4116 4117 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 4118 ncc = &ncmount_cache[i]; 4119 if (ncc->mp != mp->mnt_ncmounton.mount || 4120 ncc->ncp != mp->mnt_ncmounton.ncp) { 4121 continue; 4122 } 4123 spin_lock(&ncc->spin); 4124 atomic_add_int_nonlocked(&ncc->updating, 1); 4125 cpu_sfence(); 4126 KKASSERT(ncc->updating & 1); 4127 if (ncc->mp != mp->mnt_ncmounton.mount || 4128 ncc->ncp != mp->mnt_ncmounton.ncp) { 4129 cpu_sfence(); 4130 ++ncc->updating; 4131 spin_unlock(&ncc->spin); 4132 continue; 4133 } 4134 ncc_mp = ncc->mp; 4135 ncc->ncp = NULL; 4136 ncc->mp = NULL; 4137 if (ncc_mp) 4138 atomic_add_int(&ncc_mp->mnt_refs, -1); 4139 ncc_mp = ncc->mp_target; 4140 ncc->mp_target = NULL; 4141 if (ncc_mp) 4142 atomic_add_int(&ncc_mp->mnt_refs, -1); 4143 ncc->ticks = (int)ticks - hz * 120; 4144 4145 cpu_sfence(); 4146 atomic_add_int_nonlocked(&ncc->updating, 1); 4147 spin_unlock(&ncc->spin); 4148 } 4149 4150 /* 4151 * Pre-cache the mount point 4152 */ 4153 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 4154 mp->mnt_ncmounton.ncp); 4155 4156 spin_lock(&ncc->spin); 4157 atomic_add_int_nonlocked(&ncc->updating, 1); 4158 cpu_sfence(); 4159 KKASSERT(ncc->updating & 1); 4160 4161 if (ncc->mp) 4162 atomic_add_int(&ncc->mp->mnt_refs, -1); 4163 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 4164 ncc->mp = mp->mnt_ncmounton.mount; 4165 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 4166 ncc->ticks = (int)ticks; 4167 4168 ncc->isneg = 0; 4169 if (ncc->mp_target != mp) { 4170 if (ncc->mp_target) 4171 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4172 ncc->mp_target = mp; 4173 atomic_add_int(&mp->mnt_refs, 1); 4174 } 4175 cpu_sfence(); 4176 atomic_add_int_nonlocked(&ncc->updating, 1); 4177 spin_unlock(&ncc->spin); 4178 } 4179 4180 /* 4181 * Scrap any ncmount_cache entries related to mp. Not only do we need to 4182 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 4183 * negative hits involving (mp, <any>). 4184 * 4185 * A full scan is required. 
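 *
 * Descriptive note: the scan below holds every cpu's umount_spin for
 * its duration, which interlocks against the cache_findmount() fast
 * path that trylocks the per-cpu umount_spin before dereferencing a
 * cached mp_target.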
4186 */ 4187 void 4188 cache_unmounting(struct mount *mp) 4189 { 4190 struct ncmount_cache *ncc; 4191 struct pcpu_ncache *pcpu; 4192 struct mount *ncc_mp; 4193 int i; 4194 4195 pcpu = pcpu_ncache; 4196 if (pcpu == NULL) 4197 return; 4198 4199 for (i = 0; i < ncpus; ++i) 4200 spin_lock(&pcpu[i].umount_spin); 4201 4202 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 4203 ncc = &ncmount_cache[i]; 4204 if (ncc->mp != mp && ncc->mp_target != mp) 4205 continue; 4206 spin_lock(&ncc->spin); 4207 atomic_add_int_nonlocked(&ncc->updating, 1); 4208 cpu_sfence(); 4209 4210 if (ncc->mp != mp && ncc->mp_target != mp) { 4211 atomic_add_int_nonlocked(&ncc->updating, 1); 4212 cpu_sfence(); 4213 spin_unlock(&ncc->spin); 4214 continue; 4215 } 4216 ncc_mp = ncc->mp; 4217 ncc->ncp = NULL; 4218 ncc->mp = NULL; 4219 if (ncc_mp) 4220 atomic_add_int(&ncc_mp->mnt_refs, -1); 4221 ncc_mp = ncc->mp_target; 4222 ncc->mp_target = NULL; 4223 if (ncc_mp) 4224 atomic_add_int(&ncc_mp->mnt_refs, -1); 4225 ncc->ticks = (int)ticks - hz * 120; 4226 4227 cpu_sfence(); 4228 atomic_add_int_nonlocked(&ncc->updating, 1); 4229 spin_unlock(&ncc->spin); 4230 } 4231 4232 for (i = 0; i < ncpus; ++i) 4233 spin_unlock(&pcpu[i].umount_spin); 4234 } 4235 4236 /* 4237 * Resolve an unresolved namecache entry, generally by looking it up. 4238 * The passed ncp must be locked and refd. 4239 * 4240 * Theoretically since a vnode cannot be recycled while held, and since 4241 * the nc_parent chain holds its vnode as long as children exist, the 4242 * direct parent of the cache entry we are trying to resolve should 4243 * have a valid vnode. If not then generate an error that we can 4244 * determine is related to a resolver bug. 4245 * 4246 * However, if a vnode was in the middle of a recyclement when the NCP 4247 * got locked, ncp->nc_vp might point to a vnode that is about to become 4248 * invalid. cache_resolve() handles this case by unresolving the entry 4249 * and then re-resolving it. 4250 * 4251 * Note that successful resolution does not necessarily return an error 4252 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 4253 * will be returned. 4254 * 4255 * (*genp) is adjusted based on our resolution operation. If it is already 4256 * wrong, that's ok... it will still be wrong on return. 4257 */ 4258 int 4259 cache_resolve(struct nchandle *nch, u_int *genp, struct ucred *cred) 4260 { 4261 struct namecache *par_tmp; 4262 struct namecache *par; 4263 struct namecache *ncp; 4264 struct nchandle nctmp; 4265 struct mount *mp; 4266 struct vnode *dvp; 4267 int error; 4268 4269 ncp = nch->ncp; 4270 mp = nch->mount; 4271 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4272 4273 restart: 4274 /* 4275 * If the ncp is already resolved we have nothing to do. However, 4276 * we do want to guarentee that a usable vnode is returned when 4277 * a vnode is present, so make sure it hasn't been reclaimed. 4278 */ 4279 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4280 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 4281 _cache_ncp_gen_enter(ncp); 4282 _cache_setunresolved(ncp, 0); 4283 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4284 _cache_ncp_gen_exit(ncp); 4285 *genp += 4; 4286 return (ncp->nc_error); 4287 } 4288 } else if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4289 return (ncp->nc_error); 4290 } else { 4291 _cache_ncp_gen_enter(ncp); 4292 } 4293 } else { 4294 _cache_ncp_gen_enter(ncp); 4295 } 4296 /* in gen_enter state */ 4297 *genp += 4; 4298 4299 /* 4300 * If the ncp was destroyed it will never resolve again. 
This
4301 * can basically only happen when someone is chdir'd into an
4302 * empty directory which is then rmdir'd. We want to catch this
4303 * here and not dive the VFS because the VFS might actually
4304 * have a way to re-resolve the disconnected ncp, which will
4305 * result in inconsistencies in the cdir/nch for proc->p_fd.
4306 */
4307 if (ncp->nc_flag & NCF_DESTROYED) {
4308 _cache_ncp_gen_exit(ncp);
4309 return(EINVAL);
4310 }
4311
4312 /*
4313 * Mount points need special handling because the parent does not
4314 * belong to the same filesystem as the ncp.
4315 */
4316 if (ncp == mp->mnt_ncmountpt.ncp) {
4317 error = cache_resolve_mp(mp, 0);
4318 _cache_ncp_gen_exit(ncp);
4319 return error;
4320 }
4321
4322 /*
4323 * We expect an unbroken chain of ncps to at least the mount point,
4324 * and even all the way to root (but this code doesn't have to go
4325 * past the mount point).
4326 */
4327 if (ncp->nc_parent == NULL) {
4328 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
4329 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4330 ncp->nc_error = EXDEV;
4331 _cache_ncp_gen_exit(ncp);
4332 return(ncp->nc_error);
4333 }
4334
4335 /*
4336 * The vp's of the parent directories in the chain are held via vhold()
4337 * due to the existence of the child, and should not disappear.
4338 * However, there are cases where they can disappear:
4339 *
4340 * - due to filesystem I/O errors.
4341 * - due to NFS being stupid about tracking the namespace and
4342 * destroying the namespace for entire directories quite often.
4343 * - due to forced unmounts.
4344 * - due to an rmdir (parent will be marked DESTROYED)
4345 *
4346 * When this occurs we have to track the chain backwards and resolve
4347 * it, looping until the resolver catches up to the current node. We
4348 * could recurse here but we might run ourselves out of kernel stack
4349 * so we do it in a more painful manner. This situation really should
4350 * not occur all that often, or if it does, should not have to go back
4351 * too many nodes to resolve the ncp.
4352 */
4353 while ((dvp = cache_dvpref(ncp)) == NULL) {
4354 /*
4355 * This case can occur if a process is CD'd into a
4356 * directory which is then rmdir'd. If the parent is marked
4357 * destroyed there is no point trying to resolve it.
4358 */
4359 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) {
4360 if (ncvp_debug & 8) {
4361 kprintf("nc_parent destroyed: %s/%s\n",
4362 ncp->nc_parent->nc_name, ncp->nc_name);
4363 }
4364 _cache_ncp_gen_exit(ncp);
4365 return(ENOENT);
4366 }
4367 par = ncp->nc_parent;
4368 _cache_hold(par);
4369 _cache_lock(par);
4370 while ((par_tmp = par->nc_parent) != NULL &&
4371 par_tmp->nc_vp == NULL) {
4372 _cache_hold(par_tmp);
4373 _cache_lock(par_tmp);
4374 _cache_put(par);
4375 par = par_tmp;
4376 }
4377 if (par->nc_parent == NULL) {
4378 kprintf("EXDEV case 2 %*.*s\n",
4379 par->nc_nlen, par->nc_nlen, par->nc_name);
4380 _cache_put(par);
4381 _cache_ncp_gen_exit(ncp);
4382 return (EXDEV);
4383 }
4384 /*
4385 * The parent is not set in stone, ref and lock it to prevent
4386 * it from disappearing. Also note that due to renames it
4387 * is possible for our ncp to move and for par to no longer
4388 * be one of its parents. We resolve it anyway, the loop
4389 * will handle any moves.
4390 */
4391 _cache_get(par); /* additional hold/lock */
4392 _cache_put(par); /* from earlier hold/lock */
4393 if (par == nch->mount->mnt_ncmountpt.ncp) {
4394 cache_resolve_mp(nch->mount, 0);
4395 } else if ((dvp = cache_dvpref(par)) == NULL) {
4396 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4397 par->nc_nlen, par->nc_nlen, par->nc_name);
4398 _cache_put(par);
4399 continue;
4400 } else {
4401 if (par->nc_flag & NCF_UNRESOLVED) {
4402 nctmp.mount = mp;
4403 nctmp.ncp = par;
4404 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4405 }
4406 vrele(dvp);
4407 }
4408 if ((error = par->nc_error) != 0) {
4409 if (par->nc_error != EAGAIN) {
4410 kprintf("EXDEV case 3 %*.*s error %d\n",
4411 par->nc_nlen, par->nc_nlen, par->nc_name,
4412 par->nc_error);
4413 _cache_put(par);
4414 _cache_ncp_gen_exit(ncp);
4415 return(error);
4416 }
4417 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4418 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4419 }
4420 _cache_put(par);
4421 /* loop */
4422 }
4423
4424 /*
4425 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
4426 * ncp's and reattach them. If this occurs the original ncp is marked
4427 * EAGAIN to force a relookup.
4428 *
4429 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
4430 * ncp must already be resolved.
4431 */
4432 if (dvp) {
4433 nctmp.mount = mp;
4434 nctmp.ncp = ncp;
4435 *genp += 4; /* setvp bumps the generation */
4436 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4437 vrele(dvp);
4438 } else {
4439 ncp->nc_error = EPERM;
4440 }
4441
4442 if (ncp->nc_error == EAGAIN) {
4443 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
4444 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4445 goto restart;
4446 }
4447 _cache_ncp_gen_exit(ncp);
4448
4449 return(ncp->nc_error);
4450 }
4451
4452 /*
4453 * Resolve the ncp associated with a mount point. Such ncp's almost always
4454 * remain resolved and this routine is rarely called. NFS MPs tend to force
4455 * re-resolution more often due to NFS's mac-truck-smash-the-namecache
4456 * method of tracking namespace changes.
4457 *
4458 * The semantics of this call are that the passed ncp must be locked on
4459 * entry and will be locked on return. However, if we actually have to
4460 * resolve the mount point we temporarily unlock the entry in order to
4461 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
4462 * the unlock we have to recheck the flags after we relock.
4463 */
4464 static int
4465 cache_resolve_mp(struct mount *mp, int adjgen)
4466 {
4467 struct namecache *ncp = mp->mnt_ncmountpt.ncp;
4468 struct vnode *vp;
4469 int error;
4470
4471 KKASSERT(mp != NULL);
4472
4473 /*
4474 * If the ncp is already resolved we have nothing to do. However,
4475 * we do want to guarantee that a usable vnode is returned when
4476 * a vnode is present, so make sure it hasn't been reclaimed.
4477 */
4478 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4479 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
4480 _cache_setunresolved(ncp, adjgen);
4481 }
4482
4483 if (ncp->nc_flag & NCF_UNRESOLVED) {
4484 /*
4485 * ncp must be unlocked across the vfs_busy(), but
4486 * once busied lock ordering is ncp(s), then vnodes,
4487 * so we must relock the ncp before issuing the VFS_ROOT().
4488 */
4489 _cache_unlock(ncp);
4490 while (vfs_busy(mp, 0))
4491 ;
4492 _cache_lock(ncp);
4493 error = VFS_ROOT(mp, &vp);
4494
4495 /*
4496 * recheck the ncp state after relocking.
4497 */ 4498 if (ncp->nc_flag & NCF_UNRESOLVED) { 4499 ncp->nc_error = error; 4500 if (error == 0) { 4501 _cache_setvp(mp, ncp, vp, adjgen); 4502 vput(vp); 4503 } else { 4504 kprintf("[diagnostic] cache_resolve_mp: failed" 4505 " to resolve mount %p err=%d ncp=%p\n", 4506 mp, error, ncp); 4507 _cache_setvp(mp, ncp, NULL, adjgen); 4508 } 4509 } else if (error == 0) { 4510 vput(vp); 4511 } 4512 vfs_unbusy(mp); 4513 } 4514 return(ncp->nc_error); 4515 } 4516 4517 /* 4518 * Resolve the parent vnode 4519 */ 4520 int 4521 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp) 4522 { 4523 struct namecache *par_tmp; 4524 struct namecache *par; 4525 struct namecache *ncp; 4526 struct nchandle nctmp; 4527 struct mount *mp; 4528 struct vnode *dvp; 4529 int error; 4530 4531 *dvpp = NULL; 4532 ncp = nch->ncp; 4533 mp = nch->mount; 4534 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4535 4536 /* 4537 * Treat this as a mount point even if it has a parent (e.g. 4538 * null-mount). Return a NULL dvp and no error. 4539 */ 4540 if (ncp == mp->mnt_ncmountpt.ncp) 4541 return 0; 4542 4543 /* 4544 * If the ncp was destroyed there is no parent directory, return 4545 * EINVAL. 4546 */ 4547 if (ncp->nc_flag & NCF_DESTROYED) 4548 return(EINVAL); 4549 4550 /* 4551 * No parent if at the root of a filesystem, no error. Typically 4552 * not applicable to null-mounts. This case should have been caught 4553 * in the above ncmountpt check. 4554 */ 4555 if (ncp->nc_parent == NULL) 4556 return 0; 4557 4558 /* 4559 * Resolve the parent dvp. 4560 * 4561 * The vp's of the parent directories in the chain are held via vhold() 4562 * due to the existance of the child, and should not disappear. 4563 * However, there are cases where they can disappear: 4564 * 4565 * - due to filesystem I/O errors. 4566 * - due to NFS being stupid about tracking the namespace and 4567 * destroys the namespace for entire directories quite often. 4568 * - due to forced unmounts. 4569 * - due to an rmdir (parent will be marked DESTROYED) 4570 * 4571 * When this occurs we have to track the chain backwards and resolve 4572 * it, looping until the resolver catches up to the current node. We 4573 * could recurse here but we might run ourselves out of kernel stack 4574 * so we do it in a more painful manner. This situation really should 4575 * not occur all that often, or if it does not have to go back too 4576 * many nodes to resolve the ncp. 4577 */ 4578 while ((dvp = cache_dvpref(ncp)) == NULL) { 4579 /* 4580 * This case can occur if a process is CD'd into a 4581 * directory which is then rmdir'd. If the parent is marked 4582 * destroyed there is no point trying to resolve it. 4583 */ 4584 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 4585 return(ENOENT); 4586 par = ncp->nc_parent; 4587 _cache_hold(par); 4588 _cache_lock(par); 4589 while ((par_tmp = par->nc_parent) != NULL && 4590 par_tmp->nc_vp == NULL) { 4591 _cache_hold(par_tmp); 4592 _cache_lock(par_tmp); 4593 _cache_put(par); 4594 par = par_tmp; 4595 } 4596 if (par->nc_parent == NULL) { 4597 kprintf("EXDEV case 2 %*.*s\n", 4598 par->nc_nlen, par->nc_nlen, par->nc_name); 4599 _cache_put(par); 4600 return (EXDEV); 4601 } 4602 4603 /* 4604 * The parent is not set in stone, ref and lock it to prevent 4605 * it from disappearing. Also note that due to renames it 4606 * is possible for our ncp to move and for par to no longer 4607 * be one of its parents. We resolve it anyway, the loop 4608 * will handle any moves. 
4609 */
4610 _cache_get(par); /* additional hold/lock */
4611 _cache_put(par); /* from earlier hold/lock */
4612 if (par == nch->mount->mnt_ncmountpt.ncp) {
4613 cache_resolve_mp(nch->mount, 1);
4614 } else if ((dvp = cache_dvpref(par)) == NULL) {
4615 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4616 par->nc_nlen, par->nc_nlen, par->nc_name);
4617 _cache_put(par);
4618 continue;
4619 } else {
4620 if (par->nc_flag & NCF_UNRESOLVED) {
4621 nctmp.mount = mp;
4622 nctmp.ncp = par;
4623 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4624 }
4625 vrele(dvp);
4626 }
4627 if ((error = par->nc_error) != 0) {
4628 if (par->nc_error != EAGAIN) {
4629 kprintf("EXDEV case 3 %*.*s error %d\n",
4630 par->nc_nlen, par->nc_nlen, par->nc_name,
4631 par->nc_error);
4632 _cache_put(par);
4633 return(error);
4634 }
4635 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4636 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4637 }
4638 _cache_put(par);
4639 /* loop */
4640 }
4641
4642 /*
4643 * We have a referenced dvp
4644 */
4645 *dvpp = dvp;
4646 return 0;
4647 }
4648
4649 /*
4650 * Clean out negative cache entries when too many have accumulated.
4651 */
4652 static void
4653 _cache_cleanneg(long count)
4654 {
4655 struct pcpu_ncache *pn;
4656 struct namecache *ncp;
4657 static uint32_t neg_rover;
4658 uint32_t n;
4659 long vnegs;
4660
4661 n = neg_rover++; /* SMP heuristic, race ok */
4662 cpu_ccfence();
4663 n = n % (uint32_t)ncpus;
4664
4665 /*
4666 * Normalize vfscache_negs and count. count is sometimes based
4667 * on vfscache_negs. vfscache_negs is heuristic and can sometimes
4668 * have crazy values.
4669 */
4670 vnegs = vfscache_negs;
4671 cpu_ccfence();
4672 if (vnegs <= MINNEG)
4673 vnegs = MINNEG;
4674 if (count < 1)
4675 count = 1;
4676
4677 pn = &pcpu_ncache[n];
4678 spin_lock(&pn->neg_spin);
4679 count = pn->neg_count * count / vnegs + 1;
4680 spin_unlock(&pn->neg_spin);
4681
4682 /*
4683 * Attempt to clean out the specified number of negative cache
4684 * entries.
4685 */
4686 while (count > 0) {
4687 spin_lock(&pn->neg_spin);
4688 ncp = TAILQ_FIRST(&pn->neg_list);
4689 if (ncp == NULL) {
4690 spin_unlock(&pn->neg_spin);
4691 break;
4692 }
4693 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4694 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4695 _cache_hold(ncp);
4696 spin_unlock(&pn->neg_spin);
4697
4698 /*
4699 * This can race, so we must re-check that the ncp
4700 * is on the ncneg.list after successfully locking it.
4701 *
4702 * Don't scrap actively referenced ncps. There should be
4703 * 3 refs. The natural ref, one from being on the neg list,
4704 * and one from us.
4705 *
4706 * Recheck fields after successfully locking to ensure
4707 * that it is in fact still on the negative list with no
4708 * extra refs.
4709 *
4710 * WARNING! On the ncneglist scan any race against other
4711 * destructors (zaps or cache_inval_vp_quick() calls)
4712 * will have already unresolved the ncp and cause
4713 * us to drop instead of zap. This is fine; if
4714 * our drop winds up being the last one it will
4715 * kfree() the ncp.
4716 */
4717 if (_cache_lock_special(ncp) == 0) {
4718 if (ncp->nc_vp == NULL &&
4719 ncp->nc_refs == 3 &&
4720 (ncp->nc_flag & NCF_UNRESOLVED) == 0)
4721 {
4722 ++pcpu_ncache[mycpu->gd_cpuid].clean_neg_count;
4723 cache_zap(ncp);
4724 } else {
4725 _cache_unlock(ncp);
4726 _cache_drop(ncp);
4727 }
4728 } else {
4729 _cache_drop(ncp);
4730 }
4731 --count;
4732 }
4733 }
4734
4735 /*
4736 * Clean out unresolved cache entries when too many have accumulated.
4737 * Resolved cache entries are cleaned out via the vnode reclamation
4738 * mechanism and by _cache_cleanneg().
4739 */
4740 static void
4741 _cache_cleanpos(long ucount, long xcount)
4742 {
4743 static volatile int rover;
4744 struct nchash_head *nchpp;
4745 struct namecache *ncp;
4746 long count;
4747 int rover_copy;
4748
4749 /*
4750 * Don't burn too much cpu looking for stuff
4751 */
4752 count = (ucount > xcount) ? ucount : xcount;
4753 count = count * 4;
4754
4755 /*
4756 * Attempt to clean out the specified number of cache entries.
4757 */
4758 while (count > 0 && (ucount > 0 || xcount > 0)) {
4759 rover_copy = ++rover; /* MPSAFEENOUGH */
4760 cpu_ccfence();
4761 nchpp = NCHHASH(rover_copy);
4762
4763 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4764 --count;
4765 continue;
4766 }
4767
4768 /*
4769 * Get the next ncp
4770 */
4771 spin_lock(&nchpp->spin);
4772 ncp = TAILQ_FIRST(&nchpp->list);
4773
4774 /*
4775 * Skip placeholder ncp's. Do not shift their
4776 * position in the list.
4777 */
4778 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4779 ncp = TAILQ_NEXT(ncp, nc_hash);
4780
4781 if (ncp) {
4782 /*
4783 * Move to end of list
4784 */
4785 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4786 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4787
4788 if (ncp->nc_refs != ncpbaserefs(ncp)) {
4789 /*
4790 * Do not destroy internal nodes that have
4791 * children or nodes which have thread
4792 * references.
4793 */
4794 ncp = NULL;
4795 } else if (ucount > 0 &&
4796 (ncp->nc_flag & NCF_UNRESOLVED))
4797 {
4798 /*
4799 * Destroy unresolved nodes if asked.
4800 */
4801 --ucount;
4802 --xcount;
4803 _cache_hold(ncp);
4804 } else if (xcount > 0) {
4805 /*
4806 * Destroy any other node if asked.
4807 */
4808 --xcount;
4809 _cache_hold(ncp);
4810 } else {
4811 /*
4812 * Otherwise don't
4813 */
4814 ncp = NULL;
4815 }
4816 }
4817 spin_unlock(&nchpp->spin);
4818
4819 /*
4820 * Try to scrap the ncp if we can do so non-blocking.
4821 * We must re-check nc_refs after locking, and it will
4822 * have one additional ref from above.
4823 */
4824 if (ncp) {
4825 if (_cache_lock_special(ncp) == 0) {
4826 if (ncp->nc_refs == 1 + ncpbaserefs(ncp)) {
4827 ++pcpu_ncache[mycpu->gd_cpuid].
4828 clean_pos_count;
4829 cache_zap(ncp);
4830 } else {
4831 _cache_unlock(ncp);
4832 _cache_drop(ncp);
4833 }
4834 } else {
4835 _cache_drop(ncp);
4836 }
4837 }
4838 --count;
4839 }
4840 }
4841
4842 /*
4843 * This is a kitchen sink function to clean out ncps which we
4844 * tried to zap from cache_drop() but failed because we were
4845 * unable to acquire the parent lock.
4846 *
4847 * Such entries can also be removed via cache_inval_vp(), such
4848 * as when unmounting.
4849 */
4850 static void
4851 _cache_cleandefered(void)
4852 {
4853 struct nchash_head *nchpp;
4854 struct namecache *ncp;
4855 struct namecache dummy;
4856 int i;
4857
4858 /*
4859 * Create a list iterator. DUMMY indicates that this is a list
4860 * iterator, DESTROYED prevents matches by lookup functions.
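 *
 * Descriptive note: the dummy entry is inserted into each hash chain
 * and re-inserted after every real entry that is processed, so the
 * chain spinlock can be dropped while an ncp is examined without
 * losing our position in the list.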
4861 */ 4862 numdefered = 0; 4863 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4864 bzero(&dummy, sizeof(dummy)); 4865 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4866 dummy.nc_refs = 1; 4867 4868 for (i = 0; i <= nchash; ++i) { 4869 nchpp = &nchashtbl[i]; 4870 4871 spin_lock(&nchpp->spin); 4872 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4873 ncp = &dummy; 4874 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4875 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4876 continue; 4877 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4878 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4879 _cache_hold(ncp); 4880 spin_unlock(&nchpp->spin); 4881 if (_cache_lock_nonblock(ncp) == 0) { 4882 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4883 _cache_unlock(ncp); 4884 } 4885 _cache_drop(ncp); 4886 spin_lock(&nchpp->spin); 4887 ncp = &dummy; 4888 } 4889 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4890 spin_unlock(&nchpp->spin); 4891 } 4892 } 4893 4894 /* 4895 * Name cache initialization, from vfsinit() when we are booting 4896 */ 4897 void 4898 nchinit(void) 4899 { 4900 struct pcpu_ncache *pn; 4901 globaldata_t gd; 4902 int i; 4903 4904 /* 4905 * Per-cpu accounting and negative hit list 4906 */ 4907 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4908 M_VFSCACHEAUX, M_WAITOK|M_ZERO); 4909 for (i = 0; i < ncpus; ++i) { 4910 pn = &pcpu_ncache[i]; 4911 TAILQ_INIT(&pn->neg_list); 4912 spin_init(&pn->neg_spin, "ncneg"); 4913 spin_init(&pn->umount_spin, "ncumm"); 4914 } 4915 4916 /* 4917 * Initialise per-cpu namecache effectiveness statistics. 4918 */ 4919 for (i = 0; i < ncpus; ++i) { 4920 gd = globaldata_find(i); 4921 gd->gd_nchstats = &nchstats[i]; 4922 } 4923 4924 /* 4925 * Create a generous namecache hash table 4926 */ 4927 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4928 sizeof(struct nchash_head), 4929 M_VFSCACHEAUX, &nchash); 4930 for (i = 0; i <= (int)nchash; ++i) { 4931 TAILQ_INIT(&nchashtbl[i].list); 4932 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4933 } 4934 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4935 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4936 nclockwarn = 5 * hz; 4937 } 4938 4939 /* 4940 * Called from start_init() to bootstrap the root filesystem. Returns 4941 * a referenced, unlocked namecache record to serve as a root or the 4942 * root of the system. 4943 * 4944 * Adjust our namecache counts 4945 */ 4946 void 4947 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 4948 { 4949 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 4950 4951 atomic_add_long(&pn->vfscache_leafs, 1); 4952 atomic_add_long(&pn->vfscache_unres, 1); 4953 4954 nch->ncp = cache_alloc(0); 4955 nch->mount = mp; 4956 _cache_mntref(mp); 4957 if (vp) 4958 _cache_setvp(nch->mount, nch->ncp, vp, 1); 4959 } 4960 4961 /* 4962 * vfs_cache_setroot() 4963 * 4964 * Create an association between the root of our namecache and 4965 * the root vnode. This routine may be called several times during 4966 * booting. 4967 * 4968 * If the caller intends to save the returned namecache pointer somewhere 4969 * it must cache_hold() it. 4970 */ 4971 void 4972 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 4973 { 4974 struct vnode *ovp; 4975 struct nchandle onch; 4976 4977 ovp = rootvnode; 4978 onch = rootnch; 4979 rootvnode = nvp; 4980 if (nch) 4981 rootnch = *nch; 4982 else 4983 cache_zero(&rootnch); 4984 if (ovp) 4985 vrele(ovp); 4986 if (onch.ncp) 4987 cache_drop(&onch); 4988 } 4989 4990 /* 4991 * XXX OLD API COMPAT FUNCTION. 
This really messes up the new namecache
4992 * topology and is being removed as quickly as possible. The new VOP_N*()
4993 * API calls are required to make specific adjustments using the supplied
4994 * ncp pointers rather than just bogusly purging random vnodes.
4995 *
4996 * Invalidate all namecache entries to a particular vnode as well as
4997 * any direct children of that vnode in the namecache. This is a
4998 * 'catch all' purge used by filesystems that do not know any better.
4999 *
5000 * Note that the linkage between the vnode and its namecache entries will
5001 * be removed, but the namecache entries themselves might stay put due to
5002 * active references from elsewhere in the system or due to the existence of
5003 * the children. The namecache topology is left intact even if we do not
5004 * know what the vnode association is. Such entries will be marked
5005 * NCF_UNRESOLVED.
5006 */
5007 void
5008 cache_purge(struct vnode *vp)
5009 {
5010 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
5011 }
5012
5013 __read_mostly static int disablecwd;
5014 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
5015 "Disable getcwd");
5016
5017 /*
5018 * MPALMOSTSAFE
5019 */
5020 int
5021 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap)
5022 {
5023 u_int buflen;
5024 int error;
5025 char *buf;
5026 char *bp;
5027
5028 if (disablecwd)
5029 return (ENODEV);
5030
5031 buflen = uap->buflen;
5032 if (buflen == 0)
5033 return (EINVAL);
5034 if (buflen > MAXPATHLEN)
5035 buflen = MAXPATHLEN;
5036
5037 buf = kmalloc(buflen, M_TEMP, M_WAITOK);
5038 bp = kern_getcwd(buf, buflen, &error);
5039 if (error == 0)
5040 error = copyout(bp, uap->buf, strlen(bp) + 1);
5041 kfree(buf, M_TEMP);
5042 return (error);
5043 }
5044
5045 char *
5046 kern_getcwd(char *buf, size_t buflen, int *error)
5047 {
5048 struct proc *p = curproc;
5049 char *bp;
5050 int i, slash_prefixed;
5051 struct filedesc *fdp;
5052 struct nchandle nch;
5053 struct namecache *ncp;
5054
5055 bp = buf;
5056 bp += buflen - 1;
5057 *bp = '\0';
5058 fdp = p->p_fd;
5059 slash_prefixed = 0;
5060
5061 nch = fdp->fd_ncdir;
5062 ncp = nch.ncp;
5063 if (ncp)
5064 _cache_hold(ncp);
5065
5066 while (ncp && (ncp != fdp->fd_nrdir.ncp ||
5067 nch.mount != fdp->fd_nrdir.mount)
5068 ) {
5069 if (ncp->nc_flag & NCF_DESTROYED) {
5070 _cache_drop(ncp);
5071 ncp = NULL;
5072 break;
5073 }
5074 /*
5075 * While traversing upwards if we encounter the root
5076 * of the current mount we have to skip to the mount point
5077 * in the underlying filesystem.
5078 */
5079 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
5080 nch = nch.mount->mnt_ncmounton;
5081 _cache_drop(ncp);
5082 ncp = nch.ncp;
5083 if (ncp)
5084 _cache_hold(ncp);
5085 continue;
5086 }
5087
5088 /*
5089 * Prepend the path segment
5090 */
5091 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
5092 if (bp == buf) {
5093 *error = ERANGE;
5094 bp = NULL;
5095 goto done;
5096 }
5097 *--bp = ncp->nc_name[i];
5098 }
5099 if (bp == buf) {
5100 *error = ERANGE;
5101 bp = NULL;
5102 goto done;
5103 }
5104 *--bp = '/';
5105 slash_prefixed = 1;
5106
5107 /*
5108 * Go up a directory. This isn't a mount point so we don't
5109 * have to check again.
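 *
 * Descriptive note: nc_parent is sampled unlocked and then re-checked
 * after locking the child below, since a concurrent rename can
 * reparent the entry in between; if it changed we simply retry.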
5110 */ 5111 while ((nch.ncp = ncp->nc_parent) != NULL) { 5112 if (ncp_shared_lock_disable) 5113 _cache_lock(ncp); 5114 else 5115 _cache_lock_shared(ncp); 5116 if (nch.ncp != ncp->nc_parent) { 5117 _cache_unlock(ncp); 5118 continue; 5119 } 5120 _cache_hold(nch.ncp); 5121 _cache_unlock(ncp); 5122 break; 5123 } 5124 _cache_drop(ncp); 5125 ncp = nch.ncp; 5126 } 5127 if (ncp == NULL) { 5128 *error = ENOENT; 5129 bp = NULL; 5130 goto done; 5131 } 5132 if (!slash_prefixed) { 5133 if (bp == buf) { 5134 *error = ERANGE; 5135 bp = NULL; 5136 goto done; 5137 } 5138 *--bp = '/'; 5139 } 5140 *error = 0; 5141 done: 5142 if (ncp) 5143 _cache_drop(ncp); 5144 return (bp); 5145 } 5146 5147 /* 5148 * Thus begins the fullpath magic. 5149 * 5150 * The passed nchp is referenced but not locked. 5151 */ 5152 __read_mostly static int disablefullpath; 5153 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 5154 &disablefullpath, 0, 5155 "Disable fullpath lookups"); 5156 5157 int 5158 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 5159 char **retbuf, char **freebuf, int guess) 5160 { 5161 struct nchandle fd_nrdir; 5162 struct nchandle nch; 5163 struct namecache *ncp; 5164 struct mount *mp, *new_mp; 5165 char *bp, *buf; 5166 int slash_prefixed; 5167 int error = 0; 5168 int i; 5169 5170 *retbuf = NULL; 5171 *freebuf = NULL; 5172 5173 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 5174 bp = buf + MAXPATHLEN - 1; 5175 *bp = '\0'; 5176 if (nchbase) 5177 fd_nrdir = *nchbase; 5178 else if (p != NULL) 5179 fd_nrdir = p->p_fd->fd_nrdir; 5180 else 5181 fd_nrdir = rootnch; 5182 slash_prefixed = 0; 5183 nch = *nchp; 5184 ncp = nch.ncp; 5185 if (ncp) 5186 _cache_hold(ncp); 5187 mp = nch.mount; 5188 5189 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 5190 new_mp = NULL; 5191 5192 /* 5193 * If we are asked to guess the upwards path, we do so whenever 5194 * we encounter an ncp marked as a mountpoint. We try to find 5195 * the actual mountpoint by finding the mountpoint with this 5196 * ncp. 5197 */ 5198 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 5199 new_mp = mount_get_by_nc(ncp); 5200 } 5201 /* 5202 * While traversing upwards if we encounter the root 5203 * of the current mount we have to skip to the mount point. 5204 */ 5205 if (ncp == mp->mnt_ncmountpt.ncp) { 5206 new_mp = mp; 5207 } 5208 if (new_mp) { 5209 nch = new_mp->mnt_ncmounton; 5210 _cache_drop(ncp); 5211 ncp = nch.ncp; 5212 if (ncp) 5213 _cache_hold(ncp); 5214 mp = nch.mount; 5215 continue; 5216 } 5217 5218 /* 5219 * Prepend the path segment 5220 */ 5221 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 5222 if (bp == buf) { 5223 kfree(buf, M_TEMP); 5224 error = ENOMEM; 5225 goto done; 5226 } 5227 *--bp = ncp->nc_name[i]; 5228 } 5229 if (bp == buf) { 5230 kfree(buf, M_TEMP); 5231 error = ENOMEM; 5232 goto done; 5233 } 5234 *--bp = '/'; 5235 slash_prefixed = 1; 5236 5237 /* 5238 * Go up a directory. This isn't a mount point so we don't 5239 * have to check again. 5240 * 5241 * We can only safely access nc_parent with ncp held locked. 
5242 */ 5243 while ((nch.ncp = ncp->nc_parent) != NULL) { 5244 _cache_lock_shared(ncp); 5245 if (nch.ncp != ncp->nc_parent) { 5246 _cache_unlock(ncp); 5247 continue; 5248 } 5249 _cache_hold(nch.ncp); 5250 _cache_unlock(ncp); 5251 break; 5252 } 5253 _cache_drop(ncp); 5254 ncp = nch.ncp; 5255 } 5256 if (ncp == NULL) { 5257 kfree(buf, M_TEMP); 5258 error = ENOENT; 5259 goto done; 5260 } 5261 5262 if (!slash_prefixed) { 5263 if (bp == buf) { 5264 kfree(buf, M_TEMP); 5265 error = ENOMEM; 5266 goto done; 5267 } 5268 *--bp = '/'; 5269 } 5270 *retbuf = bp; 5271 *freebuf = buf; 5272 error = 0; 5273 done: 5274 if (ncp) 5275 _cache_drop(ncp); 5276 return(error); 5277 } 5278 5279 int 5280 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 5281 char **freebuf, int guess) 5282 { 5283 struct namecache *ncp; 5284 struct nchandle nch; 5285 int error; 5286 5287 *freebuf = NULL; 5288 if (disablefullpath) 5289 return (ENODEV); 5290 5291 if (p == NULL) 5292 return (EINVAL); 5293 5294 /* vn is NULL, client wants us to use p->p_textvp */ 5295 if (vn == NULL) { 5296 if ((vn = p->p_textvp) == NULL) 5297 return (EINVAL); 5298 } 5299 spin_lock_shared(&vn->v_spin); 5300 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 5301 if (ncp->nc_nlen) 5302 break; 5303 } 5304 if (ncp == NULL) { 5305 spin_unlock_shared(&vn->v_spin); 5306 return (EINVAL); 5307 } 5308 _cache_hold(ncp); 5309 spin_unlock_shared(&vn->v_spin); 5310 5311 nch.ncp = ncp; 5312 nch.mount = vn->v_mount; 5313 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 5314 _cache_drop(ncp); 5315 return (error); 5316 } 5317 5318 void 5319 vfscache_rollup_cpu(struct globaldata *gd) 5320 { 5321 struct pcpu_ncache *pn; 5322 long count; 5323 5324 if (pcpu_ncache == NULL) 5325 return; 5326 pn = &pcpu_ncache[gd->gd_cpuid]; 5327 5328 /* 5329 * namecache statistics 5330 */ 5331 if (pn->vfscache_count) { 5332 count = atomic_swap_long(&pn->vfscache_count, 0); 5333 atomic_add_long(&vfscache_count, count); 5334 } 5335 if (pn->vfscache_leafs) { 5336 count = atomic_swap_long(&pn->vfscache_leafs, 0); 5337 atomic_add_long(&vfscache_leafs, count); 5338 } 5339 if (pn->vfscache_unres) { 5340 count = atomic_swap_long(&pn->vfscache_unres, 0); 5341 atomic_add_long(&vfscache_unres, count); 5342 } 5343 if (pn->vfscache_negs) { 5344 count = atomic_swap_long(&pn->vfscache_negs, 0); 5345 atomic_add_long(&vfscache_negs, count); 5346 } 5347 5348 /* 5349 * hysteresis based cleanings 5350 */ 5351 if (pn->inv_kid_quick_count) { 5352 count = atomic_swap_long(&pn->inv_kid_quick_count, 0); 5353 atomic_add_long(&inv_kid_quick_count, count); 5354 } 5355 if (pn->inv_ncp_quick_count) { 5356 count = atomic_swap_long(&pn->inv_ncp_quick_count, 0); 5357 atomic_add_long(&inv_ncp_quick_count, count); 5358 } 5359 if (pn->clean_pos_count) { 5360 count = atomic_swap_long(&pn->clean_pos_count, 0); 5361 atomic_add_long(&clean_pos_count, count); 5362 } 5363 if (pn->clean_neg_count) { 5364 count = atomic_swap_long(&pn->clean_neg_count, 0); 5365 atomic_add_long(&clean_neg_count, count); 5366 } 5367 5368 if (pn->numdefered) { 5369 count = atomic_swap_long(&pn->numdefered, 0); 5370 atomic_add_long(&numdefered, count); 5371 } 5372 } 5373
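
/*
 * Usage sketch (illustrative only, not part of the original file): a
 * typical in-kernel consumer of vn_fullpath() releases the buffer via
 * the freebuf pointer rather than retbuf, because retbuf points into
 * the middle of the M_TEMP allocation.  'p' and 'vp' stand in for a
 * caller-supplied process and vnode.
 *
 *	char *retbuf, *freebuf;
 *
 *	if (vn_fullpath(p, vp, &retbuf, &freebuf, 0) == 0) {
 *		kprintf("path: %s\n", retbuf);
 *		kfree(freebuf, M_TEMP);
 *	}
 */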