/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysmsg.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin;
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other-hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (6) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
		  "namecache", "namecache entries");
MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");

TAILQ_HEAD(nchash_list, namecache);
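
/*
 * Illustrative sketch (not part of the build): how a (parent ncp, name)
 * pair maps to a hash chain.  This mirrors the rehash done by
 * cache_rename() further below; the local variables (par, name, nlen)
 * are hypothetical inputs.
 *
 *	struct namecache *ncp;
 *	struct nchash_head *nchpp;
 *	u_int32_t hash;
 *
 *	hash = fnv_32_buf(name, nlen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&par, sizeof(par), hash);
 *	nchpp = NCHHASH(hash);
 *
 *	spin_lock(&nchpp->spin);
 *	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
 *		if (ncp->nc_parent == par && ncp->nc_nlen == nlen &&
 *		    bcmp(ncp->nc_name, name, nlen) == 0)
 *			break;
 *	}
 *	spin_unlock(&nchpp->spin);
 */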

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;		/* 8 bytes */
	long	pad01;			/* 8 bytes */
};

struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	struct mount *mp_target;
	int isneg;
	int ticks;
	int updating;
	int unused01;
};

struct pcpu_ncache {
	struct spinlock		umount_spin;	/* cache_findmount/interlock */
	struct spinlock		neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long			neg_count;
	long			vfscache_negs;
	long			vfscache_count;
	long			vfscache_leafs;
	long			vfscache_unres;
	long			numdefered;
	long			inv_kid_quick_count;
	long			inv_ncp_quick_count;
	long			clean_pos_count;
	long			clean_neg_count;
} __cachealign;

__read_mostly static struct nchash_head	*nchashtbl;
__read_mostly static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code to run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
 */
__read_mostly static int	ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

__read_mostly static u_long nchash;		/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

__read_mostly static int ncnegflush = 10;	/* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

__read_mostly static int ncposflush = 10;	/* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

__read_mostly static int ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

__read_mostly static int ncposfactor = 16;   /* ratio of unres+leaf entries */
SYSCTL_INT(_debug, OID_AUTO, ncposfactor, CTLFLAG_RW, &ncposfactor, 0,
    "Ratio of unresolved leaf namecache entries");

__read_mostly static int nclockwarn;	/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

__read_mostly static int ncposlimit;	/* number of cache entries allocated */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Number of cache entries allocated");

__read_mostly static int ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
    &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

__read_mostly static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
	   &ncmount_cache_enable, 0, "mount point cache");

static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long ucount, long xcount);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of leaf namecache entries");
static long vfscache_unres;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numunres, CTLFLAG_RD, &vfscache_unres, 0,
    "Number of unresolved leaf namecache entries");

static long inv_kid_quick_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_kid_quick_count, CTLFLAG_RD,
    &inv_kid_quick_count, 0,
    "quick kid invalidations");
static long inv_ncp_quick_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_ncp_quick_count, CTLFLAG_RD,
    &inv_ncp_quick_count, 0,
    "quick ncp invalidations");
static long clean_pos_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_pos_count, CTLFLAG_RD,
    &clean_pos_count, 0,
    "positive ncp cleanings");
static long clean_neg_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_neg_count, CTLFLAG_RD,
    &clean_neg_count, 0,
    "negative ncp cleanings");

static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of cache entries allocated");

/*
 * Returns the number of basic references expected on the ncp, not
 * including any children.  1 for the natural ref, and an additional ref
 * if the ncp is resolved (representing a positive or negative hit).
 */
static __inline int
ncpbaserefs(struct namecache *ncp)
{
	return (1 + ((ncp->nc_flag & NCF_UNRESOLVED) == 0));
}

struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
			sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static int cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
 */
#define MNTCACHE_COUNT	32	/* power of 2, multiple of SET */
#define MNTCACHE_SET	8	/* set associativity */

struct mntcache_elm {
	struct namecache *ncp;
	struct mount	 *mp;
	int	ticks;
	int	unused01;
};

struct mntcache {
	struct mntcache_elm array[MNTCACHE_COUNT];
} __cachealign;

static struct mntcache	pcpu_mntcache[MAXCPU];

static __inline
struct mntcache_elm *
_cache_mntcache_hash(void *ptr)
{
	struct mntcache_elm *elm;
	int hv;

	hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
	elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];

	return elm;
}

static
void
_cache_mntref(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mount *mpr;
	int i;

	elm = _cache_mntcache_hash(mp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == mp) {
			mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
			if (__predict_true(mpr == mp))
				return;
			if (mpr)
				atomic_add_int(&mpr->mnt_refs, -1);
		}
		++elm;
	}
	atomic_add_int(&mp->mnt_refs, 1);
}

static
void
_cache_mntrel(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct mount *mpr;
	int delta1;
	int delta2;
	int i;

	elm = _cache_mntcache_hash(mp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == NULL) {
			mpr = atomic_swap_ptr((void *)&elm->mp, mp);
			if (__predict_false(mpr != NULL)) {
				atomic_add_int(&mpr->mnt_refs, -1);
			}
			elm->ticks = ticks;
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	mpr = atomic_swap_ptr((void *)&best->mp, mp);
	best->ticks = ticks;
	if (mpr)
		atomic_add_int(&mpr->mnt_refs, -1);
}

/*
 * Clears all cached mount points on all cpus.  This routine should only
 * be called when we are waiting for a mount to clear, e.g. so we can
 * unmount.
 */
void
cache_clearmntcache(struct mount *target __unused)
{
	int n;

	for (n = 0; n < ncpus; ++n) {
		struct mntcache *cache = &pcpu_mntcache[n];
		struct mntcache_elm *elm;
		struct namecache *ncp;
		struct mount *mp;
		int i;

		for (i = 0; i < MNTCACHE_COUNT; ++i) {
			elm = &cache->array[i];
			if (elm->mp) {
				mp = atomic_swap_ptr((void *)&elm->mp, NULL);
				if (mp)
					atomic_add_int(&mp->mnt_refs, -1);
			}
			if (elm->ncp) {
				ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
				if (ncp)
					_cache_drop(ncp);
			}
		}
	}
}
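
/*
 * Illustrative sketch (not part of the build): the per-cpu mount cache
 * above lets a release park a mount ref in a set-associative slot and a
 * later ref on the same cpu consume it, avoiding an atomic op on mnt_refs
 * in the common case.  A typical ref/rel pair thus reduces to:
 *
 *	_cache_mntref(mp);	* may consume a parked ref, else atomic add *
 *	...use mp...
 *	_cache_mntrel(mp);	* may park the ref, else atomic subtract *
 *
 * Before an unmount can drain mnt_refs, any refs parked on other cpus
 * must be flushed back:
 *
 *	cache_clearmntcache(mp);
 */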

/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  The controlling entity
 * in a 1->0 transition does not need to lock the ncp to dispose of it,
 * as nobody else will have visibility to it at that point.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
static __inline
void
_cache_lock(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Release a previously acquired lock.
 *
 * A concurrent shared-lock acquisition or acquisition/release can
 * race bit 31 so only drop the ncp if bit 31 was set.
 */
static __inline
void
_cache_unlock(struct namecache *ncp)
{
	lockmgr(&ncp->nc_lock, LK_RELEASE);
}

/*
 * Lock ncp exclusively, non-blocking.  Return 0 on success.
 */
static __inline
int
_cache_lock_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static __inline
int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if (lockmgr_oneexcl(&ncp->nc_lock)) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return 0;
		}
		_cache_unlock(ncp);
	}
	return EWOULDBLOCK;
}

/*
 * Shared lock, guarantees vp held
 *
 * The shared lock holds vp on the 0->1 transition.  It is possible to race
 * another shared lock release, preventing the other release from dropping
 * the vnode and clearing bit 31.
 *
 * If it is not set then we are responsible for setting it, and this
 * responsibility does not race with anyone else.
 */
static __inline
void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock_shared: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 */
static __inline
int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This function tries to get a shared lock but will back-off to an
 * exclusive lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 *
 *	    This is very evident in dsynth's initial scans.
 */
static __inline
int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return(0);
			}
		}
		_cache_unlock(ncp);
		return(EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
		_cache_lock(ncp);
		return(0);
	}
	_cache_lock_shared(ncp);
	return(0);
}

/*
 * Returns:
 *	-1	Locked by other
 *	 0	Not locked
 *	(v)	LK_SHARED or LK_EXCLUSIVE
 */
static __inline
int
_cache_lockstatus(struct namecache *ncp)
{
	int status;

	status = lockstatus(&ncp->nc_lock, curthread);
	if (status == LK_EXCLOTHER)
		status = -1;
	return status;
}
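
/*
 * Illustrative sketch (not part of the build): how a lookup-style caller
 * might prefer the shared lock and fall back to an exclusive lock when it
 * turns out the entry must be modified (e.g. re-resolved).  The ncp must
 * already be referenced; the control flow shown here is hypothetical.
 *
 *	if (_cache_lock_shared_special(ncp) == EWOULDBLOCK)
 *		_cache_lock(ncp);		* blocking exclusive *
 *	if ((ncp->nc_flag & NCF_UNRESOLVED) &&
 *	    _cache_lockstatus(ncp) != LK_EXCLUSIVE) {
 *		_cache_unlock(ncp);		* upgrade via relock *
 *		_cache_lock(ncp);
 *	}
 *	...
 *	_cache_unlock(ncp);
 */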

/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already deterministically at least 1, such as being
 * associated with e.g. a process, file descriptor, or some other entity.
 *
 * Only the above situations, similar situations within this module where
 * the ref count is deterministically at least 1, or when the ncp is found
 * via the nchpp (hash table) lookup, can bump nc_refs.
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition can only occur after or because the natural ref
 * is being dropped.  If another thread had a temporary ref during the
 * ncp's destruction, then that other thread might wind up being the
 * one to drop the last ref.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHEAUX);
		kfree_obj(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);

	/*
	 * ncp is a new leaf being added to the tree
	 */
	if (TAILQ_EMPTY(&ncp->nc_list)) {
		atomic_add_long(&pn->vfscache_leafs, 1);
		if (ncp->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, 1);
	}

	if (TAILQ_EMPTY(&par->nc_list)) {
		/*
		 * Parent was, but now is no longer a leaf
		 */
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		if (par->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, -1);
		atomic_add_long(&pn->vfscache_leafs, -1);

		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);			/* add nc_parent ref */
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.  XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 *
 * par must be locked and will remain locked on return.
 *
 * nchpp must be spin-locked.  This routine eats the spin-lock.
 */
static __inline void
_cache_unlink_parent(struct namecache *par, struct namecache *ncp,
		     struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct vnode *dropvp;

	KKASSERT(ncp->nc_parent == par);
	cpu_ccfence();

	/* don't add a ref, we drop the nchpp ref later */

	/*
	 * Remove from hash table and parent, adjust accounting
	 */
	TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
	TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
	atomic_add_long(&pn->vfscache_count, -1);

	/*
	 * Removing leaf from tree
	 */
	if (TAILQ_EMPTY(&ncp->nc_list)) {
		if (ncp->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, -1);
		atomic_add_long(&pn->vfscache_leafs, -1);
	}

	/*
	 * Parent is now a leaf?
	 */
	dropvp = NULL;
	if (TAILQ_EMPTY(&par->nc_list)) {
		if (par->nc_flag & NCF_UNRESOLVED)
			atomic_add_long(&pn->vfscache_unres, 1);
		atomic_add_long(&pn->vfscache_leafs, 1);
		if (par->nc_vp)
			dropvp = par->nc_vp;
	}
	ncp->nc_parent = NULL;
	ncp->nc_head = NULL;
	spin_unlock(&nchpp->spin);
	_cache_drop(par);	/* drop ncp's nc_parent ref from (par) */

	/*
	 * We can only safely vdrop with no spinlocks held.
	 */
	if (dropvp)
		vdrop(dropvp);
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The returned ncp will be locked and referenced.  The ref is generally meant
 * to be transferred to the nchpp linkage.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;		/* natural ref */
	TAILQ_INIT(&ncp->nc_list);
	lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
	lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);

	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 */
static void
_cache_free(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs == 1);
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHEAUX);
	kfree_obj(ncp, M_VFSCACHE);
}
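
/*
 * Illustrative sketch (not part of the build): the typical life cycle of a
 * freshly allocated entry as the functions above intend it.  The ref
 * returned by cache_alloc() is transferred to the nchpp linkage by
 * _cache_link_parent(); the parent is assumed to already be referenced
 * and locked by the caller, and the hash setup is abbreviated.
 *
 *	ncp = cache_alloc(nlen);		* locked + referenced *
 *	bcopy(name, ncp->nc_name, nlen);
 *	_cache_hold(ncp);			* caller's temporary ref *
 *
 *	spin_lock(&nchpp->spin);
 *	_cache_link_parent(ncp, par, nchpp);	* alloc ref -> nchpp linkage *
 *	spin_unlock(&nchpp->spin);
 *
 *	...resolve or use ncp...
 *	_cache_put(ncp);			* unlock + drop caller's ref *
 */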

/*
 * [re]initialize a nchandle.
 */
void
cache_zero(struct nchandle *nch)
{
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Ref and deref a nchandle structure (ncp + mp)
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
	_cache_hold(nch->ncp);
	_cache_mntref(nch->mount);
	return(nch);
}

/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
	struct namecache *ncp;
	struct mount *mp;
	struct mntcache_elm *elm;
	struct namecache *ncpr;
	int i;

	ncp = nch->ncp;
	mp = nch->mount;
	target->ncp = ncp;
	target->mount = mp;

	elm = _cache_mntcache_hash(ncp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == ncp) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
			if (ncpr == ncp) {
				_cache_mntref(mp);
				return;
			}
			if (ncpr)
				_cache_drop(ncpr);
		}
		++elm;
	}
	if (ncp)
		_cache_hold(ncp);
	_cache_mntref(mp);
}

/*
 * Drop the nchandle, but try to cache the ref to avoid global atomic
 * ops.  This is typically done on the system root and jail root nchandles.
 */
void
cache_drop_and_cache(struct nchandle *nch, int elmno)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct namecache *ncpr;
	int delta1;
	int delta2;
	int i;

	if (elmno > 4) {
		if (nch->ncp) {
			_cache_drop(nch->ncp);
			nch->ncp = NULL;
		}
		if (nch->mount) {
			_cache_mntrel(nch->mount);
			nch->mount = NULL;
		}
		return;
	}

	elm = _cache_mntcache_hash(nch->ncp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == NULL) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
			_cache_mntrel(nch->mount);
			elm->ticks = ticks;
			nch->mount = NULL;
			nch->ncp = NULL;
			if (ncpr)
				_cache_drop(ncpr);
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
	_cache_mntrel(nch->mount);
	best->ticks = ticks;
	nch->mount = NULL;
	nch->ncp = NULL;
	if (ncpr)
		_cache_drop(ncpr);
}

void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	_cache_mntref(mp);
	_cache_mntrel(nch->mount);
	nch->mount = mp;
}

void
cache_drop(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Returns:
 *	-1	Locked by other
 *	 0	Not locked
 *	(v)	LK_SHARED or LK_EXCLUSIVE
 */
int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}
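
/*
 * Illustrative sketch (not part of the build): typical nchandle handling
 * around the functions above.  A caller duplicating e.g. a process root
 * handle takes its own refs with cache_copy() and releases them with
 * cache_drop() (or cache_drop_and_cache() for frequently reused roots).
 * The handle source shown here is an example, not a requirement.
 *
 *	struct nchandle nch;
 *
 *	cache_copy(&p->p_fd->fd_nrdir, &nch);	* ref ncp + mount *
 *	...lookup relative to nch...
 *	cache_drop(&nch);			* drop both refs, zero nch *
 */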

/*
 * Returns a shared or exclusive-locked ncp.  The ncp will only be
 * shared-locked if it is already resolved.
 */
void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}

/*
 * Lock fncpd, fncp, tncpd, and tncp.  tncp is already locked but may
 * have to be cycled to avoid deadlocks.  Make sure all four are resolved.
 *
 * The caller is responsible for checking the validity upon return as
 * the records may have been flagged DESTROYED in the interim.
 *
 * Namecache lock ordering is leaf first, then parent.  However, complex
 * interactions may occur between the source and target because there is
 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp).
 */
void
cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp,
		       struct nchandle *tncpd, struct nchandle *tncp,
		       struct ucred *fcred, struct ucred *tcred)
{
	int tlocked = 1;

	/*
	 * Lock tncp and tncpd
	 *
	 * NOTE: Because these ncps are not locked to begin with, it is
	 *	 possible for other rename races to cause the normal lock
	 *	 order assumptions to fail.
	 *
	 * NOTE: Lock ordering assumptions are valid if a leaf's parent
	 *	 matches after the leaf has been locked.  However, ordering
	 *	 between the 'from' and the 'to' is not, and an overlapping
	 *	 lock order reversal is still possible.
	 */
again:
	if (__predict_false(tlocked == 0)) {
		cache_lock(tncp);
	}
	if (__predict_false(cache_lock_nonblock(tncpd) != 0)) {
		cache_unlock(tncp);
		cache_lock(tncpd); cache_unlock(tncpd);	/* cycle */
		tlocked = 0;
		goto again;
	}

	/*
	 * Lock fncp and fncpd
	 *
	 * NOTE: Because these ncps are not locked to begin with, it is
	 *	 possible for other rename races to cause the normal lock
	 *	 order assumptions to fail.
	 *
	 * NOTE: Lock ordering assumptions are valid if a leaf's parent
	 *	 matches after the leaf has been locked.  However, ordering
	 *	 between the 'from' and the 'to' is not, and an overlapping
	 *	 lock order reversal is still possible.
	 */
	if (__predict_false(cache_lock_nonblock(fncp) != 0)) {
		cache_unlock(tncpd);
		cache_unlock(tncp);
		cache_lock(fncp); cache_unlock(fncp);	/* cycle */
		tlocked = 0;
		goto again;
	}
	if (__predict_false(cache_lock_nonblock(fncpd) != 0)) {
		cache_unlock(fncp);
		cache_unlock(tncpd);
		cache_unlock(tncp);
		cache_lock(fncpd); cache_unlock(fncpd);	/* cycle */
		tlocked = 0;
		goto again;
	}
	if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(fncpd, fcred);
	if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(tncpd, tcred);
	if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(fncp, fcred);
	if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0))
		cache_resolve(tncp, tcred);
}

int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return(ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
 */
static
struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED))
	{
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			_cache_unlock(ncp);
			ncp = _cache_get(ncp);
			_cache_drop(ncp);
		}
	} else {
		_cache_unlock(ncp);
		ncp = _cache_get(ncp);
		_cache_drop(ncp);
	}
	return(ncp);
}
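
/*
 * Illustrative sketch (not part of the build): the ref-and-lock /
 * unlock-and-deref pairing used by nlookup-style consumers of the public
 * wrappers below.  Error handling is omitted and 'cred' is a placeholder.
 *
 *	struct nchandle nch;		* source handle, already refd *
 *	struct nchandle locked;
 *
 *	cache_get(&nch, &locked);	* ref + lock, validates nc_vp *
 *	if (locked.ncp->nc_flag & NCF_UNRESOLVED)
 *		cache_resolve(&locked, cred);
 *	...use locked.ncp->nc_vp...
 *	cache_put(&locked);		* unlock + drop ref + mount rel *
 */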

/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	_cache_mntref(target->mount);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	_cache_mntref(target->mount);
}

/*
 * Release a held and locked ncp
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

void
cache_put(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
		 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
		 ncp->nc_vp == NULL);

	if (vp) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		++vp->v_namecache_count;
		_cache_hold(ncp);		/* v_namecache assoc */
		spin_unlock(&vp->v_spin);
		vhold(vp);			/* nc_vp */

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}

		ncp->nc_error = 0;

		/*
		 * XXX: this is a hack to work-around the lack of a real pfs
		 * vfs implementation
		 */
		if (mp) {
			if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
				vp->v_pfsmp = mp;
		}
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		ncp->nc_vp = NULL;
		ncp->nc_negcpu = mycpu->gd_cpuid;
		spin_lock(&pn->neg_spin);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);	/* neg_list assoc */
		++pn->neg_count;
		spin_unlock(&pn->neg_spin);
		atomic_add_long(&pn->vfscache_negs, 1);

		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}

	/*
	 * Previously unresolved leaf is now resolved.
	 *
	 * Clear the NCF_UNRESOLVED flag last (see cache_nlookup_nonlocked()).
	 */
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_unres, -1);
	cpu_sfence();
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * Used for NFS
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		struct pcpu_ncache *pn;

		/*
		 * Is a resolved leaf now becoming unresolved?
		 */
		if (TAILQ_EMPTY(&ncp->nc_list)) {
			pn = &pcpu_ncache[mycpu->gd_cpuid];
			atomic_add_long(&pn->vfscache_unres, 1);
		}

		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			--vp->v_namecache_count;
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with an ncp
			 * is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			vdrop(vp);
		} else {
			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
		_cache_drop(ncp);	/* from v_namecache or neg_list */
	}
}
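
/*
 * Illustrative sketch (not part of the build): the intended caller-side
 * pairing for cache_setvp().  A resolver that finds the name associates
 * the vnode; a clean miss creates a negative entry.  The lookup step and
 * the error variable are hypothetical placeholders.
 *
 *	error = ...VFS-specific lookup of nch->ncp->nc_name...;
 *	if (error == 0) {
 *		cache_setvp(nch, vp);	* positive hit, vhold()s vp *
 *		vrele(vp);		* namecache holds, does not vref *
 *	} else if (error == ENOENT) {
 *		cache_setvp(nch, NULL);	* negative hit, nc_error = ENOENT *
 *	}
 */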

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
			       MNTSCAN_NOUNLOCK);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			cache_zap(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}

	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.  Then lock the kid and check for
			 * races.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			_cache_lock(kid);
			if (kid->nc_parent != ncp) {
				kprintf("cache_inval_internal "
					"restartB %s\n",
					ncp->nc_name);
				restart = 1;
				_cache_unlock(kid);
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {

				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				/*_cache_unlock(kid);*/
				/*_cache_drop(kid);*/
				cache_zap(kid);
			} else {
				cache_zap(kid);
			}

			/*
			 * Relock parent to continue scan
			 */
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
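
/*
 * Illustrative sketch (not part of the build): how a VFS typically
 * invalidates the topology when it removes a directory entry.  The
 * nchandle is assumed to be locked by the caller, as the namespace VOPs
 * are handed locked nchandles.
 *
 *	* the underlying object is gone from the media: destroy the entry
 *	* and force all children to be re-resolved (they will then fail).
 *	cache_inval(nch, CINV_DESTROY | CINV_CHILDREN);
 *
 *	* only the vnode association is stale (e.g. a remote FS noticed a
 *	* change): unresolve the vnode's linkages so later lookups redo it.
 *	cache_inval_vp(vp, 0);
 */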

/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);

	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);

	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * Attempt to quickly invalidate the vnode's namecache entry.  This function
 * will also dive the ncp and free its children but only if they are trivial.
 * All locks are non-blocking and the function will fail if required locks
 * cannot be obtained.
 *
 * We want this sort of function to be able to guarantee progress when vnlru
 * wants to recycle a vnode.  Directories could otherwise get stuck and not
 * be able to recycle due to destroyed or unresolved children in the
 * namecache.
 */
void
cache_inval_vp_quick(struct vnode *vp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct namecache *ncp;
	struct namecache *kid;

	spin_lock(&vp->v_spin);
	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
		_cache_hold(ncp);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			return;
		}

		/*
		 * Try to trivially destroy any children.
		 */
		while ((kid = TAILQ_FIRST(&ncp->nc_list)) != NULL) {
			struct nchash_head *nchpp;

			/*
			 * Early test without the lock.  Give-up if the
			 * child has children of its own, the child is
			 * positively-resolved, or the ref-count is
			 * unexpected.
			 */
			if (TAILQ_FIRST(&kid->nc_list) ||
			    kid->nc_vp ||
			    kid->nc_refs != ncpbaserefs(kid))
			{
				_cache_put(ncp);
				return;
			}

			_cache_hold(kid);
			if (_cache_lock_nonblock(kid)) {
				_cache_drop(kid);
				_cache_put(ncp);
				return;
			}

			/*
			 * A destruction/free test requires the parent,
			 * the kid, and the hash table to be locked.  Note
			 * that the kid may still be on the negative cache
			 * list.
			 */
			nchpp = kid->nc_head;
			spin_lock(&nchpp->spin);

			/*
			 * Give up if the child isn't trivial.  It can be
			 * resolved or unresolved but must not have a vp.
			 */
			if (kid->nc_parent != ncp ||
			    kid->nc_vp ||
			    TAILQ_FIRST(&kid->nc_list) ||
			    kid->nc_refs != 1 + ncpbaserefs(kid))
			{
				spin_unlock(&nchpp->spin);
				_cache_put(kid);
				_cache_put(ncp);
				return;
			}

			++pn->inv_kid_quick_count;

			/*
			 * We can safely destroy the kid.  It may still
It may still 1937 * have extra refs due to ncneglist races, but since 1938 * we checked above with the lock held those races 1939 * will self-resolve. 1940 * 1941 * With these actions the kid should nominally 1942 * have just its natural ref plus our ref. 1943 * 1944 * This is only safe because we hold locks on 1945 * the parent, the kid, and the nchpp. The only 1946 * lock we don't have is on the ncneglist and that 1947 * can race a ref, but as long as we unresolve the 1948 * kid before executing our final drop the ncneglist 1949 * code path(s) will just drop their own ref so all 1950 * is good. 1951 */ 1952 _cache_unlink_parent(ncp, kid, nchpp); 1953 _cache_setunresolved(kid); 1954 if (kid->nc_refs != 2) { 1955 kprintf("Warning: kid %p unexpected refs=%d " 1956 "%08x %s\n", 1957 kid, kid->nc_refs, 1958 kid->nc_flag, kid->nc_name); 1959 } 1960 _cache_put(kid); /* drop our ref and lock */ 1961 _cache_drop(kid); /* drop natural ref to destroy */ 1962 } 1963 1964 /* 1965 * Now check ncp itself against our expectations. With 1966 * no children left we have our ref plus whether it is 1967 * resolved or not (which it has to be, actually, since it 1968 * is hanging off the vp->v_namecache). 1969 */ 1970 if (ncp->nc_refs != 1 + ncpbaserefs(ncp)) { 1971 _cache_put(ncp); 1972 spin_lock(&vp->v_spin); 1973 break; 1974 } 1975 1976 ++pn->inv_ncp_quick_count; 1977 1978 /* 1979 * Success, disassociate and release the ncp. Do not 1980 * try to zap it here. 1981 * 1982 * NOTE: Releasing the ncp here leaves it in the tree, 1983 * but since we have disassociated the vnode this 1984 * ncp entry becomes 'trivial' and successive calls 1985 * to cache_inval_vp_quick() will be able to continue 1986 * to make progress. 1987 */ 1988 _cache_setunresolved(ncp); 1989 _cache_put(ncp); 1990 spin_lock(&vp->v_spin); 1991 } 1992 spin_unlock(&vp->v_spin); 1993 } 1994 1995 /* 1996 * Clears the universal directory search 'ok' flag. This flag allows 1997 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1998 * so clearing it simply forces revalidation. 1999 */ 2000 void 2001 cache_inval_wxok(struct vnode *vp) 2002 { 2003 struct namecache *ncp; 2004 2005 spin_lock(&vp->v_spin); 2006 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 2007 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 2008 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 2009 } 2010 spin_unlock(&vp->v_spin); 2011 } 2012 2013 /* 2014 * The source ncp has been renamed to the target ncp. All elements have been 2015 * locked, including the parent ncp's. 2016 * 2017 * The target ncp is destroyed (as a normal rename-over would destroy the 2018 * target file or directory). 2019 * 2020 * Because there may be references to the source ncp we cannot copy its 2021 * contents to the target. Instead the source ncp is relinked as the target 2022 * and the target ncp is removed from the namecache topology. 
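 *
 * Minimal call sketch (illustrative only; in practice the rename path
 * arranges these locks itself and this is not meant to be open-coded):
 *
 *	... fnch.ncp, tnch.ncp and both parent ncp's locked ...
 *	cache_rename(&fnch, &tnch);
 *	... fnch.ncp now carries the target name and position,
 *	    tnch.ncp has been unlinked from the topology ...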
2023 */ 2024 void 2025 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 2026 { 2027 struct namecache *fncp = fnch->ncp; 2028 struct namecache *tncp = tnch->ncp; 2029 struct namecache *par; 2030 struct nchash_head *nchpp; 2031 u_int32_t hash; 2032 char *oname; 2033 char *nname; 2034 2035 ++fncp->nc_generation; 2036 ++tncp->nc_generation; 2037 if (tncp->nc_nlen) { 2038 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK); 2039 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 2040 nname[tncp->nc_nlen] = 0; 2041 } else { 2042 nname = NULL; 2043 } 2044 2045 /* 2046 * Rename fncp (unlink) 2047 */ 2048 if (fncp->nc_parent) { 2049 par = fncp->nc_parent; 2050 _cache_hold(par); 2051 _cache_lock(par); 2052 nchpp = fncp->nc_head; 2053 spin_lock(&nchpp->spin); 2054 _cache_unlink_parent(par, fncp, nchpp); /* eats nchpp */ 2055 _cache_put(par); 2056 } else { 2057 par = NULL; 2058 nchpp = NULL; 2059 } 2060 oname = fncp->nc_name; 2061 fncp->nc_name = nname; 2062 fncp->nc_nlen = tncp->nc_nlen; 2063 if (oname) 2064 kfree(oname, M_VFSCACHEAUX); 2065 2066 par = tncp->nc_parent; 2067 KKASSERT(par->nc_lock.lk_lockholder == curthread); 2068 2069 /* 2070 * Rename fncp (relink) 2071 */ 2072 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 2073 hash = fnv_32_buf(&par, sizeof(par), hash); 2074 nchpp = NCHHASH(hash); 2075 2076 spin_lock(&nchpp->spin); 2077 _cache_link_parent(fncp, par, nchpp); 2078 spin_unlock(&nchpp->spin); 2079 2080 /* 2081 * Get rid of the overwritten tncp (unlink) 2082 */ 2083 _cache_unlink(tncp); 2084 } 2085 2086 /* 2087 * Perform actions consistent with unlinking a file. The passed-in ncp 2088 * must be locked. 2089 * 2090 * The ncp is marked DESTROYED so it no longer shows up in searches, 2091 * and will be physically deleted when the vnode goes away. 2092 * 2093 * If the related vnode has no refs then we cycle it through vget()/vput() 2094 * to (possibly if we don't have a ref race) trigger a deactivation, 2095 * allowing the VFS to trivially detect and recycle the deleted vnode 2096 * via VOP_INACTIVE(). 2097 * 2098 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 2099 * target ncp. 2100 */ 2101 void 2102 cache_unlink(struct nchandle *nch) 2103 { 2104 _cache_unlink(nch->ncp); 2105 } 2106 2107 static void 2108 _cache_unlink(struct namecache *ncp) 2109 { 2110 struct vnode *vp; 2111 2112 /* 2113 * Causes lookups to fail and allows another ncp with the same 2114 * name to be created under ncp->nc_parent. 2115 */ 2116 ncp->nc_flag |= NCF_DESTROYED; 2117 ++ncp->nc_generation; 2118 2119 /* 2120 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 2121 * force action on the 1->0 transition. Do not destroy the 2122 * vp association if a vp is present (leave the destroyed ncp 2123 * resolved through the vp finalization). 2124 * 2125 * Cleanup the refs in the resolved-not-found case by setting 2126 * the ncp to an unresolved state. This improves our ability 2127 * to get rid of dead ncp elements in other cache_*() routines. 2128 */ 2129 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 2130 vp = ncp->nc_vp; 2131 if (vp) { 2132 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 2133 if (VREFCNT(vp) <= 0) { 2134 if (vget(vp, LK_SHARED) == 0) 2135 vput(vp); 2136 } 2137 } else { 2138 _cache_setunresolved(ncp); 2139 } 2140 } 2141 } 2142 2143 /* 2144 * Return non-zero if the nch might be associated with an open and/or mmap()'d 2145 * file. The easy solution is to just return non-zero if the vnode has refs. 
2146 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 2147 * force the reclaim). 2148 */ 2149 int 2150 cache_isopen(struct nchandle *nch) 2151 { 2152 struct vnode *vp; 2153 struct namecache *ncp = nch->ncp; 2154 2155 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2156 (vp = ncp->nc_vp) != NULL && 2157 VREFCNT(vp)) { 2158 return 1; 2159 } 2160 return 0; 2161 } 2162 2163 2164 /* 2165 * vget the vnode associated with the namecache entry. Resolve the namecache 2166 * entry if necessary. The passed ncp must be referenced and locked. If 2167 * the ncp is resolved it might be locked shared. 2168 * 2169 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 2170 * (depending on the passed lk_type) will be returned in *vpp with an error 2171 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2172 * most typical error is ENOENT, meaning that the ncp represents a negative 2173 * cache hit and there is no vnode to retrieve, but other errors can occur 2174 * too. 2175 * 2176 * The vget() can race a reclaim. If this occurs we re-resolve the 2177 * namecache entry. 2178 * 2179 * There are numerous places in the kernel where vget() is called on a 2180 * vnode while one or more of its namecache entries is locked. Releasing 2181 * a vnode never deadlocks against locked namecache entries (the vnode 2182 * will not get recycled while referenced ncp's exist). This means we 2183 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2184 * lock when acquiring the vp lock or we might cause a deadlock. 2185 * 2186 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2187 * unresolved. If a reclaim race occurs the passed-in ncp will be 2188 * relocked exclusively before being re-resolved. 2189 */ 2190 int 2191 cache_vget(struct nchandle *nch, struct ucred *cred, 2192 int lk_type, struct vnode **vpp) 2193 { 2194 struct namecache *ncp; 2195 struct vnode *vp; 2196 int error; 2197 2198 ncp = nch->ncp; 2199 again: 2200 vp = NULL; 2201 if (ncp->nc_flag & NCF_UNRESOLVED) 2202 error = cache_resolve(nch, cred); 2203 else 2204 error = 0; 2205 2206 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2207 error = vget(vp, lk_type); 2208 if (error) { 2209 /* 2210 * VRECLAIM race 2211 * 2212 * The ncp may have been locked shared, we must relock 2213 * it exclusively before we can set it to unresolved. 2214 */ 2215 if (error == ENOENT) { 2216 kprintf("Warning: vnode reclaim race detected " 2217 "in cache_vget on %p (%s)\n", 2218 vp, ncp->nc_name); 2219 _cache_unlock(ncp); 2220 _cache_lock(ncp); 2221 _cache_setunresolved(ncp); 2222 goto again; 2223 } 2224 2225 /* 2226 * Not a reclaim race, some other error. 2227 */ 2228 KKASSERT(ncp->nc_vp == vp); 2229 vp = NULL; 2230 } else { 2231 KKASSERT(ncp->nc_vp == vp); 2232 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2233 } 2234 } 2235 if (error == 0 && vp == NULL) 2236 error = ENOENT; 2237 *vpp = vp; 2238 return(error); 2239 } 2240 2241 /* 2242 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode 2243 * is already held by virtuue of the ncp being locked, but it might not be 2244 * referenced and while it is not referenced it can transition into the 2245 * VRECLAIMED state. 2246 * 2247 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2248 * unresolved. If a reclaim race occurs the passed-in ncp will be 2249 * relocked exclusively before being re-resolved. 
2250 * 2251 * NOTE: At the moment we have to issue a vget() on the vnode, even though 2252 * we are going to immediately release the lock, in order to resolve 2253 * potential reclamation races. Once we have a solid vnode ref that 2254 * was (at some point) interlocked via a vget(), the vnode will not 2255 * be reclaimed. 2256 * 2257 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation. 2258 */ 2259 int 2260 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2261 { 2262 struct namecache *ncp; 2263 struct vnode *vp; 2264 int error; 2265 int v; 2266 2267 ncp = nch->ncp; 2268 again: 2269 vp = NULL; 2270 if (ncp->nc_flag & NCF_UNRESOLVED) 2271 error = cache_resolve(nch, cred); 2272 else 2273 error = 0; 2274 2275 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2276 /* 2277 * Try a lockless ref of the vnode. VRECLAIMED transitions 2278 * use the vx_lock state and update-counter mechanism so we 2279 * can detect if one is in-progress or occurred. 2280 * 2281 * If we can successfully ref the vnode and interlock against 2282 * the update-counter mechanism, and VRECLAIMED is found to 2283 * not be set after that, we should be good. 2284 */ 2285 v = spin_access_start_only(&vp->v_spin); 2286 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2287 vref_special(vp); 2288 if (__predict_false( 2289 spin_access_end_only(&vp->v_spin, v))) { 2290 vrele(vp); 2291 continue; 2292 } 2293 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2294 break; 2295 } 2296 vrele(vp); 2297 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2298 } 2299 2300 /* 2301 * Do it the slow way 2302 */ 2303 error = vget(vp, LK_SHARED); 2304 if (error) { 2305 /* 2306 * VRECLAIM race 2307 */ 2308 if (error == ENOENT) { 2309 kprintf("Warning: vnode reclaim race detected " 2310 "in cache_vget on %p (%s)\n", 2311 vp, ncp->nc_name); 2312 _cache_unlock(ncp); 2313 _cache_lock(ncp); 2314 _cache_setunresolved(ncp); 2315 goto again; 2316 } 2317 2318 /* 2319 * Not a reclaim race, some other error. 2320 */ 2321 KKASSERT(ncp->nc_vp == vp); 2322 vp = NULL; 2323 } else { 2324 KKASSERT(ncp->nc_vp == vp); 2325 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2326 /* caller does not want a lock */ 2327 vn_unlock(vp); 2328 } 2329 break; 2330 } 2331 if (error == 0 && vp == NULL) 2332 error = ENOENT; 2333 *vpp = vp; 2334 2335 return(error); 2336 } 2337 2338 /* 2339 * Return a referenced vnode representing the parent directory of 2340 * ncp. 2341 * 2342 * Because the caller has locked the ncp it should not be possible for 2343 * the parent ncp to go away. However, the parent can unresolve its 2344 * dvp at any time so we must be able to acquire a lock on the parent 2345 * to safely access nc_vp. 2346 * 2347 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2348 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2349 * getting destroyed. 2350 * 2351 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2352 * lock on the ncp in question.. 
2353 */ 2354 struct vnode * 2355 cache_dvpref(struct namecache *ncp) 2356 { 2357 struct namecache *par; 2358 struct vnode *dvp; 2359 2360 dvp = NULL; 2361 if ((par = ncp->nc_parent) != NULL) { 2362 _cache_hold(par); 2363 _cache_lock(par); 2364 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2365 if ((dvp = par->nc_vp) != NULL) 2366 vhold(dvp); 2367 } 2368 _cache_unlock(par); 2369 if (dvp) { 2370 if (vget(dvp, LK_SHARED) == 0) { 2371 vn_unlock(dvp); 2372 vdrop(dvp); 2373 /* return refd, unlocked dvp */ 2374 } else { 2375 vdrop(dvp); 2376 dvp = NULL; 2377 } 2378 } 2379 _cache_drop(par); 2380 } 2381 return(dvp); 2382 } 2383 2384 /* 2385 * Convert a directory vnode to a namecache record without any other 2386 * knowledge of the topology. This ONLY works with directory vnodes and 2387 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2388 * returned ncp (if not NULL) will be held and unlocked. 2389 * 2390 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2391 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2392 * for dvp. This will fail only if the directory has been deleted out from 2393 * under the caller. 2394 * 2395 * Callers must always check for a NULL return no matter the value of 'makeit'. 2396 * 2397 * To avoid underflowing the kernel stack each recursive call increments 2398 * the makeit variable. 2399 */ 2400 2401 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2402 struct vnode *dvp, char *fakename); 2403 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2404 struct vnode **saved_dvp); 2405 2406 int 2407 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2408 struct nchandle *nch) 2409 { 2410 struct vnode *saved_dvp; 2411 struct vnode *pvp; 2412 char *fakename; 2413 int error; 2414 2415 nch->ncp = NULL; 2416 nch->mount = dvp->v_mount; 2417 saved_dvp = NULL; 2418 fakename = NULL; 2419 2420 /* 2421 * Handle the makeit == 0 degenerate case 2422 */ 2423 if (makeit == 0) { 2424 spin_lock_shared(&dvp->v_spin); 2425 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2426 if (nch->ncp) 2427 cache_hold(nch); 2428 spin_unlock_shared(&dvp->v_spin); 2429 } 2430 2431 /* 2432 * Loop until resolution, inside code will break out on error. 2433 */ 2434 while (makeit) { 2435 /* 2436 * Break out if we successfully acquire a working ncp. 2437 */ 2438 spin_lock_shared(&dvp->v_spin); 2439 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2440 if (nch->ncp) { 2441 cache_hold(nch); 2442 spin_unlock_shared(&dvp->v_spin); 2443 break; 2444 } 2445 spin_unlock_shared(&dvp->v_spin); 2446 2447 /* 2448 * If dvp is the root of its filesystem it should already 2449 * have a namecache pointer associated with it as a side 2450 * effect of the mount, but it may have been disassociated. 2451 */ 2452 if (dvp->v_flag & VROOT) { 2453 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2454 error = cache_resolve_mp(nch->mount); 2455 _cache_put(nch->ncp); 2456 if (ncvp_debug & 1) { 2457 kprintf("cache_fromdvp: resolve root of " 2458 "mount %p error %d", 2459 dvp->v_mount, error); 2460 } 2461 if (error) { 2462 if (ncvp_debug & 1) 2463 kprintf(" failed\n"); 2464 nch->ncp = NULL; 2465 break; 2466 } 2467 if (ncvp_debug & 1) 2468 kprintf(" succeeded\n"); 2469 continue; 2470 } 2471 2472 /* 2473 * If we are recursed too deeply resort to an O(n^2) 2474 * algorithm to resolve the namecache topology. 
The 2475 * resolved pvp is left referenced in saved_dvp to 2476 * prevent the tree from being destroyed while we loop. 2477 */ 2478 if (makeit > 20) { 2479 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2480 if (error) { 2481 kprintf("lookupdotdot(longpath) failed %d " 2482 "dvp %p\n", error, dvp); 2483 nch->ncp = NULL; 2484 break; 2485 } 2486 continue; 2487 } 2488 2489 /* 2490 * Get the parent directory and resolve its ncp. 2491 */ 2492 if (fakename) { 2493 kfree(fakename, M_TEMP); 2494 fakename = NULL; 2495 } 2496 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2497 &fakename); 2498 if (error) { 2499 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2500 break; 2501 } 2502 vn_unlock(pvp); 2503 2504 /* 2505 * Reuse makeit as a recursion depth counter. On success 2506 * nch will be fully referenced. 2507 */ 2508 cache_fromdvp(pvp, cred, makeit + 1, nch); 2509 vrele(pvp); 2510 if (nch->ncp == NULL) 2511 break; 2512 2513 /* 2514 * Do an inefficient scan of pvp (embodied by ncp) to look 2515 * for dvp. This will create a namecache record for dvp on 2516 * success. We loop up to recheck on success. 2517 * 2518 * ncp and dvp are both held but not locked. 2519 */ 2520 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2521 if (error) { 2522 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2523 pvp, nch->ncp->nc_name, dvp); 2524 cache_drop(nch); 2525 /* nch was NULLed out, reload mount */ 2526 nch->mount = dvp->v_mount; 2527 break; 2528 } 2529 if (ncvp_debug & 1) { 2530 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2531 pvp, nch->ncp->nc_name); 2532 } 2533 cache_drop(nch); 2534 /* nch was NULLed out, reload mount */ 2535 nch->mount = dvp->v_mount; 2536 } 2537 2538 /* 2539 * If nch->ncp is non-NULL it will have been held already. 2540 */ 2541 if (fakename) 2542 kfree(fakename, M_TEMP); 2543 if (saved_dvp) 2544 vrele(saved_dvp); 2545 if (nch->ncp) 2546 return (0); 2547 return (EINVAL); 2548 } 2549 2550 /* 2551 * Go up the chain of parent directories until we find something 2552 * we can resolve into the namecache. This is very inefficient. 2553 */ 2554 static 2555 int 2556 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2557 struct vnode **saved_dvp) 2558 { 2559 struct nchandle nch; 2560 struct vnode *pvp; 2561 int error; 2562 static time_t last_fromdvp_report; 2563 char *fakename; 2564 2565 /* 2566 * Loop getting the parent directory vnode until we get something we 2567 * can resolve in the namecache. 
2568 */ 2569 vref(dvp); 2570 nch.mount = dvp->v_mount; 2571 nch.ncp = NULL; 2572 fakename = NULL; 2573 2574 for (;;) { 2575 if (fakename) { 2576 kfree(fakename, M_TEMP); 2577 fakename = NULL; 2578 } 2579 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2580 &fakename); 2581 if (error) { 2582 vrele(dvp); 2583 break; 2584 } 2585 vn_unlock(pvp); 2586 spin_lock_shared(&pvp->v_spin); 2587 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2588 _cache_hold(nch.ncp); 2589 spin_unlock_shared(&pvp->v_spin); 2590 vrele(pvp); 2591 break; 2592 } 2593 spin_unlock_shared(&pvp->v_spin); 2594 if (pvp->v_flag & VROOT) { 2595 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2596 error = cache_resolve_mp(nch.mount); 2597 _cache_unlock(nch.ncp); 2598 vrele(pvp); 2599 if (error) { 2600 _cache_drop(nch.ncp); 2601 nch.ncp = NULL; 2602 vrele(dvp); 2603 } 2604 break; 2605 } 2606 vrele(dvp); 2607 dvp = pvp; 2608 } 2609 if (error == 0) { 2610 if (last_fromdvp_report != time_uptime) { 2611 last_fromdvp_report = time_uptime; 2612 kprintf("Warning: extremely inefficient path " 2613 "resolution on %s\n", 2614 nch.ncp->nc_name); 2615 } 2616 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2617 2618 /* 2619 * Hopefully dvp now has a namecache record associated with 2620 * it. Leave it referenced to prevent the kernel from 2621 * recycling the vnode. Otherwise extremely long directory 2622 * paths could result in endless recycling. 2623 */ 2624 if (*saved_dvp) 2625 vrele(*saved_dvp); 2626 *saved_dvp = dvp; 2627 _cache_drop(nch.ncp); 2628 } 2629 if (fakename) 2630 kfree(fakename, M_TEMP); 2631 return (error); 2632 } 2633 2634 /* 2635 * Do an inefficient scan of the directory represented by ncp looking for 2636 * the directory vnode dvp. ncp must be held but not locked on entry and 2637 * will be held on return. dvp must be refd but not locked on entry and 2638 * will remain refd on return. 2639 * 2640 * Why do this at all? Well, due to its stateless nature the NFS server 2641 * converts file handles directly to vnodes without necessarily going through 2642 * the namecache ops that would otherwise create the namecache topology 2643 * leading to the vnode. We could either (1) Change the namecache algorithms 2644 * to allow disconnect namecache records that are re-merged opportunistically, 2645 * or (2) Make the NFS server backtrack and scan to recover a connected 2646 * namecache topology in order to then be able to issue new API lookups. 2647 * 2648 * It turns out that (1) is a huge mess. It takes a nice clean set of 2649 * namecache algorithms and introduces a lot of complication in every subsystem 2650 * that calls into the namecache to deal with the re-merge case, especially 2651 * since we are using the namecache to placehold negative lookups and the 2652 * vnode might not be immediately assigned. (2) is certainly far less 2653 * efficient then (1), but since we are only talking about directories here 2654 * (which are likely to remain cached), the case does not actually run all 2655 * that often and has the supreme advantage of not polluting the namecache 2656 * algorithms. 2657 * 2658 * If a fakename is supplied just construct a namecache entry using the 2659 * fake name. 
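 *
 * Call pattern sketch (mirrors the use in cache_fromdvp() above, shown
 * here only for clarity):
 *
 *	... nch->ncp held but not locked, dvp ref'd but not locked ...
 *	error = cache_inefficient_scan(nch, cred, dvp, fakename);
 *	... on success a namecache record for dvp now exists under nch ...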
2660 */ 2661 static int 2662 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2663 struct vnode *dvp, char *fakename) 2664 { 2665 struct nlcomponent nlc; 2666 struct nchandle rncp; 2667 struct dirent *den; 2668 struct vnode *pvp; 2669 struct vattr vat; 2670 struct iovec iov; 2671 struct uio uio; 2672 int blksize; 2673 int eofflag; 2674 int bytes; 2675 char *rbuf; 2676 int error; 2677 2678 vat.va_blocksize = 0; 2679 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2680 return (error); 2681 cache_lock(nch); 2682 error = cache_vref(nch, cred, &pvp); 2683 cache_unlock(nch); 2684 if (error) 2685 return (error); 2686 if (ncvp_debug & 1) { 2687 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2688 "vattr fileid = %lld\n", 2689 nch->ncp, nch->ncp->nc_name, 2690 vat.va_blocksize, 2691 (long long)vat.va_fileid); 2692 } 2693 2694 /* 2695 * Use the supplied fakename if not NULL. Fake names are typically 2696 * not in the actual filesystem hierarchy. This is used by HAMMER 2697 * to glue @@timestamp recursions together. 2698 */ 2699 if (fakename) { 2700 nlc.nlc_nameptr = fakename; 2701 nlc.nlc_namelen = strlen(fakename); 2702 rncp = cache_nlookup(nch, &nlc); 2703 goto done; 2704 } 2705 2706 if ((blksize = vat.va_blocksize) == 0) 2707 blksize = DEV_BSIZE; 2708 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2709 rncp.ncp = NULL; 2710 2711 eofflag = 0; 2712 uio.uio_offset = 0; 2713 again: 2714 iov.iov_base = rbuf; 2715 iov.iov_len = blksize; 2716 uio.uio_iov = &iov; 2717 uio.uio_iovcnt = 1; 2718 uio.uio_resid = blksize; 2719 uio.uio_segflg = UIO_SYSSPACE; 2720 uio.uio_rw = UIO_READ; 2721 uio.uio_td = curthread; 2722 2723 if (ncvp_debug & 2) 2724 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2725 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2726 if (error == 0) { 2727 den = (struct dirent *)rbuf; 2728 bytes = blksize - uio.uio_resid; 2729 2730 while (bytes > 0) { 2731 if (ncvp_debug & 2) { 2732 kprintf("cache_inefficient_scan: %*.*s\n", 2733 den->d_namlen, den->d_namlen, 2734 den->d_name); 2735 } 2736 if (den->d_type != DT_WHT && 2737 den->d_ino == vat.va_fileid) { 2738 if (ncvp_debug & 1) { 2739 kprintf("cache_inefficient_scan: " 2740 "MATCHED inode %lld path %s/%*.*s\n", 2741 (long long)vat.va_fileid, 2742 nch->ncp->nc_name, 2743 den->d_namlen, den->d_namlen, 2744 den->d_name); 2745 } 2746 nlc.nlc_nameptr = den->d_name; 2747 nlc.nlc_namelen = den->d_namlen; 2748 rncp = cache_nlookup(nch, &nlc); 2749 KKASSERT(rncp.ncp != NULL); 2750 break; 2751 } 2752 bytes -= _DIRENT_DIRSIZ(den); 2753 den = _DIRENT_NEXT(den); 2754 } 2755 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2756 goto again; 2757 } 2758 kfree(rbuf, M_TEMP); 2759 done: 2760 vrele(pvp); 2761 if (rncp.ncp) { 2762 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2763 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2764 if (ncvp_debug & 2) { 2765 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2766 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2767 } 2768 } else { 2769 if (ncvp_debug & 2) { 2770 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2771 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2772 rncp.ncp->nc_vp); 2773 } 2774 } 2775 if (rncp.ncp->nc_vp == NULL) 2776 error = rncp.ncp->nc_error; 2777 /* 2778 * Release rncp after a successful nlookup. rncp was fully 2779 * referenced. 
2780 */ 2781 cache_put(&rncp); 2782 } else { 2783 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2784 dvp, nch->ncp->nc_name); 2785 error = ENOENT; 2786 } 2787 return (error); 2788 } 2789 2790 /* 2791 * This function must be called with the ncp held and locked and will unlock 2792 * and drop it during zapping. 2793 * 2794 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2795 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2796 * and removes the related reference. If the ncp can be removed, and the 2797 * parent can be zapped non-blocking, this function loops up. 2798 * 2799 * There will be one ref from the caller (which we now own). The only 2800 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2801 * so possibly 2 refs left. Taking this into account, if there are no 2802 * additional refs and no children, the ncp will be removed from the topology 2803 * and destroyed. 2804 * 2805 * References and/or children may exist if the ncp is in the middle of the 2806 * topology, preventing the ncp from being destroyed. 2807 * 2808 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2809 * 2810 * This function may return a held (but NOT locked) parent node which the 2811 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2812 * due to deep namecache trees. 2813 * 2814 * WARNING! For MPSAFE operation this routine must acquire up to three 2815 * spin locks to be able to safely test nc_refs. Lock order is 2816 * very important. 2817 * 2818 * hash spinlock if on hash list 2819 * parent spinlock if child of parent 2820 * (the ncp is unresolved so there is no vnode association) 2821 */ 2822 static int 2823 cache_zap(struct namecache *ncp) 2824 { 2825 struct namecache *par; 2826 struct nchash_head *nchpp; 2827 int refcmp; 2828 int nonblock = 1; /* XXX cleanup */ 2829 int res = 0; 2830 2831 again: 2832 /* 2833 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2834 * This gets rid of any vp->v_namecache list or negative list and 2835 * the related ref. 2836 */ 2837 _cache_setunresolved(ncp); 2838 2839 /* 2840 * Try to scrap the entry and possibly tail-recurse on its parent. 2841 * We only scrap unref'd (other then our ref) unresolved entries, 2842 * we do not scrap 'live' entries. 2843 * 2844 * If nc_parent is non NULL we expect 2 references, else just 1. 2845 * If there are more, someone else also holds the ncp and we cannot 2846 * destroy it. 2847 */ 2848 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2849 KKASSERT(ncp->nc_refs > 0); 2850 2851 /* 2852 * If the ncp is linked to its parent it will also be in the hash 2853 * table. We have to be able to lock the parent and the hash table. 2854 * 2855 * Acquire locks. Note that the parent can't go away while we hold 2856 * a child locked. If nc_parent is present, expect 2 refs instead 2857 * of 1. 
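 *
 * The code below therefore takes the parent ncp lock first (via the
 * non-blocking path, deferring the zap if that fails) and only then
 * acquires the hash chain spin lock.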
2858 */ 2859 nchpp = NULL; 2860 if ((par = ncp->nc_parent) != NULL) { 2861 if (nonblock) { 2862 if (_cache_lock_nonblock(par)) { 2863 /* lock failed */ 2864 ncp->nc_flag |= NCF_DEFEREDZAP; 2865 atomic_add_long( 2866 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2867 1); 2868 _cache_unlock(ncp); 2869 _cache_drop(ncp); /* caller's ref */ 2870 return res; 2871 } 2872 _cache_hold(par); 2873 } else { 2874 _cache_hold(par); 2875 _cache_lock(par); 2876 } 2877 nchpp = ncp->nc_head; 2878 spin_lock(&nchpp->spin); 2879 } 2880 2881 /* 2882 * With the parent and nchpp locked, and the vnode removed 2883 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2884 * more someone else has a ref and we cannot zap the entry. 2885 * 2886 * one for our hold 2887 * one for our parent link (parent also has one from the linkage) 2888 */ 2889 if (par) 2890 refcmp = 2; 2891 else 2892 refcmp = 1; 2893 2894 /* 2895 * On failure undo the work we've done so far and drop the 2896 * caller's ref and ncp. 2897 */ 2898 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2899 if (par) { 2900 spin_unlock(&nchpp->spin); 2901 _cache_put(par); 2902 } 2903 _cache_unlock(ncp); 2904 _cache_drop(ncp); 2905 return res; 2906 } 2907 2908 /* 2909 * We own all the refs and with the spinlocks held no further 2910 * refs can be acquired by others. 2911 * 2912 * Remove us from the hash list and parent list. We have to 2913 * drop a ref on the parent's vp if the parent's list becomes 2914 * empty. 2915 */ 2916 if (par) { 2917 KKASSERT(nchpp == ncp->nc_head); 2918 _cache_unlink_parent(par, ncp, nchpp); /* eats nhcpp */ 2919 /*_cache_unlock(par);*/ 2920 /* &nchpp->spin is unlocked by call */ 2921 } else { 2922 KKASSERT(ncp->nc_head == NULL); 2923 } 2924 2925 /* 2926 * ncp should not have picked up any refs. Physically 2927 * destroy the ncp. 2928 */ 2929 if (ncp->nc_refs != refcmp) { 2930 panic("cache_zap: %p bad refs %d (expected %d)\n", 2931 ncp, ncp->nc_refs, refcmp); 2932 } 2933 /* _cache_unlock(ncp) not required */ 2934 ncp->nc_refs = -1; /* safety */ 2935 if (ncp->nc_name) 2936 kfree(ncp->nc_name, M_VFSCACHEAUX); 2937 kfree_obj(ncp, M_VFSCACHE); 2938 res = 1; 2939 2940 /* 2941 * Loop up if we can recursively clean out the parent. 2942 */ 2943 if (par) { 2944 refcmp = 1; /* ref on parent */ 2945 if (par->nc_parent) /* par->par */ 2946 ++refcmp; 2947 par->nc_flag &= ~NCF_DEFEREDZAP; 2948 if ((par->nc_flag & NCF_UNRESOLVED) && 2949 par->nc_refs == refcmp && 2950 TAILQ_EMPTY(&par->nc_list)) 2951 { 2952 ncp = par; 2953 goto again; 2954 } 2955 _cache_unlock(par); 2956 _cache_drop(par); 2957 } 2958 return 1; 2959 } 2960 2961 /* 2962 * Clean up dangling negative cache and defered-drop entries in the 2963 * namecache. 2964 * 2965 * This routine is called in the critical path and also called from 2966 * vnlru(). When called from vnlru we use a lower limit to try to 2967 * deal with the negative cache before the critical path has to start 2968 * dealing with it. 
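 *
 * Call sketch (illustrative):
 *
 *	cache_hysteresis(1);	... critical path, e.g. cache_nlookup() ...
 *	cache_hysteresis(0);	... from vnlru, uses the lower 8/10 limits ...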
2969 */ 2970 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2971 2972 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2973 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2974 static cache_hs_t exc_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2975 2976 void 2977 cache_hysteresis(int critpath) 2978 { 2979 long poslimit; 2980 long exclimit; 2981 long neglimit; 2982 long xnumunres; 2983 long xnumleafs; 2984 long clean_neg; 2985 long clean_unres; 2986 long clean_excess; 2987 2988 /* 2989 * Calculate negative ncp limit 2990 */ 2991 neglimit = maxvnodes / ncnegfactor; 2992 if (critpath == 0) 2993 neglimit = neglimit * 8 / 10; 2994 2995 /* 2996 * Don't cache too many negative hits. We use hysteresis to reduce 2997 * the impact on the critical path. 2998 */ 2999 clean_neg = 0; 3000 3001 switch(neg_cache_hysteresis_state[critpath]) { 3002 case CHI_LOW: 3003 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 3004 if (critpath) 3005 clean_neg = ncnegflush; 3006 else 3007 clean_neg = ncnegflush + 3008 vfscache_negs - neglimit; 3009 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 3010 } 3011 break; 3012 case CHI_HIGH: 3013 if (vfscache_negs > MINNEG * 9 / 10 && 3014 vfscache_negs * 9 / 10 > neglimit 3015 ) { 3016 if (critpath) 3017 clean_neg = ncnegflush; 3018 else 3019 clean_neg = ncnegflush + 3020 vfscache_negs * 9 / 10 - 3021 neglimit; 3022 } else { 3023 neg_cache_hysteresis_state[critpath] = CHI_LOW; 3024 } 3025 break; 3026 } 3027 if (clean_neg) 3028 _cache_cleanneg(clean_neg); 3029 3030 /* 3031 * Don't cache too many unresolved elements. We use hysteresis to 3032 * reduce the impact on the critical path. 3033 */ 3034 if ((poslimit = ncposlimit) == 0) 3035 poslimit = maxvnodes / ncposfactor; 3036 if (critpath == 0) 3037 poslimit = poslimit * 8 / 10; 3038 3039 /* 3040 * Number of unresolved leaf elements in the namecache. These 3041 * can build-up for various reasons and may have to be disposed 3042 * of to allow the inactive list to be cleaned out by vnlru_proc() 3043 * 3044 * Collect count 3045 */ 3046 xnumunres = vfscache_unres; 3047 clean_unres = 0; 3048 3049 switch(pos_cache_hysteresis_state[critpath]) { 3050 case CHI_LOW: 3051 if (xnumunres > poslimit && xnumunres > MINPOS) { 3052 if (critpath) 3053 clean_unres = ncposflush; 3054 else 3055 clean_unres = ncposflush + xnumunres - 3056 poslimit; 3057 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 3058 } 3059 break; 3060 case CHI_HIGH: 3061 if (xnumunres > poslimit * 5 / 6 && xnumunres > MINPOS) { 3062 if (critpath) 3063 clean_unres = ncposflush; 3064 else 3065 clean_unres = ncposflush + xnumunres - 3066 poslimit * 5 / 6; 3067 } else { 3068 pos_cache_hysteresis_state[critpath] = CHI_LOW; 3069 } 3070 break; 3071 } 3072 3073 /* 3074 * Excessive positive hits can accumulate due to large numbers of 3075 * hardlinks (the vnode cache will not prevent ncps representing 3076 * hardlinks from growing into infinity). 
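 *
 * For example, with maxvnodes at 100000 the limit computed below comes
 * to 200000 leaf ncps (160000 when called from vnlru with critpath == 0),
 * and once cleaning starts it continues until the count falls back under
 * roughly 5/6 of that limit (subject to the MINPOS floor).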
3077 */ 3078 exclimit = maxvnodes * 2; 3079 if (critpath == 0) 3080 exclimit = exclimit * 8 / 10; 3081 xnumleafs = vfscache_leafs; 3082 clean_excess = 0; 3083 3084 switch(exc_cache_hysteresis_state[critpath]) { 3085 case CHI_LOW: 3086 if (xnumleafs > exclimit && xnumleafs > MINPOS) { 3087 if (critpath) 3088 clean_excess = ncposflush; 3089 else 3090 clean_excess = ncposflush + xnumleafs - 3091 exclimit; 3092 exc_cache_hysteresis_state[critpath] = CHI_HIGH; 3093 } 3094 break; 3095 case CHI_HIGH: 3096 if (xnumleafs > exclimit * 5 / 6 && xnumleafs > MINPOS) { 3097 if (critpath) 3098 clean_excess = ncposflush; 3099 else 3100 clean_excess = ncposflush + xnumleafs - 3101 exclimit * 5 / 6; 3102 } else { 3103 exc_cache_hysteresis_state[critpath] = CHI_LOW; 3104 } 3105 break; 3106 } 3107 3108 if (clean_unres || clean_excess) 3109 _cache_cleanpos(clean_unres, clean_excess); 3110 3111 /* 3112 * Clean out dangling defered-zap ncps which could not be cleanly 3113 * dropped if too many build up. Note that numdefered is 3114 * heuristical. Make sure we are real-time for the current cpu, 3115 * plus the global rollup. 3116 */ 3117 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 3118 _cache_cleandefered(); 3119 } 3120 } 3121 3122 /* 3123 * NEW NAMECACHE LOOKUP API 3124 * 3125 * Lookup an entry in the namecache. The passed par_nch must be referenced 3126 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 3127 * is ALWAYS returned, eve if the supplied component is illegal. 3128 * 3129 * The resulting namecache entry should be returned to the system with 3130 * cache_put() or cache_unlock() + cache_drop(). 3131 * 3132 * namecache locks are recursive but care must be taken to avoid lock order 3133 * reversals (hence why the passed par_nch must be unlocked). Locking 3134 * rules are to order for parent traversals, not for child traversals. 3135 * 3136 * Nobody else will be able to manipulate the associated namespace (e.g. 3137 * create, delete, rename, rename-target) until the caller unlocks the 3138 * entry. 3139 * 3140 * The returned entry will be in one of three states: positive hit (non-null 3141 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 3142 * Unresolved entries must be resolved through the filesystem to associate the 3143 * vnode and/or determine whether a positive or negative hit has occured. 3144 * 3145 * It is not necessary to lock a directory in order to lock namespace under 3146 * that directory. In fact, it is explicitly not allowed to do that. A 3147 * directory is typically only locked when being created, renamed, or 3148 * destroyed. 3149 * 3150 * The directory (par) may be unresolved, in which case any returned child 3151 * will likely also be marked unresolved. Likely but not guarenteed. Since 3152 * the filesystem lookup requires a resolved directory vnode the caller is 3153 * responsible for resolving the namecache chain top-down. This API 3154 * specifically allows whole chains to be created in an unresolved state. 
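 *
 * Typical usage sketch (hypothetical caller; error handling elided,
 * 'name' and 'cred' assumed to be supplied by the caller):
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(&par_nch, &nlc);	... par_nch ref'd, unlocked ...
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	...
 *	cache_put(&nch);			... unlock + drop ...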
3155 */ 3156 struct nchandle 3157 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 3158 { 3159 struct nchandle nch; 3160 struct namecache *ncp; 3161 struct namecache *new_ncp; 3162 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 3163 struct nchash_head *nchpp; 3164 struct mount *mp; 3165 u_int32_t hash; 3166 globaldata_t gd; 3167 int par_locked; 3168 int use_excl; 3169 3170 gd = mycpu; 3171 mp = par_nch->mount; 3172 par_locked = 0; 3173 3174 /* 3175 * This is a good time to call it, no ncp's are locked by 3176 * the caller or us. 3177 */ 3178 cache_hysteresis(1); 3179 3180 /* 3181 * Try to locate an existing entry 3182 */ 3183 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3184 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3185 new_ncp = NULL; 3186 use_excl = 0; 3187 nchpp = NCHHASH(hash); 3188 restart: 3189 rep_ncp = NULL; 3190 if (use_excl) 3191 spin_lock(&nchpp->spin); 3192 else 3193 spin_lock_shared(&nchpp->spin); 3194 3195 /* 3196 * Do a reverse scan to collect any DESTROYED ncps prior to matching 3197 * an existing entry. 3198 */ 3199 TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) { 3200 /* 3201 * Break out if we find a matching entry. Note that 3202 * UNRESOLVED entries may match, but DESTROYED entries 3203 * do not. 3204 * 3205 * We may be able to reuse DESTROYED entries that we come 3206 * across, even if the name does not match, as long as 3207 * nc_nlen is correct and the only hold ref is from the nchpp 3208 * list itself. 3209 */ 3210 if (ncp->nc_parent == par_nch->ncp && 3211 ncp->nc_nlen == nlc->nlc_namelen) { 3212 if (ncp->nc_flag & NCF_DESTROYED) { 3213 if (ncp->nc_refs == 1 && rep_ncp == NULL) 3214 rep_ncp = ncp; 3215 continue; 3216 } 3217 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 3218 continue; 3219 3220 /* 3221 * Matched ncp 3222 */ 3223 _cache_hold(ncp); 3224 if (rep_ncp) 3225 _cache_hold(rep_ncp); 3226 3227 if (use_excl) 3228 spin_unlock(&nchpp->spin); 3229 else 3230 spin_unlock_shared(&nchpp->spin); 3231 3232 if (par_locked) { 3233 _cache_unlock(par_nch->ncp); 3234 par_locked = 0; 3235 } 3236 3237 /* 3238 * Really try to destroy rep_ncp if encountered. 3239 * Various edge cases can build up more than one, 3240 * so loop if we succeed. This isn't perfect, but 3241 * we can't afford to have tons of entries build 3242 * up on a single nhcpp list due to rename-over 3243 * operations. If that were to happen, the system 3244 * would bog down quickly. 3245 */ 3246 if (rep_ncp) { 3247 if (_cache_lock_nonblock(rep_ncp) == 0) { 3248 if (rep_ncp->nc_flag & NCF_DESTROYED) { 3249 if (cache_zap(rep_ncp)) { 3250 _cache_drop(ncp); 3251 goto restart; 3252 } 3253 } else { 3254 _cache_unlock(rep_ncp); 3255 _cache_drop(rep_ncp); 3256 } 3257 } else { 3258 _cache_drop(rep_ncp); 3259 } 3260 } 3261 3262 /* 3263 * Continue processing the matched entry 3264 */ 3265 if (_cache_lock_special(ncp) == 0) { 3266 /* 3267 * Successfully locked but we must re-test 3268 * conditions that might have changed since 3269 * we did not have the lock before. 
3270 */ 3271 if (ncp->nc_parent != par_nch->ncp || 3272 ncp->nc_nlen != nlc->nlc_namelen || 3273 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3274 ncp->nc_nlen) || 3275 (ncp->nc_flag & NCF_DESTROYED)) { 3276 _cache_put(ncp); 3277 goto restart; 3278 } 3279 _cache_auto_unresolve(mp, ncp); 3280 if (new_ncp) { 3281 _cache_free(new_ncp); 3282 new_ncp = NULL; /* safety */ 3283 } 3284 goto found; 3285 } 3286 _cache_get(ncp); /* cycle the lock to block */ 3287 _cache_put(ncp); 3288 _cache_drop(ncp); 3289 goto restart; 3290 } 3291 } 3292 3293 /* 3294 * We failed to locate the entry, try to resurrect a destroyed 3295 * entry that we did find that is already correctly linked into 3296 * nchpp and the parent. We must re-test conditions after 3297 * successfully locking rep_ncp. 3298 * 3299 * This case can occur under heavy loads due to not being able 3300 * to safely lock the parent in cache_zap(). Nominally a repeated 3301 * create/unlink load, but only the namelen needs to match. 3302 * 3303 * An exclusive lock on the nchpp is required to process this case, 3304 * otherwise a race can cause duplicate entries to be created with 3305 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 3306 */ 3307 if (rep_ncp && use_excl) { 3308 if (_cache_lock_nonblock(rep_ncp) == 0) { 3309 _cache_hold(rep_ncp); 3310 if (rep_ncp->nc_parent == par_nch->ncp && 3311 rep_ncp->nc_nlen == nlc->nlc_namelen && 3312 (rep_ncp->nc_flag & NCF_DESTROYED) && 3313 rep_ncp->nc_refs == 2) 3314 { 3315 /* 3316 * Update nc_name. 3317 */ 3318 ncp = rep_ncp; 3319 bcopy(nlc->nlc_nameptr, ncp->nc_name, 3320 nlc->nlc_namelen); 3321 3322 /* 3323 * This takes some care. We must clear the 3324 * NCF_DESTROYED flag before unlocking the 3325 * hash chain so other concurrent searches 3326 * do not skip this element. 3327 * 3328 * We must also unlock the hash chain before 3329 * unresolving the ncp to avoid deadlocks. 3330 * We hold the lock on the ncp so we can safely 3331 * reinitialize nc_flag after that. 3332 */ 3333 ncp->nc_flag &= ~NCF_DESTROYED; 3334 spin_unlock(&nchpp->spin); /* use_excl */ 3335 3336 _cache_setunresolved(ncp); 3337 ncp->nc_flag = NCF_UNRESOLVED; 3338 ncp->nc_error = ENOTCONN; 3339 if (par_locked) { 3340 _cache_unlock(par_nch->ncp); 3341 par_locked = 0; 3342 } 3343 if (new_ncp) { 3344 _cache_free(new_ncp); 3345 new_ncp = NULL; /* safety */ 3346 } 3347 goto found; 3348 } 3349 _cache_put(rep_ncp); 3350 } 3351 } 3352 3353 /* 3354 * Otherwise create a new entry and add it to the cache. The parent 3355 * ncp must also be locked so we can link into it. 3356 * 3357 * We have to relookup after possibly blocking in kmalloc or 3358 * when locking par_nch. 3359 * 3360 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3361 * mount case, in which case nc_name will be NULL. 3362 * 3363 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3364 * a DESTROYED entry, but didn't have an exclusive lock. 3365 * In this situation we do not create a new_ncp. 3366 */ 3367 if (new_ncp == NULL) { 3368 if (use_excl) 3369 spin_unlock(&nchpp->spin); 3370 else 3371 spin_unlock_shared(&nchpp->spin); 3372 if (rep_ncp == NULL) { 3373 new_ncp = cache_alloc(nlc->nlc_namelen); 3374 if (nlc->nlc_namelen) { 3375 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3376 nlc->nlc_namelen); 3377 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3378 } 3379 } 3380 use_excl = 1; 3381 goto restart; 3382 } 3383 3384 /* 3385 * NOTE! The spinlock is held exclusively here because new_ncp 3386 * is non-NULL. 
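 * (new_ncp is only allocated after use_excl has been set to 1 and the
 * lookup restarted, so a shared-locked nchpp cannot reach this point.)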
3387 */ 3388 if (par_locked == 0) { 3389 spin_unlock(&nchpp->spin); 3390 _cache_lock(par_nch->ncp); 3391 par_locked = 1; 3392 goto restart; 3393 } 3394 3395 /* 3396 * Link to parent (requires another ref, the one already in new_ncp 3397 * is what we wil lreturn). 3398 * 3399 * WARNING! We still hold the spinlock. We have to set the hash 3400 * table entry atomically. 3401 */ 3402 ncp = new_ncp; 3403 ++ncp->nc_refs; 3404 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3405 spin_unlock(&nchpp->spin); 3406 _cache_unlock(par_nch->ncp); 3407 /* par_locked = 0 - not used */ 3408 found: 3409 /* 3410 * stats and namecache size management 3411 */ 3412 if (ncp->nc_flag & NCF_UNRESOLVED) 3413 ++gd->gd_nchstats->ncs_miss; 3414 else if (ncp->nc_vp) 3415 ++gd->gd_nchstats->ncs_goodhits; 3416 else 3417 ++gd->gd_nchstats->ncs_neghits; 3418 nch.mount = mp; 3419 nch.ncp = ncp; 3420 _cache_mntref(nch.mount); 3421 3422 return(nch); 3423 } 3424 3425 /* 3426 * Attempt to lookup a namecache entry and return with a shared namecache 3427 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3428 * set or we are unable to lock. 3429 */ 3430 int 3431 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3432 struct nlcomponent *nlc, 3433 int excl, struct nchandle *res_nch) 3434 { 3435 struct namecache *ncp; 3436 struct nchash_head *nchpp; 3437 struct mount *mp; 3438 u_int32_t hash; 3439 globaldata_t gd; 3440 3441 /* 3442 * If exclusive requested or shared namecache locks are disabled, 3443 * return failure. 3444 */ 3445 if (ncp_shared_lock_disable || excl) 3446 return(EWOULDBLOCK); 3447 3448 gd = mycpu; 3449 mp = par_nch->mount; 3450 3451 /* 3452 * This is a good time to call it, no ncp's are locked by 3453 * the caller or us. 3454 */ 3455 cache_hysteresis(1); 3456 3457 /* 3458 * Try to locate an existing entry 3459 */ 3460 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3461 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3462 nchpp = NCHHASH(hash); 3463 3464 spin_lock_shared(&nchpp->spin); 3465 3466 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3467 /* 3468 * Break out if we find a matching entry. Note that 3469 * UNRESOLVED entries may match, but DESTROYED entries 3470 * do not. 3471 */ 3472 if (ncp->nc_parent == par_nch->ncp && 3473 ncp->nc_nlen == nlc->nlc_namelen && 3474 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3475 (ncp->nc_flag & NCF_DESTROYED) == 0 3476 ) { 3477 _cache_hold(ncp); 3478 spin_unlock_shared(&nchpp->spin); 3479 3480 if (_cache_lock_shared_special(ncp) == 0) { 3481 if (ncp->nc_parent == par_nch->ncp && 3482 ncp->nc_nlen == nlc->nlc_namelen && 3483 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3484 ncp->nc_nlen) == 0 && 3485 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3486 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3487 _cache_auto_unresolve_test(mp, ncp) == 0) 3488 { 3489 goto found; 3490 } 3491 _cache_unlock(ncp); 3492 } 3493 _cache_drop(ncp); 3494 return(EWOULDBLOCK); 3495 } 3496 } 3497 3498 /* 3499 * Failure 3500 */ 3501 spin_unlock_shared(&nchpp->spin); 3502 return(EWOULDBLOCK); 3503 3504 /* 3505 * Success 3506 * 3507 * Note that nc_error might be non-zero (e.g ENOENT). 3508 */ 3509 found: 3510 res_nch->mount = mp; 3511 res_nch->ncp = ncp; 3512 ++gd->gd_nchstats->ncs_goodhits; 3513 _cache_mntref(res_nch->mount); 3514 3515 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3516 return(ncp->nc_error); 3517 } 3518 3519 /* 3520 * This is a non-blocking verison of cache_nlookup() used by 3521 * nfs_readdirplusrpc_uio(). 
It can fail for any reason and 3522 * will return nch.ncp == NULL in that case. 3523 */ 3524 struct nchandle 3525 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3526 { 3527 struct nchandle nch; 3528 struct namecache *ncp; 3529 struct namecache *new_ncp; 3530 struct nchash_head *nchpp; 3531 struct mount *mp; 3532 u_int32_t hash; 3533 globaldata_t gd; 3534 int par_locked; 3535 3536 gd = mycpu; 3537 mp = par_nch->mount; 3538 par_locked = 0; 3539 3540 /* 3541 * Try to locate an existing entry 3542 */ 3543 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3544 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3545 new_ncp = NULL; 3546 nchpp = NCHHASH(hash); 3547 restart: 3548 spin_lock(&nchpp->spin); 3549 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3550 /* 3551 * Break out if we find a matching entry. Note that 3552 * UNRESOLVED entries may match, but DESTROYED entries 3553 * do not. 3554 */ 3555 if (ncp->nc_parent == par_nch->ncp && 3556 ncp->nc_nlen == nlc->nlc_namelen && 3557 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3558 (ncp->nc_flag & NCF_DESTROYED) == 0 3559 ) { 3560 _cache_hold(ncp); 3561 spin_unlock(&nchpp->spin); 3562 if (par_locked) { 3563 _cache_unlock(par_nch->ncp); 3564 par_locked = 0; 3565 } 3566 if (_cache_lock_special(ncp) == 0) { 3567 if (ncp->nc_parent != par_nch->ncp || 3568 ncp->nc_nlen != nlc->nlc_namelen || 3569 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3570 (ncp->nc_flag & NCF_DESTROYED)) { 3571 kprintf("cache_lookup_nonblock: " 3572 "ncp-race %p %*.*s\n", 3573 ncp, 3574 nlc->nlc_namelen, 3575 nlc->nlc_namelen, 3576 nlc->nlc_nameptr); 3577 _cache_unlock(ncp); 3578 _cache_drop(ncp); 3579 goto failed; 3580 } 3581 _cache_auto_unresolve(mp, ncp); 3582 if (new_ncp) { 3583 _cache_free(new_ncp); 3584 new_ncp = NULL; 3585 } 3586 goto found; 3587 } 3588 _cache_drop(ncp); 3589 goto failed; 3590 } 3591 } 3592 3593 /* 3594 * We failed to locate an entry, create a new entry and add it to 3595 * the cache. The parent ncp must also be locked so we 3596 * can link into it. 3597 * 3598 * We have to relookup after possibly blocking in kmalloc or 3599 * when locking par_nch. 3600 * 3601 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3602 * mount case, in which case nc_name will be NULL. 3603 */ 3604 if (new_ncp == NULL) { 3605 spin_unlock(&nchpp->spin); 3606 new_ncp = cache_alloc(nlc->nlc_namelen); 3607 if (nlc->nlc_namelen) { 3608 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3609 nlc->nlc_namelen); 3610 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3611 } 3612 goto restart; 3613 } 3614 if (par_locked == 0) { 3615 spin_unlock(&nchpp->spin); 3616 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3617 par_locked = 1; 3618 goto restart; 3619 } 3620 goto failed; 3621 } 3622 3623 /* 3624 * Link to parent (requires another ref, the one already in new_ncp 3625 * is what we wil lreturn). 3626 * 3627 * WARNING! We still hold the spinlock. We have to set the hash 3628 * table entry atomically. 
3629 */ 3630 ncp = new_ncp; 3631 ++ncp->nc_refs; 3632 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3633 spin_unlock(&nchpp->spin); 3634 _cache_unlock(par_nch->ncp); 3635 /* par_locked = 0 - not used */ 3636 found: 3637 /* 3638 * stats and namecache size management 3639 */ 3640 if (ncp->nc_flag & NCF_UNRESOLVED) 3641 ++gd->gd_nchstats->ncs_miss; 3642 else if (ncp->nc_vp) 3643 ++gd->gd_nchstats->ncs_goodhits; 3644 else 3645 ++gd->gd_nchstats->ncs_neghits; 3646 nch.mount = mp; 3647 nch.ncp = ncp; 3648 _cache_mntref(nch.mount); 3649 3650 return(nch); 3651 failed: 3652 if (new_ncp) { 3653 _cache_free(new_ncp); 3654 new_ncp = NULL; 3655 } 3656 nch.mount = NULL; 3657 nch.ncp = NULL; 3658 return(nch); 3659 } 3660 3661 /* 3662 * This is a non-locking optimized lookup that depends on adding a ref 3663 * to prevent normal eviction. nch.ncp can be returned as NULL for any 3664 * reason and the caller will retry with normal locking in that case. 3665 * 3666 * This function only returns resolved entries so callers do not accidentally 3667 * race doing out of order / unfenced field checks. 3668 * 3669 * The caller must validate the result for parent-to-child continuity. 3670 */ 3671 struct nchandle 3672 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3673 { 3674 struct nchandle nch; 3675 struct namecache *ncp; 3676 struct nchash_head *nchpp; 3677 struct mount *mp; 3678 u_int32_t hash; 3679 globaldata_t gd; 3680 3681 gd = mycpu; 3682 mp = par_nch->mount; 3683 3684 /* 3685 * Try to locate an existing entry 3686 */ 3687 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3688 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3689 nchpp = NCHHASH(hash); 3690 3691 spin_lock_shared(&nchpp->spin); 3692 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3693 /* 3694 * Break out if we find a matching entry. Note that 3695 * UNRESOLVED entries may match, but DESTROYED entries 3696 * do not. However, UNRESOLVED entries still return failure. 3697 */ 3698 if (ncp->nc_parent == par_nch->ncp && 3699 ncp->nc_nlen == nlc->nlc_namelen && 3700 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3701 (ncp->nc_flag & NCF_DESTROYED) == 0 3702 ) { 3703 /* 3704 * Test NFS timeout for auto-unresolve. Give up if 3705 * the entry is not resolved. 3706 * 3707 * Getting the ref with the nchpp locked prevents 3708 * any transition to NCF_DESTROYED. 3709 */ 3710 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3711 break; 3712 if (ncp->nc_flag & NCF_UNRESOLVED) 3713 break; 3714 _cache_hold(ncp); 3715 spin_unlock_shared(&nchpp->spin); 3716 3717 /* 3718 * We need an additional test to ensure that the ref 3719 * we got above prevents transitions to NCF_UNRESOLVED. 3720 * This can occur if another thread is currently 3721 * holding the ncp exclusively locked or (if we raced 3722 * that and it unlocked before our test) the flag 3723 * has been set. 3724 */ 3725 if (_cache_lockstatus(ncp) < 0 || 3726 (ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED))) 3727 { 3728 if ((ncvp_debug & 4) && 3729 (ncp->nc_flag & 3730 (NCF_DESTROYED | NCF_UNRESOLVED))) 3731 { 3732 kprintf("ncp state change: %p %08x %d %s\n", 3733 ncp, ncp->nc_flag, ncp->nc_error, 3734 ncp->nc_name); 3735 } 3736 _cache_drop(ncp); 3737 spin_lock_shared(&nchpp->spin); 3738 break; 3739 } 3740 3741 /* 3742 * Return the ncp bundled into a nch on success. 3743 * The ref should passively prevent the ncp from 3744 * becoming unresolved without having to hold a lock. 
3745 * (XXX this may not be entirely true) 3746 */ 3747 goto found; 3748 } 3749 } 3750 spin_unlock_shared(&nchpp->spin); 3751 nch.mount = NULL; 3752 nch.ncp = NULL; 3753 3754 return nch; 3755 found: 3756 /* 3757 * stats and namecache size management 3758 */ 3759 if (ncp->nc_flag & NCF_UNRESOLVED) 3760 ++gd->gd_nchstats->ncs_miss; 3761 else if (ncp->nc_vp) 3762 ++gd->gd_nchstats->ncs_goodhits; 3763 else 3764 ++gd->gd_nchstats->ncs_neghits; 3765 nch.mount = mp; 3766 nch.ncp = ncp; 3767 _cache_mntref(nch.mount); 3768 3769 return(nch); 3770 } 3771 3772 /* 3773 * The namecache entry is marked as being used as a mount point. 3774 * Locate the mount if it is visible to the caller. The DragonFly 3775 * mount system allows arbitrary loops in the topology and disentangles 3776 * those loops by matching against (mp, ncp) rather than just (ncp). 3777 * This means any given ncp can dive any number of mounts, depending 3778 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3779 * 3780 * We use a very simple frontend cache to reduce SMP conflicts, 3781 * which we have to do because the mountlist scan needs an exclusive 3782 * lock around its ripout info list. Not to mention that there might 3783 * be a lot of mounts. 3784 * 3785 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3786 * down a bit to allow some contention rather than making the cache 3787 * excessively huge. 3788 * 3789 * The hash table is split into per-cpu areas, is 4-way set-associative. 3790 */ 3791 struct findmount_info { 3792 struct mount *result; 3793 struct mount *nch_mount; 3794 struct namecache *nch_ncp; 3795 }; 3796 3797 static __inline 3798 struct ncmount_cache * 3799 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3800 { 3801 uint32_t hash; 3802 3803 hash = iscsi_crc32(&mp, sizeof(mp)); 3804 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3805 hash ^= hash >> 16; 3806 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3807 3808 return (&ncmount_cache[hash]); 3809 } 3810 3811 static 3812 struct ncmount_cache * 3813 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3814 { 3815 struct ncmount_cache *ncc; 3816 struct ncmount_cache *best; 3817 int delta; 3818 int best_delta; 3819 int i; 3820 3821 ncc = ncmount_cache_lookup4(mp, ncp); 3822 3823 /* 3824 * NOTE: When checking for a ticks overflow implement a slop of 3825 * 2 ticks just to be safe, because ticks is accessed 3826 * non-atomically one CPU can increment it while another 3827 * is still using the old value. 3828 */ 3829 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3830 return ncc; 3831 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3832 if (delta < -2) /* overflow reset */ 3833 ncc->ticks = ticks; 3834 best = ncc; 3835 best_delta = delta; 3836 3837 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3838 ++ncc; 3839 if (ncc->ncp == ncp && ncc->mp == mp) 3840 return ncc; 3841 delta = (int)(ticks - ncc->ticks); 3842 if (delta < -2) 3843 ncc->ticks = ticks; 3844 if (delta > best_delta) { 3845 best_delta = delta; 3846 best = ncc; 3847 } 3848 } 3849 return best; 3850 } 3851 3852 /* 3853 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3854 * doing an expensive mountlist_scan*() if possible. 3855 * 3856 * (mp, ncp) -> mountonpt.k 3857 * 3858 * Returns a referenced mount pointer or NULL 3859 * 3860 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3861 * operations (that is, where the mp_target can be freed out from under us). 
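 *
 * Usage sketch (illustrative): a lookup crossing a mount point would
 * typically do something like
 *
 *	mp = cache_findmount(&nch);
 *	if (mp) {
 *		... descend into the mount represented by mp ...
 *		cache_dropmount(mp);
 *	}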
3862 * 3863 * Lookups use the ncc->updating counter to validate the contents in order 3864 * to avoid having to obtain the per cache-element spin-lock. In addition, 3865 * the ticks field is only updated when it changes. However, if our per-cpu 3866 * lock fails due to an unmount-in-progress, we fall-back to the 3867 * cache-element's spin-lock. 3868 */ 3869 struct mount * 3870 cache_findmount(struct nchandle *nch) 3871 { 3872 struct findmount_info info; 3873 struct ncmount_cache *ncc; 3874 struct ncmount_cache ncc_copy; 3875 struct mount *target; 3876 struct pcpu_ncache *pcpu; 3877 struct spinlock *spinlk; 3878 int update; 3879 3880 pcpu = pcpu_ncache; 3881 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3882 ncc = NULL; 3883 goto skip; 3884 } 3885 pcpu += mycpu->gd_cpuid; 3886 3887 again: 3888 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3889 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3890 found: 3891 /* 3892 * This is a bit messy for now because we do not yet have 3893 * safe disposal of mount structures. We have to ref 3894 * ncc->mp_target but the 'update' counter only tell us 3895 * whether the cache has changed after the fact. 3896 * 3897 * For now get a per-cpu spinlock that will only contend 3898 * against umount's. This is the best path. If it fails, 3899 * instead of waiting on the umount we fall-back to a 3900 * shared ncc->spin lock, which will generally only cost a 3901 * cache ping-pong. 3902 */ 3903 update = ncc->updating; 3904 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3905 spinlk = &pcpu->umount_spin; 3906 } else { 3907 spinlk = &ncc->spin; 3908 spin_lock_shared(spinlk); 3909 } 3910 if (update & 1) { /* update in progress */ 3911 spin_unlock_any(spinlk); 3912 goto skip; 3913 } 3914 ncc_copy = *ncc; 3915 cpu_lfence(); 3916 if (ncc->updating != update) { /* content changed */ 3917 spin_unlock_any(spinlk); 3918 goto again; 3919 } 3920 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3921 spin_unlock_any(spinlk); 3922 goto again; 3923 } 3924 if (ncc_copy.isneg == 0) { 3925 target = ncc_copy.mp_target; 3926 if (target->mnt_ncmounton.mount == nch->mount && 3927 target->mnt_ncmounton.ncp == nch->ncp) { 3928 /* 3929 * Cache hit (positive) (avoid dirtying 3930 * the cache line if possible) 3931 */ 3932 if (ncc->ticks != (int)ticks) 3933 ncc->ticks = (int)ticks; 3934 _cache_mntref(target); 3935 } 3936 } else { 3937 /* 3938 * Cache hit (negative) (avoid dirtying 3939 * the cache line if possible) 3940 */ 3941 if (ncc->ticks != (int)ticks) 3942 ncc->ticks = (int)ticks; 3943 target = NULL; 3944 } 3945 spin_unlock_any(spinlk); 3946 3947 return target; 3948 } 3949 skip: 3950 3951 /* 3952 * Slow 3953 */ 3954 info.result = NULL; 3955 info.nch_mount = nch->mount; 3956 info.nch_ncp = nch->ncp; 3957 mountlist_scan(cache_findmount_callback, &info, 3958 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3959 3960 /* 3961 * To reduce multi-re-entry on the cache, relookup in the cache. 3962 * This can still race, obviously, but that's ok. 3963 */ 3964 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3965 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3966 if (info.result) 3967 atomic_add_int(&info.result->mnt_refs, -1); 3968 goto found; 3969 } 3970 3971 /* 3972 * Cache the result. 
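 *
 * The ncc->updating field is used like a sequence counter: the writer
 * below makes it odd before touching the entry and even again afterwards,
 * with store fences in between, while lookups sample the counter, copy
 * the entry, and retry if the counter was odd or changed.  A minimal
 * sketch of the writer side, mirroring the code that follows:
 *
 *	spin_lock(&ncc->spin);
 *	atomic_add_int_nonlocked(&ncc->updating, 1);	(counter now odd)
 *	cpu_sfence();
 *	... update ncc->mp, ncc->ncp, ncc->mp_target, ncc->isneg ...
 *	cpu_sfence();
 *	atomic_add_int_nonlocked(&ncc->updating, 1);	(counter even again)
 *	spin_unlock(&ncc->spin);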
3973 */ 3974 if ((info.result == NULL || 3975 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3976 spin_lock(&ncc->spin); 3977 atomic_add_int_nonlocked(&ncc->updating, 1); 3978 cpu_sfence(); 3979 KKASSERT(ncc->updating & 1); 3980 if (ncc->mp != nch->mount) { 3981 if (ncc->mp) 3982 atomic_add_int(&ncc->mp->mnt_refs, -1); 3983 atomic_add_int(&nch->mount->mnt_refs, 1); 3984 ncc->mp = nch->mount; 3985 } 3986 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3987 ncc->ticks = (int)ticks; 3988 3989 if (info.result) { 3990 ncc->isneg = 0; 3991 if (ncc->mp_target != info.result) { 3992 if (ncc->mp_target) 3993 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3994 ncc->mp_target = info.result; 3995 atomic_add_int(&info.result->mnt_refs, 1); 3996 } 3997 } else { 3998 ncc->isneg = 1; 3999 if (ncc->mp_target) { 4000 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4001 ncc->mp_target = NULL; 4002 } 4003 } 4004 cpu_sfence(); 4005 atomic_add_int_nonlocked(&ncc->updating, 1); 4006 spin_unlock(&ncc->spin); 4007 } 4008 return(info.result); 4009 } 4010 4011 static 4012 int 4013 cache_findmount_callback(struct mount *mp, void *data) 4014 { 4015 struct findmount_info *info = data; 4016 4017 /* 4018 * Check the mount's mounted-on point against the passed nch. 4019 */ 4020 if (mp->mnt_ncmounton.mount == info->nch_mount && 4021 mp->mnt_ncmounton.ncp == info->nch_ncp 4022 ) { 4023 info->result = mp; 4024 _cache_mntref(mp); 4025 return(-1); 4026 } 4027 return(0); 4028 } 4029 4030 void 4031 cache_dropmount(struct mount *mp) 4032 { 4033 _cache_mntrel(mp); 4034 } 4035 4036 /* 4037 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 4038 * or negative). 4039 * 4040 * A full scan is not required, but for now just do it anyway. 4041 */ 4042 void 4043 cache_ismounting(struct mount *mp) 4044 { 4045 struct ncmount_cache *ncc; 4046 struct mount *ncc_mp; 4047 int i; 4048 4049 if (pcpu_ncache == NULL) 4050 return; 4051 4052 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 4053 ncc = &ncmount_cache[i]; 4054 if (ncc->mp != mp->mnt_ncmounton.mount || 4055 ncc->ncp != mp->mnt_ncmounton.ncp) { 4056 continue; 4057 } 4058 spin_lock(&ncc->spin); 4059 atomic_add_int_nonlocked(&ncc->updating, 1); 4060 cpu_sfence(); 4061 KKASSERT(ncc->updating & 1); 4062 if (ncc->mp != mp->mnt_ncmounton.mount || 4063 ncc->ncp != mp->mnt_ncmounton.ncp) { 4064 cpu_sfence(); 4065 ++ncc->updating; 4066 spin_unlock(&ncc->spin); 4067 continue; 4068 } 4069 ncc_mp = ncc->mp; 4070 ncc->ncp = NULL; 4071 ncc->mp = NULL; 4072 if (ncc_mp) 4073 atomic_add_int(&ncc_mp->mnt_refs, -1); 4074 ncc_mp = ncc->mp_target; 4075 ncc->mp_target = NULL; 4076 if (ncc_mp) 4077 atomic_add_int(&ncc_mp->mnt_refs, -1); 4078 ncc->ticks = (int)ticks - hz * 120; 4079 4080 cpu_sfence(); 4081 atomic_add_int_nonlocked(&ncc->updating, 1); 4082 spin_unlock(&ncc->spin); 4083 } 4084 4085 /* 4086 * Pre-cache the mount point 4087 */ 4088 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 4089 mp->mnt_ncmounton.ncp); 4090 4091 spin_lock(&ncc->spin); 4092 atomic_add_int_nonlocked(&ncc->updating, 1); 4093 cpu_sfence(); 4094 KKASSERT(ncc->updating & 1); 4095 4096 if (ncc->mp) 4097 atomic_add_int(&ncc->mp->mnt_refs, -1); 4098 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 4099 ncc->mp = mp->mnt_ncmounton.mount; 4100 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 4101 ncc->ticks = (int)ticks; 4102 4103 ncc->isneg = 0; 4104 if (ncc->mp_target != mp) { 4105 if (ncc->mp_target) 4106 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 4107 ncc->mp_target = mp; 4108 
atomic_add_int(&mp->mnt_refs, 1); 4109 } 4110 cpu_sfence(); 4111 atomic_add_int_nonlocked(&ncc->updating, 1); 4112 spin_unlock(&ncc->spin); 4113 } 4114 4115 /* 4116 * Scrap any ncmount_cache entries related to mp. Not only do we need to 4117 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 4118 * negative hits involving (mp, <any>). 4119 * 4120 * A full scan is required. 4121 */ 4122 void 4123 cache_unmounting(struct mount *mp) 4124 { 4125 struct ncmount_cache *ncc; 4126 struct pcpu_ncache *pcpu; 4127 struct mount *ncc_mp; 4128 int i; 4129 4130 pcpu = pcpu_ncache; 4131 if (pcpu == NULL) 4132 return; 4133 4134 for (i = 0; i < ncpus; ++i) 4135 spin_lock(&pcpu[i].umount_spin); 4136 4137 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 4138 ncc = &ncmount_cache[i]; 4139 if (ncc->mp != mp && ncc->mp_target != mp) 4140 continue; 4141 spin_lock(&ncc->spin); 4142 atomic_add_int_nonlocked(&ncc->updating, 1); 4143 cpu_sfence(); 4144 4145 if (ncc->mp != mp && ncc->mp_target != mp) { 4146 atomic_add_int_nonlocked(&ncc->updating, 1); 4147 cpu_sfence(); 4148 spin_unlock(&ncc->spin); 4149 continue; 4150 } 4151 ncc_mp = ncc->mp; 4152 ncc->ncp = NULL; 4153 ncc->mp = NULL; 4154 if (ncc_mp) 4155 atomic_add_int(&ncc_mp->mnt_refs, -1); 4156 ncc_mp = ncc->mp_target; 4157 ncc->mp_target = NULL; 4158 if (ncc_mp) 4159 atomic_add_int(&ncc_mp->mnt_refs, -1); 4160 ncc->ticks = (int)ticks - hz * 120; 4161 4162 cpu_sfence(); 4163 atomic_add_int_nonlocked(&ncc->updating, 1); 4164 spin_unlock(&ncc->spin); 4165 } 4166 4167 for (i = 0; i < ncpus; ++i) 4168 spin_unlock(&pcpu[i].umount_spin); 4169 } 4170 4171 /* 4172 * Resolve an unresolved namecache entry, generally by looking it up. 4173 * The passed ncp must be locked and refd. 4174 * 4175 * Theoretically since a vnode cannot be recycled while held, and since 4176 * the nc_parent chain holds its vnode as long as children exist, the 4177 * direct parent of the cache entry we are trying to resolve should 4178 * have a valid vnode. If not then generate an error that we can 4179 * determine is related to a resolver bug. 4180 * 4181 * However, if a vnode was in the middle of a recyclement when the NCP 4182 * got locked, ncp->nc_vp might point to a vnode that is about to become 4183 * invalid. cache_resolve() handles this case by unresolving the entry 4184 * and then re-resolving it. 4185 * 4186 * Note that successful resolution does not necessarily return an error 4187 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 4188 * will be returned. 4189 */ 4190 int 4191 cache_resolve(struct nchandle *nch, struct ucred *cred) 4192 { 4193 struct namecache *par_tmp; 4194 struct namecache *par; 4195 struct namecache *ncp; 4196 struct nchandle nctmp; 4197 struct mount *mp; 4198 struct vnode *dvp; 4199 int error; 4200 4201 ncp = nch->ncp; 4202 mp = nch->mount; 4203 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4204 restart: 4205 /* 4206 * If the ncp is already resolved we have nothing to do. However, 4207 * we do want to guarentee that a usable vnode is returned when 4208 * a vnode is present, so make sure it hasn't been reclaimed. 4209 */ 4210 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4211 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 4212 _cache_setunresolved(ncp); 4213 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 4214 return (ncp->nc_error); 4215 } 4216 4217 /* 4218 * If the ncp was destroyed it will never resolve again. This 4219 * can basically only happen when someone is chdir'd into an 4220 * empty directory which is then rmdir'd. 
We want to catch this 4221 * here and not dive the VFS because the VFS might actually 4222 * have a way to re-resolve the disconnected ncp, which will 4223 * result in inconsistencies in the cdir/nch for proc->p_fd. 4224 */ 4225 if (ncp->nc_flag & NCF_DESTROYED) 4226 return(EINVAL); 4227 4228 /* 4229 * Mount points need special handling because the parent does not 4230 * belong to the same filesystem as the ncp. 4231 */ 4232 if (ncp == mp->mnt_ncmountpt.ncp) 4233 return (cache_resolve_mp(mp)); 4234 4235 /* 4236 * We expect an unbroken chain of ncps to at least the mount point, 4237 * and even all the way to root (but this code doesn't have to go 4238 * past the mount point). 4239 */ 4240 if (ncp->nc_parent == NULL) { 4241 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 4242 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 4243 ncp->nc_error = EXDEV; 4244 return(ncp->nc_error); 4245 } 4246 4247 /* 4248 * The vp's of the parent directories in the chain are held via vhold() 4249 * due to the existance of the child, and should not disappear. 4250 * However, there are cases where they can disappear: 4251 * 4252 * - due to filesystem I/O errors. 4253 * - due to NFS being stupid about tracking the namespace and 4254 * destroys the namespace for entire directories quite often. 4255 * - due to forced unmounts. 4256 * - due to an rmdir (parent will be marked DESTROYED) 4257 * 4258 * When this occurs we have to track the chain backwards and resolve 4259 * it, looping until the resolver catches up to the current node. We 4260 * could recurse here but we might run ourselves out of kernel stack 4261 * so we do it in a more painful manner. This situation really should 4262 * not occur all that often, or if it does not have to go back too 4263 * many nodes to resolve the ncp. 4264 */ 4265 while ((dvp = cache_dvpref(ncp)) == NULL) { 4266 /* 4267 * This case can occur if a process is CD'd into a 4268 * directory which is then rmdir'd. If the parent is marked 4269 * destroyed there is no point trying to resolve it. 4270 */ 4271 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) { 4272 kprintf("nc_parent destroyed: %s/%s\n", 4273 ncp->nc_parent->nc_name, ncp->nc_name); 4274 return(ENOENT); 4275 } 4276 par = ncp->nc_parent; 4277 _cache_hold(par); 4278 _cache_lock(par); 4279 while ((par_tmp = par->nc_parent) != NULL && 4280 par_tmp->nc_vp == NULL) { 4281 _cache_hold(par_tmp); 4282 _cache_lock(par_tmp); 4283 _cache_put(par); 4284 par = par_tmp; 4285 } 4286 if (par->nc_parent == NULL) { 4287 kprintf("EXDEV case 2 %*.*s\n", 4288 par->nc_nlen, par->nc_nlen, par->nc_name); 4289 _cache_put(par); 4290 return (EXDEV); 4291 } 4292 /* 4293 * The parent is not set in stone, ref and lock it to prevent 4294 * it from disappearing. Also note that due to renames it 4295 * is possible for our ncp to move and for par to no longer 4296 * be one of its parents. We resolve it anyway, the loop 4297 * will handle any moves. 
4298 */ 4299 _cache_get(par); /* additional hold/lock */ 4300 _cache_put(par); /* from earlier hold/lock */ 4301 if (par == nch->mount->mnt_ncmountpt.ncp) { 4302 cache_resolve_mp(nch->mount); 4303 } else if ((dvp = cache_dvpref(par)) == NULL) { 4304 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 4305 par->nc_nlen, par->nc_nlen, par->nc_name); 4306 _cache_put(par); 4307 continue; 4308 } else { 4309 if (par->nc_flag & NCF_UNRESOLVED) { 4310 nctmp.mount = mp; 4311 nctmp.ncp = par; 4312 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4313 } 4314 vrele(dvp); 4315 } 4316 if ((error = par->nc_error) != 0) { 4317 if (par->nc_error != EAGAIN) { 4318 kprintf("EXDEV case 3 %*.*s error %d\n", 4319 par->nc_nlen, par->nc_nlen, par->nc_name, 4320 par->nc_error); 4321 _cache_put(par); 4322 return(error); 4323 } 4324 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 4325 par, par->nc_nlen, par->nc_nlen, par->nc_name); 4326 } 4327 _cache_put(par); 4328 /* loop */ 4329 } 4330 4331 /* 4332 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 4333 * ncp's and reattach them. If this occurs the original ncp is marked 4334 * EAGAIN to force a relookup. 4335 * 4336 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 4337 * ncp must already be resolved. 4338 */ 4339 if (dvp) { 4340 nctmp.mount = mp; 4341 nctmp.ncp = ncp; 4342 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4343 vrele(dvp); 4344 } else { 4345 ncp->nc_error = EPERM; 4346 } 4347 4348 if (ncp->nc_error == EAGAIN) { 4349 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 4350 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 4351 goto restart; 4352 } 4353 return(ncp->nc_error); 4354 } 4355 4356 /* 4357 * Resolve the ncp associated with a mount point. Such ncp's almost always 4358 * remain resolved and this routine is rarely called. NFS MPs tends to force 4359 * re-resolution more often due to its mac-truck-smash-the-namecache 4360 * method of tracking namespace changes. 4361 * 4362 * The semantics for this call is that the passed ncp must be locked on 4363 * entry and will be locked on return. However, if we actually have to 4364 * resolve the mount point we temporarily unlock the entry in order to 4365 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 4366 * the unlock we have to recheck the flags after we relock. 4367 */ 4368 static int 4369 cache_resolve_mp(struct mount *mp) 4370 { 4371 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 4372 struct vnode *vp; 4373 int error; 4374 4375 KKASSERT(mp != NULL); 4376 4377 /* 4378 * If the ncp is already resolved we have nothing to do. However, 4379 * we do want to guarentee that a usable vnode is returned when 4380 * a vnode is present, so make sure it hasn't been reclaimed. 4381 */ 4382 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4383 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 4384 _cache_setunresolved(ncp); 4385 } 4386 4387 if (ncp->nc_flag & NCF_UNRESOLVED) { 4388 /* 4389 * ncp must be unlocked across the vfs_busy(), but 4390 * once busied lock ordering is ncp(s), then vnodes, 4391 * so we must relock the ncp before issuing the VFS_ROOT(). 4392 */ 4393 _cache_unlock(ncp); 4394 while (vfs_busy(mp, 0)) 4395 ; 4396 _cache_lock(ncp); 4397 error = VFS_ROOT(mp, &vp); 4398 4399 /* 4400 * recheck the ncp state after relocking. 
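 *
 * The ncp was unlocked across the vfs_busy() call above, so another
 * thread may have resolved (or re-unresolved) it in the interim.  Only
 * install the vnode obtained from VFS_ROOT() if the entry is still
 * marked NCF_UNRESOLVED; otherwise the vnode is simply released.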
4401 */ 4402 if (ncp->nc_flag & NCF_UNRESOLVED) { 4403 ncp->nc_error = error; 4404 if (error == 0) { 4405 _cache_setvp(mp, ncp, vp); 4406 vput(vp); 4407 } else { 4408 kprintf("[diagnostic] cache_resolve_mp: failed" 4409 " to resolve mount %p err=%d ncp=%p\n", 4410 mp, error, ncp); 4411 _cache_setvp(mp, ncp, NULL); 4412 } 4413 } else if (error == 0) { 4414 vput(vp); 4415 } 4416 vfs_unbusy(mp); 4417 } 4418 return(ncp->nc_error); 4419 } 4420 4421 /* 4422 * Resolve the parent vnode 4423 */ 4424 int 4425 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp) 4426 { 4427 struct namecache *par_tmp; 4428 struct namecache *par; 4429 struct namecache *ncp; 4430 struct nchandle nctmp; 4431 struct mount *mp; 4432 struct vnode *dvp; 4433 int error; 4434 4435 *dvpp = NULL; 4436 ncp = nch->ncp; 4437 mp = nch->mount; 4438 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 4439 4440 /* 4441 * Treat this as a mount point even if it has a parent (e.g. 4442 * null-mount). Return a NULL dvp and no error. 4443 */ 4444 if (ncp == mp->mnt_ncmountpt.ncp) 4445 return 0; 4446 4447 /* 4448 * If the ncp was destroyed there is no parent directory, return 4449 * EINVAL. 4450 */ 4451 if (ncp->nc_flag & NCF_DESTROYED) 4452 return(EINVAL); 4453 4454 /* 4455 * No parent if at the root of a filesystem, no error. Typically 4456 * not applicable to null-mounts. This case should have been caught 4457 * in the above ncmountpt check. 4458 */ 4459 if (ncp->nc_parent == NULL) 4460 return 0; 4461 4462 /* 4463 * Resolve the parent dvp. 4464 * 4465 * The vp's of the parent directories in the chain are held via vhold() 4466 * due to the existance of the child, and should not disappear. 4467 * However, there are cases where they can disappear: 4468 * 4469 * - due to filesystem I/O errors. 4470 * - due to NFS being stupid about tracking the namespace and 4471 * destroys the namespace for entire directories quite often. 4472 * - due to forced unmounts. 4473 * - due to an rmdir (parent will be marked DESTROYED) 4474 * 4475 * When this occurs we have to track the chain backwards and resolve 4476 * it, looping until the resolver catches up to the current node. We 4477 * could recurse here but we might run ourselves out of kernel stack 4478 * so we do it in a more painful manner. This situation really should 4479 * not occur all that often, or if it does not have to go back too 4480 * many nodes to resolve the ncp. 4481 */ 4482 while ((dvp = cache_dvpref(ncp)) == NULL) { 4483 /* 4484 * This case can occur if a process is CD'd into a 4485 * directory which is then rmdir'd. If the parent is marked 4486 * destroyed there is no point trying to resolve it. 4487 */ 4488 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 4489 return(ENOENT); 4490 par = ncp->nc_parent; 4491 _cache_hold(par); 4492 _cache_lock(par); 4493 while ((par_tmp = par->nc_parent) != NULL && 4494 par_tmp->nc_vp == NULL) { 4495 _cache_hold(par_tmp); 4496 _cache_lock(par_tmp); 4497 _cache_put(par); 4498 par = par_tmp; 4499 } 4500 if (par->nc_parent == NULL) { 4501 kprintf("EXDEV case 2 %*.*s\n", 4502 par->nc_nlen, par->nc_nlen, par->nc_name); 4503 _cache_put(par); 4504 return (EXDEV); 4505 } 4506 4507 /* 4508 * The parent is not set in stone, ref and lock it to prevent 4509 * it from disappearing. Also note that due to renames it 4510 * is possible for our ncp to move and for par to no longer 4511 * be one of its parents. We resolve it anyway, the loop 4512 * will handle any moves. 
4513 */ 4514 _cache_get(par); /* additional hold/lock */ 4515 _cache_put(par); /* from earlier hold/lock */ 4516 if (par == nch->mount->mnt_ncmountpt.ncp) { 4517 cache_resolve_mp(nch->mount); 4518 } else if ((dvp = cache_dvpref(par)) == NULL) { 4519 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 4520 par->nc_nlen, par->nc_nlen, par->nc_name); 4521 _cache_put(par); 4522 continue; 4523 } else { 4524 if (par->nc_flag & NCF_UNRESOLVED) { 4525 nctmp.mount = mp; 4526 nctmp.ncp = par; 4527 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 4528 } 4529 vrele(dvp); 4530 } 4531 if ((error = par->nc_error) != 0) { 4532 if (par->nc_error != EAGAIN) { 4533 kprintf("EXDEV case 3 %*.*s error %d\n", 4534 par->nc_nlen, par->nc_nlen, par->nc_name, 4535 par->nc_error); 4536 _cache_put(par); 4537 return(error); 4538 } 4539 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 4540 par, par->nc_nlen, par->nc_nlen, par->nc_name); 4541 } 4542 _cache_put(par); 4543 /* loop */ 4544 } 4545 4546 /* 4547 * We have a referenced dvp 4548 */ 4549 *dvpp = dvp; 4550 return 0; 4551 } 4552 4553 /* 4554 * Clean out negative cache entries when too many have accumulated. 4555 */ 4556 static void 4557 _cache_cleanneg(long count) 4558 { 4559 struct pcpu_ncache *pn; 4560 struct namecache *ncp; 4561 static uint32_t neg_rover; 4562 uint32_t n; 4563 long vnegs; 4564 4565 n = neg_rover++; /* SMP heuristical, race ok */ 4566 cpu_ccfence(); 4567 n = n % (uint32_t)ncpus; 4568 4569 /* 4570 * Normalize vfscache_negs and count. count is sometimes based 4571 * on vfscache_negs. vfscache_negs is heuristical and can sometimes 4572 * have crazy values. 4573 */ 4574 vnegs = vfscache_negs; 4575 cpu_ccfence(); 4576 if (vnegs <= MINNEG) 4577 vnegs = MINNEG; 4578 if (count < 1) 4579 count = 1; 4580 4581 pn = &pcpu_ncache[n]; 4582 spin_lock(&pn->neg_spin); 4583 count = pn->neg_count * count / vnegs + 1; 4584 spin_unlock(&pn->neg_spin); 4585 4586 /* 4587 * Attempt to clean out the specified number of negative cache 4588 * entries. 4589 */ 4590 while (count > 0) { 4591 spin_lock(&pn->neg_spin); 4592 ncp = TAILQ_FIRST(&pn->neg_list); 4593 if (ncp == NULL) { 4594 spin_unlock(&pn->neg_spin); 4595 break; 4596 } 4597 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 4598 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 4599 _cache_hold(ncp); 4600 spin_unlock(&pn->neg_spin); 4601 4602 /* 4603 * This can race, so we must re-check that the ncp 4604 * is on the ncneg.list after successfully locking it. 4605 * 4606 * Don't scrap actively referenced ncps. There should be 4607 * 3 refs. The natural ref, one from being on the neg list, 4608 * and one from us. 4609 * 4610 * Recheck fields after successfully locking to ensure 4611 * that it is in-fact still on the negative list with no 4612 * extra refs. 4613 * 4614 * WARNING! On the ncneglist scan any race against other 4615 * destructors (zaps or cache_inval_vp_quick() calls) 4616 * will have already unresolved the ncp and cause 4617 * us to drop instead of zap. This fine, if 4618 * our drop winds up being the last one it will 4619 * kfree() the ncp. 4620 */ 4621 if (_cache_lock_special(ncp) == 0) { 4622 if (ncp->nc_vp == NULL && 4623 ncp->nc_refs == 3 && 4624 (ncp->nc_flag & NCF_UNRESOLVED) == 0) 4625 { 4626 ++pcpu_ncache[mycpu->gd_cpuid].clean_neg_count; 4627 cache_zap(ncp); 4628 } else { 4629 _cache_unlock(ncp); 4630 _cache_drop(ncp); 4631 } 4632 } else { 4633 _cache_drop(ncp); 4634 } 4635 --count; 4636 } 4637 } 4638 4639 /* 4640 * Clean out unresolved cache entries when too many have accumulated. 
4641 * Resolved cache entries are cleaned out via the vnode reclamation 4642 * mechanism and by _cache_cleanneg(). 4643 */ 4644 static void 4645 _cache_cleanpos(long ucount, long xcount) 4646 { 4647 static volatile int rover; 4648 struct nchash_head *nchpp; 4649 struct namecache *ncp; 4650 long count; 4651 int rover_copy; 4652 4653 /* 4654 * Don't burn too much cpu looking for stuff 4655 */ 4656 count = (ucount > xcount) ? ucount : xcount; 4657 count = count * 4; 4658 4659 /* 4660 * Attempt to clean out the specified number of cache entries. 4661 */ 4662 while (count > 0 && (ucount > 0 || xcount > 0)) { 4663 rover_copy = ++rover; /* MPSAFEENOUGH */ 4664 cpu_ccfence(); 4665 nchpp = NCHHASH(rover_copy); 4666 4667 if (TAILQ_FIRST(&nchpp->list) == NULL) { 4668 --count; 4669 continue; 4670 } 4671 4672 /* 4673 * Get the next ncp 4674 */ 4675 spin_lock(&nchpp->spin); 4676 ncp = TAILQ_FIRST(&nchpp->list); 4677 4678 /* 4679 * Skip placeholder ncp's. Do not shift their 4680 * position in the list. 4681 */ 4682 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 4683 ncp = TAILQ_NEXT(ncp, nc_hash); 4684 4685 if (ncp) { 4686 /* 4687 * Move to end of list 4688 */ 4689 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 4690 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 4691 4692 if (ncp->nc_refs != ncpbaserefs(ncp)) { 4693 /* 4694 * Do not destroy internal nodes that have 4695 * children or nodes which have thread 4696 * references. 4697 */ 4698 ncp = NULL; 4699 } else if (ucount > 0 && 4700 (ncp->nc_flag & NCF_UNRESOLVED)) 4701 { 4702 /* 4703 * Destroy unresolved nodes if asked. 4704 */ 4705 --ucount; 4706 --xcount; 4707 _cache_hold(ncp); 4708 } else if (xcount > 0) { 4709 /* 4710 * Destroy any other node if asked. 4711 */ 4712 --xcount; 4713 _cache_hold(ncp); 4714 } else { 4715 /* 4716 * Otherwise don't 4717 */ 4718 ncp = NULL; 4719 } 4720 } 4721 spin_unlock(&nchpp->spin); 4722 4723 /* 4724 * Try to scap the ncp if we can do so non-blocking. 4725 * We must re-check nc_refs after locking, and it will 4726 * have one additional ref from above. 4727 */ 4728 if (ncp) { 4729 if (_cache_lock_special(ncp) == 0) { 4730 if (ncp->nc_refs == 1 + ncpbaserefs(ncp)) { 4731 ++pcpu_ncache[mycpu->gd_cpuid]. 4732 clean_pos_count; 4733 cache_zap(ncp); 4734 } else { 4735 _cache_unlock(ncp); 4736 _cache_drop(ncp); 4737 } 4738 } else { 4739 _cache_drop(ncp); 4740 } 4741 } 4742 --count; 4743 } 4744 } 4745 4746 /* 4747 * This is a kitchen sink function to clean out ncps which we 4748 * tried to zap from cache_drop() but failed because we were 4749 * unable to acquire the parent lock. 4750 * 4751 * Such entries can also be removed via cache_inval_vp(), such 4752 * as when unmounting. 4753 */ 4754 static void 4755 _cache_cleandefered(void) 4756 { 4757 struct nchash_head *nchpp; 4758 struct namecache *ncp; 4759 struct namecache dummy; 4760 int i; 4761 4762 /* 4763 * Create a list iterator. DUMMY indicates that this is a list 4764 * iterator, DESTROYED prevents matches by lookup functions. 
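 *
 * The dummy entry is threaded onto each hash chain as a placeholder so
 * the chain spin lock can be dropped while a real entry is processed.
 * Roughly, mirroring the loop below:
 *
 *	TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
 *	while ((ncp = TAILQ_NEXT(previous position, nc_hash)) != NULL) {
 *		if (not a deferred-zap candidate)
 *			continue;
 *		move &dummy to just after ncp;		(remembers our place)
 *		hold ncp and drop the chain spin lock;
 *		... try to lock ncp and clear NCF_DEFEREDZAP ...
 *		drop ncp, retake the spin lock, resume the scan at &dummy;
 *	}
 *	TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);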
4765 */ 4766 numdefered = 0; 4767 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4768 bzero(&dummy, sizeof(dummy)); 4769 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4770 dummy.nc_refs = 1; 4771 4772 for (i = 0; i <= nchash; ++i) { 4773 nchpp = &nchashtbl[i]; 4774 4775 spin_lock(&nchpp->spin); 4776 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4777 ncp = &dummy; 4778 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4779 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4780 continue; 4781 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4782 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4783 _cache_hold(ncp); 4784 spin_unlock(&nchpp->spin); 4785 if (_cache_lock_nonblock(ncp) == 0) { 4786 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4787 _cache_unlock(ncp); 4788 } 4789 _cache_drop(ncp); 4790 spin_lock(&nchpp->spin); 4791 ncp = &dummy; 4792 } 4793 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4794 spin_unlock(&nchpp->spin); 4795 } 4796 } 4797 4798 /* 4799 * Name cache initialization, from vfsinit() when we are booting 4800 */ 4801 void 4802 nchinit(void) 4803 { 4804 struct pcpu_ncache *pn; 4805 globaldata_t gd; 4806 int i; 4807 4808 /* 4809 * Per-cpu accounting and negative hit list 4810 */ 4811 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4812 M_VFSCACHEAUX, M_WAITOK|M_ZERO); 4813 for (i = 0; i < ncpus; ++i) { 4814 pn = &pcpu_ncache[i]; 4815 TAILQ_INIT(&pn->neg_list); 4816 spin_init(&pn->neg_spin, "ncneg"); 4817 spin_init(&pn->umount_spin, "ncumm"); 4818 } 4819 4820 /* 4821 * Initialise per-cpu namecache effectiveness statistics. 4822 */ 4823 for (i = 0; i < ncpus; ++i) { 4824 gd = globaldata_find(i); 4825 gd->gd_nchstats = &nchstats[i]; 4826 } 4827 4828 /* 4829 * Create a generous namecache hash table 4830 */ 4831 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4832 sizeof(struct nchash_head), 4833 M_VFSCACHEAUX, &nchash); 4834 for (i = 0; i <= (int)nchash; ++i) { 4835 TAILQ_INIT(&nchashtbl[i].list); 4836 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4837 } 4838 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4839 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4840 nclockwarn = 5 * hz; 4841 } 4842 4843 /* 4844 * Called from start_init() to bootstrap the root filesystem. Returns 4845 * a referenced, unlocked namecache record to serve as a root or the 4846 * root of the system. 4847 * 4848 * Adjust our namecache counts 4849 */ 4850 void 4851 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 4852 { 4853 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 4854 4855 atomic_add_long(&pn->vfscache_leafs, 1); 4856 atomic_add_long(&pn->vfscache_unres, 1); 4857 4858 nch->ncp = cache_alloc(0); 4859 nch->mount = mp; 4860 _cache_mntref(mp); 4861 if (vp) 4862 _cache_setvp(nch->mount, nch->ncp, vp); 4863 } 4864 4865 /* 4866 * vfs_cache_setroot() 4867 * 4868 * Create an association between the root of our namecache and 4869 * the root vnode. This routine may be called several times during 4870 * booting. 4871 * 4872 * If the caller intends to save the returned namecache pointer somewhere 4873 * it must cache_hold() it. 4874 */ 4875 void 4876 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 4877 { 4878 struct vnode *ovp; 4879 struct nchandle onch; 4880 4881 ovp = rootvnode; 4882 onch = rootnch; 4883 rootvnode = nvp; 4884 if (nch) 4885 rootnch = *nch; 4886 else 4887 cache_zero(&rootnch); 4888 if (ovp) 4889 vrele(ovp); 4890 if (onch.ncp) 4891 cache_drop(&onch); 4892 } 4893 4894 /* 4895 * XXX OLD API COMPAT FUNCTION. 
This really messes up the new namecache 4896 * topology and is being removed as quickly as possible. The new VOP_N*() 4897 * API calls are required to make specific adjustments using the supplied 4898 * ncp pointers rather then just bogusly purging random vnodes. 4899 * 4900 * Invalidate all namecache entries to a particular vnode as well as 4901 * any direct children of that vnode in the namecache. This is a 4902 * 'catch all' purge used by filesystems that do not know any better. 4903 * 4904 * Note that the linkage between the vnode and its namecache entries will 4905 * be removed, but the namecache entries themselves might stay put due to 4906 * active references from elsewhere in the system or due to the existance of 4907 * the children. The namecache topology is left intact even if we do not 4908 * know what the vnode association is. Such entries will be marked 4909 * NCF_UNRESOLVED. 4910 */ 4911 void 4912 cache_purge(struct vnode *vp) 4913 { 4914 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 4915 } 4916 4917 __read_mostly static int disablecwd; 4918 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 4919 "Disable getcwd"); 4920 4921 /* 4922 * MPALMOSTSAFE 4923 */ 4924 int 4925 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap) 4926 { 4927 u_int buflen; 4928 int error; 4929 char *buf; 4930 char *bp; 4931 4932 if (disablecwd) 4933 return (ENODEV); 4934 4935 buflen = uap->buflen; 4936 if (buflen == 0) 4937 return (EINVAL); 4938 if (buflen > MAXPATHLEN) 4939 buflen = MAXPATHLEN; 4940 4941 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4942 bp = kern_getcwd(buf, buflen, &error); 4943 if (error == 0) 4944 error = copyout(bp, uap->buf, strlen(bp) + 1); 4945 kfree(buf, M_TEMP); 4946 return (error); 4947 } 4948 4949 char * 4950 kern_getcwd(char *buf, size_t buflen, int *error) 4951 { 4952 struct proc *p = curproc; 4953 char *bp; 4954 int i, slash_prefixed; 4955 struct filedesc *fdp; 4956 struct nchandle nch; 4957 struct namecache *ncp; 4958 4959 bp = buf; 4960 bp += buflen - 1; 4961 *bp = '\0'; 4962 fdp = p->p_fd; 4963 slash_prefixed = 0; 4964 4965 nch = fdp->fd_ncdir; 4966 ncp = nch.ncp; 4967 if (ncp) 4968 _cache_hold(ncp); 4969 4970 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4971 nch.mount != fdp->fd_nrdir.mount) 4972 ) { 4973 if (ncp->nc_flag & NCF_DESTROYED) { 4974 _cache_drop(ncp); 4975 ncp = NULL; 4976 break; 4977 } 4978 /* 4979 * While traversing upwards if we encounter the root 4980 * of the current mount we have to skip to the mount point 4981 * in the underlying filesystem. 4982 */ 4983 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4984 nch = nch.mount->mnt_ncmounton; 4985 _cache_drop(ncp); 4986 ncp = nch.ncp; 4987 if (ncp) 4988 _cache_hold(ncp); 4989 continue; 4990 } 4991 4992 /* 4993 * Prepend the path segment 4994 */ 4995 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4996 if (bp == buf) { 4997 *error = ERANGE; 4998 bp = NULL; 4999 goto done; 5000 } 5001 *--bp = ncp->nc_name[i]; 5002 } 5003 if (bp == buf) { 5004 *error = ERANGE; 5005 bp = NULL; 5006 goto done; 5007 } 5008 *--bp = '/'; 5009 slash_prefixed = 1; 5010 5011 /* 5012 * Go up a directory. This isn't a mount point so we don't 5013 * have to check again. 
5014 */ 5015 while ((nch.ncp = ncp->nc_parent) != NULL) { 5016 if (ncp_shared_lock_disable) 5017 _cache_lock(ncp); 5018 else 5019 _cache_lock_shared(ncp); 5020 if (nch.ncp != ncp->nc_parent) { 5021 _cache_unlock(ncp); 5022 continue; 5023 } 5024 _cache_hold(nch.ncp); 5025 _cache_unlock(ncp); 5026 break; 5027 } 5028 _cache_drop(ncp); 5029 ncp = nch.ncp; 5030 } 5031 if (ncp == NULL) { 5032 *error = ENOENT; 5033 bp = NULL; 5034 goto done; 5035 } 5036 if (!slash_prefixed) { 5037 if (bp == buf) { 5038 *error = ERANGE; 5039 bp = NULL; 5040 goto done; 5041 } 5042 *--bp = '/'; 5043 } 5044 *error = 0; 5045 done: 5046 if (ncp) 5047 _cache_drop(ncp); 5048 return (bp); 5049 } 5050 5051 /* 5052 * Thus begins the fullpath magic. 5053 * 5054 * The passed nchp is referenced but not locked. 5055 */ 5056 __read_mostly static int disablefullpath; 5057 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 5058 &disablefullpath, 0, 5059 "Disable fullpath lookups"); 5060 5061 int 5062 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 5063 char **retbuf, char **freebuf, int guess) 5064 { 5065 struct nchandle fd_nrdir; 5066 struct nchandle nch; 5067 struct namecache *ncp; 5068 struct mount *mp, *new_mp; 5069 char *bp, *buf; 5070 int slash_prefixed; 5071 int error = 0; 5072 int i; 5073 5074 *retbuf = NULL; 5075 *freebuf = NULL; 5076 5077 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 5078 bp = buf + MAXPATHLEN - 1; 5079 *bp = '\0'; 5080 if (nchbase) 5081 fd_nrdir = *nchbase; 5082 else if (p != NULL) 5083 fd_nrdir = p->p_fd->fd_nrdir; 5084 else 5085 fd_nrdir = rootnch; 5086 slash_prefixed = 0; 5087 nch = *nchp; 5088 ncp = nch.ncp; 5089 if (ncp) 5090 _cache_hold(ncp); 5091 mp = nch.mount; 5092 5093 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 5094 new_mp = NULL; 5095 5096 /* 5097 * If we are asked to guess the upwards path, we do so whenever 5098 * we encounter an ncp marked as a mountpoint. We try to find 5099 * the actual mountpoint by finding the mountpoint with this 5100 * ncp. 5101 */ 5102 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 5103 new_mp = mount_get_by_nc(ncp); 5104 } 5105 /* 5106 * While traversing upwards if we encounter the root 5107 * of the current mount we have to skip to the mount point. 5108 */ 5109 if (ncp == mp->mnt_ncmountpt.ncp) { 5110 new_mp = mp; 5111 } 5112 if (new_mp) { 5113 nch = new_mp->mnt_ncmounton; 5114 _cache_drop(ncp); 5115 ncp = nch.ncp; 5116 if (ncp) 5117 _cache_hold(ncp); 5118 mp = nch.mount; 5119 continue; 5120 } 5121 5122 /* 5123 * Prepend the path segment 5124 */ 5125 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 5126 if (bp == buf) { 5127 kfree(buf, M_TEMP); 5128 error = ENOMEM; 5129 goto done; 5130 } 5131 *--bp = ncp->nc_name[i]; 5132 } 5133 if (bp == buf) { 5134 kfree(buf, M_TEMP); 5135 error = ENOMEM; 5136 goto done; 5137 } 5138 *--bp = '/'; 5139 slash_prefixed = 1; 5140 5141 /* 5142 * Go up a directory. This isn't a mount point so we don't 5143 * have to check again. 5144 * 5145 * We can only safely access nc_parent with ncp held locked. 
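 *
 * The loop below stabilizes the parent without ever holding two ncp
 * locks at once: sample nc_parent, lock the child, and re-check that
 * the sample is still the parent (a concurrent rename may have moved
 * the entry).  Only then is the parent held and the child unlocked,
 * after which the walk continues one level up.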
5146 */ 5147 while ((nch.ncp = ncp->nc_parent) != NULL) { 5148 _cache_lock_shared(ncp); 5149 if (nch.ncp != ncp->nc_parent) { 5150 _cache_unlock(ncp); 5151 continue; 5152 } 5153 _cache_hold(nch.ncp); 5154 _cache_unlock(ncp); 5155 break; 5156 } 5157 _cache_drop(ncp); 5158 ncp = nch.ncp; 5159 } 5160 if (ncp == NULL) { 5161 kfree(buf, M_TEMP); 5162 error = ENOENT; 5163 goto done; 5164 } 5165 5166 if (!slash_prefixed) { 5167 if (bp == buf) { 5168 kfree(buf, M_TEMP); 5169 error = ENOMEM; 5170 goto done; 5171 } 5172 *--bp = '/'; 5173 } 5174 *retbuf = bp; 5175 *freebuf = buf; 5176 error = 0; 5177 done: 5178 if (ncp) 5179 _cache_drop(ncp); 5180 return(error); 5181 } 5182 5183 int 5184 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 5185 char **freebuf, int guess) 5186 { 5187 struct namecache *ncp; 5188 struct nchandle nch; 5189 int error; 5190 5191 *freebuf = NULL; 5192 if (disablefullpath) 5193 return (ENODEV); 5194 5195 if (p == NULL) 5196 return (EINVAL); 5197 5198 /* vn is NULL, client wants us to use p->p_textvp */ 5199 if (vn == NULL) { 5200 if ((vn = p->p_textvp) == NULL) 5201 return (EINVAL); 5202 } 5203 spin_lock_shared(&vn->v_spin); 5204 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 5205 if (ncp->nc_nlen) 5206 break; 5207 } 5208 if (ncp == NULL) { 5209 spin_unlock_shared(&vn->v_spin); 5210 return (EINVAL); 5211 } 5212 _cache_hold(ncp); 5213 spin_unlock_shared(&vn->v_spin); 5214 5215 nch.ncp = ncp; 5216 nch.mount = vn->v_mount; 5217 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 5218 _cache_drop(ncp); 5219 return (error); 5220 } 5221 5222 void 5223 vfscache_rollup_cpu(struct globaldata *gd) 5224 { 5225 struct pcpu_ncache *pn; 5226 long count; 5227 5228 if (pcpu_ncache == NULL) 5229 return; 5230 pn = &pcpu_ncache[gd->gd_cpuid]; 5231 5232 /* 5233 * namecache statistics 5234 */ 5235 if (pn->vfscache_count) { 5236 count = atomic_swap_long(&pn->vfscache_count, 0); 5237 atomic_add_long(&vfscache_count, count); 5238 } 5239 if (pn->vfscache_leafs) { 5240 count = atomic_swap_long(&pn->vfscache_leafs, 0); 5241 atomic_add_long(&vfscache_leafs, count); 5242 } 5243 if (pn->vfscache_unres) { 5244 count = atomic_swap_long(&pn->vfscache_unres, 0); 5245 atomic_add_long(&vfscache_unres, count); 5246 } 5247 if (pn->vfscache_negs) { 5248 count = atomic_swap_long(&pn->vfscache_negs, 0); 5249 atomic_add_long(&vfscache_negs, count); 5250 } 5251 5252 /* 5253 * hysteresis based cleanings 5254 */ 5255 if (pn->inv_kid_quick_count) { 5256 count = atomic_swap_long(&pn->inv_kid_quick_count, 0); 5257 atomic_add_long(&inv_kid_quick_count, count); 5258 } 5259 if (pn->inv_ncp_quick_count) { 5260 count = atomic_swap_long(&pn->inv_ncp_quick_count, 0); 5261 atomic_add_long(&inv_ncp_quick_count, count); 5262 } 5263 if (pn->clean_pos_count) { 5264 count = atomic_swap_long(&pn->clean_pos_count, 0); 5265 atomic_add_long(&clean_pos_count, count); 5266 } 5267 if (pn->clean_neg_count) { 5268 count = atomic_swap_long(&pn->clean_neg_count, 0); 5269 atomic_add_long(&clean_neg_count, count); 5270 } 5271 5272 if (pn->numdefered) { 5273 count = atomic_swap_long(&pn->numdefered, 0); 5274 atomic_add_long(&numdefered, count); 5275 } 5276 } 5277
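/*
 * The rollups above all use the same swap-and-add idiom so that counts
 * are neither lost nor double counted: the per-cpu field is atomically
 * exchanged with zero and the old value is folded into the global
 * counter.  A minimal sketch of the idiom, using a hypothetical counter
 * name:
 *
 *	if (pn->some_count) {
 *		count = atomic_swap_long(&pn->some_count, 0);
 *		atomic_add_long(&some_count, count);
 *	}
 */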