/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin;
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (4) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (5) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

TAILQ_HEAD(nchash_list, namecache);

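#if 0
/*
 * Illustrative sketch (not compiled into the build): how a
 * (parent ncp, name) pair maps to its hash chain.  The FNV-1 seeding
 * mirrors what cache_rename() does below; example_nchpp_for() and its
 * arguments are hypothetical and not part of this file's API.
 */
static struct nchash_head *
example_nchpp_for(struct namecache *par, const char *name, int nlen)
{
	u_int32_t hash;

	hash = fnv_32_buf(name, nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&par, sizeof(par), hash);

	return (NCHHASH(hash));		/* chain guarded by nchpp->spin */
}
#endif
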
/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;		/* 8 bytes */
	long	pad01;			/* 8 bytes */
};

struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	struct mount *mp_target;
	int isneg;
	int ticks;
	int updating;
	int unused01;
};

struct pcpu_ncache {
	struct spinlock		umount_spin;	/* cache_findmount/interlock */
	struct spinlock		neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long			neg_count;
	long			vfscache_negs;
	long			vfscache_count;
	long			vfscache_leafs;
	long			numdefered;
} __cachealign;

__read_mostly static struct nchash_head	*nchashtbl;
__read_mostly static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code to run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
 */
__read_mostly static int ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

__read_mostly static u_long nchash;		/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

__read_mostly static int ncnegflush = 10;	/* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

__read_mostly static int ncposflush = 10;	/* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

__read_mostly static int ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

__read_mostly static int nclockwarn;	/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

__read_mostly static int ncposlimit;	/* number of cache entries allocated */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Number of cache entries allocated");

__read_mostly static int ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
    &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

__read_mostly static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
    &ncmount_cache_enable, 0, "mount point cache");

static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of leaf namecache entries");
static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred zap operations");


struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static void cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
 */
#define MNTCACHE_COUNT		32	/* power of 2, multiple of SET */
#define MNTCACHE_SET		8	/* set associativity */

struct mntcache_elm {
	struct namecache *ncp;
	struct mount	 *mp;
	int	ticks;
	int	unused01;
};

struct mntcache {
	struct mntcache_elm array[MNTCACHE_COUNT];
} __cachealign;

static struct mntcache	pcpu_mntcache[MAXCPU];

static __inline
struct mntcache_elm *
_cache_mntcache_hash(void *ptr)
{
	struct mntcache_elm *elm;
	int hv;

	hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
	elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];

	return elm;
}

static
void
_cache_mntref(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mount *mpr;
	int i;

	elm = _cache_mntcache_hash(mp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == mp) {
			mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
			if (__predict_true(mpr == mp))
				return;
			if (mpr)
				atomic_add_int(&mpr->mnt_refs, -1);
		}
		++elm;
	}
	atomic_add_int(&mp->mnt_refs, 1);
}

static
void
_cache_mntrel(struct mount *mp)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct mount *mpr;
	int delta1;
	int delta2;
	int i;

	elm = _cache_mntcache_hash(mp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->mp == NULL) {
			mpr = atomic_swap_ptr((void *)&elm->mp, mp);
			if (__predict_false(mpr != NULL)) {
				atomic_add_int(&mpr->mnt_refs, -1);
			}
			elm->ticks = ticks;
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	mpr = atomic_swap_ptr((void *)&best->mp, mp);
	best->ticks = ticks;
	if (mpr)
		atomic_add_int(&mpr->mnt_refs, -1);
}

/*
 * Clears all cached mount points on all cpus.  This routine should only
 * be called when we are waiting for a mount to clear, e.g. so we can
 * unmount.
 */
void
cache_clearmntcache(struct mount *target __unused)
{
	int n;

	for (n = 0; n < ncpus; ++n) {
		struct mntcache *cache = &pcpu_mntcache[n];
		struct mntcache_elm *elm;
		struct namecache *ncp;
		struct mount *mp;
		int i;

		for (i = 0; i < MNTCACHE_COUNT; ++i) {
			elm = &cache->array[i];
			if (elm->mp) {
				mp = atomic_swap_ptr((void *)&elm->mp, NULL);
				if (mp)
					atomic_add_int(&mp->mnt_refs, -1);
			}
			if (elm->ncp) {
				ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
				if (ncp)
					_cache_drop(ncp);
			}
		}
	}
}

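#if 0
/*
 * Illustrative sketch (not compiled into the build): the per-cpu
 * mount-ref cache in action.  _cache_mntref() consumes a cached ref
 * when one is present, avoiding an atomic op on mp->mnt_refs, and
 * _cache_mntrel() tries to park the ref back into the set instead of
 * dropping it.  example_touch_mount() is hypothetical, not part of
 * this file's API.
 */
static void
example_touch_mount(struct mount *mp)
{
	_cache_mntref(mp);	/* hits pcpu_mntcache when possible */
	/* ... use mp ... */
	_cache_mntrel(mp);	/* parks the ref back per-cpu */
}
#endif
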
/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  The controlling entity
 * in a 1->0 transition does not need to lock the ncp to dispose of it,
 * as nobody else will have visibility to it at that point.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
static __inline
void
_cache_lock(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Release a previously acquired lock.
 *
 * A concurrent shared-lock acquisition or acquisition/release can
 * race bit 31 so only drop the ncp if bit 31 was set.
 */
static __inline
void
_cache_unlock(struct namecache *ncp)
{
	lockmgr(&ncp->nc_lock, LK_RELEASE);
}

/*
 * Lock ncp exclusively, non-blocking.  Return 0 on success.
 */
static __inline
int
_cache_lock_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static __inline
int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if (lockmgr_oneexcl(&ncp->nc_lock)) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return 0;
		}
		_cache_unlock(ncp);
	}
	return EWOULDBLOCK;
}

/*
 * Shared lock, guarantees vp held
 *
 * The shared lock holds vp on the 0->1 transition.  It is possible to race
 * another shared lock release, preventing the other release from dropping
 * the vnode and clearing bit 31.
 *
 * If it is not set then we are responsible for setting it, and this
 * responsibility does not race with anyone else.
 */
static __inline
void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn = 0;
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	while (__predict_false(error == EWOULDBLOCK)) {
		if (didwarn == 0) {
			didwarn = ticks - nclockwarn;
			kprintf("[diagnostic] cache_lock_shared: "
				"%s blocked on %p "
				"\"%*.*s\"\n",
				curthread->td_comm, ncp,
				ncp->nc_nlen, ncp->nc_nlen,
				ncp->nc_name);
		}
		error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
	}
	if (__predict_false(didwarn)) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 */
static __inline
int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	int error;

	error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
	if (__predict_false(error != 0)) {
		return(EWOULDBLOCK);
	}
	return 0;
}

/*
 * This function tries to get a shared lock but will back-off to an
 * exclusive lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 *
 *	    This is very evident in dsynth's initial scans.
 */
static __inline
int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return(0);
			}
		}
		_cache_unlock(ncp);
		return(EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
		_cache_lock(ncp);
		return(0);
	}
	_cache_lock_shared(ncp);
	return(0);
}

static __inline
int
_cache_lockstatus(struct namecache *ncp)
{
	int status;

	status = lockstatus(&ncp->nc_lock, curthread);
	if (status == 0 || status == LK_EXCLOTHER)
		status = -1;
	return status;
}

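#if 0
/*
 * Illustrative sketch (not compiled into the build): per the WARNING
 * in the namespace-locking comments above, a locked ncp does not
 * protect against a vnode that already initiated recyclement, so a
 * caller using cache_lock() directly should re-validate
 * unconditionally.  example_lock_and_validate() and its arguments are
 * hypothetical, not part of this file's API.
 */
static void
example_lock_and_validate(struct nchandle *nch, struct ucred *cred)
{
	cache_lock(nch);		/* caller already holds a ref */
	cache_resolve(nch, cred);	/* unconditional re-validate */
	/* ... nch->ncp->nc_vp is now definitive (or nc_error is set) ... */
	cache_unlock(nch);
}
#endif
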
/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already deterministically at least 1, such as being
 * associated with e.g. a process, file descriptor, or some other entity.
 *
 * Only the above situations, similar situations within this module where
 * the ref count is deterministically at least 1, or when the ncp is found
 * via the nchpp (hash table) lookup, can bump nc_refs.
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any children of its own, again because nc_refs is now
 * 0 and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHE);
		kfree(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);

		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);			/* add nc_parent ref */
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.  XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct namecache *par;
	struct vnode *dropvp;
	struct nchash_head *nchpp;

	if ((par = ncp->nc_parent) != NULL) {
		cpu_ccfence();
		KKASSERT(ncp->nc_parent == par);

		/* don't add a ref, we drop the nchpp ref later */
		_cache_lock(par);
		nchpp = ncp->nc_head;
		spin_lock(&nchpp->spin);

		/*
		 * Remove from hash table and parent, adjust accounting
		 */
		TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_count, -1);
		if (TAILQ_EMPTY(&ncp->nc_list))
			atomic_add_long(&pn->vfscache_leafs, -1);

		dropvp = NULL;
		if (TAILQ_EMPTY(&par->nc_list)) {
			atomic_add_long(&pn->vfscache_leafs, 1);
			if (par->nc_vp)
				dropvp = par->nc_vp;
		}
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		spin_unlock(&nchpp->spin);
		_cache_unlock(par);
		_cache_drop(par);		/* drop nc_parent ref */

		/*
		 * We can only safely vdrop with no spinlocks held.
		 */
		if (dropvp)
			vdrop(dropvp);
	}
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The returned ncp will be locked and referenced.  The ref is generally
 * meant to be transferred to the nchpp linkage.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;
	TAILQ_INIT(&ncp->nc_list);
	lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
	lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);

	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 */
static void
_cache_free(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs == 1);
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);
}

/*
 * [re]initialize a nchandle.
 */
void
cache_zero(struct nchandle *nch)
{
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Ref and deref a nchandle structure (ncp + mp)
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
	_cache_hold(nch->ncp);
	_cache_mntref(nch->mount);
	return(nch);
}

/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
	struct namecache *ncp;
	struct mount *mp;
	struct mntcache_elm *elm;
	struct namecache *ncpr;
	int i;

	ncp = nch->ncp;
	mp = nch->mount;
	target->ncp = ncp;
	target->mount = mp;

	elm = _cache_mntcache_hash(ncp);
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == ncp) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
			if (ncpr == ncp) {
				_cache_mntref(mp);
				return;
			}
			if (ncpr)
				_cache_drop(ncpr);
		}
		++elm;
	}
	if (ncp)
		_cache_hold(ncp);
	_cache_mntref(mp);
}

/*
 * Drop the nchandle, but try to cache the ref to avoid global atomic
 * ops.  This is typically done on the system root and jail root nchandles.
 */
void
cache_drop_and_cache(struct nchandle *nch, int elmno)
{
	struct mntcache_elm *elm;
	struct mntcache_elm *best;
	struct namecache *ncpr;
	int delta1;
	int delta2;
	int i;

	if (elmno > 4) {
		if (nch->ncp) {
			_cache_drop(nch->ncp);
			nch->ncp = NULL;
		}
		if (nch->mount) {
			_cache_mntrel(nch->mount);
			nch->mount = NULL;
		}
		return;
	}

	elm = _cache_mntcache_hash(nch->ncp);
	best = elm;
	for (i = 0; i < MNTCACHE_SET; ++i) {
		if (elm->ncp == NULL) {
			ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
			_cache_mntrel(nch->mount);
			elm->ticks = ticks;
			nch->mount = NULL;
			nch->ncp = NULL;
			if (ncpr)
				_cache_drop(ncpr);
			return;
		}
		delta1 = ticks - best->ticks;
		delta2 = ticks - elm->ticks;
		if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
			best = elm;
		++elm;
	}
	ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
	_cache_mntrel(nch->mount);
	best->ticks = ticks;
	nch->mount = NULL;
	nch->ncp = NULL;
	if (ncpr)
		_cache_drop(ncpr);
}

void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	_cache_mntref(mp);
	_cache_mntrel(nch->mount);
	nch->mount = mp;
}

void
cache_drop(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}

void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}

/*
 * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 * is responsible for checking both for validity on return as they
 * may have become invalid.
 *
 * We have to deal with potential deadlocks here, just ping pong
 * the lock until we get it (we will always block somewhere when
 * looping so this is not cpu-intensive).
 *
 * which = 0	nch1 not locked, nch2 is locked
 * which = 1	nch1 is locked, nch2 is not locked
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
	     struct nchandle *nch2, struct ucred *cred2)
{
	int which;

	which = 0;

	for (;;) {
		if (which == 0) {
			if (cache_lock_nonblock(nch1) == 0) {
				cache_resolve(nch1, cred1);
				break;
			}
			cache_unlock(nch2);
			cache_lock(nch1);
			cache_resolve(nch1, cred1);
			which = 1;
		} else {
			if (cache_lock_nonblock(nch2) == 0) {
				cache_resolve(nch2, cred2);
				break;
			}
			cache_unlock(nch1);
			cache_lock(nch2);
			cache_resolve(nch2, cred2);
			which = 0;
		}
	}
}

int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return(ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
 */
static
struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			_cache_unlock(ncp);
			ncp = _cache_get(ncp);
			_cache_drop(ncp);
		}
	} else {
		_cache_unlock(ncp);
		ncp = _cache_get(ncp);
		_cache_drop(ncp);
	}
	return(ncp);
}

/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	_cache_mntref(target->mount);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	_cache_mntref(target->mount);
}

/*
 * Release a held and locked ncp
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

void
cache_put(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

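#if 0
/*
 * Illustrative sketch (not compiled into the build): the
 * cache_get()/cache_put() pairing.  Per the NOTE above, the same
 * nchandle may be passed for both arguments of cache_get().  "nch" is
 * a hypothetical, already-referenced handle.
 */
static void
example_get_put(struct nchandle *nch)
{
	struct nchandle lnch;

	cache_get(nch, &lnch);	/* ref+lock copy, unresolves reclaimed vp */
	/* ... operate on lnch; lnch.ncp->nc_vp is definitive ... */
	cache_put(&lnch);	/* unlock+drop, zeroes lnch */
}
#endif
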
/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
		 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
		 ncp->nc_vp == NULL);

	if (vp) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		++vp->v_namecache_count;
		_cache_hold(ncp);		/* v_namecache assoc */
		spin_unlock(&vp->v_spin);
		vhold(vp);			/* nc_vp */

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}

		ncp->nc_error = 0;

		/*
		 * XXX: this is a hack to work around the lack of a real
		 * pfs vfs implementation
		 */
		if (mp) {
			if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
				vp->v_pfsmp = mp;
		}
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

		ncp->nc_vp = NULL;
		ncp->nc_negcpu = mycpu->gd_cpuid;
		spin_lock(&pn->neg_spin);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);	/* neg_list assoc */
		++pn->neg_count;
		spin_unlock(&pn->neg_spin);
		atomic_add_long(&pn->vfscache_negs, 1);

		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * Used for NFS
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

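#if 0
/*
 * Illustrative sketch (not compiled into the build): an NFS-style
 * resolver marking an entry as valid for roughly 30 seconds.
 * cache_settimeout() stores ticks + nticks, substituting 1 for the
 * rare wrapped value 0 so that 0 can continue to mean "no timeout".
 * The nchandle must be exclusively locked and unresolved, per
 * _cache_setvp()'s assertions.  example_resolve_with_ttl() is
 * hypothetical, not part of this file's API.
 */
static void
example_resolve_with_ttl(struct nchandle *nch, struct vnode *vp)
{
	cache_setvp(nch, vp);		/* resolve (NULL vp = negative hit) */
	cache_settimeout(nch, 30 * hz);	/* auto-unresolve after ~30 secs */
}
#endif
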
/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			--vp->v_namecache_count;
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with ncp
			 * is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			vdrop(vp);
		} else {
			struct pcpu_ncache *pn;

			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
		_cache_drop(ncp);	/* from v_namecache or neg_list */
	}
}

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * If already in an unresolved state there is nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
			       MNTSCAN_NOUNLOCK);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			cache_zap(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}

	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.  Then lock the kid and check for
			 * races.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			_cache_lock(kid);
			if (kid->nc_parent != ncp) {
				kprintf("cache_inval_internal "
					"restartB %s\n",
					ncp->nc_name);
				restart = 1;
				_cache_unlock(kid);
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {

				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				/*_cache_unlock(kid);*/
				/*_cache_drop(kid);*/
				cache_zap(kid);
			} else {
				cache_zap(kid);
			}

			/*
			 * Relock parent to continue scan
			 */
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

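#if 0
/*
 * Illustrative sketch (not compiled into the build): how a
 * vnode-recycling path might use cache_inval_vp_nonblock(), which
 * never blocks on ncp locks.  A non-zero return means namecache
 * records are still associated with the vnode and the recycle attempt
 * should be abandoned for now.  example_try_recycle() is hypothetical,
 * not part of this file's API.
 */
static int
example_try_recycle(struct vnode *vp)
{
	if (cache_inval_vp_nonblock(vp))
		return (EBUSY);	/* still referenced by the namecache */
	/* ... proceed with reclamation ... */
	return (0);
}
#endif
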
/*
 * Clears the universal directory search 'ok' flag.  This flag allows
 * nlookup() to bypass normal vnode checks.  This flag is a cached flag
 * so clearing it simply forces revalidation.
 */
void
cache_inval_wxok(struct vnode *vp)
{
	struct namecache *ncp;

	spin_lock(&vp->v_spin);
	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
			atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
	}
	spin_unlock(&vp->v_spin);
}

/*
 * The source ncp has been renamed to the target ncp.  Both fncp and tncp
 * must be locked.  The target ncp is destroyed (as a normal rename-over
 * would destroy the target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
	struct namecache *fncp = fnch->ncp;
	struct namecache *tncp = tnch->ncp;
	struct namecache *tncp_par;
	struct nchash_head *nchpp;
	u_int32_t hash;
	char *oname;
	char *nname;

	++fncp->nc_generation;
	++tncp->nc_generation;
	if (tncp->nc_nlen) {
		nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
		bcopy(tncp->nc_name, nname, tncp->nc_nlen);
		nname[tncp->nc_nlen] = 0;
	} else {
		nname = NULL;
	}

	/*
	 * Rename fncp (unlink)
	 */
	_cache_unlink_parent(fncp);
	oname = fncp->nc_name;
	fncp->nc_name = nname;
	fncp->nc_nlen = tncp->nc_nlen;
	if (oname)
		kfree(oname, M_VFSCACHE);

	tncp_par = tncp->nc_parent;
	_cache_hold(tncp_par);
	_cache_lock(tncp_par);

	/*
	 * Rename fncp (relink)
	 */
	hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
	nchpp = NCHHASH(hash);

	spin_lock(&nchpp->spin);
	_cache_link_parent(fncp, tncp_par, nchpp);
	spin_unlock(&nchpp->spin);

	_cache_put(tncp_par);

	/*
	 * Get rid of the overwritten tncp (unlink)
	 */
	_cache_unlink(tncp);
}

/*
 * Perform actions consistent with unlinking a file.  The passed-in ncp
 * must be locked.
 *
 * The ncp is marked DESTROYED so it no longer shows up in searches,
 * and will be physically deleted when the vnode goes away.
 *
 * If the related vnode has no refs then we cycle it through vget()/vput()
 * to (possibly if we don't have a ref race) trigger a deactivation,
 * allowing the VFS to trivially detect and recycle the deleted vnode
 * via VOP_INACTIVE().
 *
 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
 *	 target ncp.
 */
void
cache_unlink(struct nchandle *nch)
{
	_cache_unlink(nch->ncp);
}

static void
_cache_unlink(struct namecache *ncp)
{
	struct vnode *vp;

	/*
	 * Causes lookups to fail and allows another ncp with the same
	 * name to be created under ncp->nc_parent.
	 */
	ncp->nc_flag |= NCF_DESTROYED;
	++ncp->nc_generation;

	/*
	 * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
	 * force action on the 1->0 transition.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL) {
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		if (VREFCNT(vp) <= 0) {
			if (vget(vp, LK_SHARED) == 0)
				vput(vp);
		}
	}
}

/*
 * Return non-zero if the nch might be associated with an open and/or mmap()'d
 * file.  The easy solution is to just return non-zero if the vnode has refs.
 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
 * force the reclaim).
 */
int
cache_isopen(struct nchandle *nch)
{
	struct vnode *vp;
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL &&
	    VREFCNT(vp)) {
		return 1;
	}
	return 0;
}


/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.  If
 * the ncp is resolved it might be locked shared.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
 * (depending on the passed lk_type) vnode will be returned in *vpp with an
 * error of 0, or NULL will be returned in *vpp with a non-0 error code.
 * The most typical error is ENOENT, meaning that the ncp represents a
 * negative cache hit and there is no vnode to retrieve, but other errors
 * can occur too.
 *
 * The vget() can race a reclaim.  If this occurs we re-resolve the
 * namecache entry.
 *
 * There are numerous places in the kernel where vget() is called on a
 * vnode while one or more of its namecache entries is locked.  Releasing
 * a vnode never deadlocks against locked namecache entries (the vnode
 * will not get recycled while referenced ncp's exist).  This means we
 * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
 * lock when acquiring the vp lock or we might cause a deadlock.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 */
int
cache_vget(struct nchandle *nch, struct ucred *cred,
	   int lk_type, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	if (error == 0 && (vp = ncp->nc_vp) != NULL) {
		error = vget(vp, lk_type);
		if (error) {
			/*
			 * VRECLAIM race
			 *
			 * The ncp may have been locked shared, we must relock
			 * it exclusively before we can set it to unresolved.
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vget on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
			 */
			KKASSERT(ncp->nc_vp == vp);
			vp = NULL;
		} else {
			KKASSERT(ncp->nc_vp == vp);
			KKASSERT((vp->v_flag & VRECLAIMED) == 0);
		}
	}
	if (error == 0 && vp == NULL)
		error = ENOENT;
	*vpp = vp;
	return(error);
}

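#if 0
/*
 * Illustrative sketch (not compiled into the build): typical
 * cache_vget() usage.  The nchandle must be referenced and locked;
 * on success *vpp is a vget()'d vnode, while ENOENT typically means
 * the entry is a negative hit.  example_get_vnode() and its arguments
 * are hypothetical, not part of this file's API.
 */
static int
example_get_vnode(struct nchandle *nch, struct ucred *cred,
		  struct vnode **vpp)
{
	int error;

	error = cache_vget(nch, cred, LK_SHARED, vpp);
	if (error == 0) {
		/* ... use *vpp, then release the lock and ref ... */
		vput(*vpp);
	}
	return (error);
}
#endif
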
/*
 * Similar to cache_vget() but only acquires a ref on the vnode.  The vnode
 * is already held by virtue of the ncp being locked, but it might not be
 * referenced and while it is not referenced it can transition into the
 * VRECLAIMED state.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 *
 * NOTE: At the moment we have to issue a vget() on the vnode, even though
 *	 we are going to immediately release the lock, in order to resolve
 *	 potential reclamation races.  Once we have a solid vnode ref that
 *	 was (at some point) interlocked via a vget(), the vnode will not
 *	 be reclaimed.
 *
 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
 */
int
cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;
	int v;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	while (error == 0 && (vp = ncp->nc_vp) != NULL) {
		/*
		 * Try a lockless ref of the vnode.  VRECLAIMED transitions
		 * use the vx_lock state and update-counter mechanism so we
		 * can detect if one is in-progress or occurred.
		 *
		 * If we can successfully ref the vnode and interlock against
		 * the update-counter mechanism, and VRECLAIMED is found to
		 * not be set after that, we should be good.
		 */
		v = spin_access_start_only(&vp->v_spin);
		if (__predict_true(spin_access_check_inprog(v) == 0)) {
			vref_special(vp);
			if (__predict_false(
				    spin_access_end_only(&vp->v_spin, v))) {
				vrele(vp);
				kprintf("CACHE_VREF: RACED %p\n", vp);
				continue;
			}
			if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
				break;
			}
			vrele(vp);
			kprintf("CACHE_VREF: IN-RECLAIM\n");
		}

		/*
		 * Do it the slow way
		 */
		error = vget(vp, LK_SHARED);
		if (error) {
			/*
			 * VRECLAIM race
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vref on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
			 */
			KKASSERT(ncp->nc_vp == vp);
			vp = NULL;
		} else {
			KKASSERT(ncp->nc_vp == vp);
			KKASSERT((vp->v_flag & VRECLAIMED) == 0);
			/* caller does not want a lock */
			vn_unlock(vp);
		}
		break;
	}
	if (error == 0 && vp == NULL)
		error = ENOENT;
	*vpp = vp;

	return(error);
}

/*
 * Return a referenced vnode representing the parent directory of
 * ncp.
 *
 * Because the caller has locked the ncp it should not be possible for
 * the parent ncp to go away.  However, the parent can unresolve its
 * dvp at any time so we must be able to acquire a lock on the parent
 * to safely access nc_vp.
 *
 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
 * so use vhold()/vdrop() while holding the lock to prevent dvp from
 * getting destroyed.
 *
 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
 *	 lock on the ncp in question.
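 *
 * Illustrative caller pattern (sketch only; the returned dvp, if any, is
 * referenced and unlocked and must be released with vrele()):
 *
 *	dvp = cache_dvpref(ncp);
 *	if (dvp) {
 *		... use the referenced, unlocked dvp ...
 *		vrele(dvp);
 *	}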
2065 */ 2066 struct vnode * 2067 cache_dvpref(struct namecache *ncp) 2068 { 2069 struct namecache *par; 2070 struct vnode *dvp; 2071 2072 dvp = NULL; 2073 if ((par = ncp->nc_parent) != NULL) { 2074 _cache_hold(par); 2075 _cache_lock(par); 2076 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2077 if ((dvp = par->nc_vp) != NULL) 2078 vhold(dvp); 2079 } 2080 _cache_unlock(par); 2081 if (dvp) { 2082 if (vget(dvp, LK_SHARED) == 0) { 2083 vn_unlock(dvp); 2084 vdrop(dvp); 2085 /* return refd, unlocked dvp */ 2086 } else { 2087 vdrop(dvp); 2088 dvp = NULL; 2089 } 2090 } 2091 _cache_drop(par); 2092 } 2093 return(dvp); 2094 } 2095 2096 /* 2097 * Convert a directory vnode to a namecache record without any other 2098 * knowledge of the topology. This ONLY works with directory vnodes and 2099 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2100 * returned ncp (if not NULL) will be held and unlocked. 2101 * 2102 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2103 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2104 * for dvp. This will fail only if the directory has been deleted out from 2105 * under the caller. 2106 * 2107 * Callers must always check for a NULL return no matter the value of 'makeit'. 2108 * 2109 * To avoid underflowing the kernel stack each recursive call increments 2110 * the makeit variable. 2111 */ 2112 2113 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2114 struct vnode *dvp, char *fakename); 2115 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2116 struct vnode **saved_dvp); 2117 2118 int 2119 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2120 struct nchandle *nch) 2121 { 2122 struct vnode *saved_dvp; 2123 struct vnode *pvp; 2124 char *fakename; 2125 int error; 2126 2127 nch->ncp = NULL; 2128 nch->mount = dvp->v_mount; 2129 saved_dvp = NULL; 2130 fakename = NULL; 2131 2132 /* 2133 * Handle the makeit == 0 degenerate case 2134 */ 2135 if (makeit == 0) { 2136 spin_lock_shared(&dvp->v_spin); 2137 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2138 if (nch->ncp) 2139 cache_hold(nch); 2140 spin_unlock_shared(&dvp->v_spin); 2141 } 2142 2143 /* 2144 * Loop until resolution, inside code will break out on error. 2145 */ 2146 while (makeit) { 2147 /* 2148 * Break out if we successfully acquire a working ncp. 2149 */ 2150 spin_lock_shared(&dvp->v_spin); 2151 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2152 if (nch->ncp) { 2153 cache_hold(nch); 2154 spin_unlock_shared(&dvp->v_spin); 2155 break; 2156 } 2157 spin_unlock_shared(&dvp->v_spin); 2158 2159 /* 2160 * If dvp is the root of its filesystem it should already 2161 * have a namecache pointer associated with it as a side 2162 * effect of the mount, but it may have been disassociated. 2163 */ 2164 if (dvp->v_flag & VROOT) { 2165 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2166 error = cache_resolve_mp(nch->mount); 2167 _cache_put(nch->ncp); 2168 if (ncvp_debug) { 2169 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2170 dvp->v_mount, error); 2171 } 2172 if (error) { 2173 if (ncvp_debug) 2174 kprintf(" failed\n"); 2175 nch->ncp = NULL; 2176 break; 2177 } 2178 if (ncvp_debug) 2179 kprintf(" succeeded\n"); 2180 continue; 2181 } 2182 2183 /* 2184 * If we are recursed too deeply resort to an O(n^2) 2185 * algorithm to resolve the namecache topology. The 2186 * resolved pvp is left referenced in saved_dvp to 2187 * prevent the tree from being destroyed while we loop. 
2188 */ 2189 if (makeit > 20) { 2190 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2191 if (error) { 2192 kprintf("lookupdotdot(longpath) failed %d " 2193 "dvp %p\n", error, dvp); 2194 nch->ncp = NULL; 2195 break; 2196 } 2197 continue; 2198 } 2199 2200 /* 2201 * Get the parent directory and resolve its ncp. 2202 */ 2203 if (fakename) { 2204 kfree(fakename, M_TEMP); 2205 fakename = NULL; 2206 } 2207 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2208 &fakename); 2209 if (error) { 2210 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2211 break; 2212 } 2213 vn_unlock(pvp); 2214 2215 /* 2216 * Reuse makeit as a recursion depth counter. On success 2217 * nch will be fully referenced. 2218 */ 2219 cache_fromdvp(pvp, cred, makeit + 1, nch); 2220 vrele(pvp); 2221 if (nch->ncp == NULL) 2222 break; 2223 2224 /* 2225 * Do an inefficient scan of pvp (embodied by ncp) to look 2226 * for dvp. This will create a namecache record for dvp on 2227 * success. We loop up to recheck on success. 2228 * 2229 * ncp and dvp are both held but not locked. 2230 */ 2231 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2232 if (error) { 2233 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2234 pvp, nch->ncp->nc_name, dvp); 2235 cache_drop(nch); 2236 /* nch was NULLed out, reload mount */ 2237 nch->mount = dvp->v_mount; 2238 break; 2239 } 2240 if (ncvp_debug) { 2241 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2242 pvp, nch->ncp->nc_name); 2243 } 2244 cache_drop(nch); 2245 /* nch was NULLed out, reload mount */ 2246 nch->mount = dvp->v_mount; 2247 } 2248 2249 /* 2250 * If nch->ncp is non-NULL it will have been held already. 2251 */ 2252 if (fakename) 2253 kfree(fakename, M_TEMP); 2254 if (saved_dvp) 2255 vrele(saved_dvp); 2256 if (nch->ncp) 2257 return (0); 2258 return (EINVAL); 2259 } 2260 2261 /* 2262 * Go up the chain of parent directories until we find something 2263 * we can resolve into the namecache. This is very inefficient. 2264 */ 2265 static 2266 int 2267 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2268 struct vnode **saved_dvp) 2269 { 2270 struct nchandle nch; 2271 struct vnode *pvp; 2272 int error; 2273 static time_t last_fromdvp_report; 2274 char *fakename; 2275 2276 /* 2277 * Loop getting the parent directory vnode until we get something we 2278 * can resolve in the namecache. 
 */
	vref(dvp);
	nch.mount = dvp->v_mount;
	nch.ncp = NULL;
	fakename = NULL;

	for (;;) {
		if (fakename) {
			kfree(fakename, M_TEMP);
			fakename = NULL;
		}
		error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
					  &fakename);
		if (error) {
			vrele(dvp);
			break;
		}
		vn_unlock(pvp);
		spin_lock_shared(&pvp->v_spin);
		if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
			_cache_hold(nch.ncp);
			spin_unlock_shared(&pvp->v_spin);
			vrele(pvp);
			break;
		}
		spin_unlock_shared(&pvp->v_spin);
		if (pvp->v_flag & VROOT) {
			nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
			error = cache_resolve_mp(nch.mount);
			_cache_unlock(nch.ncp);
			vrele(pvp);
			if (error) {
				_cache_drop(nch.ncp);
				nch.ncp = NULL;
				vrele(dvp);
			}
			break;
		}
		vrele(dvp);
		dvp = pvp;
	}
	if (error == 0) {
		if (last_fromdvp_report != time_uptime) {
			last_fromdvp_report = time_uptime;
			kprintf("Warning: extremely inefficient path "
				"resolution on %s\n",
				nch.ncp->nc_name);
		}
		error = cache_inefficient_scan(&nch, cred, dvp, fakename);

		/*
		 * Hopefully dvp now has a namecache record associated with
		 * it.  Leave it referenced to prevent the kernel from
		 * recycling the vnode.  Otherwise extremely long directory
		 * paths could result in endless recycling.
		 */
		if (*saved_dvp)
			vrele(*saved_dvp);
		*saved_dvp = dvp;
		_cache_drop(nch.ncp);
	}
	if (fakename)
		kfree(fakename, M_TEMP);
	return (error);
}

/*
 * Do an inefficient scan of the directory represented by ncp looking for
 * the directory vnode dvp.  ncp must be held but not locked on entry and
 * will be held on return.  dvp must be refd but not locked on entry and
 * will remain refd on return.
 *
 * Why do this at all?  Well, due to its stateless nature the NFS server
 * converts file handles directly to vnodes without necessarily going through
 * the namecache ops that would otherwise create the namecache topology
 * leading to the vnode.  We could either (1) Change the namecache algorithms
 * to allow disconnected namecache records that are re-merged
 * opportunistically, or (2) Make the NFS server backtrack and scan to recover
 * a connected namecache topology in order to then be able to issue new API
 * lookups.
 *
 * It turns out that (1) is a huge mess.  It takes a nice clean set of
 * namecache algorithms and introduces a lot of complication in every subsystem
 * that calls into the namecache to deal with the re-merge case, especially
 * since we are using the namecache to placehold negative lookups and the
 * vnode might not be immediately assigned.  (2) is certainly far less
 * efficient than (1), but since we are only talking about directories here
 * (which are likely to remain cached), the case does not actually run all
 * that often and has the supreme advantage of not polluting the namecache
 * algorithms.
 *
 * If a fakename is supplied just construct a namecache entry using the
 * fake name.
2371 */ 2372 static int 2373 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2374 struct vnode *dvp, char *fakename) 2375 { 2376 struct nlcomponent nlc; 2377 struct nchandle rncp; 2378 struct dirent *den; 2379 struct vnode *pvp; 2380 struct vattr vat; 2381 struct iovec iov; 2382 struct uio uio; 2383 int blksize; 2384 int eofflag; 2385 int bytes; 2386 char *rbuf; 2387 int error; 2388 2389 vat.va_blocksize = 0; 2390 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2391 return (error); 2392 cache_lock(nch); 2393 error = cache_vref(nch, cred, &pvp); 2394 cache_unlock(nch); 2395 if (error) 2396 return (error); 2397 if (ncvp_debug) { 2398 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2399 "vattr fileid = %lld\n", 2400 nch->ncp, nch->ncp->nc_name, 2401 vat.va_blocksize, 2402 (long long)vat.va_fileid); 2403 } 2404 2405 /* 2406 * Use the supplied fakename if not NULL. Fake names are typically 2407 * not in the actual filesystem hierarchy. This is used by HAMMER 2408 * to glue @@timestamp recursions together. 2409 */ 2410 if (fakename) { 2411 nlc.nlc_nameptr = fakename; 2412 nlc.nlc_namelen = strlen(fakename); 2413 rncp = cache_nlookup(nch, &nlc); 2414 goto done; 2415 } 2416 2417 if ((blksize = vat.va_blocksize) == 0) 2418 blksize = DEV_BSIZE; 2419 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2420 rncp.ncp = NULL; 2421 2422 eofflag = 0; 2423 uio.uio_offset = 0; 2424 again: 2425 iov.iov_base = rbuf; 2426 iov.iov_len = blksize; 2427 uio.uio_iov = &iov; 2428 uio.uio_iovcnt = 1; 2429 uio.uio_resid = blksize; 2430 uio.uio_segflg = UIO_SYSSPACE; 2431 uio.uio_rw = UIO_READ; 2432 uio.uio_td = curthread; 2433 2434 if (ncvp_debug >= 2) 2435 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2436 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2437 if (error == 0) { 2438 den = (struct dirent *)rbuf; 2439 bytes = blksize - uio.uio_resid; 2440 2441 while (bytes > 0) { 2442 if (ncvp_debug >= 2) { 2443 kprintf("cache_inefficient_scan: %*.*s\n", 2444 den->d_namlen, den->d_namlen, 2445 den->d_name); 2446 } 2447 if (den->d_type != DT_WHT && 2448 den->d_ino == vat.va_fileid) { 2449 if (ncvp_debug) { 2450 kprintf("cache_inefficient_scan: " 2451 "MATCHED inode %lld path %s/%*.*s\n", 2452 (long long)vat.va_fileid, 2453 nch->ncp->nc_name, 2454 den->d_namlen, den->d_namlen, 2455 den->d_name); 2456 } 2457 nlc.nlc_nameptr = den->d_name; 2458 nlc.nlc_namelen = den->d_namlen; 2459 rncp = cache_nlookup(nch, &nlc); 2460 KKASSERT(rncp.ncp != NULL); 2461 break; 2462 } 2463 bytes -= _DIRENT_DIRSIZ(den); 2464 den = _DIRENT_NEXT(den); 2465 } 2466 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2467 goto again; 2468 } 2469 kfree(rbuf, M_TEMP); 2470 done: 2471 vrele(pvp); 2472 if (rncp.ncp) { 2473 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2474 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2475 if (ncvp_debug >= 2) { 2476 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2477 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2478 } 2479 } else { 2480 if (ncvp_debug >= 2) { 2481 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2482 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2483 rncp.ncp->nc_vp); 2484 } 2485 } 2486 if (rncp.ncp->nc_vp == NULL) 2487 error = rncp.ncp->nc_error; 2488 /* 2489 * Release rncp after a successful nlookup. rncp was fully 2490 * referenced. 
		 */
		cache_put(&rncp);
	} else {
		kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
			dvp, nch->ncp->nc_name);
		error = ENOENT;
	}
	return (error);
}

/*
 * This function must be called with the ncp held and locked and will unlock
 * and drop it during zapping.
 *
 * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
 * and removes the related reference.  If the ncp can be removed, and the
 * parent can be zapped non-blocking, this function loops up.
 *
 * There will be one ref from the caller (which we now own).  The only
 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
 * so possibly 2 refs left.  Taking this into account, if there are no
 * additional refs and no children, the ncp will be removed from the topology
 * and destroyed.
 *
 * References and/or children may exist if the ncp is in the middle of the
 * topology, preventing the ncp from being destroyed.
 *
 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
 *
 * This function may return a held (but NOT locked) parent node which the
 * caller must drop in a loop.  Looping is one way to avoid unbounded recursion
 * due to deep namecache trees.
 *
 * WARNING!  For MPSAFE operation this routine must acquire up to three
 *	     spin locks to be able to safely test nc_refs.  Lock order is
 *	     very important.
 *
 *	     hash spinlock if on hash list
 *	     parent spinlock if child of parent
 *	     (the ncp is unresolved so there is no vnode association)
 */
static void
cache_zap(struct namecache *ncp)
{
	struct namecache *par;
	struct vnode *dropvp;
	struct nchash_head *nchpp;
	int refcmp;
	int nonblock = 1;	/* XXX cleanup */

again:
	/*
	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
	 * This gets rid of any vp->v_namecache list or negative list and
	 * the related ref.
	 */
	_cache_setunresolved(ncp);

	/*
	 * Try to scrap the entry and possibly tail-recurse on its parent.
	 * We only scrap unref'd (other than our ref) unresolved entries,
	 * we do not scrap 'live' entries.
	 *
	 * If nc_parent is non NULL we expect 2 references, else just 1.
	 * If there are more, someone else also holds the ncp and we cannot
	 * destroy it.
	 */
	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	KKASSERT(ncp->nc_refs > 0);

	/*
	 * If the ncp is linked to its parent it will also be in the hash
	 * table.  We have to be able to lock the parent and the hash table.
	 *
	 * Acquire locks.  Note that the parent can't go away while we hold
	 * a child locked.  If nc_parent is present, expect 2 refs instead
	 * of 1.
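	 *
	 * Ref accounting at this point, for reference (sketch):
	 *
	 *	1 ref	the caller's ref, which we now own
	 *	+1 ref	the nc_parent->nc_list linkage, if nc_parent != NULL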
	 */
	nchpp = NULL;
	if ((par = ncp->nc_parent) != NULL) {
		if (nonblock) {
			if (_cache_lock_nonblock(par)) {
				/* lock failed */
				ncp->nc_flag |= NCF_DEFEREDZAP;
				atomic_add_long(
				    &pcpu_ncache[mycpu->gd_cpuid].numdefered,
				    1);
				_cache_unlock(ncp);
				_cache_drop(ncp);	/* caller's ref */
				return;
			}
			_cache_hold(par);
		} else {
			_cache_hold(par);
			_cache_lock(par);
		}
		nchpp = ncp->nc_head;
		spin_lock(&nchpp->spin);
	}

	/*
	 * With the parent and nchpp locked, and the vnode removed
	 * (no vp->v_namecache), we expect 1 or 2 refs.  If there are
	 * more someone else has a ref and we cannot zap the entry.
	 *
	 * one for our hold
	 * one for our parent link (parent also has one from the linkage)
	 */
	if (par)
		refcmp = 2;
	else
		refcmp = 1;

	/*
	 * On failure undo the work we've done so far and drop the
	 * caller's ref and ncp.
	 */
	if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
		if (par) {
			spin_unlock(&nchpp->spin);
			_cache_put(par);
		}
		_cache_unlock(ncp);
		_cache_drop(ncp);
		return;
	}

	/*
	 * We own all the refs and with the spinlocks held no further
	 * refs can be acquired by others.
	 *
	 * Remove us from the hash list and parent list.  We have to
	 * drop a ref on the parent's vp if the parent's list becomes
	 * empty.
	 */
	dropvp = NULL;
	if (par) {
		struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

		KKASSERT(nchpp == ncp->nc_head);
		TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_count, -1);
		if (TAILQ_EMPTY(&ncp->nc_list))
			atomic_add_long(&pn->vfscache_leafs, -1);

		if (TAILQ_EMPTY(&par->nc_list)) {
			atomic_add_long(&pn->vfscache_leafs, 1);
			if (par->nc_vp)
				dropvp = par->nc_vp;
		}
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		spin_unlock(&nchpp->spin);
		_cache_drop(par);	/* removal of ncp from par->nc_list */
		/*_cache_unlock(par);*/
	} else {
		KKASSERT(ncp->nc_head == NULL);
	}

	/*
	 * ncp should not have picked up any refs.  Physically
	 * destroy the ncp.
	 */
	if (ncp->nc_refs != refcmp) {
		panic("cache_zap: %p bad refs %d (expected %d)\n",
		      ncp, ncp->nc_refs, refcmp);
	}
	/* _cache_unlock(ncp) not required */
	ncp->nc_refs = -1;	/* safety */
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);

	/*
	 * Delayed drop (we had to release our spinlocks)
	 */
	if (dropvp)
		vdrop(dropvp);

	/*
	 * Loop up if we can recursively clean out the parent.
	 */
	if (par) {
		refcmp = 1;			/* ref on parent */
		if (par->nc_parent)		/* par->par */
			++refcmp;
		par->nc_flag &= ~NCF_DEFEREDZAP;
		if ((par->nc_flag & NCF_UNRESOLVED) &&
		    par->nc_refs == refcmp &&
		    TAILQ_EMPTY(&par->nc_list)) {
			ncp = par;
			goto again;
		}
		_cache_unlock(par);
		_cache_drop(par);
	}
}

/*
 * Clean up dangling negative cache and deferred-drop entries in the
 * namecache.
 *
 * This routine is called in the critical path and also called from
 * vnlru().  When called from vnlru we use a lower limit to try to
 * deal with the negative cache before the critical path has to start
 * dealing with it.
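 *
 * For example (illustrative numbers only, assuming maxvnodes = 100000
 * and ncnegfactor = 16): the critical path limit is 100000 / 16 = 6250
 * negative entries, while the vnlru() path uses 6250 * 8 / 10 = 5000,
 * so vnlru() starts cleaning before the critical path is forced to.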
 */
typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;

static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };

void
cache_hysteresis(int critpath)
{
	long poslimit;
	long neglimit = maxvnodes / ncnegfactor;
	long xnumcache = vfscache_leafs;

	if (critpath == 0)
		neglimit = neglimit * 8 / 10;

	/*
	 * Don't cache too many negative hits.  We use hysteresis to reduce
	 * the impact on the critical path.
	 */
	switch(neg_cache_hysteresis_state[critpath]) {
	case CHI_LOW:
		if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
			if (critpath)
				_cache_cleanneg(ncnegflush);
			else
				_cache_cleanneg(ncnegflush +
						vfscache_negs - neglimit);
			neg_cache_hysteresis_state[critpath] = CHI_HIGH;
		}
		break;
	case CHI_HIGH:
		if (vfscache_negs > MINNEG * 9 / 10 &&
		    vfscache_negs * 9 / 10 > neglimit
		) {
			if (critpath)
				_cache_cleanneg(ncnegflush);
			else
				_cache_cleanneg(ncnegflush +
						vfscache_negs * 9 / 10 -
						neglimit);
		} else {
			neg_cache_hysteresis_state[critpath] = CHI_LOW;
		}
		break;
	}

	/*
	 * Don't cache too many positive hits.  We use hysteresis to reduce
	 * the impact on the critical path.
	 *
	 * Excessive positive hits can accumulate due to large numbers of
	 * hardlinks (the vnode cache will not prevent hl ncps from growing
	 * into infinity).
	 */
	if ((poslimit = ncposlimit) == 0)
		poslimit = maxvnodes * 2;
	if (critpath == 0)
		poslimit = poslimit * 8 / 10;

	switch(pos_cache_hysteresis_state[critpath]) {
	case CHI_LOW:
		if (xnumcache > poslimit && xnumcache > MINPOS) {
			if (critpath)
				_cache_cleanpos(ncposflush);
			else
				_cache_cleanpos(ncposflush +
						xnumcache - poslimit);
			pos_cache_hysteresis_state[critpath] = CHI_HIGH;
		}
		break;
	case CHI_HIGH:
		if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
			if (critpath)
				_cache_cleanpos(ncposflush);
			else
				_cache_cleanpos(ncposflush +
						xnumcache - poslimit * 5 / 6);
		} else {
			pos_cache_hysteresis_state[critpath] = CHI_LOW;
		}
		break;
	}

	/*
	 * Clean out dangling deferred-zap ncps which could not be cleanly
	 * dropped, if too many have built up.  Note that numdefered is
	 * heuristic.  Make sure we are real-time for the current cpu,
	 * plus the global rollup.
	 */
	if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
		_cache_cleandefered();
	}
}

/*
 * NEW NAMECACHE LOOKUP API
 *
 * Lookup an entry in the namecache.  The passed par_nch must be referenced
 * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
 * is ALWAYS returned, even if the supplied component is illegal.
 *
 * The resulting namecache entry should be returned to the system with
 * cache_put() or cache_unlock() + cache_drop().
 *
 * namecache locks are recursive but care must be taken to avoid lock order
 * reversals (hence why the passed par_nch must be unlocked).  Locking
 * rules are ordered for parent traversals, not for child traversals.
 *
 * Nobody else will be able to manipulate the associated namespace (e.g.
 * create, delete, rename, rename-target) until the caller unlocks the
 * entry.
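 *
 * Illustrative caller pattern (sketch only; error handling elided):
 *
 *	nch = cache_nlookup(par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	... use nch ...
 *	cache_put(&nch);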
 *
 * The returned entry will be in one of three states:  positive hit (non-null
 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
 * Unresolved entries must be resolved through the filesystem to associate the
 * vnode and/or determine whether a positive or negative hit has occurred.
 *
 * It is not necessary to lock a directory in order to lock namespace under
 * that directory.  In fact, it is explicitly not allowed to do that.  A
 * directory is typically only locked when being created, renamed, or
 * destroyed.
 *
 * The directory (par) may be unresolved, in which case any returned child
 * will likely also be marked unresolved.  Likely but not guaranteed.  Since
 * the filesystem lookup requires a resolved directory vnode the caller is
 * responsible for resolving the namecache chain top-down.  This API
 * specifically allows whole chains to be created in an unresolved state.
 */
struct nchandle
cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
{
	struct nchandle nch;
	struct namecache *ncp;
	struct namecache *new_ncp;
	struct namecache *rep_ncp;	/* reuse a destroyed ncp */
	struct nchash_head *nchpp;
	struct mount *mp;
	u_int32_t hash;
	globaldata_t gd;
	int par_locked;

	gd = mycpu;
	mp = par_nch->mount;
	par_locked = 0;

	/*
	 * This is a good time to call cache_hysteresis(); no ncp's are
	 * locked by the caller or us.
	 */
	cache_hysteresis(1);

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	new_ncp = NULL;
	nchpp = NCHHASH(hash);
restart:
	rep_ncp = NULL;
	if (new_ncp)
		spin_lock(&nchpp->spin);
	else
		spin_lock_shared(&nchpp->spin);

	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 * do not.
		 *
		 * We may be able to reuse DESTROYED entries that we come
		 * across, even if the name does not match, as long as
		 * nc_nlen is correct and the only hold ref is from the nchpp
		 * list itself.
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen) {
			if (ncp->nc_flag & NCF_DESTROYED) {
				if (ncp->nc_refs == 1 && rep_ncp == NULL)
					rep_ncp = ncp;
				continue;
			}
			if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
				continue;
			_cache_hold(ncp);
			if (new_ncp)
				spin_unlock(&nchpp->spin);
			else
				spin_unlock_shared(&nchpp->spin);
			if (par_locked) {
				_cache_unlock(par_nch->ncp);
				par_locked = 0;
			}
			if (_cache_lock_special(ncp) == 0) {
				/*
				 * Successfully locked but we must re-test
				 * conditions that might have changed since
				 * we did not have the lock before.
				 */
				if (ncp->nc_parent != par_nch->ncp ||
				    ncp->nc_nlen != nlc->nlc_namelen ||
				    bcmp(ncp->nc_name, nlc->nlc_nameptr,
					 ncp->nc_nlen) ||
				    (ncp->nc_flag & NCF_DESTROYED)) {
					_cache_put(ncp);
					goto restart;
				}
				_cache_auto_unresolve(mp, ncp);
				if (new_ncp)
					_cache_free(new_ncp);
				goto found;
			}
			_cache_get(ncp);	/* cycle the lock to block */
			_cache_put(ncp);
			_cache_drop(ncp);
			goto restart;
		}
	}

	/*
	 * We failed to locate the entry, try to resurrect a destroyed
	 * entry that we did find that is already correctly linked into
	 * nchpp and the parent.  We must re-test conditions after
	 * successfully locking rep_ncp.
	 *
	 * This case can occur under heavy loads due to not being able
	 * to safely lock the parent in cache_zap().  Nominally a repeated
	 * create/unlink load, but only the namelen needs to match.
	 */
	if (rep_ncp && new_ncp == NULL) {
		if (_cache_lock_nonblock(rep_ncp) == 0) {
			_cache_hold(rep_ncp);
			if (rep_ncp->nc_parent == par_nch->ncp &&
			    rep_ncp->nc_nlen == nlc->nlc_namelen &&
			    (rep_ncp->nc_flag & NCF_DESTROYED) &&
			    rep_ncp->nc_refs == 2) {
				/*
				 * Update nc_name and reuse the entry as new.
				 */
				ncp = rep_ncp;
				bcopy(nlc->nlc_nameptr, ncp->nc_name,
				      nlc->nlc_namelen);
				spin_unlock_shared(&nchpp->spin);
				_cache_setunresolved(ncp);
				ncp->nc_flag = NCF_UNRESOLVED;
				ncp->nc_error = ENOTCONN;
				goto found;
			}
			_cache_put(rep_ncp);
		}
	}

	/*
	 * Otherwise create a new entry and add it to the cache.  The parent
	 * ncp must also be locked so we can link into it.
	 *
	 * We have to relookup after possibly blocking in kmalloc or
	 * when locking par_nch.
	 *
	 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
	 *	 mount case, in which case nc_name will be NULL.
	 */
	if (new_ncp == NULL) {
		spin_unlock_shared(&nchpp->spin);
		new_ncp = cache_alloc(nlc->nlc_namelen);
		if (nlc->nlc_namelen) {
			bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
			      nlc->nlc_namelen);
			new_ncp->nc_name[nlc->nlc_namelen] = 0;
		}
		goto restart;
	}

	/*
	 * NOTE! The spinlock is held exclusively here because new_ncp
	 *	 is non-NULL.
	 */
	if (par_locked == 0) {
		spin_unlock(&nchpp->spin);
		_cache_lock(par_nch->ncp);
		par_locked = 1;
		goto restart;
	}

	/*
	 * Link to parent (requires another ref, the one already in new_ncp
	 * is what we will return).
	 *
	 * WARNING! We still hold the spinlock.  We have to set the hash
	 *	    table entry atomically.
	 */
	ncp = new_ncp;
	++ncp->nc_refs;
	_cache_link_parent(ncp, par_nch->ncp, nchpp);
	spin_unlock(&nchpp->spin);
	_cache_unlock(par_nch->ncp);
	/* par_locked = 0 - not used */
found:
	/*
	 * stats and namecache size management
	 */
	if (ncp->nc_flag & NCF_UNRESOLVED)
		++gd->gd_nchstats->ncs_miss;
	else if (ncp->nc_vp)
		++gd->gd_nchstats->ncs_goodhits;
	else
		++gd->gd_nchstats->ncs_neghits;
	nch.mount = mp;
	nch.ncp = ncp;
	_cache_mntref(nch.mount);

	return(nch);
}

/*
 * Attempt to lookup a namecache entry and return with a shared namecache
 * lock.  This operates non-blocking.  EWOULDBLOCK is returned if excl is
 * set or we are unable to lock.
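 *
 * Typical usage is a sketch of the form below, where a failed shared
 * attempt falls back to the blocking, exclusive cache_nlookup():
 *
 *	if (cache_nlookup_maybe_shared(par_nch, &nlc, 0, &nch) ==
 *	    EWOULDBLOCK) {
 *		nch = cache_nlookup(par_nch, &nlc);
 *	}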
 */
int
cache_nlookup_maybe_shared(struct nchandle *par_nch,
			   struct nlcomponent *nlc,
			   int excl, struct nchandle *res_nch)
{
	struct namecache *ncp;
	struct nchash_head *nchpp;
	struct mount *mp;
	u_int32_t hash;
	globaldata_t gd;

	/*
	 * If exclusive requested or shared namecache locks are disabled,
	 * return failure.
	 */
	if (ncp_shared_lock_disable || excl)
		return(EWOULDBLOCK);

	gd = mycpu;
	mp = par_nch->mount;

	/*
	 * This is a good time to call cache_hysteresis(); no ncp's are
	 * locked by the caller or us.
	 */
	cache_hysteresis(1);

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	nchpp = NCHHASH(hash);

	spin_lock_shared(&nchpp->spin);

	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 * do not.
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen &&
		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
		    (ncp->nc_flag & NCF_DESTROYED) == 0
		) {
			_cache_hold(ncp);
			spin_unlock_shared(&nchpp->spin);

			if (_cache_lock_shared_special(ncp) == 0) {
				if (ncp->nc_parent == par_nch->ncp &&
				    ncp->nc_nlen == nlc->nlc_namelen &&
				    bcmp(ncp->nc_name, nlc->nlc_nameptr,
					 ncp->nc_nlen) == 0 &&
				    (ncp->nc_flag & NCF_DESTROYED) == 0 &&
				    (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
				    _cache_auto_unresolve_test(mp, ncp) == 0) {
					goto found;
				}
				_cache_unlock(ncp);
			}
			_cache_drop(ncp);
			return(EWOULDBLOCK);
		}
	}

	/*
	 * Failure
	 */
	spin_unlock_shared(&nchpp->spin);
	return(EWOULDBLOCK);

	/*
	 * Success
	 *
	 * Note that nc_error might be non-zero (e.g. ENOENT).
	 */
found:
	res_nch->mount = mp;
	res_nch->ncp = ncp;
	++gd->gd_nchstats->ncs_goodhits;
	_cache_mntref(res_nch->mount);

	KKASSERT(ncp->nc_error != EWOULDBLOCK);
	return(ncp->nc_error);
}

/*
 * This is a non-blocking version of cache_nlookup() used by
 * nfs_readdirplusrpc_uio().  It can fail for any reason and
 * will return nch.ncp == NULL in that case.
 */
struct nchandle
cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
{
	struct nchandle nch;
	struct namecache *ncp;
	struct namecache *new_ncp;
	struct nchash_head *nchpp;
	struct mount *mp;
	u_int32_t hash;
	globaldata_t gd;
	int par_locked;

	gd = mycpu;
	mp = par_nch->mount;
	par_locked = 0;

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	new_ncp = NULL;
	nchpp = NCHHASH(hash);
restart:
	spin_lock(&nchpp->spin);
	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 * do not.
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen &&
		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
		    (ncp->nc_flag & NCF_DESTROYED) == 0
		) {
			_cache_hold(ncp);
			spin_unlock(&nchpp->spin);
			if (par_locked) {
				_cache_unlock(par_nch->ncp);
				par_locked = 0;
			}
			if (_cache_lock_special(ncp) == 0) {
				if (ncp->nc_parent != par_nch->ncp ||
				    ncp->nc_nlen != nlc->nlc_namelen ||
				    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
				    (ncp->nc_flag & NCF_DESTROYED)) {
					kprintf("cache_lookup_nonblock: "
						"ncp-race %p %*.*s\n",
						ncp,
						nlc->nlc_namelen,
						nlc->nlc_namelen,
						nlc->nlc_nameptr);
					_cache_unlock(ncp);
					_cache_drop(ncp);
					goto failed;
				}
				_cache_auto_unresolve(mp, ncp);
				if (new_ncp) {
					_cache_free(new_ncp);
					new_ncp = NULL;
				}
				goto found;
			}
			_cache_drop(ncp);
			goto failed;
		}
	}

	/*
	 * We failed to locate an entry, create a new entry and add it to
	 * the cache.  The parent ncp must also be locked so we
	 * can link into it.
	 *
	 * We have to relookup after possibly blocking in kmalloc or
	 * when locking par_nch.
	 *
	 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
	 *	 mount case, in which case nc_name will be NULL.
	 */
	if (new_ncp == NULL) {
		spin_unlock(&nchpp->spin);
		new_ncp = cache_alloc(nlc->nlc_namelen);
		if (nlc->nlc_namelen) {
			bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
			      nlc->nlc_namelen);
			new_ncp->nc_name[nlc->nlc_namelen] = 0;
		}
		goto restart;
	}
	if (par_locked == 0) {
		spin_unlock(&nchpp->spin);
		if (_cache_lock_nonblock(par_nch->ncp) == 0) {
			par_locked = 1;
			goto restart;
		}
		goto failed;
	}

	/*
	 * Link to parent (requires another ref, the one already in new_ncp
	 * is what we will return).
	 *
	 * WARNING! We still hold the spinlock.  We have to set the hash
	 *	    table entry atomically.
	 */
	ncp = new_ncp;
	++ncp->nc_refs;
	_cache_link_parent(ncp, par_nch->ncp, nchpp);
	spin_unlock(&nchpp->spin);
	_cache_unlock(par_nch->ncp);
	/* par_locked = 0 - not used */
found:
	/*
	 * stats and namecache size management
	 */
	if (ncp->nc_flag & NCF_UNRESOLVED)
		++gd->gd_nchstats->ncs_miss;
	else if (ncp->nc_vp)
		++gd->gd_nchstats->ncs_goodhits;
	else
		++gd->gd_nchstats->ncs_neghits;
	nch.mount = mp;
	nch.ncp = ncp;
	_cache_mntref(nch.mount);

	return(nch);
failed:
	if (new_ncp) {
		_cache_free(new_ncp);
		new_ncp = NULL;
	}
	nch.mount = NULL;
	nch.ncp = NULL;
	return(nch);
}

/*
 * This version is non-locking.  The caller must validate the result
 * for parent-to-child continuity.
 *
 * It can fail for any reason and will return nch.ncp == NULL in that case.
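 *
 * Illustrative pattern (sketch): a NULL result typically causes the
 * caller to retry via the locked cache_nlookup() path:
 *
 *	nch = cache_nlookup_nonlocked(par_nch, &nlc);
 *	if (nch.ncp == NULL) {
 *		... fall back to cache_nlookup() ...
 *	}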
 */
struct nchandle
cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
{
	struct nchandle nch;
	struct namecache *ncp;
	struct nchash_head *nchpp;
	struct mount *mp;
	u_int32_t hash;
	globaldata_t gd;

	gd = mycpu;
	mp = par_nch->mount;

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	nchpp = NCHHASH(hash);

	spin_lock_shared(&nchpp->spin);
	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 * do not.
		 *
		 * Resolved NFS entries which have timed out fail so the
		 * caller can rerun with normal locking.
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen &&
		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
		    (ncp->nc_flag & NCF_DESTROYED) == 0
		) {
			if (_cache_auto_unresolve_test(par_nch->mount, ncp))
				break;
			_cache_hold(ncp);
			spin_unlock_shared(&nchpp->spin);
			goto found;
		}
	}
	spin_unlock_shared(&nchpp->spin);
	nch.mount = NULL;
	nch.ncp = NULL;
	return nch;
found:
	/*
	 * stats and namecache size management
	 */
	if (ncp->nc_flag & NCF_UNRESOLVED)
		++gd->gd_nchstats->ncs_miss;
	else if (ncp->nc_vp)
		++gd->gd_nchstats->ncs_goodhits;
	else
		++gd->gd_nchstats->ncs_neghits;
	nch.mount = mp;
	nch.ncp = ncp;
	_cache_mntref(nch.mount);

	return(nch);
}

/*
 * The namecache entry is marked as being used as a mount point.
 * Locate the mount if it is visible to the caller.  The DragonFly
 * mount system allows arbitrary loops in the topology and disentangles
 * those loops by matching against (mp, ncp) rather than just (ncp).
 * This means any given ncp can dive any number of mounts, depending
 * on the relative mount (e.g. nullfs) the caller is at in the topology.
 *
 * We use a very simple frontend cache to reduce SMP conflicts,
 * which we have to do because the mountlist scan needs an exclusive
 * lock around its ripout info list.  Not to mention that there might
 * be a lot of mounts.
 *
 * Because all mounts can potentially be accessed by all cpus, we accept
 * some contention between cpus rather than making the cache excessively
 * huge.
 *
 * The hash table is organized into NCMOUNT_SET-way set-associative sets
 * (NCMOUNT_SET is currently 8); see ncmount_cache_lookup4() for the
 * indexing.
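 *
 * For example (illustrative arithmetic): with NCMOUNT_NUMCACHE = 16384
 * and NCMOUNT_SET = 8 the mask ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1))
 * is (16383 & ~7), so the hash selects one of 2048 sets of 8 consecutive
 * entries, which are then scanned linearly by ncmount_cache_lookup().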
 */
struct findmount_info {
	struct mount *result;
	struct mount *nch_mount;
	struct namecache *nch_ncp;
};

static __inline
struct ncmount_cache *
ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
{
	uint32_t hash;

	hash = iscsi_crc32(&mp, sizeof(mp));
	hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
	hash ^= hash >> 16;
	hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));

	return (&ncmount_cache[hash]);
}

static
struct ncmount_cache *
ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
{
	struct ncmount_cache *ncc;
	struct ncmount_cache *best;
	int delta;
	int best_delta;
	int i;

	ncc = ncmount_cache_lookup4(mp, ncp);

	/*
	 * NOTE: When checking for a ticks overflow implement a slop of
	 *	 2 ticks just to be safe, because ticks is accessed
	 *	 non-atomically: one CPU can increment it while another
	 *	 is still using the old value.
	 */
	if (ncc->ncp == ncp && ncc->mp == mp)		/* 0 */
		return ncc;
	delta = (int)(ticks - ncc->ticks);		/* beware GCC opts */
	if (delta < -2)					/* overflow reset */
		ncc->ticks = ticks;
	best = ncc;
	best_delta = delta;

	for (i = 1; i < NCMOUNT_SET; ++i) {	/* remaining set entries */
		++ncc;
		if (ncc->ncp == ncp && ncc->mp == mp)
			return ncc;
		delta = (int)(ticks - ncc->ticks);
		if (delta < -2)
			ncc->ticks = ticks;
		if (delta > best_delta) {
			best_delta = delta;
			best = ncc;
		}
	}
	return best;
}

/*
 * pcpu-optimized mount search.  Locate the recursive mountpoint, avoid
 * doing an expensive mountlist_scan*() if possible.
 *
 * (mp, ncp) -> mountonpt.k
 *
 * Returns a referenced mount pointer or NULL
 *
 * General SMP operation uses a per-cpu umount_spin to interlock unmount
 * operations (that is, where the mp_target can be freed out from under us).
 *
 * Lookups use the ncc->updating counter to validate the contents in order
 * to avoid having to obtain the per cache-element spin-lock.  In addition,
 * the ticks field is only updated when it changes.  However, if our per-cpu
 * lock fails due to an unmount-in-progress, we fall back to the
 * cache-element's spin-lock.
 */
struct mount *
cache_findmount(struct nchandle *nch)
{
	struct findmount_info info;
	struct ncmount_cache *ncc;
	struct ncmount_cache ncc_copy;
	struct mount *target;
	struct pcpu_ncache *pcpu;
	struct spinlock *spinlk;
	int update;

	pcpu = pcpu_ncache;
	if (ncmount_cache_enable == 0 || pcpu == NULL) {
		ncc = NULL;
		goto skip;
	}
	pcpu += mycpu->gd_cpuid;

again:
	ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
	if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
found:
		/*
		 * This is a bit messy for now because we do not yet have
		 * safe disposal of mount structures.  We have to ref
		 * ncc->mp_target but the 'update' counter only tells us
		 * whether the cache has changed after the fact.
		 *
		 * For now get a per-cpu spinlock that will only contend
		 * against umounts.  This is the best path.  If it fails,
		 * instead of waiting on the umount we fall back to a
		 * shared ncc->spin lock, which will generally only cost a
		 * cache ping-pong.
3449 */ 3450 update = ncc->updating; 3451 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3452 spinlk = &pcpu->umount_spin; 3453 } else { 3454 spinlk = &ncc->spin; 3455 spin_lock_shared(spinlk); 3456 } 3457 if (update & 1) { /* update in progress */ 3458 spin_unlock_any(spinlk); 3459 goto skip; 3460 } 3461 ncc_copy = *ncc; 3462 cpu_lfence(); 3463 if (ncc->updating != update) { /* content changed */ 3464 spin_unlock_any(spinlk); 3465 goto again; 3466 } 3467 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3468 spin_unlock_any(spinlk); 3469 goto again; 3470 } 3471 if (ncc_copy.isneg == 0) { 3472 target = ncc_copy.mp_target; 3473 if (target->mnt_ncmounton.mount == nch->mount && 3474 target->mnt_ncmounton.ncp == nch->ncp) { 3475 /* 3476 * Cache hit (positive) (avoid dirtying 3477 * the cache line if possible) 3478 */ 3479 if (ncc->ticks != (int)ticks) 3480 ncc->ticks = (int)ticks; 3481 _cache_mntref(target); 3482 } 3483 } else { 3484 /* 3485 * Cache hit (negative) (avoid dirtying 3486 * the cache line if possible) 3487 */ 3488 if (ncc->ticks != (int)ticks) 3489 ncc->ticks = (int)ticks; 3490 target = NULL; 3491 } 3492 spin_unlock_any(spinlk); 3493 3494 return target; 3495 } 3496 skip: 3497 3498 /* 3499 * Slow 3500 */ 3501 info.result = NULL; 3502 info.nch_mount = nch->mount; 3503 info.nch_ncp = nch->ncp; 3504 mountlist_scan(cache_findmount_callback, &info, 3505 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3506 3507 /* 3508 * To reduce multi-re-entry on the cache, relookup in the cache. 3509 * This can still race, obviously, but that's ok. 3510 */ 3511 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3512 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3513 if (info.result) 3514 atomic_add_int(&info.result->mnt_refs, -1); 3515 goto found; 3516 } 3517 3518 /* 3519 * Cache the result. 3520 */ 3521 if ((info.result == NULL || 3522 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3523 spin_lock(&ncc->spin); 3524 atomic_add_int_nonlocked(&ncc->updating, 1); 3525 cpu_sfence(); 3526 KKASSERT(ncc->updating & 1); 3527 if (ncc->mp != nch->mount) { 3528 if (ncc->mp) 3529 atomic_add_int(&ncc->mp->mnt_refs, -1); 3530 atomic_add_int(&nch->mount->mnt_refs, 1); 3531 ncc->mp = nch->mount; 3532 } 3533 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3534 ncc->ticks = (int)ticks; 3535 3536 if (info.result) { 3537 ncc->isneg = 0; 3538 if (ncc->mp_target != info.result) { 3539 if (ncc->mp_target) 3540 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3541 ncc->mp_target = info.result; 3542 atomic_add_int(&info.result->mnt_refs, 1); 3543 } 3544 } else { 3545 ncc->isneg = 1; 3546 if (ncc->mp_target) { 3547 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3548 ncc->mp_target = NULL; 3549 } 3550 } 3551 cpu_sfence(); 3552 atomic_add_int_nonlocked(&ncc->updating, 1); 3553 spin_unlock(&ncc->spin); 3554 } 3555 return(info.result); 3556 } 3557 3558 static 3559 int 3560 cache_findmount_callback(struct mount *mp, void *data) 3561 { 3562 struct findmount_info *info = data; 3563 3564 /* 3565 * Check the mount's mounted-on point against the passed nch. 3566 */ 3567 if (mp->mnt_ncmounton.mount == info->nch_mount && 3568 mp->mnt_ncmounton.ncp == info->nch_ncp 3569 ) { 3570 info->result = mp; 3571 _cache_mntref(mp); 3572 return(-1); 3573 } 3574 return(0); 3575 } 3576 3577 void 3578 cache_dropmount(struct mount *mp) 3579 { 3580 _cache_mntrel(mp); 3581 } 3582 3583 /* 3584 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3585 * or negative). 
3586 * 3587 * A full scan is not required, but for now just do it anyway. 3588 */ 3589 void 3590 cache_ismounting(struct mount *mp) 3591 { 3592 struct ncmount_cache *ncc; 3593 struct mount *ncc_mp; 3594 int i; 3595 3596 if (pcpu_ncache == NULL) 3597 return; 3598 3599 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3600 ncc = &ncmount_cache[i]; 3601 if (ncc->mp != mp->mnt_ncmounton.mount || 3602 ncc->ncp != mp->mnt_ncmounton.ncp) { 3603 continue; 3604 } 3605 spin_lock(&ncc->spin); 3606 atomic_add_int_nonlocked(&ncc->updating, 1); 3607 cpu_sfence(); 3608 KKASSERT(ncc->updating & 1); 3609 if (ncc->mp != mp->mnt_ncmounton.mount || 3610 ncc->ncp != mp->mnt_ncmounton.ncp) { 3611 cpu_sfence(); 3612 ++ncc->updating; 3613 spin_unlock(&ncc->spin); 3614 continue; 3615 } 3616 ncc_mp = ncc->mp; 3617 ncc->ncp = NULL; 3618 ncc->mp = NULL; 3619 if (ncc_mp) 3620 atomic_add_int(&ncc_mp->mnt_refs, -1); 3621 ncc_mp = ncc->mp_target; 3622 ncc->mp_target = NULL; 3623 if (ncc_mp) 3624 atomic_add_int(&ncc_mp->mnt_refs, -1); 3625 ncc->ticks = (int)ticks - hz * 120; 3626 3627 cpu_sfence(); 3628 atomic_add_int_nonlocked(&ncc->updating, 1); 3629 spin_unlock(&ncc->spin); 3630 } 3631 3632 /* 3633 * Pre-cache the mount point 3634 */ 3635 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3636 mp->mnt_ncmounton.ncp); 3637 3638 spin_lock(&ncc->spin); 3639 atomic_add_int_nonlocked(&ncc->updating, 1); 3640 cpu_sfence(); 3641 KKASSERT(ncc->updating & 1); 3642 3643 if (ncc->mp) 3644 atomic_add_int(&ncc->mp->mnt_refs, -1); 3645 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3646 ncc->mp = mp->mnt_ncmounton.mount; 3647 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3648 ncc->ticks = (int)ticks; 3649 3650 ncc->isneg = 0; 3651 if (ncc->mp_target != mp) { 3652 if (ncc->mp_target) 3653 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3654 ncc->mp_target = mp; 3655 atomic_add_int(&mp->mnt_refs, 1); 3656 } 3657 cpu_sfence(); 3658 atomic_add_int_nonlocked(&ncc->updating, 1); 3659 spin_unlock(&ncc->spin); 3660 } 3661 3662 /* 3663 * Scrap any ncmount_cache entries related to mp. Not only do we need to 3664 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3665 * negative hits involving (mp, <any>). 3666 * 3667 * A full scan is required. 
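 *
 * Each matching entry is invalidated under its spin lock using the
 * seqlock-style ncc->updating protocol that lookups validate against
 * (sketch of the write side, mirroring the code below):
 *
 *	spin_lock(&ncc->spin);
 *	atomic_add_int_nonlocked(&ncc->updating, 1);	(now odd: busy)
 *	cpu_sfence();
 *	... clear ncc fields and drop mnt_refs ...
 *	cpu_sfence();
 *	atomic_add_int_nonlocked(&ncc->updating, 1);	(even again: stable)
 *	spin_unlock(&ncc->spin);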
 */
void
cache_unmounting(struct mount *mp)
{
	struct ncmount_cache *ncc;
	struct pcpu_ncache *pcpu;
	struct mount *ncc_mp;
	int i;

	pcpu = pcpu_ncache;
	if (pcpu == NULL)
		return;

	for (i = 0; i < ncpus; ++i)
		spin_lock(&pcpu[i].umount_spin);

	for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
		ncc = &ncmount_cache[i];
		if (ncc->mp != mp && ncc->mp_target != mp)
			continue;
		spin_lock(&ncc->spin);
		atomic_add_int_nonlocked(&ncc->updating, 1);
		cpu_sfence();

		if (ncc->mp != mp && ncc->mp_target != mp) {
			atomic_add_int_nonlocked(&ncc->updating, 1);
			cpu_sfence();
			spin_unlock(&ncc->spin);
			continue;
		}
		ncc_mp = ncc->mp;
		ncc->ncp = NULL;
		ncc->mp = NULL;
		if (ncc_mp)
			atomic_add_int(&ncc_mp->mnt_refs, -1);
		ncc_mp = ncc->mp_target;
		ncc->mp_target = NULL;
		if (ncc_mp)
			atomic_add_int(&ncc_mp->mnt_refs, -1);
		ncc->ticks = (int)ticks - hz * 120;

		cpu_sfence();
		atomic_add_int_nonlocked(&ncc->updating, 1);
		spin_unlock(&ncc->spin);
	}

	for (i = 0; i < ncpus; ++i)
		spin_unlock(&pcpu[i].umount_spin);
}

/*
 * Resolve an unresolved namecache entry, generally by looking it up.
 * The passed ncp must be locked and refd.
 *
 * Theoretically since a vnode cannot be recycled while held, and since
 * the nc_parent chain holds its vnode as long as children exist, the
 * direct parent of the cache entry we are trying to resolve should
 * have a valid vnode.  If not then generate an error that we can
 * determine is related to a resolver bug.
 *
 * However, if a vnode was in the middle of being recycled when the NCP
 * got locked, ncp->nc_vp might point to a vnode that is about to become
 * invalid.  cache_resolve() handles this case by unresolving the entry
 * and then re-resolving it.
 *
 * Note that successful resolution does not necessarily return an error
 * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
 * will be returned.
 */
int
cache_resolve(struct nchandle *nch, struct ucred *cred)
{
	struct namecache *par_tmp;
	struct namecache *par;
	struct namecache *ncp;
	struct nchandle nctmp;
	struct mount *mp;
	struct vnode *dvp;
	int error;

	ncp = nch->ncp;
	mp = nch->mount;
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
restart:
	/*
	 * If the ncp is already resolved we have nothing to do.  However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
			return (ncp->nc_error);
	}

	/*
	 * If the ncp was destroyed it will never resolve again.  This
	 * can basically only happen when someone is chdir'd into an
	 * empty directory which is then rmdir'd.  We want to catch this
	 * here and not dive the VFS because the VFS might actually
	 * have a way to re-resolve the disconnected ncp, which will
	 * result in inconsistencies in the cdir/nch for proc->p_fd.
	 */
	if (ncp->nc_flag & NCF_DESTROYED)
		return(EINVAL);

	/*
	 * Mount points need special handling because the parent does not
	 * belong to the same filesystem as the ncp.
	 */
	if (ncp == mp->mnt_ncmountpt.ncp)
		return (cache_resolve_mp(mp));

	/*
	 * We expect an unbroken chain of ncps to at least the mount point,
	 * and even all the way to root (but this code doesn't have to go
	 * past the mount point).
	 */
	if (ncp->nc_parent == NULL) {
		kprintf("EXDEV case 1 %p %*.*s\n", ncp,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		ncp->nc_error = EXDEV;
		return(ncp->nc_error);
	}

	/*
	 * The vp's of the parent directories in the chain are held via vhold()
	 * due to the existence of the child, and should not disappear.
	 * However, there are cases where they can disappear:
	 *
	 *	- due to filesystem I/O errors.
	 *	- due to NFS being stupid about tracking the namespace and
	 *	  destroying the namespace for entire directories quite often.
	 *	- due to forced unmounts.
	 *	- due to an rmdir (parent will be marked DESTROYED)
	 *
	 * When this occurs we have to track the chain backwards and resolve
	 * it, looping until the resolver catches up to the current node.  We
	 * could recurse here but we might run ourselves out of kernel stack
	 * so we do it in a more painful manner.  This situation really should
	 * not occur all that often, and if it does it should not have to go
	 * back too many nodes to resolve the ncp.
	 */
	while ((dvp = cache_dvpref(ncp)) == NULL) {
		/*
		 * This case can occur if a process is CD'd into a
		 * directory which is then rmdir'd.  If the parent is marked
		 * destroyed there is no point trying to resolve it.
		 */
		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
			return(ENOENT);
		par = ncp->nc_parent;
		_cache_hold(par);
		_cache_lock(par);
		while ((par_tmp = par->nc_parent) != NULL &&
		       par_tmp->nc_vp == NULL) {
			_cache_hold(par_tmp);
			_cache_lock(par_tmp);
			_cache_put(par);
			par = par_tmp;
		}
		if (par->nc_parent == NULL) {
			kprintf("EXDEV case 2 %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			return (EXDEV);
		}
		/*
		 * The parent is not set in stone, ref and lock it to prevent
		 * it from disappearing.  Also note that due to renames it
		 * is possible for our ncp to move and for par to no longer
		 * be one of its parents.  We resolve it anyway, the loop
		 * will handle any moves.
		 */
		_cache_get(par);	/* additional hold/lock */
		_cache_put(par);	/* from earlier hold/lock */
		if (par == nch->mount->mnt_ncmountpt.ncp) {
			cache_resolve_mp(nch->mount);
		} else if ((dvp = cache_dvpref(par)) == NULL) {
			kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			continue;
		} else {
			if (par->nc_flag & NCF_UNRESOLVED) {
				nctmp.mount = mp;
				nctmp.ncp = par;
				par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
			}
			vrele(dvp);
		}
		if ((error = par->nc_error) != 0) {
			if (par->nc_error != EAGAIN) {
				kprintf("EXDEV case 3 %*.*s error %d\n",
					par->nc_nlen, par->nc_nlen, par->nc_name,
					par->nc_error);
				_cache_put(par);
				return(error);
			}
			kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
				par, par->nc_nlen, par->nc_nlen, par->nc_name);
		}
		_cache_put(par);
		/* loop */
	}

	/*
	 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
	 * ncp's and reattach them.  If this occurs the original ncp is marked
	 * EAGAIN to force a relookup.
	 *
	 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
	 *	 ncp must already be resolved.
	 */
	if (dvp) {
		nctmp.mount = mp;
		nctmp.ncp = ncp;
		ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
		vrele(dvp);
	} else {
		ncp->nc_error = EPERM;
	}
	if (ncp->nc_error == EAGAIN) {
		kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
			ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		goto restart;
	}
	return(ncp->nc_error);
}

/*
 * Resolve the ncp associated with a mount point.  Such ncp's almost always
 * remain resolved and this routine is rarely called.  NFS MPs tend to force
 * re-resolution more often due to NFS's mac-truck-smash-the-namecache
 * method of tracking namespace changes.
 *
 * The semantics of this call are that the passed ncp must be locked on
 * entry and will be locked on return.  However, if we actually have to
 * resolve the mount point we temporarily unlock the entry in order to
 * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
 * the unlock we have to recheck the flags after we relock.
 */
static int
cache_resolve_mp(struct mount *mp)
{
	struct namecache *ncp = mp->mnt_ncmountpt.ncp;
	struct vnode *vp;
	int error;

	KKASSERT(mp != NULL);

	/*
	 * If the ncp is already resolved we have nothing to do.  However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
	}

	if (ncp->nc_flag & NCF_UNRESOLVED) {
		_cache_unlock(ncp);
		while (vfs_busy(mp, 0))
			;
		error = VFS_ROOT(mp, &vp);
		_cache_lock(ncp);

		/*
		 * recheck the ncp state after relocking.
/*
 * Clean out negative cache entries when too many have accumulated.
 */
static void
_cache_cleanneg(long count)
{
	struct pcpu_ncache *pn;
	struct namecache *ncp;
	static uint32_t neg_rover;
	uint32_t n;
	long vnegs;

	n = neg_rover++;	/* SMP heuristic, race ok */
	cpu_ccfence();
	n = n % (uint32_t)ncpus;

	/*
	 * Normalize vfscache_negs and count.  count is sometimes based
	 * on vfscache_negs.  vfscache_negs is a heuristic and can
	 * sometimes have wildly out-of-range values, so clamp both.
	 */
	vnegs = vfscache_negs;
	cpu_ccfence();
	if (vnegs <= MINNEG)
		vnegs = MINNEG;
	if (count < 1)
		count = 1;

	pn = &pcpu_ncache[n];
	spin_lock(&pn->neg_spin);
	count = pn->neg_count * count / vnegs + 1;
	spin_unlock(&pn->neg_spin);

	/*
	 * Attempt to clean out the specified number of negative cache
	 * entries.
	 */
	while (count > 0) {
		spin_lock(&pn->neg_spin);
		ncp = TAILQ_FIRST(&pn->neg_list);
		if (ncp == NULL) {
			spin_unlock(&pn->neg_spin);
			break;
		}
		TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);
		spin_unlock(&pn->neg_spin);

		/*
		 * This can race, so we must re-check that the ncp is
		 * still a resolved negative entry after successfully
		 * locking it.
		 */
		if (_cache_lock_special(ncp) == 0) {
			if (ncp->nc_vp == NULL &&
			    (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				cache_zap(ncp);
			} else {
				_cache_unlock(ncp);
				_cache_drop(ncp);
			}
		} else {
			_cache_drop(ncp);
		}
		--count;
	}
}
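/*
 * Worked example for the scaling above (numbers invented for
 * illustration): the per-cpu quota is neg_count * count / vnegs + 1.
 * If the caller requests count = 128 removals, the clamped global
 * total is vnegs = 4096, and this cpu's list holds neg_count = 512
 * entries, the local quota becomes 512 * 128 / 4096 + 1 = 17, i.e.
 * roughly this cpu's 1/8 share of the request, rounded up.
 */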
/*
 * Clean out positive cache entries when too many have accumulated.
 */
static void
_cache_cleanpos(long count)
{
	static volatile int rover;
	struct nchash_head *nchpp;
	struct namecache *ncp;
	int rover_copy;

	/*
	 * Attempt to clean out the specified number of positive cache
	 * entries.
	 */
	while (count > 0) {
		rover_copy = ++rover;	/* MPSAFEENOUGH */
		cpu_ccfence();
		nchpp = NCHHASH(rover_copy);

		if (TAILQ_FIRST(&nchpp->list) == NULL) {
			--count;
			continue;
		}

		/*
		 * Cycle ncp on list, ignore and do not move DUMMY
		 * ncps.  These are temporary list iterators.
		 *
		 * We must cycle the ncp to the end of the list to
		 * ensure that all ncp's have an equal chance of
		 * being removed.
		 */
		spin_lock(&nchpp->spin);
		ncp = TAILQ_FIRST(&nchpp->list);
		while (ncp && (ncp->nc_flag & NCF_DUMMY))
			ncp = TAILQ_NEXT(ncp, nc_hash);
		if (ncp) {
			TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
			TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
			_cache_hold(ncp);
		}
		spin_unlock(&nchpp->spin);

		if (ncp) {
			if (_cache_lock_special(ncp) == 0) {
				cache_zap(ncp);
			} else {
				_cache_drop(ncp);
			}
		}
		--count;
	}
}

/*
 * This is a kitchen-sink function to clean out ncps which we
 * tried to zap from cache_drop() but failed to because we were
 * unable to acquire the parent lock.
 *
 * Such entries can also be removed via cache_inval_vp(), such
 * as when unmounting.
 */
static void
_cache_cleandefered(void)
{
	struct nchash_head *nchpp;
	struct namecache *ncp;
	struct namecache dummy;
	int i;

	/*
	 * Create a list iterator.  DUMMY indicates that this is a list
	 * iterator, DESTROYED prevents matches by lookup functions.
	 */
	numdefered = 0;
	pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
	bzero(&dummy, sizeof(dummy));
	dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
	dummy.nc_refs = 1;

	for (i = 0; i <= nchash; ++i) {
		nchpp = &nchashtbl[i];

		spin_lock(&nchpp->spin);
		TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
		ncp = &dummy;
		while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
			if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
				continue;
			TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
			TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy,
					   nc_hash);
			_cache_hold(ncp);
			spin_unlock(&nchpp->spin);
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				_cache_unlock(ncp);
			}
			_cache_drop(ncp);
			spin_lock(&nchpp->spin);
			ncp = &dummy;
		}
		TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
		spin_unlock(&nchpp->spin);
	}
}
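/*
 * The dummy-node technique in _cache_cleandefered() is a general way to
 * iterate a spinlock-protected list while periodically dropping the
 * lock: the on-stack dummy entry holds our position, and flags on it
 * keep other scanners from treating it as real data.  A minimal
 * userspace sketch of the same idea (hypothetical types, not kernel
 * code; the lock calls are left as comments):
 */
#if 0
#include <sys/queue.h>

struct node {
	TAILQ_ENTRY(node) link;
	int is_dummy;		/* analogue of NCF_DUMMY */
	int payload;
};
TAILQ_HEAD(nodelist, node);

static void
scan_with_marker(struct nodelist *list)
{
	struct node dummy = { .is_dummy = 1 };
	struct node *n;

	/* lock(list); */
	TAILQ_INSERT_HEAD(list, &dummy, link);
	n = &dummy;
	while ((n = TAILQ_NEXT(n, link)) != NULL) {
		if (n->is_dummy)
			continue;
		/* Re-anchor the marker after n, then drop the lock. */
		TAILQ_REMOVE(list, &dummy, link);
		TAILQ_INSERT_AFTER(list, n, &dummy, link);
		/* unlock(list); ... process n, may block ... lock(list); */
		n = &dummy;
	}
	TAILQ_REMOVE(list, &dummy, link);
	/* unlock(list); */
}
#endif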
/*
 * Name cache initialization, called from vfsinit() when we are booting.
 */
void
nchinit(void)
{
	struct pcpu_ncache *pn;
	globaldata_t gd;
	int i;

	/*
	 * Per-cpu accounting and negative hit list
	 */
	pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
			      M_VFSCACHE, M_WAITOK|M_ZERO);
	for (i = 0; i < ncpus; ++i) {
		pn = &pcpu_ncache[i];
		TAILQ_INIT(&pn->neg_list);
		spin_init(&pn->neg_spin, "ncneg");
		spin_init(&pn->umount_spin, "ncumm");
	}

	/*
	 * Initialize per-cpu namecache effectiveness statistics.
	 */
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		gd->gd_nchstats = &nchstats[i];
	}

	/*
	 * Create a generous namecache hash table
	 */
	nchashtbl = hashinit_ext(vfs_inodehashsize(),
				 sizeof(struct nchash_head),
				 M_VFSCACHE, &nchash);
	for (i = 0; i <= (int)nchash; ++i) {
		TAILQ_INIT(&nchashtbl[i].list);
		spin_init(&nchashtbl[i].spin, "nchinit_hash");
	}
	for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
		spin_init(&ncmount_cache[i].spin, "nchinit_cache");
	nclockwarn = 5 * hz;
}

/*
 * Called from start_init() to bootstrap the root filesystem.  Returns
 * a referenced, unlocked namecache record in *nch.
 */
void
cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
{
	nch->ncp = cache_alloc(0);
	nch->mount = mp;
	_cache_mntref(mp);
	if (vp)
		_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * vfs_cache_setroot()
 *
 *	Create an association between the root of our namecache and
 *	the root vnode.  This routine may be called several times during
 *	booting.
 *
 *	If the caller intends to save the returned namecache pointer somewhere
 *	it must cache_hold() it.
 */
void
vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
{
	struct vnode *ovp;
	struct nchandle onch;

	ovp = rootvnode;
	onch = rootnch;
	rootvnode = nvp;
	if (nch)
		rootnch = *nch;
	else
		cache_zero(&rootnch);
	if (ovp)
		vrele(ovp);
	if (onch.ncp)
		cache_drop(&onch);
}
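/*
 * vfs_cache_setroot() above follows a swap-then-release discipline:
 * capture the old globals, install the new values, and only then drop
 * the references on the old ones, so the globals never point at an
 * already-released object.  A generic sketch of the pattern
 * (hypothetical names, not kernel API):
 */
#if 0
	old = global_ref;
	global_ref = new_ref;		/* install the new value first */
	if (old)
		release(old);		/* drop the old reference last */
#endif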
/*
 * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
 * topology and is being removed as quickly as possible.  The new VOP_N*()
 * API calls are required to make specific adjustments using the supplied
 * ncp pointers rather than just bogusly purging random vnodes.
 *
 * Invalidate all namecache entries to a particular vnode as well as
 * any direct children of that vnode in the namecache.  This is a
 * 'catch all' purge used by filesystems that do not know any better.
 *
 * Note that the linkage between the vnode and its namecache entries will
 * be removed, but the namecache entries themselves might stay put due to
 * active references from elsewhere in the system or due to the existence of
 * the children.  The namecache topology is left intact even if we do not
 * know what the vnode association is.  Such entries will be marked
 * NCF_UNRESOLVED.
 */
void
cache_purge(struct vnode *vp)
{
	cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
}

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable getcwd");

static u_long numcwdcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
    "Number of current directory resolution calls");
static u_long numcwdfailnf;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
    "Number of current directory failures due to lack of file");
static u_long numcwdfailsz;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
    "Number of current directory failures due to large result");
static u_long numcwdfound;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
    "Number of current directory resolution successes");

/*
 * MPALMOSTSAFE
 */
int
sys___getcwd(struct __getcwd_args *uap)
{
	u_int buflen;
	int error;
	char *buf;
	char *bp;

	if (disablecwd)
		return (ENODEV);

	buflen = uap->buflen;
	if (buflen == 0)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = kmalloc(buflen, M_TEMP, M_WAITOK);
	bp = kern_getcwd(buf, buflen, &error);
	if (error == 0)
		error = copyout(bp, uap->buf, strlen(bp) + 1);
	kfree(buf, M_TEMP);
	return (error);
}

char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct nchandle nch;
	struct namecache *ncp;

	numcwdcalls++;
	bp = buf;
	bp += buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;

	nch = fdp->fd_ncdir;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);

	while (ncp && (ncp != fdp->fd_nrdir.ncp ||
	       nch.mount != fdp->fd_nrdir.mount)
	) {
		/*
		 * While traversing upwards, if we encounter the root
		 * of the current mount we have to skip to the mount point
		 * in the underlying filesystem.
		 */
		if (ncp == nch.mount->mnt_ncmountpt.ncp) {
			nch = nch.mount->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfailsz++;
				*error = ERANGE;
				bp = NULL;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			if (ncp_shared_lock_disable)
				_cache_lock(ncp);
			else
				_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		numcwdfailnf++;
		*error = ENOENT;
		bp = NULL;
		goto done;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
	}
	numcwdfound++;
	*error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return (bp);
}
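/*
 * kern_getcwd() assembles the path right-to-left: start at the end of
 * the buffer, copy each component backwards, and prepend a '/' between
 * components, so no memmove is ever needed.  A self-contained userspace
 * sketch of the same technique (hypothetical helper, not kernel code):
 */
#if 0
#include <stdio.h>
#include <string.h>

/*
 * Join components[0..n-1] (root-most first) into buf right-to-left.
 * Returns a pointer into buf, or NULL if the buffer is too small
 * (the ERANGE analogue).
 */
static char *
join_reverse(char *buf, size_t buflen, const char **components, int n)
{
	char *bp = buf + buflen - 1;
	int i;

	*bp = '\0';
	for (i = n - 1; i >= 0; --i) {	/* leaf-most component first */
		size_t len = strlen(components[i]);

		if ((size_t)(bp - buf) < len + 1)
			return (NULL);	/* would overflow the buffer */
		bp -= len;
		memcpy(bp, components[i], len);
		*--bp = '/';
	}
	return (bp);
}

int
main(void)
{
	const char *parts[] = { "usr", "local", "bin" };
	char buf[64];
	char *p = join_reverse(buf, sizeof(buf), parts, 3);

	if (p)
		printf("%s\n", p);	/* prints /usr/local/bin */
	return (0);
}
#endif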
/*
 * Thus begins the fullpath magic.
 *
 * The passed nchp is referenced but not locked.
 */
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0,
    "Disable fullpath lookups");

int
cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
	       char **retbuf, char **freebuf, int guess)
{
	struct nchandle fd_nrdir;
	struct nchandle nch;
	struct namecache *ncp;
	struct mount *mp, *new_mp;
	char *bp, *buf;
	int slash_prefixed;
	int error = 0;
	int i;

	*retbuf = NULL;
	*freebuf = NULL;

	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	if (nchbase)
		fd_nrdir = *nchbase;
	else if (p != NULL)
		fd_nrdir = p->p_fd->fd_nrdir;
	else
		fd_nrdir = rootnch;
	slash_prefixed = 0;
	nch = *nchp;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);
	mp = nch.mount;

	while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
		new_mp = NULL;

		/*
		 * If we are asked to guess the upwards path, we do so
		 * whenever we encounter an ncp marked as a mountpoint.
		 * We try to locate the actual mount whose root is this
		 * ncp.
		 */
		if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
			new_mp = mount_get_by_nc(ncp);
		}
		/*
		 * While traversing upwards, if we encounter the root
		 * of the current mount we have to skip to the mount point.
		 */
		if (ncp == mp->mnt_ncmountpt.ncp) {
			new_mp = mp;
		}
		if (new_mp) {
			nch = new_mp->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			mp = nch.mount;
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				kfree(buf, M_TEMP);
				error = ENOMEM;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 *
		 * We can only safely access nc_parent with ncp held locked.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		kfree(buf, M_TEMP);
		error = ENOENT;
		goto done;
	}

	if (!slash_prefixed) {
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
	}
	*retbuf = bp;
	*freebuf = buf;
	error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return(error);
}
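/*
 * Caller-side sketch for cache_fullpath() (illustrative assumptions;
 * p and nch are presumed to come from the caller's context): on
 * success *retbuf points into the allocation returned via *freebuf,
 * so the caller uses retbuf but frees freebuf.
 */
#if 0
	char *path, *fbuf;
	int error;

	error = cache_fullpath(p, &nch, NULL, &path, &fbuf, 0);
	if (error == 0) {
		kprintf("full path: %s\n", path);
		kfree(fbuf, M_TEMP);	/* free the buffer, not retbuf */
	}
#endif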
int
vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
	    char **freebuf, int guess)
{
	struct namecache *ncp;
	struct nchandle nch;
	int error;

	*freebuf = NULL;
	if (disablefullpath)
		return (ENODEV);

	if (p == NULL)
		return (EINVAL);

	/* vn is NULL, client wants us to use p->p_textvp */
	if (vn == NULL) {
		if ((vn = p->p_textvp) == NULL)
			return (EINVAL);
	}
	spin_lock_shared(&vn->v_spin);
	TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
		if (ncp->nc_nlen)
			break;
	}
	if (ncp == NULL) {
		spin_unlock_shared(&vn->v_spin);
		return (EINVAL);
	}
	_cache_hold(ncp);
	spin_unlock_shared(&vn->v_spin);

	nch.ncp = ncp;
	nch.mount = vn->v_mount;
	error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
	_cache_drop(ncp);
	return (error);
}

void
vfscache_rollup_cpu(struct globaldata *gd)
{
	struct pcpu_ncache *pn;
	long count;

	if (pcpu_ncache == NULL)
		return;
	pn = &pcpu_ncache[gd->gd_cpuid];

	if (pn->vfscache_count) {
		count = atomic_swap_long(&pn->vfscache_count, 0);
		atomic_add_long(&vfscache_count, count);
	}
	if (pn->vfscache_leafs) {
		count = atomic_swap_long(&pn->vfscache_leafs, 0);
		atomic_add_long(&vfscache_leafs, count);
	}
	if (pn->vfscache_negs) {
		count = atomic_swap_long(&pn->vfscache_negs, 0);
		atomic_add_long(&vfscache_negs, count);
	}
	if (pn->numdefered) {
		count = atomic_swap_long(&pn->numdefered, 0);
		atomic_add_long(&numdefered, count);
	}
}
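/*
 * vfscache_rollup_cpu() uses an atomic swap-to-zero to drain each
 * per-cpu counter into its global total without losing concurrent
 * updates.  A self-contained userspace sketch of the same pattern
 * using C11 atomics (hypothetical names, not kernel code):
 */
#if 0
#include <stdatomic.h>

static _Atomic long percpu_counter;	/* updated locally, cheaply */
static _Atomic long global_counter;	/* read by everyone */

static void
rollup(void)
{
	/* Drain the per-cpu value and fold it into the global total. */
	long count = atomic_exchange(&percpu_counter, 0);

	if (count)
		atomic_fetch_add(&global_counter, count);
}
#endif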