1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
63 */ 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/uio.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/mount.h> 71 #include <sys/vnode.h> 72 #include <sys/malloc.h> 73 #include <sys/sysproto.h> 74 #include <sys/spinlock.h> 75 #include <sys/proc.h> 76 #include <sys/nlookup.h> 77 #include <sys/filedesc.h> 78 #include <sys/fnv_hash.h> 79 #include <sys/globaldata.h> 80 #include <sys/kern_syscall.h> 81 #include <sys/dirent.h> 82 #include <ddb/ddb.h> 83 84 #include <sys/spinlock2.h> 85 86 #define MAX_RECURSION_DEPTH 64 87 88 /* 89 * Random lookups in the cache are accomplished with a hash table using 90 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock, 91 * but we use the ncp->update counter trick to avoid acquiring any 92 * contestable spin-locks during a lookup. 93 * 94 * Negative entries may exist and correspond to resolved namecache 95 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT 96 * will be set if the entry corresponds to a whited-out directory entry 97 * (verses simply not finding the entry at all). pcpu_ncache[n].neg_list 98 * is locked via pcpu_ncache[n].neg_spin; 99 * 100 * MPSAFE RULES: 101 * 102 * (1) ncp's typically have at least a nc_refs of 1, and usually 2. One 103 * is applicable to direct lookups via the hash table nchpp or via 104 * nc_list (the two are added or removed together). Removal of the ncp 105 * from the hash table drops this reference. The second is applicable 106 * to vp->v_namecache linkages (or negative list linkages), and removal 107 * of the ncp from these lists drops this reference. 108 * 109 * On the 1->0 transition of nc_refs the ncp can no longer be referenced 110 * and must be destroyed. No other thread should have access to it at 111 * this point so it can be safely locked and freed without any deadlock 112 * fears. 113 * 114 * The 1->0 transition can occur at almost any juncture and so cache_drop() 115 * deals with it directly. 116 * 117 * (2) Once the 1->0 transition occurs, the entity that caused the transition 118 * will be responsible for destroying the ncp. The ncp cannot be on any 119 * list or hash at this time, or be held by anyone other than the caller 120 * responsible for the transition. 121 * 122 * (3) A ncp must be locked in order to modify it. 123 * 124 * (5) ncp locks are ordered, child-to-parent. Child first, then parent. 125 * This may seem backwards but forward-scans use the hash table and thus 126 * can hold the parent unlocked while traversing downward. Deletions, 127 * on the other-hand, tend to propagate bottom-up since the ref on the 128 * is dropped as the children go away. 129 * 130 * (6) Both parent and child must be locked in order to enter the child onto 131 * the parent's nc_list. 132 */ 133 134 /* 135 * Structures associated with name cacheing. 136 */ 137 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash]) 138 #define MINNEG 1024 139 #define MINPOS 1024 140 #define NCMOUNT_NUMCACHE (16384) /* power of 2 */ 141 #define NCMOUNT_SET (8) /* power of 2 */ 142 143 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 144 145 TAILQ_HEAD(nchash_list, namecache); 146 147 /* 148 * Don't cachealign, but at least pad to 32 bytes so entries 149 * don't cross a cache line. 
150 */ 151 struct nchash_head { 152 struct nchash_list list; /* 16 bytes */ 153 struct spinlock spin; /* 8 bytes */ 154 long pad01; /* 8 bytes */ 155 }; 156 157 struct ncmount_cache { 158 struct spinlock spin; 159 struct namecache *ncp; 160 struct mount *mp; 161 struct mount *mp_target; 162 int isneg; 163 int ticks; 164 int updating; 165 int unused01; 166 }; 167 168 struct pcpu_ncache { 169 struct spinlock umount_spin; /* cache_findmount/interlock */ 170 struct spinlock neg_spin; /* for neg_list and neg_count */ 171 struct namecache_list neg_list; 172 long neg_count; 173 long vfscache_negs; 174 long vfscache_count; 175 long vfscache_leafs; 176 long numdefered; 177 } __cachealign; 178 179 __read_mostly static struct nchash_head *nchashtbl; 180 __read_mostly static struct pcpu_ncache *pcpu_ncache; 181 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 182 183 /* 184 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 185 * to create the namecache infrastructure leading to a dangling vnode. 186 * 187 * 0 Only errors are reported 188 * 1 Successes are reported 189 * 2 Successes + the whole directory scan is reported 190 * 3 Force the directory scan code run as if the parent vnode did not 191 * have a namecache record, even if it does have one. 192 */ 193 __read_mostly static int ncvp_debug; 194 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 195 "Namecache debug level (0-3)"); 196 197 __read_mostly static u_long nchash; /* size of hash table */ 198 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 199 "Size of namecache hash table"); 200 201 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 202 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 203 "Batch flush negative entries"); 204 205 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 206 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 207 "Batch flush positive entries"); 208 209 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 210 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 211 "Ratio of namecache negative entries"); 212 213 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 214 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 215 "Warn on locked namecache entries in ticks"); 216 217 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 218 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 219 "Number of cache entries allocated"); 220 221 __read_mostly static int ncp_shared_lock_disable = 0; 222 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 223 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 224 225 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 226 "sizeof(struct vnode)"); 227 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 228 "sizeof(struct namecache)"); 229 230 __read_mostly static int ncmount_cache_enable = 1; 231 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 232 &ncmount_cache_enable, 0, "mount point cache"); 233 234 static __inline void _cache_drop(struct namecache *ncp); 235 static int cache_resolve_mp(struct mount *mp); 236 static int cache_findmount_callback(struct mount *mp, void *data); 237 static void _cache_setunresolved(struct namecache *ncp); 238 static void _cache_cleanneg(long count); 239 static void _cache_cleanpos(long count); 240 
static void _cache_cleandefered(void); 241 static void _cache_unlink(struct namecache *ncp); 242 243 /* 244 * The new name cache statistics (these are rolled up globals and not 245 * modified in the critical path, see struct pcpu_ncache). 246 */ 247 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 248 static long vfscache_negs; 249 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0, 250 "Number of negative namecache entries"); 251 static long vfscache_count; 252 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0, 253 "Number of namecaches entries"); 254 static long vfscache_leafs; 255 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0, 256 "Number of namecaches entries"); 257 static long numdefered; 258 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 259 "Number of cache entries allocated"); 260 261 262 struct nchstats nchstats[SMP_MAXCPU]; 263 /* 264 * Export VFS cache effectiveness statistics to user-land. 265 * 266 * The statistics are left for aggregation to user-land so 267 * neat things can be achieved, like observing per-CPU cache 268 * distribution. 269 */ 270 static int 271 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 272 { 273 struct globaldata *gd; 274 int i, error; 275 276 error = 0; 277 for (i = 0; i < ncpus; ++i) { 278 gd = globaldata_find(i); 279 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 280 sizeof(struct nchstats)))) 281 break; 282 } 283 284 return (error); 285 } 286 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 287 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 288 289 static void cache_zap(struct namecache *ncp); 290 291 /* 292 * Cache mount points and namecache records in order to avoid unnecessary 293 * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP 294 * performance and is particularly important on multi-socket systems to 295 * reduce cache-line ping-ponging. 296 * 297 * Try to keep the pcpu structure within one cache line (~64 bytes). 
298 */ 299 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 300 #define MNTCACHE_SET 8 /* set associativity */ 301 302 struct mntcache_elm { 303 struct namecache *ncp; 304 struct mount *mp; 305 int ticks; 306 int unused01; 307 }; 308 309 struct mntcache { 310 struct mntcache_elm array[MNTCACHE_COUNT]; 311 } __cachealign; 312 313 static struct mntcache pcpu_mntcache[MAXCPU]; 314 315 static __inline 316 struct mntcache_elm * 317 _cache_mntcache_hash(void *ptr) 318 { 319 struct mntcache_elm *elm; 320 int hv; 321 322 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 323 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 324 325 return elm; 326 } 327 328 static 329 void 330 _cache_mntref(struct mount *mp) 331 { 332 struct mntcache_elm *elm; 333 struct mount *mpr; 334 int i; 335 336 elm = _cache_mntcache_hash(mp); 337 for (i = 0; i < MNTCACHE_SET; ++i) { 338 if (elm->mp == mp) { 339 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 340 if (__predict_true(mpr == mp)) 341 return; 342 if (mpr) 343 atomic_add_int(&mpr->mnt_refs, -1); 344 } 345 ++elm; 346 } 347 atomic_add_int(&mp->mnt_refs, 1); 348 } 349 350 static 351 void 352 _cache_mntrel(struct mount *mp) 353 { 354 struct mntcache_elm *elm; 355 struct mntcache_elm *best; 356 struct mount *mpr; 357 int delta1; 358 int delta2; 359 int i; 360 361 elm = _cache_mntcache_hash(mp); 362 best = elm; 363 for (i = 0; i < MNTCACHE_SET; ++i) { 364 if (elm->mp == NULL) { 365 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 366 if (__predict_false(mpr != NULL)) { 367 atomic_add_int(&mpr->mnt_refs, -1); 368 } 369 elm->ticks = ticks; 370 return; 371 } 372 delta1 = ticks - best->ticks; 373 delta2 = ticks - elm->ticks; 374 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 375 best = elm; 376 ++elm; 377 } 378 mpr = atomic_swap_ptr((void *)&best->mp, mp); 379 best->ticks = ticks; 380 if (mpr) 381 atomic_add_int(&mpr->mnt_refs, -1); 382 } 383 384 /* 385 * Clears all cached mount points on all cpus. This routine should only 386 * be called when we are waiting for a mount to clear, e.g. so we can 387 * unmount. 388 */ 389 void 390 cache_clearmntcache(struct mount *target __unused) 391 { 392 int n; 393 394 for (n = 0; n < ncpus; ++n) { 395 struct mntcache *cache = &pcpu_mntcache[n]; 396 struct mntcache_elm *elm; 397 struct namecache *ncp; 398 struct mount *mp; 399 int i; 400 401 for (i = 0; i < MNTCACHE_COUNT; ++i) { 402 elm = &cache->array[i]; 403 if (elm->mp) { 404 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 405 if (mp) 406 atomic_add_int(&mp->mnt_refs, -1); 407 } 408 if (elm->ncp) { 409 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 410 if (ncp) 411 _cache_drop(ncp); 412 } 413 } 414 } 415 } 416 417 /* 418 * Namespace locking. The caller must already hold a reference to the 419 * namecache structure in order to lock/unlock it. The controlling entity 420 * in a 1->0 transition does not need to lock the ncp to dispose of it, 421 * as nobody else will have visiblity to it at that point. 422 * 423 * Note that holding a locked namecache structure prevents other threads 424 * from making namespace changes (e.g. deleting or creating), prevents 425 * vnode association state changes by other threads, and prevents the 426 * namecache entry from being resolved or unresolved by other threads. 427 * 428 * An exclusive lock owner has full authority to associate/disassociate 429 * vnodes and resolve/unresolve the locked ncp. 430 * 431 * A shared lock owner only has authority to acquire the underlying vnode, 432 * if any. 
433 * 434 * The primary lock field is nc_lockstatus. nc_locktd is set after the 435 * fact (when locking) or cleared prior to unlocking. 436 * 437 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 438 * or recycled, but it does NOT help you if the vnode had already 439 * initiated a recyclement. If this is important, use cache_get() 440 * rather then cache_lock() (and deal with the differences in the 441 * way the refs counter is handled). Or, alternatively, make an 442 * unconditional call to cache_validate() or cache_resolve() 443 * after cache_lock() returns. 444 */ 445 static __inline 446 void 447 _cache_lock(struct namecache *ncp) 448 { 449 int didwarn = 0; 450 int error; 451 452 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 453 while (__predict_false(error == EWOULDBLOCK)) { 454 if (didwarn == 0) { 455 didwarn = ticks - nclockwarn; 456 kprintf("[diagnostic] cache_lock: " 457 "%s blocked on %p " 458 "\"%*.*s\"\n", 459 curthread->td_comm, ncp, 460 ncp->nc_nlen, ncp->nc_nlen, 461 ncp->nc_name); 462 } 463 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK); 464 } 465 if (__predict_false(didwarn)) { 466 kprintf("[diagnostic] cache_lock: " 467 "%s unblocked %*.*s after %d secs\n", 468 curthread->td_comm, 469 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 470 (int)(ticks - didwarn) / hz); 471 } 472 } 473 474 /* 475 * Release a previously acquired lock. 476 * 477 * A concurrent shared-lock acquisition or acquisition/release can 478 * race bit 31 so only drop the ncp if bit 31 was set. 479 */ 480 static __inline 481 void 482 _cache_unlock(struct namecache *ncp) 483 { 484 lockmgr(&ncp->nc_lock, LK_RELEASE); 485 } 486 487 /* 488 * Lock ncp exclusively, non-blocking. Return 0 on success. 489 */ 490 static __inline 491 int 492 _cache_lock_nonblock(struct namecache *ncp) 493 { 494 int error; 495 496 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 497 if (__predict_false(error != 0)) { 498 return(EWOULDBLOCK); 499 } 500 return 0; 501 } 502 503 /* 504 * This is a special form of _cache_lock() which only succeeds if 505 * it can get a pristine, non-recursive lock. The caller must have 506 * already ref'd the ncp. 507 * 508 * On success the ncp will be locked, on failure it will not. The 509 * ref count does not change either way. 510 * 511 * We want _cache_lock_special() (on success) to return a definitively 512 * usable vnode or a definitively unresolved ncp. 513 */ 514 static __inline 515 int 516 _cache_lock_special(struct namecache *ncp) 517 { 518 if (_cache_lock_nonblock(ncp) == 0) { 519 if (lockmgr_oneexcl(&ncp->nc_lock)) { 520 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 521 _cache_setunresolved(ncp); 522 return 0; 523 } 524 _cache_unlock(ncp); 525 } 526 return EWOULDBLOCK; 527 } 528 529 /* 530 * Shared lock, guarantees vp held 531 * 532 * The shared lock holds vp on the 0->1 transition. It is possible to race 533 * another shared lock release, preventing the other release from dropping 534 * the vnode and clearing bit 31. 535 * 536 * If it is not set then we are responsible for setting it, and this 537 * responsibility does not race with anyone else. 
538 */ 539 static __inline 540 void 541 _cache_lock_shared(struct namecache *ncp) 542 { 543 int didwarn = 0; 544 int error; 545 546 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 547 while (__predict_false(error == EWOULDBLOCK)) { 548 if (didwarn == 0) { 549 didwarn = ticks - nclockwarn; 550 kprintf("[diagnostic] cache_lock_shared: " 551 "%s blocked on %p " 552 "\"%*.*s\"\n", 553 curthread->td_comm, ncp, 554 ncp->nc_nlen, ncp->nc_nlen, 555 ncp->nc_name); 556 } 557 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 558 } 559 if (__predict_false(didwarn)) { 560 kprintf("[diagnostic] cache_lock_shared: " 561 "%s unblocked %*.*s after %d secs\n", 562 curthread->td_comm, 563 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 564 (int)(ticks - didwarn) / hz); 565 } 566 } 567 568 /* 569 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 570 */ 571 static __inline 572 int 573 _cache_lock_shared_nonblock(struct namecache *ncp) 574 { 575 int error; 576 577 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 578 if (__predict_false(error != 0)) { 579 return(EWOULDBLOCK); 580 } 581 return 0; 582 } 583 584 /* 585 * This function tries to get a shared lock but will back-off to an 586 * exclusive lock if: 587 * 588 * (1) Some other thread is trying to obtain an exclusive lock 589 * (to prevent the exclusive requester from getting livelocked out 590 * by many shared locks). 591 * 592 * (2) The current thread already owns an exclusive lock (to avoid 593 * deadlocking). 594 * 595 * WARNING! On machines with lots of cores we really want to try hard to 596 * get a shared lock or concurrent path lookups can chain-react 597 * into a very high-latency exclusive lock. 598 * 599 * This is very evident in dsynth's initial scans. 600 */ 601 static __inline 602 int 603 _cache_lock_shared_special(struct namecache *ncp) 604 { 605 /* 606 * Only honor a successful shared lock (returning 0) if there is 607 * no exclusive request pending and the vnode, if present, is not 608 * in a reclaimed state. 609 */ 610 if (_cache_lock_shared_nonblock(ncp) == 0) { 611 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 612 if (ncp->nc_vp == NULL || 613 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 614 return(0); 615 } 616 } 617 _cache_unlock(ncp); 618 return(EWOULDBLOCK); 619 } 620 621 /* 622 * Non-blocking shared lock failed. If we already own the exclusive 623 * lock just acquire another exclusive lock (instead of deadlocking). 624 * Otherwise acquire a shared lock. 625 */ 626 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 627 _cache_lock(ncp); 628 return(0); 629 } 630 _cache_lock_shared(ncp); 631 return(0); 632 } 633 634 static __inline 635 int 636 _cache_lockstatus(struct namecache *ncp) 637 { 638 int status; 639 640 status = lockstatus(&ncp->nc_lock, curthread); 641 if (status == 0 || status == LK_EXCLOTHER) 642 status = -1; 643 return status; 644 } 645 646 /* 647 * cache_hold() and cache_drop() prevent the premature deletion of a 648 * namecache entry but do not prevent operations (such as zapping) on 649 * that namecache entry. 650 * 651 * This routine may only be called from outside this source module if 652 * nc_refs is already deterministically at least 1, such as being 653 * associated with e.g. a process, file descriptor, or some other entity. 654 * 655 * Only the above situations, similar situations within this module where 656 * the ref count is deterministically at least 1, or when the ncp is found 657 * via the nchpp (hash table) lookup, can bump nc_refs. 
658 * 659 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs. It 660 * can still be removed from the nc_list, however, as long as the caller 661 * can acquire its lock (in the wrong order). 662 * 663 * This is a rare case where callers are allowed to hold a spinlock, 664 * so we can't ourselves. 665 */ 666 static __inline 667 struct namecache * 668 _cache_hold(struct namecache *ncp) 669 { 670 KKASSERT(ncp->nc_refs > 0); 671 atomic_add_int(&ncp->nc_refs, 1); 672 673 return(ncp); 674 } 675 676 /* 677 * Drop a cache entry. 678 * 679 * The 1->0 transition is special and requires the caller to destroy the 680 * entry. It means that the ncp is no longer on a nchpp list (since that 681 * would mean there was stilla ref). The ncp could still be on a nc_list 682 * but will not have any child of its own, again because nc_refs is now 0 683 * and children would have a ref to their parent. 684 * 685 * Once the 1->0 transition is made, nc_refs cannot be incremented again. 686 */ 687 static __inline 688 void 689 _cache_drop(struct namecache *ncp) 690 { 691 if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) { 692 /* 693 * Executed unlocked (no need to lock on last drop) 694 */ 695 _cache_setunresolved(ncp); 696 697 /* 698 * Scrap it. 699 */ 700 ncp->nc_refs = -1; /* safety */ 701 if (ncp->nc_name) 702 kfree(ncp->nc_name, M_VFSCACHE); 703 kfree(ncp, M_VFSCACHE); 704 } 705 } 706 707 /* 708 * Link a new namecache entry to its parent and to the hash table. Be 709 * careful to avoid races if vhold() blocks in the future. 710 * 711 * Both ncp and par must be referenced and locked. The reference is 712 * transfered to the nchpp (and, most notably, NOT to the parent list). 713 * 714 * NOTE: The hash table spinlock is held across this call, we can't do 715 * anything fancy. 716 */ 717 static void 718 _cache_link_parent(struct namecache *ncp, struct namecache *par, 719 struct nchash_head *nchpp) 720 { 721 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 722 723 KKASSERT(ncp->nc_parent == NULL); 724 ncp->nc_parent = par; 725 ncp->nc_head = nchpp; 726 727 /* 728 * Set inheritance flags. Note that the parent flags may be 729 * stale due to getattr potentially not having been run yet 730 * (it gets run during nlookup()'s). 731 */ 732 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE); 733 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) 734 ncp->nc_flag |= NCF_SF_PNOCACHE; 735 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) 736 ncp->nc_flag |= NCF_UF_PCACHE; 737 738 /* 739 * Add to hash table and parent, adjust accounting 740 */ 741 TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash); 742 atomic_add_long(&pn->vfscache_count, 1); 743 if (TAILQ_EMPTY(&ncp->nc_list)) 744 atomic_add_long(&pn->vfscache_leafs, 1); 745 746 if (TAILQ_EMPTY(&par->nc_list)) { 747 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 748 atomic_add_long(&pn->vfscache_leafs, -1); 749 /* 750 * Any vp associated with an ncp which has children must 751 * be held to prevent it from being recycled. 752 */ 753 if (par->nc_vp) 754 vhold(par->nc_vp); 755 } else { 756 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 757 } 758 _cache_hold(par); /* add nc_parent ref */ 759 } 760 761 /* 762 * Remove the parent and hash associations from a namecache structure. 763 * Drop the ref-count on the parent. The caller receives the ref 764 * from the ncp's nchpp linkage that was removed and may forward that 765 * ref to a new linkage. 766 767 * The caller usually holds an additional ref * on the ncp so the unlink 768 * cannot be the final drop. 
XXX should not be necessary now since the 769 * caller receives the ref from the nchpp linkage, assuming the ncp 770 * was linked in the first place. 771 * 772 * ncp must be locked, which means that there won't be any nc_parent 773 * removal races. This routine will acquire a temporary lock on 774 * the parent as well as the appropriate hash chain. 775 */ 776 static void 777 _cache_unlink_parent(struct namecache *ncp) 778 { 779 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 780 struct namecache *par; 781 struct vnode *dropvp; 782 struct nchash_head *nchpp; 783 784 if ((par = ncp->nc_parent) != NULL) { 785 cpu_ccfence(); 786 KKASSERT(ncp->nc_parent == par); 787 788 /* don't add a ref, we drop the nchpp ref later */ 789 _cache_lock(par); 790 nchpp = ncp->nc_head; 791 spin_lock(&nchpp->spin); 792 793 /* 794 * Remove from hash table and parent, adjust accounting 795 */ 796 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 797 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 798 atomic_add_long(&pn->vfscache_count, -1); 799 if (TAILQ_EMPTY(&ncp->nc_list)) 800 atomic_add_long(&pn->vfscache_leafs, -1); 801 802 dropvp = NULL; 803 if (TAILQ_EMPTY(&par->nc_list)) { 804 atomic_add_long(&pn->vfscache_leafs, 1); 805 if (par->nc_vp) 806 dropvp = par->nc_vp; 807 } 808 ncp->nc_parent = NULL; 809 ncp->nc_head = NULL; 810 spin_unlock(&nchpp->spin); 811 _cache_unlock(par); 812 _cache_drop(par); /* drop nc_parent ref */ 813 814 /* 815 * We can only safely vdrop with no spinlocks held. 816 */ 817 if (dropvp) 818 vdrop(dropvp); 819 } 820 } 821 822 /* 823 * Allocate a new namecache structure. Most of the code does not require 824 * zero-termination of the string but it makes vop_compat_ncreate() easier. 825 * 826 * The returned ncp will be locked and referenced. The ref is generally meant 827 * to be transfered to the nchpp linkage. 828 */ 829 static struct namecache * 830 cache_alloc(int nlen) 831 { 832 struct namecache *ncp; 833 834 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 835 if (nlen) 836 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK); 837 ncp->nc_nlen = nlen; 838 ncp->nc_flag = NCF_UNRESOLVED; 839 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 840 ncp->nc_refs = 1; 841 TAILQ_INIT(&ncp->nc_list); 842 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 843 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 844 845 return(ncp); 846 } 847 848 /* 849 * Can only be called for the case where the ncp has never been 850 * associated with anything (so no spinlocks are needed). 851 */ 852 static void 853 _cache_free(struct namecache *ncp) 854 { 855 KKASSERT(ncp->nc_refs == 1); 856 if (ncp->nc_name) 857 kfree(ncp->nc_name, M_VFSCACHE); 858 kfree(ncp, M_VFSCACHE); 859 } 860 861 /* 862 * [re]initialize a nchandle. 863 */ 864 void 865 cache_zero(struct nchandle *nch) 866 { 867 nch->ncp = NULL; 868 nch->mount = NULL; 869 } 870 871 /* 872 * Ref and deref a nchandle structure (ncp + mp) 873 * 874 * The caller must specify a stable ncp pointer, typically meaning the 875 * ncp is already referenced but this can also occur indirectly through 876 * e.g. holding a lock on a direct child. 877 * 878 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 879 * use read spinlocks here. 880 */ 881 struct nchandle * 882 cache_hold(struct nchandle *nch) 883 { 884 _cache_hold(nch->ncp); 885 _cache_mntref(nch->mount); 886 return(nch); 887 } 888 889 /* 890 * Create a copy of a namecache handle for an already-referenced 891 * entry. 
892 */ 893 void 894 cache_copy(struct nchandle *nch, struct nchandle *target) 895 { 896 struct namecache *ncp; 897 struct mount *mp; 898 struct mntcache_elm *elm; 899 struct namecache *ncpr; 900 int i; 901 902 ncp = nch->ncp; 903 mp = nch->mount; 904 target->ncp = ncp; 905 target->mount = mp; 906 907 elm = _cache_mntcache_hash(ncp); 908 for (i = 0; i < MNTCACHE_SET; ++i) { 909 if (elm->ncp == ncp) { 910 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 911 if (ncpr == ncp) { 912 _cache_mntref(mp); 913 return; 914 } 915 if (ncpr) 916 _cache_drop(ncpr); 917 } 918 ++elm; 919 } 920 if (ncp) 921 _cache_hold(ncp); 922 _cache_mntref(mp); 923 } 924 925 /* 926 * Drop the nchandle, but try to cache the ref to avoid global atomic 927 * ops. This is typically done on the system root and jail root nchandles. 928 */ 929 void 930 cache_drop_and_cache(struct nchandle *nch, int elmno) 931 { 932 struct mntcache_elm *elm; 933 struct mntcache_elm *best; 934 struct namecache *ncpr; 935 int delta1; 936 int delta2; 937 int i; 938 939 if (elmno > 4) { 940 if (nch->ncp) { 941 _cache_drop(nch->ncp); 942 nch->ncp = NULL; 943 } 944 if (nch->mount) { 945 _cache_mntrel(nch->mount); 946 nch->mount = NULL; 947 } 948 return; 949 } 950 951 elm = _cache_mntcache_hash(nch->ncp); 952 best = elm; 953 for (i = 0; i < MNTCACHE_SET; ++i) { 954 if (elm->ncp == NULL) { 955 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 956 _cache_mntrel(nch->mount); 957 elm->ticks = ticks; 958 nch->mount = NULL; 959 nch->ncp = NULL; 960 if (ncpr) 961 _cache_drop(ncpr); 962 return; 963 } 964 delta1 = ticks - best->ticks; 965 delta2 = ticks - elm->ticks; 966 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 967 best = elm; 968 ++elm; 969 } 970 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 971 _cache_mntrel(nch->mount); 972 best->ticks = ticks; 973 nch->mount = NULL; 974 nch->ncp = NULL; 975 if (ncpr) 976 _cache_drop(ncpr); 977 } 978 979 void 980 cache_changemount(struct nchandle *nch, struct mount *mp) 981 { 982 _cache_mntref(mp); 983 _cache_mntrel(nch->mount); 984 nch->mount = mp; 985 } 986 987 void 988 cache_drop(struct nchandle *nch) 989 { 990 _cache_mntrel(nch->mount); 991 _cache_drop(nch->ncp); 992 nch->ncp = NULL; 993 nch->mount = NULL; 994 } 995 996 int 997 cache_lockstatus(struct nchandle *nch) 998 { 999 return(_cache_lockstatus(nch->ncp)); 1000 } 1001 1002 void 1003 cache_lock(struct nchandle *nch) 1004 { 1005 _cache_lock(nch->ncp); 1006 } 1007 1008 void 1009 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1010 { 1011 struct namecache *ncp = nch->ncp; 1012 1013 if (ncp_shared_lock_disable || excl || 1014 (ncp->nc_flag & NCF_UNRESOLVED)) { 1015 _cache_lock(ncp); 1016 } else { 1017 _cache_lock_shared(ncp); 1018 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1019 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1020 _cache_unlock(ncp); 1021 _cache_lock(ncp); 1022 } 1023 } else { 1024 _cache_unlock(ncp); 1025 _cache_lock(ncp); 1026 } 1027 } 1028 } 1029 1030 /* 1031 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller 1032 * is responsible for checking both for validity on return as they 1033 * may have become invalid. 1034 * 1035 * We have to deal with potential deadlocks here, just ping pong 1036 * the lock until we get it (we will always block somewhere when 1037 * looping so this is not cpu-intensive). 
1038 * 1039 * which = 0 nch1 not locked, nch2 is locked 1040 * which = 1 nch1 is locked, nch2 is not locked 1041 */ 1042 void 1043 cache_relock(struct nchandle *nch1, struct ucred *cred1, 1044 struct nchandle *nch2, struct ucred *cred2) 1045 { 1046 int which; 1047 1048 which = 0; 1049 1050 for (;;) { 1051 if (which == 0) { 1052 if (cache_lock_nonblock(nch1) == 0) { 1053 cache_resolve(nch1, cred1); 1054 break; 1055 } 1056 cache_unlock(nch2); 1057 cache_lock(nch1); 1058 cache_resolve(nch1, cred1); 1059 which = 1; 1060 } else { 1061 if (cache_lock_nonblock(nch2) == 0) { 1062 cache_resolve(nch2, cred2); 1063 break; 1064 } 1065 cache_unlock(nch1); 1066 cache_lock(nch2); 1067 cache_resolve(nch2, cred2); 1068 which = 0; 1069 } 1070 } 1071 } 1072 1073 int 1074 cache_lock_nonblock(struct nchandle *nch) 1075 { 1076 return(_cache_lock_nonblock(nch->ncp)); 1077 } 1078 1079 void 1080 cache_unlock(struct nchandle *nch) 1081 { 1082 _cache_unlock(nch->ncp); 1083 } 1084 1085 /* 1086 * ref-and-lock, unlock-and-deref functions. 1087 * 1088 * This function is primarily used by nlookup. Even though cache_lock 1089 * holds the vnode, it is possible that the vnode may have already 1090 * initiated a recyclement. 1091 * 1092 * We want cache_get() to return a definitively usable vnode or a 1093 * definitively unresolved ncp. 1094 */ 1095 static 1096 struct namecache * 1097 _cache_get(struct namecache *ncp) 1098 { 1099 _cache_hold(ncp); 1100 _cache_lock(ncp); 1101 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1102 _cache_setunresolved(ncp); 1103 return(ncp); 1104 } 1105 1106 /* 1107 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1108 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1109 * valid. Otherwise an exclusive lock will be acquired instead. 1110 */ 1111 static 1112 struct namecache * 1113 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1114 { 1115 if (ncp_shared_lock_disable || excl || 1116 (ncp->nc_flag & NCF_UNRESOLVED)) { 1117 return(_cache_get(ncp)); 1118 } 1119 _cache_hold(ncp); 1120 _cache_lock_shared(ncp); 1121 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1122 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1123 _cache_unlock(ncp); 1124 ncp = _cache_get(ncp); 1125 _cache_drop(ncp); 1126 } 1127 } else { 1128 _cache_unlock(ncp); 1129 ncp = _cache_get(ncp); 1130 _cache_drop(ncp); 1131 } 1132 return(ncp); 1133 } 1134 1135 /* 1136 * NOTE: The same nchandle can be passed for both arguments. 1137 */ 1138 void 1139 cache_get(struct nchandle *nch, struct nchandle *target) 1140 { 1141 KKASSERT(nch->ncp->nc_refs > 0); 1142 target->mount = nch->mount; 1143 target->ncp = _cache_get(nch->ncp); 1144 _cache_mntref(target->mount); 1145 } 1146 1147 void 1148 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1149 { 1150 KKASSERT(nch->ncp->nc_refs > 0); 1151 target->mount = nch->mount; 1152 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1153 _cache_mntref(target->mount); 1154 } 1155 1156 /* 1157 * Release a held and locked ncp 1158 */ 1159 static __inline 1160 void 1161 _cache_put(struct namecache *ncp) 1162 { 1163 _cache_unlock(ncp); 1164 _cache_drop(ncp); 1165 } 1166 1167 void 1168 cache_put(struct nchandle *nch) 1169 { 1170 _cache_mntrel(nch->mount); 1171 _cache_put(nch->ncp); 1172 nch->ncp = NULL; 1173 nch->mount = NULL; 1174 } 1175 1176 /* 1177 * Resolve an unresolved ncp by associating a vnode with it. If the 1178 * vnode is NULL, a negative cache entry is created. 
1179 * 1180 * The ncp should be locked on entry and will remain locked on return. 1181 */ 1182 static 1183 void 1184 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1185 { 1186 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1187 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1188 ncp->nc_vp == NULL); 1189 1190 if (vp) { 1191 /* 1192 * Any vp associated with an ncp which has children must 1193 * be held. Any vp associated with a locked ncp must be held. 1194 */ 1195 if (!TAILQ_EMPTY(&ncp->nc_list)) 1196 vhold(vp); 1197 spin_lock(&vp->v_spin); 1198 ncp->nc_vp = vp; 1199 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1200 ++vp->v_namecache_count; 1201 _cache_hold(ncp); /* v_namecache assoc */ 1202 spin_unlock(&vp->v_spin); 1203 vhold(vp); /* nc_vp */ 1204 1205 /* 1206 * Set auxiliary flags 1207 */ 1208 switch(vp->v_type) { 1209 case VDIR: 1210 ncp->nc_flag |= NCF_ISDIR; 1211 break; 1212 case VLNK: 1213 ncp->nc_flag |= NCF_ISSYMLINK; 1214 /* XXX cache the contents of the symlink */ 1215 break; 1216 default: 1217 break; 1218 } 1219 1220 ncp->nc_error = 0; 1221 1222 /* 1223 * XXX: this is a hack to work-around the lack of a real pfs vfs 1224 * implementation 1225 */ 1226 if (mp) { 1227 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1228 vp->v_pfsmp = mp; 1229 } 1230 } else { 1231 /* 1232 * When creating a negative cache hit we set the 1233 * namecache_gen. A later resolve will clean out the 1234 * negative cache hit if the mount point's namecache_gen 1235 * has changed. Used by devfs, could also be used by 1236 * other remote FSs. 1237 */ 1238 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1239 1240 ncp->nc_vp = NULL; 1241 ncp->nc_negcpu = mycpu->gd_cpuid; 1242 spin_lock(&pn->neg_spin); 1243 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1244 _cache_hold(ncp); /* neg_list assoc */ 1245 ++pn->neg_count; 1246 spin_unlock(&pn->neg_spin); 1247 atomic_add_long(&pn->vfscache_negs, 1); 1248 1249 ncp->nc_error = ENOENT; 1250 if (mp) 1251 VFS_NCPGEN_SET(mp, ncp); 1252 } 1253 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1254 } 1255 1256 void 1257 cache_setvp(struct nchandle *nch, struct vnode *vp) 1258 { 1259 _cache_setvp(nch->mount, nch->ncp, vp); 1260 } 1261 1262 /* 1263 * Used for NFS 1264 */ 1265 void 1266 cache_settimeout(struct nchandle *nch, int nticks) 1267 { 1268 struct namecache *ncp = nch->ncp; 1269 1270 if ((ncp->nc_timeout = ticks + nticks) == 0) 1271 ncp->nc_timeout = 1; 1272 } 1273 1274 /* 1275 * Disassociate the vnode or negative-cache association and mark a 1276 * namecache entry as unresolved again. Note that the ncp is still 1277 * left in the hash table and still linked to its parent. 1278 * 1279 * The ncp should be locked and refd on entry and will remain locked and refd 1280 * on return. 1281 * 1282 * This routine is normally never called on a directory containing children. 1283 * However, NFS often does just that in its rename() code as a cop-out to 1284 * avoid complex namespace operations. This disconnects a directory vnode 1285 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1286 * sync. 
1287 * 1288 */ 1289 static 1290 void 1291 _cache_setunresolved(struct namecache *ncp) 1292 { 1293 struct vnode *vp; 1294 1295 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1296 ncp->nc_flag |= NCF_UNRESOLVED; 1297 ncp->nc_timeout = 0; 1298 ncp->nc_error = ENOTCONN; 1299 if ((vp = ncp->nc_vp) != NULL) { 1300 spin_lock(&vp->v_spin); 1301 ncp->nc_vp = NULL; 1302 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1303 --vp->v_namecache_count; 1304 spin_unlock(&vp->v_spin); 1305 1306 /* 1307 * Any vp associated with an ncp with children is 1308 * held by that ncp. Any vp associated with ncp 1309 * is held by that ncp. These conditions must be 1310 * undone when the vp is cleared out from the ncp. 1311 */ 1312 if (!TAILQ_EMPTY(&ncp->nc_list)) 1313 vdrop(vp); 1314 vdrop(vp); 1315 } else { 1316 struct pcpu_ncache *pn; 1317 1318 pn = &pcpu_ncache[ncp->nc_negcpu]; 1319 1320 atomic_add_long(&pn->vfscache_negs, -1); 1321 spin_lock(&pn->neg_spin); 1322 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1323 --pn->neg_count; 1324 spin_unlock(&pn->neg_spin); 1325 } 1326 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1327 _cache_drop(ncp); /* from v_namecache or neg_list */ 1328 } 1329 } 1330 1331 /* 1332 * The cache_nresolve() code calls this function to automatically 1333 * set a resolved cache element to unresolved if it has timed out 1334 * or if it is a negative cache hit and the mount point namecache_gen 1335 * has changed. 1336 */ 1337 static __inline int 1338 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1339 { 1340 /* 1341 * Try to zap entries that have timed out. We have 1342 * to be careful here because locked leafs may depend 1343 * on the vnode remaining intact in a parent, so only 1344 * do this under very specific conditions. 1345 */ 1346 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1347 TAILQ_EMPTY(&ncp->nc_list)) { 1348 return 1; 1349 } 1350 1351 /* 1352 * If a resolved negative cache hit is invalid due to 1353 * the mount's namecache generation being bumped, zap it. 1354 */ 1355 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1356 return 1; 1357 } 1358 1359 /* 1360 * Otherwise we are good 1361 */ 1362 return 0; 1363 } 1364 1365 static __inline void 1366 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1367 { 1368 /* 1369 * Already in an unresolved state, nothing to do. 1370 */ 1371 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1372 if (_cache_auto_unresolve_test(mp, ncp)) 1373 _cache_setunresolved(ncp); 1374 } 1375 } 1376 1377 void 1378 cache_setunresolved(struct nchandle *nch) 1379 { 1380 _cache_setunresolved(nch->ncp); 1381 } 1382 1383 /* 1384 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1385 * looking for matches. This flag tells the lookup code when it must 1386 * check for a mount linkage and also prevents the directories in question 1387 * from being deleted or renamed. 1388 */ 1389 static 1390 int 1391 cache_clrmountpt_callback(struct mount *mp, void *data) 1392 { 1393 struct nchandle *nch = data; 1394 1395 if (mp->mnt_ncmounton.ncp == nch->ncp) 1396 return(1); 1397 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1398 return(1); 1399 return(0); 1400 } 1401 1402 /* 1403 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1404 * with a mount point. 
1405 */ 1406 void 1407 cache_clrmountpt(struct nchandle *nch) 1408 { 1409 int count; 1410 1411 count = mountlist_scan(cache_clrmountpt_callback, nch, 1412 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | 1413 MNTSCAN_NOUNLOCK); 1414 if (count == 0) 1415 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1416 } 1417 1418 /* 1419 * Invalidate portions of the namecache topology given a starting entry. 1420 * The passed ncp is set to an unresolved state and: 1421 * 1422 * The passed ncp must be referenced and locked. The routine may unlock 1423 * and relock ncp several times, and will recheck the children and loop 1424 * to catch races. When done the passed ncp will be returned with the 1425 * reference and lock intact. 1426 * 1427 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1428 * that the physical underlying nodes have been 1429 * destroyed... as in deleted. For example, when 1430 * a directory is removed. This will cause record 1431 * lookups on the name to no longer be able to find 1432 * the record and tells the resolver to return failure 1433 * rather then trying to resolve through the parent. 1434 * 1435 * The topology itself, including ncp->nc_name, 1436 * remains intact. 1437 * 1438 * This only applies to the passed ncp, if CINV_CHILDREN 1439 * is specified the children are not flagged. 1440 * 1441 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1442 * state as well. 1443 * 1444 * Note that this will also have the side effect of 1445 * cleaning out any unreferenced nodes in the topology 1446 * from the leaves up as the recursion backs out. 1447 * 1448 * Note that the topology for any referenced nodes remains intact, but 1449 * the nodes will be marked as having been destroyed and will be set 1450 * to an unresolved state. 1451 * 1452 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1453 * the namecache entry may not actually be invalidated on return if it was 1454 * revalidated while recursing down into its children. This code guarentees 1455 * that the node(s) will go through an invalidation cycle, but does not 1456 * guarentee that they will remain in an invalidated state. 1457 * 1458 * Returns non-zero if a revalidation was detected during the invalidation 1459 * recursion, zero otherwise. Note that since only the original ncp is 1460 * locked the revalidation ultimately can only indicate that the original ncp 1461 * *MIGHT* no have been reresolved. 1462 * 1463 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we 1464 * have to avoid blowing out the kernel stack. We do this by saving the 1465 * deep namecache node and aborting the recursion, then re-recursing at that 1466 * node using a depth-first algorithm in order to allow multiple deep 1467 * recursions to chain through each other, then we restart the invalidation 1468 * from scratch. 
1469 */ 1470 1471 struct cinvtrack { 1472 struct namecache *resume_ncp; 1473 int depth; 1474 }; 1475 1476 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1477 1478 static 1479 int 1480 _cache_inval(struct namecache *ncp, int flags) 1481 { 1482 struct cinvtrack track; 1483 struct namecache *ncp2; 1484 int r; 1485 1486 track.depth = 0; 1487 track.resume_ncp = NULL; 1488 1489 for (;;) { 1490 r = _cache_inval_internal(ncp, flags, &track); 1491 if (track.resume_ncp == NULL) 1492 break; 1493 _cache_unlock(ncp); 1494 while ((ncp2 = track.resume_ncp) != NULL) { 1495 track.resume_ncp = NULL; 1496 _cache_lock(ncp2); 1497 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1498 &track); 1499 /*_cache_put(ncp2);*/ 1500 cache_zap(ncp2); 1501 } 1502 _cache_lock(ncp); 1503 } 1504 return(r); 1505 } 1506 1507 int 1508 cache_inval(struct nchandle *nch, int flags) 1509 { 1510 return(_cache_inval(nch->ncp, flags)); 1511 } 1512 1513 /* 1514 * Helper for _cache_inval(). The passed ncp is refd and locked and 1515 * remains that way on return, but may be unlocked/relocked multiple 1516 * times by the routine. 1517 */ 1518 static int 1519 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1520 { 1521 struct namecache *nextkid; 1522 int rcnt = 0; 1523 1524 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1525 1526 _cache_setunresolved(ncp); 1527 if (flags & CINV_DESTROY) { 1528 ncp->nc_flag |= NCF_DESTROYED; 1529 ++ncp->nc_generation; 1530 } 1531 1532 while ((flags & CINV_CHILDREN) && 1533 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1534 ) { 1535 struct namecache *kid; 1536 int restart; 1537 1538 restart = 0; 1539 _cache_hold(nextkid); 1540 if (++track->depth > MAX_RECURSION_DEPTH) { 1541 track->resume_ncp = ncp; 1542 _cache_hold(ncp); 1543 ++rcnt; 1544 } 1545 while ((kid = nextkid) != NULL) { 1546 /* 1547 * Parent (ncp) must be locked for the iteration. 1548 */ 1549 nextkid = NULL; 1550 if (kid->nc_parent != ncp) { 1551 _cache_drop(kid); 1552 kprintf("cache_inval_internal restartA %s\n", 1553 ncp->nc_name); 1554 restart = 1; 1555 break; 1556 } 1557 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1558 _cache_hold(nextkid); 1559 1560 /* 1561 * Parent unlocked for this section to avoid 1562 * deadlocks. Then lock the kid and check for 1563 * races. 1564 */ 1565 _cache_unlock(ncp); 1566 if (track->resume_ncp) { 1567 _cache_drop(kid); 1568 _cache_lock(ncp); 1569 break; 1570 } 1571 _cache_lock(kid); 1572 if (kid->nc_parent != ncp) { 1573 kprintf("cache_inval_internal " 1574 "restartB %s\n", 1575 ncp->nc_name); 1576 restart = 1; 1577 _cache_unlock(kid); 1578 _cache_drop(kid); 1579 _cache_lock(ncp); 1580 break; 1581 } 1582 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1583 TAILQ_FIRST(&kid->nc_list) 1584 ) { 1585 1586 rcnt += _cache_inval_internal(kid, 1587 flags & ~CINV_DESTROY, track); 1588 /*_cache_unlock(kid);*/ 1589 /*_cache_drop(kid);*/ 1590 cache_zap(kid); 1591 } else { 1592 cache_zap(kid); 1593 } 1594 1595 /* 1596 * Relock parent to continue scan 1597 */ 1598 _cache_lock(ncp); 1599 } 1600 if (nextkid) 1601 _cache_drop(nextkid); 1602 --track->depth; 1603 if (restart == 0) 1604 break; 1605 } 1606 1607 /* 1608 * Someone could have gotten in there while ncp was unlocked, 1609 * retry if so. 1610 */ 1611 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1612 ++rcnt; 1613 return (rcnt); 1614 } 1615 1616 /* 1617 * Invalidate a vnode's namecache associations. 
To avoid races against 1618 * the resolver we do not invalidate a node which we previously invalidated 1619 * but which was then re-resolved while we were in the invalidation loop. 1620 * 1621 * Returns non-zero if any namecache entries remain after the invalidation 1622 * loop completed. 1623 * 1624 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1625 * be ripped out of the topology while held, the vnode's v_namecache 1626 * list has no such restriction. NCP's can be ripped out of the list 1627 * at virtually any time if not locked, even if held. 1628 * 1629 * In addition, the v_namecache list itself must be locked via 1630 * the vnode's spinlock. 1631 */ 1632 int 1633 cache_inval_vp(struct vnode *vp, int flags) 1634 { 1635 struct namecache *ncp; 1636 struct namecache *next; 1637 1638 restart: 1639 spin_lock(&vp->v_spin); 1640 ncp = TAILQ_FIRST(&vp->v_namecache); 1641 if (ncp) 1642 _cache_hold(ncp); 1643 while (ncp) { 1644 /* loop entered with ncp held and vp spin-locked */ 1645 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1646 _cache_hold(next); 1647 spin_unlock(&vp->v_spin); 1648 _cache_lock(ncp); 1649 if (ncp->nc_vp != vp) { 1650 kprintf("Warning: cache_inval_vp: race-A detected on " 1651 "%s\n", ncp->nc_name); 1652 _cache_put(ncp); 1653 if (next) 1654 _cache_drop(next); 1655 goto restart; 1656 } 1657 _cache_inval(ncp, flags); 1658 _cache_put(ncp); /* also releases reference */ 1659 ncp = next; 1660 spin_lock(&vp->v_spin); 1661 if (ncp && ncp->nc_vp != vp) { 1662 spin_unlock(&vp->v_spin); 1663 kprintf("Warning: cache_inval_vp: race-B detected on " 1664 "%s\n", ncp->nc_name); 1665 _cache_drop(ncp); 1666 goto restart; 1667 } 1668 } 1669 spin_unlock(&vp->v_spin); 1670 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1671 } 1672 1673 /* 1674 * This routine is used instead of the normal cache_inval_vp() when we 1675 * are trying to recycle otherwise good vnodes. 1676 * 1677 * Return 0 on success, non-zero if not all namecache records could be 1678 * disassociated from the vnode (for various reasons). 1679 */ 1680 int 1681 cache_inval_vp_nonblock(struct vnode *vp) 1682 { 1683 struct namecache *ncp; 1684 struct namecache *next; 1685 1686 spin_lock(&vp->v_spin); 1687 ncp = TAILQ_FIRST(&vp->v_namecache); 1688 if (ncp) 1689 _cache_hold(ncp); 1690 while (ncp) { 1691 /* loop entered with ncp held */ 1692 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1693 _cache_hold(next); 1694 spin_unlock(&vp->v_spin); 1695 if (_cache_lock_nonblock(ncp)) { 1696 _cache_drop(ncp); 1697 if (next) 1698 _cache_drop(next); 1699 goto done; 1700 } 1701 if (ncp->nc_vp != vp) { 1702 kprintf("Warning: cache_inval_vp: race-A detected on " 1703 "%s\n", ncp->nc_name); 1704 _cache_put(ncp); 1705 if (next) 1706 _cache_drop(next); 1707 goto done; 1708 } 1709 _cache_inval(ncp, 0); 1710 _cache_put(ncp); /* also releases reference */ 1711 ncp = next; 1712 spin_lock(&vp->v_spin); 1713 if (ncp && ncp->nc_vp != vp) { 1714 spin_unlock(&vp->v_spin); 1715 kprintf("Warning: cache_inval_vp: race-B detected on " 1716 "%s\n", ncp->nc_name); 1717 _cache_drop(ncp); 1718 goto done; 1719 } 1720 } 1721 spin_unlock(&vp->v_spin); 1722 done: 1723 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1724 } 1725 1726 /* 1727 * Clears the universal directory search 'ok' flag. This flag allows 1728 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1729 * so clearing it simply forces revalidation. 
1730 */ 1731 void 1732 cache_inval_wxok(struct vnode *vp) 1733 { 1734 struct namecache *ncp; 1735 1736 spin_lock(&vp->v_spin); 1737 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1738 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 1739 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 1740 } 1741 spin_unlock(&vp->v_spin); 1742 } 1743 1744 /* 1745 * The source ncp has been renamed to the target ncp. Both fncp and tncp 1746 * must be locked. The target ncp is destroyed (as a normal rename-over 1747 * would destroy the target file or directory). 1748 * 1749 * Because there may be references to the source ncp we cannot copy its 1750 * contents to the target. Instead the source ncp is relinked as the target 1751 * and the target ncp is removed from the namecache topology. 1752 */ 1753 void 1754 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1755 { 1756 struct namecache *fncp = fnch->ncp; 1757 struct namecache *tncp = tnch->ncp; 1758 struct namecache *tncp_par; 1759 struct nchash_head *nchpp; 1760 u_int32_t hash; 1761 char *oname; 1762 char *nname; 1763 1764 ++fncp->nc_generation; 1765 ++tncp->nc_generation; 1766 if (tncp->nc_nlen) { 1767 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK); 1768 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1769 nname[tncp->nc_nlen] = 0; 1770 } else { 1771 nname = NULL; 1772 } 1773 1774 /* 1775 * Rename fncp (unlink) 1776 */ 1777 _cache_unlink_parent(fncp); 1778 oname = fncp->nc_name; 1779 fncp->nc_name = nname; 1780 fncp->nc_nlen = tncp->nc_nlen; 1781 if (oname) 1782 kfree(oname, M_VFSCACHE); 1783 1784 tncp_par = tncp->nc_parent; 1785 _cache_hold(tncp_par); 1786 _cache_lock(tncp_par); 1787 1788 /* 1789 * Rename fncp (relink) 1790 */ 1791 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1792 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 1793 nchpp = NCHHASH(hash); 1794 1795 spin_lock(&nchpp->spin); 1796 _cache_link_parent(fncp, tncp_par, nchpp); 1797 spin_unlock(&nchpp->spin); 1798 1799 _cache_put(tncp_par); 1800 1801 /* 1802 * Get rid of the overwritten tncp (unlink) 1803 */ 1804 _cache_unlink(tncp); 1805 } 1806 1807 /* 1808 * Perform actions consistent with unlinking a file. The passed-in ncp 1809 * must be locked. 1810 * 1811 * The ncp is marked DESTROYED so it no longer shows up in searches, 1812 * and will be physically deleted when the vnode goes away. 1813 * 1814 * If the related vnode has no refs then we cycle it through vget()/vput() 1815 * to (possibly if we don't have a ref race) trigger a deactivation, 1816 * allowing the VFS to trivially detect and recycle the deleted vnode 1817 * via VOP_INACTIVE(). 1818 * 1819 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 1820 * target ncp. 1821 */ 1822 void 1823 cache_unlink(struct nchandle *nch) 1824 { 1825 _cache_unlink(nch->ncp); 1826 } 1827 1828 static void 1829 _cache_unlink(struct namecache *ncp) 1830 { 1831 struct vnode *vp; 1832 1833 /* 1834 * Causes lookups to fail and allows another ncp with the same 1835 * name to be created under ncp->nc_parent. 1836 */ 1837 ncp->nc_flag |= NCF_DESTROYED; 1838 ++ncp->nc_generation; 1839 1840 /* 1841 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 1842 * force action on the 1->0 transition. 
1843 */ 1844 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1845 (vp = ncp->nc_vp) != NULL) { 1846 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 1847 if (VREFCNT(vp) <= 0) { 1848 if (vget(vp, LK_SHARED) == 0) 1849 vput(vp); 1850 } 1851 } 1852 } 1853 1854 /* 1855 * Return non-zero if the nch might be associated with an open and/or mmap()'d 1856 * file. The easy solution is to just return non-zero if the vnode has refs. 1857 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 1858 * force the reclaim). 1859 */ 1860 int 1861 cache_isopen(struct nchandle *nch) 1862 { 1863 struct vnode *vp; 1864 struct namecache *ncp = nch->ncp; 1865 1866 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1867 (vp = ncp->nc_vp) != NULL && 1868 VREFCNT(vp)) { 1869 return 1; 1870 } 1871 return 0; 1872 } 1873 1874 1875 /* 1876 * vget the vnode associated with the namecache entry. Resolve the namecache 1877 * entry if necessary. The passed ncp must be referenced and locked. If 1878 * the ncp is resolved it might be locked shared. 1879 * 1880 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 1881 * (depending on the passed lk_type) will be returned in *vpp with an error 1882 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 1883 * most typical error is ENOENT, meaning that the ncp represents a negative 1884 * cache hit and there is no vnode to retrieve, but other errors can occur 1885 * too. 1886 * 1887 * The vget() can race a reclaim. If this occurs we re-resolve the 1888 * namecache entry. 1889 * 1890 * There are numerous places in the kernel where vget() is called on a 1891 * vnode while one or more of its namecache entries is locked. Releasing 1892 * a vnode never deadlocks against locked namecache entries (the vnode 1893 * will not get recycled while referenced ncp's exist). This means we 1894 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 1895 * lock when acquiring the vp lock or we might cause a deadlock. 1896 * 1897 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1898 * unresolved. If a reclaim race occurs the passed-in ncp will be 1899 * relocked exclusively before being re-resolved. 1900 */ 1901 int 1902 cache_vget(struct nchandle *nch, struct ucred *cred, 1903 int lk_type, struct vnode **vpp) 1904 { 1905 struct namecache *ncp; 1906 struct vnode *vp; 1907 int error; 1908 1909 ncp = nch->ncp; 1910 again: 1911 vp = NULL; 1912 if (ncp->nc_flag & NCF_UNRESOLVED) 1913 error = cache_resolve(nch, cred); 1914 else 1915 error = 0; 1916 1917 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1918 error = vget(vp, lk_type); 1919 if (error) { 1920 /* 1921 * VRECLAIM race 1922 * 1923 * The ncp may have been locked shared, we must relock 1924 * it exclusively before we can set it to unresolved. 1925 */ 1926 if (error == ENOENT) { 1927 kprintf("Warning: vnode reclaim race detected " 1928 "in cache_vget on %p (%s)\n", 1929 vp, ncp->nc_name); 1930 _cache_unlock(ncp); 1931 _cache_lock(ncp); 1932 _cache_setunresolved(ncp); 1933 goto again; 1934 } 1935 1936 /* 1937 * Not a reclaim race, some other error. 1938 */ 1939 KKASSERT(ncp->nc_vp == vp); 1940 vp = NULL; 1941 } else { 1942 KKASSERT(ncp->nc_vp == vp); 1943 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1944 } 1945 } 1946 if (error == 0 && vp == NULL) 1947 error = ENOENT; 1948 *vpp = vp; 1949 return(error); 1950 } 1951 1952 /* 1953 * Similar to cache_vget() but only acquires a ref on the vnode. 
The vnode
 * is already held by virtue of the ncp being locked, but it might not be
 * referenced and while it is not referenced it can transition into the
 * VRECLAIMED state.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 *
 * NOTE: At the moment we have to issue a vget() on the vnode, even though
 *	 we are going to immediately release the lock, in order to resolve
 *	 potential reclamation races.  Once we have a solid vnode ref that
 *	 was (at some point) interlocked via a vget(), the vnode will not
 *	 be reclaimed.
 *
 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
 */
int
cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;
	int v;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	while (error == 0 && (vp = ncp->nc_vp) != NULL) {
		/*
		 * Try a lockless ref of the vnode.  VRECLAIMED transitions
		 * use the vx_lock state and update-counter mechanism so we
		 * can detect if one is in-progress or occurred.
		 *
		 * If we can successfully ref the vnode and interlock against
		 * the update-counter mechanism, and VRECLAIMED is found to
		 * not be set after that, we should be good.
		 */
		v = spin_access_start_only(&vp->v_spin);
		if (__predict_true(spin_access_check_inprog(v) == 0)) {
			vref_special(vp);
			if (__predict_false(
				    spin_access_end_only(&vp->v_spin, v))) {
				vrele(vp);
				continue;
			}
			if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
				break;
			}
			vrele(vp);
			kprintf("CACHE_VREF: IN-RECLAIM\n");
		}

		/*
		 * Do it the slow way
		 */
		error = vget(vp, LK_SHARED);
		if (error) {
			/*
			 * VRECLAIM race
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vref on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
			 */
			KKASSERT(ncp->nc_vp == vp);
			vp = NULL;
		} else {
			KKASSERT(ncp->nc_vp == vp);
			KKASSERT((vp->v_flag & VRECLAIMED) == 0);
			/* caller does not want a lock */
			vn_unlock(vp);
		}
		break;
	}
	if (error == 0 && vp == NULL)
		error = ENOENT;
	*vpp = vp;

	return(error);
}

/*
 * Return a referenced vnode representing the parent directory of
 * ncp.
 *
 * Because the caller has locked the ncp it should not be possible for
 * the parent ncp to go away.  However, the parent can unresolve its
 * dvp at any time so we must be able to acquire a lock on the parent
 * to safely access nc_vp.
 *
 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
 * so use vhold()/vdrop() while holding the lock to prevent dvp from
 * getting destroyed.
 *
 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
 *	 lock on the ncp in question.
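 *
 * Illustrative caller pattern (a sketch only, mirroring how cache_resolve()
 * below uses this routine; error handling elided):
 *
 *	dvp = cache_dvpref(ncp);
 *	if (dvp) {
 *		...use the refd, unlocked dvp, e.g.
 *		   VOP_NRESOLVE(&nctmp, dvp, cred)...
 *		vrele(dvp);		(drop the ref returned above)
 *	}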
2064 */ 2065 struct vnode * 2066 cache_dvpref(struct namecache *ncp) 2067 { 2068 struct namecache *par; 2069 struct vnode *dvp; 2070 2071 dvp = NULL; 2072 if ((par = ncp->nc_parent) != NULL) { 2073 _cache_hold(par); 2074 _cache_lock(par); 2075 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2076 if ((dvp = par->nc_vp) != NULL) 2077 vhold(dvp); 2078 } 2079 _cache_unlock(par); 2080 if (dvp) { 2081 if (vget(dvp, LK_SHARED) == 0) { 2082 vn_unlock(dvp); 2083 vdrop(dvp); 2084 /* return refd, unlocked dvp */ 2085 } else { 2086 vdrop(dvp); 2087 dvp = NULL; 2088 } 2089 } 2090 _cache_drop(par); 2091 } 2092 return(dvp); 2093 } 2094 2095 /* 2096 * Convert a directory vnode to a namecache record without any other 2097 * knowledge of the topology. This ONLY works with directory vnodes and 2098 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2099 * returned ncp (if not NULL) will be held and unlocked. 2100 * 2101 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2102 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2103 * for dvp. This will fail only if the directory has been deleted out from 2104 * under the caller. 2105 * 2106 * Callers must always check for a NULL return no matter the value of 'makeit'. 2107 * 2108 * To avoid underflowing the kernel stack each recursive call increments 2109 * the makeit variable. 2110 */ 2111 2112 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2113 struct vnode *dvp, char *fakename); 2114 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2115 struct vnode **saved_dvp); 2116 2117 int 2118 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2119 struct nchandle *nch) 2120 { 2121 struct vnode *saved_dvp; 2122 struct vnode *pvp; 2123 char *fakename; 2124 int error; 2125 2126 nch->ncp = NULL; 2127 nch->mount = dvp->v_mount; 2128 saved_dvp = NULL; 2129 fakename = NULL; 2130 2131 /* 2132 * Handle the makeit == 0 degenerate case 2133 */ 2134 if (makeit == 0) { 2135 spin_lock_shared(&dvp->v_spin); 2136 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2137 if (nch->ncp) 2138 cache_hold(nch); 2139 spin_unlock_shared(&dvp->v_spin); 2140 } 2141 2142 /* 2143 * Loop until resolution, inside code will break out on error. 2144 */ 2145 while (makeit) { 2146 /* 2147 * Break out if we successfully acquire a working ncp. 2148 */ 2149 spin_lock_shared(&dvp->v_spin); 2150 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2151 if (nch->ncp) { 2152 cache_hold(nch); 2153 spin_unlock_shared(&dvp->v_spin); 2154 break; 2155 } 2156 spin_unlock_shared(&dvp->v_spin); 2157 2158 /* 2159 * If dvp is the root of its filesystem it should already 2160 * have a namecache pointer associated with it as a side 2161 * effect of the mount, but it may have been disassociated. 2162 */ 2163 if (dvp->v_flag & VROOT) { 2164 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2165 error = cache_resolve_mp(nch->mount); 2166 _cache_put(nch->ncp); 2167 if (ncvp_debug) { 2168 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2169 dvp->v_mount, error); 2170 } 2171 if (error) { 2172 if (ncvp_debug) 2173 kprintf(" failed\n"); 2174 nch->ncp = NULL; 2175 break; 2176 } 2177 if (ncvp_debug) 2178 kprintf(" succeeded\n"); 2179 continue; 2180 } 2181 2182 /* 2183 * If we are recursed too deeply resort to an O(n^2) 2184 * algorithm to resolve the namecache topology. The 2185 * resolved pvp is left referenced in saved_dvp to 2186 * prevent the tree from being destroyed while we loop. 
2187 */ 2188 if (makeit > 20) { 2189 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2190 if (error) { 2191 kprintf("lookupdotdot(longpath) failed %d " 2192 "dvp %p\n", error, dvp); 2193 nch->ncp = NULL; 2194 break; 2195 } 2196 continue; 2197 } 2198 2199 /* 2200 * Get the parent directory and resolve its ncp. 2201 */ 2202 if (fakename) { 2203 kfree(fakename, M_TEMP); 2204 fakename = NULL; 2205 } 2206 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2207 &fakename); 2208 if (error) { 2209 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2210 break; 2211 } 2212 vn_unlock(pvp); 2213 2214 /* 2215 * Reuse makeit as a recursion depth counter. On success 2216 * nch will be fully referenced. 2217 */ 2218 cache_fromdvp(pvp, cred, makeit + 1, nch); 2219 vrele(pvp); 2220 if (nch->ncp == NULL) 2221 break; 2222 2223 /* 2224 * Do an inefficient scan of pvp (embodied by ncp) to look 2225 * for dvp. This will create a namecache record for dvp on 2226 * success. We loop up to recheck on success. 2227 * 2228 * ncp and dvp are both held but not locked. 2229 */ 2230 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2231 if (error) { 2232 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2233 pvp, nch->ncp->nc_name, dvp); 2234 cache_drop(nch); 2235 /* nch was NULLed out, reload mount */ 2236 nch->mount = dvp->v_mount; 2237 break; 2238 } 2239 if (ncvp_debug) { 2240 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2241 pvp, nch->ncp->nc_name); 2242 } 2243 cache_drop(nch); 2244 /* nch was NULLed out, reload mount */ 2245 nch->mount = dvp->v_mount; 2246 } 2247 2248 /* 2249 * If nch->ncp is non-NULL it will have been held already. 2250 */ 2251 if (fakename) 2252 kfree(fakename, M_TEMP); 2253 if (saved_dvp) 2254 vrele(saved_dvp); 2255 if (nch->ncp) 2256 return (0); 2257 return (EINVAL); 2258 } 2259 2260 /* 2261 * Go up the chain of parent directories until we find something 2262 * we can resolve into the namecache. This is very inefficient. 2263 */ 2264 static 2265 int 2266 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2267 struct vnode **saved_dvp) 2268 { 2269 struct nchandle nch; 2270 struct vnode *pvp; 2271 int error; 2272 static time_t last_fromdvp_report; 2273 char *fakename; 2274 2275 /* 2276 * Loop getting the parent directory vnode until we get something we 2277 * can resolve in the namecache. 
 */
	vref(dvp);
	nch.mount = dvp->v_mount;
	nch.ncp = NULL;
	fakename = NULL;

	for (;;) {
		if (fakename) {
			kfree(fakename, M_TEMP);
			fakename = NULL;
		}
		error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
					  &fakename);
		if (error) {
			vrele(dvp);
			break;
		}
		vn_unlock(pvp);
		spin_lock_shared(&pvp->v_spin);
		if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
			_cache_hold(nch.ncp);
			spin_unlock_shared(&pvp->v_spin);
			vrele(pvp);
			break;
		}
		spin_unlock_shared(&pvp->v_spin);
		if (pvp->v_flag & VROOT) {
			nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
			error = cache_resolve_mp(nch.mount);
			_cache_unlock(nch.ncp);
			vrele(pvp);
			if (error) {
				_cache_drop(nch.ncp);
				nch.ncp = NULL;
				vrele(dvp);
			}
			break;
		}
		vrele(dvp);
		dvp = pvp;
	}
	if (error == 0) {
		if (last_fromdvp_report != time_uptime) {
			last_fromdvp_report = time_uptime;
			kprintf("Warning: extremely inefficient path "
				"resolution on %s\n",
				nch.ncp->nc_name);
		}
		error = cache_inefficient_scan(&nch, cred, dvp, fakename);

		/*
		 * Hopefully dvp now has a namecache record associated with
		 * it.  Leave it referenced to prevent the kernel from
		 * recycling the vnode.  Otherwise extremely long directory
		 * paths could result in endless recycling.
		 */
		if (*saved_dvp)
			vrele(*saved_dvp);
		*saved_dvp = dvp;
		_cache_drop(nch.ncp);
	}
	if (fakename)
		kfree(fakename, M_TEMP);
	return (error);
}

/*
 * Do an inefficient scan of the directory represented by ncp looking for
 * the directory vnode dvp.  ncp must be held but not locked on entry and
 * will be held on return.  dvp must be refd but not locked on entry and
 * will remain refd on return.
 *
 * Why do this at all?  Well, due to its stateless nature the NFS server
 * converts file handles directly to vnodes without necessarily going through
 * the namecache ops that would otherwise create the namecache topology
 * leading to the vnode.  We could either (1) Change the namecache algorithms
 * to allow disconnected namecache records that are re-merged
 * opportunistically, or (2) Make the NFS server backtrack and scan to recover
 * a connected namecache topology in order to then be able to issue new API
 * lookups.
 *
 * It turns out that (1) is a huge mess.  It takes a nice clean set of
 * namecache algorithms and introduces a lot of complication in every subsystem
 * that calls into the namecache to deal with the re-merge case, especially
 * since we are using the namecache to placehold negative lookups and the
 * vnode might not be immediately assigned.  (2) is certainly far less
 * efficient than (1), but since we are only talking about directories here
 * (which are likely to remain cached), the case does not actually run all
 * that often and has the supreme advantage of not polluting the namecache
 * algorithms.
 *
 * If a fakename is supplied just construct a namecache entry using the
 * fake name.
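 *
 * In rough outline (a sketch of the code below, not a specification):
 *
 *	1. VOP_GETATTR(dvp) to obtain va_fileid (the inode number to find).
 *	2. cache_vref() the parent directory (ncp) to obtain pvp.
 *	3. VOP_READDIR(pvp) a block at a time, scanning the dirents for an
 *	   entry whose d_ino matches va_fileid (whiteouts are skipped).
 *	4. cache_nlookup() the matching name to instantiate the child ncp,
 *	   then _cache_setvp() it to dvp if it is still unresolved.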
2370 */ 2371 static int 2372 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2373 struct vnode *dvp, char *fakename) 2374 { 2375 struct nlcomponent nlc; 2376 struct nchandle rncp; 2377 struct dirent *den; 2378 struct vnode *pvp; 2379 struct vattr vat; 2380 struct iovec iov; 2381 struct uio uio; 2382 int blksize; 2383 int eofflag; 2384 int bytes; 2385 char *rbuf; 2386 int error; 2387 2388 vat.va_blocksize = 0; 2389 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2390 return (error); 2391 cache_lock(nch); 2392 error = cache_vref(nch, cred, &pvp); 2393 cache_unlock(nch); 2394 if (error) 2395 return (error); 2396 if (ncvp_debug) { 2397 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2398 "vattr fileid = %lld\n", 2399 nch->ncp, nch->ncp->nc_name, 2400 vat.va_blocksize, 2401 (long long)vat.va_fileid); 2402 } 2403 2404 /* 2405 * Use the supplied fakename if not NULL. Fake names are typically 2406 * not in the actual filesystem hierarchy. This is used by HAMMER 2407 * to glue @@timestamp recursions together. 2408 */ 2409 if (fakename) { 2410 nlc.nlc_nameptr = fakename; 2411 nlc.nlc_namelen = strlen(fakename); 2412 rncp = cache_nlookup(nch, &nlc); 2413 goto done; 2414 } 2415 2416 if ((blksize = vat.va_blocksize) == 0) 2417 blksize = DEV_BSIZE; 2418 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2419 rncp.ncp = NULL; 2420 2421 eofflag = 0; 2422 uio.uio_offset = 0; 2423 again: 2424 iov.iov_base = rbuf; 2425 iov.iov_len = blksize; 2426 uio.uio_iov = &iov; 2427 uio.uio_iovcnt = 1; 2428 uio.uio_resid = blksize; 2429 uio.uio_segflg = UIO_SYSSPACE; 2430 uio.uio_rw = UIO_READ; 2431 uio.uio_td = curthread; 2432 2433 if (ncvp_debug >= 2) 2434 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2435 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2436 if (error == 0) { 2437 den = (struct dirent *)rbuf; 2438 bytes = blksize - uio.uio_resid; 2439 2440 while (bytes > 0) { 2441 if (ncvp_debug >= 2) { 2442 kprintf("cache_inefficient_scan: %*.*s\n", 2443 den->d_namlen, den->d_namlen, 2444 den->d_name); 2445 } 2446 if (den->d_type != DT_WHT && 2447 den->d_ino == vat.va_fileid) { 2448 if (ncvp_debug) { 2449 kprintf("cache_inefficient_scan: " 2450 "MATCHED inode %lld path %s/%*.*s\n", 2451 (long long)vat.va_fileid, 2452 nch->ncp->nc_name, 2453 den->d_namlen, den->d_namlen, 2454 den->d_name); 2455 } 2456 nlc.nlc_nameptr = den->d_name; 2457 nlc.nlc_namelen = den->d_namlen; 2458 rncp = cache_nlookup(nch, &nlc); 2459 KKASSERT(rncp.ncp != NULL); 2460 break; 2461 } 2462 bytes -= _DIRENT_DIRSIZ(den); 2463 den = _DIRENT_NEXT(den); 2464 } 2465 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2466 goto again; 2467 } 2468 kfree(rbuf, M_TEMP); 2469 done: 2470 vrele(pvp); 2471 if (rncp.ncp) { 2472 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2473 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2474 if (ncvp_debug >= 2) { 2475 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2476 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2477 } 2478 } else { 2479 if (ncvp_debug >= 2) { 2480 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2481 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2482 rncp.ncp->nc_vp); 2483 } 2484 } 2485 if (rncp.ncp->nc_vp == NULL) 2486 error = rncp.ncp->nc_error; 2487 /* 2488 * Release rncp after a successful nlookup. rncp was fully 2489 * referenced. 
2490 */ 2491 cache_put(&rncp); 2492 } else { 2493 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2494 dvp, nch->ncp->nc_name); 2495 error = ENOENT; 2496 } 2497 return (error); 2498 } 2499 2500 /* 2501 * This function must be called with the ncp held and locked and will unlock 2502 * and drop it during zapping. 2503 * 2504 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2505 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2506 * and removes the related reference. If the ncp can be removed, and the 2507 * parent can be zapped non-blocking, this function loops up. 2508 * 2509 * There will be one ref from the caller (which we now own). The only 2510 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2511 * so possibly 2 refs left. Taking this into account, if there are no 2512 * additional refs and no children, the ncp will be removed from the topology 2513 * and destroyed. 2514 * 2515 * References and/or children may exist if the ncp is in the middle of the 2516 * topology, preventing the ncp from being destroyed. 2517 * 2518 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2519 * 2520 * This function may return a held (but NOT locked) parent node which the 2521 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2522 * due to deep namecache trees. 2523 * 2524 * WARNING! For MPSAFE operation this routine must acquire up to three 2525 * spin locks to be able to safely test nc_refs. Lock order is 2526 * very important. 2527 * 2528 * hash spinlock if on hash list 2529 * parent spinlock if child of parent 2530 * (the ncp is unresolved so there is no vnode association) 2531 */ 2532 static void 2533 cache_zap(struct namecache *ncp) 2534 { 2535 struct namecache *par; 2536 struct vnode *dropvp; 2537 struct nchash_head *nchpp; 2538 int refcmp; 2539 int nonblock = 1; /* XXX cleanup */ 2540 2541 again: 2542 /* 2543 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2544 * This gets rid of any vp->v_namecache list or negative list and 2545 * the related ref. 2546 */ 2547 _cache_setunresolved(ncp); 2548 2549 /* 2550 * Try to scrap the entry and possibly tail-recurse on its parent. 2551 * We only scrap unref'd (other then our ref) unresolved entries, 2552 * we do not scrap 'live' entries. 2553 * 2554 * If nc_parent is non NULL we expect 2 references, else just 1. 2555 * If there are more, someone else also holds the ncp and we cannot 2556 * destroy it. 2557 */ 2558 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2559 KKASSERT(ncp->nc_refs > 0); 2560 2561 /* 2562 * If the ncp is linked to its parent it will also be in the hash 2563 * table. We have to be able to lock the parent and the hash table. 2564 * 2565 * Acquire locks. Note that the parent can't go away while we hold 2566 * a child locked. If nc_parent is present, expect 2 refs instead 2567 * of 1. 
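	 *
	 * If the parent lock cannot be obtained without blocking (the
	 * nonblock case), the zap is simply deferred: the ncp is flagged
	 * NCF_DEFEREDZAP, numdefered is bumped, and _cache_cleandefered()
	 * (driven from cache_hysteresis()) picks the entry up later.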
2568 */ 2569 nchpp = NULL; 2570 if ((par = ncp->nc_parent) != NULL) { 2571 if (nonblock) { 2572 if (_cache_lock_nonblock(par)) { 2573 /* lock failed */ 2574 ncp->nc_flag |= NCF_DEFEREDZAP; 2575 atomic_add_long( 2576 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2577 1); 2578 _cache_unlock(ncp); 2579 _cache_drop(ncp); /* caller's ref */ 2580 return; 2581 } 2582 _cache_hold(par); 2583 } else { 2584 _cache_hold(par); 2585 _cache_lock(par); 2586 } 2587 nchpp = ncp->nc_head; 2588 spin_lock(&nchpp->spin); 2589 } 2590 2591 /* 2592 * With the parent and nchpp locked, and the vnode removed 2593 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2594 * more someone else has a ref and we cannot zap the entry. 2595 * 2596 * one for our hold 2597 * one for our parent link (parent also has one from the linkage) 2598 */ 2599 if (par) 2600 refcmp = 2; 2601 else 2602 refcmp = 1; 2603 2604 /* 2605 * On failure undo the work we've done so far and drop the 2606 * caller's ref and ncp. 2607 */ 2608 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2609 if (par) { 2610 spin_unlock(&nchpp->spin); 2611 _cache_put(par); 2612 } 2613 _cache_unlock(ncp); 2614 _cache_drop(ncp); 2615 return; 2616 } 2617 2618 /* 2619 * We own all the refs and with the spinlocks held no further 2620 * refs can be acquired by others. 2621 * 2622 * Remove us from the hash list and parent list. We have to 2623 * drop a ref on the parent's vp if the parent's list becomes 2624 * empty. 2625 */ 2626 dropvp = NULL; 2627 if (par) { 2628 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2629 2630 KKASSERT(nchpp == ncp->nc_head); 2631 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2632 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2633 atomic_add_long(&pn->vfscache_count, -1); 2634 if (TAILQ_EMPTY(&ncp->nc_list)) 2635 atomic_add_long(&pn->vfscache_leafs, -1); 2636 2637 if (TAILQ_EMPTY(&par->nc_list)) { 2638 atomic_add_long(&pn->vfscache_leafs, 1); 2639 if (par->nc_vp) 2640 dropvp = par->nc_vp; 2641 } 2642 ncp->nc_parent = NULL; 2643 ncp->nc_head = NULL; 2644 spin_unlock(&nchpp->spin); 2645 _cache_drop(par); /* removal of ncp from par->nc_list */ 2646 /*_cache_unlock(par);*/ 2647 } else { 2648 KKASSERT(ncp->nc_head == NULL); 2649 } 2650 2651 /* 2652 * ncp should not have picked up any refs. Physically 2653 * destroy the ncp. 2654 */ 2655 if (ncp->nc_refs != refcmp) { 2656 panic("cache_zap: %p bad refs %d (expected %d)\n", 2657 ncp, ncp->nc_refs, refcmp); 2658 } 2659 /* _cache_unlock(ncp) not required */ 2660 ncp->nc_refs = -1; /* safety */ 2661 if (ncp->nc_name) 2662 kfree(ncp->nc_name, M_VFSCACHE); 2663 kfree(ncp, M_VFSCACHE); 2664 2665 /* 2666 * Delayed drop (we had to release our spinlocks) 2667 */ 2668 if (dropvp) 2669 vdrop(dropvp); 2670 2671 /* 2672 * Loop up if we can recursively clean out the parent. 2673 */ 2674 if (par) { 2675 refcmp = 1; /* ref on parent */ 2676 if (par->nc_parent) /* par->par */ 2677 ++refcmp; 2678 par->nc_flag &= ~NCF_DEFEREDZAP; 2679 if ((par->nc_flag & NCF_UNRESOLVED) && 2680 par->nc_refs == refcmp && 2681 TAILQ_EMPTY(&par->nc_list)) { 2682 ncp = par; 2683 goto again; 2684 } 2685 _cache_unlock(par); 2686 _cache_drop(par); 2687 } 2688 } 2689 2690 /* 2691 * Clean up dangling negative cache and defered-drop entries in the 2692 * namecache. 2693 * 2694 * This routine is called in the critical path and also called from 2695 * vnlru(). When called from vnlru we use a lower limit to try to 2696 * deal with the negative cache before the critical path has to start 2697 * dealing with it. 
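 *
 * For a rough feel of the limits (illustrative numbers only; the actual
 * defaults for maxvnodes and ncnegfactor are set elsewhere): with
 * maxvnodes = 100000 and ncnegfactor = 16, neglimit = 100000 / 16 = 6250
 * on the critical path and 6250 * 8 / 10 = 5000 when called from vnlru(),
 * so vnlru() starts trimming negative entries somewhat before the critical
 * path would have to.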
2698 */ 2699 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2700 2701 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2702 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2703 2704 void 2705 cache_hysteresis(int critpath) 2706 { 2707 long poslimit; 2708 long neglimit = maxvnodes / ncnegfactor; 2709 long xnumcache = vfscache_leafs; 2710 2711 if (critpath == 0) 2712 neglimit = neglimit * 8 / 10; 2713 2714 /* 2715 * Don't cache too many negative hits. We use hysteresis to reduce 2716 * the impact on the critical path. 2717 */ 2718 switch(neg_cache_hysteresis_state[critpath]) { 2719 case CHI_LOW: 2720 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2721 if (critpath) 2722 _cache_cleanneg(ncnegflush); 2723 else 2724 _cache_cleanneg(ncnegflush + 2725 vfscache_negs - neglimit); 2726 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2727 } 2728 break; 2729 case CHI_HIGH: 2730 if (vfscache_negs > MINNEG * 9 / 10 && 2731 vfscache_negs * 9 / 10 > neglimit 2732 ) { 2733 if (critpath) 2734 _cache_cleanneg(ncnegflush); 2735 else 2736 _cache_cleanneg(ncnegflush + 2737 vfscache_negs * 9 / 10 - 2738 neglimit); 2739 } else { 2740 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2741 } 2742 break; 2743 } 2744 2745 /* 2746 * Don't cache too many positive hits. We use hysteresis to reduce 2747 * the impact on the critical path. 2748 * 2749 * Excessive positive hits can accumulate due to large numbers of 2750 * hardlinks (the vnode cache will not prevent hl ncps from growing 2751 * into infinity). 2752 */ 2753 if ((poslimit = ncposlimit) == 0) 2754 poslimit = maxvnodes * 2; 2755 if (critpath == 0) 2756 poslimit = poslimit * 8 / 10; 2757 2758 switch(pos_cache_hysteresis_state[critpath]) { 2759 case CHI_LOW: 2760 if (xnumcache > poslimit && xnumcache > MINPOS) { 2761 if (critpath) 2762 _cache_cleanpos(ncposflush); 2763 else 2764 _cache_cleanpos(ncposflush + 2765 xnumcache - poslimit); 2766 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2767 } 2768 break; 2769 case CHI_HIGH: 2770 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2771 if (critpath) 2772 _cache_cleanpos(ncposflush); 2773 else 2774 _cache_cleanpos(ncposflush + 2775 xnumcache - poslimit * 5 / 6); 2776 } else { 2777 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2778 } 2779 break; 2780 } 2781 2782 /* 2783 * Clean out dangling defered-zap ncps which could not be cleanly 2784 * dropped if too many build up. Note that numdefered is 2785 * heuristical. Make sure we are real-time for the current cpu, 2786 * plus the global rollup. 2787 */ 2788 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 2789 _cache_cleandefered(); 2790 } 2791 } 2792 2793 /* 2794 * NEW NAMECACHE LOOKUP API 2795 * 2796 * Lookup an entry in the namecache. The passed par_nch must be referenced 2797 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2798 * is ALWAYS returned, eve if the supplied component is illegal. 2799 * 2800 * The resulting namecache entry should be returned to the system with 2801 * cache_put() or cache_unlock() + cache_drop(). 2802 * 2803 * namecache locks are recursive but care must be taken to avoid lock order 2804 * reversals (hence why the passed par_nch must be unlocked). Locking 2805 * rules are to order for parent traversals, not for child traversals. 2806 * 2807 * Nobody else will be able to manipulate the associated namespace (e.g. 2808 * create, delete, rename, rename-target) until the caller unlocks the 2809 * entry. 
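 *
 * Minimal usage sketch (illustrative only, error handling elided):
 *
 *	nch = cache_nlookup(par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	else
 *		error = nch.ncp->nc_error;
 *	...
 *	cache_put(&nch);	(returns the locked ref to the system)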
 *
 * The returned entry will be in one of three states:  positive hit (non-null
 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
 * Unresolved entries must be resolved through the filesystem to associate the
 * vnode and/or determine whether a positive or negative hit has occurred.
 *
 * It is not necessary to lock a directory in order to lock namespace under
 * that directory.  In fact, it is explicitly not allowed to do that.  A
 * directory is typically only locked when being created, renamed, or
 * destroyed.
 *
 * The directory (par) may be unresolved, in which case any returned child
 * will likely also be marked unresolved.  Likely but not guaranteed.  Since
 * the filesystem lookup requires a resolved directory vnode the caller is
 * responsible for resolving the namecache chain top-down.  This API
 * specifically allows whole chains to be created in an unresolved state.
 */
struct nchandle
cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
{
	struct nchandle nch;
	struct namecache *ncp;
	struct namecache *new_ncp;
	struct namecache *rep_ncp;	/* reuse a destroyed ncp */
	struct nchash_head *nchpp;
	struct mount *mp;
	u_int32_t hash;
	globaldata_t gd;
	int par_locked;

	gd = mycpu;
	mp = par_nch->mount;
	par_locked = 0;

	/*
	 * This is a good time to call it, no ncp's are locked by
	 * the caller or us.
	 */
	cache_hysteresis(1);

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	new_ncp = NULL;
	nchpp = NCHHASH(hash);
restart:
	rep_ncp = NULL;
	if (new_ncp)
		spin_lock(&nchpp->spin);
	else
		spin_lock_shared(&nchpp->spin);

	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 * do not.
		 *
		 * We may be able to reuse DESTROYED entries that we come
		 * across, even if the name does not match, as long as
		 * nc_nlen is correct and the only hold ref is from the nchpp
		 * list itself.
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen) {
			if (ncp->nc_flag & NCF_DESTROYED) {
				if (ncp->nc_refs == 1 && rep_ncp == NULL)
					rep_ncp = ncp;
				continue;
			}
			if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
				continue;
			_cache_hold(ncp);
			if (new_ncp)
				spin_unlock(&nchpp->spin);
			else
				spin_unlock_shared(&nchpp->spin);
			if (par_locked) {
				_cache_unlock(par_nch->ncp);
				par_locked = 0;
			}
			if (_cache_lock_special(ncp) == 0) {
				/*
				 * Successfully locked but we must re-test
				 * conditions that might have changed since
				 * we did not have the lock before.
2898 */ 2899 if (ncp->nc_parent != par_nch->ncp || 2900 ncp->nc_nlen != nlc->nlc_namelen || 2901 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2902 ncp->nc_nlen) || 2903 (ncp->nc_flag & NCF_DESTROYED)) { 2904 _cache_put(ncp); 2905 goto restart; 2906 } 2907 _cache_auto_unresolve(mp, ncp); 2908 if (new_ncp) 2909 _cache_free(new_ncp); 2910 goto found; 2911 } 2912 _cache_get(ncp); /* cycle the lock to block */ 2913 _cache_put(ncp); 2914 _cache_drop(ncp); 2915 goto restart; 2916 } 2917 } 2918 2919 /* 2920 * We failed to locate the entry, try to resurrect a destroyed 2921 * entry that we did find that is already correctly linked into 2922 * nchpp and the parent. We must re-test conditions after 2923 * successfully locking rep_ncp. 2924 * 2925 * This case can occur under heavy loads due to not being able 2926 * to safely lock the parent in cache_zap(). Nominally a repeated 2927 * create/unlink load, but only the namelen needs to match. 2928 */ 2929 if (rep_ncp && new_ncp == NULL) { 2930 if (_cache_lock_nonblock(rep_ncp) == 0) { 2931 _cache_hold(rep_ncp); 2932 if (rep_ncp->nc_parent == par_nch->ncp && 2933 rep_ncp->nc_nlen == nlc->nlc_namelen && 2934 (rep_ncp->nc_flag & NCF_DESTROYED) && 2935 rep_ncp->nc_refs == 2) { 2936 /* 2937 * Update nc_name as reuse as new. 2938 */ 2939 ncp = rep_ncp; 2940 bcopy(nlc->nlc_nameptr, ncp->nc_name, 2941 nlc->nlc_namelen); 2942 spin_unlock_shared(&nchpp->spin); 2943 _cache_setunresolved(ncp); 2944 ncp->nc_flag = NCF_UNRESOLVED; 2945 ncp->nc_error = ENOTCONN; 2946 goto found; 2947 } 2948 _cache_put(rep_ncp); 2949 } 2950 } 2951 2952 /* 2953 * Otherwise create a new entry and add it to the cache. The parent 2954 * ncp must also be locked so we can link into it. 2955 * 2956 * We have to relookup after possibly blocking in kmalloc or 2957 * when locking par_nch. 2958 * 2959 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 2960 * mount case, in which case nc_name will be NULL. 2961 */ 2962 if (new_ncp == NULL) { 2963 spin_unlock_shared(&nchpp->spin); 2964 new_ncp = cache_alloc(nlc->nlc_namelen); 2965 if (nlc->nlc_namelen) { 2966 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 2967 nlc->nlc_namelen); 2968 new_ncp->nc_name[nlc->nlc_namelen] = 0; 2969 } 2970 goto restart; 2971 } 2972 2973 /* 2974 * NOTE! The spinlock is held exclusively here because new_ncp 2975 * is non-NULL. 2976 */ 2977 if (par_locked == 0) { 2978 spin_unlock(&nchpp->spin); 2979 _cache_lock(par_nch->ncp); 2980 par_locked = 1; 2981 goto restart; 2982 } 2983 2984 /* 2985 * Link to parent (requires another ref, the one already in new_ncp 2986 * is what we wil lreturn). 2987 * 2988 * WARNING! We still hold the spinlock. We have to set the hash 2989 * table entry atomically. 2990 */ 2991 ncp = new_ncp; 2992 ++ncp->nc_refs; 2993 _cache_link_parent(ncp, par_nch->ncp, nchpp); 2994 spin_unlock(&nchpp->spin); 2995 _cache_unlock(par_nch->ncp); 2996 /* par_locked = 0 - not used */ 2997 found: 2998 /* 2999 * stats and namecache size management 3000 */ 3001 if (ncp->nc_flag & NCF_UNRESOLVED) 3002 ++gd->gd_nchstats->ncs_miss; 3003 else if (ncp->nc_vp) 3004 ++gd->gd_nchstats->ncs_goodhits; 3005 else 3006 ++gd->gd_nchstats->ncs_neghits; 3007 nch.mount = mp; 3008 nch.ncp = ncp; 3009 _cache_mntref(nch.mount); 3010 3011 return(nch); 3012 } 3013 3014 /* 3015 * Attempt to lookup a namecache entry and return with a shared namecache 3016 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3017 * set or we are unable to lock. 
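 *
 * Callers are expected to fall back to the normal (possibly blocking,
 * possibly exclusive) cache_nlookup() path when this fails, e.g.
 * (sketch only):
 *
 *	if (cache_nlookup_maybe_shared(par_nch, &nlc, excl, &nch) ==
 *	    EWOULDBLOCK) {
 *		nch = cache_nlookup(par_nch, &nlc);
 *	}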
3018 */ 3019 int 3020 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3021 struct nlcomponent *nlc, 3022 int excl, struct nchandle *res_nch) 3023 { 3024 struct namecache *ncp; 3025 struct nchash_head *nchpp; 3026 struct mount *mp; 3027 u_int32_t hash; 3028 globaldata_t gd; 3029 3030 /* 3031 * If exclusive requested or shared namecache locks are disabled, 3032 * return failure. 3033 */ 3034 if (ncp_shared_lock_disable || excl) 3035 return(EWOULDBLOCK); 3036 3037 gd = mycpu; 3038 mp = par_nch->mount; 3039 3040 /* 3041 * This is a good time to call it, no ncp's are locked by 3042 * the caller or us. 3043 */ 3044 cache_hysteresis(1); 3045 3046 /* 3047 * Try to locate an existing entry 3048 */ 3049 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3050 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3051 nchpp = NCHHASH(hash); 3052 3053 spin_lock_shared(&nchpp->spin); 3054 3055 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3056 /* 3057 * Break out if we find a matching entry. Note that 3058 * UNRESOLVED entries may match, but DESTROYED entries 3059 * do not. 3060 */ 3061 if (ncp->nc_parent == par_nch->ncp && 3062 ncp->nc_nlen == nlc->nlc_namelen && 3063 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3064 (ncp->nc_flag & NCF_DESTROYED) == 0 3065 ) { 3066 _cache_hold(ncp); 3067 spin_unlock_shared(&nchpp->spin); 3068 3069 if (_cache_lock_shared_special(ncp) == 0) { 3070 if (ncp->nc_parent == par_nch->ncp && 3071 ncp->nc_nlen == nlc->nlc_namelen && 3072 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3073 ncp->nc_nlen) == 0 && 3074 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3075 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3076 _cache_auto_unresolve_test(mp, ncp) == 0) { 3077 goto found; 3078 } 3079 _cache_unlock(ncp); 3080 } 3081 _cache_drop(ncp); 3082 return(EWOULDBLOCK); 3083 } 3084 } 3085 3086 /* 3087 * Failure 3088 */ 3089 spin_unlock_shared(&nchpp->spin); 3090 return(EWOULDBLOCK); 3091 3092 /* 3093 * Success 3094 * 3095 * Note that nc_error might be non-zero (e.g ENOENT). 3096 */ 3097 found: 3098 res_nch->mount = mp; 3099 res_nch->ncp = ncp; 3100 ++gd->gd_nchstats->ncs_goodhits; 3101 _cache_mntref(res_nch->mount); 3102 3103 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3104 return(ncp->nc_error); 3105 } 3106 3107 /* 3108 * This is a non-blocking verison of cache_nlookup() used by 3109 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3110 * will return nch.ncp == NULL in that case. 3111 */ 3112 struct nchandle 3113 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3114 { 3115 struct nchandle nch; 3116 struct namecache *ncp; 3117 struct namecache *new_ncp; 3118 struct nchash_head *nchpp; 3119 struct mount *mp; 3120 u_int32_t hash; 3121 globaldata_t gd; 3122 int par_locked; 3123 3124 gd = mycpu; 3125 mp = par_nch->mount; 3126 par_locked = 0; 3127 3128 /* 3129 * Try to locate an existing entry 3130 */ 3131 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3132 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3133 new_ncp = NULL; 3134 nchpp = NCHHASH(hash); 3135 restart: 3136 spin_lock(&nchpp->spin); 3137 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3138 /* 3139 * Break out if we find a matching entry. Note that 3140 * UNRESOLVED entries may match, but DESTROYED entries 3141 * do not. 
3142 */ 3143 if (ncp->nc_parent == par_nch->ncp && 3144 ncp->nc_nlen == nlc->nlc_namelen && 3145 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3146 (ncp->nc_flag & NCF_DESTROYED) == 0 3147 ) { 3148 _cache_hold(ncp); 3149 spin_unlock(&nchpp->spin); 3150 if (par_locked) { 3151 _cache_unlock(par_nch->ncp); 3152 par_locked = 0; 3153 } 3154 if (_cache_lock_special(ncp) == 0) { 3155 if (ncp->nc_parent != par_nch->ncp || 3156 ncp->nc_nlen != nlc->nlc_namelen || 3157 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3158 (ncp->nc_flag & NCF_DESTROYED)) { 3159 kprintf("cache_lookup_nonblock: " 3160 "ncp-race %p %*.*s\n", 3161 ncp, 3162 nlc->nlc_namelen, 3163 nlc->nlc_namelen, 3164 nlc->nlc_nameptr); 3165 _cache_unlock(ncp); 3166 _cache_drop(ncp); 3167 goto failed; 3168 } 3169 _cache_auto_unresolve(mp, ncp); 3170 if (new_ncp) { 3171 _cache_free(new_ncp); 3172 new_ncp = NULL; 3173 } 3174 goto found; 3175 } 3176 _cache_drop(ncp); 3177 goto failed; 3178 } 3179 } 3180 3181 /* 3182 * We failed to locate an entry, create a new entry and add it to 3183 * the cache. The parent ncp must also be locked so we 3184 * can link into it. 3185 * 3186 * We have to relookup after possibly blocking in kmalloc or 3187 * when locking par_nch. 3188 * 3189 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3190 * mount case, in which case nc_name will be NULL. 3191 */ 3192 if (new_ncp == NULL) { 3193 spin_unlock(&nchpp->spin); 3194 new_ncp = cache_alloc(nlc->nlc_namelen); 3195 if (nlc->nlc_namelen) { 3196 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3197 nlc->nlc_namelen); 3198 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3199 } 3200 goto restart; 3201 } 3202 if (par_locked == 0) { 3203 spin_unlock(&nchpp->spin); 3204 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3205 par_locked = 1; 3206 goto restart; 3207 } 3208 goto failed; 3209 } 3210 3211 /* 3212 * Link to parent (requires another ref, the one already in new_ncp 3213 * is what we wil lreturn). 3214 * 3215 * WARNING! We still hold the spinlock. We have to set the hash 3216 * table entry atomically. 3217 */ 3218 ncp = new_ncp; 3219 ++ncp->nc_refs; 3220 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3221 spin_unlock(&nchpp->spin); 3222 _cache_unlock(par_nch->ncp); 3223 /* par_locked = 0 - not used */ 3224 found: 3225 /* 3226 * stats and namecache size management 3227 */ 3228 if (ncp->nc_flag & NCF_UNRESOLVED) 3229 ++gd->gd_nchstats->ncs_miss; 3230 else if (ncp->nc_vp) 3231 ++gd->gd_nchstats->ncs_goodhits; 3232 else 3233 ++gd->gd_nchstats->ncs_neghits; 3234 nch.mount = mp; 3235 nch.ncp = ncp; 3236 _cache_mntref(nch.mount); 3237 3238 return(nch); 3239 failed: 3240 if (new_ncp) { 3241 _cache_free(new_ncp); 3242 new_ncp = NULL; 3243 } 3244 nch.mount = NULL; 3245 nch.ncp = NULL; 3246 return(nch); 3247 } 3248 3249 /* 3250 * This version is non-locking. The caller must validate the result 3251 * for parent-to-child continuity. 3252 * 3253 * It can fail for any reason and will return nch.ncp == NULL in that case. 
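 *
 * A possible validation pattern (a sketch, not a prescription): after the
 * lookup the caller re-checks that the entry still hangs off the expected
 * parent and otherwise falls back to the locking API, e.g.
 *
 *	nch = cache_nlookup_nonlocked(par_nch, &nlc);
 *	if (nch.ncp == NULL || nch.ncp->nc_parent != par_nch->ncp) {
 *		if (nch.ncp)
 *			cache_drop(&nch);
 *		(fall back to cache_nlookup())
 *	}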
 */
struct nchandle
cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
{
	struct nchandle nch;
	struct namecache *ncp;
	struct nchash_head *nchpp;
	struct mount *mp;
	u_int32_t hash;
	globaldata_t gd;

	gd = mycpu;
	mp = par_nch->mount;

	/*
	 * Try to locate an existing entry
	 */
	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	nchpp = NCHHASH(hash);

	spin_lock_shared(&nchpp->spin);
	TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
		/*
		 * Break out if we find a matching entry.  Note that
		 * UNRESOLVED entries may match, but DESTROYED entries
		 * do not.
		 *
		 * Resolved NFS entries which have timed out fail so the
		 * caller can rerun with normal locking.
		 */
		if (ncp->nc_parent == par_nch->ncp &&
		    ncp->nc_nlen == nlc->nlc_namelen &&
		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
		    (ncp->nc_flag & NCF_DESTROYED) == 0
		) {
			if (_cache_auto_unresolve_test(par_nch->mount, ncp))
				break;
			_cache_hold(ncp);
			spin_unlock_shared(&nchpp->spin);
			goto found;
		}
	}
	spin_unlock_shared(&nchpp->spin);
	nch.mount = NULL;
	nch.ncp = NULL;
	return nch;
found:
	/*
	 * stats and namecache size management
	 */
	if (ncp->nc_flag & NCF_UNRESOLVED)
		++gd->gd_nchstats->ncs_miss;
	else if (ncp->nc_vp)
		++gd->gd_nchstats->ncs_goodhits;
	else
		++gd->gd_nchstats->ncs_neghits;
	nch.mount = mp;
	nch.ncp = ncp;
	_cache_mntref(nch.mount);

	return(nch);
}

/*
 * The namecache entry is marked as being used as a mount point.
 * Locate the mount if it is visible to the caller.  The DragonFly
 * mount system allows arbitrary loops in the topology and disentangles
 * those loops by matching against (mp, ncp) rather than just (ncp).
 * This means any given ncp can dive any number of mounts, depending
 * on the relative mount (e.g. nullfs) the caller is at in the topology.
 *
 * We use a very simple frontend cache to reduce SMP conflicts,
 * which we have to do because the mountlist scan needs an exclusive
 * lock around its ripout info list.  Not to mention that there might
 * be a lot of mounts.
 *
 * Because all mounts can potentially be accessed by all cpus, break the cpu's
 * down a bit to allow some contention rather than making the cache
 * excessively huge.
 *
 * The hash table is a single global array and is NCMOUNT_SET-way
 * (currently 8-way) set-associative.
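 *
 * Set selection works roughly as follows (numbers from the constants
 * defined above): iscsi_crc32(&mp) is extended with &ncp and xor-folded,
 * then masked with (NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1).  With
 * NCMOUNT_NUMCACHE = 16384 and NCMOUNT_SET = 8 that yields one of
 * 16384 / 8 = 2048 sets of 8 consecutive ncmount_cache[] entries; the
 * lookup scans the set for an exact (mp, ncp) match and otherwise returns
 * the stalest slot (largest ticks delta) for replacement.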
3336 */ 3337 struct findmount_info { 3338 struct mount *result; 3339 struct mount *nch_mount; 3340 struct namecache *nch_ncp; 3341 }; 3342 3343 static __inline 3344 struct ncmount_cache * 3345 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3346 { 3347 uint32_t hash; 3348 3349 hash = iscsi_crc32(&mp, sizeof(mp)); 3350 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3351 hash ^= hash >> 16; 3352 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3353 3354 return (&ncmount_cache[hash]); 3355 } 3356 3357 static 3358 struct ncmount_cache * 3359 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3360 { 3361 struct ncmount_cache *ncc; 3362 struct ncmount_cache *best; 3363 int delta; 3364 int best_delta; 3365 int i; 3366 3367 ncc = ncmount_cache_lookup4(mp, ncp); 3368 3369 /* 3370 * NOTE: When checking for a ticks overflow implement a slop of 3371 * 2 ticks just to be safe, because ticks is accessed 3372 * non-atomically one CPU can increment it while another 3373 * is still using the old value. 3374 */ 3375 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3376 return ncc; 3377 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3378 if (delta < -2) /* overflow reset */ 3379 ncc->ticks = ticks; 3380 best = ncc; 3381 best_delta = delta; 3382 3383 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3384 ++ncc; 3385 if (ncc->ncp == ncp && ncc->mp == mp) 3386 return ncc; 3387 delta = (int)(ticks - ncc->ticks); 3388 if (delta < -2) 3389 ncc->ticks = ticks; 3390 if (delta > best_delta) { 3391 best_delta = delta; 3392 best = ncc; 3393 } 3394 } 3395 return best; 3396 } 3397 3398 /* 3399 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3400 * doing an expensive mountlist_scan*() if possible. 3401 * 3402 * (mp, ncp) -> mountonpt.k 3403 * 3404 * Returns a referenced mount pointer or NULL 3405 * 3406 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3407 * operations (that is, where the mp_target can be freed out from under us). 3408 * 3409 * Lookups use the ncc->updating counter to validate the contents in order 3410 * to avoid having to obtain the per cache-element spin-lock. In addition, 3411 * the ticks field is only updated when it changes. However, if our per-cpu 3412 * lock fails due to an unmount-in-progress, we fall-back to the 3413 * cache-element's spin-lock. 3414 */ 3415 struct mount * 3416 cache_findmount(struct nchandle *nch) 3417 { 3418 struct findmount_info info; 3419 struct ncmount_cache *ncc; 3420 struct ncmount_cache ncc_copy; 3421 struct mount *target; 3422 struct pcpu_ncache *pcpu; 3423 struct spinlock *spinlk; 3424 int update; 3425 3426 pcpu = pcpu_ncache; 3427 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3428 ncc = NULL; 3429 goto skip; 3430 } 3431 pcpu += mycpu->gd_cpuid; 3432 3433 again: 3434 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3435 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3436 found: 3437 /* 3438 * This is a bit messy for now because we do not yet have 3439 * safe disposal of mount structures. We have to ref 3440 * ncc->mp_target but the 'update' counter only tell us 3441 * whether the cache has changed after the fact. 3442 * 3443 * For now get a per-cpu spinlock that will only contend 3444 * against umount's. This is the best path. If it fails, 3445 * instead of waiting on the umount we fall-back to a 3446 * shared ncc->spin lock, which will generally only cost a 3447 * cache ping-pong. 
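		 *
		 * The ncc->updating counter is used seqlock-style: writers
		 * bump it to an odd value before modifying the element and
		 * to an even value when done, so a reader that samples an
		 * even value, copies the element, and then observes the
		 * same value afterwards knows the copy is consistent
		 * without having taken ncc->spin exclusively.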
3448 */ 3449 update = ncc->updating; 3450 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3451 spinlk = &pcpu->umount_spin; 3452 } else { 3453 spinlk = &ncc->spin; 3454 spin_lock_shared(spinlk); 3455 } 3456 if (update & 1) { /* update in progress */ 3457 spin_unlock_any(spinlk); 3458 goto skip; 3459 } 3460 ncc_copy = *ncc; 3461 cpu_lfence(); 3462 if (ncc->updating != update) { /* content changed */ 3463 spin_unlock_any(spinlk); 3464 goto again; 3465 } 3466 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3467 spin_unlock_any(spinlk); 3468 goto again; 3469 } 3470 if (ncc_copy.isneg == 0) { 3471 target = ncc_copy.mp_target; 3472 if (target->mnt_ncmounton.mount == nch->mount && 3473 target->mnt_ncmounton.ncp == nch->ncp) { 3474 /* 3475 * Cache hit (positive) (avoid dirtying 3476 * the cache line if possible) 3477 */ 3478 if (ncc->ticks != (int)ticks) 3479 ncc->ticks = (int)ticks; 3480 _cache_mntref(target); 3481 } 3482 } else { 3483 /* 3484 * Cache hit (negative) (avoid dirtying 3485 * the cache line if possible) 3486 */ 3487 if (ncc->ticks != (int)ticks) 3488 ncc->ticks = (int)ticks; 3489 target = NULL; 3490 } 3491 spin_unlock_any(spinlk); 3492 3493 return target; 3494 } 3495 skip: 3496 3497 /* 3498 * Slow 3499 */ 3500 info.result = NULL; 3501 info.nch_mount = nch->mount; 3502 info.nch_ncp = nch->ncp; 3503 mountlist_scan(cache_findmount_callback, &info, 3504 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3505 3506 /* 3507 * To reduce multi-re-entry on the cache, relookup in the cache. 3508 * This can still race, obviously, but that's ok. 3509 */ 3510 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3511 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3512 if (info.result) 3513 atomic_add_int(&info.result->mnt_refs, -1); 3514 goto found; 3515 } 3516 3517 /* 3518 * Cache the result. 3519 */ 3520 if ((info.result == NULL || 3521 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3522 spin_lock(&ncc->spin); 3523 atomic_add_int_nonlocked(&ncc->updating, 1); 3524 cpu_sfence(); 3525 KKASSERT(ncc->updating & 1); 3526 if (ncc->mp != nch->mount) { 3527 if (ncc->mp) 3528 atomic_add_int(&ncc->mp->mnt_refs, -1); 3529 atomic_add_int(&nch->mount->mnt_refs, 1); 3530 ncc->mp = nch->mount; 3531 } 3532 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3533 ncc->ticks = (int)ticks; 3534 3535 if (info.result) { 3536 ncc->isneg = 0; 3537 if (ncc->mp_target != info.result) { 3538 if (ncc->mp_target) 3539 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3540 ncc->mp_target = info.result; 3541 atomic_add_int(&info.result->mnt_refs, 1); 3542 } 3543 } else { 3544 ncc->isneg = 1; 3545 if (ncc->mp_target) { 3546 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3547 ncc->mp_target = NULL; 3548 } 3549 } 3550 cpu_sfence(); 3551 atomic_add_int_nonlocked(&ncc->updating, 1); 3552 spin_unlock(&ncc->spin); 3553 } 3554 return(info.result); 3555 } 3556 3557 static 3558 int 3559 cache_findmount_callback(struct mount *mp, void *data) 3560 { 3561 struct findmount_info *info = data; 3562 3563 /* 3564 * Check the mount's mounted-on point against the passed nch. 3565 */ 3566 if (mp->mnt_ncmounton.mount == info->nch_mount && 3567 mp->mnt_ncmounton.ncp == info->nch_ncp 3568 ) { 3569 info->result = mp; 3570 _cache_mntref(mp); 3571 return(-1); 3572 } 3573 return(0); 3574 } 3575 3576 void 3577 cache_dropmount(struct mount *mp) 3578 { 3579 _cache_mntrel(mp); 3580 } 3581 3582 /* 3583 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3584 * or negative). 
3585 * 3586 * A full scan is not required, but for now just do it anyway. 3587 */ 3588 void 3589 cache_ismounting(struct mount *mp) 3590 { 3591 struct ncmount_cache *ncc; 3592 struct mount *ncc_mp; 3593 int i; 3594 3595 if (pcpu_ncache == NULL) 3596 return; 3597 3598 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3599 ncc = &ncmount_cache[i]; 3600 if (ncc->mp != mp->mnt_ncmounton.mount || 3601 ncc->ncp != mp->mnt_ncmounton.ncp) { 3602 continue; 3603 } 3604 spin_lock(&ncc->spin); 3605 atomic_add_int_nonlocked(&ncc->updating, 1); 3606 cpu_sfence(); 3607 KKASSERT(ncc->updating & 1); 3608 if (ncc->mp != mp->mnt_ncmounton.mount || 3609 ncc->ncp != mp->mnt_ncmounton.ncp) { 3610 cpu_sfence(); 3611 ++ncc->updating; 3612 spin_unlock(&ncc->spin); 3613 continue; 3614 } 3615 ncc_mp = ncc->mp; 3616 ncc->ncp = NULL; 3617 ncc->mp = NULL; 3618 if (ncc_mp) 3619 atomic_add_int(&ncc_mp->mnt_refs, -1); 3620 ncc_mp = ncc->mp_target; 3621 ncc->mp_target = NULL; 3622 if (ncc_mp) 3623 atomic_add_int(&ncc_mp->mnt_refs, -1); 3624 ncc->ticks = (int)ticks - hz * 120; 3625 3626 cpu_sfence(); 3627 atomic_add_int_nonlocked(&ncc->updating, 1); 3628 spin_unlock(&ncc->spin); 3629 } 3630 3631 /* 3632 * Pre-cache the mount point 3633 */ 3634 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3635 mp->mnt_ncmounton.ncp); 3636 3637 spin_lock(&ncc->spin); 3638 atomic_add_int_nonlocked(&ncc->updating, 1); 3639 cpu_sfence(); 3640 KKASSERT(ncc->updating & 1); 3641 3642 if (ncc->mp) 3643 atomic_add_int(&ncc->mp->mnt_refs, -1); 3644 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3645 ncc->mp = mp->mnt_ncmounton.mount; 3646 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3647 ncc->ticks = (int)ticks; 3648 3649 ncc->isneg = 0; 3650 if (ncc->mp_target != mp) { 3651 if (ncc->mp_target) 3652 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3653 ncc->mp_target = mp; 3654 atomic_add_int(&mp->mnt_refs, 1); 3655 } 3656 cpu_sfence(); 3657 atomic_add_int_nonlocked(&ncc->updating, 1); 3658 spin_unlock(&ncc->spin); 3659 } 3660 3661 /* 3662 * Scrap any ncmount_cache entries related to mp. Not only do we need to 3663 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3664 * negative hits involving (mp, <any>). 3665 * 3666 * A full scan is required. 
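 *
 * The per-cpu umount_spin locks taken below are what interlock against
 * the lockless fast path in cache_findmount(), which spin_trylock()s its
 * own cpu's umount_spin (falling back to the per-entry ncc->spin when
 * that fails) before trusting a cached translation.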
3667 */ 3668 void 3669 cache_unmounting(struct mount *mp) 3670 { 3671 struct ncmount_cache *ncc; 3672 struct pcpu_ncache *pcpu; 3673 struct mount *ncc_mp; 3674 int i; 3675 3676 pcpu = pcpu_ncache; 3677 if (pcpu == NULL) 3678 return; 3679 3680 for (i = 0; i < ncpus; ++i) 3681 spin_lock(&pcpu[i].umount_spin); 3682 3683 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3684 ncc = &ncmount_cache[i]; 3685 if (ncc->mp != mp && ncc->mp_target != mp) 3686 continue; 3687 spin_lock(&ncc->spin); 3688 atomic_add_int_nonlocked(&ncc->updating, 1); 3689 cpu_sfence(); 3690 3691 if (ncc->mp != mp && ncc->mp_target != mp) { 3692 atomic_add_int_nonlocked(&ncc->updating, 1); 3693 cpu_sfence(); 3694 spin_unlock(&ncc->spin); 3695 continue; 3696 } 3697 ncc_mp = ncc->mp; 3698 ncc->ncp = NULL; 3699 ncc->mp = NULL; 3700 if (ncc_mp) 3701 atomic_add_int(&ncc_mp->mnt_refs, -1); 3702 ncc_mp = ncc->mp_target; 3703 ncc->mp_target = NULL; 3704 if (ncc_mp) 3705 atomic_add_int(&ncc_mp->mnt_refs, -1); 3706 ncc->ticks = (int)ticks - hz * 120; 3707 3708 cpu_sfence(); 3709 atomic_add_int_nonlocked(&ncc->updating, 1); 3710 spin_unlock(&ncc->spin); 3711 } 3712 3713 for (i = 0; i < ncpus; ++i) 3714 spin_unlock(&pcpu[i].umount_spin); 3715 } 3716 3717 /* 3718 * Resolve an unresolved namecache entry, generally by looking it up. 3719 * The passed ncp must be locked and refd. 3720 * 3721 * Theoretically since a vnode cannot be recycled while held, and since 3722 * the nc_parent chain holds its vnode as long as children exist, the 3723 * direct parent of the cache entry we are trying to resolve should 3724 * have a valid vnode. If not then generate an error that we can 3725 * determine is related to a resolver bug. 3726 * 3727 * However, if a vnode was in the middle of a recyclement when the NCP 3728 * got locked, ncp->nc_vp might point to a vnode that is about to become 3729 * invalid. cache_resolve() handles this case by unresolving the entry 3730 * and then re-resolving it. 3731 * 3732 * Note that successful resolution does not necessarily return an error 3733 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3734 * will be returned. 3735 */ 3736 int 3737 cache_resolve(struct nchandle *nch, struct ucred *cred) 3738 { 3739 struct namecache *par_tmp; 3740 struct namecache *par; 3741 struct namecache *ncp; 3742 struct nchandle nctmp; 3743 struct mount *mp; 3744 struct vnode *dvp; 3745 int error; 3746 3747 ncp = nch->ncp; 3748 mp = nch->mount; 3749 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3750 restart: 3751 /* 3752 * If the ncp is already resolved we have nothing to do. However, 3753 * we do want to guarentee that a usable vnode is returned when 3754 * a vnode is present, so make sure it hasn't been reclaimed. 3755 */ 3756 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3757 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3758 _cache_setunresolved(ncp); 3759 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3760 return (ncp->nc_error); 3761 } 3762 3763 /* 3764 * If the ncp was destroyed it will never resolve again. This 3765 * can basically only happen when someone is chdir'd into an 3766 * empty directory which is then rmdir'd. We want to catch this 3767 * here and not dive the VFS because the VFS might actually 3768 * have a way to re-resolve the disconnected ncp, which will 3769 * result in inconsistencies in the cdir/nch for proc->p_fd. 
 */
	if (ncp->nc_flag & NCF_DESTROYED)
		return(EINVAL);

	/*
	 * Mount points need special handling because the parent does not
	 * belong to the same filesystem as the ncp.
	 */
	if (ncp == mp->mnt_ncmountpt.ncp)
		return (cache_resolve_mp(mp));

	/*
	 * We expect an unbroken chain of ncps to at least the mount point,
	 * and even all the way to root (but this code doesn't have to go
	 * past the mount point).
	 */
	if (ncp->nc_parent == NULL) {
		kprintf("EXDEV case 1 %p %*.*s\n", ncp,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		ncp->nc_error = EXDEV;
		return(ncp->nc_error);
	}

	/*
	 * The vp's of the parent directories in the chain are held via vhold()
	 * due to the existence of the child, and should not disappear.
	 * However, there are cases where they can disappear:
	 *
	 *	- due to filesystem I/O errors.
	 *	- due to NFS being stupid about tracking the namespace and
	 *	  destroying the namespace for entire directories quite often.
	 *	- due to forced unmounts.
	 *	- due to an rmdir (parent will be marked DESTROYED)
	 *
	 * When this occurs we have to track the chain backwards and resolve
	 * it, looping until the resolver catches up to the current node.  We
	 * could recurse here but we might run ourselves out of kernel stack
	 * so we do it in a more painful manner.  This situation really should
	 * not occur all that often, or if it does it should not have to go
	 * back too many nodes to resolve the ncp.
	 */
	while ((dvp = cache_dvpref(ncp)) == NULL) {
		/*
		 * This case can occur if a process is CD'd into a
		 * directory which is then rmdir'd.  If the parent is marked
		 * destroyed there is no point trying to resolve it.
		 */
		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
			return(ENOENT);
		par = ncp->nc_parent;
		_cache_hold(par);
		_cache_lock(par);
		while ((par_tmp = par->nc_parent) != NULL &&
		       par_tmp->nc_vp == NULL) {
			_cache_hold(par_tmp);
			_cache_lock(par_tmp);
			_cache_put(par);
			par = par_tmp;
		}
		if (par->nc_parent == NULL) {
			kprintf("EXDEV case 2 %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			return (EXDEV);
		}
		/*
		 * The parent is not set in stone, ref and lock it to prevent
		 * it from disappearing.  Also note that due to renames it
		 * is possible for our ncp to move and for par to no longer
		 * be one of its parents.  We resolve it anyway, the loop
		 * will handle any moves.
3841 */ 3842 _cache_get(par); /* additional hold/lock */ 3843 _cache_put(par); /* from earlier hold/lock */ 3844 if (par == nch->mount->mnt_ncmountpt.ncp) { 3845 cache_resolve_mp(nch->mount); 3846 } else if ((dvp = cache_dvpref(par)) == NULL) { 3847 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 3848 par->nc_nlen, par->nc_nlen, par->nc_name); 3849 _cache_put(par); 3850 continue; 3851 } else { 3852 if (par->nc_flag & NCF_UNRESOLVED) { 3853 nctmp.mount = mp; 3854 nctmp.ncp = par; 3855 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3856 } 3857 vrele(dvp); 3858 } 3859 if ((error = par->nc_error) != 0) { 3860 if (par->nc_error != EAGAIN) { 3861 kprintf("EXDEV case 3 %*.*s error %d\n", 3862 par->nc_nlen, par->nc_nlen, par->nc_name, 3863 par->nc_error); 3864 _cache_put(par); 3865 return(error); 3866 } 3867 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3868 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3869 } 3870 _cache_put(par); 3871 /* loop */ 3872 } 3873 3874 /* 3875 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3876 * ncp's and reattach them. If this occurs the original ncp is marked 3877 * EAGAIN to force a relookup. 3878 * 3879 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3880 * ncp must already be resolved. 3881 */ 3882 if (dvp) { 3883 nctmp.mount = mp; 3884 nctmp.ncp = ncp; 3885 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3886 vrele(dvp); 3887 } else { 3888 ncp->nc_error = EPERM; 3889 } 3890 if (ncp->nc_error == EAGAIN) { 3891 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3892 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3893 goto restart; 3894 } 3895 return(ncp->nc_error); 3896 } 3897 3898 /* 3899 * Resolve the ncp associated with a mount point. Such ncp's almost always 3900 * remain resolved and this routine is rarely called. NFS MPs tends to force 3901 * re-resolution more often due to its mac-truck-smash-the-namecache 3902 * method of tracking namespace changes. 3903 * 3904 * The semantics for this call is that the passed ncp must be locked on 3905 * entry and will be locked on return. However, if we actually have to 3906 * resolve the mount point we temporarily unlock the entry in order to 3907 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3908 * the unlock we have to recheck the flags after we relock. 3909 */ 3910 static int 3911 cache_resolve_mp(struct mount *mp) 3912 { 3913 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3914 struct vnode *vp; 3915 int error; 3916 3917 KKASSERT(mp != NULL); 3918 3919 /* 3920 * If the ncp is already resolved we have nothing to do. However, 3921 * we do want to guarentee that a usable vnode is returned when 3922 * a vnode is present, so make sure it hasn't been reclaimed. 3923 */ 3924 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3925 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3926 _cache_setunresolved(ncp); 3927 } 3928 3929 if (ncp->nc_flag & NCF_UNRESOLVED) { 3930 _cache_unlock(ncp); 3931 while (vfs_busy(mp, 0)) 3932 ; 3933 error = VFS_ROOT(mp, &vp); 3934 _cache_lock(ncp); 3935 3936 /* 3937 * recheck the ncp state after relocking. 
3938 */ 3939 if (ncp->nc_flag & NCF_UNRESOLVED) { 3940 ncp->nc_error = error; 3941 if (error == 0) { 3942 _cache_setvp(mp, ncp, vp); 3943 vput(vp); 3944 } else { 3945 kprintf("[diagnostic] cache_resolve_mp: failed" 3946 " to resolve mount %p err=%d ncp=%p\n", 3947 mp, error, ncp); 3948 _cache_setvp(mp, ncp, NULL); 3949 } 3950 } else if (error == 0) { 3951 vput(vp); 3952 } 3953 vfs_unbusy(mp); 3954 } 3955 return(ncp->nc_error); 3956 } 3957 3958 /* 3959 * Clean out negative cache entries when too many have accumulated. 3960 */ 3961 static void 3962 _cache_cleanneg(long count) 3963 { 3964 struct pcpu_ncache *pn; 3965 struct namecache *ncp; 3966 static uint32_t neg_rover; 3967 uint32_t n; 3968 long vnegs; 3969 3970 n = neg_rover++; /* SMP heuristic, race ok */ 3971 cpu_ccfence(); 3972 n = n % (uint32_t)ncpus; 3973 3974 /* 3975 * Normalize vfscache_negs and count. count is sometimes based 3976 * on vfscache_negs. vfscache_negs is a heuristic and can sometimes 3977 * have crazy values. 3978 */ 3979 vnegs = vfscache_negs; 3980 cpu_ccfence(); 3981 if (vnegs <= MINNEG) 3982 vnegs = MINNEG; 3983 if (count < 1) 3984 count = 1; 3985 3986 pn = &pcpu_ncache[n]; 3987 spin_lock(&pn->neg_spin); 3988 count = pn->neg_count * count / vnegs + 1; 3989 spin_unlock(&pn->neg_spin); 3990 3991 /* 3992 * Attempt to clean out the specified number of negative cache 3993 * entries. 3994 */ 3995 while (count > 0) { 3996 spin_lock(&pn->neg_spin); 3997 ncp = TAILQ_FIRST(&pn->neg_list); 3998 if (ncp == NULL) { 3999 spin_unlock(&pn->neg_spin); 4000 break; 4001 } 4002 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 4003 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 4004 _cache_hold(ncp); 4005 spin_unlock(&pn->neg_spin); 4006 4007 /* 4008 * This can race, so we must re-check that the ncp 4009 * is still on the negative list after successfully locking it. 4010 */ 4011 if (_cache_lock_special(ncp) == 0) { 4012 if (ncp->nc_vp == NULL && 4013 (ncp->nc_flag & NCF_UNRESOLVED) == 0) { 4014 cache_zap(ncp); 4015 } else { 4016 _cache_unlock(ncp); 4017 _cache_drop(ncp); 4018 } 4019 } else { 4020 _cache_drop(ncp); 4021 } 4022 --count; 4023 } 4024 } 4025 4026 /* 4027 * Clean out positive cache entries when too many have accumulated. 4028 */ 4029 static void 4030 _cache_cleanpos(long count) 4031 { 4032 static volatile int rover; 4033 struct nchash_head *nchpp; 4034 struct namecache *ncp; 4035 int rover_copy; 4036 4037 /* 4038 * Attempt to clean out the specified number of positive cache 4039 * entries. 4040 */ 4041 while (count > 0) { 4042 rover_copy = ++rover; /* MPSAFEENOUGH */ 4043 cpu_ccfence(); 4044 nchpp = NCHHASH(rover_copy); 4045 4046 if (TAILQ_FIRST(&nchpp->list) == NULL) { 4047 --count; 4048 continue; 4049 } 4050 4051 /* 4052 * Cycle ncp on list, ignore and do not move DUMMY 4053 * ncps. These are temporary list iterators. 4054 * 4055 * We must cycle the ncp to the end of the list to 4056 * ensure that all ncp's have an equal chance of 4057 * being removed.
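 *
 * (Added sketch of the rotation used both here and in _cache_cleanneg()
 * above; the list spinlock is held only for the rotation itself:
 *
 *	spin_lock(list spinlock);
 *	ncp = first suitable entry on the list;
 *	TAILQ_REMOVE(list, ncp); TAILQ_INSERT_TAIL(list, ncp);
 *	_cache_hold(ncp);
 *	spin_unlock(list spinlock);
 *	attempt _cache_lock_special(ncp); zap on success, else just drop
 *
 * which gives a cheap round-robin over candidates without holding the
 * spinlock across cache_zap().)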
4058 */ 4059 spin_lock(&nchpp->spin); 4060 ncp = TAILQ_FIRST(&nchpp->list); 4061 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 4062 ncp = TAILQ_NEXT(ncp, nc_hash); 4063 if (ncp) { 4064 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 4065 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 4066 _cache_hold(ncp); 4067 } 4068 spin_unlock(&nchpp->spin); 4069 4070 if (ncp) { 4071 if (_cache_lock_special(ncp) == 0) { 4072 cache_zap(ncp); 4073 } else { 4074 _cache_drop(ncp); 4075 } 4076 } 4077 --count; 4078 } 4079 } 4080 4081 /* 4082 * This is a kitchen sink function to clean out ncps which we 4083 * tried to zap from cache_drop() but failed because we were 4084 * unable to acquire the parent lock. 4085 * 4086 * Such entries can also be removed via cache_inval_vp(), such 4087 * as when unmounting. 4088 */ 4089 static void 4090 _cache_cleandefered(void) 4091 { 4092 struct nchash_head *nchpp; 4093 struct namecache *ncp; 4094 struct namecache dummy; 4095 int i; 4096 4097 /* 4098 * Create a list iterator. DUMMY indicates that this is a list 4099 * iterator, DESTROYED prevents matches by lookup functions. 4100 */ 4101 numdefered = 0; 4102 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 4103 bzero(&dummy, sizeof(dummy)); 4104 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 4105 dummy.nc_refs = 1; 4106 4107 for (i = 0; i <= nchash; ++i) { 4108 nchpp = &nchashtbl[i]; 4109 4110 spin_lock(&nchpp->spin); 4111 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 4112 ncp = &dummy; 4113 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4114 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4115 continue; 4116 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4117 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4118 _cache_hold(ncp); 4119 spin_unlock(&nchpp->spin); 4120 if (_cache_lock_nonblock(ncp) == 0) { 4121 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4122 _cache_unlock(ncp); 4123 } 4124 _cache_drop(ncp); 4125 spin_lock(&nchpp->spin); 4126 ncp = &dummy; 4127 } 4128 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4129 spin_unlock(&nchpp->spin); 4130 } 4131 } 4132 4133 /* 4134 * Name cache initialization, from vfsinit() when we are booting 4135 */ 4136 void 4137 nchinit(void) 4138 { 4139 struct pcpu_ncache *pn; 4140 globaldata_t gd; 4141 int i; 4142 4143 /* 4144 * Per-cpu accounting and negative hit list 4145 */ 4146 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4147 M_VFSCACHE, M_WAITOK|M_ZERO); 4148 for (i = 0; i < ncpus; ++i) { 4149 pn = &pcpu_ncache[i]; 4150 TAILQ_INIT(&pn->neg_list); 4151 spin_init(&pn->neg_spin, "ncneg"); 4152 spin_init(&pn->umount_spin, "ncumm"); 4153 } 4154 4155 /* 4156 * Initialise per-cpu namecache effectiveness statistics. 4157 */ 4158 for (i = 0; i < ncpus; ++i) { 4159 gd = globaldata_find(i); 4160 gd->gd_nchstats = &nchstats[i]; 4161 } 4162 4163 /* 4164 * Create a generous namecache hash table 4165 */ 4166 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4167 sizeof(struct nchash_head), 4168 M_VFSCACHE, &nchash); 4169 for (i = 0; i <= (int)nchash; ++i) { 4170 TAILQ_INIT(&nchashtbl[i].list); 4171 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4172 } 4173 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4174 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4175 nclockwarn = 5 * hz; 4176 } 4177 4178 /* 4179 * Called from start_init() to bootstrap the root filesystem. Returns 4180 * a referenced, unlocked namecache record. 
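 *
 * (Added, hypothetical usage sketch - not a quote of an actual call
 * site: bootstrap code holding a mount and its root vnode could
 * populate the mount's namecache handle with
 *
 *	cache_allocroot(&mp->mnt_ncmountpt, mp, rootvp);
 *
 * after which the handle holds a reference on the new ncp and, via
 * _cache_mntref(), on mp.)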
4181 */ 4182 void 4183 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 4184 { 4185 nch->ncp = cache_alloc(0); 4186 nch->mount = mp; 4187 _cache_mntref(mp); 4188 if (vp) 4189 _cache_setvp(nch->mount, nch->ncp, vp); 4190 } 4191 4192 /* 4193 * vfs_cache_setroot() 4194 * 4195 * Create an association between the root of our namecache and 4196 * the root vnode. This routine may be called several times during 4197 * booting. 4198 * 4199 * If the caller intends to save the returned namecache pointer somewhere 4200 * it must cache_hold() it. 4201 */ 4202 void 4203 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 4204 { 4205 struct vnode *ovp; 4206 struct nchandle onch; 4207 4208 ovp = rootvnode; 4209 onch = rootnch; 4210 rootvnode = nvp; 4211 if (nch) 4212 rootnch = *nch; 4213 else 4214 cache_zero(&rootnch); 4215 if (ovp) 4216 vrele(ovp); 4217 if (onch.ncp) 4218 cache_drop(&onch); 4219 } 4220 4221 /* 4222 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 4223 * topology and is being removed as quickly as possible. The new VOP_N*() 4224 * API calls are required to make specific adjustments using the supplied 4225 * ncp pointers rather than just bogusly purging random vnodes. 4226 * 4227 * Invalidate all namecache entries associated with a particular vnode as 4228 * well as any direct children of that vnode in the namecache. This is a 4229 * 'catch all' purge used by filesystems that do not know any better. 4230 * 4231 * Note that the linkage between the vnode and its namecache entries will 4232 * be removed, but the namecache entries themselves might stay put due to 4233 * active references from elsewhere in the system or due to the existence of 4234 * the children. The namecache topology is left intact even if we do not 4235 * know what the vnode association is. Such entries will be marked 4236 * NCF_UNRESOLVED.
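 *
 * (Added usage note, illustrative only: a filesystem that cannot make
 * the finer-grained VOP_N*() adjustments might simply call
 *
 *	cache_purge(vp);
 *
 * from its vnode teardown path; that is exactly the
 * CINV_DESTROY | CINV_CHILDREN invalidation performed below.)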
4237 */ 4238 void 4239 cache_purge(struct vnode *vp) 4240 { 4241 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 4242 } 4243 4244 static int disablecwd; 4245 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 4246 "Disable getcwd"); 4247 4248 static u_long numcwdcalls; 4249 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0, 4250 "Number of current directory resolution calls"); 4251 static u_long numcwdfailnf; 4252 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0, 4253 "Number of current directory failures due to lack of file"); 4254 static u_long numcwdfailsz; 4255 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0, 4256 "Number of current directory failures due to large result"); 4257 static u_long numcwdfound; 4258 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0, 4259 "Number of current directory resolution successes"); 4260 4261 /* 4262 * MPALMOSTSAFE 4263 */ 4264 int 4265 sys___getcwd(struct __getcwd_args *uap) 4266 { 4267 u_int buflen; 4268 int error; 4269 char *buf; 4270 char *bp; 4271 4272 if (disablecwd) 4273 return (ENODEV); 4274 4275 buflen = uap->buflen; 4276 if (buflen == 0) 4277 return (EINVAL); 4278 if (buflen > MAXPATHLEN) 4279 buflen = MAXPATHLEN; 4280 4281 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4282 bp = kern_getcwd(buf, buflen, &error); 4283 if (error == 0) 4284 error = copyout(bp, uap->buf, strlen(bp) + 1); 4285 kfree(buf, M_TEMP); 4286 return (error); 4287 } 4288 4289 char * 4290 kern_getcwd(char *buf, size_t buflen, int *error) 4291 { 4292 struct proc *p = curproc; 4293 char *bp; 4294 int i, slash_prefixed; 4295 struct filedesc *fdp; 4296 struct nchandle nch; 4297 struct namecache *ncp; 4298 4299 numcwdcalls++; 4300 bp = buf; 4301 bp += buflen - 1; 4302 *bp = '\0'; 4303 fdp = p->p_fd; 4304 slash_prefixed = 0; 4305 4306 nch = fdp->fd_ncdir; 4307 ncp = nch.ncp; 4308 if (ncp) 4309 _cache_hold(ncp); 4310 4311 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4312 nch.mount != fdp->fd_nrdir.mount) 4313 ) { 4314 /* 4315 * While traversing upwards if we encounter the root 4316 * of the current mount we have to skip to the mount point 4317 * in the underlying filesystem. 4318 */ 4319 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4320 nch = nch.mount->mnt_ncmounton; 4321 _cache_drop(ncp); 4322 ncp = nch.ncp; 4323 if (ncp) 4324 _cache_hold(ncp); 4325 continue; 4326 } 4327 4328 /* 4329 * Prepend the path segment 4330 */ 4331 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4332 if (bp == buf) { 4333 numcwdfailsz++; 4334 *error = ERANGE; 4335 bp = NULL; 4336 goto done; 4337 } 4338 *--bp = ncp->nc_name[i]; 4339 } 4340 if (bp == buf) { 4341 numcwdfailsz++; 4342 *error = ERANGE; 4343 bp = NULL; 4344 goto done; 4345 } 4346 *--bp = '/'; 4347 slash_prefixed = 1; 4348 4349 /* 4350 * Go up a directory. This isn't a mount point so we don't 4351 * have to check again. 
4352 */ 4353 while ((nch.ncp = ncp->nc_parent) != NULL) { 4354 if (ncp_shared_lock_disable) 4355 _cache_lock(ncp); 4356 else 4357 _cache_lock_shared(ncp); 4358 if (nch.ncp != ncp->nc_parent) { 4359 _cache_unlock(ncp); 4360 continue; 4361 } 4362 _cache_hold(nch.ncp); 4363 _cache_unlock(ncp); 4364 break; 4365 } 4366 _cache_drop(ncp); 4367 ncp = nch.ncp; 4368 } 4369 if (ncp == NULL) { 4370 numcwdfailnf++; 4371 *error = ENOENT; 4372 bp = NULL; 4373 goto done; 4374 } 4375 if (!slash_prefixed) { 4376 if (bp == buf) { 4377 numcwdfailsz++; 4378 *error = ERANGE; 4379 bp = NULL; 4380 goto done; 4381 } 4382 *--bp = '/'; 4383 } 4384 numcwdfound++; 4385 *error = 0; 4386 done: 4387 if (ncp) 4388 _cache_drop(ncp); 4389 return (bp); 4390 } 4391 4392 /* 4393 * Thus begins the fullpath magic. 4394 * 4395 * The passed nchp is referenced but not locked. 4396 */ 4397 static int disablefullpath; 4398 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 4399 &disablefullpath, 0, 4400 "Disable fullpath lookups"); 4401 4402 int 4403 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 4404 char **retbuf, char **freebuf, int guess) 4405 { 4406 struct nchandle fd_nrdir; 4407 struct nchandle nch; 4408 struct namecache *ncp; 4409 struct mount *mp, *new_mp; 4410 char *bp, *buf; 4411 int slash_prefixed; 4412 int error = 0; 4413 int i; 4414 4415 *retbuf = NULL; 4416 *freebuf = NULL; 4417 4418 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 4419 bp = buf + MAXPATHLEN - 1; 4420 *bp = '\0'; 4421 if (nchbase) 4422 fd_nrdir = *nchbase; 4423 else if (p != NULL) 4424 fd_nrdir = p->p_fd->fd_nrdir; 4425 else 4426 fd_nrdir = rootnch; 4427 slash_prefixed = 0; 4428 nch = *nchp; 4429 ncp = nch.ncp; 4430 if (ncp) 4431 _cache_hold(ncp); 4432 mp = nch.mount; 4433 4434 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 4435 new_mp = NULL; 4436 4437 /* 4438 * If we are asked to guess the upwards path, we do so whenever 4439 * we encounter an ncp marked as a mountpoint. We try to find 4440 * the actual mount point by looking up the mount associated 4441 * with this ncp. 4442 */ 4443 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 4444 new_mp = mount_get_by_nc(ncp); 4445 } 4446 /* 4447 * While traversing upwards, if we encounter the root 4448 * of the current mount we have to skip to the mount point. 4449 */ 4450 if (ncp == mp->mnt_ncmountpt.ncp) { 4451 new_mp = mp; 4452 } 4453 if (new_mp) { 4454 nch = new_mp->mnt_ncmounton; 4455 _cache_drop(ncp); 4456 ncp = nch.ncp; 4457 if (ncp) 4458 _cache_hold(ncp); 4459 mp = nch.mount; 4460 continue; 4461 } 4462 4463 /* 4464 * Prepend the path segment 4465 */ 4466 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4467 if (bp == buf) { 4468 kfree(buf, M_TEMP); 4469 error = ENOMEM; 4470 goto done; 4471 } 4472 *--bp = ncp->nc_name[i]; 4473 } 4474 if (bp == buf) { 4475 kfree(buf, M_TEMP); 4476 error = ENOMEM; 4477 goto done; 4478 } 4479 *--bp = '/'; 4480 slash_prefixed = 1; 4481 4482 /* 4483 * Go up a directory. This isn't a mount point so we don't 4484 * have to check again. 4485 * 4486 * We can only safely access nc_parent with ncp held locked.
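 *
 * (Added sketch of the revalidation dance performed by the loop below
 * and by the matching loop in kern_getcwd() above:
 *
 *	parent = ncp->nc_parent;		unlocked snapshot
 *	_cache_lock_shared(ncp);
 *	if (parent != ncp->nc_parent)		ncp moved, e.g. a rename
 *		unlock and retry;
 *	_cache_hold(parent);			parent is now stable
 *	_cache_unlock(ncp);
 *
 * The snapshot is only trusted once it has been re-read under the
 * lock.)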
4487 */ 4488 while ((nch.ncp = ncp->nc_parent) != NULL) { 4489 _cache_lock_shared(ncp); 4490 if (nch.ncp != ncp->nc_parent) { 4491 _cache_unlock(ncp); 4492 continue; 4493 } 4494 _cache_hold(nch.ncp); 4495 _cache_unlock(ncp); 4496 break; 4497 } 4498 _cache_drop(ncp); 4499 ncp = nch.ncp; 4500 } 4501 if (ncp == NULL) { 4502 kfree(buf, M_TEMP); 4503 error = ENOENT; 4504 goto done; 4505 } 4506 4507 if (!slash_prefixed) { 4508 if (bp == buf) { 4509 kfree(buf, M_TEMP); 4510 error = ENOMEM; 4511 goto done; 4512 } 4513 *--bp = '/'; 4514 } 4515 *retbuf = bp; 4516 *freebuf = buf; 4517 error = 0; 4518 done: 4519 if (ncp) 4520 _cache_drop(ncp); 4521 return(error); 4522 } 4523 4524 int 4525 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4526 char **freebuf, int guess) 4527 { 4528 struct namecache *ncp; 4529 struct nchandle nch; 4530 int error; 4531 4532 *freebuf = NULL; 4533 if (disablefullpath) 4534 return (ENODEV); 4535 4536 if (p == NULL) 4537 return (EINVAL); 4538 4539 /* vn is NULL, client wants us to use p->p_textvp */ 4540 if (vn == NULL) { 4541 if ((vn = p->p_textvp) == NULL) 4542 return (EINVAL); 4543 } 4544 spin_lock_shared(&vn->v_spin); 4545 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4546 if (ncp->nc_nlen) 4547 break; 4548 } 4549 if (ncp == NULL) { 4550 spin_unlock_shared(&vn->v_spin); 4551 return (EINVAL); 4552 } 4553 _cache_hold(ncp); 4554 spin_unlock_shared(&vn->v_spin); 4555 4556 nch.ncp = ncp; 4557 nch.mount = vn->v_mount; 4558 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4559 _cache_drop(ncp); 4560 return (error); 4561 } 4562 4563 void 4564 vfscache_rollup_cpu(struct globaldata *gd) 4565 { 4566 struct pcpu_ncache *pn; 4567 long count; 4568 4569 if (pcpu_ncache == NULL) 4570 return; 4571 pn = &pcpu_ncache[gd->gd_cpuid]; 4572 4573 if (pn->vfscache_count) { 4574 count = atomic_swap_long(&pn->vfscache_count, 0); 4575 atomic_add_long(&vfscache_count, count); 4576 } 4577 if (pn->vfscache_leafs) { 4578 count = atomic_swap_long(&pn->vfscache_leafs, 0); 4579 atomic_add_long(&vfscache_leafs, count); 4580 } 4581 if (pn->vfscache_negs) { 4582 count = atomic_swap_long(&pn->vfscache_negs, 0); 4583 atomic_add_long(&vfscache_negs, count); 4584 } 4585 if (pn->numdefered) { 4586 count = atomic_swap_long(&pn->numdefered, 0); 4587 atomic_add_long(&numdefered, count); 4588 } 4589 } 4590
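/*
 * Usage sketch (added for illustration, not compiled into the kernel):
 * a caller that wants the path of a process's text vnode could use
 * vn_fullpath() along the following lines.  On success *retbuf points
 * into the M_TEMP buffer returned in *freebuf, which the caller must
 * kfree().
 *
 *	char *retbuf;
 *	char *freebuf;
 *
 *	if (vn_fullpath(p, NULL, &retbuf, &freebuf, 0) == 0) {
 *		kprintf("text path: %s\n", retbuf);
 *		kfree(freebuf, M_TEMP);
 *	}
 */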