/*
 * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin.
 *
 * MPSAFE RULES:
 *
 * (1) A ncp must be referenced before it can be locked.
 *
 * (2) A ncp must be locked in order to modify it.
 *
 * (3) ncp locks are always ordered child -> parent.  That may seem
 *     backwards but forward scans use the hash table and thus can hold
 *     the parent unlocked when traversing downward.
 *
 *     This allows insert/rename/delete/dot-dot and other operations
 *     to use ncp->nc_parent links.
 *
 *     This also prevents a locked up e.g. NFS node from creating a
 *     chain reaction all the way back to the root vnode / namecache.
 *
 * (4) parent linkages require both the parent and child to be locked.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	16301	/* prime number */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;		/* 8 bytes */
	long	pad01;			/* 8 bytes */
};

struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	int isneg;		/* if != 0 mp is originator and not target */
} __cachealign;

struct pcpu_ncache {
	struct spinlock		neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long			neg_count;
	long			vfscache_negs;
	long			vfscache_count;
	long			vfscache_leafs;
} __cachealign;

static struct nchash_head	*nchashtbl;
static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
 */
static int	ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

static int	ncnegflush = 10;	/* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

static int	ncposflush = 10;	/* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

static int	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

static int	nclockwarn;		/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

static int	numdefered;		/* number of deferred zaps */
SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred zaps");

static int	ncposlimit;		/* limit on positive entries */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Limit on the number of positive namecache entries");

static int	ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
	   &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
	   &ncmount_cache_enable, 0, "mount point cache");

static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp);
static struct vnode *cache_dvpref(struct namecache *ncp);
static void _cache_lock(struct namecache *ncp);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);
#if 0
static void vfscache_rollup_all(void);
#endif

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of namecache leaf entries");

struct nchstats nchstats[SMP_MAXCPU];

/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static struct namecache *cache_zap(struct namecache *ncp, int nonblock);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
 */
#define MNTCACHE_COUNT	5

struct mntcache {
	struct mount	*mntary[MNTCACHE_COUNT];
	struct namecache *ncp1;
	struct namecache *ncp2;
	struct nchandle	 ncdir;
	int		iter;
	int		unused01;
} __cachealign;

static struct mntcache	pcpu_mntcache[MAXCPU];

static
void
_cache_mntref(struct mount *mp)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	int i;

	for (i = 0; i < MNTCACHE_COUNT; ++i) {
		if (cache->mntary[i] != mp)
			continue;
		if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL))
			return;
	}
	atomic_add_int(&mp->mnt_refs, 1);
}

static
void
_cache_mntrel(struct mount *mp)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	int i;

	for (i = 0; i < MNTCACHE_COUNT; ++i) {
		if (cache->mntary[i] == NULL) {
			mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
			if (mp == NULL)
				return;
		}
	}
	i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT);
	mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
	if (mp)
		atomic_add_int(&mp->mnt_refs, -1);
}

/*
 * Clears all cached mount points on all cpus.  This routine should only
 * be called when we are waiting for a mount to clear, e.g. so we can
 * unmount.
 */
void
cache_clearmntcache(void)
{
	int n;

	for (n = 0; n < ncpus; ++n) {
		struct mntcache *cache = &pcpu_mntcache[n];
		struct namecache *ncp;
		struct mount *mp;
		int i;

		for (i = 0; i < MNTCACHE_COUNT; ++i) {
			if (cache->mntary[i]) {
				mp = atomic_swap_ptr(
					(void *)&cache->mntary[i], NULL);
				if (mp)
					atomic_add_int(&mp->mnt_refs, -1);
			}
		}
		if (cache->ncp1) {
			ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL);
			if (ncp)
				_cache_drop(ncp);
		}
		if (cache->ncp2) {
			ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL);
			if (ncp)
				_cache_drop(ncp);
		}
		if (cache->ncdir.ncp) {
			ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL);
			if (ncp)
				_cache_drop(ncp);
		}
		if (cache->ncdir.mount) {
			mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL);
			if (mp)
				atomic_add_int(&mp->mnt_refs, -1);
		}
	}
}


/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  This function prevents
 * the namespace from being created or destroyed by accessors other than
 * the lock holder.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
static
void
_cache_lock(struct namecache *ncp)
{
	thread_t td;
	int didwarn;
	int begticks;
	int error;
	u_int count;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;
	begticks = 0;
	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		if (ncp->nc_locktd == td) {
			KKASSERT((count & NC_SHLOCK_FLAG) == 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		tsleep_interlock(&ncp->nc_locktd, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_EXLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		if (begticks == 0)
			begticks = ticks;
		error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
			       "clock", nclockwarn);
		if (error == EWOULDBLOCK) {
			if (didwarn == 0) {
				didwarn = ticks;
				kprintf("[diagnostic] cache_lock: "
					"%s blocked on %p %08x",
					td->td_comm, ncp, count);
				kprintf(" \"%*.*s\"\n",
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after "
			"%d secs\n",
			td->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks + (hz / 2) - begticks) / hz);
	}
}

/*
 * The shared lock works similarly to the exclusive lock except
 * nc_locktd is left NULL and we need an interlock (VHOLD) to
 * prevent vhold() races, since the moment our cmpset_int succeeds
 * another cpu can come in and get its own shared lock.
 *
 * A critical section is needed to prevent interruption during the
 * VHOLD interlock.
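 *
 * (The critical section covers the window between setting NC_SHLOCK_VHOLD
 * in the cmpset and clearing it again once the vhold() has completed, so
 * the interlock cannot be stalled by an interrupt on this cpu.)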
 */
static
void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn;
	int error;
	u_int count;
	u_int optreq = NC_EXLOCK_REQ;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		if ((count & ~NC_SHLOCK_REQ) == 0) {
			crit_enter();
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
				      count,
				      (count + 1) | NC_SHLOCK_FLAG |
						    NC_SHLOCK_VHOLD)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				atomic_clear_int(&ncp->nc_lockstatus,
						 NC_SHLOCK_VHOLD);
				crit_exit();
				break;
			}
			/* cmpset failed */
			crit_exit();
			continue;
		}

		/*
		 * If already held shared we can just bump the count, but
		 * only allow this if nobody is trying to get the lock
		 * exclusively.  If we are blocking too long ignore excl
		 * requests (which can race/deadlock us).
		 *
		 * VHOLD is a bit of a hack.  Even though we successfully
		 * added another shared ref, the cpu that got the first
		 * shared ref might not yet have held the vnode.
		 */
		if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
					cpu_pause();
				break;
			}
			continue;
		}
		tsleep_interlock(ncp, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_SHLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
		if (error == EWOULDBLOCK) {
			optreq = 0;
			if (didwarn == 0) {
				didwarn = ticks - nclockwarn;
				kprintf("[diagnostic] cache_lock_shared: "
					"%s blocked on %p %08x "
					"\"%*.*s\"\n",
					curthread->td_comm, ncp, count,
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Lock ncp exclusively, return 0 on success.
 *
 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 *	 such as the case where one of its children is locked.
 */
static
int
_cache_lock_nonblock(struct namecache *ncp)
{
	thread_t td;
	u_int count;

	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;

		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		if (ncp->nc_locktd == td) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		return(EWOULDBLOCK);
	}
	return(0);
}

/*
 * The shared lock works similarly to the exclusive lock except
 * nc_locktd is left NULL and we need an interlock (VHOLD) to
 * prevent vhold() races, since the moment our cmpset_int succeeds
 * another cpu can come in and get its own shared lock.
 *
 * A critical section is needed to prevent interruption during the
 * VHOLD interlock.
 */
static
int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	u_int count;

	for (;;) {
		count = ncp->nc_lockstatus;

		if ((count & ~NC_SHLOCK_REQ) == 0) {
			crit_enter();
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
				      count,
				      (count + 1) | NC_SHLOCK_FLAG |
						    NC_SHLOCK_VHOLD)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				atomic_clear_int(&ncp->nc_lockstatus,
						 NC_SHLOCK_VHOLD);
				crit_exit();
				break;
			}
			/* cmpset failed */
			crit_exit();
			continue;
		}

		/*
		 * If already held shared we can just bump the count, but
		 * only allow this if nobody is trying to get the lock
		 * exclusively.
		 *
		 * VHOLD is a bit of a hack.  Even though we successfully
		 * added another shared ref, the cpu that got the first
		 * shared ref might not yet have held the vnode.
		 */
		if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
		    NC_SHLOCK_FLAG) {
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
					cpu_pause();
				break;
			}
			continue;
		}
		return(EWOULDBLOCK);
	}
	return(0);
}

/*
 * Helper function
 *
 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 *
 *	 nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
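 *	 Once nc_lockstatus is released another thread can immediately win
 *	 the exclusive lock and set its own nc_locktd; clearing the field
 *	 first avoids wiping out the new owner's setting.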
 */
static
void
_cache_unlock(struct namecache *ncp)
{
	thread_t td __debugvar = curthread;
	u_int count;
	u_int ncount;
	struct vnode *dropvp;

	KKASSERT(ncp->nc_refs >= 0);
	KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
	KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);

	count = ncp->nc_lockstatus;
	cpu_ccfence();

	/*
	 * Clear nc_locktd prior to the atomic op (excl lock only)
	 */
	if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
		ncp->nc_locktd = NULL;
	dropvp = NULL;

	for (;;) {
		if ((count &
		     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
			dropvp = ncp->nc_vp;
			if (count & NC_EXLOCK_REQ)
				ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
			else
				ncount = 0;

			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, ncount)) {
				if (count & NC_EXLOCK_REQ)
					wakeup(&ncp->nc_locktd);
				else if (count & NC_SHLOCK_REQ)
					wakeup(ncp);
				break;
			}
			dropvp = NULL;
		} else {
			KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 1);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count - 1)) {
				break;
			}
		}
		count = ncp->nc_lockstatus;
		cpu_ccfence();
	}

	/*
	 * Don't actually drop the vp until we successfully clean out
	 * the lock, otherwise we may race another shared lock.
	 */
	if (dropvp)
		vdrop(dropvp);
}

static
int
_cache_lockstatus(struct namecache *ncp)
{
	if (ncp->nc_locktd == curthread)
		return(LK_EXCLUSIVE);
	if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
		return(LK_SHARED);
	return(-1);
}

/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already at least 1.
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't use spinlocks ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	atomic_add_int(&ncp->nc_refs, 1);
	return(ncp);
}

/*
 * Drop a cache entry, taking care to deal with races.
 *
 * For potential 1->0 transitions we must hold the ncp lock to safely
 * test its flags.  An unresolved entry with no children must be zapped
 * to avoid leaks.
 *
 * The call to cache_zap() itself will handle all remaining races and
 * will decrement the ncp's refs regardless.  If we are resolved or
 * have children nc_refs can safely be dropped to 0 without having to
 * zap the entry.
 *
 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 *
 * NOTE: cache_zap() may return a non-NULL referenced parent which must
 *	 be dropped in a loop.
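 *	 The while-loop below keeps iterating on whatever parent cache_zap()
 *	 hands back until the chain has been fully dropped.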
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	int refs;

	while (ncp) {
		KKASSERT(ncp->nc_refs > 0);
		refs = ncp->nc_refs;

		if (refs == 1) {
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				if ((ncp->nc_flag & NCF_UNRESOLVED) &&
				    TAILQ_EMPTY(&ncp->nc_list)) {
					ncp = cache_zap(ncp, 1);
					continue;
				}
				if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
					_cache_unlock(ncp);
					break;
				}
				_cache_unlock(ncp);
			}
		} else {
			if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
				break;
		}
		cpu_pause();
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.
 *
 * NOTE: The hash table spinlock is held during this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);

		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * If this is the last child of the parent the cache_drop(par) will
 * attempt to recursively zap the parent.
 *
 * ncp must be locked.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
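 *
 * Locking the parent from an already-locked child follows the child ->
 * parent lock order described in the MPSAFE rules at the top of this file.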
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct namecache *par;
	struct vnode *dropvp;

	if ((par = ncp->nc_parent) != NULL) {
		KKASSERT(ncp->nc_parent == par);
		_cache_hold(par);
		_cache_lock(par);
		spin_lock(&ncp->nc_head->spin);

		/*
		 * Remove from hash table and parent, adjust accounting
		 */
		TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_count, -1);
		if (TAILQ_EMPTY(&ncp->nc_list))
			atomic_add_long(&pn->vfscache_leafs, -1);

		dropvp = NULL;
		if (TAILQ_EMPTY(&par->nc_list)) {
			atomic_add_long(&pn->vfscache_leafs, 1);
			if (par->nc_vp)
				dropvp = par->nc_vp;
		}
		spin_unlock(&ncp->nc_head->spin);
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		_cache_unlock(par);
		_cache_drop(par);

		/*
		 * We can only safely vdrop with no spinlocks held.
		 */
		if (dropvp)
			vdrop(dropvp);
	}
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;

	TAILQ_INIT(&ncp->nc_list);
	_cache_lock(ncp);
	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 */
static void
_cache_free(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);
}

/*
 * [re]initialize a nchandle.
 */
void
cache_zero(struct nchandle *nch)
{
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Ref and deref a namecache structure.
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
	_cache_hold(nch->ncp);
	_cache_mntref(nch->mount);
	return(nch);
}

/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
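 *
 * If the ncp matches one of the per-cpu ncp1/ncp2 cache slots we can
 * consume the cached reference instead of atomically bumping nc_refs.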
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	struct namecache *ncp;

	*target = *nch;
	_cache_mntref(target->mount);
	ncp = target->ncp;
	if (ncp) {
		if (ncp == cache->ncp1) {
			if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL))
				return;
		}
		if (ncp == cache->ncp2) {
			if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL))
				return;
		}
		_cache_hold(ncp);
	}
}

/*
 * Caller wants to copy the current directory, copy it out from our
 * pcpu cache if possible (the entire critical path is just two localized
 * cmpset ops).  If the pcpu cache has a snapshot at all it will be a
 * valid one, so we don't have to lock p->p_fd even though we are loading
 * two fields.
 *
 * This has a limited effect since nlookup must still ref and shlock the
 * vnode to check perms.  We do avoid the per-proc spin-lock though, which
 * can aid threaded programs.
 */
void
cache_copy_ncdir(struct proc *p, struct nchandle *target)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];

	*target = p->p_fd->fd_ncdir;
	if (target->ncp == cache->ncdir.ncp &&
	    target->mount == cache->ncdir.mount) {
		if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp,
				      target->ncp, NULL)) {
			if (atomic_cmpset_ptr((void *)&cache->ncdir.mount,
					      target->mount, NULL)) {
				/* CRITICAL PATH */
				return;
			}
			_cache_drop(target->ncp);
		}
	}
	spin_lock_shared(&p->p_fd->fd_spin);
	cache_copy(&p->p_fd->fd_ncdir, target);
	spin_unlock_shared(&p->p_fd->fd_spin);
}

void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	_cache_mntref(mp);
	_cache_mntrel(nch->mount);
	nch->mount = mp;
}

void
cache_drop(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Drop the nchandle, but try to cache the ref to avoid global atomic
 * ops.  This is typically done on the system root and jail root nchandles.
 */
void
cache_drop_and_cache(struct nchandle *nch)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	struct namecache *ncp;

	_cache_mntrel(nch->mount);
	ncp = nch->ncp;
	if (cache->ncp1 == NULL) {
		ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
		if (ncp == NULL)
			goto done;
	}
	if (cache->ncp2 == NULL) {
		ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
		if (ncp == NULL)
			goto done;
	}
	if (++cache->iter & 1)
		ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
	else
		ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
	if (ncp)
		_cache_drop(ncp);
done:
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * We are dropping what the caller believes is the current directory,
 * unconditionally store it in our pcpu cache.  Anything already in
 * the cache will be discarded.
 */
void
cache_drop_ncdir(struct nchandle *nch)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];

	nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp);
	nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount);
	if (nch->ncp)
		_cache_drop(nch->ncp);
	if (nch->mount)
		_cache_mntrel(nch->mount);
	nch->ncp = NULL;
	nch->mount = NULL;
}

int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}

void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}

/*
 * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 * is responsible for checking both for validity on return as they
 * may have become invalid.
 *
 * We have to deal with potential deadlocks here, just ping pong
 * the lock until we get it (we will always block somewhere when
 * looping so this is not cpu-intensive).
 *
 * which = 0	nch1 not locked, nch2 is locked
 * which = 1	nch1 is locked, nch2 is not locked
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
	     struct nchandle *nch2, struct ucred *cred2)
{
	int which;

	which = 0;

	for (;;) {
		if (which == 0) {
			if (cache_lock_nonblock(nch1) == 0) {
				cache_resolve(nch1, cred1);
				break;
			}
			cache_unlock(nch2);
			cache_lock(nch1);
			cache_resolve(nch1, cred1);
			which = 1;
		} else {
			if (cache_lock_nonblock(nch2) == 0) {
				cache_resolve(nch2, cred2);
				break;
			}
			cache_unlock(nch1);
			cache_lock(nch2);
			cache_resolve(nch2, cred2);
			which = 0;
		}
	}
}

int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return(ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
 */
static
struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			_cache_unlock(ncp);
			ncp = _cache_get(ncp);
			_cache_drop(ncp);
		}
	} else {
		_cache_unlock(ncp);
		ncp = _cache_get(ncp);
		_cache_drop(ncp);
	}
	return(ncp);
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if ((ncp->nc_lockstatus &
		     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return(0);
		}
		_cache_unlock(ncp);
	}
	return(EWOULDBLOCK);
}

/*
 * This function tries to get a shared lock but will back-off to an exclusive
 * lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 */
static int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return(0);
			}
		}
		_cache_unlock(ncp);
		return(EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (ncp->nc_locktd == curthread) {
		_cache_lock(ncp);
		return(0);
	}
	_cache_lock_shared(ncp);
	return(0);
}


/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	_cache_mntref(target->mount);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	_cache_mntref(target->mount);
}

/*
 *
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

/*
 *
 */
void
cache_put(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	if (vp != NULL) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		spin_unlock(&vp->v_spin);
		if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
			vhold(vp);

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}
		ncp->nc_error = 0;

		/*
		 * XXX: this is a hack to work around the lack of a real
		 *	pfs vfs implementation
		 */
		if (mp != NULL)
			if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
				vp->v_pfsmp = mp;
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

		ncp->nc_vp = NULL;
		ncp->nc_negcpu = mycpu->gd_cpuid;
		spin_lock(&pn->neg_spin);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		++pn->neg_count;
		spin_unlock(&pn->neg_spin);
		atomic_add_long(&pn->vfscache_negs, 1);

		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

/*
 *
 */
void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 *
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with a locked
			 * ncp is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
				vdrop(vp);
		} else {
			struct pcpu_ncache *pn;

			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
	}
}

/*
 * The cache_resolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

/*
 *
 */
void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state.
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			_cache_put(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}
	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {
				_cache_lock(kid);
				if (kid->nc_parent != ncp) {
					kprintf("cache_inval_internal "
						"restartB %s\n",
						ncp->nc_name);
					restart = 1;
					_cache_unlock(kid);
					_cache_drop(kid);
					_cache_lock(ncp);
					break;
				}

				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				_cache_unlock(kid);
			}
			_cache_drop(kid);
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * Clears the universal directory search 'ok' flag.  This flag allows
 * nlookup() to bypass normal vnode checks.  This flag is a cached flag
 * so clearing it simply forces revalidation.
 */
void
cache_inval_wxok(struct vnode *vp)
{
	struct namecache *ncp;

	spin_lock(&vp->v_spin);
	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (ncp->nc_flag & NCF_WXOK)
			atomic_clear_short(&ncp->nc_flag, NCF_WXOK);
	}
	spin_unlock(&vp->v_spin);
}

/*
 * The source ncp has been renamed to the target ncp.  Both fncp and tncp
Both fncp and tncp 1986 * must be locked. The target ncp is destroyed (as a normal rename-over 1987 * would destroy the target file or directory). 1988 * 1989 * Because there may be references to the source ncp we cannot copy its 1990 * contents to the target. Instead the source ncp is relinked as the target 1991 * and the target ncp is removed from the namecache topology. 1992 */ 1993 void 1994 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1995 { 1996 struct namecache *fncp = fnch->ncp; 1997 struct namecache *tncp = tnch->ncp; 1998 struct namecache *tncp_par; 1999 struct nchash_head *nchpp; 2000 u_int32_t hash; 2001 char *oname; 2002 char *nname; 2003 2004 ++fncp->nc_generation; 2005 ++tncp->nc_generation; 2006 if (tncp->nc_nlen) { 2007 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK); 2008 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 2009 nname[tncp->nc_nlen] = 0; 2010 } else { 2011 nname = NULL; 2012 } 2013 2014 /* 2015 * Rename fncp (unlink) 2016 */ 2017 _cache_unlink_parent(fncp); 2018 oname = fncp->nc_name; 2019 fncp->nc_name = nname; 2020 fncp->nc_nlen = tncp->nc_nlen; 2021 if (oname) 2022 kfree(oname, M_VFSCACHE); 2023 2024 tncp_par = tncp->nc_parent; 2025 _cache_hold(tncp_par); 2026 _cache_lock(tncp_par); 2027 2028 /* 2029 * Rename fncp (relink) 2030 */ 2031 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 2032 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 2033 nchpp = NCHHASH(hash); 2034 2035 spin_lock(&nchpp->spin); 2036 _cache_link_parent(fncp, tncp_par, nchpp); 2037 spin_unlock(&nchpp->spin); 2038 2039 _cache_put(tncp_par); 2040 2041 /* 2042 * Get rid of the overwritten tncp (unlink) 2043 */ 2044 _cache_unlink(tncp); 2045 } 2046 2047 /* 2048 * Perform actions consistent with unlinking a file. The passed-in ncp 2049 * must be locked. 2050 * 2051 * The ncp is marked DESTROYED so it no longer shows up in searches, 2052 * and will be physically deleted when the vnode goes away. 2053 * 2054 * If the related vnode has no refs then we cycle it through vget()/vput() 2055 * to (possibly if we don't have a ref race) trigger a deactivation, 2056 * allowing the VFS to trivially detect and recycle the deleted vnode 2057 * via VOP_INACTIVE(). 2058 * 2059 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 2060 * target ncp. 2061 */ 2062 void 2063 cache_unlink(struct nchandle *nch) 2064 { 2065 _cache_unlink(nch->ncp); 2066 } 2067 2068 static void 2069 _cache_unlink(struct namecache *ncp) 2070 { 2071 struct vnode *vp; 2072 2073 /* 2074 * Causes lookups to fail and allows another ncp with the same 2075 * name to be created under ncp->nc_parent. 2076 */ 2077 ncp->nc_flag |= NCF_DESTROYED; 2078 ++ncp->nc_generation; 2079 2080 /* 2081 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 2082 * force action on the 1->0 transition. 2083 */ 2084 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2085 (vp = ncp->nc_vp) != NULL) { 2086 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 2087 if (VREFCNT(vp) <= 0) { 2088 if (vget(vp, LK_SHARED) == 0) 2089 vput(vp); 2090 } 2091 } 2092 } 2093 2094 /* 2095 * Return non-zero if the nch might be associated with an open and/or mmap()'d 2096 * file. The easy solution is to just return non-zero if the vnode has refs. 2097 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 2098 * force the reclaim). 
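 *
 * A brief interlock sketch (an assumption, not code taken from hammer2);
 * the function name and the EBUSY policy are invented for illustration:
 */

#if 0
static int
example_try_reclaim(struct nchandle *nch)
{
	/*
	 * Defer the reclaim while the file may still be open or mapped;
	 * the vnode ref count is the (conservative) indicator used here.
	 */
	if (cache_isopen(nch))
		return (EBUSY);
	/* ... proceed with the filesystem-specific reclaim ... */
	return (0);
}
#endif

/*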
2099 */ 2100 int 2101 cache_isopen(struct nchandle *nch) 2102 { 2103 struct vnode *vp; 2104 struct namecache *ncp = nch->ncp; 2105 2106 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2107 (vp = ncp->nc_vp) != NULL && 2108 VREFCNT(vp)) { 2109 return 1; 2110 } 2111 return 0; 2112 } 2113 2114 2115 /* 2116 * vget the vnode associated with the namecache entry. Resolve the namecache 2117 * entry if necessary. The passed ncp must be referenced and locked. If 2118 * the ncp is resolved it might be locked shared. 2119 * 2120 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 2121 * (depending on the passed lk_type) will be returned in *vpp with an error 2122 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2123 * most typical error is ENOENT, meaning that the ncp represents a negative 2124 * cache hit and there is no vnode to retrieve, but other errors can occur 2125 * too. 2126 * 2127 * The vget() can race a reclaim. If this occurs we re-resolve the 2128 * namecache entry. 2129 * 2130 * There are numerous places in the kernel where vget() is called on a 2131 * vnode while one or more of its namecache entries is locked. Releasing 2132 * a vnode never deadlocks against locked namecache entries (the vnode 2133 * will not get recycled while referenced ncp's exist). This means we 2134 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2135 * lock when acquiring the vp lock or we might cause a deadlock. 2136 * 2137 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2138 * unresolved. If a reclaim race occurs the passed-in ncp will be 2139 * relocked exclusively before being re-resolved. 2140 */ 2141 int 2142 cache_vget(struct nchandle *nch, struct ucred *cred, 2143 int lk_type, struct vnode **vpp) 2144 { 2145 struct namecache *ncp; 2146 struct vnode *vp; 2147 int error; 2148 2149 ncp = nch->ncp; 2150 again: 2151 vp = NULL; 2152 if (ncp->nc_flag & NCF_UNRESOLVED) 2153 error = cache_resolve(nch, cred); 2154 else 2155 error = 0; 2156 2157 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2158 error = vget(vp, lk_type); 2159 if (error) { 2160 /* 2161 * VRECLAIM race 2162 * 2163 * The ncp may have been locked shared, we must relock 2164 * it exclusively before we can set it to unresolved. 2165 */ 2166 if (error == ENOENT) { 2167 kprintf("Warning: vnode reclaim race detected " 2168 "in cache_vget on %p (%s)\n", 2169 vp, ncp->nc_name); 2170 _cache_unlock(ncp); 2171 _cache_lock(ncp); 2172 _cache_setunresolved(ncp); 2173 goto again; 2174 } 2175 2176 /* 2177 * Not a reclaim race, some other error. 2178 */ 2179 KKASSERT(ncp->nc_vp == vp); 2180 vp = NULL; 2181 } else { 2182 KKASSERT(ncp->nc_vp == vp); 2183 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2184 } 2185 } 2186 if (error == 0 && vp == NULL) 2187 error = ENOENT; 2188 *vpp = vp; 2189 return(error); 2190 } 2191 2192 /* 2193 * Similar to cache_vget() but only acquires a ref on the vnode. 2194 * 2195 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2196 * unresolved. If a reclaim race occurs the passed-in ncp will be 2197 * relocked exclusively before being re-resolved. 
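 *
 * A minimal calling sketch (not from the original file) of the common
 * cache_vget() pattern; the helper name and the LK_SHARED choice are
 * illustrative assumptions:
 */

#if 0
static int
example_use_vnode(struct nchandle *nch, struct ucred *cred)
{
	struct vnode *vp;
	int error;

	/*
	 * nch->ncp must be referenced and locked (exclusively if it is
	 * still unresolved).  ENOENT indicates a negative cache hit.
	 */
	error = cache_vget(nch, cred, LK_SHARED, &vp);
	if (error)
		return (error);

	/* ... operate on the referenced, locked vnode ... */

	vput(vp);		/* releases both the lock and the ref */
	return (0);
}
#endif

/*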
2198 */ 2199 int 2200 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2201 { 2202 struct namecache *ncp; 2203 struct vnode *vp; 2204 int error; 2205 2206 ncp = nch->ncp; 2207 again: 2208 vp = NULL; 2209 if (ncp->nc_flag & NCF_UNRESOLVED) 2210 error = cache_resolve(nch, cred); 2211 else 2212 error = 0; 2213 2214 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2215 error = vget(vp, LK_SHARED); 2216 if (error) { 2217 /* 2218 * VRECLAIM race 2219 */ 2220 if (error == ENOENT) { 2221 kprintf("Warning: vnode reclaim race detected " 2222 "in cache_vget on %p (%s)\n", 2223 vp, ncp->nc_name); 2224 _cache_unlock(ncp); 2225 _cache_lock(ncp); 2226 _cache_setunresolved(ncp); 2227 goto again; 2228 } 2229 2230 /* 2231 * Not a reclaim race, some other error. 2232 */ 2233 KKASSERT(ncp->nc_vp == vp); 2234 vp = NULL; 2235 } else { 2236 KKASSERT(ncp->nc_vp == vp); 2237 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2238 /* caller does not want a lock */ 2239 vn_unlock(vp); 2240 } 2241 } 2242 if (error == 0 && vp == NULL) 2243 error = ENOENT; 2244 *vpp = vp; 2245 return(error); 2246 } 2247 2248 /* 2249 * Return a referenced vnode representing the parent directory of 2250 * ncp. 2251 * 2252 * Because the caller has locked the ncp it should not be possible for 2253 * the parent ncp to go away. However, the parent can unresolve its 2254 * dvp at any time so we must be able to acquire a lock on the parent 2255 * to safely access nc_vp. 2256 * 2257 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2258 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2259 * getting destroyed. 2260 * 2261 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2262 * lock on the ncp in question.. 2263 */ 2264 static struct vnode * 2265 cache_dvpref(struct namecache *ncp) 2266 { 2267 struct namecache *par; 2268 struct vnode *dvp; 2269 2270 dvp = NULL; 2271 if ((par = ncp->nc_parent) != NULL) { 2272 _cache_hold(par); 2273 _cache_lock(par); 2274 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2275 if ((dvp = par->nc_vp) != NULL) 2276 vhold(dvp); 2277 } 2278 _cache_unlock(par); 2279 if (dvp) { 2280 if (vget(dvp, LK_SHARED) == 0) { 2281 vn_unlock(dvp); 2282 vdrop(dvp); 2283 /* return refd, unlocked dvp */ 2284 } else { 2285 vdrop(dvp); 2286 dvp = NULL; 2287 } 2288 } 2289 _cache_drop(par); 2290 } 2291 return(dvp); 2292 } 2293 2294 /* 2295 * Convert a directory vnode to a namecache record without any other 2296 * knowledge of the topology. This ONLY works with directory vnodes and 2297 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2298 * returned ncp (if not NULL) will be held and unlocked. 2299 * 2300 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2301 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2302 * for dvp. This will fail only if the directory has been deleted out from 2303 * under the caller. 2304 * 2305 * Callers must always check for a NULL return no matter the value of 'makeit'. 2306 * 2307 * To avoid underflowing the kernel stack each recursive call increments 2308 * the makeit variable. 
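 *
 * A short usage sketch (an assumption, not copied from the NFS server);
 * the helper name is invented:
 */

#if 0
static int
example_nch_from_dirvp(struct vnode *dvp, struct ucred *cred,
		       struct nchandle *nch)
{
	int error;

	/* dvp is referenced but unlocked; makeit != 0 builds the topology */
	error = cache_fromdvp(dvp, cred, 1, nch);
	if (error == 0) {
		/* nch->ncp is held (referenced) but not locked */
		/* ... use the handle ... */
		cache_drop(nch);
	}
	return (error);
}
#endif

/*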
2309 */ 2310 2311 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2312 struct vnode *dvp, char *fakename); 2313 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2314 struct vnode **saved_dvp); 2315 2316 int 2317 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2318 struct nchandle *nch) 2319 { 2320 struct vnode *saved_dvp; 2321 struct vnode *pvp; 2322 char *fakename; 2323 int error; 2324 2325 nch->ncp = NULL; 2326 nch->mount = dvp->v_mount; 2327 saved_dvp = NULL; 2328 fakename = NULL; 2329 2330 /* 2331 * Handle the makeit == 0 degenerate case 2332 */ 2333 if (makeit == 0) { 2334 spin_lock_shared(&dvp->v_spin); 2335 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2336 if (nch->ncp) 2337 cache_hold(nch); 2338 spin_unlock_shared(&dvp->v_spin); 2339 } 2340 2341 /* 2342 * Loop until resolution, inside code will break out on error. 2343 */ 2344 while (makeit) { 2345 /* 2346 * Break out if we successfully acquire a working ncp. 2347 */ 2348 spin_lock_shared(&dvp->v_spin); 2349 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2350 if (nch->ncp) { 2351 cache_hold(nch); 2352 spin_unlock_shared(&dvp->v_spin); 2353 break; 2354 } 2355 spin_unlock_shared(&dvp->v_spin); 2356 2357 /* 2358 * If dvp is the root of its filesystem it should already 2359 * have a namecache pointer associated with it as a side 2360 * effect of the mount, but it may have been disassociated. 2361 */ 2362 if (dvp->v_flag & VROOT) { 2363 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2364 error = cache_resolve_mp(nch->mount); 2365 _cache_put(nch->ncp); 2366 if (ncvp_debug) { 2367 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2368 dvp->v_mount, error); 2369 } 2370 if (error) { 2371 if (ncvp_debug) 2372 kprintf(" failed\n"); 2373 nch->ncp = NULL; 2374 break; 2375 } 2376 if (ncvp_debug) 2377 kprintf(" succeeded\n"); 2378 continue; 2379 } 2380 2381 /* 2382 * If we are recursed too deeply resort to an O(n^2) 2383 * algorithm to resolve the namecache topology. The 2384 * resolved pvp is left referenced in saved_dvp to 2385 * prevent the tree from being destroyed while we loop. 2386 */ 2387 if (makeit > 20) { 2388 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2389 if (error) { 2390 kprintf("lookupdotdot(longpath) failed %d " 2391 "dvp %p\n", error, dvp); 2392 nch->ncp = NULL; 2393 break; 2394 } 2395 continue; 2396 } 2397 2398 /* 2399 * Get the parent directory and resolve its ncp. 2400 */ 2401 if (fakename) { 2402 kfree(fakename, M_TEMP); 2403 fakename = NULL; 2404 } 2405 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2406 &fakename); 2407 if (error) { 2408 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2409 break; 2410 } 2411 vn_unlock(pvp); 2412 2413 /* 2414 * Reuse makeit as a recursion depth counter. On success 2415 * nch will be fully referenced. 2416 */ 2417 cache_fromdvp(pvp, cred, makeit + 1, nch); 2418 vrele(pvp); 2419 if (nch->ncp == NULL) 2420 break; 2421 2422 /* 2423 * Do an inefficient scan of pvp (embodied by ncp) to look 2424 * for dvp. This will create a namecache record for dvp on 2425 * success. We loop up to recheck on success. 2426 * 2427 * ncp and dvp are both held but not locked. 
2428 */ 2429 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2430 if (error) { 2431 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2432 pvp, nch->ncp->nc_name, dvp); 2433 cache_drop(nch); 2434 /* nch was NULLed out, reload mount */ 2435 nch->mount = dvp->v_mount; 2436 break; 2437 } 2438 if (ncvp_debug) { 2439 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2440 pvp, nch->ncp->nc_name); 2441 } 2442 cache_drop(nch); 2443 /* nch was NULLed out, reload mount */ 2444 nch->mount = dvp->v_mount; 2445 } 2446 2447 /* 2448 * If nch->ncp is non-NULL it will have been held already. 2449 */ 2450 if (fakename) 2451 kfree(fakename, M_TEMP); 2452 if (saved_dvp) 2453 vrele(saved_dvp); 2454 if (nch->ncp) 2455 return (0); 2456 return (EINVAL); 2457 } 2458 2459 /* 2460 * Go up the chain of parent directories until we find something 2461 * we can resolve into the namecache. This is very inefficient. 2462 */ 2463 static 2464 int 2465 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2466 struct vnode **saved_dvp) 2467 { 2468 struct nchandle nch; 2469 struct vnode *pvp; 2470 int error; 2471 static time_t last_fromdvp_report; 2472 char *fakename; 2473 2474 /* 2475 * Loop getting the parent directory vnode until we get something we 2476 * can resolve in the namecache. 2477 */ 2478 vref(dvp); 2479 nch.mount = dvp->v_mount; 2480 nch.ncp = NULL; 2481 fakename = NULL; 2482 2483 for (;;) { 2484 if (fakename) { 2485 kfree(fakename, M_TEMP); 2486 fakename = NULL; 2487 } 2488 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2489 &fakename); 2490 if (error) { 2491 vrele(dvp); 2492 break; 2493 } 2494 vn_unlock(pvp); 2495 spin_lock_shared(&pvp->v_spin); 2496 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2497 _cache_hold(nch.ncp); 2498 spin_unlock_shared(&pvp->v_spin); 2499 vrele(pvp); 2500 break; 2501 } 2502 spin_unlock_shared(&pvp->v_spin); 2503 if (pvp->v_flag & VROOT) { 2504 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2505 error = cache_resolve_mp(nch.mount); 2506 _cache_unlock(nch.ncp); 2507 vrele(pvp); 2508 if (error) { 2509 _cache_drop(nch.ncp); 2510 nch.ncp = NULL; 2511 vrele(dvp); 2512 } 2513 break; 2514 } 2515 vrele(dvp); 2516 dvp = pvp; 2517 } 2518 if (error == 0) { 2519 if (last_fromdvp_report != time_uptime) { 2520 last_fromdvp_report = time_uptime; 2521 kprintf("Warning: extremely inefficient path " 2522 "resolution on %s\n", 2523 nch.ncp->nc_name); 2524 } 2525 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2526 2527 /* 2528 * Hopefully dvp now has a namecache record associated with 2529 * it. Leave it referenced to prevent the kernel from 2530 * recycling the vnode. Otherwise extremely long directory 2531 * paths could result in endless recycling. 2532 */ 2533 if (*saved_dvp) 2534 vrele(*saved_dvp); 2535 *saved_dvp = dvp; 2536 _cache_drop(nch.ncp); 2537 } 2538 if (fakename) 2539 kfree(fakename, M_TEMP); 2540 return (error); 2541 } 2542 2543 /* 2544 * Do an inefficient scan of the directory represented by ncp looking for 2545 * the directory vnode dvp. ncp must be held but not locked on entry and 2546 * will be held on return. dvp must be refd but not locked on entry and 2547 * will remain refd on return. 2548 * 2549 * Why do this at all? Well, due to its stateless nature the NFS server 2550 * converts file handles directly to vnodes without necessarily going through 2551 * the namecache ops that would otherwise create the namecache topology 2552 * leading to the vnode. 
We could either (1) Change the namecache algorithms
2553 * to allow disconnected namecache records that are re-merged opportunistically,
2554 * or (2) Make the NFS server backtrack and scan to recover a connected
2555 * namecache topology in order to then be able to issue new API lookups.
2556 *
2557 * It turns out that (1) is a huge mess. It takes a nice clean set of
2558 * namecache algorithms and introduces a lot of complication in every subsystem
2559 * that calls into the namecache to deal with the re-merge case, especially
2560 * since we are using the namecache to placehold negative lookups and the
2561 * vnode might not be immediately assigned. (2) is certainly far less
2562 * efficient than (1), but since we are only talking about directories here
2563 * (which are likely to remain cached), the case does not actually run all
2564 * that often and has the supreme advantage of not polluting the namecache
2565 * algorithms.
2566 *
2567 * If a fakename is supplied just construct a namecache entry using the
2568 * fake name.
2569 */
2570 static int
2571 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2572 struct vnode *dvp, char *fakename)
2573 {
2574 struct nlcomponent nlc;
2575 struct nchandle rncp;
2576 struct dirent *den;
2577 struct vnode *pvp;
2578 struct vattr vat;
2579 struct iovec iov;
2580 struct uio uio;
2581 int blksize;
2582 int eofflag;
2583 int bytes;
2584 char *rbuf;
2585 int error;
2586
2587 vat.va_blocksize = 0;
2588 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2589 return (error);
2590 cache_lock(nch);
2591 error = cache_vref(nch, cred, &pvp);
2592 cache_unlock(nch);
2593 if (error)
2594 return (error);
2595 if (ncvp_debug) {
2596 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2597 "vattr fileid = %lld\n",
2598 nch->ncp, nch->ncp->nc_name,
2599 vat.va_blocksize,
2600 (long long)vat.va_fileid);
2601 }
2602
2603 /*
2604 * Use the supplied fakename if not NULL. Fake names are typically
2605 * not in the actual filesystem hierarchy. This is used by HAMMER
2606 * to glue @@timestamp recursions together.
2607 */ 2608 if (fakename) { 2609 nlc.nlc_nameptr = fakename; 2610 nlc.nlc_namelen = strlen(fakename); 2611 rncp = cache_nlookup(nch, &nlc); 2612 goto done; 2613 } 2614 2615 if ((blksize = vat.va_blocksize) == 0) 2616 blksize = DEV_BSIZE; 2617 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2618 rncp.ncp = NULL; 2619 2620 eofflag = 0; 2621 uio.uio_offset = 0; 2622 again: 2623 iov.iov_base = rbuf; 2624 iov.iov_len = blksize; 2625 uio.uio_iov = &iov; 2626 uio.uio_iovcnt = 1; 2627 uio.uio_resid = blksize; 2628 uio.uio_segflg = UIO_SYSSPACE; 2629 uio.uio_rw = UIO_READ; 2630 uio.uio_td = curthread; 2631 2632 if (ncvp_debug >= 2) 2633 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2634 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2635 if (error == 0) { 2636 den = (struct dirent *)rbuf; 2637 bytes = blksize - uio.uio_resid; 2638 2639 while (bytes > 0) { 2640 if (ncvp_debug >= 2) { 2641 kprintf("cache_inefficient_scan: %*.*s\n", 2642 den->d_namlen, den->d_namlen, 2643 den->d_name); 2644 } 2645 if (den->d_type != DT_WHT && 2646 den->d_ino == vat.va_fileid) { 2647 if (ncvp_debug) { 2648 kprintf("cache_inefficient_scan: " 2649 "MATCHED inode %lld path %s/%*.*s\n", 2650 (long long)vat.va_fileid, 2651 nch->ncp->nc_name, 2652 den->d_namlen, den->d_namlen, 2653 den->d_name); 2654 } 2655 nlc.nlc_nameptr = den->d_name; 2656 nlc.nlc_namelen = den->d_namlen; 2657 rncp = cache_nlookup(nch, &nlc); 2658 KKASSERT(rncp.ncp != NULL); 2659 break; 2660 } 2661 bytes -= _DIRENT_DIRSIZ(den); 2662 den = _DIRENT_NEXT(den); 2663 } 2664 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2665 goto again; 2666 } 2667 kfree(rbuf, M_TEMP); 2668 done: 2669 vrele(pvp); 2670 if (rncp.ncp) { 2671 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2672 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2673 if (ncvp_debug >= 2) { 2674 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2675 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2676 } 2677 } else { 2678 if (ncvp_debug >= 2) { 2679 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2680 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2681 rncp.ncp->nc_vp); 2682 } 2683 } 2684 if (rncp.ncp->nc_vp == NULL) 2685 error = rncp.ncp->nc_error; 2686 /* 2687 * Release rncp after a successful nlookup. rncp was fully 2688 * referenced. 2689 */ 2690 cache_put(&rncp); 2691 } else { 2692 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2693 dvp, nch->ncp->nc_name); 2694 error = ENOENT; 2695 } 2696 return (error); 2697 } 2698 2699 /* 2700 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2701 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list. 2702 * 2703 * Then, if there are no additional references to the ncp and no children, 2704 * the ncp is removed from the topology and destroyed. 2705 * 2706 * References and/or children may exist if the ncp is in the middle of the 2707 * topology, preventing the ncp from being destroyed. 2708 * 2709 * This function must be called with the ncp held and locked and will unlock 2710 * and drop it during zapping. 2711 * 2712 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2713 * This case can occur in the cache_drop() path. 2714 * 2715 * This function may returned a held (but NOT locked) parent node which the 2716 * caller must drop. We do this so _cache_drop() can loop, to avoid 2717 * blowing out the kernel stack. 2718 * 2719 * WARNING! 
For MPSAFE operation this routine must acquire up to three 2720 * spin locks to be able to safely test nc_refs. Lock order is 2721 * very important. 2722 * 2723 * hash spinlock if on hash list 2724 * parent spinlock if child of parent 2725 * (the ncp is unresolved so there is no vnode association) 2726 */ 2727 static struct namecache * 2728 cache_zap(struct namecache *ncp, int nonblock) 2729 { 2730 struct namecache *par; 2731 struct vnode *dropvp; 2732 struct nchash_head *nchpp; 2733 int refs; 2734 2735 /* 2736 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2737 */ 2738 _cache_setunresolved(ncp); 2739 2740 /* 2741 * Try to scrap the entry and possibly tail-recurse on its parent. 2742 * We only scrap unref'd (other then our ref) unresolved entries, 2743 * we do not scrap 'live' entries. 2744 * 2745 * Note that once the spinlocks are acquired if nc_refs == 1 no 2746 * other references are possible. If it isn't, however, we have 2747 * to decrement but also be sure to avoid a 1->0 transition. 2748 */ 2749 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2750 KKASSERT(ncp->nc_refs > 0); 2751 2752 /* 2753 * Acquire locks. Note that the parent can't go away while we hold 2754 * a child locked. 2755 */ 2756 nchpp = NULL; 2757 if ((par = ncp->nc_parent) != NULL) { 2758 if (nonblock) { 2759 for (;;) { 2760 if (_cache_lock_nonblock(par) == 0) 2761 break; 2762 refs = ncp->nc_refs; 2763 ncp->nc_flag |= NCF_DEFEREDZAP; 2764 ++numdefered; /* MP race ok */ 2765 if (atomic_cmpset_int(&ncp->nc_refs, 2766 refs, refs - 1)) { 2767 _cache_unlock(ncp); 2768 return(NULL); 2769 } 2770 cpu_pause(); 2771 } 2772 _cache_hold(par); 2773 } else { 2774 _cache_hold(par); 2775 _cache_lock(par); 2776 } 2777 nchpp = ncp->nc_head; 2778 spin_lock(&nchpp->spin); 2779 } 2780 2781 /* 2782 * At this point if we find refs == 1 it should not be possible for 2783 * anyone else to have access to the ncp. We are holding the only 2784 * possible access point left (nchpp) spin-locked. 2785 * 2786 * If someone other then us has a ref or we have children 2787 * we cannot zap the entry. The 1->0 transition and any 2788 * further list operation is protected by the spinlocks 2789 * we have acquired but other transitions are not. 2790 */ 2791 for (;;) { 2792 refs = ncp->nc_refs; 2793 cpu_ccfence(); 2794 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list)) 2795 break; 2796 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) { 2797 if (par) { 2798 spin_unlock(&nchpp->spin); 2799 _cache_put(par); 2800 } 2801 _cache_unlock(ncp); 2802 return(NULL); 2803 } 2804 cpu_pause(); 2805 } 2806 2807 /* 2808 * We are the only ref and with the spinlocks held no further 2809 * refs can be acquired by others. 2810 * 2811 * Remove us from the hash list and parent list. We have to 2812 * drop a ref on the parent's vp if the parent's list becomes 2813 * empty. 
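 *
 * Note that the vdrop() implied here is deferred via dropvp and only
 * performed once the spinlocks acquired above have been released (see
 * the "Delayed drop" note at the end of this function).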
2814 */ 2815 dropvp = NULL; 2816 if (par) { 2817 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2818 2819 KKASSERT(nchpp == ncp->nc_head); 2820 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2821 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2822 atomic_add_long(&pn->vfscache_count, -1); 2823 if (TAILQ_EMPTY(&ncp->nc_list)) 2824 atomic_add_long(&pn->vfscache_leafs, -1); 2825 2826 if (TAILQ_EMPTY(&par->nc_list)) { 2827 atomic_add_long(&pn->vfscache_leafs, 1); 2828 if (par->nc_vp) 2829 dropvp = par->nc_vp; 2830 } 2831 ncp->nc_head = NULL; 2832 ncp->nc_parent = NULL; 2833 spin_unlock(&nchpp->spin); 2834 _cache_unlock(par); 2835 } else { 2836 KKASSERT(ncp->nc_head == NULL); 2837 } 2838 2839 /* 2840 * ncp should not have picked up any refs. Physically 2841 * destroy the ncp. 2842 */ 2843 if (ncp->nc_refs != 1) { 2844 int save_refs = ncp->nc_refs; 2845 cpu_ccfence(); 2846 panic("cache_zap: %p bad refs %d (%d)\n", 2847 ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0)); 2848 } 2849 KKASSERT(ncp->nc_refs == 1); 2850 /* _cache_unlock(ncp) not required */ 2851 ncp->nc_refs = -1; /* safety */ 2852 if (ncp->nc_name) 2853 kfree(ncp->nc_name, M_VFSCACHE); 2854 kfree(ncp, M_VFSCACHE); 2855 2856 /* 2857 * Delayed drop (we had to release our spinlocks) 2858 * 2859 * The refed parent (if not NULL) must be dropped. The 2860 * caller is responsible for looping. 2861 */ 2862 if (dropvp) 2863 vdrop(dropvp); 2864 return(par); 2865 } 2866 2867 /* 2868 * Clean up dangling negative cache and defered-drop entries in the 2869 * namecache. 2870 * 2871 * This routine is called in the critical path and also called from 2872 * vnlru(). When called from vnlru we use a lower limit to try to 2873 * deal with the negative cache before the critical path has to start 2874 * dealing with it. 2875 */ 2876 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2877 2878 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2879 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2880 2881 void 2882 cache_hysteresis(int critpath) 2883 { 2884 long poslimit; 2885 long neglimit = maxvnodes / ncnegfactor; 2886 long xnumcache = vfscache_leafs; 2887 2888 if (critpath == 0) 2889 neglimit = neglimit * 8 / 10; 2890 2891 /* 2892 * Don't cache too many negative hits. We use hysteresis to reduce 2893 * the impact on the critical path. 2894 */ 2895 switch(neg_cache_hysteresis_state[critpath]) { 2896 case CHI_LOW: 2897 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2898 if (critpath) 2899 _cache_cleanneg(ncnegflush); 2900 else 2901 _cache_cleanneg(ncnegflush + 2902 vfscache_negs - neglimit); 2903 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2904 } 2905 break; 2906 case CHI_HIGH: 2907 if (vfscache_negs > MINNEG * 9 / 10 && 2908 vfscache_negs * 9 / 10 > neglimit 2909 ) { 2910 if (critpath) 2911 _cache_cleanneg(ncnegflush); 2912 else 2913 _cache_cleanneg(ncnegflush + 2914 vfscache_negs * 9 / 10 - 2915 neglimit); 2916 } else { 2917 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2918 } 2919 break; 2920 } 2921 2922 /* 2923 * Don't cache too many positive hits. We use hysteresis to reduce 2924 * the impact on the critical path. 2925 * 2926 * Excessive positive hits can accumulate due to large numbers of 2927 * hardlinks (the vnode cache will not prevent hl ncps from growing 2928 * into infinity). 
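 *
 * Worked example (illustrative numbers only): with maxvnodes = 100000
 * and ncposlimit = 0, the limit computed below is 200000 on the
 * critical path and 200000 * 8 / 10 = 160000 when called from vnlru();
 * once in the CHI_HIGH state, flushing continues until the leaf count
 * drops back under 5/6 of the limit (or under MINPOS).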
2929 */ 2930 if ((poslimit = ncposlimit) == 0) 2931 poslimit = maxvnodes * 2; 2932 if (critpath == 0) 2933 poslimit = poslimit * 8 / 10; 2934 2935 switch(pos_cache_hysteresis_state[critpath]) { 2936 case CHI_LOW: 2937 if (xnumcache > poslimit && xnumcache > MINPOS) { 2938 if (critpath) 2939 _cache_cleanpos(ncposflush); 2940 else 2941 _cache_cleanpos(ncposflush + 2942 xnumcache - poslimit); 2943 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2944 } 2945 break; 2946 case CHI_HIGH: 2947 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2948 if (critpath) 2949 _cache_cleanpos(ncposflush); 2950 else 2951 _cache_cleanpos(ncposflush + 2952 xnumcache - poslimit * 5 / 6); 2953 } else { 2954 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2955 } 2956 break; 2957 } 2958 2959 /* 2960 * Clean out dangling defered-zap ncps which could not 2961 * be cleanly dropped if too many build up. Note 2962 * that numdefered is not an exact number as such ncps 2963 * can be reused and the counter is not handled in a MP 2964 * safe manner by design. 2965 */ 2966 if (numdefered > neglimit) { 2967 _cache_cleandefered(); 2968 } 2969 } 2970 2971 /* 2972 * NEW NAMECACHE LOOKUP API 2973 * 2974 * Lookup an entry in the namecache. The passed par_nch must be referenced 2975 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2976 * is ALWAYS returned, eve if the supplied component is illegal. 2977 * 2978 * The resulting namecache entry should be returned to the system with 2979 * cache_put() or cache_unlock() + cache_drop(). 2980 * 2981 * namecache locks are recursive but care must be taken to avoid lock order 2982 * reversals (hence why the passed par_nch must be unlocked). Locking 2983 * rules are to order for parent traversals, not for child traversals. 2984 * 2985 * Nobody else will be able to manipulate the associated namespace (e.g. 2986 * create, delete, rename, rename-target) until the caller unlocks the 2987 * entry. 2988 * 2989 * The returned entry will be in one of three states: positive hit (non-null 2990 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2991 * Unresolved entries must be resolved through the filesystem to associate the 2992 * vnode and/or determine whether a positive or negative hit has occured. 2993 * 2994 * It is not necessary to lock a directory in order to lock namespace under 2995 * that directory. In fact, it is explicitly not allowed to do that. A 2996 * directory is typically only locked when being created, renamed, or 2997 * destroyed. 2998 * 2999 * The directory (par) may be unresolved, in which case any returned child 3000 * will likely also be marked unresolved. Likely but not guarenteed. Since 3001 * the filesystem lookup requires a resolved directory vnode the caller is 3002 * responsible for resolving the namecache chain top-down. This API 3003 * specifically allows whole chains to be created in an unresolved state. 3004 */ 3005 struct nchandle 3006 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 3007 { 3008 struct nchandle nch; 3009 struct namecache *ncp; 3010 struct namecache *new_ncp; 3011 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 3012 struct nchash_head *nchpp; 3013 struct mount *mp; 3014 u_int32_t hash; 3015 globaldata_t gd; 3016 int par_locked; 3017 3018 gd = mycpu; 3019 mp = par_nch->mount; 3020 par_locked = 0; 3021 3022 /* 3023 * This is a good time to call it, no ncp's are locked by 3024 * the caller or us. 
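 *
 * The argument '1' marks this as a critical-path call, so only the
 * fixed ncnegflush/ncposflush batch sizes are flushed rather than the
 * entire surplus over the limit.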
3025 */ 3026 cache_hysteresis(1); 3027 3028 /* 3029 * Try to locate an existing entry 3030 */ 3031 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3032 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3033 new_ncp = NULL; 3034 nchpp = NCHHASH(hash); 3035 restart: 3036 rep_ncp = NULL; 3037 if (new_ncp) 3038 spin_lock(&nchpp->spin); 3039 else 3040 spin_lock_shared(&nchpp->spin); 3041 3042 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3043 /* 3044 * Break out if we find a matching entry. Note that 3045 * UNRESOLVED entries may match, but DESTROYED entries 3046 * do not. 3047 * 3048 * We may be able to reuse DESTROYED entries that we come 3049 * across, even if the name does not match, as long as 3050 * nc_nlen is correct. 3051 */ 3052 if (ncp->nc_parent == par_nch->ncp && 3053 ncp->nc_nlen == nlc->nlc_namelen) { 3054 if (ncp->nc_flag & NCF_DESTROYED) { 3055 if (ncp->nc_refs == 0 && rep_ncp == NULL) 3056 rep_ncp = ncp; 3057 continue; 3058 } 3059 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 3060 continue; 3061 _cache_hold(ncp); 3062 if (new_ncp) 3063 spin_unlock(&nchpp->spin); 3064 else 3065 spin_unlock_shared(&nchpp->spin); 3066 if (par_locked) { 3067 _cache_unlock(par_nch->ncp); 3068 par_locked = 0; 3069 } 3070 if (_cache_lock_special(ncp) == 0) { 3071 /* 3072 * Successfully locked but we must re-test 3073 * conditions that might have changed since 3074 * we did not have the lock before. 3075 */ 3076 if (ncp->nc_parent != par_nch->ncp || 3077 ncp->nc_nlen != nlc->nlc_namelen || 3078 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3079 ncp->nc_nlen) || 3080 (ncp->nc_flag & NCF_DESTROYED)) { 3081 _cache_put(ncp); 3082 goto restart; 3083 } 3084 _cache_auto_unresolve(mp, ncp); 3085 if (new_ncp) 3086 _cache_free(new_ncp); 3087 goto found; 3088 } 3089 _cache_get(ncp); /* cycle the lock to block */ 3090 _cache_put(ncp); 3091 _cache_drop(ncp); 3092 goto restart; 3093 } 3094 } 3095 3096 /* 3097 * We failed to locate the entry, try to resurrect a destroyed 3098 * entry that we did find that is already correctly linked into 3099 * nchpp and the parent. We must re-test conditions after 3100 * successfully locking rep_ncp. 3101 * 3102 * This case can occur under heavy loads due to not being able 3103 * to safely lock the parent in cache_zap(). Nominally a repeated 3104 * create/unlink load, but only the namelen needs to match. 3105 */ 3106 if (rep_ncp && new_ncp == NULL) { 3107 if (_cache_lock_nonblock(rep_ncp) == 0) { 3108 _cache_hold(rep_ncp); 3109 if (rep_ncp->nc_parent == par_nch->ncp && 3110 rep_ncp->nc_nlen == nlc->nlc_namelen && 3111 (rep_ncp->nc_flag & NCF_DESTROYED)) { 3112 /* 3113 * Update nc_name as reuse as new. 3114 */ 3115 ncp = rep_ncp; 3116 bcopy(nlc->nlc_nameptr, ncp->nc_name, 3117 nlc->nlc_namelen); 3118 spin_unlock_shared(&nchpp->spin); 3119 _cache_setunresolved(ncp); 3120 ncp->nc_flag = NCF_UNRESOLVED; 3121 ncp->nc_error = ENOTCONN; 3122 goto found; 3123 } 3124 _cache_put(rep_ncp); 3125 } 3126 } 3127 3128 /* 3129 * Otherwise create a new entry and add it to the cache. The parent 3130 * ncp must also be locked so we can link into it. 3131 * 3132 * We have to relookup after possibly blocking in kmalloc or 3133 * when locking par_nch. 3134 * 3135 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3136 * mount case, in which case nc_name will be NULL. 
3137 */ 3138 if (new_ncp == NULL) { 3139 spin_unlock_shared(&nchpp->spin); 3140 new_ncp = cache_alloc(nlc->nlc_namelen); 3141 if (nlc->nlc_namelen) { 3142 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3143 nlc->nlc_namelen); 3144 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3145 } 3146 goto restart; 3147 } 3148 3149 /* 3150 * NOTE! The spinlock is held exclusively here because new_ncp 3151 * is non-NULL. 3152 */ 3153 if (par_locked == 0) { 3154 spin_unlock(&nchpp->spin); 3155 _cache_lock(par_nch->ncp); 3156 par_locked = 1; 3157 goto restart; 3158 } 3159 3160 /* 3161 * WARNING! We still hold the spinlock. We have to set the hash 3162 * table entry atomically. 3163 */ 3164 ncp = new_ncp; 3165 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3166 spin_unlock(&nchpp->spin); 3167 _cache_unlock(par_nch->ncp); 3168 /* par_locked = 0 - not used */ 3169 found: 3170 /* 3171 * stats and namecache size management 3172 */ 3173 if (ncp->nc_flag & NCF_UNRESOLVED) 3174 ++gd->gd_nchstats->ncs_miss; 3175 else if (ncp->nc_vp) 3176 ++gd->gd_nchstats->ncs_goodhits; 3177 else 3178 ++gd->gd_nchstats->ncs_neghits; 3179 nch.mount = mp; 3180 nch.ncp = ncp; 3181 _cache_mntref(nch.mount); 3182 3183 return(nch); 3184 } 3185 3186 /* 3187 * Attempt to lookup a namecache entry and return with a shared namecache 3188 * lock. 3189 */ 3190 int 3191 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, 3192 int excl, struct nchandle *res_nch) 3193 { 3194 struct namecache *ncp; 3195 struct nchash_head *nchpp; 3196 struct mount *mp; 3197 u_int32_t hash; 3198 globaldata_t gd; 3199 3200 /* 3201 * If exclusive requested or shared namecache locks are disabled, 3202 * return failure. 3203 */ 3204 if (ncp_shared_lock_disable || excl) 3205 return(EWOULDBLOCK); 3206 3207 gd = mycpu; 3208 mp = par_nch->mount; 3209 3210 /* 3211 * This is a good time to call it, no ncp's are locked by 3212 * the caller or us. 3213 */ 3214 cache_hysteresis(1); 3215 3216 /* 3217 * Try to locate an existing entry 3218 */ 3219 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3220 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3221 nchpp = NCHHASH(hash); 3222 3223 spin_lock_shared(&nchpp->spin); 3224 3225 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3226 /* 3227 * Break out if we find a matching entry. Note that 3228 * UNRESOLVED entries may match, but DESTROYED entries 3229 * do not. 3230 */ 3231 if (ncp->nc_parent == par_nch->ncp && 3232 ncp->nc_nlen == nlc->nlc_namelen && 3233 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3234 (ncp->nc_flag & NCF_DESTROYED) == 0 3235 ) { 3236 _cache_hold(ncp); 3237 spin_unlock_shared(&nchpp->spin); 3238 if (_cache_lock_shared_special(ncp) == 0) { 3239 if (ncp->nc_parent == par_nch->ncp && 3240 ncp->nc_nlen == nlc->nlc_namelen && 3241 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3242 ncp->nc_nlen) == 0 && 3243 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3244 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3245 _cache_auto_unresolve_test(mp, ncp) == 0) { 3246 goto found; 3247 } 3248 _cache_unlock(ncp); 3249 } 3250 _cache_drop(ncp); 3251 spin_lock_shared(&nchpp->spin); 3252 break; 3253 } 3254 } 3255 3256 /* 3257 * Failure 3258 */ 3259 spin_unlock_shared(&nchpp->spin); 3260 return(EWOULDBLOCK); 3261 3262 /* 3263 * Success 3264 * 3265 * Note that nc_error might be non-zero (e.g ENOENT). 
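 *
 * A caller would typically treat EWOULDBLOCK as "fall back to the
 * blocking cache_nlookup() path", e.g. (illustrative only, not a quote
 * of any specific caller):
 *
 *	if (cache_nlookup_maybe_shared(&pnch, &nlc, 0, &nch) == EWOULDBLOCK)
 *		nch = cache_nlookup(&pnch, &nlc);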
3266 */ 3267 found: 3268 res_nch->mount = mp; 3269 res_nch->ncp = ncp; 3270 ++gd->gd_nchstats->ncs_goodhits; 3271 _cache_mntref(res_nch->mount); 3272 3273 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3274 return(ncp->nc_error); 3275 } 3276 3277 /* 3278 * This is a non-blocking verison of cache_nlookup() used by 3279 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3280 * will return nch.ncp == NULL in that case. 3281 */ 3282 struct nchandle 3283 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3284 { 3285 struct nchandle nch; 3286 struct namecache *ncp; 3287 struct namecache *new_ncp; 3288 struct nchash_head *nchpp; 3289 struct mount *mp; 3290 u_int32_t hash; 3291 globaldata_t gd; 3292 int par_locked; 3293 3294 gd = mycpu; 3295 mp = par_nch->mount; 3296 par_locked = 0; 3297 3298 /* 3299 * Try to locate an existing entry 3300 */ 3301 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3302 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3303 new_ncp = NULL; 3304 nchpp = NCHHASH(hash); 3305 restart: 3306 spin_lock(&nchpp->spin); 3307 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3308 /* 3309 * Break out if we find a matching entry. Note that 3310 * UNRESOLVED entries may match, but DESTROYED entries 3311 * do not. 3312 */ 3313 if (ncp->nc_parent == par_nch->ncp && 3314 ncp->nc_nlen == nlc->nlc_namelen && 3315 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3316 (ncp->nc_flag & NCF_DESTROYED) == 0 3317 ) { 3318 _cache_hold(ncp); 3319 spin_unlock(&nchpp->spin); 3320 if (par_locked) { 3321 _cache_unlock(par_nch->ncp); 3322 par_locked = 0; 3323 } 3324 if (_cache_lock_special(ncp) == 0) { 3325 if (ncp->nc_parent != par_nch->ncp || 3326 ncp->nc_nlen != nlc->nlc_namelen || 3327 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3328 (ncp->nc_flag & NCF_DESTROYED)) { 3329 kprintf("cache_lookup_nonblock: " 3330 "ncp-race %p %*.*s\n", 3331 ncp, 3332 nlc->nlc_namelen, 3333 nlc->nlc_namelen, 3334 nlc->nlc_nameptr); 3335 _cache_unlock(ncp); 3336 _cache_drop(ncp); 3337 goto failed; 3338 } 3339 _cache_auto_unresolve(mp, ncp); 3340 if (new_ncp) { 3341 _cache_free(new_ncp); 3342 new_ncp = NULL; 3343 } 3344 goto found; 3345 } 3346 _cache_drop(ncp); 3347 goto failed; 3348 } 3349 } 3350 3351 /* 3352 * We failed to locate an entry, create a new entry and add it to 3353 * the cache. The parent ncp must also be locked so we 3354 * can link into it. 3355 * 3356 * We have to relookup after possibly blocking in kmalloc or 3357 * when locking par_nch. 3358 * 3359 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3360 * mount case, in which case nc_name will be NULL. 3361 */ 3362 if (new_ncp == NULL) { 3363 spin_unlock(&nchpp->spin); 3364 new_ncp = cache_alloc(nlc->nlc_namelen); 3365 if (nlc->nlc_namelen) { 3366 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3367 nlc->nlc_namelen); 3368 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3369 } 3370 goto restart; 3371 } 3372 if (par_locked == 0) { 3373 spin_unlock(&nchpp->spin); 3374 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3375 par_locked = 1; 3376 goto restart; 3377 } 3378 goto failed; 3379 } 3380 3381 /* 3382 * WARNING! We still hold the spinlock. We have to set the hash 3383 * table entry atomically. 
3384 */ 3385 ncp = new_ncp; 3386 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3387 spin_unlock(&nchpp->spin); 3388 _cache_unlock(par_nch->ncp); 3389 /* par_locked = 0 - not used */ 3390 found: 3391 /* 3392 * stats and namecache size management 3393 */ 3394 if (ncp->nc_flag & NCF_UNRESOLVED) 3395 ++gd->gd_nchstats->ncs_miss; 3396 else if (ncp->nc_vp) 3397 ++gd->gd_nchstats->ncs_goodhits; 3398 else 3399 ++gd->gd_nchstats->ncs_neghits; 3400 nch.mount = mp; 3401 nch.ncp = ncp; 3402 _cache_mntref(nch.mount); 3403 3404 return(nch); 3405 failed: 3406 if (new_ncp) { 3407 _cache_free(new_ncp); 3408 new_ncp = NULL; 3409 } 3410 nch.mount = NULL; 3411 nch.ncp = NULL; 3412 return(nch); 3413 } 3414 3415 /* 3416 * The namecache entry is marked as being used as a mount point. 3417 * Locate the mount if it is visible to the caller. The DragonFly 3418 * mount system allows arbitrary loops in the topology and disentangles 3419 * those loops by matching against (mp, ncp) rather than just (ncp). 3420 * This means any given ncp can dive any number of mounts, depending 3421 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3422 * 3423 * We use a very simple frontend cache to reduce SMP conflicts, 3424 * which we have to do because the mountlist scan needs an exclusive 3425 * lock around its ripout info list. Not to mention that there might 3426 * be a lot of mounts. 3427 */ 3428 struct findmount_info { 3429 struct mount *result; 3430 struct mount *nch_mount; 3431 struct namecache *nch_ncp; 3432 }; 3433 3434 static 3435 struct ncmount_cache * 3436 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3437 { 3438 uintptr_t hash; 3439 3440 hash = (uintptr_t)mp + ((uintptr_t)mp >> 18); 3441 hash += (uintptr_t)ncp + ((uintptr_t)ncp >> 16); 3442 hash = (hash >> 1) % NCMOUNT_NUMCACHE; 3443 3444 return (&ncmount_cache[hash]); 3445 } 3446 3447 static 3448 int 3449 cache_findmount_callback(struct mount *mp, void *data) 3450 { 3451 struct findmount_info *info = data; 3452 3453 /* 3454 * Check the mount's mounted-on point against the passed nch. 3455 */ 3456 if (mp->mnt_ncmounton.mount == info->nch_mount && 3457 mp->mnt_ncmounton.ncp == info->nch_ncp 3458 ) { 3459 info->result = mp; 3460 _cache_mntref(mp); 3461 return(-1); 3462 } 3463 return(0); 3464 } 3465 3466 struct mount * 3467 cache_findmount(struct nchandle *nch) 3468 { 3469 struct findmount_info info; 3470 struct ncmount_cache *ncc; 3471 struct mount *mp; 3472 3473 /* 3474 * Fast 3475 */ 3476 if (ncmount_cache_enable == 0) { 3477 ncc = NULL; 3478 goto skip; 3479 } 3480 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3481 if (ncc->ncp == nch->ncp) { 3482 spin_lock_shared(&ncc->spin); 3483 if (ncc->isneg == 0 && 3484 ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) { 3485 if (mp->mnt_ncmounton.mount == nch->mount && 3486 mp->mnt_ncmounton.ncp == nch->ncp) { 3487 /* 3488 * Cache hit (positive) 3489 */ 3490 _cache_mntref(mp); 3491 spin_unlock_shared(&ncc->spin); 3492 return(mp); 3493 } 3494 /* else cache miss */ 3495 } 3496 if (ncc->isneg && 3497 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3498 /* 3499 * Cache hit (negative) 3500 */ 3501 spin_unlock_shared(&ncc->spin); 3502 return(NULL); 3503 } 3504 spin_unlock_shared(&ncc->spin); 3505 } 3506 skip: 3507 3508 /* 3509 * Slow 3510 */ 3511 info.result = NULL; 3512 info.nch_mount = nch->mount; 3513 info.nch_ncp = nch->ncp; 3514 mountlist_scan(cache_findmount_callback, &info, 3515 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 3516 3517 /* 3518 * Cache the result. 
3519 * 3520 * Negative lookups: We cache the originating {ncp,mp}. (mp) is 3521 * only used for pointer comparisons and is not 3522 * referenced (otherwise there would be dangling 3523 * refs). 3524 * 3525 * Positive lookups: We cache the originating {ncp} and the target 3526 * (mp). (mp) is referenced. 3527 * 3528 * Indeterminant: If the match is undergoing an unmount we do 3529 * not cache it to avoid racing cache_unmounting(), 3530 * but still return the match. 3531 */ 3532 if (ncc) { 3533 spin_lock(&ncc->spin); 3534 if (info.result == NULL) { 3535 if (ncc->isneg == 0 && ncc->mp) 3536 _cache_mntrel(ncc->mp); 3537 ncc->ncp = nch->ncp; 3538 ncc->mp = nch->mount; 3539 ncc->isneg = 1; 3540 spin_unlock(&ncc->spin); 3541 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) { 3542 if (ncc->isneg == 0 && ncc->mp) 3543 _cache_mntrel(ncc->mp); 3544 _cache_mntref(info.result); 3545 ncc->ncp = nch->ncp; 3546 ncc->mp = info.result; 3547 ncc->isneg = 0; 3548 spin_unlock(&ncc->spin); 3549 } else { 3550 spin_unlock(&ncc->spin); 3551 } 3552 } 3553 return(info.result); 3554 } 3555 3556 void 3557 cache_dropmount(struct mount *mp) 3558 { 3559 _cache_mntrel(mp); 3560 } 3561 3562 void 3563 cache_ismounting(struct mount *mp) 3564 { 3565 struct nchandle *nch = &mp->mnt_ncmounton; 3566 struct ncmount_cache *ncc; 3567 3568 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3569 if (ncc->isneg && 3570 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3571 spin_lock(&ncc->spin); 3572 if (ncc->isneg && 3573 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3574 ncc->ncp = NULL; 3575 ncc->mp = NULL; 3576 } 3577 spin_unlock(&ncc->spin); 3578 } 3579 } 3580 3581 void 3582 cache_unmounting(struct mount *mp) 3583 { 3584 struct nchandle *nch = &mp->mnt_ncmounton; 3585 struct ncmount_cache *ncc; 3586 3587 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3588 if (ncc->isneg == 0 && 3589 ncc->ncp == nch->ncp && ncc->mp == mp) { 3590 spin_lock(&ncc->spin); 3591 if (ncc->isneg == 0 && 3592 ncc->ncp == nch->ncp && ncc->mp == mp) { 3593 _cache_mntrel(mp); 3594 ncc->ncp = NULL; 3595 ncc->mp = NULL; 3596 } 3597 spin_unlock(&ncc->spin); 3598 } 3599 } 3600 3601 /* 3602 * Resolve an unresolved namecache entry, generally by looking it up. 3603 * The passed ncp must be locked and refd. 3604 * 3605 * Theoretically since a vnode cannot be recycled while held, and since 3606 * the nc_parent chain holds its vnode as long as children exist, the 3607 * direct parent of the cache entry we are trying to resolve should 3608 * have a valid vnode. If not then generate an error that we can 3609 * determine is related to a resolver bug. 3610 * 3611 * However, if a vnode was in the middle of a recyclement when the NCP 3612 * got locked, ncp->nc_vp might point to a vnode that is about to become 3613 * invalid. cache_resolve() handles this case by unresolving the entry 3614 * and then re-resolving it. 3615 * 3616 * Note that successful resolution does not necessarily return an error 3617 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3618 * will be returned. 3619 */ 3620 int 3621 cache_resolve(struct nchandle *nch, struct ucred *cred) 3622 { 3623 struct namecache *par_tmp; 3624 struct namecache *par; 3625 struct namecache *ncp; 3626 struct nchandle nctmp; 3627 struct mount *mp; 3628 struct vnode *dvp; 3629 int error; 3630 3631 ncp = nch->ncp; 3632 mp = nch->mount; 3633 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3634 restart: 3635 /* 3636 * If the ncp is already resolved we have nothing to do. 
However, 3637 * we do want to guarentee that a usable vnode is returned when 3638 * a vnode is present, so make sure it hasn't been reclaimed. 3639 */ 3640 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3641 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3642 _cache_setunresolved(ncp); 3643 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3644 return (ncp->nc_error); 3645 } 3646 3647 /* 3648 * If the ncp was destroyed it will never resolve again. This 3649 * can basically only happen when someone is chdir'd into an 3650 * empty directory which is then rmdir'd. We want to catch this 3651 * here and not dive the VFS because the VFS might actually 3652 * have a way to re-resolve the disconnected ncp, which will 3653 * result in inconsistencies in the cdir/nch for proc->p_fd. 3654 */ 3655 if (ncp->nc_flag & NCF_DESTROYED) 3656 return(EINVAL); 3657 3658 /* 3659 * Mount points need special handling because the parent does not 3660 * belong to the same filesystem as the ncp. 3661 */ 3662 if (ncp == mp->mnt_ncmountpt.ncp) 3663 return (cache_resolve_mp(mp)); 3664 3665 /* 3666 * We expect an unbroken chain of ncps to at least the mount point, 3667 * and even all the way to root (but this code doesn't have to go 3668 * past the mount point). 3669 */ 3670 if (ncp->nc_parent == NULL) { 3671 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 3672 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3673 ncp->nc_error = EXDEV; 3674 return(ncp->nc_error); 3675 } 3676 3677 /* 3678 * The vp's of the parent directories in the chain are held via vhold() 3679 * due to the existance of the child, and should not disappear. 3680 * However, there are cases where they can disappear: 3681 * 3682 * - due to filesystem I/O errors. 3683 * - due to NFS being stupid about tracking the namespace and 3684 * destroys the namespace for entire directories quite often. 3685 * - due to forced unmounts. 3686 * - due to an rmdir (parent will be marked DESTROYED) 3687 * 3688 * When this occurs we have to track the chain backwards and resolve 3689 * it, looping until the resolver catches up to the current node. We 3690 * could recurse here but we might run ourselves out of kernel stack 3691 * so we do it in a more painful manner. This situation really should 3692 * not occur all that often, or if it does not have to go back too 3693 * many nodes to resolve the ncp. 3694 */ 3695 while ((dvp = cache_dvpref(ncp)) == NULL) { 3696 /* 3697 * This case can occur if a process is CD'd into a 3698 * directory which is then rmdir'd. If the parent is marked 3699 * destroyed there is no point trying to resolve it. 3700 */ 3701 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 3702 return(ENOENT); 3703 par = ncp->nc_parent; 3704 _cache_hold(par); 3705 _cache_lock(par); 3706 while ((par_tmp = par->nc_parent) != NULL && 3707 par_tmp->nc_vp == NULL) { 3708 _cache_hold(par_tmp); 3709 _cache_lock(par_tmp); 3710 _cache_put(par); 3711 par = par_tmp; 3712 } 3713 if (par->nc_parent == NULL) { 3714 kprintf("EXDEV case 2 %*.*s\n", 3715 par->nc_nlen, par->nc_nlen, par->nc_name); 3716 _cache_put(par); 3717 return (EXDEV); 3718 } 3719 /* 3720 * The parent is not set in stone, ref and lock it to prevent 3721 * it from disappearing. Also note that due to renames it 3722 * is possible for our ncp to move and for par to no longer 3723 * be one of its parents. We resolve it anyway, the loop 3724 * will handle any moves. 
3725 */ 3726 _cache_get(par); /* additional hold/lock */ 3727 _cache_put(par); /* from earlier hold/lock */ 3728 if (par == nch->mount->mnt_ncmountpt.ncp) { 3729 cache_resolve_mp(nch->mount); 3730 } else if ((dvp = cache_dvpref(par)) == NULL) { 3731 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name); 3732 _cache_put(par); 3733 continue; 3734 } else { 3735 if (par->nc_flag & NCF_UNRESOLVED) { 3736 nctmp.mount = mp; 3737 nctmp.ncp = par; 3738 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3739 } 3740 vrele(dvp); 3741 } 3742 if ((error = par->nc_error) != 0) { 3743 if (par->nc_error != EAGAIN) { 3744 kprintf("EXDEV case 3 %*.*s error %d\n", 3745 par->nc_nlen, par->nc_nlen, par->nc_name, 3746 par->nc_error); 3747 _cache_put(par); 3748 return(error); 3749 } 3750 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3751 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3752 } 3753 _cache_put(par); 3754 /* loop */ 3755 } 3756 3757 /* 3758 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3759 * ncp's and reattach them. If this occurs the original ncp is marked 3760 * EAGAIN to force a relookup. 3761 * 3762 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3763 * ncp must already be resolved. 3764 */ 3765 if (dvp) { 3766 nctmp.mount = mp; 3767 nctmp.ncp = ncp; 3768 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3769 vrele(dvp); 3770 } else { 3771 ncp->nc_error = EPERM; 3772 } 3773 if (ncp->nc_error == EAGAIN) { 3774 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3775 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3776 goto restart; 3777 } 3778 return(ncp->nc_error); 3779 } 3780 3781 /* 3782 * Resolve the ncp associated with a mount point. Such ncp's almost always 3783 * remain resolved and this routine is rarely called. NFS MPs tends to force 3784 * re-resolution more often due to its mac-truck-smash-the-namecache 3785 * method of tracking namespace changes. 3786 * 3787 * The semantics for this call is that the passed ncp must be locked on 3788 * entry and will be locked on return. However, if we actually have to 3789 * resolve the mount point we temporarily unlock the entry in order to 3790 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3791 * the unlock we have to recheck the flags after we relock. 3792 */ 3793 static int 3794 cache_resolve_mp(struct mount *mp) 3795 { 3796 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3797 struct vnode *vp; 3798 int error; 3799 3800 KKASSERT(mp != NULL); 3801 3802 /* 3803 * If the ncp is already resolved we have nothing to do. However, 3804 * we do want to guarentee that a usable vnode is returned when 3805 * a vnode is present, so make sure it hasn't been reclaimed. 3806 */ 3807 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3808 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3809 _cache_setunresolved(ncp); 3810 } 3811 3812 if (ncp->nc_flag & NCF_UNRESOLVED) { 3813 _cache_unlock(ncp); 3814 while (vfs_busy(mp, 0)) 3815 ; 3816 error = VFS_ROOT(mp, &vp); 3817 _cache_lock(ncp); 3818 3819 /* 3820 * recheck the ncp state after relocking. 
3821 */ 3822 if (ncp->nc_flag & NCF_UNRESOLVED) { 3823 ncp->nc_error = error; 3824 if (error == 0) { 3825 _cache_setvp(mp, ncp, vp); 3826 vput(vp); 3827 } else { 3828 kprintf("[diagnostic] cache_resolve_mp: failed" 3829 " to resolve mount %p err=%d ncp=%p\n", 3830 mp, error, ncp); 3831 _cache_setvp(mp, ncp, NULL); 3832 } 3833 } else if (error == 0) { 3834 vput(vp); 3835 } 3836 vfs_unbusy(mp); 3837 } 3838 return(ncp->nc_error); 3839 } 3840 3841 /* 3842 * Clean out negative cache entries when too many have accumulated. 3843 */ 3844 static void 3845 _cache_cleanneg(long count) 3846 { 3847 struct pcpu_ncache *pn; 3848 struct namecache *ncp; 3849 static uint32_t neg_rover; 3850 uint32_t n; 3851 long vnegs; 3852 3853 n = neg_rover++; /* SMP heuristical, race ok */ 3854 cpu_ccfence(); 3855 n = n % (uint32_t)ncpus; 3856 3857 /* 3858 * Normalize vfscache_negs and count. count is sometimes based 3859 * on vfscache_negs. vfscache_negs is heuristical and can sometimes 3860 * have crazy values. 3861 */ 3862 vnegs = vfscache_negs; 3863 cpu_ccfence(); 3864 if (vnegs <= MINNEG) 3865 vnegs = MINNEG; 3866 if (count < 1) 3867 count = 1; 3868 3869 pn = &pcpu_ncache[n]; 3870 spin_lock(&pn->neg_spin); 3871 count = pn->neg_count * count / vnegs + 1; 3872 spin_unlock(&pn->neg_spin); 3873 3874 /* 3875 * Attempt to clean out the specified number of negative cache 3876 * entries. 3877 */ 3878 while (count > 0) { 3879 spin_lock(&pn->neg_spin); 3880 ncp = TAILQ_FIRST(&pn->neg_list); 3881 if (ncp == NULL) { 3882 spin_unlock(&pn->neg_spin); 3883 break; 3884 } 3885 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 3886 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 3887 _cache_hold(ncp); 3888 spin_unlock(&pn->neg_spin); 3889 3890 /* 3891 * This can race, so we must re-check that the ncp 3892 * is on the ncneg.list after successfully locking it. 3893 */ 3894 if (_cache_lock_special(ncp) == 0) { 3895 if (ncp->nc_vp == NULL && 3896 (ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3897 ncp = cache_zap(ncp, 1); 3898 if (ncp) 3899 _cache_drop(ncp); 3900 } else { 3901 _cache_unlock(ncp); 3902 _cache_drop(ncp); 3903 } 3904 } else { 3905 _cache_drop(ncp); 3906 } 3907 --count; 3908 } 3909 } 3910 3911 /* 3912 * Clean out positive cache entries when too many have accumulated. 3913 */ 3914 static void 3915 _cache_cleanpos(long count) 3916 { 3917 static volatile int rover; 3918 struct nchash_head *nchpp; 3919 struct namecache *ncp; 3920 int rover_copy; 3921 3922 /* 3923 * Attempt to clean out the specified number of negative cache 3924 * entries. 3925 */ 3926 while (count > 0) { 3927 rover_copy = ++rover; /* MPSAFEENOUGH */ 3928 cpu_ccfence(); 3929 nchpp = NCHHASH(rover_copy); 3930 3931 if (TAILQ_FIRST(&nchpp->list) == NULL) { 3932 --count; 3933 continue; 3934 } 3935 3936 /* 3937 * Cycle ncp on list, ignore and do not move DUMMY 3938 * ncps. These are temporary list iterators. 3939 * 3940 * We must cycle the ncp to the end of the list to 3941 * ensure that all ncp's have an equal chance of 3942 * being removed. 
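 *
 * For example (illustrative), a chain ordered [A, B, C] becomes
 * [B, C, A] once A has been examined, so successive passes walk the
 * whole chain instead of repeatedly hitting the same head entry.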
3943 */ 3944 spin_lock(&nchpp->spin); 3945 ncp = TAILQ_FIRST(&nchpp->list); 3946 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 3947 ncp = TAILQ_NEXT(ncp, nc_hash); 3948 if (ncp) { 3949 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 3950 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 3951 _cache_hold(ncp); 3952 } 3953 spin_unlock(&nchpp->spin); 3954 3955 if (ncp) { 3956 if (_cache_lock_special(ncp) == 0) { 3957 ncp = cache_zap(ncp, 1); 3958 if (ncp) 3959 _cache_drop(ncp); 3960 } else { 3961 _cache_drop(ncp); 3962 } 3963 } 3964 --count; 3965 } 3966 } 3967 3968 /* 3969 * This is a kitchen sink function to clean out ncps which we 3970 * tried to zap from cache_drop() but failed because we were 3971 * unable to acquire the parent lock. 3972 * 3973 * Such entries can also be removed via cache_inval_vp(), such 3974 * as when unmounting. 3975 */ 3976 static void 3977 _cache_cleandefered(void) 3978 { 3979 struct nchash_head *nchpp; 3980 struct namecache *ncp; 3981 struct namecache dummy; 3982 int i; 3983 3984 /* 3985 * Create a list iterator. DUMMY indicates that this is a list 3986 * iterator, DESTROYED prevents matches by lookup functions. 3987 */ 3988 numdefered = 0; 3989 bzero(&dummy, sizeof(dummy)); 3990 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 3991 dummy.nc_refs = 1; 3992 3993 for (i = 0; i <= nchash; ++i) { 3994 nchpp = &nchashtbl[i]; 3995 3996 spin_lock(&nchpp->spin); 3997 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 3998 ncp = &dummy; 3999 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 4000 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 4001 continue; 4002 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4003 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 4004 _cache_hold(ncp); 4005 spin_unlock(&nchpp->spin); 4006 if (_cache_lock_nonblock(ncp) == 0) { 4007 ncp->nc_flag &= ~NCF_DEFEREDZAP; 4008 _cache_unlock(ncp); 4009 } 4010 _cache_drop(ncp); 4011 spin_lock(&nchpp->spin); 4012 ncp = &dummy; 4013 } 4014 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 4015 spin_unlock(&nchpp->spin); 4016 } 4017 } 4018 4019 /* 4020 * Name cache initialization, from vfsinit() when we are booting 4021 */ 4022 void 4023 nchinit(void) 4024 { 4025 struct pcpu_ncache *pn; 4026 globaldata_t gd; 4027 int i; 4028 4029 /* 4030 * Per-cpu accounting and negative hit list 4031 */ 4032 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 4033 M_VFSCACHE, M_WAITOK|M_ZERO); 4034 for (i = 0; i < ncpus; ++i) { 4035 pn = &pcpu_ncache[i]; 4036 TAILQ_INIT(&pn->neg_list); 4037 spin_init(&pn->neg_spin, "ncneg"); 4038 } 4039 4040 /* 4041 * Initialise per-cpu namecache effectiveness statistics. 4042 */ 4043 for (i = 0; i < ncpus; ++i) { 4044 gd = globaldata_find(i); 4045 gd->gd_nchstats = &nchstats[i]; 4046 } 4047 4048 /* 4049 * Create a generous namecache hash table 4050 */ 4051 nchashtbl = hashinit_ext(vfs_inodehashsize(), 4052 sizeof(struct nchash_head), 4053 M_VFSCACHE, &nchash); 4054 for (i = 0; i <= (int)nchash; ++i) { 4055 TAILQ_INIT(&nchashtbl[i].list); 4056 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 4057 } 4058 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 4059 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 4060 nclockwarn = 5 * hz; 4061 } 4062 4063 /* 4064 * Called from start_init() to bootstrap the root filesystem. Returns 4065 * a referenced, unlocked namecache record. 
4066 */
4067 void
4068 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4069 {
4070 nch->ncp = cache_alloc(0);
4071 nch->mount = mp;
4072 _cache_mntref(mp);
4073 if (vp)
4074 _cache_setvp(nch->mount, nch->ncp, vp);
4075 }
4076
4077 /*
4078 * vfs_cache_setroot()
4079 *
4080 * Create an association between the root of our namecache and
4081 * the root vnode. This routine may be called several times during
4082 * booting.
4083 *
4084 * If the caller intends to save the returned namecache pointer somewhere
4085 * it must cache_hold() it.
4086 */
4087 void
4088 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4089 {
4090 struct vnode *ovp;
4091 struct nchandle onch;
4092
4093 ovp = rootvnode;
4094 onch = rootnch;
4095 rootvnode = nvp;
4096 if (nch)
4097 rootnch = *nch;
4098 else
4099 cache_zero(&rootnch);
4100 if (ovp)
4101 vrele(ovp);
4102 if (onch.ncp)
4103 cache_drop(&onch);
4104 }
4105
4106 /*
4107 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
4108 * topology and is being removed as quickly as possible. The new VOP_N*()
4109 * API calls are required to make specific adjustments using the supplied
4110 * ncp pointers rather than just bogusly purging random vnodes.
4111 *
4112 * Invalidate all namecache entries to a particular vnode as well as
4113 * any direct children of that vnode in the namecache. This is a
4114 * 'catch all' purge used by filesystems that do not know any better.
4115 *
4116 * Note that the linkage between the vnode and its namecache entries will
4117 * be removed, but the namecache entries themselves might stay put due to
4118 * active references from elsewhere in the system or due to the existence of
4119 * the children. The namecache topology is left intact even if we do not
4120 * know what the vnode association is. Such entries will be marked
4121 * NCF_UNRESOLVED.
4122 */ 4123 void 4124 cache_purge(struct vnode *vp) 4125 { 4126 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 4127 } 4128 4129 static int disablecwd; 4130 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 4131 "Disable getcwd"); 4132 4133 static u_long numcwdcalls; 4134 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0, 4135 "Number of current directory resolution calls"); 4136 static u_long numcwdfailnf; 4137 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0, 4138 "Number of current directory failures due to lack of file"); 4139 static u_long numcwdfailsz; 4140 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0, 4141 "Number of current directory failures due to large result"); 4142 static u_long numcwdfound; 4143 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0, 4144 "Number of current directory resolution successes"); 4145 4146 /* 4147 * MPALMOSTSAFE 4148 */ 4149 int 4150 sys___getcwd(struct __getcwd_args *uap) 4151 { 4152 u_int buflen; 4153 int error; 4154 char *buf; 4155 char *bp; 4156 4157 if (disablecwd) 4158 return (ENODEV); 4159 4160 buflen = uap->buflen; 4161 if (buflen == 0) 4162 return (EINVAL); 4163 if (buflen > MAXPATHLEN) 4164 buflen = MAXPATHLEN; 4165 4166 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4167 bp = kern_getcwd(buf, buflen, &error); 4168 if (error == 0) 4169 error = copyout(bp, uap->buf, strlen(bp) + 1); 4170 kfree(buf, M_TEMP); 4171 return (error); 4172 } 4173 4174 char * 4175 kern_getcwd(char *buf, size_t buflen, int *error) 4176 { 4177 struct proc *p = curproc; 4178 char *bp; 4179 int i, slash_prefixed; 4180 struct filedesc *fdp; 4181 struct nchandle nch; 4182 struct namecache *ncp; 4183 4184 numcwdcalls++; 4185 bp = buf; 4186 bp += buflen - 1; 4187 *bp = '\0'; 4188 fdp = p->p_fd; 4189 slash_prefixed = 0; 4190 4191 nch = fdp->fd_ncdir; 4192 ncp = nch.ncp; 4193 if (ncp) 4194 _cache_hold(ncp); 4195 4196 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4197 nch.mount != fdp->fd_nrdir.mount) 4198 ) { 4199 /* 4200 * While traversing upwards if we encounter the root 4201 * of the current mount we have to skip to the mount point 4202 * in the underlying filesystem. 4203 */ 4204 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4205 nch = nch.mount->mnt_ncmounton; 4206 _cache_drop(ncp); 4207 ncp = nch.ncp; 4208 if (ncp) 4209 _cache_hold(ncp); 4210 continue; 4211 } 4212 4213 /* 4214 * Prepend the path segment 4215 */ 4216 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4217 if (bp == buf) { 4218 numcwdfailsz++; 4219 *error = ERANGE; 4220 bp = NULL; 4221 goto done; 4222 } 4223 *--bp = ncp->nc_name[i]; 4224 } 4225 if (bp == buf) { 4226 numcwdfailsz++; 4227 *error = ERANGE; 4228 bp = NULL; 4229 goto done; 4230 } 4231 *--bp = '/'; 4232 slash_prefixed = 1; 4233 4234 /* 4235 * Go up a directory. This isn't a mount point so we don't 4236 * have to check again. 
4237 */ 4238 while ((nch.ncp = ncp->nc_parent) != NULL) { 4239 if (ncp_shared_lock_disable) 4240 _cache_lock(ncp); 4241 else 4242 _cache_lock_shared(ncp); 4243 if (nch.ncp != ncp->nc_parent) { 4244 _cache_unlock(ncp); 4245 continue; 4246 } 4247 _cache_hold(nch.ncp); 4248 _cache_unlock(ncp); 4249 break; 4250 } 4251 _cache_drop(ncp); 4252 ncp = nch.ncp; 4253 } 4254 if (ncp == NULL) { 4255 numcwdfailnf++; 4256 *error = ENOENT; 4257 bp = NULL; 4258 goto done; 4259 } 4260 if (!slash_prefixed) { 4261 if (bp == buf) { 4262 numcwdfailsz++; 4263 *error = ERANGE; 4264 bp = NULL; 4265 goto done; 4266 } 4267 *--bp = '/'; 4268 } 4269 numcwdfound++; 4270 *error = 0; 4271 done: 4272 if (ncp) 4273 _cache_drop(ncp); 4274 return (bp); 4275 } 4276 4277 /* 4278 * Thus begins the fullpath magic. 4279 * 4280 * The passed nchp is referenced but not locked. 4281 */ 4282 static int disablefullpath; 4283 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 4284 &disablefullpath, 0, 4285 "Disable fullpath lookups"); 4286 4287 int 4288 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 4289 char **retbuf, char **freebuf, int guess) 4290 { 4291 struct nchandle fd_nrdir; 4292 struct nchandle nch; 4293 struct namecache *ncp; 4294 struct mount *mp, *new_mp; 4295 char *bp, *buf; 4296 int slash_prefixed; 4297 int error = 0; 4298 int i; 4299 4300 *retbuf = NULL; 4301 *freebuf = NULL; 4302 4303 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 4304 bp = buf + MAXPATHLEN - 1; 4305 *bp = '\0'; 4306 if (nchbase) 4307 fd_nrdir = *nchbase; 4308 else if (p != NULL) 4309 fd_nrdir = p->p_fd->fd_nrdir; 4310 else 4311 fd_nrdir = rootnch; 4312 slash_prefixed = 0; 4313 nch = *nchp; 4314 ncp = nch.ncp; 4315 if (ncp) 4316 _cache_hold(ncp); 4317 mp = nch.mount; 4318 4319 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 4320 new_mp = NULL; 4321 4322 /* 4323 * If we are asked to guess the upwards path, we do so whenever 4324 * we encounter an ncp marked as a mountpoint. We try to find 4325 * the actual mountpoint by finding the mountpoint with this 4326 * ncp. 4327 */ 4328 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 4329 new_mp = mount_get_by_nc(ncp); 4330 } 4331 /* 4332 * While traversing upwards if we encounter the root 4333 * of the current mount we have to skip to the mount point. 4334 */ 4335 if (ncp == mp->mnt_ncmountpt.ncp) { 4336 new_mp = mp; 4337 } 4338 if (new_mp) { 4339 nch = new_mp->mnt_ncmounton; 4340 _cache_drop(ncp); 4341 ncp = nch.ncp; 4342 if (ncp) 4343 _cache_hold(ncp); 4344 mp = nch.mount; 4345 continue; 4346 } 4347 4348 /* 4349 * Prepend the path segment 4350 */ 4351 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4352 if (bp == buf) { 4353 kfree(buf, M_TEMP); 4354 error = ENOMEM; 4355 goto done; 4356 } 4357 *--bp = ncp->nc_name[i]; 4358 } 4359 if (bp == buf) { 4360 kfree(buf, M_TEMP); 4361 error = ENOMEM; 4362 goto done; 4363 } 4364 *--bp = '/'; 4365 slash_prefixed = 1; 4366 4367 /* 4368 * Go up a directory. This isn't a mount point so we don't 4369 * have to check again. 4370 * 4371 * We can only safely access nc_parent with ncp held locked. 
4372 */ 4373 while ((nch.ncp = ncp->nc_parent) != NULL) { 4374 _cache_lock(ncp); 4375 if (nch.ncp != ncp->nc_parent) { 4376 _cache_unlock(ncp); 4377 continue; 4378 } 4379 _cache_hold(nch.ncp); 4380 _cache_unlock(ncp); 4381 break; 4382 } 4383 _cache_drop(ncp); 4384 ncp = nch.ncp; 4385 } 4386 if (ncp == NULL) { 4387 kfree(buf, M_TEMP); 4388 error = ENOENT; 4389 goto done; 4390 } 4391 4392 if (!slash_prefixed) { 4393 if (bp == buf) { 4394 kfree(buf, M_TEMP); 4395 error = ENOMEM; 4396 goto done; 4397 } 4398 *--bp = '/'; 4399 } 4400 *retbuf = bp; 4401 *freebuf = buf; 4402 error = 0; 4403 done: 4404 if (ncp) 4405 _cache_drop(ncp); 4406 return(error); 4407 } 4408 4409 int 4410 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4411 char **freebuf, int guess) 4412 { 4413 struct namecache *ncp; 4414 struct nchandle nch; 4415 int error; 4416 4417 *freebuf = NULL; 4418 if (disablefullpath) 4419 return (ENODEV); 4420 4421 if (p == NULL) 4422 return (EINVAL); 4423 4424 /* vn is NULL, client wants us to use p->p_textvp */ 4425 if (vn == NULL) { 4426 if ((vn = p->p_textvp) == NULL) 4427 return (EINVAL); 4428 } 4429 spin_lock_shared(&vn->v_spin); 4430 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4431 if (ncp->nc_nlen) 4432 break; 4433 } 4434 if (ncp == NULL) { 4435 spin_unlock_shared(&vn->v_spin); 4436 return (EINVAL); 4437 } 4438 _cache_hold(ncp); 4439 spin_unlock_shared(&vn->v_spin); 4440 4441 nch.ncp = ncp; 4442 nch.mount = vn->v_mount; 4443 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4444 _cache_drop(ncp); 4445 return (error); 4446 } 4447 4448 void 4449 vfscache_rollup_cpu(struct globaldata *gd) 4450 { 4451 struct pcpu_ncache *pn; 4452 long count; 4453 4454 if (pcpu_ncache == NULL) 4455 return; 4456 pn = &pcpu_ncache[gd->gd_cpuid]; 4457 4458 if (pn->vfscache_count) { 4459 count = atomic_swap_long(&pn->vfscache_count, 0); 4460 atomic_add_long(&vfscache_count, count); 4461 } 4462 if (pn->vfscache_leafs) { 4463 count = atomic_swap_long(&pn->vfscache_leafs, 0); 4464 atomic_add_long(&vfscache_leafs, count); 4465 } 4466 if (pn->vfscache_negs) { 4467 count = atomic_swap_long(&pn->vfscache_negs, 0); 4468 atomic_add_long(&vfscache_negs, count); 4469 } 4470 } 4471 4472 #if 0 4473 static void 4474 vfscache_rollup_all(void) 4475 { 4476 int n; 4477 4478 for (n = 0; n < ncpus; ++n) 4479 vfscache_rollup_cpu(globaldata_find(n)); 4480 } 4481 #endif 4482
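/*
 * Worked example (assumed, round numbers) for the per-cpu scaling in
 * _cache_cleanneg() above: with roughly 20000 negative entries system
 * wide (vnegs), a request to clean 2000 of them, and 5000 negative
 * entries on the selected cpu's neg_list, the loop count becomes
 * 5000 * 2000 / 20000 + 1 = 501, i.e. approximately this cpu's
 * proportional share of the requested cleanup.
 */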
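/*
 * Illustration only (never compiled): a minimal sketch of the "dummy
 * iterator" pattern used by _cache_cleandefered() above.  A placeholder
 * element, flagged so scans skip it, is linked into the chain so the
 * chain spinlock can be dropped while one entry is processed, then the
 * scan resumes from the placeholder.  The structure and field names
 * (struct example_node, ex_link, ex_flags, EXF_*) are hypothetical and
 * exist only for this sketch.
 */
#if 0
#define EXF_DUMMY	0x0001		/* placeholder, skip in scans */
#define EXF_WORK	0x0002		/* entry needs processing */

struct example_node {
	TAILQ_ENTRY(example_node) ex_link;
	int ex_flags;
};
TAILQ_HEAD(example_list, example_node);

static void
example_scan(struct example_list *list, struct spinlock *spin)
{
	struct example_node dummy;
	struct example_node *node;

	bzero(&dummy, sizeof(dummy));
	dummy.ex_flags = EXF_DUMMY;

	spin_lock(spin);
	TAILQ_INSERT_HEAD(list, &dummy, ex_link);
	node = &dummy;
	while ((node = TAILQ_NEXT(node, ex_link)) != NULL) {
		if ((node->ex_flags & EXF_WORK) == 0)
			continue;
		/*
		 * Move the placeholder just past the node so the scan
		 * can continue from this point after the lock is
		 * re-acquired.
		 */
		TAILQ_REMOVE(list, &dummy, ex_link);
		TAILQ_INSERT_AFTER(list, node, &dummy, ex_link);
		spin_unlock(spin);

		/* ... process node without holding the chain lock ... */

		spin_lock(spin);
		node = &dummy;
	}
	TAILQ_REMOVE(list, &dummy, ex_link);
	spin_unlock(spin);
}
#endif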
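/*
 * Illustration only (never compiled): the paths produced by
 * kern_getcwd() and cache_fullpath() are assembled backwards, from the
 * end of the buffer toward the front, one "/<name>" component per
 * namecache level.  The returned pointer therefore points somewhere
 * inside the allocated buffer rather than at its start, which is why
 * callers copy from the returned pointer but free the original
 * allocation.  The helper below is a hypothetical stand-alone version
 * of that prepend step; it returns NULL when the buffer is exhausted,
 * corresponding to the ERANGE/ENOMEM cases above.
 */
#if 0
static char *
example_prepend(char *buf, char *bp, const char *name, int nlen)
{
	int i;

	for (i = nlen - 1; i >= 0; --i) {
		if (bp == buf)		/* out of space */
			return (NULL);
		*--bp = name[i];
	}
	if (bp == buf)
		return (NULL);
	*--bp = '/';
	return (bp);
}
#endif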
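/*
 * Illustration only (never compiled): typical consumption of
 * vn_fullpath()/cache_fullpath().  On success *retbuf points into the
 * buffer returned via *freebuf, so only freebuf is released, using
 * M_TEMP to match the kmalloc() in cache_fullpath() above.  The
 * wrapper function itself is hypothetical.
 */
#if 0
static void
example_print_path(struct proc *p, struct vnode *vp)
{
	char *retbuf;
	char *freebuf;
	int error;

	error = vn_fullpath(p, vp, &retbuf, &freebuf, 0);
	if (error == 0) {
		kprintf("vnode %p resolves to %s\n", vp, retbuf);
		kfree(freebuf, M_TEMP);
	} else {
		kprintf("vnode %p: no path (error %d)\n", vp, error);
	}
}
#endif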