1 /* 2 * Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 
39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
63 */ 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/kernel.h> 68 #include <sys/sysctl.h> 69 #include <sys/mount.h> 70 #include <sys/vnode.h> 71 #include <sys/malloc.h> 72 #include <sys/sysproto.h> 73 #include <sys/spinlock.h> 74 #include <sys/proc.h> 75 #include <sys/namei.h> 76 #include <sys/nlookup.h> 77 #include <sys/filedesc.h> 78 #include <sys/fnv_hash.h> 79 #include <sys/globaldata.h> 80 #include <sys/kern_syscall.h> 81 #include <sys/dirent.h> 82 #include <ddb/ddb.h> 83 84 #include <sys/spinlock2.h> 85 86 #define MAX_RECURSION_DEPTH 64 87 88 /* 89 * Random lookups in the cache are accomplished with a hash table using 90 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock. 91 * 92 * Negative entries may exist and correspond to resolved namecache 93 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT 94 * will be set if the entry corresponds to a whited-out directory entry 95 * (verses simply not finding the entry at all). pcpu_ncache[n].neg_list 96 * is locked via pcpu_ncache[n].neg_spin; 97 * 98 * MPSAFE RULES: 99 * 100 * (1) A ncp must be referenced before it can be locked. 101 * 102 * (2) A ncp must be locked in order to modify it. 103 * 104 * (3) ncp locks are always ordered child -> parent. That may seem 105 * backwards but forward scans use the hash table and thus can hold 106 * the parent unlocked when traversing downward. 107 * 108 * This allows insert/rename/delete/dot-dot and other operations 109 * to use ncp->nc_parent links. 110 * 111 * This also prevents a locked up e.g. NFS node from creating a 112 * chain reaction all the way back to the root vnode / namecache. 113 * 114 * (4) parent linkages require both the parent and child to be locked. 115 */ 116 117 /* 118 * Structures associated with name cacheing. 
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	16301	/* prime number */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

LIST_HEAD(nchash_list, namecache);

/*
 * One hash chain head.  Don't cachealign, but at least pad to 32 bytes
 * so entries don't cross a cache line.  Each chain is protected by its
 * own embedded spinlock.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;	/* 8 bytes */
	long	pad01;		/* 8 bytes */
};

/*
 * One slot of the mount-point cache (see ncmount_cache[] below).
 * Avoids atomic ops on frequently-traversed mount transitions.
 */
struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	int isneg;		/* if != 0 mp is originator and not target */
} __cachealign;

/*
 * Per-cpu namecache accounting.  neg_list/neg_count are protected by
 * neg_spin; the vfscache_* fields are per-cpu deltas aggregated by
 * sysctl/rollup code elsewhere in this file.
 */
struct pcpu_ncache {
	struct spinlock		neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long			neg_count;
	long			vfscache_negs;
	long			vfscache_count;
	long			vfscache_leafs;
} __cachealign;

static struct nchash_head	*nchashtbl;
static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
168 */ 169 static int ncvp_debug; 170 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 171 "Namecache debug level (0-3)"); 172 173 static u_long nchash; /* size of hash table */ 174 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 175 "Size of namecache hash table"); 176 177 static int ncnegflush = 10; /* burst for negative flush */ 178 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 179 "Batch flush negative entries"); 180 181 static int ncposflush = 10; /* burst for positive flush */ 182 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 183 "Batch flush positive entries"); 184 185 static int ncnegfactor = 16; /* ratio of negative entries */ 186 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 187 "Ratio of namecache negative entries"); 188 189 static int nclockwarn; /* warn on locked entries in ticks */ 190 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 191 "Warn on locked namecache entries in ticks"); 192 193 static int numdefered; /* number of cache entries allocated */ 194 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 195 "Number of cache entries allocated"); 196 197 static int ncposlimit; /* number of cache entries allocated */ 198 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 199 "Number of cache entries allocated"); 200 201 static int ncp_shared_lock_disable = 0; 202 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 203 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 204 205 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 206 "sizeof(struct vnode)"); 207 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 208 "sizeof(struct namecache)"); 209 210 static int ncmount_cache_enable = 1; 211 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 212 &ncmount_cache_enable, 0, "mount point cache"); 213 214 static 
__inline void _cache_drop(struct namecache *ncp); 215 static int cache_resolve_mp(struct mount *mp); 216 static struct vnode *cache_dvpref(struct namecache *ncp); 217 static void _cache_lock(struct namecache *ncp); 218 static void _cache_setunresolved(struct namecache *ncp); 219 static void _cache_cleanneg(long count); 220 static void _cache_cleanpos(long count); 221 static void _cache_cleandefered(void); 222 static void _cache_unlink(struct namecache *ncp); 223 #if 0 224 static void vfscache_rollup_all(void); 225 #endif 226 227 /* 228 * The new name cache statistics 229 */ 230 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 231 static long vfscache_negs; 232 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0, 233 "Number of negative namecache entries"); 234 static long vfscache_count; 235 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0, 236 "Number of namecaches entries"); 237 static long vfscache_leafs; 238 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0, 239 "Number of namecaches entries"); 240 241 struct nchstats nchstats[SMP_MAXCPU]; 242 /* 243 * Export VFS cache effectiveness statistics to user-land. 244 * 245 * The statistics are left for aggregation to user-land so 246 * neat things can be achieved, like observing per-CPU cache 247 * distribution. 
248 */ 249 static int 250 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 251 { 252 struct globaldata *gd; 253 int i, error; 254 255 error = 0; 256 for (i = 0; i < ncpus; ++i) { 257 gd = globaldata_find(i); 258 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 259 sizeof(struct nchstats)))) 260 break; 261 } 262 263 return (error); 264 } 265 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 266 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 267 268 static struct namecache *cache_zap(struct namecache *ncp, int nonblock); 269 270 /* 271 * Cache mount points and namecache records in order to avoid unnecessary 272 * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP 273 * performance and is particularly important on multi-socket systems to 274 * reduce cache-line ping-ponging. 275 * 276 * Try to keep the pcpu structure within one cache line (~64 bytes). 277 */ 278 #define MNTCACHE_COUNT 5 279 280 struct mntcache { 281 struct mount *mntary[MNTCACHE_COUNT]; 282 struct namecache *ncp1; 283 struct namecache *ncp2; 284 struct nchandle ncdir; 285 int iter; 286 int unused01; 287 } __cachealign; 288 289 static struct mntcache pcpu_mntcache[MAXCPU]; 290 291 static 292 void 293 _cache_mntref(struct mount *mp) 294 { 295 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 296 int i; 297 298 for (i = 0; i < MNTCACHE_COUNT; ++i) { 299 if (cache->mntary[i] != mp) 300 continue; 301 if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL)) 302 return; 303 } 304 atomic_add_int(&mp->mnt_refs, 1); 305 } 306 307 static 308 void 309 _cache_mntrel(struct mount *mp) 310 { 311 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 312 int i; 313 314 for (i = 0; i < MNTCACHE_COUNT; ++i) { 315 if (cache->mntary[i] == NULL) { 316 mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); 317 if (mp == NULL) 318 return; 319 } 320 } 321 i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT); 322 mp = atomic_swap_ptr((void 
*)&cache->mntary[i], mp); 323 if (mp) 324 atomic_add_int(&mp->mnt_refs, -1); 325 } 326 327 /* 328 * Clears all cached mount points on all cpus. This routine should only 329 * be called when we are waiting for a mount to clear, e.g. so we can 330 * unmount. 331 */ 332 void 333 cache_clearmntcache(void) 334 { 335 int n; 336 337 for (n = 0; n < ncpus; ++n) { 338 struct mntcache *cache = &pcpu_mntcache[n]; 339 struct namecache *ncp; 340 struct mount *mp; 341 int i; 342 343 for (i = 0; i < MNTCACHE_COUNT; ++i) { 344 if (cache->mntary[i]) { 345 mp = atomic_swap_ptr( 346 (void *)&cache->mntary[i], NULL); 347 if (mp) 348 atomic_add_int(&mp->mnt_refs, -1); 349 } 350 } 351 if (cache->ncp1) { 352 ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL); 353 if (ncp) 354 _cache_drop(ncp); 355 } 356 if (cache->ncp2) { 357 ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL); 358 if (ncp) 359 _cache_drop(ncp); 360 } 361 if (cache->ncdir.ncp) { 362 ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL); 363 if (ncp) 364 _cache_drop(ncp); 365 } 366 if (cache->ncdir.mount) { 367 mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL); 368 if (mp) 369 atomic_add_int(&mp->mnt_refs, -1); 370 } 371 } 372 } 373 374 375 /* 376 * Namespace locking. The caller must already hold a reference to the 377 * namecache structure in order to lock/unlock it. This function prevents 378 * the namespace from being created or destroyed by accessors other then 379 * the lock holder. 380 * 381 * Note that holding a locked namecache structure prevents other threads 382 * from making namespace changes (e.g. deleting or creating), prevents 383 * vnode association state changes by other threads, and prevents the 384 * namecache entry from being resolved or unresolved by other threads. 385 * 386 * An exclusive lock owner has full authority to associate/disassociate 387 * vnodes and resolve/unresolve the locked ncp. 388 * 389 * A shared lock owner only has authority to acquire the underlying vnode, 390 * if any. 
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
/*
 * Acquire the exclusive lock, blocking until it is obtained.  Recursive
 * acquisition by the current owner is allowed (nc_lockstatus counts).
 * Emits a diagnostic if blocked longer than nclockwarn ticks.
 */
static
void
_cache_lock(struct namecache *ncp)
{
	thread_t td;
	int didwarn;
	int begticks;
	int error;
	u_int count;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;
	begticks = 0;
	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		/*
		 * Lock not held (ignoring waiter-request bits): try to
		 * take the first count.
		 */
		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		/*
		 * Recursive exclusive lock by the same thread: just bump
		 * the count (cannot be a shared lock, assert that).
		 */
		if (ncp->nc_locktd == td) {
			KKASSERT((count & NC_SHLOCK_FLAG) == 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		/*
		 * Contended: set NC_EXLOCK_REQ with a tsleep interlock so
		 * the unlocking thread's wakeup cannot be lost, then sleep.
		 */
		tsleep_interlock(&ncp->nc_locktd, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_EXLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		if (begticks == 0)
			begticks = ticks;
		error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
			       "clock", nclockwarn);
		if (error == EWOULDBLOCK) {
			if (didwarn == 0) {
				didwarn = ticks;
				kprintf("[diagnostic] cache_lock: "
					"%s blocked on %p %08x",
					td->td_comm, ncp, count);
				kprintf(" \"%*.*s\"\n",
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after "
			"%d secs\n",
			td->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks + (hz / 2) - begticks) / hz);
	}
}

/*
 * The shared lock works similarly to the exclusive lock except
 * nc_locktd is left NULL and we need an interlock (VHOLD) to
 * prevent vhold() races, since the moment our cmpset_int succeeds
 * another cpu can come in and get its own shared lock.
 *
 * A critical section is needed to prevent interruption during the
 * VHOLD interlock.
 */
static
void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn;
	int error;
	u_int count;
	u_int optreq = NC_EXLOCK_REQ;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		/*
		 * Unlocked: take the first shared count with the VHOLD
		 * interlock set so other shared lockers wait for our
		 * vhold() to complete.
		 */
		if ((count & ~NC_SHLOCK_REQ) == 0) {
			crit_enter();
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
				      count,
				      (count + 1) | NC_SHLOCK_FLAG |
						     NC_SHLOCK_VHOLD)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				atomic_clear_int(&ncp->nc_lockstatus,
						 NC_SHLOCK_VHOLD);
				crit_exit();
				break;
			}
			/* cmpset failed */
			crit_exit();
			continue;
		}

		/*
		 * If already held shared we can just bump the count, but
		 * only allow this if nobody is trying to get the lock
		 * exclusively.  If we are blocking too long ignore excl
		 * requests (which can race/deadlock us).
		 *
		 * VHOLD is a bit of a hack.  Even though we successfully
		 * added another shared ref, the cpu that got the first
		 * shared ref might not yet have held the vnode.
		 */
		if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
					cpu_pause();
				break;
			}
			continue;
		}
		/*
		 * Contended: request a shared wakeup via NC_SHLOCK_REQ
		 * with the tsleep interlock, then sleep.
		 */
		tsleep_interlock(ncp, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_SHLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
		if (error == EWOULDBLOCK) {
			/* stop deferring to exclusive requesters */
			optreq = 0;
			if (didwarn == 0) {
				didwarn = ticks - nclockwarn;
				kprintf("[diagnostic] cache_lock_shared: "
					"%s blocked on %p %08x",
					curthread->td_comm, ncp, count);
				kprintf(" \"%*.*s\"\n",
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Lock ncp exclusively, return 0 on success.
 *
 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 *	 such as the case where one of its children is locked.
 */
static
int
_cache_lock_nonblock(struct namecache *ncp)
{
	thread_t td;
	u_int count;

	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;

		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		if (ncp->nc_locktd == td) {
			/* recursive exclusive acquisition */
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		return(EWOULDBLOCK);
	}
	return(0);
}

/*
 * The shared lock works similarly to the exclusive lock except
 * nc_locktd is left NULL and we need an interlock (VHOLD) to
 * prevent vhold() races, since the moment our cmpset_int succeeds
 * another cpu can come in and get its own shared lock.
 *
 * A critical section is needed to prevent interruption during the
 * VHOLD interlock.
 */
static
int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	u_int count;

	for (;;) {
		count = ncp->nc_lockstatus;

		if ((count & ~NC_SHLOCK_REQ) == 0) {
			crit_enter();
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
				      count,
				      (count + 1) | NC_SHLOCK_FLAG |
						     NC_SHLOCK_VHOLD)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				atomic_clear_int(&ncp->nc_lockstatus,
						 NC_SHLOCK_VHOLD);
				crit_exit();
				break;
			}
			/* cmpset failed */
			crit_exit();
			continue;
		}

		/*
		 * If already held shared we can just bump the count, but
		 * only allow this if nobody is trying to get the lock
		 * exclusively.
		 *
		 * VHOLD is a bit of a hack.  Even though we successfully
		 * added another shared ref, the cpu that got the first
		 * shared ref might not yet have held the vnode.
		 */
		if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
		    NC_SHLOCK_FLAG) {
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
					cpu_pause();
				break;
			}
			continue;
		}
		return(EWOULDBLOCK);
	}
	return(0);
}

/*
 * Helper function
 *
 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 *
 *	 nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
 */
static
void
_cache_unlock(struct namecache *ncp)
{
	thread_t td __debugvar = curthread;
	u_int count;
	u_int ncount;
	struct vnode *dropvp;

	KKASSERT(ncp->nc_refs >= 0);
	KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
	KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);

	count = ncp->nc_lockstatus;
	cpu_ccfence();

	/*
	 * Clear nc_locktd prior to the atomic op (excl lock only)
	 */
	if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
		ncp->nc_locktd = NULL;
	dropvp = NULL;

	for (;;) {
		if ((count &
		     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
			/*
			 * Last lock count: release the vp hold and wake
			 * up any waiters recorded in the request bits.
			 */
			dropvp = ncp->nc_vp;
			if (count & NC_EXLOCK_REQ)
				ncount = count & NC_SHLOCK_REQ;	/* cnt->0 */
			else
				ncount = 0;

			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, ncount)) {
				if (count & NC_EXLOCK_REQ)
					wakeup(&ncp->nc_locktd);
				else if (count & NC_SHLOCK_REQ)
					wakeup(ncp);
				break;
			}
			dropvp = NULL;
		} else {
			/* recursive/extra count: just decrement */
			KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 1);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count - 1)) {
				break;
			}
		}
		count = ncp->nc_lockstatus;
		cpu_ccfence();
	}

	/*
	 * Don't actually drop the vp until we successfully clean out
	 * the lock, otherwise we may race another shared lock.
	 */
	if (dropvp)
		vdrop(dropvp);
}

/*
 * Report how the calling thread holds the ncp: LK_EXCLUSIVE, LK_SHARED,
 * or -1 when it is not the (exclusive) holder and no shared lock is set.
 */
static
int
_cache_lockstatus(struct namecache *ncp)
{
	if (ncp->nc_locktd == curthread)
		return(LK_EXCLUSIVE);
	if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
		return(LK_SHARED);
	return(-1);
}

/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already at least 1.
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	atomic_add_int(&ncp->nc_refs, 1);
	return(ncp);
}

/*
 * Drop a cache entry, taking care to deal with races.
 *
 * For potential 1->0 transitions we must hold the ncp lock to safely
 * test its flags.  An unresolved entry with no children must be zapped
 * to avoid leaks.
 *
 * The call to cache_zap() itself will handle all remaining races and
 * will decrement the ncp's refs regardless.  If we are resolved or
 * have children nc_refs can safely be dropped to 0 without having to
 * zap the entry.
 *
 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 *
 * NOTE: cache_zap() may return a non-NULL referenced parent which must
 *	 be dropped in a loop.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	int refs;

	while (ncp) {
		KKASSERT(ncp->nc_refs > 0);
		refs = ncp->nc_refs;

		if (refs == 1) {
			/*
			 * Potential 1->0 transition; must lock to test
			 * flags.  If we cannot lock, retry (another
			 * holder may appear or disappear).
			 */
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				if ((ncp->nc_flag & NCF_UNRESOLVED) &&
				    TAILQ_EMPTY(&ncp->nc_list)) {
					/* may return referenced parent */
					ncp = cache_zap(ncp, 1);
					continue;
				}
				if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
					_cache_unlock(ncp);
					break;
				}
				_cache_unlock(ncp);
			}
		} else {
			if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
				break;
		}
		cpu_pause();
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.
 *
 * NOTE: The hash table spinlock is held during this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		/* parent was a leaf, it no longer is one */
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);
		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * If this is the last child of the parent the cache_drop(par) will
 * attempt to recursively zap the parent.
 *
 * ncp must be locked.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
	struct namecache *par;
	struct vnode *dropvp;

	if ((par = ncp->nc_parent) != NULL) {
		KKASSERT(ncp->nc_parent == par);
		_cache_hold(par);
		_cache_lock(par);
		spin_lock(&ncp->nc_head->spin);

		/*
		 * Remove from hash table and parent, adjust accounting
		 */
		LIST_REMOVE(ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_count, -1);
		if (TAILQ_EMPTY(&ncp->nc_list))
			atomic_add_long(&pn->vfscache_leafs, -1);

		dropvp = NULL;
		if (TAILQ_EMPTY(&par->nc_list)) {
			/* parent became a leaf; defer the vdrop */
			atomic_add_long(&pn->vfscache_leafs, 1);
			if (par->nc_vp)
				dropvp = par->nc_vp;
		}
		spin_unlock(&ncp->nc_head->spin);
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		_cache_unlock(par);
		_cache_drop(par);

		/*
		 * We can only safely vdrop with no spinlocks held.
		 */
		if (dropvp)
			vdrop(dropvp);
	}
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * The new entry is returned referenced, exclusively locked, and in the
 * UNRESOLVED state.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;

	TAILQ_INIT(&ncp->nc_list);
	_cache_lock(ncp);
	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
1006 */ 1007 static void 1008 _cache_free(struct namecache *ncp) 1009 { 1010 KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1); 1011 if (ncp->nc_name) 1012 kfree(ncp->nc_name, M_VFSCACHE); 1013 kfree(ncp, M_VFSCACHE); 1014 } 1015 1016 /* 1017 * [re]initialize a nchandle. 1018 */ 1019 void 1020 cache_zero(struct nchandle *nch) 1021 { 1022 nch->ncp = NULL; 1023 nch->mount = NULL; 1024 } 1025 1026 /* 1027 * Ref and deref a namecache structure. 1028 * 1029 * The caller must specify a stable ncp pointer, typically meaning the 1030 * ncp is already referenced but this can also occur indirectly through 1031 * e.g. holding a lock on a direct child. 1032 * 1033 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 1034 * use read spinlocks here. 1035 */ 1036 struct nchandle * 1037 cache_hold(struct nchandle *nch) 1038 { 1039 _cache_hold(nch->ncp); 1040 _cache_mntref(nch->mount); 1041 return(nch); 1042 } 1043 1044 /* 1045 * Create a copy of a namecache handle for an already-referenced 1046 * entry. 1047 */ 1048 void 1049 cache_copy(struct nchandle *nch, struct nchandle *target) 1050 { 1051 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 1052 struct namecache *ncp; 1053 1054 *target = *nch; 1055 _cache_mntref(target->mount); 1056 ncp = target->ncp; 1057 if (ncp) { 1058 if (ncp == cache->ncp1) { 1059 if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL)) 1060 return; 1061 } 1062 if (ncp == cache->ncp2) { 1063 if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL)) 1064 return; 1065 } 1066 _cache_hold(ncp); 1067 } 1068 } 1069 1070 /* 1071 * Caller wants to copy the current directory, copy it out from our 1072 * pcpu cache if possible (the entire critical path is just two localized 1073 * cmpset ops). If the pcpu cache has a snapshot at all it will be a 1074 * valid one, so we don't have to lock p->p_fd even though we are loading 1075 * two fields. 
 *
 * This has a limited effect since nlookup must still ref and shlock the
 * vnode to check perms.  We do avoid the per-proc spin-lock though, which
 * can aid threaded programs.
 */
void
cache_copy_ncdir(struct proc *p, struct nchandle *target)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];

	*target = p->p_fd->fd_ncdir;
	if (target->ncp == cache->ncdir.ncp &&
	    target->mount == cache->ncdir.mount) {
		/*
		 * Both fields match the pcpu snapshot, try to consume the
		 * two parked refs with two localized cmpset ops.
		 */
		if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp,
				      target->ncp, NULL)) {
			if (atomic_cmpset_ptr((void *)&cache->ncdir.mount,
					      target->mount, NULL)) {
				/* CRITICAL PATH */
				return;
			}
			/* lost the mount race, undo the consumed ncp ref */
			_cache_drop(target->ncp);
		}
	}

	/*
	 * Fall back to the locked path.
	 */
	spin_lock_shared(&p->p_fd->fd_spin);
	cache_copy(&p->p_fd->fd_ncdir, target);
	spin_unlock_shared(&p->p_fd->fd_spin);
}

/*
 * Switch the mount associated with an nchandle, transferring the mount
 * ref from the old mount to the new one.
 */
void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	_cache_mntref(mp);
	_cache_mntrel(nch->mount);
	nch->mount = mp;
}

/*
 * Release an nchandle (ncp ref and mount ref) and zero it out.
 */
void
cache_drop(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Drop the nchandle, but try to cache the ref to avoid global atomic
 * ops.  This is typically done on the system root and jail root nchandles.
 */
void
cache_drop_and_cache(struct nchandle *nch)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	struct namecache *ncp;

	_cache_mntrel(nch->mount);
	ncp = nch->ncp;

	/*
	 * Park the ref in an empty pcpu slot if possible.  Otherwise
	 * evict one of the two slots (alternating via iter) and drop
	 * whichever ref we end up holding after the swap.
	 */
	if (cache->ncp1 == NULL) {
		ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
		if (ncp == NULL)
			goto done;
	}
	if (cache->ncp2 == NULL) {
		ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
		if (ncp == NULL)
			goto done;
	}
	if (++cache->iter & 1)
		ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
	else
		ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
	if (ncp)
		_cache_drop(ncp);
done:
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * We are dropping what the caller believes is the current directory,
 * unconditionally store it in our pcpu cache.  Anything already in
 * the cache will be discarded.
 */
void
cache_drop_ncdir(struct nchandle *nch)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];

	nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp);
	nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount);
	if (nch->ncp)
		_cache_drop(nch->ncp);
	if (nch->mount)
		_cache_mntrel(nch->mount);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Return the lock status of the underlying ncp.
 */
int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

/*
 * Exclusively lock the underlying ncp.
 */
void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}

/*
 * Lock the ncp, shared if possible.  A shared lock is only attempted when
 * shared locks are enabled, the caller did not demand exclusive, and the
 * ncp is resolved.  If after acquiring the shared lock the ncp turns out
 * to be unresolved, or its vnode is being reclaimed, relock exclusively.
 */
void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				/* vnode reclaimed, need the exclusive lock */
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			/* became unresolved under the shared lock */
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}

/*
 * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 * is responsible for checking both for validity on return as they
 * may have become invalid.
 *
 * We have to deal with potential deadlocks here, just ping pong
 * the lock until we get it (we will always block somewhere when
 * looping so this is not cpu-intensive).
 *
 * which = 0	nch1 not locked, nch2 is locked
 * which = 1	nch1 is locked, nch2 is not locked
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
	     struct nchandle *nch2, struct ucred *cred2)
{
	int which;

	which = 0;

	for (;;) {
		if (which == 0) {
			if (cache_lock_nonblock(nch1) == 0) {
				cache_resolve(nch1, cred1);
				break;
			}
			/* would deadlock, release the other lock and block */
			cache_unlock(nch2);
			cache_lock(nch1);
			cache_resolve(nch1, cred1);
			which = 1;
		} else {
			if (cache_lock_nonblock(nch2) == 0) {
				cache_resolve(nch2, cred2);
				break;
			}
			cache_unlock(nch1);
			cache_lock(nch2);
			cache_resolve(nch2, cred2);
			which = 0;
		}
	}
}

int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	/* if the associated vnode is being reclaimed, force re-resolution */
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return(ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
 */
static
struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			/*
			 * Vnode reclaimed, upgrade via _cache_get().  It
			 * gains an extra ref which we balance with the drop.
			 */
			_cache_unlock(ncp);
			ncp = _cache_get(ncp);
			_cache_drop(ncp);
		}
	} else {
		/* became unresolved, need the exclusive lock */
		_cache_unlock(ncp);
		ncp = _cache_get(ncp);
		_cache_drop(ncp);
	}
	return(ncp);
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		/* only accept a non-recursive lock (count of exactly 1) */
		if ((ncp->nc_lockstatus &
		     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return(0);
		}
		_cache_unlock(ncp);
	}
	return(EWOULDBLOCK);
}

/*
 * This function tries to get a shared lock but will back-off to an exclusive
 * lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 */
static int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return(0);
			}
		}
		_cache_unlock(ncp);
		return(EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (ncp->nc_locktd == curthread) {
		_cache_lock(ncp);
		return(0);
	}
	_cache_lock_shared(ncp);
	return(0);
}


/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	_cache_mntref(target->mount);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	_cache_mntref(target->mount);
}

/*
 * Release the lock and then the ref on an ncp.
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

/*
 * Unlock and release an nchandle (ncp lock+ref, mount ref), then zero it.
 */
void
cache_put(struct nchandle *nch)
{
	_cache_mntrel(nch->mount);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	if (vp != NULL) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		spin_unlock(&vp->v_spin);
		if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
			vhold(vp);

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}
		ncp->nc_error = 0;
		/* XXX: this is a hack to work-around the lack of a real pfs vfs
		 * implementation*/
		if (mp != NULL)
			if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
				vp->v_pfsmp = mp;
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

		ncp->nc_vp = NULL;
		ncp->nc_negcpu = mycpu->gd_cpuid;
		spin_lock(&pn->neg_spin);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		++pn->neg_count;
		spin_unlock(&pn->neg_spin);
		atomic_add_long(&pn->vfscache_negs, 1);

		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

/*
 * Resolve the ncp in the nchandle with the passed vnode (a NULL vnode
 * creates a negative cache entry).  The ncp must be locked exclusively.
 */
void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * Set a timeout (in ticks) on a resolved ncp, after which the auto
 * unresolve code will mark it unresolved again.  A value of 0 means
 * "no timeout", so avoid it by bumping to 1.
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 *
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with a locked
			 * ncp is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
				vdrop(vp);
		} else {
			/*
			 * Negative cache entry, remove it from the per-cpu
			 * negative list it was placed on by _cache_setvp().
			 */
			struct pcpu_ncache *pn;

			pn = &pcpu_ncache[ncp->nc_negcpu];

			atomic_add_long(&pn->vfscache_negs, -1);
			spin_lock(&pn->neg_spin);
			TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
			--pn->neg_count;
			spin_unlock(&pn->neg_spin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
	}
}

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 *
 * Returns non-zero if the entry should be unresolved, 0 if it is good.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

/*
 * Mark the ncp in the nchandle unresolved (see _cache_setunresolved()).
 */
void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
 * with a mount point.
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather then trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been reresolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;	/* saved deep node to resume from */
	int depth;			/* current recursion depth */
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;

		/*
		 * Deep recursion was aborted; continue depth-first from
		 * the saved resume point with ncp unlocked, then restart
		 * the invalidation from scratch.
		 */
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			_cache_put(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}
	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			/*
			 * Too deep, save a resume point and tell the
			 * caller to restart (DEEP RECURSION HANDLING).
			 */
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {
				_cache_lock(kid);
				if (kid->nc_parent != ncp) {
					kprintf("cache_inval_internal "
						"restartB %s\n",
						ncp->nc_name);
					restart = 1;
					_cache_unlock(kid);
					_cache_drop(kid);
					_cache_lock(ncp);
					break;
				}

				rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
				_cache_unlock(kid);
			}
			_cache_drop(kid);
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * This routine is used instead of the
normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			/* would block, give up (non-blocking semantics) */
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * Clears the universal directory search 'ok' flag.  This flag allows
 * nlookup() to bypass normal vnode checks.  This flag is a cached flag
 * so clearing it simply forces revalidation.
 */
void
cache_inval_wxok(struct vnode *vp)
{
	struct namecache *ncp;

	spin_lock(&vp->v_spin);
	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (ncp->nc_flag & NCF_WXOK)
			atomic_clear_short(&ncp->nc_flag, NCF_WXOK);
	}
	spin_unlock(&vp->v_spin);
}

/*
 * The source ncp has been renamed to the target ncp.
Both fncp and tncp
 * must be locked.  The target ncp is destroyed (as a normal rename-over
 * would destroy the target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
	struct namecache *fncp = fnch->ncp;
	struct namecache *tncp = tnch->ncp;
	struct namecache *tncp_par;
	struct nchash_head *nchpp;
	u_int32_t hash;
	char *oname;
	char *nname;

	++fncp->nc_generation;
	++tncp->nc_generation;

	/*
	 * Make a private, zero-terminated copy of the target's name so
	 * fncp can adopt it.
	 */
	if (tncp->nc_nlen) {
		nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
		bcopy(tncp->nc_name, nname, tncp->nc_nlen);
		nname[tncp->nc_nlen] = 0;
	} else {
		nname = NULL;
	}

	/*
	 * Rename fncp (unlink)
	 */
	_cache_unlink_parent(fncp);
	oname = fncp->nc_name;
	fncp->nc_name = nname;
	fncp->nc_nlen = tncp->nc_nlen;
	if (oname)
		kfree(oname, M_VFSCACHE);

	tncp_par = tncp->nc_parent;
	_cache_hold(tncp_par);
	_cache_lock(tncp_par);

	/*
	 * Rename fncp (relink) under the target's parent, rehashing on
	 * the new name and parent.
	 */
	hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
	nchpp = NCHHASH(hash);

	spin_lock(&nchpp->spin);
	_cache_link_parent(fncp, tncp_par, nchpp);
	spin_unlock(&nchpp->spin);

	_cache_put(tncp_par);

	/*
	 * Get rid of the overwritten tncp (unlink)
	 */
	_cache_unlink(tncp);
}

/*
 * Perform actions consistent with unlinking a file.  The passed-in ncp
 * must be locked.
 *
 * The ncp is marked DESTROYED so it no longer shows up in searches,
 * and will be physically deleted when the vnode goes away.
 *
 * If the related vnode has no refs then we cycle it through vget()/vput()
 * to (possibly if we don't have a ref race) trigger a deactivation,
 * allowing the VFS to trivially detect and recycle the deleted vnode
 * via VOP_INACTIVE().
 *
 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
 *	 target ncp.
 */
void
cache_unlink(struct nchandle *nch)
{
	_cache_unlink(nch->ncp);
}

static void
_cache_unlink(struct namecache *ncp)
{
	struct vnode *vp;

	/*
	 * Causes lookups to fail and allows another ncp with the same
	 * name to be created under ncp->nc_parent.
	 */
	ncp->nc_flag |= NCF_DESTROYED;
	++ncp->nc_generation;

	/*
	 * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
	 * force action on the 1->0 transition.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL) {
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		if (VREFCNT(vp) <= 0) {
			if (vget(vp, LK_SHARED) == 0)
				vput(vp);
		}
	}
}

/*
 * Return non-zero if the nch might be associated with an open and/or mmap()'d
 * file.  The easy solution is to just return non-zero if the vnode has refs.
 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
 * force the reclaim).
 */
int
cache_isopen(struct nchandle *nch)
{
	struct vnode *vp;
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL &&
	    VREFCNT(vp)) {
		return 1;
	}
	return 0;
}


/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.  If
 * the ncp is resolved it might be locked shared.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.
A ref'd, possibly locked 2121 * (depending on the passed lk_type) will be returned in *vpp with an error 2122 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2123 * most typical error is ENOENT, meaning that the ncp represents a negative 2124 * cache hit and there is no vnode to retrieve, but other errors can occur 2125 * too. 2126 * 2127 * The vget() can race a reclaim. If this occurs we re-resolve the 2128 * namecache entry. 2129 * 2130 * There are numerous places in the kernel where vget() is called on a 2131 * vnode while one or more of its namecache entries is locked. Releasing 2132 * a vnode never deadlocks against locked namecache entries (the vnode 2133 * will not get recycled while referenced ncp's exist). This means we 2134 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2135 * lock when acquiring the vp lock or we might cause a deadlock. 2136 * 2137 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2138 * unresolved. If a reclaim race occurs the passed-in ncp will be 2139 * relocked exclusively before being re-resolved. 2140 */ 2141 int 2142 cache_vget(struct nchandle *nch, struct ucred *cred, 2143 int lk_type, struct vnode **vpp) 2144 { 2145 struct namecache *ncp; 2146 struct vnode *vp; 2147 int error; 2148 2149 ncp = nch->ncp; 2150 again: 2151 vp = NULL; 2152 if (ncp->nc_flag & NCF_UNRESOLVED) 2153 error = cache_resolve(nch, cred); 2154 else 2155 error = 0; 2156 2157 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2158 error = vget(vp, lk_type); 2159 if (error) { 2160 /* 2161 * VRECLAIM race 2162 * 2163 * The ncp may have been locked shared, we must relock 2164 * it exclusively before we can set it to unresolved. 
2165 */ 2166 if (error == ENOENT) { 2167 kprintf("Warning: vnode reclaim race detected " 2168 "in cache_vget on %p (%s)\n", 2169 vp, ncp->nc_name); 2170 _cache_unlock(ncp); 2171 _cache_lock(ncp); 2172 _cache_setunresolved(ncp); 2173 goto again; 2174 } 2175 2176 /* 2177 * Not a reclaim race, some other error. 2178 */ 2179 KKASSERT(ncp->nc_vp == vp); 2180 vp = NULL; 2181 } else { 2182 KKASSERT(ncp->nc_vp == vp); 2183 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2184 } 2185 } 2186 if (error == 0 && vp == NULL) 2187 error = ENOENT; 2188 *vpp = vp; 2189 return(error); 2190 } 2191 2192 /* 2193 * Similar to cache_vget() but only acquires a ref on the vnode. 2194 * 2195 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2196 * unresolved. If a reclaim race occurs the passed-in ncp will be 2197 * relocked exclusively before being re-resolved. 2198 */ 2199 int 2200 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2201 { 2202 struct namecache *ncp; 2203 struct vnode *vp; 2204 int error; 2205 2206 ncp = nch->ncp; 2207 again: 2208 vp = NULL; 2209 if (ncp->nc_flag & NCF_UNRESOLVED) 2210 error = cache_resolve(nch, cred); 2211 else 2212 error = 0; 2213 2214 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2215 error = vget(vp, LK_SHARED); 2216 if (error) { 2217 /* 2218 * VRECLAIM race 2219 */ 2220 if (error == ENOENT) { 2221 kprintf("Warning: vnode reclaim race detected " 2222 "in cache_vget on %p (%s)\n", 2223 vp, ncp->nc_name); 2224 _cache_unlock(ncp); 2225 _cache_lock(ncp); 2226 _cache_setunresolved(ncp); 2227 goto again; 2228 } 2229 2230 /* 2231 * Not a reclaim race, some other error. 
2232 */ 2233 KKASSERT(ncp->nc_vp == vp); 2234 vp = NULL; 2235 } else { 2236 KKASSERT(ncp->nc_vp == vp); 2237 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2238 /* caller does not want a lock */ 2239 vn_unlock(vp); 2240 } 2241 } 2242 if (error == 0 && vp == NULL) 2243 error = ENOENT; 2244 *vpp = vp; 2245 return(error); 2246 } 2247 2248 /* 2249 * Return a referenced vnode representing the parent directory of 2250 * ncp. 2251 * 2252 * Because the caller has locked the ncp it should not be possible for 2253 * the parent ncp to go away. However, the parent can unresolve its 2254 * dvp at any time so we must be able to acquire a lock on the parent 2255 * to safely access nc_vp. 2256 * 2257 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2258 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2259 * getting destroyed. 2260 * 2261 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2262 * lock on the ncp in question.. 2263 */ 2264 static struct vnode * 2265 cache_dvpref(struct namecache *ncp) 2266 { 2267 struct namecache *par; 2268 struct vnode *dvp; 2269 2270 dvp = NULL; 2271 if ((par = ncp->nc_parent) != NULL) { 2272 _cache_hold(par); 2273 _cache_lock(par); 2274 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2275 if ((dvp = par->nc_vp) != NULL) 2276 vhold(dvp); 2277 } 2278 _cache_unlock(par); 2279 if (dvp) { 2280 if (vget(dvp, LK_SHARED) == 0) { 2281 vn_unlock(dvp); 2282 vdrop(dvp); 2283 /* return refd, unlocked dvp */ 2284 } else { 2285 vdrop(dvp); 2286 dvp = NULL; 2287 } 2288 } 2289 _cache_drop(par); 2290 } 2291 return(dvp); 2292 } 2293 2294 /* 2295 * Convert a directory vnode to a namecache record without any other 2296 * knowledge of the topology. This ONLY works with directory vnodes and 2297 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2298 * returned ncp (if not NULL) will be held and unlocked. 2299 * 2300 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 
 * If 'makeit' is 1 we attempt to track-down and create the namecache topology
 * for dvp.  This will fail only if the directory has been deleted out from
 * under the caller.
 *
 * Callers must always check for a NULL return no matter the value of 'makeit'.
 *
 * To avoid underflowing the kernel stack each recursive call increments
 * the makeit variable.
 */

static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
                                  struct vnode *dvp, char *fakename);
static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
                             struct vnode **saved_dvp);

int
cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
              struct nchandle *nch)
{
        struct vnode *saved_dvp;
        struct vnode *pvp;
        char *fakename;
        int error;

        nch->ncp = NULL;
        nch->mount = dvp->v_mount;
        saved_dvp = NULL;
        fakename = NULL;

        /*
         * Handle the makeit == 0 degenerate case: just return any
         * existing ncp associated with dvp, held, or NULL.
         */
        if (makeit == 0) {
                spin_lock_shared(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp)
                        cache_hold(nch);
                spin_unlock_shared(&dvp->v_spin);
        }

        /*
         * Loop until resolution, inside code will break out on error.
         */
        while (makeit) {
                /*
                 * Break out if we successfully acquire a working ncp.
                 */
                spin_lock_shared(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp) {
                        cache_hold(nch);
                        spin_unlock_shared(&dvp->v_spin);
                        break;
                }
                spin_unlock_shared(&dvp->v_spin);

                /*
                 * If dvp is the root of its filesystem it should already
                 * have a namecache pointer associated with it as a side
                 * effect of the mount, but it may have been disassociated.
                 */
                if (dvp->v_flag & VROOT) {
                        nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
                        error = cache_resolve_mp(nch->mount);
                        _cache_put(nch->ncp);
                        if (ncvp_debug) {
                                kprintf("cache_fromdvp: resolve root of mount %p error %d",
                                        dvp->v_mount, error);
                        }
                        if (error) {
                                if (ncvp_debug)
                                        kprintf(" failed\n");
                                nch->ncp = NULL;
                                break;
                        }
                        if (ncvp_debug)
                                kprintf(" succeeded\n");
                        /* loop to re-check dvp->v_namecache */
                        continue;
                }

                /*
                 * If we are recursed too deeply resort to an O(n^2)
                 * algorithm to resolve the namecache topology.  The
                 * resolved pvp is left referenced in saved_dvp to
                 * prevent the tree from being destroyed while we loop.
                 */
                if (makeit > 20) {
                        error = cache_fromdvp_try(dvp, cred, &saved_dvp);
                        if (error) {
                                kprintf("lookupdotdot(longpath) failed %d "
                                        "dvp %p\n", error, dvp);
                                nch->ncp = NULL;
                                break;
                        }
                        continue;
                }

                /*
                 * Get the parent directory and resolve its ncp.
                 * A previous iteration's fakename (if any) is released
                 * before vop_nlookupdotdot() possibly allocates a new one.
                 */
                if (fakename) {
                        kfree(fakename, M_TEMP);
                        fakename = NULL;
                }
                error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
                                          &fakename);
                if (error) {
                        kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
                        break;
                }
                vn_unlock(pvp);

                /*
                 * Reuse makeit as a recursion depth counter.  On success
                 * nch will be fully referenced.
                 */
                cache_fromdvp(pvp, cred, makeit + 1, nch);
                vrele(pvp);
                if (nch->ncp == NULL)
                        break;

                /*
                 * Do an inefficient scan of pvp (embodied by ncp) to look
                 * for dvp.  This will create a namecache record for dvp on
                 * success.  We loop up to recheck on success.
                 *
                 * ncp and dvp are both held but not locked.
                 */
                error = cache_inefficient_scan(nch, cred, dvp, fakename);
                if (error) {
                        kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
                                pvp, nch->ncp->nc_name, dvp);
                        cache_drop(nch);
                        /* nch was NULLed out, reload mount */
                        nch->mount = dvp->v_mount;
                        break;
                }
                if (ncvp_debug) {
                        kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
                                pvp, nch->ncp->nc_name);
                }
                cache_drop(nch);
                /* nch was NULLed out, reload mount */
                nch->mount = dvp->v_mount;
        }

        /*
         * If nch->ncp is non-NULL it will have been held already.
         */
        if (fakename)
                kfree(fakename, M_TEMP);
        if (saved_dvp)
                vrele(saved_dvp);
        if (nch->ncp)
                return (0);
        return (EINVAL);
}

/*
 * Go up the chain of parent directories until we find something
 * we can resolve into the namecache.  This is very inefficient.
 *
 * On success *saved_dvp is replaced with a ref'd vnode (the old one,
 * if any, is released) to keep the resolved subtree from being recycled.
 */
static
int
cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
                  struct vnode **saved_dvp)
{
        struct nchandle nch;
        struct vnode *pvp;
        int error;
        static time_t last_fromdvp_report;      /* rate-limits the warning */
        char *fakename;

        /*
         * Loop getting the parent directory vnode until we get something we
         * can resolve in the namecache.  dvp is ref'd here and the ref is
         * either transferred up the chain (dvp = pvp) or released on exit.
         */
        vref(dvp);
        nch.mount = dvp->v_mount;
        nch.ncp = NULL;
        fakename = NULL;

        for (;;) {
                if (fakename) {
                        kfree(fakename, M_TEMP);
                        fakename = NULL;
                }
                error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
                                          &fakename);
                if (error) {
                        vrele(dvp);
                        break;
                }
                vn_unlock(pvp);
                spin_lock_shared(&pvp->v_spin);
                if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
                        /* found a resolvable point in the topology */
                        _cache_hold(nch.ncp);
                        spin_unlock_shared(&pvp->v_spin);
                        vrele(pvp);
                        break;
                }
                spin_unlock_shared(&pvp->v_spin);
                if (pvp->v_flag & VROOT) {
                        nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
                        error = cache_resolve_mp(nch.mount);
                        _cache_unlock(nch.ncp);
                        vrele(pvp);
                        if (error) {
                                _cache_drop(nch.ncp);
                                nch.ncp = NULL;
                                vrele(dvp);
                        }
                        break;
                }
                /* move one level up and keep walking */
                vrele(dvp);
                dvp = pvp;
        }
        if (error == 0) {
                if (last_fromdvp_report != time_uptime) {
                        last_fromdvp_report = time_uptime;
                        kprintf("Warning: extremely inefficient path "
                                "resolution on %s\n",
                                nch.ncp->nc_name);
                }
                error = cache_inefficient_scan(&nch, cred, dvp, fakename);

                /*
                 * Hopefully dvp now has a namecache record associated with
                 * it.  Leave it referenced to prevent the kernel from
                 * recycling the vnode.  Otherwise extremely long directory
                 * paths could result in endless recycling.
                 */
                if (*saved_dvp)
                        vrele(*saved_dvp);
                *saved_dvp = dvp;
                _cache_drop(nch.ncp);
        }
        if (fakename)
                kfree(fakename, M_TEMP);
        return (error);
}

/*
 * Do an inefficient scan of the directory represented by ncp looking for
 * the directory vnode dvp.  ncp must be held but not locked on entry and
 * will be held on return.  dvp must be refd but not locked on entry and
 * will remain refd on return.
 *
 * Why do this at all?  Well, due to its stateless nature the NFS server
 * converts file handles directly to vnodes without necessarily going through
 * the namecache ops that would otherwise create the namecache topology
 * leading to the vnode.  We could either (1) Change the namecache algorithms
 * to allow disconnect namecache records that are re-merged opportunistically,
 * or (2) Make the NFS server backtrack and scan to recover a connected
 * namecache topology in order to then be able to issue new API lookups.
 *
 * It turns out that (1) is a huge mess.  It takes a nice clean set of
 * namecache algorithms and introduces a lot of complication in every subsystem
 * that calls into the namecache to deal with the re-merge case, especially
 * since we are using the namecache to placehold negative lookups and the
 * vnode might not be immediately assigned.  (2) is certainly far less
 * efficient than (1), but since we are only talking about directories here
 * (which are likely to remain cached), the case does not actually run all
 * that often and has the supreme advantage of not polluting the namecache
 * algorithms.
 *
 * If a fakename is supplied just construct a namecache entry using the
 * fake name.
 */
static int
cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
                       struct vnode *dvp, char *fakename)
{
        struct nlcomponent nlc;
        struct nchandle rncp;           /* result of the nlookup, if any */
        struct dirent *den;
        struct vnode *pvp;              /* directory vnode being scanned */
        struct vattr vat;
        struct iovec iov;
        struct uio uio;
        int blksize;
        int eofflag;
        int bytes;
        char *rbuf;                     /* readdir buffer, one blksize chunk */
        int error;

        /*
         * We match directory entries against dvp's inode number, so the
         * attributes (va_fileid) must be obtained first.
         */
        vat.va_blocksize = 0;
        if ((error = VOP_GETATTR(dvp, &vat)) != 0)
                return (error);
        cache_lock(nch);
        error = cache_vref(nch, cred, &pvp);
        cache_unlock(nch);
        if (error)
                return (error);
        if (ncvp_debug) {
                kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
                        "vattr fileid = %lld\n",
                        nch->ncp, nch->ncp->nc_name,
                        vat.va_blocksize,
                        (long long)vat.va_fileid);
        }

        /*
         * Use the supplied fakename if not NULL.  Fake names are typically
         * not in the actual filesystem hierarchy.  This is used by HAMMER
         * to glue @@timestamp recursions together.
         */
        if (fakename) {
                nlc.nlc_nameptr = fakename;
                nlc.nlc_namelen = strlen(fakename);
                rncp = cache_nlookup(nch, &nlc);
                /* skip the readdir scan; rbuf was never allocated */
                goto done;
        }

        if ((blksize = vat.va_blocksize) == 0)
                blksize = DEV_BSIZE;
        rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
        rncp.ncp = NULL;

        eofflag = 0;
        uio.uio_offset = 0;
again:
        /*
         * (Re)initialize the uio for the next directory block.  The loop
         * below retries until a match, EOF, or an empty transfer.
         */
        iov.iov_base = rbuf;
        iov.iov_len = blksize;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_resid = blksize;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_rw = UIO_READ;
        uio.uio_td = curthread;

        if (ncvp_debug >= 2)
                kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
        error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
        if (error == 0) {
                den = (struct dirent *)rbuf;
                bytes = blksize - uio.uio_resid;

                while (bytes > 0) {
                        if (ncvp_debug >= 2) {
                                kprintf("cache_inefficient_scan: %*.*s\n",
                                        den->d_namlen, den->d_namlen,
                                        den->d_name);
                        }
                        /* skip whiteouts; match dvp by inode number */
                        if (den->d_type != DT_WHT &&
                            den->d_ino == vat.va_fileid) {
                                if (ncvp_debug) {
                                        kprintf("cache_inefficient_scan: "
                                                "MATCHED inode %lld path %s/%*.*s\n",
                                                (long long)vat.va_fileid,
                                                nch->ncp->nc_name,
                                                den->d_namlen, den->d_namlen,
                                                den->d_name);
                                }
                                nlc.nlc_nameptr = den->d_name;
                                nlc.nlc_namelen = den->d_namlen;
                                rncp = cache_nlookup(nch, &nlc);
                                KKASSERT(rncp.ncp != NULL);
                                break;
                        }
                        bytes -= _DIRENT_DIRSIZ(den);
                        den = _DIRENT_NEXT(den);
                }
                /* keep reading unless matched, at EOF, or nothing returned */
                if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
                        goto again;
        }
        kfree(rbuf, M_TEMP);
done:
        vrele(pvp);
        if (rncp.ncp) {
                if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
                        _cache_setvp(rncp.mount, rncp.ncp, dvp);
                        if (ncvp_debug >= 2) {
                                kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
                                        nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
                        }
                } else {
                        if (ncvp_debug >= 2) {
                                kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
                                        nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
                                        rncp.ncp->nc_vp);
                        }
                }
                if (rncp.ncp->nc_vp == NULL)
                        error = rncp.ncp->nc_error;
                /*
                 * Release rncp after a successful nlookup.  rncp was fully
                 * referenced.
                 */
                cache_put(&rncp);
        } else {
                kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
                        dvp, nch->ncp->nc_name);
                error = ENOENT;
        }
        return (error);
}

/*
 * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list.
 *
 * Then, if there are no additional references to the ncp and no children,
 * the ncp is removed from the topology and destroyed.
 *
 * References and/or children may exist if the ncp is in the middle of the
 * topology, preventing the ncp from being destroyed.
 *
 * This function must be called with the ncp held and locked and will unlock
 * and drop it during zapping.
 *
 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
 * This case can occur in the cache_drop() path.
 *
 * This function may return a held (but NOT locked) parent node which the
 * caller must drop.  We do this so _cache_drop() can loop, to avoid
 * blowing out the kernel stack.
 *
 * WARNING!  For MPSAFE operation this routine must acquire up to three
 *           spin locks to be able to safely test nc_refs.  Lock order is
 *           very important.
2722 * 2723 * hash spinlock if on hash list 2724 * parent spinlock if child of parent 2725 * (the ncp is unresolved so there is no vnode association) 2726 */ 2727 static struct namecache * 2728 cache_zap(struct namecache *ncp, int nonblock) 2729 { 2730 struct namecache *par; 2731 struct vnode *dropvp; 2732 struct nchash_head *nchpp; 2733 int refs; 2734 2735 /* 2736 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2737 */ 2738 _cache_setunresolved(ncp); 2739 2740 /* 2741 * Try to scrap the entry and possibly tail-recurse on its parent. 2742 * We only scrap unref'd (other then our ref) unresolved entries, 2743 * we do not scrap 'live' entries. 2744 * 2745 * Note that once the spinlocks are acquired if nc_refs == 1 no 2746 * other references are possible. If it isn't, however, we have 2747 * to decrement but also be sure to avoid a 1->0 transition. 2748 */ 2749 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2750 KKASSERT(ncp->nc_refs > 0); 2751 2752 /* 2753 * Acquire locks. Note that the parent can't go away while we hold 2754 * a child locked. 2755 */ 2756 nchpp = NULL; 2757 if ((par = ncp->nc_parent) != NULL) { 2758 if (nonblock) { 2759 for (;;) { 2760 if (_cache_lock_nonblock(par) == 0) 2761 break; 2762 refs = ncp->nc_refs; 2763 ncp->nc_flag |= NCF_DEFEREDZAP; 2764 ++numdefered; /* MP race ok */ 2765 if (atomic_cmpset_int(&ncp->nc_refs, 2766 refs, refs - 1)) { 2767 _cache_unlock(ncp); 2768 return(NULL); 2769 } 2770 cpu_pause(); 2771 } 2772 _cache_hold(par); 2773 } else { 2774 _cache_hold(par); 2775 _cache_lock(par); 2776 } 2777 nchpp = ncp->nc_head; 2778 spin_lock(&nchpp->spin); 2779 } 2780 2781 /* 2782 * At this point if we find refs == 1 it should not be possible for 2783 * anyone else to have access to the ncp. We are holding the only 2784 * possible access point left (nchpp) spin-locked. 2785 * 2786 * If someone other then us has a ref or we have children 2787 * we cannot zap the entry. 
The 1->0 transition and any 2788 * further list operation is protected by the spinlocks 2789 * we have acquired but other transitions are not. 2790 */ 2791 for (;;) { 2792 refs = ncp->nc_refs; 2793 cpu_ccfence(); 2794 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list)) 2795 break; 2796 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) { 2797 if (par) { 2798 spin_unlock(&nchpp->spin); 2799 _cache_put(par); 2800 } 2801 _cache_unlock(ncp); 2802 return(NULL); 2803 } 2804 cpu_pause(); 2805 } 2806 2807 /* 2808 * We are the only ref and with the spinlocks held no further 2809 * refs can be acquired by others. 2810 * 2811 * Remove us from the hash list and parent list. We have to 2812 * drop a ref on the parent's vp if the parent's list becomes 2813 * empty. 2814 */ 2815 dropvp = NULL; 2816 if (par) { 2817 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2818 2819 KKASSERT(nchpp == ncp->nc_head); 2820 LIST_REMOVE(ncp, nc_hash); 2821 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2822 atomic_add_long(&pn->vfscache_count, -1); 2823 if (TAILQ_EMPTY(&ncp->nc_list)) 2824 atomic_add_long(&pn->vfscache_leafs, -1); 2825 2826 if (TAILQ_EMPTY(&par->nc_list)) { 2827 atomic_add_long(&pn->vfscache_leafs, 1); 2828 if (par->nc_vp) 2829 dropvp = par->nc_vp; 2830 } 2831 ncp->nc_head = NULL; 2832 ncp->nc_parent = NULL; 2833 spin_unlock(&nchpp->spin); 2834 _cache_unlock(par); 2835 } else { 2836 KKASSERT(ncp->nc_head == NULL); 2837 } 2838 2839 /* 2840 * ncp should not have picked up any refs. Physically 2841 * destroy the ncp. 
2842 */ 2843 if (ncp->nc_refs != 1) { 2844 int save_refs = ncp->nc_refs; 2845 cpu_ccfence(); 2846 panic("cache_zap: %p bad refs %d (%d)\n", 2847 ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0)); 2848 } 2849 KKASSERT(ncp->nc_refs == 1); 2850 /* _cache_unlock(ncp) not required */ 2851 ncp->nc_refs = -1; /* safety */ 2852 if (ncp->nc_name) 2853 kfree(ncp->nc_name, M_VFSCACHE); 2854 kfree(ncp, M_VFSCACHE); 2855 2856 /* 2857 * Delayed drop (we had to release our spinlocks) 2858 * 2859 * The refed parent (if not NULL) must be dropped. The 2860 * caller is responsible for looping. 2861 */ 2862 if (dropvp) 2863 vdrop(dropvp); 2864 return(par); 2865 } 2866 2867 /* 2868 * Clean up dangling negative cache and defered-drop entries in the 2869 * namecache. 2870 * 2871 * This routine is called in the critical path and also called from 2872 * vnlru(). When called from vnlru we use a lower limit to try to 2873 * deal with the negative cache before the critical path has to start 2874 * dealing with it. 2875 */ 2876 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2877 2878 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2879 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2880 2881 void 2882 cache_hysteresis(int critpath) 2883 { 2884 long poslimit; 2885 long neglimit = maxvnodes / ncnegfactor; 2886 long xnumcache = vfscache_leafs; 2887 2888 if (critpath == 0) 2889 neglimit = neglimit * 8 / 10; 2890 2891 /* 2892 * Don't cache too many negative hits. We use hysteresis to reduce 2893 * the impact on the critical path. 
2894 */ 2895 switch(neg_cache_hysteresis_state[critpath]) { 2896 case CHI_LOW: 2897 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2898 if (critpath) 2899 _cache_cleanneg(ncnegflush); 2900 else 2901 _cache_cleanneg(ncnegflush + 2902 vfscache_negs - neglimit); 2903 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2904 } 2905 break; 2906 case CHI_HIGH: 2907 if (vfscache_negs > MINNEG * 9 / 10 && 2908 vfscache_negs * 9 / 10 > neglimit 2909 ) { 2910 if (critpath) 2911 _cache_cleanneg(ncnegflush); 2912 else 2913 _cache_cleanneg(ncnegflush + 2914 vfscache_negs * 9 / 10 - 2915 neglimit); 2916 } else { 2917 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2918 } 2919 break; 2920 } 2921 2922 /* 2923 * Don't cache too many positive hits. We use hysteresis to reduce 2924 * the impact on the critical path. 2925 * 2926 * Excessive positive hits can accumulate due to large numbers of 2927 * hardlinks (the vnode cache will not prevent hl ncps from growing 2928 * into infinity). 2929 */ 2930 if ((poslimit = ncposlimit) == 0) 2931 poslimit = maxvnodes * 2; 2932 if (critpath == 0) 2933 poslimit = poslimit * 8 / 10; 2934 2935 switch(pos_cache_hysteresis_state[critpath]) { 2936 case CHI_LOW: 2937 if (xnumcache > poslimit && xnumcache > MINPOS) { 2938 if (critpath) 2939 _cache_cleanpos(ncposflush); 2940 else 2941 _cache_cleanpos(ncposflush + 2942 xnumcache - poslimit); 2943 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2944 } 2945 break; 2946 case CHI_HIGH: 2947 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2948 if (critpath) 2949 _cache_cleanpos(ncposflush); 2950 else 2951 _cache_cleanpos(ncposflush + 2952 xnumcache - poslimit * 5 / 6); 2953 } else { 2954 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2955 } 2956 break; 2957 } 2958 2959 /* 2960 * Clean out dangling defered-zap ncps which could not 2961 * be cleanly dropped if too many build up. 
Note 2962 * that numdefered is not an exact number as such ncps 2963 * can be reused and the counter is not handled in a MP 2964 * safe manner by design. 2965 */ 2966 if (numdefered > neglimit) { 2967 _cache_cleandefered(); 2968 } 2969 } 2970 2971 /* 2972 * NEW NAMECACHE LOOKUP API 2973 * 2974 * Lookup an entry in the namecache. The passed par_nch must be referenced 2975 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2976 * is ALWAYS returned, eve if the supplied component is illegal. 2977 * 2978 * The resulting namecache entry should be returned to the system with 2979 * cache_put() or cache_unlock() + cache_drop(). 2980 * 2981 * namecache locks are recursive but care must be taken to avoid lock order 2982 * reversals (hence why the passed par_nch must be unlocked). Locking 2983 * rules are to order for parent traversals, not for child traversals. 2984 * 2985 * Nobody else will be able to manipulate the associated namespace (e.g. 2986 * create, delete, rename, rename-target) until the caller unlocks the 2987 * entry. 2988 * 2989 * The returned entry will be in one of three states: positive hit (non-null 2990 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2991 * Unresolved entries must be resolved through the filesystem to associate the 2992 * vnode and/or determine whether a positive or negative hit has occured. 2993 * 2994 * It is not necessary to lock a directory in order to lock namespace under 2995 * that directory. In fact, it is explicitly not allowed to do that. A 2996 * directory is typically only locked when being created, renamed, or 2997 * destroyed. 2998 * 2999 * The directory (par) may be unresolved, in which case any returned child 3000 * will likely also be marked unresolved. Likely but not guarenteed. Since 3001 * the filesystem lookup requires a resolved directory vnode the caller is 3002 * responsible for resolving the namecache chain top-down. 
This API
 * specifically allows whole chains to be created in an unresolved state.
 */
struct nchandle
cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
{
        struct nchandle nch;
        struct namecache *ncp;
        struct namecache *new_ncp;      /* pre-allocated entry on miss */
        struct nchash_head *nchpp;
        struct mount *mp;
        u_int32_t hash;
        globaldata_t gd;
        int par_locked;

        gd = mycpu;
        mp = par_nch->mount;
        par_locked = 0;

        /*
         * This is a good time to call it, no ncp's are locked by
         * the caller or us.
         */
        cache_hysteresis(1);

        /*
         * Try to locate an existing entry.  The hash combines the name
         * and the parent ncp pointer.
         */
        hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
        hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
        new_ncp = NULL;
        nchpp = NCHHASH(hash);
restart:
        /*
         * Shared spinlock for the pure-lookup pass; exclusive once we
         * hold a pre-allocated new_ncp and may have to insert it.
         */
        if (new_ncp)
                spin_lock(&nchpp->spin);
        else
                spin_lock_shared(&nchpp->spin);

        LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
                /*
                 * Break out if we find a matching entry.  Note that
                 * UNRESOLVED entries may match, but DESTROYED entries
                 * do not.
                 */
                if (ncp->nc_parent == par_nch->ncp &&
                    ncp->nc_nlen == nlc->nlc_namelen &&
                    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
                    (ncp->nc_flag & NCF_DESTROYED) == 0
                ) {
                        _cache_hold(ncp);
                        if (new_ncp)
                                spin_unlock(&nchpp->spin);
                        else
                                spin_unlock_shared(&nchpp->spin);
                        if (par_locked) {
                                _cache_unlock(par_nch->ncp);
                                par_locked = 0;
                        }
                        if (_cache_lock_special(ncp) == 0) {
                                /*
                                 * Successfully locked but we must re-test
                                 * conditions that might have changed since
                                 * we did not have the lock before.
                                 */
                                if (ncp->nc_parent != par_nch->ncp ||
                                    ncp->nc_nlen != nlc->nlc_namelen ||
                                    bcmp(ncp->nc_name, nlc->nlc_nameptr,
                                         ncp->nc_nlen) ||
                                    (ncp->nc_flag & NCF_DESTROYED)) {
                                        _cache_put(ncp);
                                        goto restart;
                                }
                                _cache_auto_unresolve(mp, ncp);
                                if (new_ncp)
                                        _cache_free(new_ncp);
                                goto found;
                        }
                        _cache_get(ncp);        /* cycle the lock to block */
                        _cache_put(ncp);
                        _cache_drop(ncp);
                        goto restart;
                }
        }

        /*
         * We failed to locate an entry, create a new entry and add it to
         * the cache.  The parent ncp must also be locked so we
         * can link into it.
         *
         * We have to relookup after possibly blocking in kmalloc or
         * when locking par_nch.
         *
         * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
         *       mount case, in which case nc_name will be NULL.
         */
        if (new_ncp == NULL) {
                spin_unlock_shared(&nchpp->spin);
                new_ncp = cache_alloc(nlc->nlc_namelen);
                if (nlc->nlc_namelen) {
                        bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
                              nlc->nlc_namelen);
                        new_ncp->nc_name[nlc->nlc_namelen] = 0;
                }
                goto restart;
        }

        /*
         * NOTE! The spinlock is held exclusively here because new_ncp
         *       is non-NULL.
         */
        if (par_locked == 0) {
                spin_unlock(&nchpp->spin);
                _cache_lock(par_nch->ncp);
                par_locked = 1;
                goto restart;
        }

        /*
         * WARNING!  We still hold the spinlock.  We have to set the hash
         *           table entry atomically.
         */
        ncp = new_ncp;
        _cache_link_parent(ncp, par_nch->ncp, nchpp);
        spin_unlock(&nchpp->spin);
        _cache_unlock(par_nch->ncp);
        /* par_locked = 0 - not used */
found:
        /*
         * stats and namecache size management
         */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                ++gd->gd_nchstats->ncs_miss;
        else if (ncp->nc_vp)
                ++gd->gd_nchstats->ncs_goodhits;
        else
                ++gd->gd_nchstats->ncs_neghits;
        nch.mount = mp;
        nch.ncp = ncp;
        _cache_mntref(nch.mount);

        return(nch);
}

/*
 * Attempt to lookup a namecache entry and return with a shared namecache
 * lock.  Returns EWOULDBLOCK if a shared lock cannot be obtained (the
 * caller is expected to fall back to cache_nlookup()); on success the
 * entry is returned in *res_nch and ncp->nc_error is returned.
 */
int
cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
                           int excl, struct nchandle *res_nch)
{
        struct namecache *ncp;
        struct nchash_head *nchpp;
        struct mount *mp;
        u_int32_t hash;
        globaldata_t gd;

        /*
         * If exclusive requested or shared namecache locks are disabled,
         * return failure.
         */
        if (ncp_shared_lock_disable || excl)
                return(EWOULDBLOCK);

        gd = mycpu;
        mp = par_nch->mount;

        /*
         * This is a good time to call it, no ncp's are locked by
         * the caller or us.
         */
        cache_hysteresis(1);

        /*
         * Try to locate an existing entry
         */
        hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
        hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
        nchpp = NCHHASH(hash);

        spin_lock_shared(&nchpp->spin);

        LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
                /*
                 * Break out if we find a matching entry.  Note that
                 * UNRESOLVED entries may match, but DESTROYED entries
                 * do not.
                 */
                if (ncp->nc_parent == par_nch->ncp &&
                    ncp->nc_nlen == nlc->nlc_namelen &&
                    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
                    (ncp->nc_flag & NCF_DESTROYED) == 0
                ) {
                        _cache_hold(ncp);
                        spin_unlock_shared(&nchpp->spin);
                        if (_cache_lock_shared_special(ncp) == 0) {
                                /*
                                 * Re-test everything under the shared lock;
                                 * only fully-resolved, non-destroyed entries
                                 * that pass the auto-unresolve test qualify.
                                 */
                                if (ncp->nc_parent == par_nch->ncp &&
                                    ncp->nc_nlen == nlc->nlc_namelen &&
                                    bcmp(ncp->nc_name, nlc->nlc_nameptr,
                                         ncp->nc_nlen) == 0 &&
                                    (ncp->nc_flag & NCF_DESTROYED) == 0 &&
                                    (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
                                    _cache_auto_unresolve_test(mp, ncp) == 0) {
                                        goto found;
                                }
                                _cache_unlock(ncp);
                        }
                        _cache_drop(ncp);
                        spin_lock_shared(&nchpp->spin);
                        break;
                }
        }

        /*
         * Failure
         */
        spin_unlock_shared(&nchpp->spin);
        return(EWOULDBLOCK);

        /*
         * Success
         *
         * Note that nc_error might be non-zero (e.g. ENOENT).
         */
found:
        res_nch->mount = mp;
        res_nch->ncp = ncp;
        ++gd->gd_nchstats->ncs_goodhits;
        _cache_mntref(res_nch->mount);

        KKASSERT(ncp->nc_error != EWOULDBLOCK);
        return(ncp->nc_error);
}

/*
 * This is a non-blocking version of cache_nlookup() used by
 * nfs_readdirplusrpc_uio().  It can fail for any reason and
 * will return nch.ncp == NULL in that case.
 */
struct nchandle
cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
{
        struct nchandle nch;
        struct namecache *ncp;
        struct namecache *new_ncp;      /* pre-allocated entry on miss */
        struct nchash_head *nchpp;
        struct mount *mp;
        u_int32_t hash;
        globaldata_t gd;
        int par_locked;

        gd = mycpu;
        mp = par_nch->mount;
        par_locked = 0;

        /*
         * Try to locate an existing entry.  Hash combines the component
         * name with the parent ncp pointer (same scheme as cache_nlookup).
         */
        hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
        hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
        new_ncp = NULL;
        nchpp = NCHHASH(hash);
restart:
        spin_lock(&nchpp->spin);
        LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
                /*
                 * Break out if we find a matching entry.  Note that
                 * UNRESOLVED entries may match, but DESTROYED entries
                 * do not.
                 */
                if (ncp->nc_parent == par_nch->ncp &&
                    ncp->nc_nlen == nlc->nlc_namelen &&
                    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
                    (ncp->nc_flag & NCF_DESTROYED) == 0
                ) {
                        _cache_hold(ncp);
                        spin_unlock(&nchpp->spin);
                        if (par_locked) {
                                _cache_unlock(par_nch->ncp);
                                par_locked = 0;
                        }
                        if (_cache_lock_special(ncp) == 0) {
                                /*
                                 * Re-test under the lock; unlike
                                 * cache_nlookup() we fail instead of
                                 * retrying if the entry changed.
                                 */
                                if (ncp->nc_parent != par_nch->ncp ||
                                    ncp->nc_nlen != nlc->nlc_namelen ||
                                    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
                                    (ncp->nc_flag & NCF_DESTROYED)) {
                                        kprintf("cache_lookup_nonblock: "
                                                "ncp-race %p %*.*s\n",
                                                ncp,
                                                nlc->nlc_namelen,
                                                nlc->nlc_namelen,
                                                nlc->nlc_nameptr);
                                        _cache_unlock(ncp);
                                        _cache_drop(ncp);
                                        goto failed;
                                }
                                _cache_auto_unresolve(mp, ncp);
                                if (new_ncp) {
                                        _cache_free(new_ncp);
                                        new_ncp = NULL;
                                }
                                goto found;
                        }
                        /* non-blocking: do not wait for the entry's lock */
                        _cache_drop(ncp);
                        goto failed;
                }
        }

        /*
         * We failed to locate an entry, create a new entry and add it to
         * the cache.  The parent ncp must also be locked so we
         * can link into it.
         *
         * We have to relookup after possibly blocking in kmalloc or
         * when locking par_nch.
         *
         * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
         *       mount case, in which case nc_name will be NULL.
         */
        if (new_ncp == NULL) {
                spin_unlock(&nchpp->spin);
                new_ncp = cache_alloc(nlc->nlc_namelen);
                if (nlc->nlc_namelen) {
                        bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
                              nlc->nlc_namelen);
                        new_ncp->nc_name[nlc->nlc_namelen] = 0;
                }
                goto restart;
        }
        if (par_locked == 0) {
                spin_unlock(&nchpp->spin);
                if (_cache_lock_nonblock(par_nch->ncp) == 0) {
                        par_locked = 1;
                        goto restart;
                }
                /* non-blocking: give up if the parent lock is contested */
                goto failed;
        }

        /*
         * WARNING!  We still hold the spinlock.  We have to set the hash
         *           table entry atomically.
         */
        ncp = new_ncp;
        _cache_link_parent(ncp, par_nch->ncp, nchpp);
        spin_unlock(&nchpp->spin);
        _cache_unlock(par_nch->ncp);
        /* par_locked = 0 - not used */
found:
        /*
         * stats and namecache size management
         */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                ++gd->gd_nchstats->ncs_miss;
        else if (ncp->nc_vp)
                ++gd->gd_nchstats->ncs_goodhits;
        else
                ++gd->gd_nchstats->ncs_neghits;
        nch.mount = mp;
        nch.ncp = ncp;
        _cache_mntref(nch.mount);

        return(nch);
failed:
        if (new_ncp) {
                _cache_free(new_ncp);
                new_ncp = NULL;
        }
        nch.mount = NULL;
        nch.ncp = NULL;
        return(nch);
}

/*
 * The namecache entry is marked as being used as a mount point.
 * Locate the mount if it is visible to the caller.  The DragonFly
 * mount system allows arbitrary loops in the topology and disentangles
 * those loops by matching against (mp, ncp) rather than just (ncp).
 * This means any given ncp can dive any number of mounts, depending
 * on the relative mount (e.g.
 * nullfs) the caller is at in the topology.
 *
 * We use a very simple frontend cache to reduce SMP conflicts,
 * which we have to do because the mountlist scan needs an exclusive
 * lock around its ripout info list.  Not to mention that there might
 * be a lot of mounts.
 */
struct findmount_info {
	struct mount *result;		/* matched mount, NULL if none */
	struct mount *nch_mount;	/* search key: nchandle's mount */
	struct namecache *nch_ncp;	/* search key: nchandle's ncp */
};

#define MNTCACHE_PRIME	66555444443333333ULL

/*
 * Hash a (mount, ncp) pair to one of the NCMOUNT_NUMCACHE frontend
 * cache slots.
 */
static
struct ncmount_cache *
ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
{
	uintptr_t hash;

	hash = (uintptr_t)mp + ((uintptr_t)mp >> 18);
	hash %= MNTCACHE_PRIME;
	hash ^= (uintptr_t)ncp + ((uintptr_t)ncp >> 18);
	hash %= MNTCACHE_PRIME;
	hash = hash % NCMOUNT_NUMCACHE;

	return (&ncmount_cache[hash]);
}

/*
 * mountlist_scan() callback.  Returns -1 (stop the scan) when the mount
 * whose mounted-on point matches the search keys is found; the match is
 * stored referenced in info->result.
 */
static
int
cache_findmount_callback(struct mount *mp, void *data)
{
	struct findmount_info *info = data;

	/*
	 * Check the mount's mounted-on point against the passed nch.
	 */
	if (mp->mnt_ncmounton.mount == info->nch_mount &&
	    mp->mnt_ncmounton.ncp == info->nch_ncp
	) {
	    info->result = mp;
	    _cache_mntref(mp);
	    return(-1);
	}
	return(0);
}

/*
 * Find the mount mounted on the given nchandle, if any.  Returns a
 * referenced mount or NULL.  Tries the frontend cache first, then
 * falls back to a full mountlist scan.
 */
struct mount *
cache_findmount(struct nchandle *nch)
{
	struct findmount_info info;
	struct ncmount_cache *ncc;
	struct mount *mp;

	/*
	 * Fast
	 */
	if (ncmount_cache_enable == 0) {
		ncc = NULL;
		goto skip;
	}
	ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
	if (ncc->ncp == nch->ncp) {
		spin_lock_shared(&ncc->spin);
		if (ncc->isneg == 0 &&
		    ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
			/*
			 * Validate the cached mount against the current
			 * mounted-on point before trusting it.
			 */
			if (mp->mnt_ncmounton.mount == nch->mount &&
			    mp->mnt_ncmounton.ncp == nch->ncp) {
				/*
				 * Cache hit (positive)
				 */
				_cache_mntref(mp);
				spin_unlock_shared(&ncc->spin);
				return(mp);
			}
			/* else cache miss */
		}
		if (ncc->isneg &&
		    ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
			/*
			 * Cache hit (negative)
			 */
			spin_unlock_shared(&ncc->spin);
			return(NULL);
		}
		spin_unlock_shared(&ncc->spin);
	}
skip:

	/*
	 * Slow
	 */
	info.result = NULL;
	info.nch_mount = nch->mount;
	info.nch_ncp = nch->ncp;
	mountlist_scan(cache_findmount_callback, &info,
		       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);

	/*
	 * Cache the result.
	 *
	 * Negative lookups: We cache the originating {ncp,mp}. (mp) is
	 *		     only used for pointer comparisons and is not
	 *		     referenced (otherwise there would be dangling
	 *		     refs).
	 *
	 * Positive lookups: We cache the originating {ncp} and the target
	 *		     (mp).  (mp) is referenced.
	 *
	 * Indeterminant:    If the match is undergoing an unmount we do
	 *		     not cache it to avoid racing cache_unmounting(),
	 *		     but still return the match.
	 */
	if (ncc) {
		spin_lock(&ncc->spin);
		if (info.result == NULL) {
			/* drop ref held by a previous positive entry */
			if (ncc->isneg == 0 && ncc->mp)
				_cache_mntrel(ncc->mp);
			ncc->ncp = nch->ncp;
			ncc->mp = nch->mount;
			ncc->isneg = 1;
			spin_unlock(&ncc->spin);
		} else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
			if (ncc->isneg == 0 && ncc->mp)
				_cache_mntrel(ncc->mp);
			_cache_mntref(info.result);
			ncc->ncp = nch->ncp;
			ncc->mp = info.result;
			ncc->isneg = 0;
			spin_unlock(&ncc->spin);
		} else {
			spin_unlock(&ncc->spin);
		}
	}
	return(info.result);
}

/*
 * Release the mount reference obtained via cache_findmount().
 */
void
cache_dropmount(struct mount *mp)
{
	_cache_mntrel(mp);
}

/*
 * A mount is being established on mp->mnt_ncmounton.  Flush any stale
 * negative frontend-cache entry for that point so subsequent
 * cache_findmount() calls do not miss the new mount.
 */
void
cache_ismounting(struct mount *mp)
{
	struct nchandle *nch = &mp->mnt_ncmounton;
	struct ncmount_cache *ncc;

	ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
	if (ncc->isneg &&
	    ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
		spin_lock(&ncc->spin);
		/* re-check under the spinlock before clearing */
		if (ncc->isneg &&
		    ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
			ncc->ncp = NULL;
			ncc->mp = NULL;
		}
		spin_unlock(&ncc->spin);
	}
}

/*
 * The mount is being unmounted.  Remove any positive frontend-cache
 * entry pointing at it, dropping the reference the cache held on mp.
 */
void
cache_unmounting(struct mount *mp)
{
	struct nchandle *nch = &mp->mnt_ncmounton;
	struct ncmount_cache *ncc;

	ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
	if (ncc->isneg == 0 &&
	    ncc->ncp == nch->ncp && ncc->mp == mp) {
		spin_lock(&ncc->spin);
		/* re-check under the spinlock before clearing */
		if (ncc->isneg == 0 &&
		    ncc->ncp == nch->ncp && ncc->mp == mp) {
			_cache_mntrel(mp);
			ncc->ncp = NULL;
			ncc->mp = NULL;
		}
		spin_unlock(&ncc->spin);
	}
}

/*
 * Resolve an unresolved namecache entry, generally by looking it up.
 * The passed ncp must be locked and refd.
 *
 * Theoretically since a vnode cannot be recycled while held, and since
 * the nc_parent chain holds its vnode as long as children exist, the
 * direct parent of the cache entry we are trying to resolve should
 * have a valid vnode.  If not then generate an error that we can
 * determine is related to a resolver bug.
 *
 * However, if a vnode was in the middle of a recyclement when the NCP
 * got locked, ncp->nc_vp might point to a vnode that is about to become
 * invalid.  cache_resolve() handles this case by unresolving the entry
 * and then re-resolving it.
 *
 * Note that successful resolution does not necessarily return an error
 * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
 * will be returned.
 */
int
cache_resolve(struct nchandle *nch, struct ucred *cred)
{
	struct namecache *par_tmp;
	struct namecache *par;
	struct namecache *ncp;
	struct nchandle nctmp;
	struct mount *mp;
	struct vnode *dvp;
	int error;

	ncp = nch->ncp;
	mp = nch->mount;
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
restart:
	/*
	 * If the ncp is already resolved we have nothing to do.  However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
			return (ncp->nc_error);
	}

	/*
	 * If the ncp was destroyed it will never resolve again.  This
	 * can basically only happen when someone is chdir'd into an
	 * empty directory which is then rmdir'd.  We want to catch this
	 * here and not dive the VFS because the VFS might actually
	 * have a way to re-resolve the disconnected ncp, which will
	 * result in inconsistencies in the cdir/nch for proc->p_fd.
	 */
	if (ncp->nc_flag & NCF_DESTROYED)
		return(EINVAL);

	/*
	 * Mount points need special handling because the parent does not
	 * belong to the same filesystem as the ncp.
	 */
	if (ncp == mp->mnt_ncmountpt.ncp)
		return (cache_resolve_mp(mp));

	/*
	 * We expect an unbroken chain of ncps to at least the mount point,
	 * and even all the way to root (but this code doesn't have to go
	 * past the mount point).
	 */
	if (ncp->nc_parent == NULL) {
		kprintf("EXDEV case 1 %p %*.*s\n", ncp,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		ncp->nc_error = EXDEV;
		return(ncp->nc_error);
	}

	/*
	 * The vp's of the parent directories in the chain are held via vhold()
	 * due to the existance of the child, and should not disappear.
	 * However, there are cases where they can disappear:
	 *
	 *	- due to filesystem I/O errors.
	 *	- due to NFS being stupid about tracking the namespace and
	 *	  destroys the namespace for entire directories quite often.
	 *	- due to forced unmounts.
	 *	- due to an rmdir (parent will be marked DESTROYED)
	 *
	 * When this occurs we have to track the chain backwards and resolve
	 * it, looping until the resolver catches up to the current node.  We
	 * could recurse here but we might run ourselves out of kernel stack
	 * so we do it in a more painful manner.  This situation really should
	 * not occur all that often, or if it does not have to go back too
	 * many nodes to resolve the ncp.
	 */
	while ((dvp = cache_dvpref(ncp)) == NULL) {
		/*
		 * This case can occur if a process is CD'd into a
		 * directory which is then rmdir'd.  If the parent is marked
		 * destroyed there is no point trying to resolve it.
		 */
		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
			return(ENOENT);
		/*
		 * Walk upwards to the nearest ancestor that still has a
		 * vnode, hand-over-hand (hold/lock child, then put parent)
		 * to avoid unbounded recursion.
		 */
		par = ncp->nc_parent;
		_cache_hold(par);
		_cache_lock(par);
		while ((par_tmp = par->nc_parent) != NULL &&
		       par_tmp->nc_vp == NULL) {
			_cache_hold(par_tmp);
			_cache_lock(par_tmp);
			_cache_put(par);
			par = par_tmp;
		}
		if (par->nc_parent == NULL) {
			kprintf("EXDEV case 2 %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			return (EXDEV);
		}
		/*
		 * The parent is not set in stone, ref and lock it to prevent
		 * it from disappearing.  Also note that due to renames it
		 * is possible for our ncp to move and for par to no longer
		 * be one of its parents.  We resolve it anyway, the loop
		 * will handle any moves.
		 */
		_cache_get(par);	/* additional hold/lock */
		_cache_put(par);	/* from earlier hold/lock */
		if (par == nch->mount->mnt_ncmountpt.ncp) {
			cache_resolve_mp(nch->mount);
		} else if ((dvp = cache_dvpref(par)) == NULL) {
			kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			continue;
		} else {
			if (par->nc_flag & NCF_UNRESOLVED) {
				nctmp.mount = mp;
				nctmp.ncp = par;
				par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
			}
			vrele(dvp);
		}
		if ((error = par->nc_error) != 0) {
			if (par->nc_error != EAGAIN) {
				kprintf("EXDEV case 3 %*.*s error %d\n",
					par->nc_nlen, par->nc_nlen, par->nc_name,
					par->nc_error);
				_cache_put(par);
				return(error);
			}
			kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
				par, par->nc_nlen, par->nc_nlen, par->nc_name);
		}
		_cache_put(par);
		/* loop */
	}

	/*
	 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
	 * ncp's and reattach them.  If this occurs the original ncp is marked
	 * EAGAIN to force a relookup.
	 *
	 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
	 *	 ncp must already be resolved.
	 */
	if (dvp) {
		nctmp.mount = mp;
		nctmp.ncp = ncp;
		ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
		vrele(dvp);
	} else {
		ncp->nc_error = EPERM;
	}
	if (ncp->nc_error == EAGAIN) {
		kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
			ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		goto restart;
	}
	return(ncp->nc_error);
}

/*
 * Resolve the ncp associated with a mount point.  Such ncp's almost always
 * remain resolved and this routine is rarely called.  NFS MPs tends to force
 * re-resolution more often due to its mac-truck-smash-the-namecache
 * method of tracking namespace changes.
 *
 * The semantics for this call is that the passed ncp must be locked on
 * entry and will be locked on return.  However, if we actually have to
 * resolve the mount point we temporarily unlock the entry in order to
 * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
 * the unlock we have to recheck the flags after we relock.
 */
static int
cache_resolve_mp(struct mount *mp)
{
	struct namecache *ncp = mp->mnt_ncmountpt.ncp;
	struct vnode *vp;
	int error;

	KKASSERT(mp != NULL);

	/*
	 * If the ncp is already resolved we have nothing to do.  However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
	}

	if (ncp->nc_flag & NCF_UNRESOLVED) {
		/*
		 * Unlock while busying the mount and asking the VFS for
		 * its root vnode to avoid race-to-root deadlocks.
		 */
		_cache_unlock(ncp);
		while (vfs_busy(mp, 0))
			;
		error = VFS_ROOT(mp, &vp);
		_cache_lock(ncp);

		/*
		 * recheck the ncp state after relocking.
		 */
		if (ncp->nc_flag & NCF_UNRESOLVED) {
			ncp->nc_error = error;
			if (error == 0) {
				_cache_setvp(mp, ncp, vp);
				vput(vp);
			} else {
				kprintf("[diagnostic] cache_resolve_mp: failed"
					" to resolve mount %p err=%d ncp=%p\n",
					mp, error, ncp);
				_cache_setvp(mp, ncp, NULL);
			}
		} else if (error == 0) {
			/* someone else resolved it; drop our root vnode */
			vput(vp);
		}
		vfs_unbusy(mp);
	}
	return(ncp->nc_error);
}

/*
 * Clean out negative cache entries when too many have accumulated.
 */
static void
_cache_cleanneg(long count)
{
	struct pcpu_ncache *pn;
	struct namecache *ncp;
	static uint32_t neg_rover;
	uint32_t n;

	n = neg_rover++;	/* SMP heuristical, race ok */
	cpu_ccfence();
	n = n % (uint32_t)ncpus;

	/* scale the request to this cpu's share of the negative entries */
	pn = &pcpu_ncache[n];
	spin_lock(&pn->neg_spin);
	count = pn->neg_count * count / vfscache_negs + 1;
	spin_unlock(&pn->neg_spin);

	/*
	 * Attempt to clean out the specified number of negative cache
	 * entries.
	 */
	while (count > 0) {
		spin_lock(&pn->neg_spin);
		ncp = TAILQ_FIRST(&pn->neg_list);
		if (ncp == NULL) {
			spin_unlock(&pn->neg_spin);
			break;
		}
		/* rotate to the tail so we don't rescan it immediately */
		TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);
		spin_unlock(&pn->neg_spin);

		/*
		 * This can race, so we must re-check that the ncp
		 * is on the ncneg.list after successfully locking it.
		 */
		if (_cache_lock_special(ncp) == 0) {
			if (ncp->nc_vp == NULL &&
			    (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				ncp = cache_zap(ncp, 1);
				if (ncp)
					_cache_drop(ncp);
			} else {
				kprintf("cache_cleanneg: race avoided\n");
				_cache_unlock(ncp);
			}
		} else {
			_cache_drop(ncp);
		}
		--count;
	}
}

/*
 * Clean out positive cache entries when too many have accumulated.
 */
static void
_cache_cleanpos(long count)
{
	static volatile int rover;
	struct nchash_head *nchpp;
	struct namecache *ncp;
	int rover_copy;

	/*
	 * Attempt to clean out the specified number of cache entries,
	 * walking the hash chains round-robin via the rover.
	 */
	while (count > 0) {
		rover_copy = ++rover;	/* MPSAFEENOUGH */
		cpu_ccfence();
		nchpp = NCHHASH(rover_copy);

		spin_lock_shared(&nchpp->spin);
		ncp = LIST_FIRST(&nchpp->list);
		while (ncp && (ncp->nc_flag & NCF_DESTROYED))
			ncp = LIST_NEXT(ncp, nc_hash);
		if (ncp)
			_cache_hold(ncp);
		spin_unlock_shared(&nchpp->spin);

		if (ncp) {
			if (_cache_lock_special(ncp) == 0) {
				ncp = cache_zap(ncp, 1);
				if (ncp)
					_cache_drop(ncp);
			} else {
				_cache_drop(ncp);
			}
		}
		--count;
	}
}

/*
 * This is a kitchen sink function to clean out ncps which we
 * tried to zap from cache_drop() but failed because we were
 * unable to acquire the parent lock.
 *
 * Such entries can also be removed via cache_inval_vp(), such
 * as when unmounting.
 */
static void
_cache_cleandefered(void)
{
	struct nchash_head *nchpp;
	struct namecache *ncp;
	struct namecache dummy;	/* marker entry used as a scan placeholder */
	int i;

	numdefered = 0;
	bzero(&dummy, sizeof(dummy));
	dummy.nc_flag = NCF_DESTROYED;
	dummy.nc_refs = 1;

	for (i = 0; i <= nchash; ++i) {
		nchpp = &nchashtbl[i];

		spin_lock(&nchpp->spin);
		LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
		ncp = &dummy;
		while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
			if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
				continue;
			/*
			 * Move the marker past the entry so the scan can
			 * resume here after we drop the hash spinlock.
			 */
			LIST_REMOVE(&dummy, nc_hash);
			LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
			_cache_hold(ncp);
			spin_unlock(&nchpp->spin);
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				_cache_unlock(ncp);
			}
			_cache_drop(ncp);
			spin_lock(&nchpp->spin);
			ncp = &dummy;
		}
		LIST_REMOVE(&dummy, nc_hash);
		spin_unlock(&nchpp->spin);
	}
}

/*
 * Name cache initialization, from vfsinit() when we are booting
 */
void
nchinit(void)
{
	struct pcpu_ncache *pn;
	globaldata_t gd;
	int i;

	/*
	 * Per-cpu accounting and negative hit list
	 */
	pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
			      M_VFSCACHE, M_WAITOK|M_ZERO);
	for (i = 0; i < ncpus; ++i) {
		pn = &pcpu_ncache[i];
		TAILQ_INIT(&pn->neg_list);
		spin_init(&pn->neg_spin, "ncneg");
	}

	/*
	 * Initialise per-cpu namecache effectiveness statistics.
	 */
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		gd->gd_nchstats = &nchstats[i];
	}

	/*
	 * Create a generous namecache hash table
	 */
	nchashtbl = hashinit_ext(vfs_inodehashsize(),
				 sizeof(struct nchash_head),
				 M_VFSCACHE, &nchash);
	for (i = 0; i <= (int)nchash; ++i) {
		LIST_INIT(&nchashtbl[i].list);
		spin_init(&nchashtbl[i].spin, "nchinit_hash");
	}
	for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
		spin_init(&ncmount_cache[i].spin, "nchinit_cache");
	nclockwarn = 5 * hz;
}

/*
 * Called from start_init() to bootstrap the root filesystem.  Fills in
 * *nch with a referenced, unlocked namecache record.
 */
void
cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
{
	nch->ncp = cache_alloc(0);
	nch->mount = mp;
	_cache_mntref(mp);
	if (vp)
		_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * vfs_cache_setroot()
 *
 *	Create an association between the root of our namecache and
 *	the root vnode.  This routine may be called several times during
 *	booting.
 *
 *	If the caller intends to save the returned namecache pointer somewhere
 *	it must cache_hold() it.
 */
void
vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
{
	struct vnode *ovp;
	struct nchandle onch;

	ovp = rootvnode;
	onch = rootnch;
	rootvnode = nvp;
	if (nch)
		rootnch = *nch;
	else
		cache_zero(&rootnch);
	/* release the previous root associations, if any */
	if (ovp)
		vrele(ovp);
	if (onch.ncp)
		cache_drop(&onch);
}

/*
 * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
 * topology and is being removed as quickly as possible.  The new VOP_N*()
 * API calls are required to make specific adjustments using the supplied
 * ncp pointers rather then just bogusly purging random vnodes.
 *
 * Invalidate all namecache entries to a particular vnode as well as
 * any direct children of that vnode in the namecache.  This is a
 * 'catch all' purge used by filesystems that do not know any better.
 *
 * Note that the linkage between the vnode and its namecache entries will
 * be removed, but the namecache entries themselves might stay put due to
 * active references from elsewhere in the system or due to the existance of
 * the children.   The namecache topology is left intact even if we do not
 * know what the vnode association is.  Such entries will be marked
 * NCF_UNRESOLVED.
 */
void
cache_purge(struct vnode *vp)
{
	cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
}

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable getcwd");

static u_long numcwdcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
    "Number of current directory resolution calls");
static u_long numcwdfailnf;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
    "Number of current directory failures due to lack of file");
static u_long numcwdfailsz;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
    "Number of current directory failures due to large result");
static u_long numcwdfound;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
    "Number of current directory resolution successes");

/*
 * MPALMOSTSAFE
 */
int
sys___getcwd(struct __getcwd_args *uap)
{
	u_int buflen;
	int error;
	char *buf;
	char *bp;

	if (disablecwd)
		return (ENODEV);

	/* clamp the user-supplied buffer length to MAXPATHLEN */
	buflen = uap->buflen;
	if (buflen == 0)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = kmalloc(buflen, M_TEMP, M_WAITOK);
	bp = kern_getcwd(buf, buflen, &error);
	if (error == 0)
		error = copyout(bp, uap->buf, strlen(bp) + 1);
	kfree(buf, M_TEMP);
	return (error);
}

/*
 * Generate the current working directory path for curproc by walking
 * nc_parent links from fd_ncdir up to fd_nrdir, building the path
 * backwards into buf.  Returns a pointer into buf on success, or NULL
 * with *error set (ERANGE if buf is too small, ENOENT if the chain is
 * broken).
 */
char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct nchandle nch;
	struct namecache *ncp;

	numcwdcalls++;
	bp = buf;
	bp += buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;

	nch = fdp->fd_ncdir;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);

	while (ncp && (ncp != fdp->fd_nrdir.ncp ||
	       nch.mount != fdp->fd_nrdir.mount)
	) {
		/*
		 * While traversing upwards if we encounter the root
		 * of the current mount we have to skip to the mount point
		 * in the underlying filesystem.
		 */
		if (ncp == nch.mount->mnt_ncmountpt.ncp) {
			nch = nch.mount->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfailsz++;
				*error = ERANGE;
				bp = NULL;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			if (ncp_shared_lock_disable)
				_cache_lock(ncp);
			else
				_cache_lock_shared(ncp);
			/* re-check the parent under the lock, retry on race */
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		numcwdfailnf++;
		*error = ENOENT;
		bp = NULL;
		goto done;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
	}
	numcwdfound++;
	*error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return (bp);
}

/*
 * Thus begins the fullpath magic.
 *
 * The passed nchp is referenced but not locked.
 */
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0,
    "Disable fullpath lookups");

/*
 * Build the full path for *nchp, walking upwards to nchbase (or the
 * process/root nch when nchbase is NULL).  On success *retbuf points at
 * the path within the allocated buffer returned in *freebuf, which the
 * caller must kfree().  Returns 0, ENOMEM (path too long) or ENOENT
 * (broken chain).
 */
int
cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
	       char **retbuf, char **freebuf, int guess)
{
	struct nchandle fd_nrdir;
	struct nchandle nch;
	struct namecache *ncp;
	struct mount *mp, *new_mp;
	char *bp, *buf;
	int slash_prefixed;
	int error = 0;
	int i;

	*retbuf = NULL;
	*freebuf = NULL;

	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	if (nchbase)
		fd_nrdir = *nchbase;
	else if (p != NULL)
		fd_nrdir = p->p_fd->fd_nrdir;
	else
		fd_nrdir = rootnch;
	slash_prefixed = 0;
	nch = *nchp;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);
	mp = nch.mount;

	while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
		new_mp = NULL;

		/*
		 * If we are asked to guess the upwards path, we do so whenever
		 * we encounter an ncp marked as a mountpoint.  We try to find
		 * the actual mountpoint by finding the mountpoint with this
		 * ncp.
		 */
		if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
			new_mp = mount_get_by_nc(ncp);
		}
		/*
		 * While traversing upwards if we encounter the root
		 * of the current mount we have to skip to the mount point.
		 */
		if (ncp == mp->mnt_ncmountpt.ncp) {
			new_mp = mp;
		}
		if (new_mp) {
			nch = new_mp->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			mp = nch.mount;
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				kfree(buf, M_TEMP);
				error = ENOMEM;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 *
		 * We can only safely access nc_parent with ncp held locked.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			_cache_lock(ncp);
			/* re-check the parent under the lock, retry on race */
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		kfree(buf, M_TEMP);
		error = ENOENT;
		goto done;
	}

	if (!slash_prefixed) {
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
	}
	*retbuf = bp;
	*freebuf = buf;
	error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return(error);
}

/*
 * Build the full path for a vnode by picking a named ncp off its
 * v_namecache list and handing it to cache_fullpath().  A NULL vn means
 * use p->p_textvp.  On success the caller must kfree(*freebuf).
 */
int
vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
	    char **freebuf, int guess)
{
	struct namecache *ncp;
	struct nchandle nch;
	int error;

	*freebuf = NULL;
	if (disablefullpath)
		return (ENODEV);

	if (p == NULL)
		return (EINVAL);

	/* vn is NULL, client wants us to use p->p_textvp */
	if (vn == NULL) {
		if ((vn = p->p_textvp) == NULL)
			return (EINVAL);
	}
	spin_lock_shared(&vn->v_spin);
	TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
		if (ncp->nc_nlen)
			break;
	}
	if (ncp == NULL) {
		spin_unlock_shared(&vn->v_spin);
		return (EINVAL);
	}
	_cache_hold(ncp);
	spin_unlock_shared(&vn->v_spin);

	nch.ncp = ncp;
	nch.mount = vn->v_mount;
	error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
	_cache_drop(ncp);
	return (error);
}

/*
 * Fold this cpu's per-cpu namecache counters into the global
 * vfscache_count/leafs/negs totals, zeroing the per-cpu values.
 */
void
vfscache_rollup_cpu(struct globaldata *gd)
{
	struct pcpu_ncache *pn;
	long count;

	if (pcpu_ncache == NULL)
		return;
	pn = &pcpu_ncache[gd->gd_cpuid];

	if (pn->vfscache_count) {
		count = atomic_swap_long(&pn->vfscache_count, 0);
		atomic_add_long(&vfscache_count, count);
	}
	if (pn->vfscache_leafs) {
		count = atomic_swap_long(&pn->vfscache_leafs, 0);
		atomic_add_long(&vfscache_leafs, count);
	}
	if (pn->vfscache_negs) {
		count = atomic_swap_long(&pn->vfscache_negs, 0);
		atomic_add_long(&vfscache_negs, count);
	}
}

#if 0
static void
vfscache_rollup_all(void)
{
	int n;

	for (n = 0; n < ncpus; ++n)
		vfscache_rollup_cpu(globaldata_find(n));
}
#endif