1 /* 2 * Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
63 */ 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/kernel.h> 68 #include <sys/sysctl.h> 69 #include <sys/mount.h> 70 #include <sys/vnode.h> 71 #include <sys/malloc.h> 72 #include <sys/sysproto.h> 73 #include <sys/spinlock.h> 74 #include <sys/proc.h> 75 #include <sys/namei.h> 76 #include <sys/nlookup.h> 77 #include <sys/filedesc.h> 78 #include <sys/fnv_hash.h> 79 #include <sys/globaldata.h> 80 #include <sys/kern_syscall.h> 81 #include <sys/dirent.h> 82 #include <ddb/ddb.h> 83 84 #include <sys/sysref2.h> 85 #include <sys/spinlock2.h> 86 #include <sys/mplock2.h> 87 88 #define MAX_RECURSION_DEPTH 64 89 90 /* 91 * Random lookups in the cache are accomplished with a hash table using 92 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock. 93 * 94 * Negative entries may exist and correspond to resolved namecache 95 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT 96 * will be set if the entry corresponds to a whited-out directory entry 97 * (verses simply not finding the entry at all). ncneglist is locked 98 * with a global spinlock (ncspin). 99 * 100 * MPSAFE RULES: 101 * 102 * (1) A ncp must be referenced before it can be locked. 103 * 104 * (2) A ncp must be locked in order to modify it. 105 * 106 * (3) ncp locks are always ordered child -> parent. That may seem 107 * backwards but forward scans use the hash table and thus can hold 108 * the parent unlocked when traversing downward. 109 * 110 * This allows insert/rename/delete/dot-dot and other operations 111 * to use ncp->nc_parent links. 112 * 113 * This also prevents a locked up e.g. NFS node from creating a 114 * chain reaction all the way back to the root vnode / namecache. 115 * 116 * (4) parent linkages require both the parent and child to be locked. 117 */ 118 119 /* 120 * Structures associated with name cacheing. 121 */ 122 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash]) 123 #define MINNEG 1024 124 #define MINPOS 1024 125 #define NCMOUNT_NUMCACHE 1009 /* prime number */ 126 127 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 128 129 LIST_HEAD(nchash_list, namecache); 130 131 /* 132 * Don't cachealign, but at least pad to 32 bytes so entries 133 * don't cross a cache line. 134 */ 135 struct nchash_head { 136 struct nchash_list list; /* 16 bytes */ 137 struct spinlock spin; /* 8 bytes */ 138 long pad01; /* 8 bytes */ 139 }; 140 141 struct ncmount_cache { 142 struct spinlock spin; 143 struct namecache *ncp; 144 struct mount *mp; 145 int isneg; /* if != 0 mp is originator and not target */ 146 }; 147 148 static struct nchash_head *nchashtbl; 149 static struct namecache_list ncneglist; 150 static struct spinlock ncspin; 151 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 152 153 /* 154 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 155 * to create the namecache infrastructure leading to a dangling vnode. 156 * 157 * 0 Only errors are reported 158 * 1 Successes are reported 159 * 2 Successes + the whole directory scan is reported 160 * 3 Force the directory scan code run as if the parent vnode did not 161 * have a namecache record, even if it does have one. 
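 *
 * As an illustrative aside (this is not the actual lookup code in this
 * file), the hash-table scheme described at the top of this file boils
 * down to the following sketch, combining the FNV hash over the
 * (parent ncp, name) pair, the NCHHASH() bucket macro and the
 * per-chain spinlock:
 *
 *	hash = fnv_32_buf(name, nlen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&par, sizeof(par), hash);
 *	nchpp = NCHHASH(hash);
 *
 *	spin_lock(&nchpp->spin);
 *	LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
 *		if (ncp->nc_parent == par && ncp->nc_nlen == nlen &&
 *		    bcmp(ncp->nc_name, name, nlen) == 0) {
 *			_cache_hold(ncp);
 *			break;
 *		}
 *	}
 *	spin_unlock(&nchpp->spin);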
162 */ 163 static int ncvp_debug; 164 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 165 "Namecache debug level (0-3)"); 166 167 static u_long nchash; /* size of hash table */ 168 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 169 "Size of namecache hash table"); 170 171 static int ncnegflush = 10; /* burst for negative flush */ 172 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 173 "Batch flush negative entries"); 174 175 static int ncposflush = 10; /* burst for positive flush */ 176 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 177 "Batch flush positive entries"); 178 179 static int ncnegfactor = 16; /* ratio of negative entries */ 180 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 181 "Ratio of namecache negative entries"); 182 183 static int nclockwarn; /* warn on locked entries in ticks */ 184 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 185 "Warn on locked namecache entries in ticks"); 186 187 static int numdefered; /* number of cache entries allocated */ 188 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 189 "Number of cache entries allocated"); 190 191 static int ncposlimit; /* number of cache entries allocated */ 192 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 193 "Number of cache entries allocated"); 194 195 static int ncp_shared_lock_disable = 0; 196 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 197 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 198 199 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 200 "sizeof(struct vnode)"); 201 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 202 "sizeof(struct namecache)"); 203 204 static int ncmount_cache_enable = 1; 205 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 206 &ncmount_cache_enable, 0, "mount point cache"); 207 static long ncmount_cache_hit; 208 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW, 209 &ncmount_cache_hit, 0, "mpcache hits"); 210 static long ncmount_cache_miss; 211 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW, 212 &ncmount_cache_miss, 0, "mpcache misses"); 213 static long ncmount_cache_overwrite; 214 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW, 215 &ncmount_cache_overwrite, 0, "mpcache entry overwrites"); 216 217 static __inline void _cache_drop(struct namecache *ncp); 218 static int cache_resolve_mp(struct mount *mp); 219 static struct vnode *cache_dvpref(struct namecache *ncp); 220 static void _cache_lock(struct namecache *ncp); 221 static void _cache_setunresolved(struct namecache *ncp); 222 static void _cache_cleanneg(int count); 223 static void _cache_cleanpos(int count); 224 static void _cache_cleandefered(void); 225 static void _cache_unlink(struct namecache *ncp); 226 227 /* 228 * The new name cache statistics 229 */ 230 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 231 static int numneg; 232 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, 233 "Number of negative namecache entries"); 234 static int numcache; 235 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, 236 "Number of namecaches entries"); 237 238 struct nchstats nchstats[SMP_MAXCPU]; 239 /* 240 * Export VFS cache effectiveness statistics to user-land. 
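 *
 * For example, a userland program (using <sys/sysctl.h> and <stdlib.h>)
 * can fetch the per-cpu array with sysctlbyname().  This is only an
 * illustrative sketch and assumes struct nchstats is visible from the
 * appropriate kernel header:
 *
 *	size_t len = 0;
 *	struct nchstats *stats;
 *	int ncpu, i;
 *
 *	sysctlbyname("vfs.cache.nchstats", NULL, &len, NULL, 0);
 *	stats = malloc(len);
 *	sysctlbyname("vfs.cache.nchstats", stats, &len, NULL, 0);
 *	ncpu = len / sizeof(*stats);
 *	for (i = 0; i < ncpu; ++i)
 *		... aggregate or print stats[i] ...
 *	free(stats);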
241 * 242 * The statistics are left for aggregation to user-land so 243 * neat things can be achieved, like observing per-CPU cache 244 * distribution. 245 */ 246 static int 247 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 248 { 249 struct globaldata *gd; 250 int i, error; 251 252 error = 0; 253 for (i = 0; i < ncpus; ++i) { 254 gd = globaldata_find(i); 255 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 256 sizeof(struct nchstats)))) 257 break; 258 } 259 260 return (error); 261 } 262 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 263 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 264 265 static struct namecache *cache_zap(struct namecache *ncp, int nonblock); 266 267 /* 268 * Cache mount points and namecache records in order to avoid unnecessary 269 * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP 270 * performance and is particularly important on multi-socket systems to 271 * reduce cache-line ping-ponging. 272 * 273 * Try to keep the pcpu structure within one cache line (~64 bytes). 274 */ 275 #define MNTCACHE_COUNT 5 276 277 struct mntcache { 278 struct mount *mntary[MNTCACHE_COUNT]; 279 struct namecache *ncp1; 280 struct namecache *ncp2; 281 struct nchandle ncdir; 282 int iter; 283 int unused01; 284 } __cachealign; 285 286 static struct mntcache pcpu_mntcache[MAXCPU]; 287 288 static 289 void 290 _cache_mntref(struct mount *mp) 291 { 292 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 293 int i; 294 295 for (i = 0; i < MNTCACHE_COUNT; ++i) { 296 if (cache->mntary[i] != mp) 297 continue; 298 if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL)) 299 return; 300 } 301 atomic_add_int(&mp->mnt_refs, 1); 302 } 303 304 static 305 void 306 _cache_mntrel(struct mount *mp) 307 { 308 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 309 int i; 310 311 for (i = 0; i < MNTCACHE_COUNT; ++i) { 312 if (cache->mntary[i] == NULL) { 313 mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); 314 if (mp == NULL) 315 return; 316 } 317 } 318 i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT); 319 mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); 320 if (mp) 321 atomic_add_int(&mp->mnt_refs, -1); 322 } 323 324 /* 325 * Clears all cached mount points on all cpus. This routine should only 326 * be called when we are waiting for a mount to clear, e.g. so we can 327 * unmount. 328 */ 329 void 330 cache_clearmntcache(void) 331 { 332 int n; 333 334 for (n = 0; n < ncpus; ++n) { 335 struct mntcache *cache = &pcpu_mntcache[n]; 336 struct namecache *ncp; 337 struct mount *mp; 338 int i; 339 340 for (i = 0; i < MNTCACHE_COUNT; ++i) { 341 if (cache->mntary[i]) { 342 mp = atomic_swap_ptr( 343 (void *)&cache->mntary[i], NULL); 344 if (mp) 345 atomic_add_int(&mp->mnt_refs, -1); 346 } 347 } 348 if (cache->ncp1) { 349 ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL); 350 if (ncp) 351 _cache_drop(ncp); 352 } 353 if (cache->ncp2) { 354 ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL); 355 if (ncp) 356 _cache_drop(ncp); 357 } 358 if (cache->ncdir.ncp) { 359 ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL); 360 if (ncp) 361 _cache_drop(ncp); 362 } 363 if (cache->ncdir.mount) { 364 mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL); 365 if (mp) 366 atomic_add_int(&mp->mnt_refs, -1); 367 } 368 } 369 } 370 371 372 /* 373 * Namespace locking. The caller must already hold a reference to the 374 * namecache structure in order to lock/unlock it. 
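 *
 * (For example, the canonical ref-then-lock bracketing, which is
 * essentially what the _cache_get()/_cache_put() helpers further down
 * wrap, looks like this in sketch form:
 *
 *	_cache_hold(ncp);
 *	_cache_lock(ncp);
 *	... operate on the namespace / vnode association ...
 *	_cache_unlock(ncp);
 *	_cache_drop(ncp);
 *
 * the unlock must occur before the final drop.)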
This function prevents 375 * the namespace from being created or destroyed by accessors other than 376 * the lock holder. 377 * 378 * Note that holding a locked namecache structure prevents other threads 379 * from making namespace changes (e.g. deleting or creating), prevents 380 * vnode association state changes by other threads, and prevents the 381 * namecache entry from being resolved or unresolved by other threads. 382 * 383 * An exclusive lock owner has full authority to associate/disassociate 384 * vnodes and resolve/unresolve the locked ncp. 385 * 386 * A shared lock owner only has authority to acquire the underlying vnode, 387 * if any. 388 * 389 * The primary lock field is nc_lockstatus. nc_locktd is set after the 390 * fact (when locking) or cleared prior to unlocking. 391 * 392 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 393 * or recycled, but it does NOT help you if the vnode had already 394 * initiated a recyclement. If this is important, use cache_get() 395 * rather than cache_lock() (and deal with the differences in the 396 * way the refs counter is handled). Or, alternatively, make an 397 * unconditional call to cache_validate() or cache_resolve() 398 * after cache_lock() returns. 399 */ 400 static 401 void 402 _cache_lock(struct namecache *ncp) 403 { 404 thread_t td; 405 int didwarn; 406 int begticks; 407 int error; 408 u_int count; 409 410 KKASSERT(ncp->nc_refs != 0); 411 didwarn = 0; 412 begticks = 0; 413 td = curthread; 414 415 for (;;) { 416 count = ncp->nc_lockstatus; 417 cpu_ccfence(); 418 419 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) { 420 if (atomic_cmpset_int(&ncp->nc_lockstatus, 421 count, count + 1)) { 422 /* 423 * The vp associated with a locked ncp must 424 * be held to prevent it from being recycled. 425 * 426 * WARNING! If VRECLAIMED is set the vnode 427 * could already be in the middle of a recycle. 428 * Callers must use cache_vref() or 429 * cache_vget() on the locked ncp to 430 * validate the vp or set the cache entry 431 * to unresolved. 432 * 433 * NOTE! vhold() is allowed if we hold a 434 * lock on the ncp (which we do).
435 */ 436 ncp->nc_locktd = td; 437 if (ncp->nc_vp) 438 vhold(ncp->nc_vp); 439 break; 440 } 441 /* cmpset failed */ 442 continue; 443 } 444 if (ncp->nc_locktd == td) { 445 KKASSERT((count & NC_SHLOCK_FLAG) == 0); 446 if (atomic_cmpset_int(&ncp->nc_lockstatus, 447 count, count + 1)) { 448 break; 449 } 450 /* cmpset failed */ 451 continue; 452 } 453 tsleep_interlock(&ncp->nc_locktd, 0); 454 if (atomic_cmpset_int(&ncp->nc_lockstatus, count, 455 count | NC_EXLOCK_REQ) == 0) { 456 /* cmpset failed */ 457 continue; 458 } 459 if (begticks == 0) 460 begticks = ticks; 461 error = tsleep(&ncp->nc_locktd, PINTERLOCKED, 462 "clock", nclockwarn); 463 if (error == EWOULDBLOCK) { 464 if (didwarn == 0) { 465 didwarn = ticks; 466 kprintf("[diagnostic] cache_lock: " 467 "%s blocked on %p %08x", 468 td->td_comm, ncp, count); 469 kprintf(" \"%*.*s\"\n", 470 ncp->nc_nlen, ncp->nc_nlen, 471 ncp->nc_name); 472 } 473 } 474 /* loop */ 475 } 476 if (didwarn) { 477 kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after " 478 "%d secs\n", 479 td->td_comm, 480 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 481 (int)(ticks + (hz / 2) - begticks) / hz); 482 } 483 } 484 485 /* 486 * The shared lock works similarly to the exclusive lock except 487 * nc_locktd is left NULL and we need an interlock (VHOLD) to 488 * prevent vhold() races, since the moment our cmpset_int succeeds 489 * another cpu can come in and get its own shared lock. 490 * 491 * A critical section is needed to prevent interruption during the 492 * VHOLD interlock. 493 */ 494 static 495 void 496 _cache_lock_shared(struct namecache *ncp) 497 { 498 int didwarn; 499 int error; 500 u_int count; 501 u_int optreq = NC_EXLOCK_REQ; 502 503 KKASSERT(ncp->nc_refs != 0); 504 didwarn = 0; 505 506 for (;;) { 507 count = ncp->nc_lockstatus; 508 cpu_ccfence(); 509 510 if ((count & ~NC_SHLOCK_REQ) == 0) { 511 crit_enter(); 512 if (atomic_cmpset_int(&ncp->nc_lockstatus, 513 count, 514 (count + 1) | NC_SHLOCK_FLAG | 515 NC_SHLOCK_VHOLD)) { 516 /* 517 * The vp associated with a locked ncp must 518 * be held to prevent it from being recycled. 519 * 520 * WARNING! If VRECLAIMED is set the vnode 521 * could already be in the middle of a recycle. 522 * Callers must use cache_vref() or 523 * cache_vget() on the locked ncp to 524 * validate the vp or set the cache entry 525 * to unresolved. 526 * 527 * NOTE! vhold() is allowed if we hold a 528 * lock on the ncp (which we do). 529 */ 530 if (ncp->nc_vp) 531 vhold(ncp->nc_vp); 532 atomic_clear_int(&ncp->nc_lockstatus, 533 NC_SHLOCK_VHOLD); 534 crit_exit(); 535 break; 536 } 537 /* cmpset failed */ 538 crit_exit(); 539 continue; 540 } 541 542 /* 543 * If already held shared we can just bump the count, but 544 * only allow this if nobody is trying to get the lock 545 * exclusively. If we are blocking too long ignore excl 546 * requests (which can race/deadlock us). 547 * 548 * VHOLD is a bit of a hack. Even though we successfully 549 * added another shared ref, the cpu that got the first 550 * shared ref might not yet have held the vnode. 
551 */ 552 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) { 553 KKASSERT((count & ~(NC_EXLOCK_REQ | 554 NC_SHLOCK_REQ | 555 NC_SHLOCK_FLAG)) > 0); 556 if (atomic_cmpset_int(&ncp->nc_lockstatus, 557 count, count + 1)) { 558 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD) 559 cpu_pause(); 560 break; 561 } 562 continue; 563 } 564 tsleep_interlock(ncp, 0); 565 if (atomic_cmpset_int(&ncp->nc_lockstatus, count, 566 count | NC_SHLOCK_REQ) == 0) { 567 /* cmpset failed */ 568 continue; 569 } 570 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn); 571 if (error == EWOULDBLOCK) { 572 optreq = 0; 573 if (didwarn == 0) { 574 didwarn = ticks - nclockwarn; 575 kprintf("[diagnostic] cache_lock_shared: " 576 "%s blocked on %p %08x", 577 curthread->td_comm, ncp, count); 578 kprintf(" \"%*.*s\"\n", 579 ncp->nc_nlen, ncp->nc_nlen, 580 ncp->nc_name); 581 } 582 } 583 /* loop */ 584 } 585 if (didwarn) { 586 kprintf("[diagnostic] cache_lock_shared: " 587 "%s unblocked %*.*s after %d secs\n", 588 curthread->td_comm, 589 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 590 (int)(ticks - didwarn) / hz); 591 } 592 } 593 594 /* 595 * Lock ncp exclusively, return 0 on success. 596 * 597 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance, 598 * such as the case where one of its children is locked. 599 */ 600 static 601 int 602 _cache_lock_nonblock(struct namecache *ncp) 603 { 604 thread_t td; 605 u_int count; 606 607 td = curthread; 608 609 for (;;) { 610 count = ncp->nc_lockstatus; 611 612 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) { 613 if (atomic_cmpset_int(&ncp->nc_lockstatus, 614 count, count + 1)) { 615 /* 616 * The vp associated with a locked ncp must 617 * be held to prevent it from being recycled. 618 * 619 * WARNING! If VRECLAIMED is set the vnode 620 * could already be in the middle of a recycle. 621 * Callers must use cache_vref() or 622 * cache_vget() on the locked ncp to 623 * validate the vp or set the cache entry 624 * to unresolved. 625 * 626 * NOTE! vhold() is allowed if we hold a 627 * lock on the ncp (which we do). 628 */ 629 ncp->nc_locktd = td; 630 if (ncp->nc_vp) 631 vhold(ncp->nc_vp); 632 break; 633 } 634 /* cmpset failed */ 635 continue; 636 } 637 if (ncp->nc_locktd == td) { 638 if (atomic_cmpset_int(&ncp->nc_lockstatus, 639 count, count + 1)) { 640 break; 641 } 642 /* cmpset failed */ 643 continue; 644 } 645 return(EWOULDBLOCK); 646 } 647 return(0); 648 } 649 650 /* 651 * The shared lock works similarly to the exclusive lock except 652 * nc_locktd is left NULL and we need an interlock (VHOLD) to 653 * prevent vhold() races, since the moment our cmpset_int succeeds 654 * another cpu can come in and get its own shared lock. 655 * 656 * A critical section is needed to prevent interruption during the 657 * VHOLD interlock. 658 */ 659 static 660 int 661 _cache_lock_shared_nonblock(struct namecache *ncp) 662 { 663 u_int count; 664 665 for (;;) { 666 count = ncp->nc_lockstatus; 667 668 if ((count & ~NC_SHLOCK_REQ) == 0) { 669 crit_enter(); 670 if (atomic_cmpset_int(&ncp->nc_lockstatus, 671 count, 672 (count + 1) | NC_SHLOCK_FLAG | 673 NC_SHLOCK_VHOLD)) { 674 /* 675 * The vp associated with a locked ncp must 676 * be held to prevent it from being recycled. 677 * 678 * WARNING! If VRECLAIMED is set the vnode 679 * could already be in the middle of a recycle. 680 * Callers must use cache_vref() or 681 * cache_vget() on the locked ncp to 682 * validate the vp or set the cache entry 683 * to unresolved. 684 * 685 * NOTE! 
vhold() is allowed if we hold a 686 * lock on the ncp (which we do). 687 */ 688 if (ncp->nc_vp) 689 vhold(ncp->nc_vp); 690 atomic_clear_int(&ncp->nc_lockstatus, 691 NC_SHLOCK_VHOLD); 692 crit_exit(); 693 break; 694 } 695 /* cmpset failed */ 696 crit_exit(); 697 continue; 698 } 699 700 /* 701 * If already held shared we can just bump the count, but 702 * only allow this if nobody is trying to get the lock 703 * exclusively. 704 * 705 * VHOLD is a bit of a hack. Even though we successfully 706 * added another shared ref, the cpu that got the first 707 * shared ref might not yet have held the vnode. 708 */ 709 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) == 710 NC_SHLOCK_FLAG) { 711 KKASSERT((count & ~(NC_EXLOCK_REQ | 712 NC_SHLOCK_REQ | 713 NC_SHLOCK_FLAG)) > 0); 714 if (atomic_cmpset_int(&ncp->nc_lockstatus, 715 count, count + 1)) { 716 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD) 717 cpu_pause(); 718 break; 719 } 720 continue; 721 } 722 return(EWOULDBLOCK); 723 } 724 return(0); 725 } 726 727 /* 728 * Helper function 729 * 730 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop). 731 * 732 * nc_locktd must be NULLed out prior to nc_lockstatus getting cleared. 733 */ 734 static 735 void 736 _cache_unlock(struct namecache *ncp) 737 { 738 thread_t td __debugvar = curthread; 739 u_int count; 740 u_int ncount; 741 struct vnode *dropvp; 742 743 KKASSERT(ncp->nc_refs >= 0); 744 KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0); 745 KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td); 746 747 count = ncp->nc_lockstatus; 748 cpu_ccfence(); 749 750 /* 751 * Clear nc_locktd prior to the atomic op (excl lock only) 752 */ 753 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) 754 ncp->nc_locktd = NULL; 755 dropvp = NULL; 756 757 for (;;) { 758 if ((count & 759 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) { 760 dropvp = ncp->nc_vp; 761 if (count & NC_EXLOCK_REQ) 762 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */ 763 else 764 ncount = 0; 765 766 if (atomic_cmpset_int(&ncp->nc_lockstatus, 767 count, ncount)) { 768 if (count & NC_EXLOCK_REQ) 769 wakeup(&ncp->nc_locktd); 770 else if (count & NC_SHLOCK_REQ) 771 wakeup(ncp); 772 break; 773 } 774 dropvp = NULL; 775 } else { 776 KKASSERT((count & NC_SHLOCK_VHOLD) == 0); 777 KKASSERT((count & ~(NC_EXLOCK_REQ | 778 NC_SHLOCK_REQ | 779 NC_SHLOCK_FLAG)) > 1); 780 if (atomic_cmpset_int(&ncp->nc_lockstatus, 781 count, count - 1)) { 782 break; 783 } 784 } 785 count = ncp->nc_lockstatus; 786 cpu_ccfence(); 787 } 788 789 /* 790 * Don't actually drop the vp until we successfully clean out 791 * the lock, otherwise we may race another shared lock. 792 */ 793 if (dropvp) 794 vdrop(dropvp); 795 } 796 797 static 798 int 799 _cache_lockstatus(struct namecache *ncp) 800 { 801 if (ncp->nc_locktd == curthread) 802 return(LK_EXCLUSIVE); 803 if (ncp->nc_lockstatus & NC_SHLOCK_FLAG) 804 return(LK_SHARED); 805 return(-1); 806 } 807 808 /* 809 * cache_hold() and cache_drop() prevent the premature deletion of a 810 * namecache entry but do not prevent operations (such as zapping) on 811 * that namecache entry. 812 * 813 * This routine may only be called from outside this source module if 814 * nc_refs is already at least 1. 815 * 816 * This is a rare case where callers are allowed to hold a spinlock, 817 * so we can't ourselves. 
818 */ 819 static __inline 820 struct namecache * 821 _cache_hold(struct namecache *ncp) 822 { 823 atomic_add_int(&ncp->nc_refs, 1); 824 return(ncp); 825 } 826 827 /* 828 * Drop a cache entry, taking care to deal with races. 829 * 830 * For potential 1->0 transitions we must hold the ncp lock to safely 831 * test its flags. An unresolved entry with no children must be zapped 832 * to avoid leaks. 833 * 834 * The call to cache_zap() itself will handle all remaining races and 835 * will decrement the ncp's refs regardless. If we are resolved or 836 * have children nc_refs can safely be dropped to 0 without having to 837 * zap the entry. 838 * 839 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion. 840 * 841 * NOTE: cache_zap() may return a non-NULL referenced parent which must 842 * be dropped in a loop. 843 */ 844 static __inline 845 void 846 _cache_drop(struct namecache *ncp) 847 { 848 int refs; 849 850 while (ncp) { 851 KKASSERT(ncp->nc_refs > 0); 852 refs = ncp->nc_refs; 853 854 if (refs == 1) { 855 if (_cache_lock_nonblock(ncp) == 0) { 856 ncp->nc_flag &= ~NCF_DEFEREDZAP; 857 if ((ncp->nc_flag & NCF_UNRESOLVED) && 858 TAILQ_EMPTY(&ncp->nc_list)) { 859 ncp = cache_zap(ncp, 1); 860 continue; 861 } 862 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) { 863 _cache_unlock(ncp); 864 break; 865 } 866 _cache_unlock(ncp); 867 } 868 } else { 869 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) 870 break; 871 } 872 cpu_pause(); 873 } 874 } 875 876 /* 877 * Link a new namecache entry to its parent and to the hash table. Be 878 * careful to avoid races if vhold() blocks in the future. 879 * 880 * Both ncp and par must be referenced and locked. 881 * 882 * NOTE: The hash table spinlock is held during this call, so we can't do 883 * anything fancy. 884 */ 885 static void 886 _cache_link_parent(struct namecache *ncp, struct namecache *par, 887 struct nchash_head *nchpp) 888 { 889 KKASSERT(ncp->nc_parent == NULL); 890 ncp->nc_parent = par; 891 ncp->nc_head = nchpp; 892 893 /* 894 * Set inheritance flags. Note that the parent flags may be 895 * stale due to getattr potentially not having been run yet 896 * (it gets run during nlookup()'s). 897 */ 898 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE); 899 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) 900 ncp->nc_flag |= NCF_SF_PNOCACHE; 901 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) 902 ncp->nc_flag |= NCF_UF_PCACHE; 903 904 LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash); 905 906 if (TAILQ_EMPTY(&par->nc_list)) { 907 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 908 /* 909 * Any vp associated with an ncp which has children must 910 * be held to prevent it from being recycled. 911 */ 912 if (par->nc_vp) 913 vhold(par->nc_vp); 914 } else { 915 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 916 } 917 } 918 919 /* 920 * Remove the parent and hash associations from a namecache structure. 921 * If this is the last child of the parent the cache_drop(par) will 922 * attempt to recursively zap the parent. 923 * 924 * ncp must be locked. This routine will acquire a temporary lock on 925 * the parent as well as the appropriate hash chain.
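 *
 * Note that this is also a direct illustration of MPSAFE rule (3) at
 * the top of this file: the child is already locked by the caller and
 * the parent is only locked afterwards, i.e. the ordering is always
 * child -> parent:
 *
 *	(caller holds ncp referenced and locked)
 *	_cache_hold(par);
 *	_cache_lock(par);
 *	spin_lock(&ncp->nc_head->spin);
 *	... unlink ncp from the hash chain and from par->nc_list ...
 *	spin_unlock(&ncp->nc_head->spin);
 *	_cache_unlock(par);
 *	_cache_drop(par);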
926 */ 927 static void 928 _cache_unlink_parent(struct namecache *ncp) 929 { 930 struct namecache *par; 931 struct vnode *dropvp; 932 933 if ((par = ncp->nc_parent) != NULL) { 934 KKASSERT(ncp->nc_parent == par); 935 _cache_hold(par); 936 _cache_lock(par); 937 spin_lock(&ncp->nc_head->spin); 938 LIST_REMOVE(ncp, nc_hash); 939 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 940 dropvp = NULL; 941 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list)) 942 dropvp = par->nc_vp; 943 spin_unlock(&ncp->nc_head->spin); 944 ncp->nc_parent = NULL; 945 ncp->nc_head = NULL; 946 _cache_unlock(par); 947 _cache_drop(par); 948 949 /* 950 * We can only safely vdrop with no spinlocks held. 951 */ 952 if (dropvp) 953 vdrop(dropvp); 954 } 955 } 956 957 /* 958 * Allocate a new namecache structure. Most of the code does not require 959 * zero-termination of the string but it makes vop_compat_ncreate() easier. 960 */ 961 static struct namecache * 962 cache_alloc(int nlen) 963 { 964 struct namecache *ncp; 965 966 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 967 if (nlen) 968 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK); 969 ncp->nc_nlen = nlen; 970 ncp->nc_flag = NCF_UNRESOLVED; 971 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 972 ncp->nc_refs = 1; 973 974 TAILQ_INIT(&ncp->nc_list); 975 _cache_lock(ncp); 976 return(ncp); 977 } 978 979 /* 980 * Can only be called for the case where the ncp has never been 981 * associated with anything (so no spinlocks are needed). 982 */ 983 static void 984 _cache_free(struct namecache *ncp) 985 { 986 KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1); 987 if (ncp->nc_name) 988 kfree(ncp->nc_name, M_VFSCACHE); 989 kfree(ncp, M_VFSCACHE); 990 } 991 992 /* 993 * [re]initialize a nchandle. 994 */ 995 void 996 cache_zero(struct nchandle *nch) 997 { 998 nch->ncp = NULL; 999 nch->mount = NULL; 1000 } 1001 1002 /* 1003 * Ref and deref a namecache structure. 1004 * 1005 * The caller must specify a stable ncp pointer, typically meaning the 1006 * ncp is already referenced but this can also occur indirectly through 1007 * e.g. holding a lock on a direct child. 1008 * 1009 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 1010 * use read spinlocks here. 1011 */ 1012 struct nchandle * 1013 cache_hold(struct nchandle *nch) 1014 { 1015 _cache_hold(nch->ncp); 1016 _cache_mntref(nch->mount); 1017 return(nch); 1018 } 1019 1020 /* 1021 * Create a copy of a namecache handle for an already-referenced 1022 * entry. 1023 */ 1024 void 1025 cache_copy(struct nchandle *nch, struct nchandle *target) 1026 { 1027 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 1028 struct namecache *ncp; 1029 1030 *target = *nch; 1031 _cache_mntref(target->mount); 1032 ncp = target->ncp; 1033 if (ncp) { 1034 if (ncp == cache->ncp1) { 1035 if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL)) 1036 return; 1037 } 1038 if (ncp == cache->ncp2) { 1039 if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL)) 1040 return; 1041 } 1042 _cache_hold(ncp); 1043 } 1044 } 1045 1046 /* 1047 * Caller wants to copy the current directory, copy it out from our 1048 * pcpu cache if possible (the entire critical path is just two localized 1049 * cmpset ops). If the pcpu cache has a snapshot at all it will be a 1050 * valid one, so we don't have to lock p->p_fd even though we are loading 1051 * two fields. 1052 * 1053 * This has a limited effect since nlookup must still ref and shlock the 1054 * vnode to check perms. 
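 *
 * A typical consumer pairs this with cache_drop_ncdir() below so the
 * snapshot cycles back into the same pcpu slot (illustrative sketch
 * only; the real consumer is the nlookup path):
 *
 *	struct nchandle nch;
 *
 *	cache_copy_ncdir(p, &nch);
 *	... run a relative path lookup starting at nch ...
 *	cache_drop_ncdir(&nch);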
We do avoid the per-proc spin-lock though, which 1055 * can aid threaded programs. 1056 */ 1057 void 1058 cache_copy_ncdir(struct proc *p, struct nchandle *target) 1059 { 1060 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 1061 1062 *target = p->p_fd->fd_ncdir; 1063 if (target->ncp == cache->ncdir.ncp && 1064 target->mount == cache->ncdir.mount) { 1065 if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp, 1066 target->ncp, NULL)) { 1067 if (atomic_cmpset_ptr((void *)&cache->ncdir.mount, 1068 target->mount, NULL)) { 1069 /* CRITICAL PATH */ 1070 return; 1071 } 1072 _cache_drop(target->ncp); 1073 } 1074 } 1075 spin_lock_shared(&p->p_fd->fd_spin); 1076 cache_copy(&p->p_fd->fd_ncdir, target); 1077 spin_unlock_shared(&p->p_fd->fd_spin); 1078 } 1079 1080 void 1081 cache_changemount(struct nchandle *nch, struct mount *mp) 1082 { 1083 _cache_mntref(mp); 1084 _cache_mntrel(nch->mount); 1085 nch->mount = mp; 1086 } 1087 1088 void 1089 cache_drop(struct nchandle *nch) 1090 { 1091 _cache_mntrel(nch->mount); 1092 _cache_drop(nch->ncp); 1093 nch->ncp = NULL; 1094 nch->mount = NULL; 1095 } 1096 1097 /* 1098 * Drop the nchandle, but try to cache the ref to avoid global atomic 1099 * ops. This is typically done on the system root and jail root nchandles. 1100 */ 1101 void 1102 cache_drop_and_cache(struct nchandle *nch) 1103 { 1104 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 1105 struct namecache *ncp; 1106 1107 _cache_mntrel(nch->mount); 1108 ncp = nch->ncp; 1109 if (cache->ncp1 == NULL) { 1110 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp); 1111 if (ncp == NULL) 1112 goto done; 1113 } 1114 if (cache->ncp2 == NULL) { 1115 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp); 1116 if (ncp == NULL) 1117 goto done; 1118 } 1119 if (++cache->iter & 1) 1120 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp); 1121 else 1122 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp); 1123 if (ncp) 1124 _cache_drop(ncp); 1125 done: 1126 nch->ncp = NULL; 1127 nch->mount = NULL; 1128 } 1129 1130 /* 1131 * We are dropping what the caller believes is the current directory, 1132 * unconditionally store it in our pcpu cache. Anything already in 1133 * the cache will be discarded. 1134 */ 1135 void 1136 cache_drop_ncdir(struct nchandle *nch) 1137 { 1138 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 1139 1140 nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp); 1141 nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount); 1142 if (nch->ncp) 1143 _cache_drop(nch->ncp); 1144 if (nch->mount) 1145 _cache_mntrel(nch->mount); 1146 nch->ncp = NULL; 1147 nch->mount = NULL; 1148 } 1149 1150 int 1151 cache_lockstatus(struct nchandle *nch) 1152 { 1153 return(_cache_lockstatus(nch->ncp)); 1154 } 1155 1156 void 1157 cache_lock(struct nchandle *nch) 1158 { 1159 _cache_lock(nch->ncp); 1160 } 1161 1162 void 1163 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1164 { 1165 struct namecache *ncp = nch->ncp; 1166 1167 if (ncp_shared_lock_disable || excl || 1168 (ncp->nc_flag & NCF_UNRESOLVED)) { 1169 _cache_lock(ncp); 1170 } else { 1171 _cache_lock_shared(ncp); 1172 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1173 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1174 _cache_unlock(ncp); 1175 _cache_lock(ncp); 1176 } 1177 } else { 1178 _cache_unlock(ncp); 1179 _cache_lock(ncp); 1180 } 1181 } 1182 } 1183 1184 /* 1185 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller 1186 * is responsible for checking both for validity on return as they 1187 * may have become invalid. 
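 *
 * Usage sketch (hypothetical caller that already holds nch2 locked,
 * e.g. a rename-style operation that now needs nch1 locked as well):
 *
 *	cache_lock(&nch2);
 *	...
 *	cache_relock(&nch1, cred1, &nch2, cred2);
 *	... on return both handles are locked; re-check both for
 *	    validity before using them ...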
1188 * 1189 * We have to deal with potential deadlocks here, just ping pong 1190 * the lock until we get it (we will always block somewhere when 1191 * looping so this is not cpu-intensive). 1192 * 1193 * which = 0 nch1 not locked, nch2 is locked 1194 * which = 1 nch1 is locked, nch2 is not locked 1195 */ 1196 void 1197 cache_relock(struct nchandle *nch1, struct ucred *cred1, 1198 struct nchandle *nch2, struct ucred *cred2) 1199 { 1200 int which; 1201 1202 which = 0; 1203 1204 for (;;) { 1205 if (which == 0) { 1206 if (cache_lock_nonblock(nch1) == 0) { 1207 cache_resolve(nch1, cred1); 1208 break; 1209 } 1210 cache_unlock(nch2); 1211 cache_lock(nch1); 1212 cache_resolve(nch1, cred1); 1213 which = 1; 1214 } else { 1215 if (cache_lock_nonblock(nch2) == 0) { 1216 cache_resolve(nch2, cred2); 1217 break; 1218 } 1219 cache_unlock(nch1); 1220 cache_lock(nch2); 1221 cache_resolve(nch2, cred2); 1222 which = 0; 1223 } 1224 } 1225 } 1226 1227 int 1228 cache_lock_nonblock(struct nchandle *nch) 1229 { 1230 return(_cache_lock_nonblock(nch->ncp)); 1231 } 1232 1233 void 1234 cache_unlock(struct nchandle *nch) 1235 { 1236 _cache_unlock(nch->ncp); 1237 } 1238 1239 /* 1240 * ref-and-lock, unlock-and-deref functions. 1241 * 1242 * This function is primarily used by nlookup. Even though cache_lock 1243 * holds the vnode, it is possible that the vnode may have already 1244 * initiated a recyclement. 1245 * 1246 * We want cache_get() to return a definitively usable vnode or a 1247 * definitively unresolved ncp. 1248 */ 1249 static 1250 struct namecache * 1251 _cache_get(struct namecache *ncp) 1252 { 1253 _cache_hold(ncp); 1254 _cache_lock(ncp); 1255 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1256 _cache_setunresolved(ncp); 1257 return(ncp); 1258 } 1259 1260 /* 1261 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1262 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1263 * valid. Otherwise an exclusive lock will be acquired instead. 1264 */ 1265 static 1266 struct namecache * 1267 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1268 { 1269 if (ncp_shared_lock_disable || excl || 1270 (ncp->nc_flag & NCF_UNRESOLVED)) { 1271 return(_cache_get(ncp)); 1272 } 1273 _cache_hold(ncp); 1274 _cache_lock_shared(ncp); 1275 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1276 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1277 _cache_unlock(ncp); 1278 ncp = _cache_get(ncp); 1279 _cache_drop(ncp); 1280 } 1281 } else { 1282 _cache_unlock(ncp); 1283 ncp = _cache_get(ncp); 1284 _cache_drop(ncp); 1285 } 1286 return(ncp); 1287 } 1288 1289 /* 1290 * This is a special form of _cache_lock() which only succeeds if 1291 * it can get a pristine, non-recursive lock. The caller must have 1292 * already ref'd the ncp. 1293 * 1294 * On success the ncp will be locked, on failure it will not. The 1295 * ref count does not change either way. 1296 * 1297 * We want _cache_lock_special() (on success) to return a definitively 1298 * usable vnode or a definitively unresolved ncp. 
1299 */ 1300 static int 1301 _cache_lock_special(struct namecache *ncp) 1302 { 1303 if (_cache_lock_nonblock(ncp) == 0) { 1304 if ((ncp->nc_lockstatus & 1305 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) { 1306 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1307 _cache_setunresolved(ncp); 1308 return(0); 1309 } 1310 _cache_unlock(ncp); 1311 } 1312 return(EWOULDBLOCK); 1313 } 1314 1315 /* 1316 * This function tries to get a shared lock but will back-off to an exclusive 1317 * lock if: 1318 * 1319 * (1) Some other thread is trying to obtain an exclusive lock 1320 * (to prevent the exclusive requester from getting livelocked out 1321 * by many shared locks). 1322 * 1323 * (2) The current thread already owns an exclusive lock (to avoid 1324 * deadlocking). 1325 * 1326 * WARNING! On machines with lots of cores we really want to try hard to 1327 * get a shared lock or concurrent path lookups can chain-react 1328 * into a very high-latency exclusive lock. 1329 */ 1330 static int 1331 _cache_lock_shared_special(struct namecache *ncp) 1332 { 1333 /* 1334 * Only honor a successful shared lock (returning 0) if there is 1335 * no exclusive request pending and the vnode, if present, is not 1336 * in a reclaimed state. 1337 */ 1338 if (_cache_lock_shared_nonblock(ncp) == 0) { 1339 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) { 1340 if (ncp->nc_vp == NULL || 1341 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 1342 return(0); 1343 } 1344 } 1345 _cache_unlock(ncp); 1346 return(EWOULDBLOCK); 1347 } 1348 1349 /* 1350 * Non-blocking shared lock failed. If we already own the exclusive 1351 * lock just acquire another exclusive lock (instead of deadlocking). 1352 * Otherwise acquire a shared lock. 1353 */ 1354 if (ncp->nc_locktd == curthread) { 1355 _cache_lock(ncp); 1356 return(0); 1357 } 1358 _cache_lock_shared(ncp); 1359 return(0); 1360 } 1361 1362 1363 /* 1364 * NOTE: The same nchandle can be passed for both arguments. 1365 */ 1366 void 1367 cache_get(struct nchandle *nch, struct nchandle *target) 1368 { 1369 KKASSERT(nch->ncp->nc_refs > 0); 1370 target->mount = nch->mount; 1371 target->ncp = _cache_get(nch->ncp); 1372 _cache_mntref(target->mount); 1373 } 1374 1375 void 1376 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1377 { 1378 KKASSERT(nch->ncp->nc_refs > 0); 1379 target->mount = nch->mount; 1380 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1381 _cache_mntref(target->mount); 1382 } 1383 1384 /* 1385 * 1386 */ 1387 static __inline 1388 void 1389 _cache_put(struct namecache *ncp) 1390 { 1391 _cache_unlock(ncp); 1392 _cache_drop(ncp); 1393 } 1394 1395 /* 1396 * 1397 */ 1398 void 1399 cache_put(struct nchandle *nch) 1400 { 1401 _cache_mntrel(nch->mount); 1402 _cache_put(nch->ncp); 1403 nch->ncp = NULL; 1404 nch->mount = NULL; 1405 } 1406 1407 /* 1408 * Resolve an unresolved ncp by associating a vnode with it. If the 1409 * vnode is NULL, a negative cache entry is created. 1410 * 1411 * The ncp should be locked on entry and will remain locked on return. 1412 */ 1413 static 1414 void 1415 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1416 { 1417 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 1418 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1419 1420 if (vp != NULL) { 1421 /* 1422 * Any vp associated with an ncp which has children must 1423 * be held. Any vp associated with a locked ncp must be held. 
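 *
 * In other words the code below may acquire up to two vhold()
 * references on vp: one because the ncp has children and one because
 * the ncp is locked.  These mirror the (up to) two vdrop() calls made
 * when the association is torn down again in _cache_setunresolved().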
1424 */ 1425 if (!TAILQ_EMPTY(&ncp->nc_list)) 1426 vhold(vp); 1427 spin_lock(&vp->v_spin); 1428 ncp->nc_vp = vp; 1429 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1430 spin_unlock(&vp->v_spin); 1431 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) 1432 vhold(vp); 1433 1434 /* 1435 * Set auxiliary flags 1436 */ 1437 switch(vp->v_type) { 1438 case VDIR: 1439 ncp->nc_flag |= NCF_ISDIR; 1440 break; 1441 case VLNK: 1442 ncp->nc_flag |= NCF_ISSYMLINK; 1443 /* XXX cache the contents of the symlink */ 1444 break; 1445 default: 1446 break; 1447 } 1448 atomic_add_int(&numcache, 1); 1449 ncp->nc_error = 0; 1450 /* XXX: this is a hack to work-around the lack of a real pfs vfs 1451 * implementation*/ 1452 if (mp != NULL) 1453 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1454 vp->v_pfsmp = mp; 1455 } else { 1456 /* 1457 * When creating a negative cache hit we set the 1458 * namecache_gen. A later resolve will clean out the 1459 * negative cache hit if the mount point's namecache_gen 1460 * has changed. Used by devfs, could also be used by 1461 * other remote FSs. 1462 */ 1463 ncp->nc_vp = NULL; 1464 spin_lock(&ncspin); 1465 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode); 1466 ++numneg; 1467 spin_unlock(&ncspin); 1468 ncp->nc_error = ENOENT; 1469 if (mp) 1470 VFS_NCPGEN_SET(mp, ncp); 1471 } 1472 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1473 } 1474 1475 /* 1476 * 1477 */ 1478 void 1479 cache_setvp(struct nchandle *nch, struct vnode *vp) 1480 { 1481 _cache_setvp(nch->mount, nch->ncp, vp); 1482 } 1483 1484 /* 1485 * 1486 */ 1487 void 1488 cache_settimeout(struct nchandle *nch, int nticks) 1489 { 1490 struct namecache *ncp = nch->ncp; 1491 1492 if ((ncp->nc_timeout = ticks + nticks) == 0) 1493 ncp->nc_timeout = 1; 1494 } 1495 1496 /* 1497 * Disassociate the vnode or negative-cache association and mark a 1498 * namecache entry as unresolved again. Note that the ncp is still 1499 * left in the hash table and still linked to its parent. 1500 * 1501 * The ncp should be locked and refd on entry and will remain locked and refd 1502 * on return. 1503 * 1504 * This routine is normally never called on a directory containing children. 1505 * However, NFS often does just that in its rename() code as a cop-out to 1506 * avoid complex namespace operations. This disconnects a directory vnode 1507 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1508 * sync. 1509 * 1510 */ 1511 static 1512 void 1513 _cache_setunresolved(struct namecache *ncp) 1514 { 1515 struct vnode *vp; 1516 1517 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1518 ncp->nc_flag |= NCF_UNRESOLVED; 1519 ncp->nc_timeout = 0; 1520 ncp->nc_error = ENOTCONN; 1521 if ((vp = ncp->nc_vp) != NULL) { 1522 atomic_add_int(&numcache, -1); 1523 spin_lock(&vp->v_spin); 1524 ncp->nc_vp = NULL; 1525 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1526 spin_unlock(&vp->v_spin); 1527 1528 /* 1529 * Any vp associated with an ncp with children is 1530 * held by that ncp. Any vp associated with a locked 1531 * ncp is held by that ncp. These conditions must be 1532 * undone when the vp is cleared out from the ncp. 
1533 */ 1534 if (!TAILQ_EMPTY(&ncp->nc_list)) 1535 vdrop(vp); 1536 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) 1537 vdrop(vp); 1538 } else { 1539 spin_lock(&ncspin); 1540 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode); 1541 --numneg; 1542 spin_unlock(&ncspin); 1543 } 1544 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1545 } 1546 } 1547 1548 /* 1549 * The cache_nresolve() code calls this function to automatically 1550 * set a resolved cache element to unresolved if it has timed out 1551 * or if it is a negative cache hit and the mount point namecache_gen 1552 * has changed. 1553 */ 1554 static __inline int 1555 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1556 { 1557 /* 1558 * Try to zap entries that have timed out. We have 1559 * to be careful here because locked leafs may depend 1560 * on the vnode remaining intact in a parent, so only 1561 * do this under very specific conditions. 1562 */ 1563 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1564 TAILQ_EMPTY(&ncp->nc_list)) { 1565 return 1; 1566 } 1567 1568 /* 1569 * If a resolved negative cache hit is invalid due to 1570 * the mount's namecache generation being bumped, zap it. 1571 */ 1572 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1573 return 1; 1574 } 1575 1576 /* 1577 * Otherwise we are good 1578 */ 1579 return 0; 1580 } 1581 1582 static __inline void 1583 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1584 { 1585 /* 1586 * Already in an unresolved state, nothing to do. 1587 */ 1588 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1589 if (_cache_auto_unresolve_test(mp, ncp)) 1590 _cache_setunresolved(ncp); 1591 } 1592 } 1593 1594 /* 1595 * 1596 */ 1597 void 1598 cache_setunresolved(struct nchandle *nch) 1599 { 1600 _cache_setunresolved(nch->ncp); 1601 } 1602 1603 /* 1604 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1605 * looking for matches. This flag tells the lookup code when it must 1606 * check for a mount linkage and also prevents the directories in question 1607 * from being deleted or renamed. 1608 */ 1609 static 1610 int 1611 cache_clrmountpt_callback(struct mount *mp, void *data) 1612 { 1613 struct nchandle *nch = data; 1614 1615 if (mp->mnt_ncmounton.ncp == nch->ncp) 1616 return(1); 1617 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1618 return(1); 1619 return(0); 1620 } 1621 1622 /* 1623 * 1624 */ 1625 void 1626 cache_clrmountpt(struct nchandle *nch) 1627 { 1628 int count; 1629 1630 count = mountlist_scan(cache_clrmountpt_callback, nch, 1631 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 1632 if (count == 0) 1633 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1634 } 1635 1636 /* 1637 * Invalidate portions of the namecache topology given a starting entry. 1638 * The passed ncp is set to an unresolved state and: 1639 * 1640 * The passed ncp must be referencxed and locked. The routine may unlock 1641 * and relock ncp several times, and will recheck the children and loop 1642 * to catch races. When done the passed ncp will be returned with the 1643 * reference and lock intact. 1644 * 1645 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1646 * that the physical underlying nodes have been 1647 * destroyed... as in deleted. For example, when 1648 * a directory is removed. This will cause record 1649 * lookups on the name to no longer be able to find 1650 * the record and tells the resolver to return failure 1651 * rather then trying to resolve through the parent. 1652 * 1653 * The topology itself, including ncp->nc_name, 1654 * remains intact. 
1655 * 1656 * This only applies to the passed ncp; even if CINV_CHILDREN 1657 * is specified the children are not flagged. 1658 * 1659 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1660 * state as well. 1661 * 1662 * Note that this will also have the side effect of 1663 * cleaning out any unreferenced nodes in the topology 1664 * from the leaves up as the recursion backs out. 1665 * 1666 * Note that the topology for any referenced nodes remains intact, but 1667 * the nodes will be marked as having been destroyed and will be set 1668 * to an unresolved state. 1669 * 1670 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1671 * the namecache entry may not actually be invalidated on return if it was 1672 * revalidated while recursing down into its children. This code guarantees 1673 * that the node(s) will go through an invalidation cycle, but does not 1674 * guarantee that they will remain in an invalidated state. 1675 * 1676 * Returns non-zero if a revalidation was detected during the invalidation 1677 * recursion, zero otherwise. Note that since only the original ncp is 1678 * locked the revalidation ultimately can only indicate that the original ncp 1679 * *MIGHT* not have been re-resolved. 1680 * 1681 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we 1682 * have to avoid blowing out the kernel stack. We do this by saving the 1683 * deep namecache node and aborting the recursion, then re-recursing at that 1684 * node using a depth-first algorithm in order to allow multiple deep 1685 * recursions to chain through each other, then we restart the invalidation 1686 * from scratch. 1687 */ 1688 1689 struct cinvtrack { 1690 struct namecache *resume_ncp; 1691 int depth; 1692 }; 1693 1694 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1695 1696 static 1697 int 1698 _cache_inval(struct namecache *ncp, int flags) 1699 { 1700 struct cinvtrack track; 1701 struct namecache *ncp2; 1702 int r; 1703 1704 track.depth = 0; 1705 track.resume_ncp = NULL; 1706 1707 for (;;) { 1708 r = _cache_inval_internal(ncp, flags, &track); 1709 if (track.resume_ncp == NULL) 1710 break; 1711 _cache_unlock(ncp); 1712 while ((ncp2 = track.resume_ncp) != NULL) { 1713 track.resume_ncp = NULL; 1714 _cache_lock(ncp2); 1715 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1716 &track); 1717 _cache_put(ncp2); 1718 } 1719 _cache_lock(ncp); 1720 } 1721 return(r); 1722 } 1723 1724 int 1725 cache_inval(struct nchandle *nch, int flags) 1726 { 1727 return(_cache_inval(nch->ncp, flags)); 1728 } 1729 1730 /* 1731 * Helper for _cache_inval(). The passed ncp is refd and locked and 1732 * remains that way on return, but may be unlocked/relocked multiple 1733 * times by the routine.
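 *
 * For reference, a typical invalidation as documented above (e.g. a
 * VFS tearing down a directory it just removed) is driven through the
 * public wrapper roughly as follows (illustrative sketch only):
 *
 *	cache_lock(&nch);
 *	cache_inval(&nch, CINV_DESTROY | CINV_CHILDREN);
 *	cache_unlock(&nch);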
1734 */ 1735 static int 1736 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1737 { 1738 struct namecache *nextkid; 1739 int rcnt = 0; 1740 1741 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1742 1743 _cache_setunresolved(ncp); 1744 if (flags & CINV_DESTROY) { 1745 ncp->nc_flag |= NCF_DESTROYED; 1746 ++ncp->nc_generation; 1747 } 1748 while ((flags & CINV_CHILDREN) && 1749 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1750 ) { 1751 struct namecache *kid; 1752 int restart; 1753 1754 restart = 0; 1755 _cache_hold(nextkid); 1756 if (++track->depth > MAX_RECURSION_DEPTH) { 1757 track->resume_ncp = ncp; 1758 _cache_hold(ncp); 1759 ++rcnt; 1760 } 1761 while ((kid = nextkid) != NULL) { 1762 /* 1763 * Parent (ncp) must be locked for the iteration. 1764 */ 1765 nextkid = NULL; 1766 if (kid->nc_parent != ncp) { 1767 _cache_drop(kid); 1768 kprintf("cache_inval_internal restartA %s\n", 1769 ncp->nc_name); 1770 restart = 1; 1771 break; 1772 } 1773 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1774 _cache_hold(nextkid); 1775 1776 /* 1777 * Parent unlocked for this section to avoid 1778 * deadlocks. 1779 */ 1780 _cache_unlock(ncp); 1781 if (track->resume_ncp) { 1782 _cache_drop(kid); 1783 _cache_lock(ncp); 1784 break; 1785 } 1786 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1787 TAILQ_FIRST(&kid->nc_list) 1788 ) { 1789 _cache_lock(kid); 1790 if (kid->nc_parent != ncp) { 1791 kprintf("cache_inval_internal " 1792 "restartB %s\n", 1793 ncp->nc_name); 1794 restart = 1; 1795 _cache_unlock(kid); 1796 _cache_drop(kid); 1797 _cache_lock(ncp); 1798 break; 1799 } 1800 1801 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track); 1802 _cache_unlock(kid); 1803 } 1804 _cache_drop(kid); 1805 _cache_lock(ncp); 1806 } 1807 if (nextkid) 1808 _cache_drop(nextkid); 1809 --track->depth; 1810 if (restart == 0) 1811 break; 1812 } 1813 1814 /* 1815 * Someone could have gotten in there while ncp was unlocked, 1816 * retry if so. 1817 */ 1818 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1819 ++rcnt; 1820 return (rcnt); 1821 } 1822 1823 /* 1824 * Invalidate a vnode's namecache associations. To avoid races against 1825 * the resolver we do not invalidate a node which we previously invalidated 1826 * but which was then re-resolved while we were in the invalidation loop. 1827 * 1828 * Returns non-zero if any namecache entries remain after the invalidation 1829 * loop completed. 1830 * 1831 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1832 * be ripped out of the topology while held, the vnode's v_namecache 1833 * list has no such restriction. NCP's can be ripped out of the list 1834 * at virtually any time if not locked, even if held. 1835 * 1836 * In addition, the v_namecache list itself must be locked via 1837 * the vnode's spinlock. 
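 *
 * That is why the loop below always _cache_hold()s the next list entry
 * before releasing v_spin: the extra ref keeps 'next' from being freed
 * while the spinlock is dropped, and the subsequent nc_vp != vp checks
 * catch the case where an entry was nevertheless ripped out of this
 * vnode's list in the meantime.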
1838 */ 1839 int 1840 cache_inval_vp(struct vnode *vp, int flags) 1841 { 1842 struct namecache *ncp; 1843 struct namecache *next; 1844 1845 restart: 1846 spin_lock(&vp->v_spin); 1847 ncp = TAILQ_FIRST(&vp->v_namecache); 1848 if (ncp) 1849 _cache_hold(ncp); 1850 while (ncp) { 1851 /* loop entered with ncp held and vp spin-locked */ 1852 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1853 _cache_hold(next); 1854 spin_unlock(&vp->v_spin); 1855 _cache_lock(ncp); 1856 if (ncp->nc_vp != vp) { 1857 kprintf("Warning: cache_inval_vp: race-A detected on " 1858 "%s\n", ncp->nc_name); 1859 _cache_put(ncp); 1860 if (next) 1861 _cache_drop(next); 1862 goto restart; 1863 } 1864 _cache_inval(ncp, flags); 1865 _cache_put(ncp); /* also releases reference */ 1866 ncp = next; 1867 spin_lock(&vp->v_spin); 1868 if (ncp && ncp->nc_vp != vp) { 1869 spin_unlock(&vp->v_spin); 1870 kprintf("Warning: cache_inval_vp: race-B detected on " 1871 "%s\n", ncp->nc_name); 1872 _cache_drop(ncp); 1873 goto restart; 1874 } 1875 } 1876 spin_unlock(&vp->v_spin); 1877 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1878 } 1879 1880 /* 1881 * This routine is used instead of the normal cache_inval_vp() when we 1882 * are trying to recycle otherwise good vnodes. 1883 * 1884 * Return 0 on success, non-zero if not all namecache records could be 1885 * disassociated from the vnode (for various reasons). 1886 */ 1887 int 1888 cache_inval_vp_nonblock(struct vnode *vp) 1889 { 1890 struct namecache *ncp; 1891 struct namecache *next; 1892 1893 spin_lock(&vp->v_spin); 1894 ncp = TAILQ_FIRST(&vp->v_namecache); 1895 if (ncp) 1896 _cache_hold(ncp); 1897 while (ncp) { 1898 /* loop entered with ncp held */ 1899 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1900 _cache_hold(next); 1901 spin_unlock(&vp->v_spin); 1902 if (_cache_lock_nonblock(ncp)) { 1903 _cache_drop(ncp); 1904 if (next) 1905 _cache_drop(next); 1906 goto done; 1907 } 1908 if (ncp->nc_vp != vp) { 1909 kprintf("Warning: cache_inval_vp: race-A detected on " 1910 "%s\n", ncp->nc_name); 1911 _cache_put(ncp); 1912 if (next) 1913 _cache_drop(next); 1914 goto done; 1915 } 1916 _cache_inval(ncp, 0); 1917 _cache_put(ncp); /* also releases reference */ 1918 ncp = next; 1919 spin_lock(&vp->v_spin); 1920 if (ncp && ncp->nc_vp != vp) { 1921 spin_unlock(&vp->v_spin); 1922 kprintf("Warning: cache_inval_vp: race-B detected on " 1923 "%s\n", ncp->nc_name); 1924 _cache_drop(ncp); 1925 goto done; 1926 } 1927 } 1928 spin_unlock(&vp->v_spin); 1929 done: 1930 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1931 } 1932 1933 /* 1934 * Clears the universal directory search 'ok' flag. This flag allows 1935 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1936 * so clearing it simply forces revalidation. 1937 */ 1938 void 1939 cache_inval_wxok(struct vnode *vp) 1940 { 1941 struct namecache *ncp; 1942 1943 spin_lock(&vp->v_spin); 1944 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1945 if (ncp->nc_flag & NCF_WXOK) 1946 atomic_clear_short(&ncp->nc_flag, NCF_WXOK); 1947 } 1948 spin_unlock(&vp->v_spin); 1949 } 1950 1951 /* 1952 * The source ncp has been renamed to the target ncp. Both fncp and tncp 1953 * must be locked. The target ncp is destroyed (as a normal rename-over 1954 * would destroy the target file or directory). 1955 * 1956 * Because there may be references to the source ncp we cannot copy its 1957 * contents to the target. Instead the source ncp is relinked as the target 1958 * and the target ncp is removed from the namecache topology. 
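 *
 * Usage sketch (hypothetical VFS rename path; both handles come from
 * lookups and are locked by the caller):
 *
 *	cache_rename(&fnch, &tnch);
 *	... fnch now carries the target name under the target parent,
 *	    tnch has been unlinked from the topology; unlock/drop both
 *	    handles as usual when done ...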
1959 */ 1960 void 1961 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1962 { 1963 struct namecache *fncp = fnch->ncp; 1964 struct namecache *tncp = tnch->ncp; 1965 struct namecache *tncp_par; 1966 struct nchash_head *nchpp; 1967 u_int32_t hash; 1968 char *oname; 1969 char *nname; 1970 1971 ++fncp->nc_generation; 1972 ++tncp->nc_generation; 1973 if (tncp->nc_nlen) { 1974 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK); 1975 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1976 nname[tncp->nc_nlen] = 0; 1977 } else { 1978 nname = NULL; 1979 } 1980 1981 /* 1982 * Rename fncp (unlink) 1983 */ 1984 _cache_unlink_parent(fncp); 1985 oname = fncp->nc_name; 1986 fncp->nc_name = nname; 1987 fncp->nc_nlen = tncp->nc_nlen; 1988 if (oname) 1989 kfree(oname, M_VFSCACHE); 1990 1991 tncp_par = tncp->nc_parent; 1992 _cache_hold(tncp_par); 1993 _cache_lock(tncp_par); 1994 1995 /* 1996 * Rename fncp (relink) 1997 */ 1998 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1999 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 2000 nchpp = NCHHASH(hash); 2001 2002 spin_lock(&nchpp->spin); 2003 _cache_link_parent(fncp, tncp_par, nchpp); 2004 spin_unlock(&nchpp->spin); 2005 2006 _cache_put(tncp_par); 2007 2008 /* 2009 * Get rid of the overwritten tncp (unlink) 2010 */ 2011 _cache_unlink(tncp); 2012 } 2013 2014 /* 2015 * Perform actions consistent with unlinking a file. The passed-in ncp 2016 * must be locked. 2017 * 2018 * The ncp is marked DESTROYED so it no longer shows up in searches, 2019 * and will be physically deleted when the vnode goes away. 2020 * 2021 * If the related vnode has no refs then we cycle it through vget()/vput() 2022 * to (possibly if we don't have a ref race) trigger a deactivation, 2023 * allowing the VFS to trivially detect and recycle the deleted vnode 2024 * via VOP_INACTIVE(). 2025 * 2026 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 2027 * target ncp. 2028 */ 2029 void 2030 cache_unlink(struct nchandle *nch) 2031 { 2032 _cache_unlink(nch->ncp); 2033 } 2034 2035 static void 2036 _cache_unlink(struct namecache *ncp) 2037 { 2038 struct vnode *vp; 2039 2040 /* 2041 * Causes lookups to fail and allows another ncp with the same 2042 * name to be created under ncp->nc_parent. 2043 */ 2044 ncp->nc_flag |= NCF_DESTROYED; 2045 ++ncp->nc_generation; 2046 2047 /* 2048 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 2049 * force action on the 1->0 transition. 2050 */ 2051 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2052 (vp = ncp->nc_vp) != NULL) { 2053 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 2054 if (VREFCNT(vp) <= 0) { 2055 if (vget(vp, LK_SHARED) == 0) 2056 vput(vp); 2057 } 2058 } 2059 } 2060 2061 /* 2062 * Return non-zero if the nch might be associated with an open and/or mmap()'d 2063 * file. The easy solution is to just return non-zero if the vnode has refs. 2064 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 2065 * force the reclaim). 2066 */ 2067 int 2068 cache_isopen(struct nchandle *nch) 2069 { 2070 struct vnode *vp; 2071 struct namecache *ncp = nch->ncp; 2072 2073 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2074 (vp = ncp->nc_vp) != NULL && 2075 VREFCNT(vp)) { 2076 return 1; 2077 } 2078 return 0; 2079 } 2080 2081 2082 /* 2083 * vget the vnode associated with the namecache entry. Resolve the namecache 2084 * entry if necessary. The passed ncp must be referenced and locked. If 2085 * the ncp is resolved it might be locked shared. 2086 * 2087 * lk_type may be LK_SHARED, LK_EXCLUSIVE. 
A ref'd, possibly locked 2088 * (depending on the passed lk_type) will be returned in *vpp with an error 2089 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 2090 * most typical error is ENOENT, meaning that the ncp represents a negative 2091 * cache hit and there is no vnode to retrieve, but other errors can occur 2092 * too. 2093 * 2094 * The vget() can race a reclaim. If this occurs we re-resolve the 2095 * namecache entry. 2096 * 2097 * There are numerous places in the kernel where vget() is called on a 2098 * vnode while one or more of its namecache entries is locked. Releasing 2099 * a vnode never deadlocks against locked namecache entries (the vnode 2100 * will not get recycled while referenced ncp's exist). This means we 2101 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 2102 * lock when acquiring the vp lock or we might cause a deadlock. 2103 * 2104 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2105 * unresolved. If a reclaim race occurs the passed-in ncp will be 2106 * relocked exclusively before being re-resolved. 2107 */ 2108 int 2109 cache_vget(struct nchandle *nch, struct ucred *cred, 2110 int lk_type, struct vnode **vpp) 2111 { 2112 struct namecache *ncp; 2113 struct vnode *vp; 2114 int error; 2115 2116 ncp = nch->ncp; 2117 again: 2118 vp = NULL; 2119 if (ncp->nc_flag & NCF_UNRESOLVED) 2120 error = cache_resolve(nch, cred); 2121 else 2122 error = 0; 2123 2124 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2125 error = vget(vp, lk_type); 2126 if (error) { 2127 /* 2128 * VRECLAIM race 2129 * 2130 * The ncp may have been locked shared, we must relock 2131 * it exclusively before we can set it to unresolved. 2132 */ 2133 if (error == ENOENT) { 2134 kprintf("Warning: vnode reclaim race detected " 2135 "in cache_vget on %p (%s)\n", 2136 vp, ncp->nc_name); 2137 _cache_unlock(ncp); 2138 _cache_lock(ncp); 2139 _cache_setunresolved(ncp); 2140 goto again; 2141 } 2142 2143 /* 2144 * Not a reclaim race, some other error. 2145 */ 2146 KKASSERT(ncp->nc_vp == vp); 2147 vp = NULL; 2148 } else { 2149 KKASSERT(ncp->nc_vp == vp); 2150 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2151 } 2152 } 2153 if (error == 0 && vp == NULL) 2154 error = ENOENT; 2155 *vpp = vp; 2156 return(error); 2157 } 2158 2159 /* 2160 * Similar to cache_vget() but only acquires a ref on the vnode. 2161 * 2162 * NOTE: The passed-in ncp must be locked exclusively if it is initially 2163 * unresolved. If a reclaim race occurs the passed-in ncp will be 2164 * relocked exclusively before being re-resolved. 2165 */ 2166 int 2167 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2168 { 2169 struct namecache *ncp; 2170 struct vnode *vp; 2171 int error; 2172 2173 ncp = nch->ncp; 2174 again: 2175 vp = NULL; 2176 if (ncp->nc_flag & NCF_UNRESOLVED) 2177 error = cache_resolve(nch, cred); 2178 else 2179 error = 0; 2180 2181 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 2182 error = vget(vp, LK_SHARED); 2183 if (error) { 2184 /* 2185 * VRECLAIM race 2186 */ 2187 if (error == ENOENT) { 2188 kprintf("Warning: vnode reclaim race detected " 2189 "in cache_vget on %p (%s)\n", 2190 vp, ncp->nc_name); 2191 _cache_unlock(ncp); 2192 _cache_lock(ncp); 2193 _cache_setunresolved(ncp); 2194 goto again; 2195 } 2196 2197 /* 2198 * Not a reclaim race, some other error. 
2199 */ 2200 KKASSERT(ncp->nc_vp == vp); 2201 vp = NULL; 2202 } else { 2203 KKASSERT(ncp->nc_vp == vp); 2204 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2205 /* caller does not want a lock */ 2206 vn_unlock(vp); 2207 } 2208 } 2209 if (error == 0 && vp == NULL) 2210 error = ENOENT; 2211 *vpp = vp; 2212 return(error); 2213 } 2214 2215 /* 2216 * Return a referenced vnode representing the parent directory of 2217 * ncp. 2218 * 2219 * Because the caller has locked the ncp it should not be possible for 2220 * the parent ncp to go away. However, the parent can unresolve its 2221 * dvp at any time so we must be able to acquire a lock on the parent 2222 * to safely access nc_vp. 2223 * 2224 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2225 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2226 * getting destroyed. 2227 * 2228 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2229 * lock on the ncp in question.. 2230 */ 2231 static struct vnode * 2232 cache_dvpref(struct namecache *ncp) 2233 { 2234 struct namecache *par; 2235 struct vnode *dvp; 2236 2237 dvp = NULL; 2238 if ((par = ncp->nc_parent) != NULL) { 2239 _cache_hold(par); 2240 _cache_lock(par); 2241 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2242 if ((dvp = par->nc_vp) != NULL) 2243 vhold(dvp); 2244 } 2245 _cache_unlock(par); 2246 if (dvp) { 2247 if (vget(dvp, LK_SHARED) == 0) { 2248 vn_unlock(dvp); 2249 vdrop(dvp); 2250 /* return refd, unlocked dvp */ 2251 } else { 2252 vdrop(dvp); 2253 dvp = NULL; 2254 } 2255 } 2256 _cache_drop(par); 2257 } 2258 return(dvp); 2259 } 2260 2261 /* 2262 * Convert a directory vnode to a namecache record without any other 2263 * knowledge of the topology. This ONLY works with directory vnodes and 2264 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2265 * returned ncp (if not NULL) will be held and unlocked. 2266 * 2267 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2268 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2269 * for dvp. This will fail only if the directory has been deleted out from 2270 * under the caller. 2271 * 2272 * Callers must always check for a NULL return no matter the value of 'makeit'. 2273 * 2274 * To avoid underflowing the kernel stack each recursive call increments 2275 * the makeit variable. 2276 */ 2277 2278 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2279 struct vnode *dvp, char *fakename); 2280 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2281 struct vnode **saved_dvp); 2282 2283 int 2284 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2285 struct nchandle *nch) 2286 { 2287 struct vnode *saved_dvp; 2288 struct vnode *pvp; 2289 char *fakename; 2290 int error; 2291 2292 nch->ncp = NULL; 2293 nch->mount = dvp->v_mount; 2294 saved_dvp = NULL; 2295 fakename = NULL; 2296 2297 /* 2298 * Handle the makeit == 0 degenerate case 2299 */ 2300 if (makeit == 0) { 2301 spin_lock_shared(&dvp->v_spin); 2302 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2303 if (nch->ncp) 2304 cache_hold(nch); 2305 spin_unlock_shared(&dvp->v_spin); 2306 } 2307 2308 /* 2309 * Loop until resolution, inside code will break out on error. 2310 */ 2311 while (makeit) { 2312 /* 2313 * Break out if we successfully acquire a working ncp. 
2314 */ 2315 spin_lock_shared(&dvp->v_spin); 2316 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2317 if (nch->ncp) { 2318 cache_hold(nch); 2319 spin_unlock_shared(&dvp->v_spin); 2320 break; 2321 } 2322 spin_unlock_shared(&dvp->v_spin); 2323 2324 /* 2325 * If dvp is the root of its filesystem it should already 2326 * have a namecache pointer associated with it as a side 2327 * effect of the mount, but it may have been disassociated. 2328 */ 2329 if (dvp->v_flag & VROOT) { 2330 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2331 error = cache_resolve_mp(nch->mount); 2332 _cache_put(nch->ncp); 2333 if (ncvp_debug) { 2334 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2335 dvp->v_mount, error); 2336 } 2337 if (error) { 2338 if (ncvp_debug) 2339 kprintf(" failed\n"); 2340 nch->ncp = NULL; 2341 break; 2342 } 2343 if (ncvp_debug) 2344 kprintf(" succeeded\n"); 2345 continue; 2346 } 2347 2348 /* 2349 * If we are recursed too deeply resort to an O(n^2) 2350 * algorithm to resolve the namecache topology. The 2351 * resolved pvp is left referenced in saved_dvp to 2352 * prevent the tree from being destroyed while we loop. 2353 */ 2354 if (makeit > 20) { 2355 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2356 if (error) { 2357 kprintf("lookupdotdot(longpath) failed %d " 2358 "dvp %p\n", error, dvp); 2359 nch->ncp = NULL; 2360 break; 2361 } 2362 continue; 2363 } 2364 2365 /* 2366 * Get the parent directory and resolve its ncp. 2367 */ 2368 if (fakename) { 2369 kfree(fakename, M_TEMP); 2370 fakename = NULL; 2371 } 2372 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2373 &fakename); 2374 if (error) { 2375 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2376 break; 2377 } 2378 vn_unlock(pvp); 2379 2380 /* 2381 * Reuse makeit as a recursion depth counter. On success 2382 * nch will be fully referenced. 2383 */ 2384 cache_fromdvp(pvp, cred, makeit + 1, nch); 2385 vrele(pvp); 2386 if (nch->ncp == NULL) 2387 break; 2388 2389 /* 2390 * Do an inefficient scan of pvp (embodied by ncp) to look 2391 * for dvp. This will create a namecache record for dvp on 2392 * success. We loop up to recheck on success. 2393 * 2394 * ncp and dvp are both held but not locked. 2395 */ 2396 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2397 if (error) { 2398 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2399 pvp, nch->ncp->nc_name, dvp); 2400 cache_drop(nch); 2401 /* nch was NULLed out, reload mount */ 2402 nch->mount = dvp->v_mount; 2403 break; 2404 } 2405 if (ncvp_debug) { 2406 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2407 pvp, nch->ncp->nc_name); 2408 } 2409 cache_drop(nch); 2410 /* nch was NULLed out, reload mount */ 2411 nch->mount = dvp->v_mount; 2412 } 2413 2414 /* 2415 * If nch->ncp is non-NULL it will have been held already. 2416 */ 2417 if (fakename) 2418 kfree(fakename, M_TEMP); 2419 if (saved_dvp) 2420 vrele(saved_dvp); 2421 if (nch->ncp) 2422 return (0); 2423 return (EINVAL); 2424 } 2425 2426 /* 2427 * Go up the chain of parent directories until we find something 2428 * we can resolve into the namecache. This is very inefficient. 2429 */ 2430 static 2431 int 2432 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2433 struct vnode **saved_dvp) 2434 { 2435 struct nchandle nch; 2436 struct vnode *pvp; 2437 int error; 2438 static time_t last_fromdvp_report; 2439 char *fakename; 2440 2441 /* 2442 * Loop getting the parent directory vnode until we get something we 2443 * can resolve in the namecache. 
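 *
 * Each pass steps to the parent via vop_nlookupdotdot(), transfers the
 * working reference from dvp to pvp, and stops either when pvp already
 * has an entry on its v_namecache list or when pvp is the mount root
 * (which is then resolved with cache_resolve_mp()). On failure the
 * reference on the current dvp is released and the error is returned
 * to the caller.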
2444 */ 2445 vref(dvp); 2446 nch.mount = dvp->v_mount; 2447 nch.ncp = NULL; 2448 fakename = NULL; 2449 2450 for (;;) { 2451 if (fakename) { 2452 kfree(fakename, M_TEMP); 2453 fakename = NULL; 2454 } 2455 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2456 &fakename); 2457 if (error) { 2458 vrele(dvp); 2459 break; 2460 } 2461 vn_unlock(pvp); 2462 spin_lock_shared(&pvp->v_spin); 2463 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2464 _cache_hold(nch.ncp); 2465 spin_unlock_shared(&pvp->v_spin); 2466 vrele(pvp); 2467 break; 2468 } 2469 spin_unlock_shared(&pvp->v_spin); 2470 if (pvp->v_flag & VROOT) { 2471 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2472 error = cache_resolve_mp(nch.mount); 2473 _cache_unlock(nch.ncp); 2474 vrele(pvp); 2475 if (error) { 2476 _cache_drop(nch.ncp); 2477 nch.ncp = NULL; 2478 vrele(dvp); 2479 } 2480 break; 2481 } 2482 vrele(dvp); 2483 dvp = pvp; 2484 } 2485 if (error == 0) { 2486 if (last_fromdvp_report != time_uptime) { 2487 last_fromdvp_report = time_uptime; 2488 kprintf("Warning: extremely inefficient path " 2489 "resolution on %s\n", 2490 nch.ncp->nc_name); 2491 } 2492 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2493 2494 /* 2495 * Hopefully dvp now has a namecache record associated with 2496 * it. Leave it referenced to prevent the kernel from 2497 * recycling the vnode. Otherwise extremely long directory 2498 * paths could result in endless recycling. 2499 */ 2500 if (*saved_dvp) 2501 vrele(*saved_dvp); 2502 *saved_dvp = dvp; 2503 _cache_drop(nch.ncp); 2504 } 2505 if (fakename) 2506 kfree(fakename, M_TEMP); 2507 return (error); 2508 } 2509 2510 /* 2511 * Do an inefficient scan of the directory represented by ncp looking for 2512 * the directory vnode dvp. ncp must be held but not locked on entry and 2513 * will be held on return. dvp must be refd but not locked on entry and 2514 * will remain refd on return. 2515 * 2516 * Why do this at all? Well, due to its stateless nature the NFS server 2517 * converts file handles directly to vnodes without necessarily going through 2518 * the namecache ops that would otherwise create the namecache topology 2519 * leading to the vnode. We could either (1) Change the namecache algorithms 2520 * to allow disconnect namecache records that are re-merged opportunistically, 2521 * or (2) Make the NFS server backtrack and scan to recover a connected 2522 * namecache topology in order to then be able to issue new API lookups. 2523 * 2524 * It turns out that (1) is a huge mess. It takes a nice clean set of 2525 * namecache algorithms and introduces a lot of complication in every subsystem 2526 * that calls into the namecache to deal with the re-merge case, especially 2527 * since we are using the namecache to placehold negative lookups and the 2528 * vnode might not be immediately assigned. (2) is certainly far less 2529 * efficient then (1), but since we are only talking about directories here 2530 * (which are likely to remain cached), the case does not actually run all 2531 * that often and has the supreme advantage of not polluting the namecache 2532 * algorithms. 2533 * 2534 * If a fakename is supplied just construct a namecache entry using the 2535 * fake name. 
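 *
 * In outline, the scan below obtains va_fileid for dvp via
 * VOP_GETATTR(), acquires the directory vnode behind nch with
 * cache_vref(), and then VOP_READDIR()s that directory block by block
 * looking for a dirent whose d_ino matches va_fileid. A match is
 * instantiated with cache_nlookup() and, if still unresolved, is
 * connected to dvp with _cache_setvp(). If the directory is exhausted
 * without a match, ENOENT is returned.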
2536 */ 2537 static int 2538 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2539 struct vnode *dvp, char *fakename) 2540 { 2541 struct nlcomponent nlc; 2542 struct nchandle rncp; 2543 struct dirent *den; 2544 struct vnode *pvp; 2545 struct vattr vat; 2546 struct iovec iov; 2547 struct uio uio; 2548 int blksize; 2549 int eofflag; 2550 int bytes; 2551 char *rbuf; 2552 int error; 2553 2554 vat.va_blocksize = 0; 2555 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2556 return (error); 2557 cache_lock(nch); 2558 error = cache_vref(nch, cred, &pvp); 2559 cache_unlock(nch); 2560 if (error) 2561 return (error); 2562 if (ncvp_debug) { 2563 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2564 "vattr fileid = %lld\n", 2565 nch->ncp, nch->ncp->nc_name, 2566 vat.va_blocksize, 2567 (long long)vat.va_fileid); 2568 } 2569 2570 /* 2571 * Use the supplied fakename if not NULL. Fake names are typically 2572 * not in the actual filesystem hierarchy. This is used by HAMMER 2573 * to glue @@timestamp recursions together. 2574 */ 2575 if (fakename) { 2576 nlc.nlc_nameptr = fakename; 2577 nlc.nlc_namelen = strlen(fakename); 2578 rncp = cache_nlookup(nch, &nlc); 2579 goto done; 2580 } 2581 2582 if ((blksize = vat.va_blocksize) == 0) 2583 blksize = DEV_BSIZE; 2584 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2585 rncp.ncp = NULL; 2586 2587 eofflag = 0; 2588 uio.uio_offset = 0; 2589 again: 2590 iov.iov_base = rbuf; 2591 iov.iov_len = blksize; 2592 uio.uio_iov = &iov; 2593 uio.uio_iovcnt = 1; 2594 uio.uio_resid = blksize; 2595 uio.uio_segflg = UIO_SYSSPACE; 2596 uio.uio_rw = UIO_READ; 2597 uio.uio_td = curthread; 2598 2599 if (ncvp_debug >= 2) 2600 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2601 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2602 if (error == 0) { 2603 den = (struct dirent *)rbuf; 2604 bytes = blksize - uio.uio_resid; 2605 2606 while (bytes > 0) { 2607 if (ncvp_debug >= 2) { 2608 kprintf("cache_inefficient_scan: %*.*s\n", 2609 den->d_namlen, den->d_namlen, 2610 den->d_name); 2611 } 2612 if (den->d_type != DT_WHT && 2613 den->d_ino == vat.va_fileid) { 2614 if (ncvp_debug) { 2615 kprintf("cache_inefficient_scan: " 2616 "MATCHED inode %lld path %s/%*.*s\n", 2617 (long long)vat.va_fileid, 2618 nch->ncp->nc_name, 2619 den->d_namlen, den->d_namlen, 2620 den->d_name); 2621 } 2622 nlc.nlc_nameptr = den->d_name; 2623 nlc.nlc_namelen = den->d_namlen; 2624 rncp = cache_nlookup(nch, &nlc); 2625 KKASSERT(rncp.ncp != NULL); 2626 break; 2627 } 2628 bytes -= _DIRENT_DIRSIZ(den); 2629 den = _DIRENT_NEXT(den); 2630 } 2631 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2632 goto again; 2633 } 2634 kfree(rbuf, M_TEMP); 2635 done: 2636 vrele(pvp); 2637 if (rncp.ncp) { 2638 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2639 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2640 if (ncvp_debug >= 2) { 2641 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2642 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2643 } 2644 } else { 2645 if (ncvp_debug >= 2) { 2646 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2647 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2648 rncp.ncp->nc_vp); 2649 } 2650 } 2651 if (rncp.ncp->nc_vp == NULL) 2652 error = rncp.ncp->nc_error; 2653 /* 2654 * Release rncp after a successful nlookup. rncp was fully 2655 * referenced. 
2656 */ 2657 cache_put(&rncp); 2658 } else { 2659 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2660 dvp, nch->ncp->nc_name); 2661 error = ENOENT; 2662 } 2663 return (error); 2664 } 2665 2666 /* 2667 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2668 * state, which disassociates it from its vnode or ncneglist. 2669 * 2670 * Then, if there are no additional references to the ncp and no children, 2671 * the ncp is removed from the topology and destroyed. 2672 * 2673 * References and/or children may exist if the ncp is in the middle of the 2674 * topology, preventing the ncp from being destroyed. 2675 * 2676 * This function must be called with the ncp held and locked and will unlock 2677 * and drop it during zapping. 2678 * 2679 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2680 * This case can occur in the cache_drop() path. 2681 * 2682 * This function may returned a held (but NOT locked) parent node which the 2683 * caller must drop. We do this so _cache_drop() can loop, to avoid 2684 * blowing out the kernel stack. 2685 * 2686 * WARNING! For MPSAFE operation this routine must acquire up to three 2687 * spin locks to be able to safely test nc_refs. Lock order is 2688 * very important. 2689 * 2690 * hash spinlock if on hash list 2691 * parent spinlock if child of parent 2692 * (the ncp is unresolved so there is no vnode association) 2693 */ 2694 static struct namecache * 2695 cache_zap(struct namecache *ncp, int nonblock) 2696 { 2697 struct namecache *par; 2698 struct vnode *dropvp; 2699 struct nchash_head *nchpp; 2700 int refs; 2701 2702 /* 2703 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2704 */ 2705 _cache_setunresolved(ncp); 2706 2707 /* 2708 * Try to scrap the entry and possibly tail-recurse on its parent. 2709 * We only scrap unref'd (other then our ref) unresolved entries, 2710 * we do not scrap 'live' entries. 2711 * 2712 * Note that once the spinlocks are acquired if nc_refs == 1 no 2713 * other references are possible. If it isn't, however, we have 2714 * to decrement but also be sure to avoid a 1->0 transition. 2715 */ 2716 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2717 KKASSERT(ncp->nc_refs > 0); 2718 2719 /* 2720 * Acquire locks. Note that the parent can't go away while we hold 2721 * a child locked. 2722 */ 2723 nchpp = NULL; 2724 if ((par = ncp->nc_parent) != NULL) { 2725 if (nonblock) { 2726 for (;;) { 2727 if (_cache_lock_nonblock(par) == 0) 2728 break; 2729 refs = ncp->nc_refs; 2730 ncp->nc_flag |= NCF_DEFEREDZAP; 2731 ++numdefered; /* MP race ok */ 2732 if (atomic_cmpset_int(&ncp->nc_refs, 2733 refs, refs - 1)) { 2734 _cache_unlock(ncp); 2735 return(NULL); 2736 } 2737 cpu_pause(); 2738 } 2739 _cache_hold(par); 2740 } else { 2741 _cache_hold(par); 2742 _cache_lock(par); 2743 } 2744 nchpp = ncp->nc_head; 2745 spin_lock(&nchpp->spin); 2746 } 2747 2748 /* 2749 * At this point if we find refs == 1 it should not be possible for 2750 * anyone else to have access to the ncp. We are holding the only 2751 * possible access point left (nchpp) spin-locked. 2752 * 2753 * If someone other then us has a ref or we have children 2754 * we cannot zap the entry. The 1->0 transition and any 2755 * further list operation is protected by the spinlocks 2756 * we have acquired but other transitions are not. 
2757 */ 2758 for (;;) { 2759 refs = ncp->nc_refs; 2760 cpu_ccfence(); 2761 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list)) 2762 break; 2763 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) { 2764 if (par) { 2765 spin_unlock(&nchpp->spin); 2766 _cache_put(par); 2767 } 2768 _cache_unlock(ncp); 2769 return(NULL); 2770 } 2771 cpu_pause(); 2772 } 2773 2774 /* 2775 * We are the only ref and with the spinlocks held no further 2776 * refs can be acquired by others. 2777 * 2778 * Remove us from the hash list and parent list. We have to 2779 * drop a ref on the parent's vp if the parent's list becomes 2780 * empty. 2781 */ 2782 dropvp = NULL; 2783 if (par) { 2784 KKASSERT(nchpp == ncp->nc_head); 2785 LIST_REMOVE(ncp, nc_hash); 2786 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2787 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list)) 2788 dropvp = par->nc_vp; 2789 ncp->nc_head = NULL; 2790 ncp->nc_parent = NULL; 2791 spin_unlock(&nchpp->spin); 2792 _cache_unlock(par); 2793 } else { 2794 KKASSERT(ncp->nc_head == NULL); 2795 } 2796 2797 /* 2798 * ncp should not have picked up any refs. Physically 2799 * destroy the ncp. 2800 */ 2801 if (ncp->nc_refs != 1) { 2802 int save_refs = ncp->nc_refs; 2803 cpu_ccfence(); 2804 panic("cache_zap: %p bad refs %d (%d)\n", 2805 ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0)); 2806 } 2807 KKASSERT(ncp->nc_refs == 1); 2808 /* _cache_unlock(ncp) not required */ 2809 ncp->nc_refs = -1; /* safety */ 2810 if (ncp->nc_name) 2811 kfree(ncp->nc_name, M_VFSCACHE); 2812 kfree(ncp, M_VFSCACHE); 2813 2814 /* 2815 * Delayed drop (we had to release our spinlocks) 2816 * 2817 * The refed parent (if not NULL) must be dropped. The 2818 * caller is responsible for looping. 2819 */ 2820 if (dropvp) 2821 vdrop(dropvp); 2822 return(par); 2823 } 2824 2825 /* 2826 * Clean up dangling negative cache and defered-drop entries in the 2827 * namecache. 2828 * 2829 * This routine is called in the critical path and also called from 2830 * vnlru(). When called from vnlru we use a lower limit to try to 2831 * deal with the negative cache before the critical path has to start 2832 * dealing with it. 2833 */ 2834 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2835 2836 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2837 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2838 2839 void 2840 cache_hysteresis(int critpath) 2841 { 2842 int poslimit; 2843 int neglimit = maxvnodes / ncnegfactor; 2844 int xnumcache = numcache; 2845 2846 if (critpath == 0) 2847 neglimit = neglimit * 8 / 10; 2848 2849 /* 2850 * Don't cache too many negative hits. We use hysteresis to reduce 2851 * the impact on the critical path. 2852 */ 2853 switch(neg_cache_hysteresis_state[critpath]) { 2854 case CHI_LOW: 2855 if (numneg > MINNEG && numneg > neglimit) { 2856 if (critpath) 2857 _cache_cleanneg(ncnegflush); 2858 else 2859 _cache_cleanneg(ncnegflush + 2860 numneg - neglimit); 2861 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2862 } 2863 break; 2864 case CHI_HIGH: 2865 if (numneg > MINNEG * 9 / 10 && 2866 numneg * 9 / 10 > neglimit 2867 ) { 2868 if (critpath) 2869 _cache_cleanneg(ncnegflush); 2870 else 2871 _cache_cleanneg(ncnegflush + 2872 numneg * 9 / 10 - neglimit); 2873 } else { 2874 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2875 } 2876 break; 2877 } 2878 2879 /* 2880 * Don't cache too many positive hits. We use hysteresis to reduce 2881 * the impact on the critical path. 
2882 * 2883 * Excessive positive hits can accumulate due to large numbers of 2884 * hardlinks (the vnode cache will not prevent hl ncps from growing 2885 * into infinity). 2886 */ 2887 if ((poslimit = ncposlimit) == 0) 2888 poslimit = maxvnodes * 2; 2889 if (critpath == 0) 2890 poslimit = poslimit * 8 / 10; 2891 2892 switch(pos_cache_hysteresis_state[critpath]) { 2893 case CHI_LOW: 2894 if (xnumcache > poslimit && xnumcache > MINPOS) { 2895 if (critpath) 2896 _cache_cleanpos(ncposflush); 2897 else 2898 _cache_cleanpos(ncposflush + 2899 xnumcache - poslimit); 2900 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2901 } 2902 break; 2903 case CHI_HIGH: 2904 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2905 if (critpath) 2906 _cache_cleanpos(ncposflush); 2907 else 2908 _cache_cleanpos(ncposflush + 2909 xnumcache - poslimit * 5 / 6); 2910 } else { 2911 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2912 } 2913 break; 2914 } 2915 2916 /* 2917 * Clean out dangling defered-zap ncps which could not 2918 * be cleanly dropped if too many build up. Note 2919 * that numdefered is not an exact number as such ncps 2920 * can be reused and the counter is not handled in a MP 2921 * safe manner by design. 2922 */ 2923 if (numdefered > neglimit) { 2924 _cache_cleandefered(); 2925 } 2926 } 2927 2928 /* 2929 * NEW NAMECACHE LOOKUP API 2930 * 2931 * Lookup an entry in the namecache. The passed par_nch must be referenced 2932 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2933 * is ALWAYS returned, eve if the supplied component is illegal. 2934 * 2935 * The resulting namecache entry should be returned to the system with 2936 * cache_put() or cache_unlock() + cache_drop(). 2937 * 2938 * namecache locks are recursive but care must be taken to avoid lock order 2939 * reversals (hence why the passed par_nch must be unlocked). Locking 2940 * rules are to order for parent traversals, not for child traversals. 2941 * 2942 * Nobody else will be able to manipulate the associated namespace (e.g. 2943 * create, delete, rename, rename-target) until the caller unlocks the 2944 * entry. 2945 * 2946 * The returned entry will be in one of three states: positive hit (non-null 2947 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2948 * Unresolved entries must be resolved through the filesystem to associate the 2949 * vnode and/or determine whether a positive or negative hit has occured. 2950 * 2951 * It is not necessary to lock a directory in order to lock namespace under 2952 * that directory. In fact, it is explicitly not allowed to do that. A 2953 * directory is typically only locked when being created, renamed, or 2954 * destroyed. 2955 * 2956 * The directory (par) may be unresolved, in which case any returned child 2957 * will likely also be marked unresolved. Likely but not guarenteed. Since 2958 * the filesystem lookup requires a resolved directory vnode the caller is 2959 * responsible for resolving the namecache chain top-down. This API 2960 * specifically allows whole chains to be created in an unresolved state. 
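 *
 * Minimal usage sketch (illustrative only; error handling and the
 * surrounding nlookup state are elided):
 *
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	else
 *		error = nch.ncp->nc_error;
 *	...
 *	cache_put(&nch);
 *
 * par_nch is referenced and unlocked, nlc names a single path
 * component, and the returned nch is locked and referenced as
 * described above.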
2961 */ 2962 struct nchandle 2963 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2964 { 2965 struct nchandle nch; 2966 struct namecache *ncp; 2967 struct namecache *new_ncp; 2968 struct nchash_head *nchpp; 2969 struct mount *mp; 2970 u_int32_t hash; 2971 globaldata_t gd; 2972 int par_locked; 2973 2974 gd = mycpu; 2975 mp = par_nch->mount; 2976 par_locked = 0; 2977 2978 /* 2979 * This is a good time to call it, no ncp's are locked by 2980 * the caller or us. 2981 */ 2982 cache_hysteresis(1); 2983 2984 /* 2985 * Try to locate an existing entry 2986 */ 2987 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2988 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2989 new_ncp = NULL; 2990 nchpp = NCHHASH(hash); 2991 restart: 2992 if (new_ncp) 2993 spin_lock(&nchpp->spin); 2994 else 2995 spin_lock_shared(&nchpp->spin); 2996 2997 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 2998 /* 2999 * Break out if we find a matching entry. Note that 3000 * UNRESOLVED entries may match, but DESTROYED entries 3001 * do not. 3002 */ 3003 if (ncp->nc_parent == par_nch->ncp && 3004 ncp->nc_nlen == nlc->nlc_namelen && 3005 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3006 (ncp->nc_flag & NCF_DESTROYED) == 0 3007 ) { 3008 _cache_hold(ncp); 3009 if (new_ncp) 3010 spin_unlock(&nchpp->spin); 3011 else 3012 spin_unlock_shared(&nchpp->spin); 3013 if (par_locked) { 3014 _cache_unlock(par_nch->ncp); 3015 par_locked = 0; 3016 } 3017 if (_cache_lock_special(ncp) == 0) { 3018 /* 3019 * Successfully locked but we must re-test 3020 * conditions that might have changed since 3021 * we did not have the lock before. 3022 */ 3023 if (ncp->nc_parent != par_nch->ncp || 3024 ncp->nc_nlen != nlc->nlc_namelen || 3025 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3026 ncp->nc_nlen) || 3027 (ncp->nc_flag & NCF_DESTROYED)) { 3028 _cache_put(ncp); 3029 goto restart; 3030 } 3031 _cache_auto_unresolve(mp, ncp); 3032 if (new_ncp) 3033 _cache_free(new_ncp); 3034 goto found; 3035 } 3036 _cache_get(ncp); /* cycle the lock to block */ 3037 _cache_put(ncp); 3038 _cache_drop(ncp); 3039 goto restart; 3040 } 3041 } 3042 3043 /* 3044 * We failed to locate an entry, create a new entry and add it to 3045 * the cache. The parent ncp must also be locked so we 3046 * can link into it. 3047 * 3048 * We have to relookup after possibly blocking in kmalloc or 3049 * when locking par_nch. 3050 * 3051 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3052 * mount case, in which case nc_name will be NULL. 3053 */ 3054 if (new_ncp == NULL) { 3055 spin_unlock_shared(&nchpp->spin); 3056 new_ncp = cache_alloc(nlc->nlc_namelen); 3057 if (nlc->nlc_namelen) { 3058 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3059 nlc->nlc_namelen); 3060 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3061 } 3062 goto restart; 3063 } 3064 3065 /* 3066 * NOTE! The spinlock is held exclusively here because new_ncp 3067 * is non-NULL. 3068 */ 3069 if (par_locked == 0) { 3070 spin_unlock(&nchpp->spin); 3071 _cache_lock(par_nch->ncp); 3072 par_locked = 1; 3073 goto restart; 3074 } 3075 3076 /* 3077 * WARNING! We still hold the spinlock. We have to set the hash 3078 * table entry atomically. 
3079 */ 3080 ncp = new_ncp; 3081 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3082 spin_unlock(&nchpp->spin); 3083 _cache_unlock(par_nch->ncp); 3084 /* par_locked = 0 - not used */ 3085 found: 3086 /* 3087 * stats and namecache size management 3088 */ 3089 if (ncp->nc_flag & NCF_UNRESOLVED) 3090 ++gd->gd_nchstats->ncs_miss; 3091 else if (ncp->nc_vp) 3092 ++gd->gd_nchstats->ncs_goodhits; 3093 else 3094 ++gd->gd_nchstats->ncs_neghits; 3095 nch.mount = mp; 3096 nch.ncp = ncp; 3097 _cache_mntref(nch.mount); 3098 3099 return(nch); 3100 } 3101 3102 /* 3103 * Attempt to lookup a namecache entry and return with a shared namecache 3104 * lock. 3105 */ 3106 int 3107 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, 3108 int excl, struct nchandle *res_nch) 3109 { 3110 struct namecache *ncp; 3111 struct nchash_head *nchpp; 3112 struct mount *mp; 3113 u_int32_t hash; 3114 globaldata_t gd; 3115 3116 /* 3117 * If exclusive requested or shared namecache locks are disabled, 3118 * return failure. 3119 */ 3120 if (ncp_shared_lock_disable || excl) 3121 return(EWOULDBLOCK); 3122 3123 gd = mycpu; 3124 mp = par_nch->mount; 3125 3126 /* 3127 * This is a good time to call it, no ncp's are locked by 3128 * the caller or us. 3129 */ 3130 cache_hysteresis(1); 3131 3132 /* 3133 * Try to locate an existing entry 3134 */ 3135 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3136 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3137 nchpp = NCHHASH(hash); 3138 3139 spin_lock_shared(&nchpp->spin); 3140 3141 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 3142 /* 3143 * Break out if we find a matching entry. Note that 3144 * UNRESOLVED entries may match, but DESTROYED entries 3145 * do not. 3146 */ 3147 if (ncp->nc_parent == par_nch->ncp && 3148 ncp->nc_nlen == nlc->nlc_namelen && 3149 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3150 (ncp->nc_flag & NCF_DESTROYED) == 0 3151 ) { 3152 _cache_hold(ncp); 3153 spin_unlock_shared(&nchpp->spin); 3154 if (_cache_lock_shared_special(ncp) == 0) { 3155 if (ncp->nc_parent == par_nch->ncp && 3156 ncp->nc_nlen == nlc->nlc_namelen && 3157 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3158 ncp->nc_nlen) == 0 && 3159 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3160 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3161 _cache_auto_unresolve_test(mp, ncp) == 0) { 3162 goto found; 3163 } 3164 _cache_unlock(ncp); 3165 } 3166 _cache_drop(ncp); 3167 spin_lock_shared(&nchpp->spin); 3168 break; 3169 } 3170 } 3171 3172 /* 3173 * Failure 3174 */ 3175 spin_unlock_shared(&nchpp->spin); 3176 return(EWOULDBLOCK); 3177 3178 /* 3179 * Success 3180 * 3181 * Note that nc_error might be non-zero (e.g ENOENT). 3182 */ 3183 found: 3184 res_nch->mount = mp; 3185 res_nch->ncp = ncp; 3186 ++gd->gd_nchstats->ncs_goodhits; 3187 _cache_mntref(res_nch->mount); 3188 3189 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3190 return(ncp->nc_error); 3191 } 3192 3193 /* 3194 * This is a non-blocking verison of cache_nlookup() used by 3195 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3196 * will return nch.ncp == NULL in that case. 
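 *
 * Illustrative caller pattern (a sketch, not taken from this file):
 * the caller must treat a NULL ncp as a soft failure and fall back to
 * the blocking path or simply skip the optimization:
 *
 *	nch = cache_nlookup_nonblock(&par_nch, &nlc);
 *	if (nch.ncp == NULL)
 *		return;
 *	...
 *	cache_put(&nch);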
3197 */ 3198 struct nchandle 3199 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3200 { 3201 struct nchandle nch; 3202 struct namecache *ncp; 3203 struct namecache *new_ncp; 3204 struct nchash_head *nchpp; 3205 struct mount *mp; 3206 u_int32_t hash; 3207 globaldata_t gd; 3208 int par_locked; 3209 3210 gd = mycpu; 3211 mp = par_nch->mount; 3212 par_locked = 0; 3213 3214 /* 3215 * Try to locate an existing entry 3216 */ 3217 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3218 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3219 new_ncp = NULL; 3220 nchpp = NCHHASH(hash); 3221 restart: 3222 spin_lock(&nchpp->spin); 3223 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 3224 /* 3225 * Break out if we find a matching entry. Note that 3226 * UNRESOLVED entries may match, but DESTROYED entries 3227 * do not. 3228 */ 3229 if (ncp->nc_parent == par_nch->ncp && 3230 ncp->nc_nlen == nlc->nlc_namelen && 3231 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3232 (ncp->nc_flag & NCF_DESTROYED) == 0 3233 ) { 3234 _cache_hold(ncp); 3235 spin_unlock(&nchpp->spin); 3236 if (par_locked) { 3237 _cache_unlock(par_nch->ncp); 3238 par_locked = 0; 3239 } 3240 if (_cache_lock_special(ncp) == 0) { 3241 if (ncp->nc_parent != par_nch->ncp || 3242 ncp->nc_nlen != nlc->nlc_namelen || 3243 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3244 (ncp->nc_flag & NCF_DESTROYED)) { 3245 kprintf("cache_lookup_nonblock: " 3246 "ncp-race %p %*.*s\n", 3247 ncp, 3248 nlc->nlc_namelen, 3249 nlc->nlc_namelen, 3250 nlc->nlc_nameptr); 3251 _cache_unlock(ncp); 3252 _cache_drop(ncp); 3253 goto failed; 3254 } 3255 _cache_auto_unresolve(mp, ncp); 3256 if (new_ncp) { 3257 _cache_free(new_ncp); 3258 new_ncp = NULL; 3259 } 3260 goto found; 3261 } 3262 _cache_drop(ncp); 3263 goto failed; 3264 } 3265 } 3266 3267 /* 3268 * We failed to locate an entry, create a new entry and add it to 3269 * the cache. The parent ncp must also be locked so we 3270 * can link into it. 3271 * 3272 * We have to relookup after possibly blocking in kmalloc or 3273 * when locking par_nch. 3274 * 3275 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3276 * mount case, in which case nc_name will be NULL. 3277 */ 3278 if (new_ncp == NULL) { 3279 spin_unlock(&nchpp->spin); 3280 new_ncp = cache_alloc(nlc->nlc_namelen); 3281 if (nlc->nlc_namelen) { 3282 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3283 nlc->nlc_namelen); 3284 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3285 } 3286 goto restart; 3287 } 3288 if (par_locked == 0) { 3289 spin_unlock(&nchpp->spin); 3290 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3291 par_locked = 1; 3292 goto restart; 3293 } 3294 goto failed; 3295 } 3296 3297 /* 3298 * WARNING! We still hold the spinlock. We have to set the hash 3299 * table entry atomically. 
3300 */ 3301 ncp = new_ncp; 3302 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3303 spin_unlock(&nchpp->spin); 3304 _cache_unlock(par_nch->ncp); 3305 /* par_locked = 0 - not used */ 3306 found: 3307 /* 3308 * stats and namecache size management 3309 */ 3310 if (ncp->nc_flag & NCF_UNRESOLVED) 3311 ++gd->gd_nchstats->ncs_miss; 3312 else if (ncp->nc_vp) 3313 ++gd->gd_nchstats->ncs_goodhits; 3314 else 3315 ++gd->gd_nchstats->ncs_neghits; 3316 nch.mount = mp; 3317 nch.ncp = ncp; 3318 _cache_mntref(nch.mount); 3319 3320 return(nch); 3321 failed: 3322 if (new_ncp) { 3323 _cache_free(new_ncp); 3324 new_ncp = NULL; 3325 } 3326 nch.mount = NULL; 3327 nch.ncp = NULL; 3328 return(nch); 3329 } 3330 3331 /* 3332 * The namecache entry is marked as being used as a mount point. 3333 * Locate the mount if it is visible to the caller. The DragonFly 3334 * mount system allows arbitrary loops in the topology and disentangles 3335 * those loops by matching against (mp, ncp) rather than just (ncp). 3336 * This means any given ncp can dive any number of mounts, depending 3337 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3338 * 3339 * We use a very simple frontend cache to reduce SMP conflicts, 3340 * which we have to do because the mountlist scan needs an exclusive 3341 * lock around its ripout info list. Not to mention that there might 3342 * be a lot of mounts. 3343 */ 3344 struct findmount_info { 3345 struct mount *result; 3346 struct mount *nch_mount; 3347 struct namecache *nch_ncp; 3348 }; 3349 3350 static 3351 struct ncmount_cache * 3352 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3353 { 3354 int hash; 3355 3356 hash = ((int)(intptr_t)mp / sizeof(*mp)) ^ 3357 ((int)(intptr_t)ncp / sizeof(*ncp)); 3358 hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE; 3359 return (&ncmount_cache[hash]); 3360 } 3361 3362 static 3363 int 3364 cache_findmount_callback(struct mount *mp, void *data) 3365 { 3366 struct findmount_info *info = data; 3367 3368 /* 3369 * Check the mount's mounted-on point against the passed nch. 3370 */ 3371 if (mp->mnt_ncmounton.mount == info->nch_mount && 3372 mp->mnt_ncmounton.ncp == info->nch_ncp 3373 ) { 3374 info->result = mp; 3375 _cache_mntref(mp); 3376 return(-1); 3377 } 3378 return(0); 3379 } 3380 3381 struct mount * 3382 cache_findmount(struct nchandle *nch) 3383 { 3384 struct findmount_info info; 3385 struct ncmount_cache *ncc; 3386 struct mount *mp; 3387 3388 /* 3389 * Fast 3390 */ 3391 if (ncmount_cache_enable == 0) { 3392 ncc = NULL; 3393 goto skip; 3394 } 3395 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3396 if (ncc->ncp == nch->ncp) { 3397 spin_lock_shared(&ncc->spin); 3398 if (ncc->isneg == 0 && 3399 ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) { 3400 if (mp->mnt_ncmounton.mount == nch->mount && 3401 mp->mnt_ncmounton.ncp == nch->ncp) { 3402 /* 3403 * Cache hit (positive) 3404 */ 3405 _cache_mntref(mp); 3406 spin_unlock_shared(&ncc->spin); 3407 ++ncmount_cache_hit; 3408 return(mp); 3409 } 3410 /* else cache miss */ 3411 } 3412 if (ncc->isneg && 3413 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3414 /* 3415 * Cache hit (negative) 3416 */ 3417 spin_unlock_shared(&ncc->spin); 3418 ++ncmount_cache_hit; 3419 return(NULL); 3420 } 3421 spin_unlock_shared(&ncc->spin); 3422 } 3423 skip: 3424 3425 /* 3426 * Slow 3427 */ 3428 info.result = NULL; 3429 info.nch_mount = nch->mount; 3430 info.nch_ncp = nch->ncp; 3431 mountlist_scan(cache_findmount_callback, &info, 3432 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 3433 3434 /* 3435 * Cache the result. 
3436 * 3437 * Negative lookups: We cache the originating {ncp,mp}. (mp) is 3438 * only used for pointer comparisons and is not 3439 * referenced (otherwise there would be dangling 3440 * refs). 3441 * 3442 * Positive lookups: We cache the originating {ncp} and the target 3443 * (mp). (mp) is referenced. 3444 * 3445 * Indeterminant: If the match is undergoing an unmount we do 3446 * not cache it to avoid racing cache_unmounting(), 3447 * but still return the match. 3448 */ 3449 if (ncc) { 3450 spin_lock(&ncc->spin); 3451 if (info.result == NULL) { 3452 if (ncc->isneg == 0 && ncc->mp) 3453 _cache_mntrel(ncc->mp); 3454 ncc->ncp = nch->ncp; 3455 ncc->mp = nch->mount; 3456 ncc->isneg = 1; 3457 spin_unlock(&ncc->spin); 3458 ++ncmount_cache_overwrite; 3459 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) { 3460 if (ncc->isneg == 0 && ncc->mp) 3461 _cache_mntrel(ncc->mp); 3462 _cache_mntref(info.result); 3463 ncc->ncp = nch->ncp; 3464 ncc->mp = info.result; 3465 ncc->isneg = 0; 3466 spin_unlock(&ncc->spin); 3467 ++ncmount_cache_overwrite; 3468 } else { 3469 spin_unlock(&ncc->spin); 3470 } 3471 ++ncmount_cache_miss; 3472 } 3473 return(info.result); 3474 } 3475 3476 void 3477 cache_dropmount(struct mount *mp) 3478 { 3479 _cache_mntrel(mp); 3480 } 3481 3482 void 3483 cache_ismounting(struct mount *mp) 3484 { 3485 struct nchandle *nch = &mp->mnt_ncmounton; 3486 struct ncmount_cache *ncc; 3487 3488 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3489 if (ncc->isneg && 3490 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3491 spin_lock(&ncc->spin); 3492 if (ncc->isneg && 3493 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3494 ncc->ncp = NULL; 3495 ncc->mp = NULL; 3496 } 3497 spin_unlock(&ncc->spin); 3498 } 3499 } 3500 3501 void 3502 cache_unmounting(struct mount *mp) 3503 { 3504 struct nchandle *nch = &mp->mnt_ncmounton; 3505 struct ncmount_cache *ncc; 3506 3507 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3508 if (ncc->isneg == 0 && 3509 ncc->ncp == nch->ncp && ncc->mp == mp) { 3510 spin_lock(&ncc->spin); 3511 if (ncc->isneg == 0 && 3512 ncc->ncp == nch->ncp && ncc->mp == mp) { 3513 _cache_mntrel(mp); 3514 ncc->ncp = NULL; 3515 ncc->mp = NULL; 3516 } 3517 spin_unlock(&ncc->spin); 3518 } 3519 } 3520 3521 /* 3522 * Resolve an unresolved namecache entry, generally by looking it up. 3523 * The passed ncp must be locked and refd. 3524 * 3525 * Theoretically since a vnode cannot be recycled while held, and since 3526 * the nc_parent chain holds its vnode as long as children exist, the 3527 * direct parent of the cache entry we are trying to resolve should 3528 * have a valid vnode. If not then generate an error that we can 3529 * determine is related to a resolver bug. 3530 * 3531 * However, if a vnode was in the middle of a recyclement when the NCP 3532 * got locked, ncp->nc_vp might point to a vnode that is about to become 3533 * invalid. cache_resolve() handles this case by unresolving the entry 3534 * and then re-resolving it. 3535 * 3536 * Note that successful resolution does not necessarily return an error 3537 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3538 * will be returned. 
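 *
 * Illustrative interpretation of the result (a sketch, not taken from
 * this file), assuming nch.ncp is already locked exclusively and
 * referenced:
 *
 *	error = cache_resolve(&nch, cred);
 *	if (error == 0)
 *		vp = nch.ncp->nc_vp;
 *	else
 *		vp = NULL;
 *
 * A zero return indicates the entry resolved to a usable vnode, ENOENT
 * indicates a resolved negative hit, and any other value is an error
 * propagated from the filesystem's resolver.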
3539 */ 3540 int 3541 cache_resolve(struct nchandle *nch, struct ucred *cred) 3542 { 3543 struct namecache *par_tmp; 3544 struct namecache *par; 3545 struct namecache *ncp; 3546 struct nchandle nctmp; 3547 struct mount *mp; 3548 struct vnode *dvp; 3549 int error; 3550 3551 ncp = nch->ncp; 3552 mp = nch->mount; 3553 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3554 restart: 3555 /* 3556 * If the ncp is already resolved we have nothing to do. However, 3557 * we do want to guarentee that a usable vnode is returned when 3558 * a vnode is present, so make sure it hasn't been reclaimed. 3559 */ 3560 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3561 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3562 _cache_setunresolved(ncp); 3563 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3564 return (ncp->nc_error); 3565 } 3566 3567 /* 3568 * If the ncp was destroyed it will never resolve again. This 3569 * can basically only happen when someone is chdir'd into an 3570 * empty directory which is then rmdir'd. We want to catch this 3571 * here and not dive the VFS because the VFS might actually 3572 * have a way to re-resolve the disconnected ncp, which will 3573 * result in inconsistencies in the cdir/nch for proc->p_fd. 3574 */ 3575 if (ncp->nc_flag & NCF_DESTROYED) 3576 return(EINVAL); 3577 3578 /* 3579 * Mount points need special handling because the parent does not 3580 * belong to the same filesystem as the ncp. 3581 */ 3582 if (ncp == mp->mnt_ncmountpt.ncp) 3583 return (cache_resolve_mp(mp)); 3584 3585 /* 3586 * We expect an unbroken chain of ncps to at least the mount point, 3587 * and even all the way to root (but this code doesn't have to go 3588 * past the mount point). 3589 */ 3590 if (ncp->nc_parent == NULL) { 3591 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 3592 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3593 ncp->nc_error = EXDEV; 3594 return(ncp->nc_error); 3595 } 3596 3597 /* 3598 * The vp's of the parent directories in the chain are held via vhold() 3599 * due to the existance of the child, and should not disappear. 3600 * However, there are cases where they can disappear: 3601 * 3602 * - due to filesystem I/O errors. 3603 * - due to NFS being stupid about tracking the namespace and 3604 * destroys the namespace for entire directories quite often. 3605 * - due to forced unmounts. 3606 * - due to an rmdir (parent will be marked DESTROYED) 3607 * 3608 * When this occurs we have to track the chain backwards and resolve 3609 * it, looping until the resolver catches up to the current node. We 3610 * could recurse here but we might run ourselves out of kernel stack 3611 * so we do it in a more painful manner. This situation really should 3612 * not occur all that often, or if it does not have to go back too 3613 * many nodes to resolve the ncp. 3614 */ 3615 while ((dvp = cache_dvpref(ncp)) == NULL) { 3616 /* 3617 * This case can occur if a process is CD'd into a 3618 * directory which is then rmdir'd. If the parent is marked 3619 * destroyed there is no point trying to resolve it. 
3620 */ 3621 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 3622 return(ENOENT); 3623 par = ncp->nc_parent; 3624 _cache_hold(par); 3625 _cache_lock(par); 3626 while ((par_tmp = par->nc_parent) != NULL && 3627 par_tmp->nc_vp == NULL) { 3628 _cache_hold(par_tmp); 3629 _cache_lock(par_tmp); 3630 _cache_put(par); 3631 par = par_tmp; 3632 } 3633 if (par->nc_parent == NULL) { 3634 kprintf("EXDEV case 2 %*.*s\n", 3635 par->nc_nlen, par->nc_nlen, par->nc_name); 3636 _cache_put(par); 3637 return (EXDEV); 3638 } 3639 /* 3640 * The parent is not set in stone, ref and lock it to prevent 3641 * it from disappearing. Also note that due to renames it 3642 * is possible for our ncp to move and for par to no longer 3643 * be one of its parents. We resolve it anyway, the loop 3644 * will handle any moves. 3645 */ 3646 _cache_get(par); /* additional hold/lock */ 3647 _cache_put(par); /* from earlier hold/lock */ 3648 if (par == nch->mount->mnt_ncmountpt.ncp) { 3649 cache_resolve_mp(nch->mount); 3650 } else if ((dvp = cache_dvpref(par)) == NULL) { 3651 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name); 3652 _cache_put(par); 3653 continue; 3654 } else { 3655 if (par->nc_flag & NCF_UNRESOLVED) { 3656 nctmp.mount = mp; 3657 nctmp.ncp = par; 3658 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3659 } 3660 vrele(dvp); 3661 } 3662 if ((error = par->nc_error) != 0) { 3663 if (par->nc_error != EAGAIN) { 3664 kprintf("EXDEV case 3 %*.*s error %d\n", 3665 par->nc_nlen, par->nc_nlen, par->nc_name, 3666 par->nc_error); 3667 _cache_put(par); 3668 return(error); 3669 } 3670 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3671 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3672 } 3673 _cache_put(par); 3674 /* loop */ 3675 } 3676 3677 /* 3678 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3679 * ncp's and reattach them. If this occurs the original ncp is marked 3680 * EAGAIN to force a relookup. 3681 * 3682 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3683 * ncp must already be resolved. 3684 */ 3685 if (dvp) { 3686 nctmp.mount = mp; 3687 nctmp.ncp = ncp; 3688 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3689 vrele(dvp); 3690 } else { 3691 ncp->nc_error = EPERM; 3692 } 3693 if (ncp->nc_error == EAGAIN) { 3694 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3695 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3696 goto restart; 3697 } 3698 return(ncp->nc_error); 3699 } 3700 3701 /* 3702 * Resolve the ncp associated with a mount point. Such ncp's almost always 3703 * remain resolved and this routine is rarely called. NFS MPs tends to force 3704 * re-resolution more often due to its mac-truck-smash-the-namecache 3705 * method of tracking namespace changes. 3706 * 3707 * The semantics for this call is that the passed ncp must be locked on 3708 * entry and will be locked on return. However, if we actually have to 3709 * resolve the mount point we temporarily unlock the entry in order to 3710 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3711 * the unlock we have to recheck the flags after we relock. 3712 */ 3713 static int 3714 cache_resolve_mp(struct mount *mp) 3715 { 3716 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3717 struct vnode *vp; 3718 int error; 3719 3720 KKASSERT(mp != NULL); 3721 3722 /* 3723 * If the ncp is already resolved we have nothing to do. 
However, 3724 * we do want to guarentee that a usable vnode is returned when 3725 * a vnode is present, so make sure it hasn't been reclaimed. 3726 */ 3727 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3728 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3729 _cache_setunresolved(ncp); 3730 } 3731 3732 if (ncp->nc_flag & NCF_UNRESOLVED) { 3733 _cache_unlock(ncp); 3734 while (vfs_busy(mp, 0)) 3735 ; 3736 error = VFS_ROOT(mp, &vp); 3737 _cache_lock(ncp); 3738 3739 /* 3740 * recheck the ncp state after relocking. 3741 */ 3742 if (ncp->nc_flag & NCF_UNRESOLVED) { 3743 ncp->nc_error = error; 3744 if (error == 0) { 3745 _cache_setvp(mp, ncp, vp); 3746 vput(vp); 3747 } else { 3748 kprintf("[diagnostic] cache_resolve_mp: failed" 3749 " to resolve mount %p err=%d ncp=%p\n", 3750 mp, error, ncp); 3751 _cache_setvp(mp, ncp, NULL); 3752 } 3753 } else if (error == 0) { 3754 vput(vp); 3755 } 3756 vfs_unbusy(mp); 3757 } 3758 return(ncp->nc_error); 3759 } 3760 3761 /* 3762 * Clean out negative cache entries when too many have accumulated. 3763 */ 3764 static void 3765 _cache_cleanneg(int count) 3766 { 3767 struct namecache *ncp; 3768 3769 /* 3770 * Attempt to clean out the specified number of negative cache 3771 * entries. 3772 */ 3773 while (count) { 3774 spin_lock(&ncspin); 3775 ncp = TAILQ_FIRST(&ncneglist); 3776 if (ncp == NULL) { 3777 spin_unlock(&ncspin); 3778 break; 3779 } 3780 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode); 3781 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode); 3782 _cache_hold(ncp); 3783 spin_unlock(&ncspin); 3784 3785 /* 3786 * This can race, so we must re-check that the ncp 3787 * is on the ncneglist after successfully locking it. 3788 */ 3789 if (_cache_lock_special(ncp) == 0) { 3790 if (ncp->nc_vp == NULL && 3791 (ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3792 ncp = cache_zap(ncp, 1); 3793 if (ncp) 3794 _cache_drop(ncp); 3795 } else { 3796 kprintf("cache_cleanneg: race avoided\n"); 3797 _cache_unlock(ncp); 3798 } 3799 } else { 3800 _cache_drop(ncp); 3801 } 3802 --count; 3803 } 3804 } 3805 3806 /* 3807 * Clean out positive cache entries when too many have accumulated. 3808 */ 3809 static void 3810 _cache_cleanpos(int count) 3811 { 3812 static volatile int rover; 3813 struct nchash_head *nchpp; 3814 struct namecache *ncp; 3815 int rover_copy; 3816 3817 /* 3818 * Attempt to clean out the specified number of negative cache 3819 * entries. 3820 */ 3821 while (count) { 3822 rover_copy = ++rover; /* MPSAFEENOUGH */ 3823 cpu_ccfence(); 3824 nchpp = NCHHASH(rover_copy); 3825 3826 spin_lock_shared(&nchpp->spin); 3827 ncp = LIST_FIRST(&nchpp->list); 3828 while (ncp && (ncp->nc_flag & NCF_DESTROYED)) 3829 ncp = LIST_NEXT(ncp, nc_hash); 3830 if (ncp) 3831 _cache_hold(ncp); 3832 spin_unlock_shared(&nchpp->spin); 3833 3834 if (ncp) { 3835 if (_cache_lock_special(ncp) == 0) { 3836 ncp = cache_zap(ncp, 1); 3837 if (ncp) 3838 _cache_drop(ncp); 3839 } else { 3840 _cache_drop(ncp); 3841 } 3842 } 3843 --count; 3844 } 3845 } 3846 3847 /* 3848 * This is a kitchen sink function to clean out ncps which we 3849 * tried to zap from cache_drop() but failed because we were 3850 * unable to acquire the parent lock. 3851 * 3852 * Such entries can also be removed via cache_inval_vp(), such 3853 * as when unmounting. 
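 *
 * The scan below inserts a dummy placeholder entry (marked
 * NCF_DESTROYED so lookups ignore it) into each hash chain and walks
 * it forward through the chain. This allows the chain's spinlock to be
 * dropped while each real NCF_DEFEREDZAP entry is locked and cleared,
 * without losing the iteration position.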
3854 */ 3855 static void 3856 _cache_cleandefered(void) 3857 { 3858 struct nchash_head *nchpp; 3859 struct namecache *ncp; 3860 struct namecache dummy; 3861 int i; 3862 3863 numdefered = 0; 3864 bzero(&dummy, sizeof(dummy)); 3865 dummy.nc_flag = NCF_DESTROYED; 3866 dummy.nc_refs = 1; 3867 3868 for (i = 0; i <= nchash; ++i) { 3869 nchpp = &nchashtbl[i]; 3870 3871 spin_lock(&nchpp->spin); 3872 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 3873 ncp = &dummy; 3874 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) { 3875 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 3876 continue; 3877 LIST_REMOVE(&dummy, nc_hash); 3878 LIST_INSERT_AFTER(ncp, &dummy, nc_hash); 3879 _cache_hold(ncp); 3880 spin_unlock(&nchpp->spin); 3881 if (_cache_lock_nonblock(ncp) == 0) { 3882 ncp->nc_flag &= ~NCF_DEFEREDZAP; 3883 _cache_unlock(ncp); 3884 } 3885 _cache_drop(ncp); 3886 spin_lock(&nchpp->spin); 3887 ncp = &dummy; 3888 } 3889 LIST_REMOVE(&dummy, nc_hash); 3890 spin_unlock(&nchpp->spin); 3891 } 3892 } 3893 3894 /* 3895 * Name cache initialization, from vfsinit() when we are booting 3896 */ 3897 void 3898 nchinit(void) 3899 { 3900 int i; 3901 globaldata_t gd; 3902 3903 /* 3904 * Initialise per-cpu namecache effectiveness statistics. 3905 */ 3906 for (i = 0; i < ncpus; ++i) { 3907 gd = globaldata_find(i); 3908 gd->gd_nchstats = &nchstats[i]; 3909 } 3910 3911 /* 3912 * Create a generous namecache hash table 3913 */ 3914 TAILQ_INIT(&ncneglist); 3915 spin_init(&ncspin, "nchinit"); 3916 nchashtbl = hashinit_ext(vfs_inodehashsize(), 3917 sizeof(struct nchash_head), 3918 M_VFSCACHE, &nchash); 3919 for (i = 0; i <= (int)nchash; ++i) { 3920 LIST_INIT(&nchashtbl[i].list); 3921 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 3922 } 3923 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 3924 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 3925 nclockwarn = 5 * hz; 3926 } 3927 3928 /* 3929 * Called from start_init() to bootstrap the root filesystem. Returns 3930 * a referenced, unlocked namecache record. 3931 */ 3932 void 3933 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 3934 { 3935 nch->ncp = cache_alloc(0); 3936 nch->mount = mp; 3937 _cache_mntref(mp); 3938 if (vp) 3939 _cache_setvp(nch->mount, nch->ncp, vp); 3940 } 3941 3942 /* 3943 * vfs_cache_setroot() 3944 * 3945 * Create an association between the root of our namecache and 3946 * the root vnode. This routine may be called several times during 3947 * booting. 3948 * 3949 * If the caller intends to save the returned namecache pointer somewhere 3950 * it must cache_hold() it. 3951 */ 3952 void 3953 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 3954 { 3955 struct vnode *ovp; 3956 struct nchandle onch; 3957 3958 ovp = rootvnode; 3959 onch = rootnch; 3960 rootvnode = nvp; 3961 if (nch) 3962 rootnch = *nch; 3963 else 3964 cache_zero(&rootnch); 3965 if (ovp) 3966 vrele(ovp); 3967 if (onch.ncp) 3968 cache_drop(&onch); 3969 } 3970 3971 /* 3972 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 3973 * topology and is being removed as quickly as possible. The new VOP_N*() 3974 * API calls are required to make specific adjustments using the supplied 3975 * ncp pointers rather then just bogusly purging random vnodes. 3976 * 3977 * Invalidate all namecache entries to a particular vnode as well as 3978 * any direct children of that vnode in the namecache. This is a 3979 * 'catch all' purge used by filesystems that do not know any better. 
/*
 * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
 * topology and is being removed as quickly as possible.  The new VOP_N*()
 * API calls are required to make specific adjustments using the supplied
 * ncp pointers rather than just bogusly purging random vnodes.
 *
 * Invalidate all namecache entries to a particular vnode as well as
 * any direct children of that vnode in the namecache.  This is a
 * 'catch all' purge used by filesystems that do not know any better.
 *
 * Note that the linkage between the vnode and its namecache entries will
 * be removed, but the namecache entries themselves might stay put due to
 * active references from elsewhere in the system or due to the existence of
 * the children.  The namecache topology is left intact even if we do not
 * know what the vnode association is.  Such entries will be marked
 * NCF_UNRESOLVED.
 */
void
cache_purge(struct vnode *vp)
{
	cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
}

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable getcwd");

static u_long numcwdcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
    "Number of current directory resolution calls");
static u_long numcwdfailnf;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
    "Number of current directory failures due to lack of file");
static u_long numcwdfailsz;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
    "Number of current directory failures due to large result");
static u_long numcwdfound;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
    "Number of current directory resolution successes");

/*
 * MPALMOSTSAFE
 */
int
sys___getcwd(struct __getcwd_args *uap)
{
	u_int buflen;
	int error;
	char *buf;
	char *bp;

	if (disablecwd)
		return (ENODEV);

	buflen = uap->buflen;
	if (buflen == 0)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = kmalloc(buflen, M_TEMP, M_WAITOK);
	bp = kern_getcwd(buf, buflen, &error);
	if (error == 0)
		error = copyout(bp, uap->buf, strlen(bp) + 1);
	kfree(buf, M_TEMP);
	return (error);
}

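/*
 * Illustrative userland sketch (not part of this file): the ERANGE
 * handling that the numcwdfailsz counter above corresponds to.  A
 * caller of getcwd(3) typically retries with a larger buffer until
 * the kernel-built path fits; a persistent non-ERANGE error (e.g.
 * ENOENT when the cwd has been unlinked) is fatal.  Purely an example;
 * none of this runs in the kernel.
 */
#if 0
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>

static char *
example_getcwd_retry(void)
{
	size_t len = 256;
	char *buf = NULL;

	for (;;) {
		char *nbuf = realloc(buf, len);
		if (nbuf == NULL) {
			free(buf);
			return (NULL);
		}
		buf = nbuf;
		if (getcwd(buf, len) != NULL)
			return (buf);		/* path fit in len bytes */
		if (errno != ERANGE) {
			free(buf);
			return (NULL);		/* e.g. ENOENT: cwd unlinked */
		}
		len *= 2;			/* result too large, grow and retry */
	}
}
#endif
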
char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct nchandle nch;
	struct namecache *ncp;

	numcwdcalls++;
	bp = buf;
	bp += buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;

	nch = fdp->fd_ncdir;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);

	while (ncp && (ncp != fdp->fd_nrdir.ncp ||
	       nch.mount != fdp->fd_nrdir.mount)
	) {
		/*
		 * While traversing upwards if we encounter the root
		 * of the current mount we have to skip to the mount point
		 * in the underlying filesystem.
		 */
		if (ncp == nch.mount->mnt_ncmountpt.ncp) {
			nch = nch.mount->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfailsz++;
				*error = ERANGE;
				bp = NULL;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			if (ncp_shared_lock_disable)
				_cache_lock(ncp);
			else
				_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		numcwdfailnf++;
		*error = ENOENT;
		bp = NULL;
		goto done;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
	}
	numcwdfound++;
	*error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return (bp);
}

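/*
 * Illustrative sketch (not part of the build): an in-kernel caller of
 * kern_getcwd().  The function builds the path backwards from the end
 * of the caller-supplied buffer and returns a pointer into that buffer
 * (or NULL with *error set), so the original allocation, not the
 * returned pointer, is what gets freed.  example_print_cwd() is a
 * hypothetical helper; it resolves the cwd of curproc only.
 */
#if 0
static void
example_print_cwd(void)
{
	char *buf;
	char *path;
	int error;

	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	path = kern_getcwd(buf, MAXPATHLEN, &error);
	if (path != NULL)
		kprintf("cwd of pid %d is %s\n",
			(int)curproc->p_pid, path);
	else
		kprintf("cwd lookup failed: %d\n", error);
	kfree(buf, M_TEMP);
}
#endif
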
/*
 * Thus begins the fullpath magic.
 *
 * The passed nchp is referenced but not locked.
 */
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0,
    "Disable fullpath lookups");

static u_int numfullpathcalls;
SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
    &numfullpathcalls, 0,
    "Number of full path resolutions in progress");
static u_int numfullpathfailnf;
SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
    &numfullpathfailnf, 0,
    "Number of full path resolution failures due to lack of file");
static u_int numfullpathfailsz;
SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
    &numfullpathfailsz, 0,
    "Number of full path resolution failures due to insufficient memory");
static u_int numfullpathfound;
SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
    &numfullpathfound, 0,
    "Number of full path resolution successes");

int
cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
	       char **retbuf, char **freebuf, int guess)
{
	struct nchandle fd_nrdir;
	struct nchandle nch;
	struct namecache *ncp;
	struct mount *mp, *new_mp;
	char *bp, *buf;
	int slash_prefixed;
	int error = 0;
	int i;

	atomic_add_int(&numfullpathcalls, -1);

	*retbuf = NULL;
	*freebuf = NULL;

	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	if (nchbase)
		fd_nrdir = *nchbase;
	else if (p != NULL)
		fd_nrdir = p->p_fd->fd_nrdir;
	else
		fd_nrdir = rootnch;
	slash_prefixed = 0;
	nch = *nchp;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);
	mp = nch.mount;

	while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
		new_mp = NULL;

		/*
		 * If we are asked to guess the upwards path, we do so whenever
		 * we encounter an ncp marked as a mountpoint.  We try to find
		 * the actual mountpoint by locating the mount that is mounted
		 * on this ncp.
		 */
		if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
			new_mp = mount_get_by_nc(ncp);
		}
		/*
		 * While traversing upwards if we encounter the root
		 * of the current mount we have to skip to the mount point.
		 */
		if (ncp == mp->mnt_ncmountpt.ncp) {
			new_mp = mp;
		}
		if (new_mp) {
			nch = new_mp->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			mp = nch.mount;
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numfullpathfailsz++;
				kfree(buf, M_TEMP);
				error = ENOMEM;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numfullpathfailsz++;
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 *
		 * We can only safely access nc_parent with ncp held locked.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			_cache_lock(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		numfullpathfailnf++;
		kfree(buf, M_TEMP);
		error = ENOENT;
		goto done;
	}

	if (!slash_prefixed) {
		if (bp == buf) {
			numfullpathfailsz++;
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
	}
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return(error);
}

int
vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
	    char **freebuf, int guess)
{
	struct namecache *ncp;
	struct nchandle nch;
	int error;

	*freebuf = NULL;
	atomic_add_int(&numfullpathcalls, 1);
	if (disablefullpath)
		return (ENODEV);

	if (p == NULL)
		return (EINVAL);

	/* vn is NULL, client wants us to use p->p_textvp */
	if (vn == NULL) {
		if ((vn = p->p_textvp) == NULL)
			return (EINVAL);
	}
	spin_lock_shared(&vn->v_spin);
	TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
		if (ncp->nc_nlen)
			break;
	}
	if (ncp == NULL) {
		spin_unlock_shared(&vn->v_spin);
		return (EINVAL);
	}
	_cache_hold(ncp);
	spin_unlock_shared(&vn->v_spin);

	atomic_add_int(&numfullpathcalls, -1);
	nch.ncp = ncp;
	nch.mount = vn->v_mount;
	error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
	_cache_drop(ncp);
	return (error);
}
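/*
 * Illustrative sketch (not part of the build): a typical consumer of
 * vn_fullpath() above, resolving the path of a process's text vnode.
 * Passing a NULL vnode asks vn_fullpath() to fall back to p->p_textvp;
 * on success, retbuf points into the allocation returned via freebuf,
 * so freebuf (not retbuf) is what must be freed once the path has been
 * used.  example_log_textpath() is a hypothetical helper.
 */
#if 0
static void
example_log_textpath(struct proc *p)
{
	char *path;		/* points into the fullpath buffer */
	char *freepath;		/* the allocation to free */

	if (vn_fullpath(p, NULL, &path, &freepath, 0) == 0) {
		kprintf("pid %d text path: %s\n", (int)p->p_pid, path);
		kfree(freepath, M_TEMP);
	}
}
#endif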