/*
 * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/sysref2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  ncneglist is locked
 * with a global spinlock (ncspin).
 *
 * MPSAFE RULES:
 *
 * (1) A ncp must be referenced before it can be locked.
 *
 * (2) A ncp must be locked in order to modify it.
 *
 * (3) ncp locks are always ordered child -> parent.  That may seem
 *     backwards but forward scans use the hash table and thus can hold
 *     the parent unlocked when traversing downward.
 *
 *     This allows insert/rename/delete/dot-dot and other operations
 *     to use ncp->nc_parent links.
 *
 *     This also prevents a locked up e.g. NFS node from creating a
 *     chain reaction all the way back to the root vnode / namecache.
 *
 * (4) parent linkages require both the parent and child to be locked.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	1009	/* prime number */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

LIST_HEAD(nchash_list, namecache);

struct nchash_head {
	struct nchash_list list;
	struct spinlock	spin;
};

struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	int isneg;		/* if != 0 mp is originator and not target */
};

static struct nchash_head	*nchashtbl;
static struct namecache_list	ncneglist;
static struct spinlock		ncspin;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code run as if the parent vnode did not
 *	have a namecache record, even if it does have one.
 */
static int	ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

static int	ncnegflush = 10;	/* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

static int	ncposflush = 10;	/* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

static int	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

static int	nclockwarn;		/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

static int	numdefered;		/* number of deferred zaps */
SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred zaps");

static int	ncposlimit;		/* limit on positive entries */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Limit on positive namecache entries");

static int	ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
	   &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
	   &ncmount_cache_enable, 0, "mount point cache");
static long ncmount_cache_hit;
SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
	    &ncmount_cache_hit, 0, "mpcache hits");
static long ncmount_cache_miss;
SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
	    &ncmount_cache_miss, 0, "mpcache misses");
static long ncmount_cache_overwrite;
SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
	    &ncmount_cache_overwrite, 0, "mpcache entry overwrites");

static int cache_resolve_mp(struct mount *mp);
static struct vnode *cache_dvpref(struct namecache *ncp);
static void _cache_lock(struct namecache *ncp);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(int count);
static void _cache_cleanpos(int count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static int numneg;
SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative namecache entries");
static int numcache;
SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecache entries");
static u_long numcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
    "Number of namecache lookups");
static u_long numchecks;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
    "Number of checked entries in namecache lookups");

struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
			sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

static struct namecache *cache_zap(struct namecache *ncp, int nonblock);

/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  This function prevents
 * the namespace from being created or destroyed by accessors other than
 * the lock holder.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
static
void
_cache_lock(struct namecache *ncp)
{
	thread_t td;
	int didwarn;
	int begticks;
	int error;
	u_int count;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;
	begticks = 0;
	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		if (ncp->nc_locktd == td) {
			KKASSERT((count & NC_SHLOCK_FLAG) == 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		tsleep_interlock(&ncp->nc_locktd, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_EXLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		if (begticks == 0)
			begticks = ticks;
		error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
			       "clock", nclockwarn);
		if (error == EWOULDBLOCK) {
			if (didwarn == 0) {
				didwarn = ticks;
				kprintf("[diagnostic] cache_lock: "
					"%s blocked on %p %08x",
					td->td_comm, ncp, count);
				kprintf(" \"%*.*s\"\n",
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after "
			"%d secs\n",
			td->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks + (hz / 2) - begticks) / hz);
	}
}

/*
 * The shared lock works similarly to the exclusive lock except
 * nc_locktd is left NULL and we need an interlock (VHOLD) to
 * prevent vhold() races, since the moment our cmpset_int succeeds
 * another cpu can come in and get its own shared lock.
 *
 * A critical section is needed to prevent interruption during the
 * VHOLD interlock.
 */
static
void
_cache_lock_shared(struct namecache *ncp)
{
	int didwarn;
	int error;
	u_int count;
	u_int optreq = NC_EXLOCK_REQ;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		if ((count & ~NC_SHLOCK_REQ) == 0) {
			crit_enter();
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
				      count,
				      (count + 1) | NC_SHLOCK_FLAG |
						    NC_SHLOCK_VHOLD)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				atomic_clear_int(&ncp->nc_lockstatus,
						 NC_SHLOCK_VHOLD);
				crit_exit();
				break;
			}
			/* cmpset failed */
			crit_exit();
			continue;
		}

		/*
		 * If already held shared we can just bump the count, but
		 * only allow this if nobody is trying to get the lock
		 * exclusively.  If we are blocking too long ignore excl
		 * requests (which can race/deadlock us).
		 *
		 * VHOLD is a bit of a hack.  Even though we successfully
		 * added another shared ref, the cpu that got the first
		 * shared ref might not yet have held the vnode.
		 */
		if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
					cpu_pause();
				break;
			}
			continue;
		}
		tsleep_interlock(ncp, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_SHLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
		if (error == EWOULDBLOCK) {
			optreq = 0;
			if (didwarn == 0) {
				didwarn = ticks - nclockwarn;
				kprintf("[diagnostic] cache_lock_shared: "
					"%s blocked on %p %08x",
					curthread->td_comm, ncp, count);
				kprintf(" \"%*.*s\"\n",
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock_shared: "
			"%s unblocked %*.*s after %d secs\n",
			curthread->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks - didwarn) / hz);
	}
}

/*
 * Lock ncp exclusively, return 0 on success.
 *
 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 *	 such as the case where one of its children is locked.
 */
static
int
_cache_lock_nonblock(struct namecache *ncp)
{
	thread_t td;
	u_int count;

	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;

		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		if (ncp->nc_locktd == td) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		return(EWOULDBLOCK);
	}
	return(0);
}

/*
 * The shared lock works similarly to the exclusive lock except
 * nc_locktd is left NULL and we need an interlock (VHOLD) to
 * prevent vhold() races, since the moment our cmpset_int succeeds
 * another cpu can come in and get its own shared lock.
 *
 * A critical section is needed to prevent interruption during the
 * VHOLD interlock.
 */
static
int
_cache_lock_shared_nonblock(struct namecache *ncp)
{
	u_int count;

	for (;;) {
		count = ncp->nc_lockstatus;

		if ((count & ~NC_SHLOCK_REQ) == 0) {
			crit_enter();
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
				      count,
				      (count + 1) | NC_SHLOCK_FLAG |
						    NC_SHLOCK_VHOLD)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				atomic_clear_int(&ncp->nc_lockstatus,
						 NC_SHLOCK_VHOLD);
				crit_exit();
				break;
			}
			/* cmpset failed */
			crit_exit();
			continue;
		}

		/*
		 * If already held shared we can just bump the count, but
		 * only allow this if nobody is trying to get the lock
		 * exclusively.
		 *
		 * VHOLD is a bit of a hack.  Even though we successfully
		 * added another shared ref, the cpu that got the first
		 * shared ref might not yet have held the vnode.
		 */
		if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
		    NC_SHLOCK_FLAG) {
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
					cpu_pause();
				break;
			}
			continue;
		}
		return(EWOULDBLOCK);
	}
	return(0);
}

/*
 * Helper function
 *
 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 *
 *	 nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
 */
static
void
_cache_unlock(struct namecache *ncp)
{
	thread_t td __debugvar = curthread;
	u_int count;
	u_int ncount;
	struct vnode *dropvp;

	KKASSERT(ncp->nc_refs >= 0);
	KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
	KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);

	count = ncp->nc_lockstatus;
	cpu_ccfence();

	/*
	 * Clear nc_locktd prior to the atomic op (excl lock only)
	 */
	if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
		ncp->nc_locktd = NULL;
	dropvp = NULL;

	for (;;) {
		if ((count &
		     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
			dropvp = ncp->nc_vp;
			if (count & NC_EXLOCK_REQ)
				ncount = count & NC_SHLOCK_REQ;	/* cnt->0 */
			else
				ncount = 0;

			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, ncount)) {
				if (count & NC_EXLOCK_REQ)
					wakeup(&ncp->nc_locktd);
				else if (count & NC_SHLOCK_REQ)
					wakeup(ncp);
				break;
			}
			dropvp = NULL;
		} else {
			KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
			KKASSERT((count & ~(NC_EXLOCK_REQ |
					    NC_SHLOCK_REQ |
					    NC_SHLOCK_FLAG)) > 1);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count - 1)) {
				break;
			}
		}
		count = ncp->nc_lockstatus;
		cpu_ccfence();
	}

	/*
	 * Don't actually drop the vp until we successfully clean out
	 * the lock, otherwise we may race another shared lock.
	 */
	if (dropvp)
		vdrop(dropvp);
}

static
int
_cache_lockstatus(struct namecache *ncp)
{
	if (ncp->nc_locktd == curthread)
		return(LK_EXCLUSIVE);
	if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
		return(LK_SHARED);
	return(-1);
}

/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already at least 1.
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't use one ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	atomic_add_int(&ncp->nc_refs, 1);
	return(ncp);
}

/*
 * Drop a cache entry, taking care to deal with races.
 *
 * For potential 1->0 transitions we must hold the ncp lock to safely
 * test its flags.  An unresolved entry with no children must be zapped
 * to avoid leaks.
 *
 * The call to cache_zap() itself will handle all remaining races and
 * will decrement the ncp's refs regardless.  If we are resolved or
 * have children nc_refs can safely be dropped to 0 without having to
 * zap the entry.
 *
 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 *
 * NOTE: cache_zap() may return a non-NULL referenced parent which must
 *	 be dropped in a loop.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	int refs;

	while (ncp) {
		KKASSERT(ncp->nc_refs > 0);
		refs = ncp->nc_refs;

		if (refs == 1) {
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				if ((ncp->nc_flag & NCF_UNRESOLVED) &&
				    TAILQ_EMPTY(&ncp->nc_list)) {
					ncp = cache_zap(ncp, 1);
					continue;
				}
				if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
					_cache_unlock(ncp);
					break;
				}
				_cache_unlock(ncp);
			}
		} else {
			if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
				break;
		}
		cpu_pause();
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.
 *
 * NOTE: The hash table spinlock is held during this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * If this is the last child of the parent the cache_drop(par) will
 * attempt to recursively zap the parent.
 *
 * ncp must be locked.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
	struct namecache *par;
	struct vnode *dropvp;

	if ((par = ncp->nc_parent) != NULL) {
		KKASSERT(ncp->nc_parent == par);
		_cache_hold(par);
		_cache_lock(par);
		spin_lock(&ncp->nc_head->spin);
		LIST_REMOVE(ncp, nc_hash);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		dropvp = NULL;
		if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
			dropvp = par->nc_vp;
		spin_unlock(&ncp->nc_head->spin);
		ncp->nc_parent = NULL;
		ncp->nc_head = NULL;
		_cache_unlock(par);
		_cache_drop(par);

		/*
		 * We can only safely vdrop with no spinlocks held.
		 */
		if (dropvp)
			vdrop(dropvp);
	}
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 */
static struct namecache *
cache_alloc(int nlen)
{
	struct namecache *ncp;

	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
	if (nlen)
		ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	ncp->nc_nlen = nlen;
	ncp->nc_flag = NCF_UNRESOLVED;
	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
	ncp->nc_refs = 1;

	TAILQ_INIT(&ncp->nc_list);
	_cache_lock(ncp);
	return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 */
static void
_cache_free(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
	if (ncp->nc_name)
		kfree(ncp->nc_name, M_VFSCACHE);
	kfree(ncp, M_VFSCACHE);
}

/*
 * [re]initialize a nchandle.
 */
void
cache_zero(struct nchandle *nch)
{
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Ref and deref a namecache structure.
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 *
 * MPSAFE if nch is
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
	_cache_hold(nch->ncp);
	atomic_add_int(&nch->mount->mnt_refs, 1);
	return(nch);
}

/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
 *
 * MPSAFE if nch is
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
	*target = *nch;
	if (target->ncp)
		_cache_hold(target->ncp);
	atomic_add_int(&nch->mount->mnt_refs, 1);
}

/*
 * MPSAFE if nch is
 */
void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
	atomic_add_int(&nch->mount->mnt_refs, -1);
	nch->mount = mp;
	atomic_add_int(&nch->mount->mnt_refs, 1);
}

void
cache_drop(struct nchandle *nch)
{
	atomic_add_int(&nch->mount->mnt_refs, -1);
	_cache_drop(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

int
cache_lockstatus(struct nchandle *nch)
{
	return(_cache_lockstatus(nch->ncp));
}

void
cache_lock(struct nchandle *nch)
{
	_cache_lock(nch->ncp);
}

void
cache_lock_maybe_shared(struct nchandle *nch, int excl)
{
	struct namecache *ncp = nch->ncp;

	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		_cache_lock(ncp);
	} else {
		_cache_lock_shared(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
				_cache_unlock(ncp);
				_cache_lock(ncp);
			}
		} else {
			_cache_unlock(ncp);
			_cache_lock(ncp);
		}
	}
}

/*
 * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 * is responsible for checking both for validity on return as they
 * may have become invalid.
 *
 * We have to deal with potential deadlocks here, just ping pong
 * the lock until we get it (we will always block somewhere when
 * looping so this is not cpu-intensive).
 *
 * which = 0	nch1 not locked, nch2 is locked
 * which = 1	nch1 is locked, nch2 is not locked
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
	     struct nchandle *nch2, struct ucred *cred2)
{
	int which;

	which = 0;

	for (;;) {
		if (which == 0) {
			if (cache_lock_nonblock(nch1) == 0) {
				cache_resolve(nch1, cred1);
				break;
			}
			cache_unlock(nch2);
			cache_lock(nch1);
			cache_resolve(nch1, cred1);
			which = 1;
		} else {
			if (cache_lock_nonblock(nch2) == 0) {
				cache_resolve(nch2, cred2);
				break;
			}
			cache_unlock(nch1);
			cache_lock(nch2);
			cache_resolve(nch2, cred2);
			which = 0;
		}
	}
}

int
cache_lock_nonblock(struct nchandle *nch)
{
	return(_cache_lock_nonblock(nch->ncp));
}

void
cache_unlock(struct nchandle *nch)
{
	_cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
	_cache_hold(ncp);
	_cache_lock(ncp);
	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
		_cache_setunresolved(ncp);
	return(ncp);
}

/*
 * Attempt to obtain a shared lock on the ncp.  A shared lock will only
 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
 * valid.  Otherwise an exclusive lock will be acquired instead.
 */
static
struct namecache *
_cache_get_maybe_shared(struct namecache *ncp, int excl)
{
	if (ncp_shared_lock_disable || excl ||
	    (ncp->nc_flag & NCF_UNRESOLVED)) {
		return(_cache_get(ncp));
	}
	_cache_hold(ncp);
	_cache_lock_shared(ncp);
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
			_cache_unlock(ncp);
			ncp = _cache_get(ncp);
			_cache_drop(ncp);
		}
	} else {
		_cache_unlock(ncp);
		ncp = _cache_get(ncp);
		_cache_drop(ncp);
	}
	return(ncp);
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 */
static int
_cache_lock_special(struct namecache *ncp)
{
	if (_cache_lock_nonblock(ncp) == 0) {
		if ((ncp->nc_lockstatus &
		     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
			if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
				_cache_setunresolved(ncp);
			return(0);
		}
		_cache_unlock(ncp);
	}
	return(EWOULDBLOCK);
}

/*
 * This function tries to get a shared lock but will back-off to an exclusive
 * lock if:
 *
 * (1) Some other thread is trying to obtain an exclusive lock
 *     (to prevent the exclusive requester from getting livelocked out
 *     by many shared locks).
 *
 * (2) The current thread already owns an exclusive lock (to avoid
 *     deadlocking).
 *
 * WARNING! On machines with lots of cores we really want to try hard to
 *	    get a shared lock or concurrent path lookups can chain-react
 *	    into a very high-latency exclusive lock.
 */
static int
_cache_lock_shared_special(struct namecache *ncp)
{
	/*
	 * Only honor a successful shared lock (returning 0) if there is
	 * no exclusive request pending and the vnode, if present, is not
	 * in a reclaimed state.
	 */
	if (_cache_lock_shared_nonblock(ncp) == 0) {
		if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
			if (ncp->nc_vp == NULL ||
			    (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
				return(0);
			}
		}
		_cache_unlock(ncp);
		return(EWOULDBLOCK);
	}

	/*
	 * Non-blocking shared lock failed.  If we already own the exclusive
	 * lock just acquire another exclusive lock (instead of deadlocking).
	 * Otherwise acquire a shared lock.
	 */
	if (ncp->nc_locktd == curthread) {
		_cache_lock(ncp);
		return(0);
	}
	_cache_lock_shared(ncp);
	return(0);
}


/*
 * NOTE: The same nchandle can be passed for both arguments.
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get(nch->ncp);
	atomic_add_int(&target->mount->mnt_refs, 1);
}

void
cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
{
	KKASSERT(nch->ncp->nc_refs > 0);
	target->mount = nch->mount;
	target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
	atomic_add_int(&target->mount->mnt_refs, 1);
}

/*
 *
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}

/*
 *
 */
void
cache_put(struct nchandle *nch)
{
	atomic_add_int(&nch->mount->mnt_refs, -1);
	_cache_put(nch->ncp);
	nch->ncp = NULL;
	nch->mount = NULL;
}

/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	if (vp != NULL) {
		/*
		 * Any vp associated with an ncp which has children must
		 * be held.  Any vp associated with a locked ncp must be held.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			vhold(vp);
		spin_lock(&vp->v_spin);
		ncp->nc_vp = vp;
		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
		spin_unlock(&vp->v_spin);
		if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
			vhold(vp);

		/*
		 * Set auxiliary flags
		 */
		switch(vp->v_type) {
		case VDIR:
			ncp->nc_flag |= NCF_ISDIR;
			break;
		case VLNK:
			ncp->nc_flag |= NCF_ISSYMLINK;
			/* XXX cache the contents of the symlink */
			break;
		default:
			break;
		}
		atomic_add_int(&numcache, 1);
		ncp->nc_error = 0;
		/*
		 * XXX: this is a hack to work-around the lack of a real
		 * pfs vfs implementation
		 */
		if (mp != NULL)
			if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
				vp->v_pfsmp = mp;
	} else {
		/*
		 * When creating a negative cache hit we set the
		 * namecache_gen.  A later resolve will clean out the
		 * negative cache hit if the mount point's namecache_gen
		 * has changed.  Used by devfs, could also be used by
		 * other remote FSs.
		 */
		ncp->nc_vp = NULL;
		spin_lock(&ncspin);
		TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
		++numneg;
		spin_unlock(&ncspin);
		ncp->nc_error = ENOENT;
		if (mp)
			VFS_NCPGEN_SET(mp, ncp);
	}
	ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

/*
 *
 */
void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
	_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 *
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_timeout = ticks + nticks) == 0)
		ncp->nc_timeout = 1;
}

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 *
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
	struct vnode *vp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		ncp->nc_timeout = 0;
		ncp->nc_error = ENOTCONN;
		if ((vp = ncp->nc_vp) != NULL) {
			atomic_add_int(&numcache, -1);
			spin_lock(&vp->v_spin);
			ncp->nc_vp = NULL;
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			spin_unlock(&vp->v_spin);

			/*
			 * Any vp associated with an ncp with children is
			 * held by that ncp.  Any vp associated with a locked
			 * ncp is held by that ncp.  These conditions must be
			 * undone when the vp is cleared out from the ncp.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
			if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
				vdrop(vp);
		} else {
			spin_lock(&ncspin);
			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
			--numneg;
			spin_unlock(&ncspin);
		}
		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
	}
}

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Try to zap entries that have timed out.  We have
	 * to be careful here because locked leafs may depend
	 * on the vnode remaining intact in a parent, so only
	 * do this under very specific conditions.
	 */
	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	    TAILQ_EMPTY(&ncp->nc_list)) {
		return 1;
	}

	/*
	 * If a resolved negative cache hit is invalid due to
	 * the mount's namecache generation being bumped, zap it.
	 */
	if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
		return 1;
	}

	/*
	 * Otherwise we are good
	 */
	return 0;
}

static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
	/*
	 * Already in an unresolved state, nothing to do.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (_cache_auto_unresolve_test(mp, ncp))
			_cache_setunresolved(ncp);
	}
}

/*
 *
 */
void
cache_setunresolved(struct nchandle *nch)
{
	_cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
	struct nchandle *nch = data;

	if (mp->mnt_ncmounton.ncp == nch->ncp)
		return(1);
	if (mp->mnt_ncmountpt.ncp == nch->ncp)
		return(1);
	return(0);
}

/*
 *
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			_cache_put(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}
	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.
			 */
			_cache_unlock(ncp);
			if (track->resume_ncp) {
				_cache_drop(kid);
				_cache_lock(ncp);
				break;
			}
			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
			    TAILQ_FIRST(&kid->nc_list)
			) {
				_cache_lock(kid);
				if (kid->nc_parent != ncp) {
					kprintf("cache_inval_internal "
						"restartB %s\n",
						ncp->nc_name);
					restart = 1;
					_cache_unlock(kid);
					_cache_drop(kid);
					_cache_lock(ncp);
					break;
				}

				rcnt += _cache_inval_internal(kid,
						flags & ~CINV_DESTROY, track);
				_cache_unlock(kid);
			}
			_cache_drop(kid);
			_cache_lock(ncp);
		}
		if (nextkid)
			_cache_drop(nextkid);
		--track->depth;
		if (restart == 0)
			break;
	}

	/*
	 * Someone could have gotten in there while ncp was unlocked,
	 * retry if so.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
		++rcnt;
	return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
	struct namecache *ncp;
	struct namecache *next;

restart:
	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held and vp spin-locked */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		_cache_lock(ncp);
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto restart;
		}
		_cache_inval(ncp, flags);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto restart;
		}
	}
	spin_unlock(&vp->v_spin);
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
	struct namecache *ncp;
	struct namecache *next;

	spin_lock(&vp->v_spin);
	ncp = TAILQ_FIRST(&vp->v_namecache);
	if (ncp)
		_cache_hold(ncp);
	while (ncp) {
		/* loop entered with ncp held */
		if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
			_cache_hold(next);
		spin_unlock(&vp->v_spin);
		if (_cache_lock_nonblock(ncp)) {
			_cache_drop(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		if (ncp->nc_vp != vp) {
			kprintf("Warning: cache_inval_vp: race-A detected on "
				"%s\n", ncp->nc_name);
			_cache_put(ncp);
			if (next)
				_cache_drop(next);
			goto done;
		}
		_cache_inval(ncp, 0);
		_cache_put(ncp);		/* also releases reference */
		ncp = next;
		spin_lock(&vp->v_spin);
		if (ncp && ncp->nc_vp != vp) {
			spin_unlock(&vp->v_spin);
			kprintf("Warning: cache_inval_vp: race-B detected on "
				"%s\n", ncp->nc_name);
			_cache_drop(ncp);
			goto done;
		}
	}
	spin_unlock(&vp->v_spin);
done:
	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * The source ncp has been renamed to the target ncp.  Both fncp and tncp
 * must be locked.  The target ncp is destroyed (as a normal rename-over
 * would destroy the target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
	struct namecache *fncp = fnch->ncp;
	struct namecache *tncp = tnch->ncp;
	struct namecache *tncp_par;
	struct nchash_head *nchpp;
	u_int32_t hash;
	char *oname;
	char *nname;

	++fncp->nc_generation;
	++tncp->nc_generation;
	if (tncp->nc_nlen) {
		nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
		bcopy(tncp->nc_name, nname, tncp->nc_nlen);
		nname[tncp->nc_nlen] = 0;
	} else {
		nname = NULL;
	}

	/*
	 * Rename fncp (unlink)
	 */
	_cache_unlink_parent(fncp);
	oname = fncp->nc_name;
	fncp->nc_name = nname;
	fncp->nc_nlen = tncp->nc_nlen;
	if (oname)
		kfree(oname, M_VFSCACHE);

	tncp_par = tncp->nc_parent;
	_cache_hold(tncp_par);
	_cache_lock(tncp_par);

	/*
	 * Rename fncp (relink)
	 */
	hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
	nchpp = NCHHASH(hash);

	spin_lock(&nchpp->spin);
	_cache_link_parent(fncp, tncp_par, nchpp);
	spin_unlock(&nchpp->spin);

	_cache_put(tncp_par);

	/*
	 * Get rid of the overwritten tncp (unlink)
	 */
	_cache_unlink(tncp);
}

/*
 * Perform actions consistent with unlinking a file.  The passed-in ncp
 * must be locked.
 *
 * The ncp is marked DESTROYED so it no longer shows up in searches,
 * and will be physically deleted when the vnode goes away.
 *
 * If the related vnode has no refs then we cycle it through vget()/vput()
 * to (possibly if we don't have a ref race) trigger a deactivation,
 * allowing the VFS to trivially detect and recycle the deleted vnode
 * via VOP_INACTIVE().
 *
 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
 *	 target ncp.
 */
void
cache_unlink(struct nchandle *nch)
{
	_cache_unlink(nch->ncp);
}

static void
_cache_unlink(struct namecache *ncp)
{
	struct vnode *vp;

	/*
	 * Causes lookups to fail and allows another ncp with the same
	 * name to be created under ncp->nc_parent.
	 */
	ncp->nc_flag |= NCF_DESTROYED;
	++ncp->nc_generation;

	/*
	 * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
	 * force action on the 1->0 transition.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL) {
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		if (VREFCNT(vp) <= 0) {
			if (vget(vp, LK_SHARED) == 0)
				vput(vp);
		}
	}
}

/*
 * Return non-zero if the nch might be associated with an open and/or mmap()'d
 * file.  The easy solution is to just return non-zero if the vnode has refs.
 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
 * force the reclaim).
 */
int
cache_isopen(struct nchandle *nch)
{
	struct vnode *vp;
	struct namecache *ncp = nch->ncp;

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
	    (vp = ncp->nc_vp) != NULL &&
	    VREFCNT(vp)) {
		return 1;
	}
	return 0;
}


/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.  If
 * the ncp is resolved it might be locked shared.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.
 * A ref'd, possibly locked vnode
 * (depending on the passed lk_type) will be returned in *vpp with an error
 * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
 * most typical error is ENOENT, meaning that the ncp represents a negative
 * cache hit and there is no vnode to retrieve, but other errors can occur
 * too.
 *
 * The vget() can race a reclaim.  If this occurs we re-resolve the
 * namecache entry.
 *
 * There are numerous places in the kernel where vget() is called on a
 * vnode while one or more of its namecache entries is locked.  Releasing
 * a vnode never deadlocks against locked namecache entries (the vnode
 * will not get recycled while referenced ncp's exist).  This means we
 * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
 * lock when acquiring the vp lock or we might cause a deadlock.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 */
int
cache_vget(struct nchandle *nch, struct ucred *cred,
	   int lk_type, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	if (error == 0 && (vp = ncp->nc_vp) != NULL) {
		error = vget(vp, lk_type);
		if (error) {
			/*
			 * VRECLAIM race
			 *
			 * The ncp may have been locked shared, we must relock
			 * it exclusively before we can set it to unresolved.
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vget on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
			 */
			KKASSERT(ncp->nc_vp == vp);
			vp = NULL;
		} else {
			KKASSERT(ncp->nc_vp == vp);
			KKASSERT((vp->v_flag & VRECLAIMED) == 0);
		}
	}
	if (error == 0 && vp == NULL)
		error = ENOENT;
	*vpp = vp;
	return(error);
}

/*
 * Similar to cache_vget() but only acquires a ref on the vnode.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *	 unresolved.  If a reclaim race occurs the passed-in ncp will be
 *	 relocked exclusively before being re-resolved.
 */
int
cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
{
	struct namecache *ncp;
	struct vnode *vp;
	int error;

	ncp = nch->ncp;
again:
	vp = NULL;
	if (ncp->nc_flag & NCF_UNRESOLVED)
		error = cache_resolve(nch, cred);
	else
		error = 0;

	if (error == 0 && (vp = ncp->nc_vp) != NULL) {
		error = vget(vp, LK_SHARED);
		if (error) {
			/*
			 * VRECLAIM race
			 */
			if (error == ENOENT) {
				kprintf("Warning: vnode reclaim race detected "
					"in cache_vref on %p (%s)\n",
					vp, ncp->nc_name);
				_cache_unlock(ncp);
				_cache_lock(ncp);
				_cache_setunresolved(ncp);
				goto again;
			}

			/*
			 * Not a reclaim race, some other error.
1983 */ 1984 KKASSERT(ncp->nc_vp == vp); 1985 vp = NULL; 1986 } else { 1987 KKASSERT(ncp->nc_vp == vp); 1988 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1989 /* caller does not want a lock */ 1990 vn_unlock(vp); 1991 } 1992 } 1993 if (error == 0 && vp == NULL) 1994 error = ENOENT; 1995 *vpp = vp; 1996 return(error); 1997 } 1998 1999 /* 2000 * Return a referenced vnode representing the parent directory of 2001 * ncp. 2002 * 2003 * Because the caller has locked the ncp it should not be possible for 2004 * the parent ncp to go away. However, the parent can unresolve its 2005 * dvp at any time so we must be able to acquire a lock on the parent 2006 * to safely access nc_vp. 2007 * 2008 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2009 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2010 * getting destroyed. 2011 * 2012 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2013 * lock on the ncp in question.. 2014 */ 2015 static struct vnode * 2016 cache_dvpref(struct namecache *ncp) 2017 { 2018 struct namecache *par; 2019 struct vnode *dvp; 2020 2021 dvp = NULL; 2022 if ((par = ncp->nc_parent) != NULL) { 2023 _cache_hold(par); 2024 _cache_lock(par); 2025 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2026 if ((dvp = par->nc_vp) != NULL) 2027 vhold(dvp); 2028 } 2029 _cache_unlock(par); 2030 if (dvp) { 2031 if (vget(dvp, LK_SHARED) == 0) { 2032 vn_unlock(dvp); 2033 vdrop(dvp); 2034 /* return refd, unlocked dvp */ 2035 } else { 2036 vdrop(dvp); 2037 dvp = NULL; 2038 } 2039 } 2040 _cache_drop(par); 2041 } 2042 return(dvp); 2043 } 2044 2045 /* 2046 * Convert a directory vnode to a namecache record without any other 2047 * knowledge of the topology. This ONLY works with directory vnodes and 2048 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2049 * returned ncp (if not NULL) will be held and unlocked. 2050 * 2051 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2052 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2053 * for dvp. This will fail only if the directory has been deleted out from 2054 * under the caller. 2055 * 2056 * Callers must always check for a NULL return no matter the value of 'makeit'. 2057 * 2058 * To avoid underflowing the kernel stack each recursive call increments 2059 * the makeit variable. 2060 */ 2061 2062 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2063 struct vnode *dvp, char *fakename); 2064 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2065 struct vnode **saved_dvp); 2066 2067 int 2068 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2069 struct nchandle *nch) 2070 { 2071 struct vnode *saved_dvp; 2072 struct vnode *pvp; 2073 char *fakename; 2074 int error; 2075 2076 nch->ncp = NULL; 2077 nch->mount = dvp->v_mount; 2078 saved_dvp = NULL; 2079 fakename = NULL; 2080 2081 /* 2082 * Handle the makeit == 0 degenerate case 2083 */ 2084 if (makeit == 0) { 2085 spin_lock_shared(&dvp->v_spin); 2086 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2087 if (nch->ncp) 2088 cache_hold(nch); 2089 spin_unlock_shared(&dvp->v_spin); 2090 } 2091 2092 /* 2093 * Loop until resolution, inside code will break out on error. 2094 */ 2095 while (makeit) { 2096 /* 2097 * Break out if we successfully acquire a working ncp. 
2098 */ 2099 spin_lock_shared(&dvp->v_spin); 2100 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2101 if (nch->ncp) { 2102 cache_hold(nch); 2103 spin_unlock_shared(&dvp->v_spin); 2104 break; 2105 } 2106 spin_unlock_shared(&dvp->v_spin); 2107 2108 /* 2109 * If dvp is the root of its filesystem it should already 2110 * have a namecache pointer associated with it as a side 2111 * effect of the mount, but it may have been disassociated. 2112 */ 2113 if (dvp->v_flag & VROOT) { 2114 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2115 error = cache_resolve_mp(nch->mount); 2116 _cache_put(nch->ncp); 2117 if (ncvp_debug) { 2118 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2119 dvp->v_mount, error); 2120 } 2121 if (error) { 2122 if (ncvp_debug) 2123 kprintf(" failed\n"); 2124 nch->ncp = NULL; 2125 break; 2126 } 2127 if (ncvp_debug) 2128 kprintf(" succeeded\n"); 2129 continue; 2130 } 2131 2132 /* 2133 * If we are recursed too deeply resort to an O(n^2) 2134 * algorithm to resolve the namecache topology. The 2135 * resolved pvp is left referenced in saved_dvp to 2136 * prevent the tree from being destroyed while we loop. 2137 */ 2138 if (makeit > 20) { 2139 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2140 if (error) { 2141 kprintf("lookupdotdot(longpath) failed %d " 2142 "dvp %p\n", error, dvp); 2143 nch->ncp = NULL; 2144 break; 2145 } 2146 continue; 2147 } 2148 2149 /* 2150 * Get the parent directory and resolve its ncp. 2151 */ 2152 if (fakename) { 2153 kfree(fakename, M_TEMP); 2154 fakename = NULL; 2155 } 2156 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2157 &fakename); 2158 if (error) { 2159 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2160 break; 2161 } 2162 vn_unlock(pvp); 2163 2164 /* 2165 * Reuse makeit as a recursion depth counter. On success 2166 * nch will be fully referenced. 2167 */ 2168 cache_fromdvp(pvp, cred, makeit + 1, nch); 2169 vrele(pvp); 2170 if (nch->ncp == NULL) 2171 break; 2172 2173 /* 2174 * Do an inefficient scan of pvp (embodied by ncp) to look 2175 * for dvp. This will create a namecache record for dvp on 2176 * success. We loop up to recheck on success. 2177 * 2178 * ncp and dvp are both held but not locked. 2179 */ 2180 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2181 if (error) { 2182 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2183 pvp, nch->ncp->nc_name, dvp); 2184 cache_drop(nch); 2185 /* nch was NULLed out, reload mount */ 2186 nch->mount = dvp->v_mount; 2187 break; 2188 } 2189 if (ncvp_debug) { 2190 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2191 pvp, nch->ncp->nc_name); 2192 } 2193 cache_drop(nch); 2194 /* nch was NULLed out, reload mount */ 2195 nch->mount = dvp->v_mount; 2196 } 2197 2198 /* 2199 * If nch->ncp is non-NULL it will have been held already. 2200 */ 2201 if (fakename) 2202 kfree(fakename, M_TEMP); 2203 if (saved_dvp) 2204 vrele(saved_dvp); 2205 if (nch->ncp) 2206 return (0); 2207 return (EINVAL); 2208 } 2209 2210 /* 2211 * Go up the chain of parent directories until we find something 2212 * we can resolve into the namecache. This is very inefficient. 2213 */ 2214 static 2215 int 2216 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2217 struct vnode **saved_dvp) 2218 { 2219 struct nchandle nch; 2220 struct vnode *pvp; 2221 int error; 2222 static time_t last_fromdvp_report; 2223 char *fakename; 2224 2225 /* 2226 * Loop getting the parent directory vnode until we get something we 2227 * can resolve in the namecache. 
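 *
 * An illustrative outline of the loop below (the code is authoritative,
 * this is only a sketch):
 *
 *	vref(dvp);
 *	for (;;) {
 *		vop_nlookupdotdot(dvp) -> pvp
 *		if (pvp has a namecache entry)	hold it and stop
 *		if (pvp is VROOT)		resolve the mount point and stop
 *		vrele(dvp); dvp = pvp;		keep walking upward
 *	}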
2228 */ 2229 vref(dvp); 2230 nch.mount = dvp->v_mount; 2231 nch.ncp = NULL; 2232 fakename = NULL; 2233 2234 for (;;) { 2235 if (fakename) { 2236 kfree(fakename, M_TEMP); 2237 fakename = NULL; 2238 } 2239 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2240 &fakename); 2241 if (error) { 2242 vrele(dvp); 2243 break; 2244 } 2245 vn_unlock(pvp); 2246 spin_lock_shared(&pvp->v_spin); 2247 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2248 _cache_hold(nch.ncp); 2249 spin_unlock_shared(&pvp->v_spin); 2250 vrele(pvp); 2251 break; 2252 } 2253 spin_unlock_shared(&pvp->v_spin); 2254 if (pvp->v_flag & VROOT) { 2255 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2256 error = cache_resolve_mp(nch.mount); 2257 _cache_unlock(nch.ncp); 2258 vrele(pvp); 2259 if (error) { 2260 _cache_drop(nch.ncp); 2261 nch.ncp = NULL; 2262 vrele(dvp); 2263 } 2264 break; 2265 } 2266 vrele(dvp); 2267 dvp = pvp; 2268 } 2269 if (error == 0) { 2270 if (last_fromdvp_report != time_uptime) { 2271 last_fromdvp_report = time_uptime; 2272 kprintf("Warning: extremely inefficient path " 2273 "resolution on %s\n", 2274 nch.ncp->nc_name); 2275 } 2276 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2277 2278 /* 2279 * Hopefully dvp now has a namecache record associated with 2280 * it. Leave it referenced to prevent the kernel from 2281 * recycling the vnode. Otherwise extremely long directory 2282 * paths could result in endless recycling. 2283 */ 2284 if (*saved_dvp) 2285 vrele(*saved_dvp); 2286 *saved_dvp = dvp; 2287 _cache_drop(nch.ncp); 2288 } 2289 if (fakename) 2290 kfree(fakename, M_TEMP); 2291 return (error); 2292 } 2293 2294 /* 2295 * Do an inefficient scan of the directory represented by ncp looking for 2296 * the directory vnode dvp. ncp must be held but not locked on entry and 2297 * will be held on return. dvp must be refd but not locked on entry and 2298 * will remain refd on return. 2299 * 2300 * Why do this at all? Well, due to its stateless nature the NFS server 2301 * converts file handles directly to vnodes without necessarily going through 2302 * the namecache ops that would otherwise create the namecache topology 2303 * leading to the vnode. We could either (1) Change the namecache algorithms 2304 * to allow disconnect namecache records that are re-merged opportunistically, 2305 * or (2) Make the NFS server backtrack and scan to recover a connected 2306 * namecache topology in order to then be able to issue new API lookups. 2307 * 2308 * It turns out that (1) is a huge mess. It takes a nice clean set of 2309 * namecache algorithms and introduces a lot of complication in every subsystem 2310 * that calls into the namecache to deal with the re-merge case, especially 2311 * since we are using the namecache to placehold negative lookups and the 2312 * vnode might not be immediately assigned. (2) is certainly far less 2313 * efficient then (1), but since we are only talking about directories here 2314 * (which are likely to remain cached), the case does not actually run all 2315 * that often and has the supreme advantage of not polluting the namecache 2316 * algorithms. 2317 * 2318 * If a fakename is supplied just construct a namecache entry using the 2319 * fake name. 
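 *
 * An illustrative outline of the scan performed below (the code is
 * authoritative, this is only a sketch):
 *
 *	VOP_GETATTR(dvp, &vat)			obtain dvp's fileid
 *	cache_vref(nch, cred, &pvp)		get the parent directory vnode
 *	repeat VOP_READDIR(pvp, ...) until EOF:
 *		for each struct dirent den in the returned block:
 *			if (not a whiteout && den->d_ino == vat.va_fileid)
 *				rncp = cache_nlookup(nch, &nlc) and stop
 *	_cache_setvp(rncp.mount, rncp.ncp, dvp)	if rncp is still unresolved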
2320 */ 2321 static int 2322 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2323 struct vnode *dvp, char *fakename) 2324 { 2325 struct nlcomponent nlc; 2326 struct nchandle rncp; 2327 struct dirent *den; 2328 struct vnode *pvp; 2329 struct vattr vat; 2330 struct iovec iov; 2331 struct uio uio; 2332 int blksize; 2333 int eofflag; 2334 int bytes; 2335 char *rbuf; 2336 int error; 2337 2338 vat.va_blocksize = 0; 2339 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2340 return (error); 2341 cache_lock(nch); 2342 error = cache_vref(nch, cred, &pvp); 2343 cache_unlock(nch); 2344 if (error) 2345 return (error); 2346 if (ncvp_debug) { 2347 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2348 "vattr fileid = %lld\n", 2349 nch->ncp, nch->ncp->nc_name, 2350 vat.va_blocksize, 2351 (long long)vat.va_fileid); 2352 } 2353 2354 /* 2355 * Use the supplied fakename if not NULL. Fake names are typically 2356 * not in the actual filesystem hierarchy. This is used by HAMMER 2357 * to glue @@timestamp recursions together. 2358 */ 2359 if (fakename) { 2360 nlc.nlc_nameptr = fakename; 2361 nlc.nlc_namelen = strlen(fakename); 2362 rncp = cache_nlookup(nch, &nlc); 2363 goto done; 2364 } 2365 2366 if ((blksize = vat.va_blocksize) == 0) 2367 blksize = DEV_BSIZE; 2368 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2369 rncp.ncp = NULL; 2370 2371 eofflag = 0; 2372 uio.uio_offset = 0; 2373 again: 2374 iov.iov_base = rbuf; 2375 iov.iov_len = blksize; 2376 uio.uio_iov = &iov; 2377 uio.uio_iovcnt = 1; 2378 uio.uio_resid = blksize; 2379 uio.uio_segflg = UIO_SYSSPACE; 2380 uio.uio_rw = UIO_READ; 2381 uio.uio_td = curthread; 2382 2383 if (ncvp_debug >= 2) 2384 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2385 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2386 if (error == 0) { 2387 den = (struct dirent *)rbuf; 2388 bytes = blksize - uio.uio_resid; 2389 2390 while (bytes > 0) { 2391 if (ncvp_debug >= 2) { 2392 kprintf("cache_inefficient_scan: %*.*s\n", 2393 den->d_namlen, den->d_namlen, 2394 den->d_name); 2395 } 2396 if (den->d_type != DT_WHT && 2397 den->d_ino == vat.va_fileid) { 2398 if (ncvp_debug) { 2399 kprintf("cache_inefficient_scan: " 2400 "MATCHED inode %lld path %s/%*.*s\n", 2401 (long long)vat.va_fileid, 2402 nch->ncp->nc_name, 2403 den->d_namlen, den->d_namlen, 2404 den->d_name); 2405 } 2406 nlc.nlc_nameptr = den->d_name; 2407 nlc.nlc_namelen = den->d_namlen; 2408 rncp = cache_nlookup(nch, &nlc); 2409 KKASSERT(rncp.ncp != NULL); 2410 break; 2411 } 2412 bytes -= _DIRENT_DIRSIZ(den); 2413 den = _DIRENT_NEXT(den); 2414 } 2415 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2416 goto again; 2417 } 2418 kfree(rbuf, M_TEMP); 2419 done: 2420 vrele(pvp); 2421 if (rncp.ncp) { 2422 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2423 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2424 if (ncvp_debug >= 2) { 2425 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2426 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2427 } 2428 } else { 2429 if (ncvp_debug >= 2) { 2430 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2431 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2432 rncp.ncp->nc_vp); 2433 } 2434 } 2435 if (rncp.ncp->nc_vp == NULL) 2436 error = rncp.ncp->nc_error; 2437 /* 2438 * Release rncp after a successful nlookup. rncp was fully 2439 * referenced. 
2440 */ 2441 cache_put(&rncp); 2442 } else { 2443 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2444 dvp, nch->ncp->nc_name); 2445 error = ENOENT; 2446 } 2447 return (error); 2448 } 2449 2450 /* 2451 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2452 * state, which disassociates it from its vnode or ncneglist. 2453 * 2454 * Then, if there are no additional references to the ncp and no children, 2455 * the ncp is removed from the topology and destroyed. 2456 * 2457 * References and/or children may exist if the ncp is in the middle of the 2458 * topology, preventing the ncp from being destroyed. 2459 * 2460 * This function must be called with the ncp held and locked and will unlock 2461 * and drop it during zapping. 2462 * 2463 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2464 * This case can occur in the cache_drop() path. 2465 * 2466 * This function may returned a held (but NOT locked) parent node which the 2467 * caller must drop. We do this so _cache_drop() can loop, to avoid 2468 * blowing out the kernel stack. 2469 * 2470 * WARNING! For MPSAFE operation this routine must acquire up to three 2471 * spin locks to be able to safely test nc_refs. Lock order is 2472 * very important. 2473 * 2474 * hash spinlock if on hash list 2475 * parent spinlock if child of parent 2476 * (the ncp is unresolved so there is no vnode association) 2477 */ 2478 static struct namecache * 2479 cache_zap(struct namecache *ncp, int nonblock) 2480 { 2481 struct namecache *par; 2482 struct vnode *dropvp; 2483 struct nchash_head *nchpp; 2484 int refs; 2485 2486 /* 2487 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2488 */ 2489 _cache_setunresolved(ncp); 2490 2491 /* 2492 * Try to scrap the entry and possibly tail-recurse on its parent. 2493 * We only scrap unref'd (other then our ref) unresolved entries, 2494 * we do not scrap 'live' entries. 2495 * 2496 * Note that once the spinlocks are acquired if nc_refs == 1 no 2497 * other references are possible. If it isn't, however, we have 2498 * to decrement but also be sure to avoid a 1->0 transition. 2499 */ 2500 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2501 KKASSERT(ncp->nc_refs > 0); 2502 2503 /* 2504 * Acquire locks. Note that the parent can't go away while we hold 2505 * a child locked. 2506 */ 2507 nchpp = NULL; 2508 if ((par = ncp->nc_parent) != NULL) { 2509 if (nonblock) { 2510 for (;;) { 2511 if (_cache_lock_nonblock(par) == 0) 2512 break; 2513 refs = ncp->nc_refs; 2514 ncp->nc_flag |= NCF_DEFEREDZAP; 2515 ++numdefered; /* MP race ok */ 2516 if (atomic_cmpset_int(&ncp->nc_refs, 2517 refs, refs - 1)) { 2518 _cache_unlock(ncp); 2519 return(NULL); 2520 } 2521 cpu_pause(); 2522 } 2523 _cache_hold(par); 2524 } else { 2525 _cache_hold(par); 2526 _cache_lock(par); 2527 } 2528 nchpp = ncp->nc_head; 2529 spin_lock(&nchpp->spin); 2530 } 2531 2532 /* 2533 * At this point if we find refs == 1 it should not be possible for 2534 * anyone else to have access to the ncp. We are holding the only 2535 * possible access point left (nchpp) spin-locked. 2536 * 2537 * If someone other then us has a ref or we have children 2538 * we cannot zap the entry. The 1->0 transition and any 2539 * further list operation is protected by the spinlocks 2540 * we have acquired but other transitions are not. 
2541 */ 2542 for (;;) { 2543 refs = ncp->nc_refs; 2544 cpu_ccfence(); 2545 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list)) 2546 break; 2547 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) { 2548 if (par) { 2549 spin_unlock(&nchpp->spin); 2550 _cache_put(par); 2551 } 2552 _cache_unlock(ncp); 2553 return(NULL); 2554 } 2555 cpu_pause(); 2556 } 2557 2558 /* 2559 * We are the only ref and with the spinlocks held no further 2560 * refs can be acquired by others. 2561 * 2562 * Remove us from the hash list and parent list. We have to 2563 * drop a ref on the parent's vp if the parent's list becomes 2564 * empty. 2565 */ 2566 dropvp = NULL; 2567 if (par) { 2568 KKASSERT(nchpp == ncp->nc_head); 2569 LIST_REMOVE(ncp, nc_hash); 2570 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2571 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list)) 2572 dropvp = par->nc_vp; 2573 ncp->nc_head = NULL; 2574 ncp->nc_parent = NULL; 2575 spin_unlock(&nchpp->spin); 2576 _cache_unlock(par); 2577 } else { 2578 KKASSERT(ncp->nc_head == NULL); 2579 } 2580 2581 /* 2582 * ncp should not have picked up any refs. Physically 2583 * destroy the ncp. 2584 */ 2585 if (ncp->nc_refs != 1) { 2586 int save_refs = ncp->nc_refs; 2587 cpu_ccfence(); 2588 panic("cache_zap: %p bad refs %d (%d)\n", 2589 ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0)); 2590 } 2591 KKASSERT(ncp->nc_refs == 1); 2592 /* _cache_unlock(ncp) not required */ 2593 ncp->nc_refs = -1; /* safety */ 2594 if (ncp->nc_name) 2595 kfree(ncp->nc_name, M_VFSCACHE); 2596 kfree(ncp, M_VFSCACHE); 2597 2598 /* 2599 * Delayed drop (we had to release our spinlocks) 2600 * 2601 * The refed parent (if not NULL) must be dropped. The 2602 * caller is responsible for looping. 2603 */ 2604 if (dropvp) 2605 vdrop(dropvp); 2606 return(par); 2607 } 2608 2609 /* 2610 * Clean up dangling negative cache and defered-drop entries in the 2611 * namecache. 2612 * 2613 * This routine is called in the critical path and also called from 2614 * vnlru(). When called from vnlru we use a lower limit to try to 2615 * deal with the negative cache before the critical path has to start 2616 * dealing with it. 2617 */ 2618 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2619 2620 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2621 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2622 2623 void 2624 cache_hysteresis(int critpath) 2625 { 2626 int poslimit; 2627 int neglimit = desiredvnodes / ncnegfactor; 2628 int xnumcache = numcache; 2629 2630 if (critpath == 0) 2631 neglimit = neglimit * 8 / 10; 2632 2633 /* 2634 * Don't cache too many negative hits. We use hysteresis to reduce 2635 * the impact on the critical path. 2636 */ 2637 switch(neg_cache_hysteresis_state[critpath]) { 2638 case CHI_LOW: 2639 if (numneg > MINNEG && numneg > neglimit) { 2640 if (critpath) 2641 _cache_cleanneg(ncnegflush); 2642 else 2643 _cache_cleanneg(ncnegflush + 2644 numneg - neglimit); 2645 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2646 } 2647 break; 2648 case CHI_HIGH: 2649 if (numneg > MINNEG * 9 / 10 && 2650 numneg * 9 / 10 > neglimit 2651 ) { 2652 if (critpath) 2653 _cache_cleanneg(ncnegflush); 2654 else 2655 _cache_cleanneg(ncnegflush + 2656 numneg * 9 / 10 - neglimit); 2657 } else { 2658 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2659 } 2660 break; 2661 } 2662 2663 /* 2664 * Don't cache too many positive hits. We use hysteresis to reduce 2665 * the impact on the critical path. 
2666 * 2667 * Excessive positive hits can accumulate due to large numbers of 2668 * hardlinks (the vnode cache will not prevent hl ncps from growing 2669 * into infinity). 2670 */ 2671 if ((poslimit = ncposlimit) == 0) 2672 poslimit = desiredvnodes * 2; 2673 if (critpath == 0) 2674 poslimit = poslimit * 8 / 10; 2675 2676 switch(pos_cache_hysteresis_state[critpath]) { 2677 case CHI_LOW: 2678 if (xnumcache > poslimit && xnumcache > MINPOS) { 2679 if (critpath) 2680 _cache_cleanpos(ncposflush); 2681 else 2682 _cache_cleanpos(ncposflush + 2683 xnumcache - poslimit); 2684 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2685 } 2686 break; 2687 case CHI_HIGH: 2688 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2689 if (critpath) 2690 _cache_cleanpos(ncposflush); 2691 else 2692 _cache_cleanpos(ncposflush + 2693 xnumcache - poslimit * 5 / 6); 2694 } else { 2695 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2696 } 2697 break; 2698 } 2699 2700 /* 2701 * Clean out dangling defered-zap ncps which could not 2702 * be cleanly dropped if too many build up. Note 2703 * that numdefered is not an exact number as such ncps 2704 * can be reused and the counter is not handled in a MP 2705 * safe manner by design. 2706 */ 2707 if (numdefered > neglimit) { 2708 _cache_cleandefered(); 2709 } 2710 } 2711 2712 /* 2713 * NEW NAMECACHE LOOKUP API 2714 * 2715 * Lookup an entry in the namecache. The passed par_nch must be referenced 2716 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2717 * is ALWAYS returned, eve if the supplied component is illegal. 2718 * 2719 * The resulting namecache entry should be returned to the system with 2720 * cache_put() or cache_unlock() + cache_drop(). 2721 * 2722 * namecache locks are recursive but care must be taken to avoid lock order 2723 * reversals (hence why the passed par_nch must be unlocked). Locking 2724 * rules are to order for parent traversals, not for child traversals. 2725 * 2726 * Nobody else will be able to manipulate the associated namespace (e.g. 2727 * create, delete, rename, rename-target) until the caller unlocks the 2728 * entry. 2729 * 2730 * The returned entry will be in one of three states: positive hit (non-null 2731 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2732 * Unresolved entries must be resolved through the filesystem to associate the 2733 * vnode and/or determine whether a positive or negative hit has occured. 2734 * 2735 * It is not necessary to lock a directory in order to lock namespace under 2736 * that directory. In fact, it is explicitly not allowed to do that. A 2737 * directory is typically only locked when being created, renamed, or 2738 * destroyed. 2739 * 2740 * The directory (par) may be unresolved, in which case any returned child 2741 * will likely also be marked unresolved. Likely but not guarenteed. Since 2742 * the filesystem lookup requires a resolved directory vnode the caller is 2743 * responsible for resolving the namecache chain top-down. This API 2744 * specifically allows whole chains to be created in an unresolved state. 
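 *
 * A minimal illustrative sketch of looking up one path component
 * ('par_nch', 'name' and 'cred' are assumed to be supplied by the
 * caller):
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *	int error;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	... use the locked, referenced entry ...
 *	cache_put(&nch);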
2745 */ 2746 struct nchandle 2747 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2748 { 2749 struct nchandle nch; 2750 struct namecache *ncp; 2751 struct namecache *new_ncp; 2752 struct nchash_head *nchpp; 2753 struct mount *mp; 2754 u_int32_t hash; 2755 globaldata_t gd; 2756 int par_locked; 2757 2758 numcalls++; 2759 gd = mycpu; 2760 mp = par_nch->mount; 2761 par_locked = 0; 2762 2763 /* 2764 * This is a good time to call it, no ncp's are locked by 2765 * the caller or us. 2766 */ 2767 cache_hysteresis(1); 2768 2769 /* 2770 * Try to locate an existing entry 2771 */ 2772 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2773 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2774 new_ncp = NULL; 2775 nchpp = NCHHASH(hash); 2776 restart: 2777 if (new_ncp) 2778 spin_lock(&nchpp->spin); 2779 else 2780 spin_lock_shared(&nchpp->spin); 2781 2782 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 2783 numchecks++; 2784 2785 /* 2786 * Break out if we find a matching entry. Note that 2787 * UNRESOLVED entries may match, but DESTROYED entries 2788 * do not. 2789 */ 2790 if (ncp->nc_parent == par_nch->ncp && 2791 ncp->nc_nlen == nlc->nlc_namelen && 2792 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 2793 (ncp->nc_flag & NCF_DESTROYED) == 0 2794 ) { 2795 _cache_hold(ncp); 2796 if (new_ncp) 2797 spin_unlock(&nchpp->spin); 2798 else 2799 spin_unlock_shared(&nchpp->spin); 2800 if (par_locked) { 2801 _cache_unlock(par_nch->ncp); 2802 par_locked = 0; 2803 } 2804 if (_cache_lock_special(ncp) == 0) { 2805 /* 2806 * Successfully locked but we must re-test 2807 * conditions that might have changed since 2808 * we did not have the lock before. 2809 */ 2810 if (ncp->nc_parent != par_nch->ncp || 2811 ncp->nc_nlen != nlc->nlc_namelen || 2812 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2813 ncp->nc_nlen) || 2814 (ncp->nc_flag & NCF_DESTROYED)) { 2815 _cache_put(ncp); 2816 goto restart; 2817 } 2818 _cache_auto_unresolve(mp, ncp); 2819 if (new_ncp) 2820 _cache_free(new_ncp); 2821 goto found; 2822 } 2823 _cache_get(ncp); /* cycle the lock to block */ 2824 _cache_put(ncp); 2825 _cache_drop(ncp); 2826 goto restart; 2827 } 2828 } 2829 2830 /* 2831 * We failed to locate an entry, create a new entry and add it to 2832 * the cache. The parent ncp must also be locked so we 2833 * can link into it. 2834 * 2835 * We have to relookup after possibly blocking in kmalloc or 2836 * when locking par_nch. 2837 * 2838 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 2839 * mount case, in which case nc_name will be NULL. 2840 */ 2841 if (new_ncp == NULL) { 2842 spin_unlock_shared(&nchpp->spin); 2843 new_ncp = cache_alloc(nlc->nlc_namelen); 2844 if (nlc->nlc_namelen) { 2845 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 2846 nlc->nlc_namelen); 2847 new_ncp->nc_name[nlc->nlc_namelen] = 0; 2848 } 2849 goto restart; 2850 } 2851 2852 /* 2853 * NOTE! The spinlock is held exclusively here because new_ncp 2854 * is non-NULL. 2855 */ 2856 if (par_locked == 0) { 2857 spin_unlock(&nchpp->spin); 2858 _cache_lock(par_nch->ncp); 2859 par_locked = 1; 2860 goto restart; 2861 } 2862 2863 /* 2864 * WARNING! We still hold the spinlock. We have to set the hash 2865 * table entry atomically. 
2866 */ 2867 ncp = new_ncp; 2868 _cache_link_parent(ncp, par_nch->ncp, nchpp); 2869 spin_unlock(&nchpp->spin); 2870 _cache_unlock(par_nch->ncp); 2871 /* par_locked = 0 - not used */ 2872 found: 2873 /* 2874 * stats and namecache size management 2875 */ 2876 if (ncp->nc_flag & NCF_UNRESOLVED) 2877 ++gd->gd_nchstats->ncs_miss; 2878 else if (ncp->nc_vp) 2879 ++gd->gd_nchstats->ncs_goodhits; 2880 else 2881 ++gd->gd_nchstats->ncs_neghits; 2882 nch.mount = mp; 2883 nch.ncp = ncp; 2884 atomic_add_int(&nch.mount->mnt_refs, 1); 2885 return(nch); 2886 } 2887 2888 /* 2889 * Attempt to lookup a namecache entry and return with a shared namecache 2890 * lock. 2891 */ 2892 int 2893 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, 2894 int excl, struct nchandle *res_nch) 2895 { 2896 struct namecache *ncp; 2897 struct nchash_head *nchpp; 2898 struct mount *mp; 2899 u_int32_t hash; 2900 globaldata_t gd; 2901 2902 /* 2903 * If exclusive requested or shared namecache locks are disabled, 2904 * return failure. 2905 */ 2906 if (ncp_shared_lock_disable || excl) 2907 return(EWOULDBLOCK); 2908 2909 numcalls++; 2910 gd = mycpu; 2911 mp = par_nch->mount; 2912 2913 /* 2914 * This is a good time to call it, no ncp's are locked by 2915 * the caller or us. 2916 */ 2917 cache_hysteresis(1); 2918 2919 /* 2920 * Try to locate an existing entry 2921 */ 2922 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2923 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2924 nchpp = NCHHASH(hash); 2925 2926 spin_lock_shared(&nchpp->spin); 2927 2928 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 2929 numchecks++; 2930 2931 /* 2932 * Break out if we find a matching entry. Note that 2933 * UNRESOLVED entries may match, but DESTROYED entries 2934 * do not. 2935 */ 2936 if (ncp->nc_parent == par_nch->ncp && 2937 ncp->nc_nlen == nlc->nlc_namelen && 2938 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 2939 (ncp->nc_flag & NCF_DESTROYED) == 0 2940 ) { 2941 _cache_hold(ncp); 2942 spin_unlock_shared(&nchpp->spin); 2943 if (_cache_lock_shared_special(ncp) == 0) { 2944 if (ncp->nc_parent == par_nch->ncp && 2945 ncp->nc_nlen == nlc->nlc_namelen && 2946 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2947 ncp->nc_nlen) == 0 && 2948 (ncp->nc_flag & NCF_DESTROYED) == 0 && 2949 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2950 _cache_auto_unresolve_test(mp, ncp) == 0) { 2951 goto found; 2952 } 2953 _cache_unlock(ncp); 2954 } 2955 _cache_drop(ncp); 2956 spin_lock_shared(&nchpp->spin); 2957 break; 2958 } 2959 } 2960 2961 /* 2962 * Failure 2963 */ 2964 spin_unlock_shared(&nchpp->spin); 2965 return(EWOULDBLOCK); 2966 2967 /* 2968 * Success 2969 * 2970 * Note that nc_error might be non-zero (e.g ENOENT). 2971 */ 2972 found: 2973 res_nch->mount = mp; 2974 res_nch->ncp = ncp; 2975 ++gd->gd_nchstats->ncs_goodhits; 2976 atomic_add_int(&res_nch->mount->mnt_refs, 1); 2977 2978 KKASSERT(ncp->nc_error != EWOULDBLOCK); 2979 return(ncp->nc_error); 2980 } 2981 2982 /* 2983 * This is a non-blocking verison of cache_nlookup() used by 2984 * nfs_readdirplusrpc_uio(). It can fail for any reason and 2985 * will return nch.ncp == NULL in that case. 
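 *
 * An illustrative sketch of the expected calling pattern (the caller
 * must be prepared for failure):
 *
 *	nch = cache_nlookup_nonblock(&par_nch, &nlc);
 *	if (nch.ncp == NULL) {
 *		... fall back to a blocking lookup or skip the entry ...
 *	} else {
 *		... use the locked, referenced entry ...
 *		cache_put(&nch);
 *	}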
2986 */ 2987 struct nchandle 2988 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 2989 { 2990 struct nchandle nch; 2991 struct namecache *ncp; 2992 struct namecache *new_ncp; 2993 struct nchash_head *nchpp; 2994 struct mount *mp; 2995 u_int32_t hash; 2996 globaldata_t gd; 2997 int par_locked; 2998 2999 numcalls++; 3000 gd = mycpu; 3001 mp = par_nch->mount; 3002 par_locked = 0; 3003 3004 /* 3005 * Try to locate an existing entry 3006 */ 3007 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3008 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3009 new_ncp = NULL; 3010 nchpp = NCHHASH(hash); 3011 restart: 3012 spin_lock(&nchpp->spin); 3013 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 3014 numchecks++; 3015 3016 /* 3017 * Break out if we find a matching entry. Note that 3018 * UNRESOLVED entries may match, but DESTROYED entries 3019 * do not. 3020 */ 3021 if (ncp->nc_parent == par_nch->ncp && 3022 ncp->nc_nlen == nlc->nlc_namelen && 3023 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3024 (ncp->nc_flag & NCF_DESTROYED) == 0 3025 ) { 3026 _cache_hold(ncp); 3027 spin_unlock(&nchpp->spin); 3028 if (par_locked) { 3029 _cache_unlock(par_nch->ncp); 3030 par_locked = 0; 3031 } 3032 if (_cache_lock_special(ncp) == 0) { 3033 if (ncp->nc_parent != par_nch->ncp || 3034 ncp->nc_nlen != nlc->nlc_namelen || 3035 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3036 (ncp->nc_flag & NCF_DESTROYED)) { 3037 kprintf("cache_lookup_nonblock: " 3038 "ncp-race %p %*.*s\n", 3039 ncp, 3040 nlc->nlc_namelen, 3041 nlc->nlc_namelen, 3042 nlc->nlc_nameptr); 3043 _cache_unlock(ncp); 3044 _cache_drop(ncp); 3045 goto failed; 3046 } 3047 _cache_auto_unresolve(mp, ncp); 3048 if (new_ncp) { 3049 _cache_free(new_ncp); 3050 new_ncp = NULL; 3051 } 3052 goto found; 3053 } 3054 _cache_drop(ncp); 3055 goto failed; 3056 } 3057 } 3058 3059 /* 3060 * We failed to locate an entry, create a new entry and add it to 3061 * the cache. The parent ncp must also be locked so we 3062 * can link into it. 3063 * 3064 * We have to relookup after possibly blocking in kmalloc or 3065 * when locking par_nch. 3066 * 3067 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3068 * mount case, in which case nc_name will be NULL. 3069 */ 3070 if (new_ncp == NULL) { 3071 spin_unlock(&nchpp->spin); 3072 new_ncp = cache_alloc(nlc->nlc_namelen); 3073 if (nlc->nlc_namelen) { 3074 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3075 nlc->nlc_namelen); 3076 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3077 } 3078 goto restart; 3079 } 3080 if (par_locked == 0) { 3081 spin_unlock(&nchpp->spin); 3082 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3083 par_locked = 1; 3084 goto restart; 3085 } 3086 goto failed; 3087 } 3088 3089 /* 3090 * WARNING! We still hold the spinlock. We have to set the hash 3091 * table entry atomically. 
3092 */ 3093 ncp = new_ncp; 3094 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3095 spin_unlock(&nchpp->spin); 3096 _cache_unlock(par_nch->ncp); 3097 /* par_locked = 0 - not used */ 3098 found: 3099 /* 3100 * stats and namecache size management 3101 */ 3102 if (ncp->nc_flag & NCF_UNRESOLVED) 3103 ++gd->gd_nchstats->ncs_miss; 3104 else if (ncp->nc_vp) 3105 ++gd->gd_nchstats->ncs_goodhits; 3106 else 3107 ++gd->gd_nchstats->ncs_neghits; 3108 nch.mount = mp; 3109 nch.ncp = ncp; 3110 atomic_add_int(&nch.mount->mnt_refs, 1); 3111 return(nch); 3112 failed: 3113 if (new_ncp) { 3114 _cache_free(new_ncp); 3115 new_ncp = NULL; 3116 } 3117 nch.mount = NULL; 3118 nch.ncp = NULL; 3119 return(nch); 3120 } 3121 3122 /* 3123 * The namecache entry is marked as being used as a mount point. 3124 * Locate the mount if it is visible to the caller. The DragonFly 3125 * mount system allows arbitrary loops in the topology and disentangles 3126 * those loops by matching against (mp, ncp) rather than just (ncp). 3127 * This means any given ncp can dive any number of mounts, depending 3128 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3129 * 3130 * We use a very simple frontend cache to reduce SMP conflicts, 3131 * which we have to do because the mountlist scan needs an exclusive 3132 * lock around its ripout info list. Not to mention that there might 3133 * be a lot of mounts. 3134 */ 3135 struct findmount_info { 3136 struct mount *result; 3137 struct mount *nch_mount; 3138 struct namecache *nch_ncp; 3139 }; 3140 3141 static 3142 struct ncmount_cache * 3143 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3144 { 3145 int hash; 3146 3147 hash = ((int)(intptr_t)mp / sizeof(*mp)) ^ 3148 ((int)(intptr_t)ncp / sizeof(*ncp)); 3149 hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE; 3150 return (&ncmount_cache[hash]); 3151 } 3152 3153 static 3154 int 3155 cache_findmount_callback(struct mount *mp, void *data) 3156 { 3157 struct findmount_info *info = data; 3158 3159 /* 3160 * Check the mount's mounted-on point against the passed nch. 
3161 */ 3162 if (mp->mnt_ncmounton.mount == info->nch_mount && 3163 mp->mnt_ncmounton.ncp == info->nch_ncp 3164 ) { 3165 info->result = mp; 3166 atomic_add_int(&mp->mnt_refs, 1); 3167 return(-1); 3168 } 3169 return(0); 3170 } 3171 3172 struct mount * 3173 cache_findmount(struct nchandle *nch) 3174 { 3175 struct findmount_info info; 3176 struct ncmount_cache *ncc; 3177 struct mount *mp; 3178 3179 /* 3180 * Fast 3181 */ 3182 if (ncmount_cache_enable == 0) { 3183 ncc = NULL; 3184 goto skip; 3185 } 3186 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3187 if (ncc->ncp == nch->ncp) { 3188 spin_lock_shared(&ncc->spin); 3189 if (ncc->isneg == 0 && 3190 ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) { 3191 if (mp->mnt_ncmounton.mount == nch->mount && 3192 mp->mnt_ncmounton.ncp == nch->ncp) { 3193 /* 3194 * Cache hit (positive) 3195 */ 3196 atomic_add_int(&mp->mnt_refs, 1); 3197 spin_unlock_shared(&ncc->spin); 3198 ++ncmount_cache_hit; 3199 return(mp); 3200 } 3201 /* else cache miss */ 3202 } 3203 if (ncc->isneg && 3204 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3205 /* 3206 * Cache hit (negative) 3207 */ 3208 spin_unlock_shared(&ncc->spin); 3209 ++ncmount_cache_hit; 3210 return(NULL); 3211 } 3212 spin_unlock_shared(&ncc->spin); 3213 } 3214 skip: 3215 3216 /* 3217 * Slow 3218 */ 3219 info.result = NULL; 3220 info.nch_mount = nch->mount; 3221 info.nch_ncp = nch->ncp; 3222 mountlist_scan(cache_findmount_callback, &info, 3223 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 3224 3225 /* 3226 * Cache the result. 3227 * 3228 * Negative lookups: We cache the originating {ncp,mp}. (mp) is 3229 * only used for pointer comparisons and is not 3230 * referenced (otherwise there would be dangling 3231 * refs). 3232 * 3233 * Positive lookups: We cache the originating {ncp} and the target 3234 * (mp). (mp) is referenced. 3235 * 3236 * Indeterminant: If the match is undergoing an unmount we do 3237 * not cache it to avoid racing cache_unmounting(), 3238 * but still return the match. 
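 *
 * Usage note (illustrative): a caller that receives a non-NULL mount
 * from cache_findmount() owns a reference on it and is expected to
 * release that reference with cache_dropmount(), e.g.:
 *
 *	if ((mp = cache_findmount(&nch)) != NULL) {
 *		... dive into mp->mnt_ncmountpt ...
 *		cache_dropmount(mp);
 *	}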
3239 */ 3240 if (ncc) { 3241 spin_lock(&ncc->spin); 3242 if (info.result == NULL) { 3243 if (ncc->isneg == 0 && ncc->mp) 3244 atomic_add_int(&ncc->mp->mnt_refs, -1); 3245 ncc->ncp = nch->ncp; 3246 ncc->mp = nch->mount; 3247 ncc->isneg = 1; 3248 spin_unlock(&ncc->spin); 3249 ++ncmount_cache_overwrite; 3250 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) { 3251 if (ncc->isneg == 0 && ncc->mp) 3252 atomic_add_int(&ncc->mp->mnt_refs, -1); 3253 atomic_add_int(&info.result->mnt_refs, 1); 3254 ncc->ncp = nch->ncp; 3255 ncc->mp = info.result; 3256 ncc->isneg = 0; 3257 spin_unlock(&ncc->spin); 3258 ++ncmount_cache_overwrite; 3259 } else { 3260 spin_unlock(&ncc->spin); 3261 } 3262 ++ncmount_cache_miss; 3263 } 3264 return(info.result); 3265 } 3266 3267 void 3268 cache_dropmount(struct mount *mp) 3269 { 3270 atomic_add_int(&mp->mnt_refs, -1); 3271 } 3272 3273 void 3274 cache_ismounting(struct mount *mp) 3275 { 3276 struct nchandle *nch = &mp->mnt_ncmounton; 3277 struct ncmount_cache *ncc; 3278 3279 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3280 if (ncc->isneg && 3281 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3282 spin_lock(&ncc->spin); 3283 if (ncc->isneg && 3284 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3285 ncc->ncp = NULL; 3286 ncc->mp = NULL; 3287 } 3288 spin_unlock(&ncc->spin); 3289 } 3290 } 3291 3292 void 3293 cache_unmounting(struct mount *mp) 3294 { 3295 struct nchandle *nch = &mp->mnt_ncmounton; 3296 struct ncmount_cache *ncc; 3297 3298 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3299 if (ncc->isneg == 0 && 3300 ncc->ncp == nch->ncp && ncc->mp == mp) { 3301 spin_lock(&ncc->spin); 3302 if (ncc->isneg == 0 && 3303 ncc->ncp == nch->ncp && ncc->mp == mp) { 3304 atomic_add_int(&mp->mnt_refs, -1); 3305 ncc->ncp = NULL; 3306 ncc->mp = NULL; 3307 } 3308 spin_unlock(&ncc->spin); 3309 } 3310 } 3311 3312 /* 3313 * Resolve an unresolved namecache entry, generally by looking it up. 3314 * The passed ncp must be locked and refd. 3315 * 3316 * Theoretically since a vnode cannot be recycled while held, and since 3317 * the nc_parent chain holds its vnode as long as children exist, the 3318 * direct parent of the cache entry we are trying to resolve should 3319 * have a valid vnode. If not then generate an error that we can 3320 * determine is related to a resolver bug. 3321 * 3322 * However, if a vnode was in the middle of a recyclement when the NCP 3323 * got locked, ncp->nc_vp might point to a vnode that is about to become 3324 * invalid. cache_resolve() handles this case by unresolving the entry 3325 * and then re-resolving it. 3326 * 3327 * Note that successful resolution does not necessarily return an error 3328 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3329 * will be returned. 3330 */ 3331 int 3332 cache_resolve(struct nchandle *nch, struct ucred *cred) 3333 { 3334 struct namecache *par_tmp; 3335 struct namecache *par; 3336 struct namecache *ncp; 3337 struct nchandle nctmp; 3338 struct mount *mp; 3339 struct vnode *dvp; 3340 int error; 3341 3342 ncp = nch->ncp; 3343 mp = nch->mount; 3344 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3345 restart: 3346 /* 3347 * If the ncp is already resolved we have nothing to do. However, 3348 * we do want to guarentee that a usable vnode is returned when 3349 * a vnode is present, so make sure it hasn't been reclaimed. 
3350 */ 3351 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3352 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3353 _cache_setunresolved(ncp); 3354 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3355 return (ncp->nc_error); 3356 } 3357 3358 /* 3359 * If the ncp was destroyed it will never resolve again. This 3360 * can basically only happen when someone is chdir'd into an 3361 * empty directory which is then rmdir'd. We want to catch this 3362 * here and not dive the VFS because the VFS might actually 3363 * have a way to re-resolve the disconnected ncp, which will 3364 * result in inconsistencies in the cdir/nch for proc->p_fd. 3365 */ 3366 if (ncp->nc_flag & NCF_DESTROYED) 3367 return(EINVAL); 3368 3369 /* 3370 * Mount points need special handling because the parent does not 3371 * belong to the same filesystem as the ncp. 3372 */ 3373 if (ncp == mp->mnt_ncmountpt.ncp) 3374 return (cache_resolve_mp(mp)); 3375 3376 /* 3377 * We expect an unbroken chain of ncps to at least the mount point, 3378 * and even all the way to root (but this code doesn't have to go 3379 * past the mount point). 3380 */ 3381 if (ncp->nc_parent == NULL) { 3382 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 3383 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3384 ncp->nc_error = EXDEV; 3385 return(ncp->nc_error); 3386 } 3387 3388 /* 3389 * The vp's of the parent directories in the chain are held via vhold() 3390 * due to the existance of the child, and should not disappear. 3391 * However, there are cases where they can disappear: 3392 * 3393 * - due to filesystem I/O errors. 3394 * - due to NFS being stupid about tracking the namespace and 3395 * destroys the namespace for entire directories quite often. 3396 * - due to forced unmounts. 3397 * - due to an rmdir (parent will be marked DESTROYED) 3398 * 3399 * When this occurs we have to track the chain backwards and resolve 3400 * it, looping until the resolver catches up to the current node. We 3401 * could recurse here but we might run ourselves out of kernel stack 3402 * so we do it in a more painful manner. This situation really should 3403 * not occur all that often, or if it does not have to go back too 3404 * many nodes to resolve the ncp. 3405 */ 3406 while ((dvp = cache_dvpref(ncp)) == NULL) { 3407 /* 3408 * This case can occur if a process is CD'd into a 3409 * directory which is then rmdir'd. If the parent is marked 3410 * destroyed there is no point trying to resolve it. 3411 */ 3412 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 3413 return(ENOENT); 3414 par = ncp->nc_parent; 3415 _cache_hold(par); 3416 _cache_lock(par); 3417 while ((par_tmp = par->nc_parent) != NULL && 3418 par_tmp->nc_vp == NULL) { 3419 _cache_hold(par_tmp); 3420 _cache_lock(par_tmp); 3421 _cache_put(par); 3422 par = par_tmp; 3423 } 3424 if (par->nc_parent == NULL) { 3425 kprintf("EXDEV case 2 %*.*s\n", 3426 par->nc_nlen, par->nc_nlen, par->nc_name); 3427 _cache_put(par); 3428 return (EXDEV); 3429 } 3430 /* 3431 * The parent is not set in stone, ref and lock it to prevent 3432 * it from disappearing. Also note that due to renames it 3433 * is possible for our ncp to move and for par to no longer 3434 * be one of its parents. We resolve it anyway, the loop 3435 * will handle any moves. 
3436 */ 3437 _cache_get(par); /* additional hold/lock */ 3438 _cache_put(par); /* from earlier hold/lock */ 3439 if (par == nch->mount->mnt_ncmountpt.ncp) { 3440 cache_resolve_mp(nch->mount); 3441 } else if ((dvp = cache_dvpref(par)) == NULL) { 3442 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name); 3443 _cache_put(par); 3444 continue; 3445 } else { 3446 if (par->nc_flag & NCF_UNRESOLVED) { 3447 nctmp.mount = mp; 3448 nctmp.ncp = par; 3449 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3450 } 3451 vrele(dvp); 3452 } 3453 if ((error = par->nc_error) != 0) { 3454 if (par->nc_error != EAGAIN) { 3455 kprintf("EXDEV case 3 %*.*s error %d\n", 3456 par->nc_nlen, par->nc_nlen, par->nc_name, 3457 par->nc_error); 3458 _cache_put(par); 3459 return(error); 3460 } 3461 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3462 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3463 } 3464 _cache_put(par); 3465 /* loop */ 3466 } 3467 3468 /* 3469 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3470 * ncp's and reattach them. If this occurs the original ncp is marked 3471 * EAGAIN to force a relookup. 3472 * 3473 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3474 * ncp must already be resolved. 3475 */ 3476 if (dvp) { 3477 nctmp.mount = mp; 3478 nctmp.ncp = ncp; 3479 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3480 vrele(dvp); 3481 } else { 3482 ncp->nc_error = EPERM; 3483 } 3484 if (ncp->nc_error == EAGAIN) { 3485 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3486 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3487 goto restart; 3488 } 3489 return(ncp->nc_error); 3490 } 3491 3492 /* 3493 * Resolve the ncp associated with a mount point. Such ncp's almost always 3494 * remain resolved and this routine is rarely called. NFS MPs tends to force 3495 * re-resolution more often due to its mac-truck-smash-the-namecache 3496 * method of tracking namespace changes. 3497 * 3498 * The semantics for this call is that the passed ncp must be locked on 3499 * entry and will be locked on return. However, if we actually have to 3500 * resolve the mount point we temporarily unlock the entry in order to 3501 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3502 * the unlock we have to recheck the flags after we relock. 3503 */ 3504 static int 3505 cache_resolve_mp(struct mount *mp) 3506 { 3507 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3508 struct vnode *vp; 3509 int error; 3510 3511 KKASSERT(mp != NULL); 3512 3513 /* 3514 * If the ncp is already resolved we have nothing to do. However, 3515 * we do want to guarentee that a usable vnode is returned when 3516 * a vnode is present, so make sure it hasn't been reclaimed. 3517 */ 3518 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3519 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3520 _cache_setunresolved(ncp); 3521 } 3522 3523 if (ncp->nc_flag & NCF_UNRESOLVED) { 3524 _cache_unlock(ncp); 3525 while (vfs_busy(mp, 0)) 3526 ; 3527 error = VFS_ROOT(mp, &vp); 3528 _cache_lock(ncp); 3529 3530 /* 3531 * recheck the ncp state after relocking. 
3532 */ 3533 if (ncp->nc_flag & NCF_UNRESOLVED) { 3534 ncp->nc_error = error; 3535 if (error == 0) { 3536 _cache_setvp(mp, ncp, vp); 3537 vput(vp); 3538 } else { 3539 kprintf("[diagnostic] cache_resolve_mp: failed" 3540 " to resolve mount %p err=%d ncp=%p\n", 3541 mp, error, ncp); 3542 _cache_setvp(mp, ncp, NULL); 3543 } 3544 } else if (error == 0) { 3545 vput(vp); 3546 } 3547 vfs_unbusy(mp); 3548 } 3549 return(ncp->nc_error); 3550 } 3551 3552 /* 3553 * Clean out negative cache entries when too many have accumulated. 3554 */ 3555 static void 3556 _cache_cleanneg(int count) 3557 { 3558 struct namecache *ncp; 3559 3560 /* 3561 * Attempt to clean out the specified number of negative cache 3562 * entries. 3563 */ 3564 while (count) { 3565 spin_lock(&ncspin); 3566 ncp = TAILQ_FIRST(&ncneglist); 3567 if (ncp == NULL) { 3568 spin_unlock(&ncspin); 3569 break; 3570 } 3571 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode); 3572 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode); 3573 _cache_hold(ncp); 3574 spin_unlock(&ncspin); 3575 3576 /* 3577 * This can race, so we must re-check that the ncp 3578 * is on the ncneglist after successfully locking it. 3579 */ 3580 if (_cache_lock_special(ncp) == 0) { 3581 if (ncp->nc_vp == NULL && 3582 (ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3583 ncp = cache_zap(ncp, 1); 3584 if (ncp) 3585 _cache_drop(ncp); 3586 } else { 3587 kprintf("cache_cleanneg: race avoided\n"); 3588 _cache_unlock(ncp); 3589 } 3590 } else { 3591 _cache_drop(ncp); 3592 } 3593 --count; 3594 } 3595 } 3596 3597 /* 3598 * Clean out positive cache entries when too many have accumulated. 3599 */ 3600 static void 3601 _cache_cleanpos(int count) 3602 { 3603 static volatile int rover; 3604 struct nchash_head *nchpp; 3605 struct namecache *ncp; 3606 int rover_copy; 3607 3608 /* 3609 * Attempt to clean out the specified number of negative cache 3610 * entries. 3611 */ 3612 while (count) { 3613 rover_copy = ++rover; /* MPSAFEENOUGH */ 3614 cpu_ccfence(); 3615 nchpp = NCHHASH(rover_copy); 3616 3617 spin_lock_shared(&nchpp->spin); 3618 ncp = LIST_FIRST(&nchpp->list); 3619 while (ncp && (ncp->nc_flag & NCF_DESTROYED)) 3620 ncp = LIST_NEXT(ncp, nc_hash); 3621 if (ncp) 3622 _cache_hold(ncp); 3623 spin_unlock_shared(&nchpp->spin); 3624 3625 if (ncp) { 3626 if (_cache_lock_special(ncp) == 0) { 3627 ncp = cache_zap(ncp, 1); 3628 if (ncp) 3629 _cache_drop(ncp); 3630 } else { 3631 _cache_drop(ncp); 3632 } 3633 } 3634 --count; 3635 } 3636 } 3637 3638 /* 3639 * This is a kitchen sink function to clean out ncps which we 3640 * tried to zap from cache_drop() but failed because we were 3641 * unable to acquire the parent lock. 3642 * 3643 * Such entries can also be removed via cache_inval_vp(), such 3644 * as when unmounting. 
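 *
 * Implementation note (descriptive of the code below): the scan inserts
 * a dummy entry marked NCF_DESTROYED into each hash chain and uses it as
 * a placeholder, allowing the chain spinlock to be dropped while an
 * individual ncp is processed and the iteration to resume where it left
 * off.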
3645 */ 3646 static void 3647 _cache_cleandefered(void) 3648 { 3649 struct nchash_head *nchpp; 3650 struct namecache *ncp; 3651 struct namecache dummy; 3652 int i; 3653 3654 numdefered = 0; 3655 bzero(&dummy, sizeof(dummy)); 3656 dummy.nc_flag = NCF_DESTROYED; 3657 dummy.nc_refs = 1; 3658 3659 for (i = 0; i <= nchash; ++i) { 3660 nchpp = &nchashtbl[i]; 3661 3662 spin_lock(&nchpp->spin); 3663 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 3664 ncp = &dummy; 3665 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) { 3666 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 3667 continue; 3668 LIST_REMOVE(&dummy, nc_hash); 3669 LIST_INSERT_AFTER(ncp, &dummy, nc_hash); 3670 _cache_hold(ncp); 3671 spin_unlock(&nchpp->spin); 3672 if (_cache_lock_nonblock(ncp) == 0) { 3673 ncp->nc_flag &= ~NCF_DEFEREDZAP; 3674 _cache_unlock(ncp); 3675 } 3676 _cache_drop(ncp); 3677 spin_lock(&nchpp->spin); 3678 ncp = &dummy; 3679 } 3680 LIST_REMOVE(&dummy, nc_hash); 3681 spin_unlock(&nchpp->spin); 3682 } 3683 } 3684 3685 /* 3686 * Name cache initialization, from vfsinit() when we are booting 3687 */ 3688 void 3689 nchinit(void) 3690 { 3691 int i; 3692 globaldata_t gd; 3693 3694 /* initialise per-cpu namecache effectiveness statistics. */ 3695 for (i = 0; i < ncpus; ++i) { 3696 gd = globaldata_find(i); 3697 gd->gd_nchstats = &nchstats[i]; 3698 } 3699 TAILQ_INIT(&ncneglist); 3700 spin_init(&ncspin, "nchinit"); 3701 nchashtbl = hashinit_ext(desiredvnodes / 2, 3702 sizeof(struct nchash_head), 3703 M_VFSCACHE, &nchash); 3704 for (i = 0; i <= (int)nchash; ++i) { 3705 LIST_INIT(&nchashtbl[i].list); 3706 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 3707 } 3708 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 3709 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 3710 nclockwarn = 5 * hz; 3711 } 3712 3713 /* 3714 * Called from start_init() to bootstrap the root filesystem. Returns 3715 * a referenced, unlocked namecache record. 3716 */ 3717 void 3718 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 3719 { 3720 nch->ncp = cache_alloc(0); 3721 nch->mount = mp; 3722 atomic_add_int(&mp->mnt_refs, 1); 3723 if (vp) 3724 _cache_setvp(nch->mount, nch->ncp, vp); 3725 } 3726 3727 /* 3728 * vfs_cache_setroot() 3729 * 3730 * Create an association between the root of our namecache and 3731 * the root vnode. This routine may be called several times during 3732 * booting. 3733 * 3734 * If the caller intends to save the returned namecache pointer somewhere 3735 * it must cache_hold() it. 3736 */ 3737 void 3738 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 3739 { 3740 struct vnode *ovp; 3741 struct nchandle onch; 3742 3743 ovp = rootvnode; 3744 onch = rootnch; 3745 rootvnode = nvp; 3746 if (nch) 3747 rootnch = *nch; 3748 else 3749 cache_zero(&rootnch); 3750 if (ovp) 3751 vrele(ovp); 3752 if (onch.ncp) 3753 cache_drop(&onch); 3754 } 3755 3756 /* 3757 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 3758 * topology and is being removed as quickly as possible. The new VOP_N*() 3759 * API calls are required to make specific adjustments using the supplied 3760 * ncp pointers rather then just bogusly purging random vnodes. 3761 * 3762 * Invalidate all namecache entries to a particular vnode as well as 3763 * any direct children of that vnode in the namecache. This is a 3764 * 'catch all' purge used by filesystems that do not know any better. 
3765 * 3766 * Note that the linkage between the vnode and its namecache entries will 3767 * be removed, but the namecache entries themselves might stay put due to 3768 * active references from elsewhere in the system or due to the existance of 3769 * the children. The namecache topology is left intact even if we do not 3770 * know what the vnode association is. Such entries will be marked 3771 * NCF_UNRESOLVED. 3772 */ 3773 void 3774 cache_purge(struct vnode *vp) 3775 { 3776 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 3777 } 3778 3779 static int disablecwd; 3780 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 3781 "Disable getcwd"); 3782 3783 static u_long numcwdcalls; 3784 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0, 3785 "Number of current directory resolution calls"); 3786 static u_long numcwdfailnf; 3787 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0, 3788 "Number of current directory failures due to lack of file"); 3789 static u_long numcwdfailsz; 3790 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0, 3791 "Number of current directory failures due to large result"); 3792 static u_long numcwdfound; 3793 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0, 3794 "Number of current directory resolution successes"); 3795 3796 /* 3797 * MPALMOSTSAFE 3798 */ 3799 int 3800 sys___getcwd(struct __getcwd_args *uap) 3801 { 3802 u_int buflen; 3803 int error; 3804 char *buf; 3805 char *bp; 3806 3807 if (disablecwd) 3808 return (ENODEV); 3809 3810 buflen = uap->buflen; 3811 if (buflen == 0) 3812 return (EINVAL); 3813 if (buflen > MAXPATHLEN) 3814 buflen = MAXPATHLEN; 3815 3816 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 3817 bp = kern_getcwd(buf, buflen, &error); 3818 if (error == 0) 3819 error = copyout(bp, uap->buf, strlen(bp) + 1); 3820 kfree(buf, M_TEMP); 3821 return (error); 3822 } 3823 3824 char * 3825 kern_getcwd(char *buf, size_t buflen, int *error) 3826 { 3827 struct proc *p = curproc; 3828 char *bp; 3829 int i, slash_prefixed; 3830 struct filedesc *fdp; 3831 struct nchandle nch; 3832 struct namecache *ncp; 3833 3834 numcwdcalls++; 3835 bp = buf; 3836 bp += buflen - 1; 3837 *bp = '\0'; 3838 fdp = p->p_fd; 3839 slash_prefixed = 0; 3840 3841 nch = fdp->fd_ncdir; 3842 ncp = nch.ncp; 3843 if (ncp) 3844 _cache_hold(ncp); 3845 3846 while (ncp && (ncp != fdp->fd_nrdir.ncp || 3847 nch.mount != fdp->fd_nrdir.mount) 3848 ) { 3849 /* 3850 * While traversing upwards if we encounter the root 3851 * of the current mount we have to skip to the mount point 3852 * in the underlying filesystem. 3853 */ 3854 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 3855 nch = nch.mount->mnt_ncmounton; 3856 _cache_drop(ncp); 3857 ncp = nch.ncp; 3858 if (ncp) 3859 _cache_hold(ncp); 3860 continue; 3861 } 3862 3863 /* 3864 * Prepend the path segment 3865 */ 3866 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 3867 if (bp == buf) { 3868 numcwdfailsz++; 3869 *error = ERANGE; 3870 bp = NULL; 3871 goto done; 3872 } 3873 *--bp = ncp->nc_name[i]; 3874 } 3875 if (bp == buf) { 3876 numcwdfailsz++; 3877 *error = ERANGE; 3878 bp = NULL; 3879 goto done; 3880 } 3881 *--bp = '/'; 3882 slash_prefixed = 1; 3883 3884 /* 3885 * Go up a directory. This isn't a mount point so we don't 3886 * have to check again. 
3887 */ 3888 while ((nch.ncp = ncp->nc_parent) != NULL) { 3889 if (ncp_shared_lock_disable) 3890 _cache_lock(ncp); 3891 else 3892 _cache_lock_shared(ncp); 3893 if (nch.ncp != ncp->nc_parent) { 3894 _cache_unlock(ncp); 3895 continue; 3896 } 3897 _cache_hold(nch.ncp); 3898 _cache_unlock(ncp); 3899 break; 3900 } 3901 _cache_drop(ncp); 3902 ncp = nch.ncp; 3903 } 3904 if (ncp == NULL) { 3905 numcwdfailnf++; 3906 *error = ENOENT; 3907 bp = NULL; 3908 goto done; 3909 } 3910 if (!slash_prefixed) { 3911 if (bp == buf) { 3912 numcwdfailsz++; 3913 *error = ERANGE; 3914 bp = NULL; 3915 goto done; 3916 } 3917 *--bp = '/'; 3918 } 3919 numcwdfound++; 3920 *error = 0; 3921 done: 3922 if (ncp) 3923 _cache_drop(ncp); 3924 return (bp); 3925 } 3926 3927 /* 3928 * Thus begins the fullpath magic. 3929 * 3930 * The passed nchp is referenced but not locked. 3931 */ 3932 static int disablefullpath; 3933 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 3934 &disablefullpath, 0, 3935 "Disable fullpath lookups"); 3936 3937 static u_int numfullpathcalls; 3938 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD, 3939 &numfullpathcalls, 0, 3940 "Number of full path resolutions in progress"); 3941 static u_int numfullpathfailnf; 3942 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD, 3943 &numfullpathfailnf, 0, 3944 "Number of full path resolution failures due to lack of file"); 3945 static u_int numfullpathfailsz; 3946 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD, 3947 &numfullpathfailsz, 0, 3948 "Number of full path resolution failures due to insufficient memory"); 3949 static u_int numfullpathfound; 3950 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD, 3951 &numfullpathfound, 0, 3952 "Number of full path resolution successes"); 3953 3954 int 3955 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 3956 char **retbuf, char **freebuf, int guess) 3957 { 3958 struct nchandle fd_nrdir; 3959 struct nchandle nch; 3960 struct namecache *ncp; 3961 struct mount *mp, *new_mp; 3962 char *bp, *buf; 3963 int slash_prefixed; 3964 int error = 0; 3965 int i; 3966 3967 atomic_add_int(&numfullpathcalls, -1); 3968 3969 *retbuf = NULL; 3970 *freebuf = NULL; 3971 3972 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 3973 bp = buf + MAXPATHLEN - 1; 3974 *bp = '\0'; 3975 if (nchbase) 3976 fd_nrdir = *nchbase; 3977 else if (p != NULL) 3978 fd_nrdir = p->p_fd->fd_nrdir; 3979 else 3980 fd_nrdir = rootnch; 3981 slash_prefixed = 0; 3982 nch = *nchp; 3983 ncp = nch.ncp; 3984 if (ncp) 3985 _cache_hold(ncp); 3986 mp = nch.mount; 3987 3988 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 3989 new_mp = NULL; 3990 3991 /* 3992 * If we are asked to guess the upwards path, we do so whenever 3993 * we encounter an ncp marked as a mountpoint. We try to find 3994 * the actual mountpoint by finding the mountpoint with this 3995 * ncp. 3996 */ 3997 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 3998 new_mp = mount_get_by_nc(ncp); 3999 } 4000 /* 4001 * While traversing upwards if we encounter the root 4002 * of the current mount we have to skip to the mount point. 
4003 */ 4004 if (ncp == mp->mnt_ncmountpt.ncp) { 4005 new_mp = mp; 4006 } 4007 if (new_mp) { 4008 nch = new_mp->mnt_ncmounton; 4009 _cache_drop(ncp); 4010 ncp = nch.ncp; 4011 if (ncp) 4012 _cache_hold(ncp); 4013 mp = nch.mount; 4014 continue; 4015 } 4016 4017 /* 4018 * Prepend the path segment 4019 */ 4020 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4021 if (bp == buf) { 4022 numfullpathfailsz++; 4023 kfree(buf, M_TEMP); 4024 error = ENOMEM; 4025 goto done; 4026 } 4027 *--bp = ncp->nc_name[i]; 4028 } 4029 if (bp == buf) { 4030 numfullpathfailsz++; 4031 kfree(buf, M_TEMP); 4032 error = ENOMEM; 4033 goto done; 4034 } 4035 *--bp = '/'; 4036 slash_prefixed = 1; 4037 4038 /* 4039 * Go up a directory. This isn't a mount point so we don't 4040 * have to check again. 4041 * 4042 * We can only safely access nc_parent with ncp held locked. 4043 */ 4044 while ((nch.ncp = ncp->nc_parent) != NULL) { 4045 _cache_lock(ncp); 4046 if (nch.ncp != ncp->nc_parent) { 4047 _cache_unlock(ncp); 4048 continue; 4049 } 4050 _cache_hold(nch.ncp); 4051 _cache_unlock(ncp); 4052 break; 4053 } 4054 _cache_drop(ncp); 4055 ncp = nch.ncp; 4056 } 4057 if (ncp == NULL) { 4058 numfullpathfailnf++; 4059 kfree(buf, M_TEMP); 4060 error = ENOENT; 4061 goto done; 4062 } 4063 4064 if (!slash_prefixed) { 4065 if (bp == buf) { 4066 numfullpathfailsz++; 4067 kfree(buf, M_TEMP); 4068 error = ENOMEM; 4069 goto done; 4070 } 4071 *--bp = '/'; 4072 } 4073 numfullpathfound++; 4074 *retbuf = bp; 4075 *freebuf = buf; 4076 error = 0; 4077 done: 4078 if (ncp) 4079 _cache_drop(ncp); 4080 return(error); 4081 } 4082 4083 int 4084 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4085 char **freebuf, int guess) 4086 { 4087 struct namecache *ncp; 4088 struct nchandle nch; 4089 int error; 4090 4091 *freebuf = NULL; 4092 atomic_add_int(&numfullpathcalls, 1); 4093 if (disablefullpath) 4094 return (ENODEV); 4095 4096 if (p == NULL) 4097 return (EINVAL); 4098 4099 /* vn is NULL, client wants us to use p->p_textvp */ 4100 if (vn == NULL) { 4101 if ((vn = p->p_textvp) == NULL) 4102 return (EINVAL); 4103 } 4104 spin_lock_shared(&vn->v_spin); 4105 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4106 if (ncp->nc_nlen) 4107 break; 4108 } 4109 if (ncp == NULL) { 4110 spin_unlock_shared(&vn->v_spin); 4111 return (EINVAL); 4112 } 4113 _cache_hold(ncp); 4114 spin_unlock_shared(&vn->v_spin); 4115 4116 atomic_add_int(&numfullpathcalls, -1); 4117 nch.ncp = ncp; 4118 nch.mount = vn->v_mount; 4119 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4120 _cache_drop(ncp); 4121 return (error); 4122 } 4123
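/*
 * Illustrative example (not part of the original source): a typical
 * consumer of vn_fullpath() resolves a vnode to an absolute path and
 * must free the buffer returned via *freebuf:
 *
 *	char *fullpath;
 *	char *freepath;
 *	int error;
 *
 *	error = vn_fullpath(p, vp, &fullpath, &freepath, 0);
 *	if (error == 0) {
 *		... use fullpath ...
 *		kfree(freepath, M_TEMP);
 *	}
 */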