1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysmsg.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin.
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (6) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
		  "namecache", "namecache entries");
MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
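 *
 * As a rough sketch of the arithmetic (assuming 64-bit pointers): the
 * TAILQ head is 16 bytes, the spinlock 8 and the pad 8, giving 32 bytes
 * per bucket, so two buckets share each 64-byte cache line and no bucket
 * straddles one.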
152 */ 153 struct nchash_head { 154 struct nchash_list list; /* 16 bytes */ 155 struct spinlock spin; /* 8 bytes */ 156 long pad01; /* 8 bytes */ 157 }; 158 159 struct ncmount_cache { 160 struct spinlock spin; 161 struct namecache *ncp; 162 struct mount *mp; 163 struct mount *mp_target; 164 int isneg; 165 int ticks; 166 int updating; 167 int unused01; 168 }; 169 170 struct pcpu_ncache { 171 struct spinlock umount_spin; /* cache_findmount/interlock */ 172 struct spinlock neg_spin; /* for neg_list and neg_count */ 173 struct namecache_list neg_list; 174 long neg_count; 175 long vfscache_negs; 176 long vfscache_count; 177 long vfscache_leafs; 178 long numdefered; 179 } __cachealign; 180 181 __read_mostly static struct nchash_head *nchashtbl; 182 __read_mostly static struct pcpu_ncache *pcpu_ncache; 183 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 184 185 /* 186 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 187 * to create the namecache infrastructure leading to a dangling vnode. 188 * 189 * 0 Only errors are reported 190 * 1 Successes are reported 191 * 2 Successes + the whole directory scan is reported 192 * 3 Force the directory scan code run as if the parent vnode did not 193 * have a namecache record, even if it does have one. 194 */ 195 __read_mostly static int ncvp_debug; 196 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 197 "Namecache debug level (0-3)"); 198 199 __read_mostly static u_long nchash; /* size of hash table */ 200 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 201 "Size of namecache hash table"); 202 203 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 204 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 205 "Batch flush negative entries"); 206 207 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 208 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 209 "Batch flush positive entries"); 210 211 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 212 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 213 "Ratio of namecache negative entries"); 214 215 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 216 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 217 "Warn on locked namecache entries in ticks"); 218 219 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 220 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 221 "Number of cache entries allocated"); 222 223 __read_mostly static int ncp_shared_lock_disable = 0; 224 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 225 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 226 227 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 228 "sizeof(struct vnode)"); 229 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 230 "sizeof(struct namecache)"); 231 232 __read_mostly static int ncmount_cache_enable = 1; 233 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 234 &ncmount_cache_enable, 0, "mount point cache"); 235 236 static __inline void _cache_drop(struct namecache *ncp); 237 static int cache_resolve_mp(struct mount *mp); 238 static int cache_findmount_callback(struct mount *mp, void *data); 239 static void _cache_setunresolved(struct namecache *ncp); 240 static void _cache_cleanneg(long count); 241 static void _cache_cleanpos(long count); 242 
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);

/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of namecache leaf entries");
static long numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred namecache zaps");


struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
					sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
	    0, 0, sysctl_nchstats, "S,nchstats",
	    "VFS cache effectiveness statistics");

static void cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
300 */ 301 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 302 #define MNTCACHE_SET 8 /* set associativity */ 303 304 struct mntcache_elm { 305 struct namecache *ncp; 306 struct mount *mp; 307 int ticks; 308 int unused01; 309 }; 310 311 struct mntcache { 312 struct mntcache_elm array[MNTCACHE_COUNT]; 313 } __cachealign; 314 315 static struct mntcache pcpu_mntcache[MAXCPU]; 316 317 static __inline 318 struct mntcache_elm * 319 _cache_mntcache_hash(void *ptr) 320 { 321 struct mntcache_elm *elm; 322 int hv; 323 324 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 325 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 326 327 return elm; 328 } 329 330 static 331 void 332 _cache_mntref(struct mount *mp) 333 { 334 struct mntcache_elm *elm; 335 struct mount *mpr; 336 int i; 337 338 elm = _cache_mntcache_hash(mp); 339 for (i = 0; i < MNTCACHE_SET; ++i) { 340 if (elm->mp == mp) { 341 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 342 if (__predict_true(mpr == mp)) 343 return; 344 if (mpr) 345 atomic_add_int(&mpr->mnt_refs, -1); 346 } 347 ++elm; 348 } 349 atomic_add_int(&mp->mnt_refs, 1); 350 } 351 352 static 353 void 354 _cache_mntrel(struct mount *mp) 355 { 356 struct mntcache_elm *elm; 357 struct mntcache_elm *best; 358 struct mount *mpr; 359 int delta1; 360 int delta2; 361 int i; 362 363 elm = _cache_mntcache_hash(mp); 364 best = elm; 365 for (i = 0; i < MNTCACHE_SET; ++i) { 366 if (elm->mp == NULL) { 367 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 368 if (__predict_false(mpr != NULL)) { 369 atomic_add_int(&mpr->mnt_refs, -1); 370 } 371 elm->ticks = ticks; 372 return; 373 } 374 delta1 = ticks - best->ticks; 375 delta2 = ticks - elm->ticks; 376 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 377 best = elm; 378 ++elm; 379 } 380 mpr = atomic_swap_ptr((void *)&best->mp, mp); 381 best->ticks = ticks; 382 if (mpr) 383 atomic_add_int(&mpr->mnt_refs, -1); 384 } 385 386 /* 387 * Clears all cached mount points on all cpus. This routine should only 388 * be called when we are waiting for a mount to clear, e.g. so we can 389 * unmount. 390 */ 391 void 392 cache_clearmntcache(struct mount *target __unused) 393 { 394 int n; 395 396 for (n = 0; n < ncpus; ++n) { 397 struct mntcache *cache = &pcpu_mntcache[n]; 398 struct mntcache_elm *elm; 399 struct namecache *ncp; 400 struct mount *mp; 401 int i; 402 403 for (i = 0; i < MNTCACHE_COUNT; ++i) { 404 elm = &cache->array[i]; 405 if (elm->mp) { 406 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 407 if (mp) 408 atomic_add_int(&mp->mnt_refs, -1); 409 } 410 if (elm->ncp) { 411 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 412 if (ncp) 413 _cache_drop(ncp); 414 } 415 } 416 } 417 } 418 419 /* 420 * Namespace locking. The caller must already hold a reference to the 421 * namecache structure in order to lock/unlock it. The controlling entity 422 * in a 1->0 transition does not need to lock the ncp to dispose of it, 423 * as nobody else will have visibility to it at that point. 424 * 425 * Note that holding a locked namecache structure prevents other threads 426 * from making namespace changes (e.g. deleting or creating), prevents 427 * vnode association state changes by other threads, and prevents the 428 * namecache entry from being resolved or unresolved by other threads. 429 * 430 * An exclusive lock owner has full authority to associate/disassociate 431 * vnodes and resolve/unresolve the locked ncp. 432 * 433 * A shared lock owner only has authority to acquire the underlying vnode, 434 * if any. 
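 *
 * As a minimal illustrative sketch (the nchandle 'nch' below is assumed
 * to already be referenced, e.g. by nlookup), an exclusive lock is taken
 * around namespace or resolution changes while a shared lock suffices
 * for read-only use:
 *
 *	cache_lock(&nch);		exclusive; may resolve/unresolve
 *	...modify or (un)resolve...
 *	cache_unlock(&nch);
 *
 *	cache_lock_maybe_shared(&nch, 0);  shared when safe, else exclusive
 *	...read-only access to nch.ncp...
 *	cache_unlock(&nch);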
435 * 436 * The primary lock field is nc_lockstatus. nc_locktd is set after the 437 * fact (when locking) or cleared prior to unlocking. 438 * 439 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 440 * or recycled, but it does NOT help you if the vnode had already 441 * initiated a recyclement. If this is important, use cache_get() 442 * rather then cache_lock() (and deal with the differences in the 443 * way the refs counter is handled). Or, alternatively, make an 444 * unconditional call to cache_validate() or cache_resolve() 445 * after cache_lock() returns. 446 */ 447 static __inline 448 void 449 _cache_lock(struct namecache *ncp) 450 { 451 int didwarn = 0; 452 int error; 453 454 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 455 while (__predict_false(error == EWOULDBLOCK)) { 456 if (didwarn == 0) { 457 didwarn = ticks - nclockwarn; 458 kprintf("[diagnostic] cache_lock: " 459 "%s blocked on %p " 460 "\"%*.*s\"\n", 461 curthread->td_comm, ncp, 462 ncp->nc_nlen, ncp->nc_nlen, 463 ncp->nc_name); 464 } 465 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK); 466 } 467 if (__predict_false(didwarn)) { 468 kprintf("[diagnostic] cache_lock: " 469 "%s unblocked %*.*s after %d secs\n", 470 curthread->td_comm, 471 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 472 (int)(ticks - didwarn) / hz); 473 } 474 } 475 476 /* 477 * Release a previously acquired lock. 478 * 479 * A concurrent shared-lock acquisition or acquisition/release can 480 * race bit 31 so only drop the ncp if bit 31 was set. 481 */ 482 static __inline 483 void 484 _cache_unlock(struct namecache *ncp) 485 { 486 lockmgr(&ncp->nc_lock, LK_RELEASE); 487 } 488 489 /* 490 * Lock ncp exclusively, non-blocking. Return 0 on success. 491 */ 492 static __inline 493 int 494 _cache_lock_nonblock(struct namecache *ncp) 495 { 496 int error; 497 498 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 499 if (__predict_false(error != 0)) { 500 return(EWOULDBLOCK); 501 } 502 return 0; 503 } 504 505 /* 506 * This is a special form of _cache_lock() which only succeeds if 507 * it can get a pristine, non-recursive lock. The caller must have 508 * already ref'd the ncp. 509 * 510 * On success the ncp will be locked, on failure it will not. The 511 * ref count does not change either way. 512 * 513 * We want _cache_lock_special() (on success) to return a definitively 514 * usable vnode or a definitively unresolved ncp. 515 */ 516 static __inline 517 int 518 _cache_lock_special(struct namecache *ncp) 519 { 520 if (_cache_lock_nonblock(ncp) == 0) { 521 if (lockmgr_oneexcl(&ncp->nc_lock)) { 522 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 523 _cache_setunresolved(ncp); 524 return 0; 525 } 526 _cache_unlock(ncp); 527 } 528 return EWOULDBLOCK; 529 } 530 531 /* 532 * Shared lock, guarantees vp held 533 * 534 * The shared lock holds vp on the 0->1 transition. It is possible to race 535 * another shared lock release, preventing the other release from dropping 536 * the vnode and clearing bit 31. 537 * 538 * If it is not set then we are responsible for setting it, and this 539 * responsibility does not race with anyone else. 
540 */ 541 static __inline 542 void 543 _cache_lock_shared(struct namecache *ncp) 544 { 545 int didwarn = 0; 546 int error; 547 548 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 549 while (__predict_false(error == EWOULDBLOCK)) { 550 if (didwarn == 0) { 551 didwarn = ticks - nclockwarn; 552 kprintf("[diagnostic] cache_lock_shared: " 553 "%s blocked on %p " 554 "\"%*.*s\"\n", 555 curthread->td_comm, ncp, 556 ncp->nc_nlen, ncp->nc_nlen, 557 ncp->nc_name); 558 } 559 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 560 } 561 if (__predict_false(didwarn)) { 562 kprintf("[diagnostic] cache_lock_shared: " 563 "%s unblocked %*.*s after %d secs\n", 564 curthread->td_comm, 565 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 566 (int)(ticks - didwarn) / hz); 567 } 568 } 569 570 /* 571 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 572 */ 573 static __inline 574 int 575 _cache_lock_shared_nonblock(struct namecache *ncp) 576 { 577 int error; 578 579 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 580 if (__predict_false(error != 0)) { 581 return(EWOULDBLOCK); 582 } 583 return 0; 584 } 585 586 /* 587 * This function tries to get a shared lock but will back-off to an 588 * exclusive lock if: 589 * 590 * (1) Some other thread is trying to obtain an exclusive lock 591 * (to prevent the exclusive requester from getting livelocked out 592 * by many shared locks). 593 * 594 * (2) The current thread already owns an exclusive lock (to avoid 595 * deadlocking). 596 * 597 * WARNING! On machines with lots of cores we really want to try hard to 598 * get a shared lock or concurrent path lookups can chain-react 599 * into a very high-latency exclusive lock. 600 * 601 * This is very evident in dsynth's initial scans. 602 */ 603 static __inline 604 int 605 _cache_lock_shared_special(struct namecache *ncp) 606 { 607 /* 608 * Only honor a successful shared lock (returning 0) if there is 609 * no exclusive request pending and the vnode, if present, is not 610 * in a reclaimed state. 611 */ 612 if (_cache_lock_shared_nonblock(ncp) == 0) { 613 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 614 if (ncp->nc_vp == NULL || 615 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 616 return(0); 617 } 618 } 619 _cache_unlock(ncp); 620 return(EWOULDBLOCK); 621 } 622 623 /* 624 * Non-blocking shared lock failed. If we already own the exclusive 625 * lock just acquire another exclusive lock (instead of deadlocking). 626 * Otherwise acquire a shared lock. 627 */ 628 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 629 _cache_lock(ncp); 630 return(0); 631 } 632 _cache_lock_shared(ncp); 633 return(0); 634 } 635 636 static __inline 637 int 638 _cache_lockstatus(struct namecache *ncp) 639 { 640 int status; 641 642 status = lockstatus(&ncp->nc_lock, curthread); 643 if (status == 0 || status == LK_EXCLOTHER) 644 status = -1; 645 return status; 646 } 647 648 /* 649 * cache_hold() and cache_drop() prevent the premature deletion of a 650 * namecache entry but do not prevent operations (such as zapping) on 651 * that namecache entry. 652 * 653 * This routine may only be called from outside this source module if 654 * nc_refs is already deterministically at least 1, such as being 655 * associated with e.g. a process, file descriptor, or some other entity. 656 * 657 * Only the above situations, similar situations within this module where 658 * the ref count is deterministically at least 1, or when the ncp is found 659 * via the nchpp (hash table) lookup, can bump nc_refs. 
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any child of its own, again because nc_refs is now 0
 * and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHEAUX);
		kfree_obj(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);
		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);		/* add nc_parent ref */
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.
XXX should not be necessary now since the 771 * caller receives the ref from the nchpp linkage, assuming the ncp 772 * was linked in the first place. 773 * 774 * ncp must be locked, which means that there won't be any nc_parent 775 * removal races. This routine will acquire a temporary lock on 776 * the parent as well as the appropriate hash chain. 777 */ 778 static void 779 _cache_unlink_parent(struct namecache *ncp) 780 { 781 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 782 struct namecache *par; 783 struct vnode *dropvp; 784 struct nchash_head *nchpp; 785 786 if ((par = ncp->nc_parent) != NULL) { 787 cpu_ccfence(); 788 KKASSERT(ncp->nc_parent == par); 789 790 /* don't add a ref, we drop the nchpp ref later */ 791 _cache_lock(par); 792 nchpp = ncp->nc_head; 793 spin_lock(&nchpp->spin); 794 795 /* 796 * Remove from hash table and parent, adjust accounting 797 */ 798 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 799 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 800 atomic_add_long(&pn->vfscache_count, -1); 801 if (TAILQ_EMPTY(&ncp->nc_list)) 802 atomic_add_long(&pn->vfscache_leafs, -1); 803 804 dropvp = NULL; 805 if (TAILQ_EMPTY(&par->nc_list)) { 806 atomic_add_long(&pn->vfscache_leafs, 1); 807 if (par->nc_vp) 808 dropvp = par->nc_vp; 809 } 810 ncp->nc_parent = NULL; 811 ncp->nc_head = NULL; 812 spin_unlock(&nchpp->spin); 813 _cache_unlock(par); 814 _cache_drop(par); /* drop nc_parent ref */ 815 816 /* 817 * We can only safely vdrop with no spinlocks held. 818 */ 819 if (dropvp) 820 vdrop(dropvp); 821 } 822 } 823 824 /* 825 * Allocate a new namecache structure. Most of the code does not require 826 * zero-termination of the string but it makes vop_compat_ncreate() easier. 827 * 828 * The returned ncp will be locked and referenced. The ref is generally meant 829 * to be transfered to the nchpp linkage. 830 */ 831 static struct namecache * 832 cache_alloc(int nlen) 833 { 834 struct namecache *ncp; 835 836 ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 837 if (nlen) 838 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK); 839 ncp->nc_nlen = nlen; 840 ncp->nc_flag = NCF_UNRESOLVED; 841 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 842 ncp->nc_refs = 1; 843 TAILQ_INIT(&ncp->nc_list); 844 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 845 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 846 847 return(ncp); 848 } 849 850 /* 851 * Can only be called for the case where the ncp has never been 852 * associated with anything (so no spinlocks are needed). 853 */ 854 static void 855 _cache_free(struct namecache *ncp) 856 { 857 KKASSERT(ncp->nc_refs == 1); 858 if (ncp->nc_name) 859 kfree(ncp->nc_name, M_VFSCACHEAUX); 860 kfree_obj(ncp, M_VFSCACHE); 861 } 862 863 /* 864 * [re]initialize a nchandle. 865 */ 866 void 867 cache_zero(struct nchandle *nch) 868 { 869 nch->ncp = NULL; 870 nch->mount = NULL; 871 } 872 873 /* 874 * Ref and deref a nchandle structure (ncp + mp) 875 * 876 * The caller must specify a stable ncp pointer, typically meaning the 877 * ncp is already referenced but this can also occur indirectly through 878 * e.g. holding a lock on a direct child. 879 * 880 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 881 * use read spinlocks here. 882 */ 883 struct nchandle * 884 cache_hold(struct nchandle *nch) 885 { 886 _cache_hold(nch->ncp); 887 _cache_mntref(nch->mount); 888 return(nch); 889 } 890 891 /* 892 * Create a copy of a namecache handle for an already-referenced 893 * entry. 
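 *
 * A minimal usage sketch (the fdp->fd_ncdir handle below is only an
 * illustrative source; any already-referenced nchandle works), pairing
 * the copy with a plain drop:
 *
 *	struct nchandle nch;
 *
 *	cache_copy(&fdp->fd_ncdir, &nch);
 *	...use nch...
 *	cache_drop(&nch);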
894 */ 895 void 896 cache_copy(struct nchandle *nch, struct nchandle *target) 897 { 898 struct namecache *ncp; 899 struct mount *mp; 900 struct mntcache_elm *elm; 901 struct namecache *ncpr; 902 int i; 903 904 ncp = nch->ncp; 905 mp = nch->mount; 906 target->ncp = ncp; 907 target->mount = mp; 908 909 elm = _cache_mntcache_hash(ncp); 910 for (i = 0; i < MNTCACHE_SET; ++i) { 911 if (elm->ncp == ncp) { 912 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 913 if (ncpr == ncp) { 914 _cache_mntref(mp); 915 return; 916 } 917 if (ncpr) 918 _cache_drop(ncpr); 919 } 920 ++elm; 921 } 922 if (ncp) 923 _cache_hold(ncp); 924 _cache_mntref(mp); 925 } 926 927 /* 928 * Drop the nchandle, but try to cache the ref to avoid global atomic 929 * ops. This is typically done on the system root and jail root nchandles. 930 */ 931 void 932 cache_drop_and_cache(struct nchandle *nch, int elmno) 933 { 934 struct mntcache_elm *elm; 935 struct mntcache_elm *best; 936 struct namecache *ncpr; 937 int delta1; 938 int delta2; 939 int i; 940 941 if (elmno > 4) { 942 if (nch->ncp) { 943 _cache_drop(nch->ncp); 944 nch->ncp = NULL; 945 } 946 if (nch->mount) { 947 _cache_mntrel(nch->mount); 948 nch->mount = NULL; 949 } 950 return; 951 } 952 953 elm = _cache_mntcache_hash(nch->ncp); 954 best = elm; 955 for (i = 0; i < MNTCACHE_SET; ++i) { 956 if (elm->ncp == NULL) { 957 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 958 _cache_mntrel(nch->mount); 959 elm->ticks = ticks; 960 nch->mount = NULL; 961 nch->ncp = NULL; 962 if (ncpr) 963 _cache_drop(ncpr); 964 return; 965 } 966 delta1 = ticks - best->ticks; 967 delta2 = ticks - elm->ticks; 968 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 969 best = elm; 970 ++elm; 971 } 972 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 973 _cache_mntrel(nch->mount); 974 best->ticks = ticks; 975 nch->mount = NULL; 976 nch->ncp = NULL; 977 if (ncpr) 978 _cache_drop(ncpr); 979 } 980 981 void 982 cache_changemount(struct nchandle *nch, struct mount *mp) 983 { 984 _cache_mntref(mp); 985 _cache_mntrel(nch->mount); 986 nch->mount = mp; 987 } 988 989 void 990 cache_drop(struct nchandle *nch) 991 { 992 _cache_mntrel(nch->mount); 993 _cache_drop(nch->ncp); 994 nch->ncp = NULL; 995 nch->mount = NULL; 996 } 997 998 int 999 cache_lockstatus(struct nchandle *nch) 1000 { 1001 return(_cache_lockstatus(nch->ncp)); 1002 } 1003 1004 void 1005 cache_lock(struct nchandle *nch) 1006 { 1007 _cache_lock(nch->ncp); 1008 } 1009 1010 void 1011 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1012 { 1013 struct namecache *ncp = nch->ncp; 1014 1015 if (ncp_shared_lock_disable || excl || 1016 (ncp->nc_flag & NCF_UNRESOLVED)) { 1017 _cache_lock(ncp); 1018 } else { 1019 _cache_lock_shared(ncp); 1020 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1021 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1022 _cache_unlock(ncp); 1023 _cache_lock(ncp); 1024 } 1025 } else { 1026 _cache_unlock(ncp); 1027 _cache_lock(ncp); 1028 } 1029 } 1030 } 1031 1032 /* 1033 * Lock fncpd, fncp, tncpd, and tncp. tncp is already locked but may 1034 * have to be cycled to avoid deadlocks. Make sure all four are resolved. 1035 * 1036 * The caller is responsible for checking the validity upon return as 1037 * the records may have been flagged DESTROYED in the interim. 1038 * 1039 * Namecache lock ordering is leaf first, then parent. However, complex 1040 * interactions may occur between the source and target because there is 1041 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp). 
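 *
 * Because of that, the routine below keeps only tncp locked blocking and
 * takes the other three locks non-blocking; on any failure it unlocks
 * whatever it holds, cycles the offending lock once, and restarts from
 * the top.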
1042 */ 1043 void 1044 cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp, 1045 struct nchandle *tncpd, struct nchandle *tncp, 1046 struct ucred *fcred, struct ucred *tcred) 1047 { 1048 int tlocked = 1; 1049 1050 /* 1051 * Lock tncp and tncpd 1052 * 1053 * NOTE: Because these ncps are not locked to begin with, it is 1054 * possible for other rename races to cause the normal lock 1055 * order assumptions to fail. 1056 * 1057 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1058 * matches after the leaf has been locked. However, ordering 1059 * between the 'from' and the 'to' is not and an overlapping 1060 * lock order reversal is still possible. 1061 */ 1062 again: 1063 if (__predict_false(tlocked == 0)) { 1064 cache_lock(tncp); 1065 } 1066 if (__predict_false(cache_lock_nonblock(tncpd) != 0)) { 1067 cache_unlock(tncp); 1068 cache_lock(tncpd); cache_unlock(tncpd); /* cycle */ 1069 tlocked = 0; 1070 goto again; 1071 } 1072 1073 /* 1074 * Lock fncp and fncpd 1075 * 1076 * NOTE: Because these ncps are not locked to begin with, it is 1077 * possible for other rename races to cause the normal lock 1078 * order assumptions to fail. 1079 * 1080 * NOTE: Lock ordering assumptions are valid if a leaf's parent 1081 * matches after the leaf has been locked. However, ordering 1082 * between the 'from' and the 'to' is not and an overlapping 1083 * lock order reversal is still possible. 1084 */ 1085 if (__predict_false(cache_lock_nonblock(fncp) != 0)) { 1086 cache_unlock(tncpd); 1087 cache_unlock(tncp); 1088 cache_lock(fncp); cache_unlock(fncp); /* cycle */ 1089 tlocked = 0; 1090 goto again; 1091 } 1092 if (__predict_false(cache_lock_nonblock(fncpd) != 0)) { 1093 cache_unlock(fncp); 1094 cache_unlock(tncpd); 1095 cache_unlock(tncp); 1096 cache_lock(fncpd); cache_unlock(fncpd); /* cycle */ 1097 tlocked = 0; 1098 goto again; 1099 } 1100 if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1101 cache_resolve(fncpd, fcred); 1102 if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0)) 1103 cache_resolve(tncpd, tcred); 1104 if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1105 cache_resolve(fncp, fcred); 1106 if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0)) 1107 cache_resolve(tncp, tcred); 1108 } 1109 1110 int 1111 cache_lock_nonblock(struct nchandle *nch) 1112 { 1113 return(_cache_lock_nonblock(nch->ncp)); 1114 } 1115 1116 void 1117 cache_unlock(struct nchandle *nch) 1118 { 1119 _cache_unlock(nch->ncp); 1120 } 1121 1122 /* 1123 * ref-and-lock, unlock-and-deref functions. 1124 * 1125 * This function is primarily used by nlookup. Even though cache_lock 1126 * holds the vnode, it is possible that the vnode may have already 1127 * initiated a recyclement. 1128 * 1129 * We want cache_get() to return a definitively usable vnode or a 1130 * definitively unresolved ncp. 1131 */ 1132 static 1133 struct namecache * 1134 _cache_get(struct namecache *ncp) 1135 { 1136 _cache_hold(ncp); 1137 _cache_lock(ncp); 1138 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1139 _cache_setunresolved(ncp); 1140 return(ncp); 1141 } 1142 1143 /* 1144 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1145 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1146 * valid. Otherwise an exclusive lock will be acquired instead. 
1147 */ 1148 static 1149 struct namecache * 1150 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1151 { 1152 if (ncp_shared_lock_disable || excl || 1153 (ncp->nc_flag & NCF_UNRESOLVED)) { 1154 return(_cache_get(ncp)); 1155 } 1156 _cache_hold(ncp); 1157 _cache_lock_shared(ncp); 1158 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1159 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1160 _cache_unlock(ncp); 1161 ncp = _cache_get(ncp); 1162 _cache_drop(ncp); 1163 } 1164 } else { 1165 _cache_unlock(ncp); 1166 ncp = _cache_get(ncp); 1167 _cache_drop(ncp); 1168 } 1169 return(ncp); 1170 } 1171 1172 /* 1173 * NOTE: The same nchandle can be passed for both arguments. 1174 */ 1175 void 1176 cache_get(struct nchandle *nch, struct nchandle *target) 1177 { 1178 KKASSERT(nch->ncp->nc_refs > 0); 1179 target->mount = nch->mount; 1180 target->ncp = _cache_get(nch->ncp); 1181 _cache_mntref(target->mount); 1182 } 1183 1184 void 1185 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1186 { 1187 KKASSERT(nch->ncp->nc_refs > 0); 1188 target->mount = nch->mount; 1189 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1190 _cache_mntref(target->mount); 1191 } 1192 1193 /* 1194 * Release a held and locked ncp 1195 */ 1196 static __inline 1197 void 1198 _cache_put(struct namecache *ncp) 1199 { 1200 _cache_unlock(ncp); 1201 _cache_drop(ncp); 1202 } 1203 1204 void 1205 cache_put(struct nchandle *nch) 1206 { 1207 _cache_mntrel(nch->mount); 1208 _cache_put(nch->ncp); 1209 nch->ncp = NULL; 1210 nch->mount = NULL; 1211 } 1212 1213 /* 1214 * Resolve an unresolved ncp by associating a vnode with it. If the 1215 * vnode is NULL, a negative cache entry is created. 1216 * 1217 * The ncp should be locked on entry and will remain locked on return. 1218 */ 1219 static 1220 void 1221 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1222 { 1223 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1224 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1225 ncp->nc_vp == NULL); 1226 1227 if (vp) { 1228 /* 1229 * Any vp associated with an ncp which has children must 1230 * be held. Any vp associated with a locked ncp must be held. 1231 */ 1232 if (!TAILQ_EMPTY(&ncp->nc_list)) 1233 vhold(vp); 1234 spin_lock(&vp->v_spin); 1235 ncp->nc_vp = vp; 1236 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1237 ++vp->v_namecache_count; 1238 _cache_hold(ncp); /* v_namecache assoc */ 1239 spin_unlock(&vp->v_spin); 1240 vhold(vp); /* nc_vp */ 1241 1242 /* 1243 * Set auxiliary flags 1244 */ 1245 switch(vp->v_type) { 1246 case VDIR: 1247 ncp->nc_flag |= NCF_ISDIR; 1248 break; 1249 case VLNK: 1250 ncp->nc_flag |= NCF_ISSYMLINK; 1251 /* XXX cache the contents of the symlink */ 1252 break; 1253 default: 1254 break; 1255 } 1256 1257 ncp->nc_error = 0; 1258 1259 /* 1260 * XXX: this is a hack to work-around the lack of a real pfs vfs 1261 * implementation 1262 */ 1263 if (mp) { 1264 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1265 vp->v_pfsmp = mp; 1266 } 1267 } else { 1268 /* 1269 * When creating a negative cache hit we set the 1270 * namecache_gen. A later resolve will clean out the 1271 * negative cache hit if the mount point's namecache_gen 1272 * has changed. Used by devfs, could also be used by 1273 * other remote FSs. 
1274 */ 1275 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1276 1277 ncp->nc_vp = NULL; 1278 ncp->nc_negcpu = mycpu->gd_cpuid; 1279 spin_lock(&pn->neg_spin); 1280 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1281 _cache_hold(ncp); /* neg_list assoc */ 1282 ++pn->neg_count; 1283 spin_unlock(&pn->neg_spin); 1284 atomic_add_long(&pn->vfscache_negs, 1); 1285 1286 ncp->nc_error = ENOENT; 1287 if (mp) 1288 VFS_NCPGEN_SET(mp, ncp); 1289 } 1290 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1291 } 1292 1293 void 1294 cache_setvp(struct nchandle *nch, struct vnode *vp) 1295 { 1296 _cache_setvp(nch->mount, nch->ncp, vp); 1297 } 1298 1299 /* 1300 * Used for NFS 1301 */ 1302 void 1303 cache_settimeout(struct nchandle *nch, int nticks) 1304 { 1305 struct namecache *ncp = nch->ncp; 1306 1307 if ((ncp->nc_timeout = ticks + nticks) == 0) 1308 ncp->nc_timeout = 1; 1309 } 1310 1311 /* 1312 * Disassociate the vnode or negative-cache association and mark a 1313 * namecache entry as unresolved again. Note that the ncp is still 1314 * left in the hash table and still linked to its parent. 1315 * 1316 * The ncp should be locked and refd on entry and will remain locked and refd 1317 * on return. 1318 * 1319 * This routine is normally never called on a directory containing children. 1320 * However, NFS often does just that in its rename() code as a cop-out to 1321 * avoid complex namespace operations. This disconnects a directory vnode 1322 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1323 * sync. 1324 * 1325 */ 1326 static 1327 void 1328 _cache_setunresolved(struct namecache *ncp) 1329 { 1330 struct vnode *vp; 1331 1332 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1333 ncp->nc_flag |= NCF_UNRESOLVED; 1334 ncp->nc_timeout = 0; 1335 ncp->nc_error = ENOTCONN; 1336 if ((vp = ncp->nc_vp) != NULL) { 1337 spin_lock(&vp->v_spin); 1338 ncp->nc_vp = NULL; 1339 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1340 --vp->v_namecache_count; 1341 spin_unlock(&vp->v_spin); 1342 1343 /* 1344 * Any vp associated with an ncp with children is 1345 * held by that ncp. Any vp associated with ncp 1346 * is held by that ncp. These conditions must be 1347 * undone when the vp is cleared out from the ncp. 1348 */ 1349 if (!TAILQ_EMPTY(&ncp->nc_list)) 1350 vdrop(vp); 1351 vdrop(vp); 1352 } else { 1353 struct pcpu_ncache *pn; 1354 1355 pn = &pcpu_ncache[ncp->nc_negcpu]; 1356 1357 atomic_add_long(&pn->vfscache_negs, -1); 1358 spin_lock(&pn->neg_spin); 1359 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1360 --pn->neg_count; 1361 spin_unlock(&pn->neg_spin); 1362 } 1363 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1364 _cache_drop(ncp); /* from v_namecache or neg_list */ 1365 } 1366 } 1367 1368 /* 1369 * The cache_nresolve() code calls this function to automatically 1370 * set a resolved cache element to unresolved if it has timed out 1371 * or if it is a negative cache hit and the mount point namecache_gen 1372 * has changed. 1373 */ 1374 static __inline int 1375 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1376 { 1377 /* 1378 * Try to zap entries that have timed out. We have 1379 * to be careful here because locked leafs may depend 1380 * on the vnode remaining intact in a parent, so only 1381 * do this under very specific conditions. 
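 *
 * For example, cache_settimeout() sets nc_timeout to (ticks + nticks);
 * the signed test (int)(nc_timeout - ticks) < 0 below then fires once
 * the timeout has passed, and keeps working across tick-counter
 * wraparound.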
1382 */ 1383 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1384 TAILQ_EMPTY(&ncp->nc_list)) { 1385 return 1; 1386 } 1387 1388 /* 1389 * If a resolved negative cache hit is invalid due to 1390 * the mount's namecache generation being bumped, zap it. 1391 */ 1392 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1393 return 1; 1394 } 1395 1396 /* 1397 * Otherwise we are good 1398 */ 1399 return 0; 1400 } 1401 1402 static __inline void 1403 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1404 { 1405 /* 1406 * Already in an unresolved state, nothing to do. 1407 */ 1408 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1409 if (_cache_auto_unresolve_test(mp, ncp)) 1410 _cache_setunresolved(ncp); 1411 } 1412 } 1413 1414 void 1415 cache_setunresolved(struct nchandle *nch) 1416 { 1417 _cache_setunresolved(nch->ncp); 1418 } 1419 1420 /* 1421 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1422 * looking for matches. This flag tells the lookup code when it must 1423 * check for a mount linkage and also prevents the directories in question 1424 * from being deleted or renamed. 1425 */ 1426 static 1427 int 1428 cache_clrmountpt_callback(struct mount *mp, void *data) 1429 { 1430 struct nchandle *nch = data; 1431 1432 if (mp->mnt_ncmounton.ncp == nch->ncp) 1433 return(1); 1434 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1435 return(1); 1436 return(0); 1437 } 1438 1439 /* 1440 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1441 * with a mount point. 1442 */ 1443 void 1444 cache_clrmountpt(struct nchandle *nch) 1445 { 1446 int count; 1447 1448 count = mountlist_scan(cache_clrmountpt_callback, nch, 1449 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | 1450 MNTSCAN_NOUNLOCK); 1451 if (count == 0) 1452 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1453 } 1454 1455 /* 1456 * Invalidate portions of the namecache topology given a starting entry. 1457 * The passed ncp is set to an unresolved state and: 1458 * 1459 * The passed ncp must be referenced and locked. The routine may unlock 1460 * and relock ncp several times, and will recheck the children and loop 1461 * to catch races. When done the passed ncp will be returned with the 1462 * reference and lock intact. 1463 * 1464 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1465 * that the physical underlying nodes have been 1466 * destroyed... as in deleted. For example, when 1467 * a directory is removed. This will cause record 1468 * lookups on the name to no longer be able to find 1469 * the record and tells the resolver to return failure 1470 * rather then trying to resolve through the parent. 1471 * 1472 * The topology itself, including ncp->nc_name, 1473 * remains intact. 1474 * 1475 * This only applies to the passed ncp, if CINV_CHILDREN 1476 * is specified the children are not flagged. 1477 * 1478 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1479 * state as well. 1480 * 1481 * Note that this will also have the side effect of 1482 * cleaning out any unreferenced nodes in the topology 1483 * from the leaves up as the recursion backs out. 1484 * 1485 * Note that the topology for any referenced nodes remains intact, but 1486 * the nodes will be marked as having been destroyed and will be set 1487 * to an unresolved state. 1488 * 1489 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1490 * the namecache entry may not actually be invalidated on return if it was 1491 * revalidated while recursing down into its children. 
This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

struct cinvtrack {
	struct namecache *resume_ncp;
	int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
	struct cinvtrack track;
	struct namecache *ncp2;
	int r;

	track.depth = 0;
	track.resume_ncp = NULL;

	for (;;) {
		r = _cache_inval_internal(ncp, flags, &track);
		if (track.resume_ncp == NULL)
			break;
		_cache_unlock(ncp);
		while ((ncp2 = track.resume_ncp) != NULL) {
			track.resume_ncp = NULL;
			_cache_lock(ncp2);
			_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
					      &track);
			/*_cache_put(ncp2);*/
			cache_zap(ncp2);
		}
		_cache_lock(ncp);
	}
	return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
	return(_cache_inval(nch->ncp, flags));
}

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
	struct namecache *nextkid;
	int rcnt = 0;

	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	_cache_setunresolved(ncp);
	if (flags & CINV_DESTROY) {
		ncp->nc_flag |= NCF_DESTROYED;
		++ncp->nc_generation;
	}

	while ((flags & CINV_CHILDREN) &&
	       (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	) {
		struct namecache *kid;
		int restart;

		restart = 0;
		_cache_hold(nextkid);
		if (++track->depth > MAX_RECURSION_DEPTH) {
			track->resume_ncp = ncp;
			_cache_hold(ncp);
			++rcnt;
		}
		while ((kid = nextkid) != NULL) {
			/*
			 * Parent (ncp) must be locked for the iteration.
			 */
			nextkid = NULL;
			if (kid->nc_parent != ncp) {
				_cache_drop(kid);
				kprintf("cache_inval_internal restartA %s\n",
					ncp->nc_name);
				restart = 1;
				break;
			}
			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
				_cache_hold(nextkid);

			/*
			 * Parent unlocked for this section to avoid
			 * deadlocks.  Then lock the kid and check for
			 * races.
1601 */ 1602 _cache_unlock(ncp); 1603 if (track->resume_ncp) { 1604 _cache_drop(kid); 1605 _cache_lock(ncp); 1606 break; 1607 } 1608 _cache_lock(kid); 1609 if (kid->nc_parent != ncp) { 1610 kprintf("cache_inval_internal " 1611 "restartB %s\n", 1612 ncp->nc_name); 1613 restart = 1; 1614 _cache_unlock(kid); 1615 _cache_drop(kid); 1616 _cache_lock(ncp); 1617 break; 1618 } 1619 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1620 TAILQ_FIRST(&kid->nc_list) 1621 ) { 1622 1623 rcnt += _cache_inval_internal(kid, 1624 flags & ~CINV_DESTROY, track); 1625 /*_cache_unlock(kid);*/ 1626 /*_cache_drop(kid);*/ 1627 cache_zap(kid); 1628 } else { 1629 cache_zap(kid); 1630 } 1631 1632 /* 1633 * Relock parent to continue scan 1634 */ 1635 _cache_lock(ncp); 1636 } 1637 if (nextkid) 1638 _cache_drop(nextkid); 1639 --track->depth; 1640 if (restart == 0) 1641 break; 1642 } 1643 1644 /* 1645 * Someone could have gotten in there while ncp was unlocked, 1646 * retry if so. 1647 */ 1648 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1649 ++rcnt; 1650 return (rcnt); 1651 } 1652 1653 /* 1654 * Invalidate a vnode's namecache associations. To avoid races against 1655 * the resolver we do not invalidate a node which we previously invalidated 1656 * but which was then re-resolved while we were in the invalidation loop. 1657 * 1658 * Returns non-zero if any namecache entries remain after the invalidation 1659 * loop completed. 1660 * 1661 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1662 * be ripped out of the topology while held, the vnode's v_namecache 1663 * list has no such restriction. NCP's can be ripped out of the list 1664 * at virtually any time if not locked, even if held. 1665 * 1666 * In addition, the v_namecache list itself must be locked via 1667 * the vnode's spinlock. 1668 */ 1669 int 1670 cache_inval_vp(struct vnode *vp, int flags) 1671 { 1672 struct namecache *ncp; 1673 struct namecache *next; 1674 1675 restart: 1676 spin_lock(&vp->v_spin); 1677 ncp = TAILQ_FIRST(&vp->v_namecache); 1678 if (ncp) 1679 _cache_hold(ncp); 1680 while (ncp) { 1681 /* loop entered with ncp held and vp spin-locked */ 1682 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1683 _cache_hold(next); 1684 spin_unlock(&vp->v_spin); 1685 _cache_lock(ncp); 1686 if (ncp->nc_vp != vp) { 1687 kprintf("Warning: cache_inval_vp: race-A detected on " 1688 "%s\n", ncp->nc_name); 1689 _cache_put(ncp); 1690 if (next) 1691 _cache_drop(next); 1692 goto restart; 1693 } 1694 _cache_inval(ncp, flags); 1695 _cache_put(ncp); /* also releases reference */ 1696 ncp = next; 1697 spin_lock(&vp->v_spin); 1698 if (ncp && ncp->nc_vp != vp) { 1699 spin_unlock(&vp->v_spin); 1700 kprintf("Warning: cache_inval_vp: race-B detected on " 1701 "%s\n", ncp->nc_name); 1702 _cache_drop(ncp); 1703 goto restart; 1704 } 1705 } 1706 spin_unlock(&vp->v_spin); 1707 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1708 } 1709 1710 /* 1711 * This routine is used instead of the normal cache_inval_vp() when we 1712 * are trying to recycle otherwise good vnodes. 1713 * 1714 * Return 0 on success, non-zero if not all namecache records could be 1715 * disassociated from the vnode (for various reasons). 
1716 */ 1717 int 1718 cache_inval_vp_nonblock(struct vnode *vp) 1719 { 1720 struct namecache *ncp; 1721 struct namecache *next; 1722 1723 spin_lock(&vp->v_spin); 1724 ncp = TAILQ_FIRST(&vp->v_namecache); 1725 if (ncp) 1726 _cache_hold(ncp); 1727 while (ncp) { 1728 /* loop entered with ncp held */ 1729 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1730 _cache_hold(next); 1731 spin_unlock(&vp->v_spin); 1732 if (_cache_lock_nonblock(ncp)) { 1733 _cache_drop(ncp); 1734 if (next) 1735 _cache_drop(next); 1736 goto done; 1737 } 1738 if (ncp->nc_vp != vp) { 1739 kprintf("Warning: cache_inval_vp: race-A detected on " 1740 "%s\n", ncp->nc_name); 1741 _cache_put(ncp); 1742 if (next) 1743 _cache_drop(next); 1744 goto done; 1745 } 1746 _cache_inval(ncp, 0); 1747 _cache_put(ncp); /* also releases reference */ 1748 ncp = next; 1749 spin_lock(&vp->v_spin); 1750 if (ncp && ncp->nc_vp != vp) { 1751 spin_unlock(&vp->v_spin); 1752 kprintf("Warning: cache_inval_vp: race-B detected on " 1753 "%s\n", ncp->nc_name); 1754 _cache_drop(ncp); 1755 goto done; 1756 } 1757 } 1758 spin_unlock(&vp->v_spin); 1759 done: 1760 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1761 } 1762 1763 /* 1764 * Clears the universal directory search 'ok' flag. This flag allows 1765 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1766 * so clearing it simply forces revalidation. 1767 */ 1768 void 1769 cache_inval_wxok(struct vnode *vp) 1770 { 1771 struct namecache *ncp; 1772 1773 spin_lock(&vp->v_spin); 1774 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1775 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 1776 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 1777 } 1778 spin_unlock(&vp->v_spin); 1779 } 1780 1781 /* 1782 * The source ncp has been renamed to the target ncp. All elements have been 1783 * locked, including the parent ncp's. 1784 * 1785 * The target ncp is destroyed (as a normal rename-over would destroy the 1786 * target file or directory). 1787 * 1788 * Because there may be references to the source ncp we cannot copy its 1789 * contents to the target. Instead the source ncp is relinked as the target 1790 * and the target ncp is removed from the namecache topology. 
1791 */ 1792 void 1793 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1794 { 1795 struct namecache *fncp = fnch->ncp; 1796 struct namecache *tncp = tnch->ncp; 1797 struct namecache *tncp_par; 1798 struct nchash_head *nchpp; 1799 u_int32_t hash; 1800 char *oname; 1801 char *nname; 1802 1803 ++fncp->nc_generation; 1804 ++tncp->nc_generation; 1805 if (tncp->nc_nlen) { 1806 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK); 1807 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1808 nname[tncp->nc_nlen] = 0; 1809 } else { 1810 nname = NULL; 1811 } 1812 1813 /* 1814 * Rename fncp (unlink) 1815 */ 1816 _cache_unlink_parent(fncp); 1817 oname = fncp->nc_name; 1818 fncp->nc_name = nname; 1819 fncp->nc_nlen = tncp->nc_nlen; 1820 if (oname) 1821 kfree(oname, M_VFSCACHEAUX); 1822 1823 tncp_par = tncp->nc_parent; 1824 KKASSERT(tncp_par->nc_lock.lk_lockholder == curthread); 1825 1826 /* 1827 * Rename fncp (relink) 1828 */ 1829 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1830 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 1831 nchpp = NCHHASH(hash); 1832 1833 spin_lock(&nchpp->spin); 1834 _cache_link_parent(fncp, tncp_par, nchpp); 1835 spin_unlock(&nchpp->spin); 1836 1837 /* 1838 * Get rid of the overwritten tncp (unlink) 1839 */ 1840 _cache_unlink(tncp); 1841 } 1842 1843 /* 1844 * Perform actions consistent with unlinking a file. The passed-in ncp 1845 * must be locked. 1846 * 1847 * The ncp is marked DESTROYED so it no longer shows up in searches, 1848 * and will be physically deleted when the vnode goes away. 1849 * 1850 * If the related vnode has no refs then we cycle it through vget()/vput() 1851 * to (possibly if we don't have a ref race) trigger a deactivation, 1852 * allowing the VFS to trivially detect and recycle the deleted vnode 1853 * via VOP_INACTIVE(). 1854 * 1855 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 1856 * target ncp. 1857 */ 1858 void 1859 cache_unlink(struct nchandle *nch) 1860 { 1861 _cache_unlink(nch->ncp); 1862 } 1863 1864 static void 1865 _cache_unlink(struct namecache *ncp) 1866 { 1867 struct vnode *vp; 1868 1869 /* 1870 * Causes lookups to fail and allows another ncp with the same 1871 * name to be created under ncp->nc_parent. 1872 */ 1873 ncp->nc_flag |= NCF_DESTROYED; 1874 ++ncp->nc_generation; 1875 1876 /* 1877 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 1878 * force action on the 1->0 transition. 1879 */ 1880 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1881 (vp = ncp->nc_vp) != NULL) { 1882 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 1883 if (VREFCNT(vp) <= 0) { 1884 if (vget(vp, LK_SHARED) == 0) 1885 vput(vp); 1886 } 1887 } 1888 } 1889 1890 /* 1891 * Return non-zero if the nch might be associated with an open and/or mmap()'d 1892 * file. The easy solution is to just return non-zero if the vnode has refs. 1893 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 1894 * force the reclaim). 1895 */ 1896 int 1897 cache_isopen(struct nchandle *nch) 1898 { 1899 struct vnode *vp; 1900 struct namecache *ncp = nch->ncp; 1901 1902 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1903 (vp = ncp->nc_vp) != NULL && 1904 VREFCNT(vp)) { 1905 return 1; 1906 } 1907 return 0; 1908 } 1909 1910 1911 /* 1912 * vget the vnode associated with the namecache entry. Resolve the namecache 1913 * entry if necessary. The passed ncp must be referenced and locked. If 1914 * the ncp is resolved it might be locked shared. 1915 * 1916 * lk_type may be LK_SHARED, LK_EXCLUSIVE. 
A ref'd, possibly locked vnode
1917 * (depending on the passed lk_type) will be returned in *vpp with an error
1918 * of 0, or NULL will be returned in *vpp with a non-0 error code. The
1919 * most typical error is ENOENT, meaning that the ncp represents a negative
1920 * cache hit and there is no vnode to retrieve, but other errors can occur
1921 * too.
1922 *
1923 * The vget() can race a reclaim. If this occurs we re-resolve the
1924 * namecache entry.
1925 *
1926 * There are numerous places in the kernel where vget() is called on a
1927 * vnode while one or more of its namecache entries is locked. Releasing
1928 * a vnode never deadlocks against locked namecache entries (the vnode
1929 * will not get recycled while referenced ncp's exist). This means we
1930 * can safely acquire the vnode. In fact, we MUST NOT release the ncp
1931 * lock when acquiring the vp lock or we might cause a deadlock.
1932 *
1933 * NOTE: The passed-in ncp must be locked exclusively if it is initially
1934 * unresolved. If a reclaim race occurs the passed-in ncp will be
1935 * relocked exclusively before being re-resolved.
1936 */
1937 int
1938 cache_vget(struct nchandle *nch, struct ucred *cred,
1939 int lk_type, struct vnode **vpp)
1940 {
1941 struct namecache *ncp;
1942 struct vnode *vp;
1943 int error;
1944
1945 ncp = nch->ncp;
1946 again:
1947 vp = NULL;
1948 if (ncp->nc_flag & NCF_UNRESOLVED)
1949 error = cache_resolve(nch, cred);
1950 else
1951 error = 0;
1952
1953 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1954 error = vget(vp, lk_type);
1955 if (error) {
1956 /*
1957 * VRECLAIM race
1958 *
1959 * The ncp may have been locked shared, we must relock
1960 * it exclusively before we can set it to unresolved.
1961 */
1962 if (error == ENOENT) {
1963 kprintf("Warning: vnode reclaim race detected "
1964 "in cache_vget on %p (%s)\n",
1965 vp, ncp->nc_name);
1966 _cache_unlock(ncp);
1967 _cache_lock(ncp);
1968 _cache_setunresolved(ncp);
1969 goto again;
1970 }
1971
1972 /*
1973 * Not a reclaim race, some other error.
1974 */
1975 KKASSERT(ncp->nc_vp == vp);
1976 vp = NULL;
1977 } else {
1978 KKASSERT(ncp->nc_vp == vp);
1979 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1980 }
1981 }
1982 if (error == 0 && vp == NULL)
1983 error = ENOENT;
1984 *vpp = vp;
1985 return(error);
1986 }
1987
1988 /*
1989 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode
1990 * is already held by virtue of the ncp being locked, but it might not be
1991 * referenced and while it is not referenced it can transition into the
1992 * VRECLAIMED state.
1993 *
1994 * NOTE: The passed-in ncp must be locked exclusively if it is initially
1995 * unresolved. If a reclaim race occurs the passed-in ncp will be
1996 * relocked exclusively before being re-resolved.
1997 *
1998 * NOTE: At the moment we have to issue a vget() on the vnode, even though
1999 * we are going to immediately release the lock, in order to resolve
2000 * potential reclamation races. Once we have a solid vnode ref that
2001 * was (at some point) interlocked via a vget(), the vnode will not
2002 * be reclaimed.
2003 *
2004 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
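 *
 * Minimal usage sketch (illustrative only), assuming 'nch' is referenced
 * and locked as described above:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = cache_vref(&nch, cred, &vp);
 *	if (error == 0) {
 *		...use the referenced, unlocked vnode...
 *		vrele(vp);
 *	}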
2005 */ 2006 int 2007 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 2008 { 2009 struct namecache *ncp; 2010 struct vnode *vp; 2011 int error; 2012 int v; 2013 2014 ncp = nch->ncp; 2015 again: 2016 vp = NULL; 2017 if (ncp->nc_flag & NCF_UNRESOLVED) 2018 error = cache_resolve(nch, cred); 2019 else 2020 error = 0; 2021 2022 while (error == 0 && (vp = ncp->nc_vp) != NULL) { 2023 /* 2024 * Try a lockless ref of the vnode. VRECLAIMED transitions 2025 * use the vx_lock state and update-counter mechanism so we 2026 * can detect if one is in-progress or occurred. 2027 * 2028 * If we can successfully ref the vnode and interlock against 2029 * the update-counter mechanism, and VRECLAIMED is found to 2030 * not be set after that, we should be good. 2031 */ 2032 v = spin_access_start_only(&vp->v_spin); 2033 if (__predict_true(spin_access_check_inprog(v) == 0)) { 2034 vref_special(vp); 2035 if (__predict_false( 2036 spin_access_end_only(&vp->v_spin, v))) { 2037 vrele(vp); 2038 continue; 2039 } 2040 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) { 2041 break; 2042 } 2043 vrele(vp); 2044 kprintf("CACHE_VREF: IN-RECLAIM\n"); 2045 } 2046 2047 /* 2048 * Do it the slow way 2049 */ 2050 error = vget(vp, LK_SHARED); 2051 if (error) { 2052 /* 2053 * VRECLAIM race 2054 */ 2055 if (error == ENOENT) { 2056 kprintf("Warning: vnode reclaim race detected " 2057 "in cache_vget on %p (%s)\n", 2058 vp, ncp->nc_name); 2059 _cache_unlock(ncp); 2060 _cache_lock(ncp); 2061 _cache_setunresolved(ncp); 2062 goto again; 2063 } 2064 2065 /* 2066 * Not a reclaim race, some other error. 2067 */ 2068 KKASSERT(ncp->nc_vp == vp); 2069 vp = NULL; 2070 } else { 2071 KKASSERT(ncp->nc_vp == vp); 2072 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 2073 /* caller does not want a lock */ 2074 vn_unlock(vp); 2075 } 2076 break; 2077 } 2078 if (error == 0 && vp == NULL) 2079 error = ENOENT; 2080 *vpp = vp; 2081 2082 return(error); 2083 } 2084 2085 /* 2086 * Return a referenced vnode representing the parent directory of 2087 * ncp. 2088 * 2089 * Because the caller has locked the ncp it should not be possible for 2090 * the parent ncp to go away. However, the parent can unresolve its 2091 * dvp at any time so we must be able to acquire a lock on the parent 2092 * to safely access nc_vp. 2093 * 2094 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2095 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2096 * getting destroyed. 2097 * 2098 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2099 * lock on the ncp in question.. 2100 */ 2101 struct vnode * 2102 cache_dvpref(struct namecache *ncp) 2103 { 2104 struct namecache *par; 2105 struct vnode *dvp; 2106 2107 dvp = NULL; 2108 if ((par = ncp->nc_parent) != NULL) { 2109 _cache_hold(par); 2110 _cache_lock(par); 2111 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2112 if ((dvp = par->nc_vp) != NULL) 2113 vhold(dvp); 2114 } 2115 _cache_unlock(par); 2116 if (dvp) { 2117 if (vget(dvp, LK_SHARED) == 0) { 2118 vn_unlock(dvp); 2119 vdrop(dvp); 2120 /* return refd, unlocked dvp */ 2121 } else { 2122 vdrop(dvp); 2123 dvp = NULL; 2124 } 2125 } 2126 _cache_drop(par); 2127 } 2128 return(dvp); 2129 } 2130 2131 /* 2132 * Convert a directory vnode to a namecache record without any other 2133 * knowledge of the topology. This ONLY works with directory vnodes and 2134 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2135 * returned ncp (if not NULL) will be held and unlocked. 
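 *
 * Illustrative sketch (hypothetical caller such as an NFS file-handle
 * conversion path; see the 'makeit' discussion below), with dvp ref'd
 * and unlocked:
 *
 *	struct nchandle nch;
 *
 *	if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *		...use nch, which is held and unlocked...
 *		cache_drop(&nch);
 *	}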
2136 * 2137 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2138 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2139 * for dvp. This will fail only if the directory has been deleted out from 2140 * under the caller. 2141 * 2142 * Callers must always check for a NULL return no matter the value of 'makeit'. 2143 * 2144 * To avoid underflowing the kernel stack each recursive call increments 2145 * the makeit variable. 2146 */ 2147 2148 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2149 struct vnode *dvp, char *fakename); 2150 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2151 struct vnode **saved_dvp); 2152 2153 int 2154 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2155 struct nchandle *nch) 2156 { 2157 struct vnode *saved_dvp; 2158 struct vnode *pvp; 2159 char *fakename; 2160 int error; 2161 2162 nch->ncp = NULL; 2163 nch->mount = dvp->v_mount; 2164 saved_dvp = NULL; 2165 fakename = NULL; 2166 2167 /* 2168 * Handle the makeit == 0 degenerate case 2169 */ 2170 if (makeit == 0) { 2171 spin_lock_shared(&dvp->v_spin); 2172 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2173 if (nch->ncp) 2174 cache_hold(nch); 2175 spin_unlock_shared(&dvp->v_spin); 2176 } 2177 2178 /* 2179 * Loop until resolution, inside code will break out on error. 2180 */ 2181 while (makeit) { 2182 /* 2183 * Break out if we successfully acquire a working ncp. 2184 */ 2185 spin_lock_shared(&dvp->v_spin); 2186 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2187 if (nch->ncp) { 2188 cache_hold(nch); 2189 spin_unlock_shared(&dvp->v_spin); 2190 break; 2191 } 2192 spin_unlock_shared(&dvp->v_spin); 2193 2194 /* 2195 * If dvp is the root of its filesystem it should already 2196 * have a namecache pointer associated with it as a side 2197 * effect of the mount, but it may have been disassociated. 2198 */ 2199 if (dvp->v_flag & VROOT) { 2200 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2201 error = cache_resolve_mp(nch->mount); 2202 _cache_put(nch->ncp); 2203 if (ncvp_debug) { 2204 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2205 dvp->v_mount, error); 2206 } 2207 if (error) { 2208 if (ncvp_debug) 2209 kprintf(" failed\n"); 2210 nch->ncp = NULL; 2211 break; 2212 } 2213 if (ncvp_debug) 2214 kprintf(" succeeded\n"); 2215 continue; 2216 } 2217 2218 /* 2219 * If we are recursed too deeply resort to an O(n^2) 2220 * algorithm to resolve the namecache topology. The 2221 * resolved pvp is left referenced in saved_dvp to 2222 * prevent the tree from being destroyed while we loop. 2223 */ 2224 if (makeit > 20) { 2225 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2226 if (error) { 2227 kprintf("lookupdotdot(longpath) failed %d " 2228 "dvp %p\n", error, dvp); 2229 nch->ncp = NULL; 2230 break; 2231 } 2232 continue; 2233 } 2234 2235 /* 2236 * Get the parent directory and resolve its ncp. 2237 */ 2238 if (fakename) { 2239 kfree(fakename, M_TEMP); 2240 fakename = NULL; 2241 } 2242 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2243 &fakename); 2244 if (error) { 2245 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2246 break; 2247 } 2248 vn_unlock(pvp); 2249 2250 /* 2251 * Reuse makeit as a recursion depth counter. On success 2252 * nch will be fully referenced. 2253 */ 2254 cache_fromdvp(pvp, cred, makeit + 1, nch); 2255 vrele(pvp); 2256 if (nch->ncp == NULL) 2257 break; 2258 2259 /* 2260 * Do an inefficient scan of pvp (embodied by ncp) to look 2261 * for dvp. 
This will create a namecache record for dvp on 2262 * success. We loop up to recheck on success. 2263 * 2264 * ncp and dvp are both held but not locked. 2265 */ 2266 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2267 if (error) { 2268 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2269 pvp, nch->ncp->nc_name, dvp); 2270 cache_drop(nch); 2271 /* nch was NULLed out, reload mount */ 2272 nch->mount = dvp->v_mount; 2273 break; 2274 } 2275 if (ncvp_debug) { 2276 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2277 pvp, nch->ncp->nc_name); 2278 } 2279 cache_drop(nch); 2280 /* nch was NULLed out, reload mount */ 2281 nch->mount = dvp->v_mount; 2282 } 2283 2284 /* 2285 * If nch->ncp is non-NULL it will have been held already. 2286 */ 2287 if (fakename) 2288 kfree(fakename, M_TEMP); 2289 if (saved_dvp) 2290 vrele(saved_dvp); 2291 if (nch->ncp) 2292 return (0); 2293 return (EINVAL); 2294 } 2295 2296 /* 2297 * Go up the chain of parent directories until we find something 2298 * we can resolve into the namecache. This is very inefficient. 2299 */ 2300 static 2301 int 2302 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2303 struct vnode **saved_dvp) 2304 { 2305 struct nchandle nch; 2306 struct vnode *pvp; 2307 int error; 2308 static time_t last_fromdvp_report; 2309 char *fakename; 2310 2311 /* 2312 * Loop getting the parent directory vnode until we get something we 2313 * can resolve in the namecache. 2314 */ 2315 vref(dvp); 2316 nch.mount = dvp->v_mount; 2317 nch.ncp = NULL; 2318 fakename = NULL; 2319 2320 for (;;) { 2321 if (fakename) { 2322 kfree(fakename, M_TEMP); 2323 fakename = NULL; 2324 } 2325 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2326 &fakename); 2327 if (error) { 2328 vrele(dvp); 2329 break; 2330 } 2331 vn_unlock(pvp); 2332 spin_lock_shared(&pvp->v_spin); 2333 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2334 _cache_hold(nch.ncp); 2335 spin_unlock_shared(&pvp->v_spin); 2336 vrele(pvp); 2337 break; 2338 } 2339 spin_unlock_shared(&pvp->v_spin); 2340 if (pvp->v_flag & VROOT) { 2341 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2342 error = cache_resolve_mp(nch.mount); 2343 _cache_unlock(nch.ncp); 2344 vrele(pvp); 2345 if (error) { 2346 _cache_drop(nch.ncp); 2347 nch.ncp = NULL; 2348 vrele(dvp); 2349 } 2350 break; 2351 } 2352 vrele(dvp); 2353 dvp = pvp; 2354 } 2355 if (error == 0) { 2356 if (last_fromdvp_report != time_uptime) { 2357 last_fromdvp_report = time_uptime; 2358 kprintf("Warning: extremely inefficient path " 2359 "resolution on %s\n", 2360 nch.ncp->nc_name); 2361 } 2362 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2363 2364 /* 2365 * Hopefully dvp now has a namecache record associated with 2366 * it. Leave it referenced to prevent the kernel from 2367 * recycling the vnode. Otherwise extremely long directory 2368 * paths could result in endless recycling. 2369 */ 2370 if (*saved_dvp) 2371 vrele(*saved_dvp); 2372 *saved_dvp = dvp; 2373 _cache_drop(nch.ncp); 2374 } 2375 if (fakename) 2376 kfree(fakename, M_TEMP); 2377 return (error); 2378 } 2379 2380 /* 2381 * Do an inefficient scan of the directory represented by ncp looking for 2382 * the directory vnode dvp. ncp must be held but not locked on entry and 2383 * will be held on return. dvp must be refd but not locked on entry and 2384 * will remain refd on return. 2385 * 2386 * Why do this at all? 
Well, due to its stateless nature the NFS server
2387 * converts file handles directly to vnodes without necessarily going through
2388 * the namecache ops that would otherwise create the namecache topology
2389 * leading to the vnode. We could either (1) Change the namecache algorithms
2390 * to allow disconnected namecache records that are re-merged opportunistically,
2391 * or (2) Make the NFS server backtrack and scan to recover a connected
2392 * namecache topology in order to then be able to issue new API lookups.
2393 *
2394 * It turns out that (1) is a huge mess. It takes a nice clean set of
2395 * namecache algorithms and introduces a lot of complication in every subsystem
2396 * that calls into the namecache to deal with the re-merge case, especially
2397 * since we are using the namecache to placehold negative lookups and the
2398 * vnode might not be immediately assigned. (2) is certainly far less
2399 * efficient than (1), but since we are only talking about directories here
2400 * (which are likely to remain cached), the case does not actually run all
2401 * that often and has the supreme advantage of not polluting the namecache
2402 * algorithms.
2403 *
2404 * If a fakename is supplied just construct a namecache entry using the
2405 * fake name.
2406 */
2407 static int
2408 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2409 struct vnode *dvp, char *fakename)
2410 {
2411 struct nlcomponent nlc;
2412 struct nchandle rncp;
2413 struct dirent *den;
2414 struct vnode *pvp;
2415 struct vattr vat;
2416 struct iovec iov;
2417 struct uio uio;
2418 int blksize;
2419 int eofflag;
2420 int bytes;
2421 char *rbuf;
2422 int error;
2423
2424 vat.va_blocksize = 0;
2425 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2426 return (error);
2427 cache_lock(nch);
2428 error = cache_vref(nch, cred, &pvp);
2429 cache_unlock(nch);
2430 if (error)
2431 return (error);
2432 if (ncvp_debug) {
2433 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2434 "vattr fileid = %lld\n",
2435 nch->ncp, nch->ncp->nc_name,
2436 vat.va_blocksize,
2437 (long long)vat.va_fileid);
2438 }
2439
2440 /*
2441 * Use the supplied fakename if not NULL. Fake names are typically
2442 * not in the actual filesystem hierarchy. This is used by HAMMER
2443 * to glue @@timestamp recursions together.
2444 */ 2445 if (fakename) { 2446 nlc.nlc_nameptr = fakename; 2447 nlc.nlc_namelen = strlen(fakename); 2448 rncp = cache_nlookup(nch, &nlc); 2449 goto done; 2450 } 2451 2452 if ((blksize = vat.va_blocksize) == 0) 2453 blksize = DEV_BSIZE; 2454 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2455 rncp.ncp = NULL; 2456 2457 eofflag = 0; 2458 uio.uio_offset = 0; 2459 again: 2460 iov.iov_base = rbuf; 2461 iov.iov_len = blksize; 2462 uio.uio_iov = &iov; 2463 uio.uio_iovcnt = 1; 2464 uio.uio_resid = blksize; 2465 uio.uio_segflg = UIO_SYSSPACE; 2466 uio.uio_rw = UIO_READ; 2467 uio.uio_td = curthread; 2468 2469 if (ncvp_debug >= 2) 2470 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2471 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2472 if (error == 0) { 2473 den = (struct dirent *)rbuf; 2474 bytes = blksize - uio.uio_resid; 2475 2476 while (bytes > 0) { 2477 if (ncvp_debug >= 2) { 2478 kprintf("cache_inefficient_scan: %*.*s\n", 2479 den->d_namlen, den->d_namlen, 2480 den->d_name); 2481 } 2482 if (den->d_type != DT_WHT && 2483 den->d_ino == vat.va_fileid) { 2484 if (ncvp_debug) { 2485 kprintf("cache_inefficient_scan: " 2486 "MATCHED inode %lld path %s/%*.*s\n", 2487 (long long)vat.va_fileid, 2488 nch->ncp->nc_name, 2489 den->d_namlen, den->d_namlen, 2490 den->d_name); 2491 } 2492 nlc.nlc_nameptr = den->d_name; 2493 nlc.nlc_namelen = den->d_namlen; 2494 rncp = cache_nlookup(nch, &nlc); 2495 KKASSERT(rncp.ncp != NULL); 2496 break; 2497 } 2498 bytes -= _DIRENT_DIRSIZ(den); 2499 den = _DIRENT_NEXT(den); 2500 } 2501 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2502 goto again; 2503 } 2504 kfree(rbuf, M_TEMP); 2505 done: 2506 vrele(pvp); 2507 if (rncp.ncp) { 2508 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2509 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2510 if (ncvp_debug >= 2) { 2511 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2512 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2513 } 2514 } else { 2515 if (ncvp_debug >= 2) { 2516 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2517 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2518 rncp.ncp->nc_vp); 2519 } 2520 } 2521 if (rncp.ncp->nc_vp == NULL) 2522 error = rncp.ncp->nc_error; 2523 /* 2524 * Release rncp after a successful nlookup. rncp was fully 2525 * referenced. 2526 */ 2527 cache_put(&rncp); 2528 } else { 2529 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2530 dvp, nch->ncp->nc_name); 2531 error = ENOENT; 2532 } 2533 return (error); 2534 } 2535 2536 /* 2537 * This function must be called with the ncp held and locked and will unlock 2538 * and drop it during zapping. 2539 * 2540 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2541 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2542 * and removes the related reference. If the ncp can be removed, and the 2543 * parent can be zapped non-blocking, this function loops up. 2544 * 2545 * There will be one ref from the caller (which we now own). The only 2546 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2547 * so possibly 2 refs left. Taking this into account, if there are no 2548 * additional refs and no children, the ncp will be removed from the topology 2549 * and destroyed. 2550 * 2551 * References and/or children may exist if the ncp is in the middle of the 2552 * topology, preventing the ncp from being destroyed. 2553 * 2554 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
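 *
 * Worked example (illustrative): an unresolved leaf ncp that is still
 * linked to its parent is expected to have nc_refs == 2 here, one ref
 * from the caller and one from the parent/hash linkage, and can be
 * destroyed; any higher count means another thread holds the ncp and
 * the entry is left intact.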
2555 * 2556 * This function may return a held (but NOT locked) parent node which the 2557 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2558 * due to deep namecache trees. 2559 * 2560 * WARNING! For MPSAFE operation this routine must acquire up to three 2561 * spin locks to be able to safely test nc_refs. Lock order is 2562 * very important. 2563 * 2564 * hash spinlock if on hash list 2565 * parent spinlock if child of parent 2566 * (the ncp is unresolved so there is no vnode association) 2567 */ 2568 static void 2569 cache_zap(struct namecache *ncp) 2570 { 2571 struct namecache *par; 2572 struct vnode *dropvp; 2573 struct nchash_head *nchpp; 2574 int refcmp; 2575 int nonblock = 1; /* XXX cleanup */ 2576 2577 again: 2578 /* 2579 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2580 * This gets rid of any vp->v_namecache list or negative list and 2581 * the related ref. 2582 */ 2583 _cache_setunresolved(ncp); 2584 2585 /* 2586 * Try to scrap the entry and possibly tail-recurse on its parent. 2587 * We only scrap unref'd (other then our ref) unresolved entries, 2588 * we do not scrap 'live' entries. 2589 * 2590 * If nc_parent is non NULL we expect 2 references, else just 1. 2591 * If there are more, someone else also holds the ncp and we cannot 2592 * destroy it. 2593 */ 2594 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2595 KKASSERT(ncp->nc_refs > 0); 2596 2597 /* 2598 * If the ncp is linked to its parent it will also be in the hash 2599 * table. We have to be able to lock the parent and the hash table. 2600 * 2601 * Acquire locks. Note that the parent can't go away while we hold 2602 * a child locked. If nc_parent is present, expect 2 refs instead 2603 * of 1. 2604 */ 2605 nchpp = NULL; 2606 if ((par = ncp->nc_parent) != NULL) { 2607 if (nonblock) { 2608 if (_cache_lock_nonblock(par)) { 2609 /* lock failed */ 2610 ncp->nc_flag |= NCF_DEFEREDZAP; 2611 atomic_add_long( 2612 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2613 1); 2614 _cache_unlock(ncp); 2615 _cache_drop(ncp); /* caller's ref */ 2616 return; 2617 } 2618 _cache_hold(par); 2619 } else { 2620 _cache_hold(par); 2621 _cache_lock(par); 2622 } 2623 nchpp = ncp->nc_head; 2624 spin_lock(&nchpp->spin); 2625 } 2626 2627 /* 2628 * With the parent and nchpp locked, and the vnode removed 2629 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2630 * more someone else has a ref and we cannot zap the entry. 2631 * 2632 * one for our hold 2633 * one for our parent link (parent also has one from the linkage) 2634 */ 2635 if (par) 2636 refcmp = 2; 2637 else 2638 refcmp = 1; 2639 2640 /* 2641 * On failure undo the work we've done so far and drop the 2642 * caller's ref and ncp. 2643 */ 2644 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2645 if (par) { 2646 spin_unlock(&nchpp->spin); 2647 _cache_put(par); 2648 } 2649 _cache_unlock(ncp); 2650 _cache_drop(ncp); 2651 return; 2652 } 2653 2654 /* 2655 * We own all the refs and with the spinlocks held no further 2656 * refs can be acquired by others. 2657 * 2658 * Remove us from the hash list and parent list. We have to 2659 * drop a ref on the parent's vp if the parent's list becomes 2660 * empty. 
2661 */ 2662 dropvp = NULL; 2663 if (par) { 2664 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2665 2666 KKASSERT(nchpp == ncp->nc_head); 2667 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2668 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2669 atomic_add_long(&pn->vfscache_count, -1); 2670 if (TAILQ_EMPTY(&ncp->nc_list)) 2671 atomic_add_long(&pn->vfscache_leafs, -1); 2672 2673 if (TAILQ_EMPTY(&par->nc_list)) { 2674 atomic_add_long(&pn->vfscache_leafs, 1); 2675 if (par->nc_vp) 2676 dropvp = par->nc_vp; 2677 } 2678 ncp->nc_parent = NULL; 2679 ncp->nc_head = NULL; 2680 spin_unlock(&nchpp->spin); 2681 _cache_drop(par); /* removal of ncp from par->nc_list */ 2682 /*_cache_unlock(par);*/ 2683 } else { 2684 KKASSERT(ncp->nc_head == NULL); 2685 } 2686 2687 /* 2688 * ncp should not have picked up any refs. Physically 2689 * destroy the ncp. 2690 */ 2691 if (ncp->nc_refs != refcmp) { 2692 panic("cache_zap: %p bad refs %d (expected %d)\n", 2693 ncp, ncp->nc_refs, refcmp); 2694 } 2695 /* _cache_unlock(ncp) not required */ 2696 ncp->nc_refs = -1; /* safety */ 2697 if (ncp->nc_name) 2698 kfree(ncp->nc_name, M_VFSCACHEAUX); 2699 kfree_obj(ncp, M_VFSCACHE); 2700 2701 /* 2702 * Delayed drop (we had to release our spinlocks) 2703 */ 2704 if (dropvp) 2705 vdrop(dropvp); 2706 2707 /* 2708 * Loop up if we can recursively clean out the parent. 2709 */ 2710 if (par) { 2711 refcmp = 1; /* ref on parent */ 2712 if (par->nc_parent) /* par->par */ 2713 ++refcmp; 2714 par->nc_flag &= ~NCF_DEFEREDZAP; 2715 if ((par->nc_flag & NCF_UNRESOLVED) && 2716 par->nc_refs == refcmp && 2717 TAILQ_EMPTY(&par->nc_list)) { 2718 ncp = par; 2719 goto again; 2720 } 2721 _cache_unlock(par); 2722 _cache_drop(par); 2723 } 2724 } 2725 2726 /* 2727 * Clean up dangling negative cache and defered-drop entries in the 2728 * namecache. 2729 * 2730 * This routine is called in the critical path and also called from 2731 * vnlru(). When called from vnlru we use a lower limit to try to 2732 * deal with the negative cache before the critical path has to start 2733 * dealing with it. 2734 */ 2735 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2736 2737 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2738 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2739 2740 void 2741 cache_hysteresis(int critpath) 2742 { 2743 long poslimit; 2744 long neglimit = maxvnodes / ncnegfactor; 2745 long xnumcache = vfscache_leafs; 2746 2747 if (critpath == 0) 2748 neglimit = neglimit * 8 / 10; 2749 2750 /* 2751 * Don't cache too many negative hits. We use hysteresis to reduce 2752 * the impact on the critical path. 2753 */ 2754 switch(neg_cache_hysteresis_state[critpath]) { 2755 case CHI_LOW: 2756 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2757 if (critpath) 2758 _cache_cleanneg(ncnegflush); 2759 else 2760 _cache_cleanneg(ncnegflush + 2761 vfscache_negs - neglimit); 2762 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2763 } 2764 break; 2765 case CHI_HIGH: 2766 if (vfscache_negs > MINNEG * 9 / 10 && 2767 vfscache_negs * 9 / 10 > neglimit 2768 ) { 2769 if (critpath) 2770 _cache_cleanneg(ncnegflush); 2771 else 2772 _cache_cleanneg(ncnegflush + 2773 vfscache_negs * 9 / 10 - 2774 neglimit); 2775 } else { 2776 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2777 } 2778 break; 2779 } 2780 2781 /* 2782 * Don't cache too many positive hits. We use hysteresis to reduce 2783 * the impact on the critical path. 
2784 *
2785 * Excessive positive hits can accumulate due to large numbers of
2786 * hardlinks (the vnode cache will not prevent hl ncps from growing
2787 * into infinity).
2788 */
2789 if ((poslimit = ncposlimit) == 0)
2790 poslimit = maxvnodes * 2;
2791 if (critpath == 0)
2792 poslimit = poslimit * 8 / 10;
2793
2794 switch(pos_cache_hysteresis_state[critpath]) {
2795 case CHI_LOW:
2796 if (xnumcache > poslimit && xnumcache > MINPOS) {
2797 if (critpath)
2798 _cache_cleanpos(ncposflush);
2799 else
2800 _cache_cleanpos(ncposflush +
2801 xnumcache - poslimit);
2802 pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2803 }
2804 break;
2805 case CHI_HIGH:
2806 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2807 if (critpath)
2808 _cache_cleanpos(ncposflush);
2809 else
2810 _cache_cleanpos(ncposflush +
2811 xnumcache - poslimit * 5 / 6);
2812 } else {
2813 pos_cache_hysteresis_state[critpath] = CHI_LOW;
2814 }
2815 break;
2816 }
2817
2818 /*
2819 * Clean out dangling deferred-zap ncps which could not be cleanly
2820 * dropped if too many build up. Note that numdefered is
2821 * heuristic. Make sure we are real-time for the current cpu,
2822 * plus the global rollup.
2823 */
2824 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
2825 _cache_cleandefered();
2826 }
2827 }
2828
2829 /*
2830 * NEW NAMECACHE LOOKUP API
2831 *
2832 * Lookup an entry in the namecache. The passed par_nch must be referenced
2833 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
2834 * is ALWAYS returned, even if the supplied component is illegal.
2835 *
2836 * The resulting namecache entry should be returned to the system with
2837 * cache_put() or cache_unlock() + cache_drop().
2838 *
2839 * namecache locks are recursive but care must be taken to avoid lock order
2840 * reversals (hence why the passed par_nch must be unlocked). Lock ordering
2841 * applies to parent traversals, not to child traversals.
2842 *
2843 * Nobody else will be able to manipulate the associated namespace (e.g.
2844 * create, delete, rename, rename-target) until the caller unlocks the
2845 * entry.
2846 *
2847 * The returned entry will be in one of three states: positive hit (non-null
2848 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2849 * Unresolved entries must be resolved through the filesystem to associate the
2850 * vnode and/or determine whether a positive or negative hit has occurred.
2851 *
2852 * It is not necessary to lock a directory in order to lock namespace under
2853 * that directory. In fact, it is explicitly not allowed to do that. A
2854 * directory is typically only locked when being created, renamed, or
2855 * destroyed.
2856 *
2857 * The directory (par) may be unresolved, in which case any returned child
2858 * will likely also be marked unresolved. Likely but not guaranteed. Since
2859 * the filesystem lookup requires a resolved directory vnode the caller is
2860 * responsible for resolving the namecache chain top-down. This API
2861 * specifically allows whole chains to be created in an unresolved state.
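 *
 * Typical lookup/resolve sketch (illustrative only; error handling
 * elided), with par_nch referenced and unlocked and nlc describing a
 * single path component:
 *
 *	struct nchandle nch;
 *
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	else
 *		error = nch.ncp->nc_error;
 *	...
 *	cache_put(&nch);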
2862 */ 2863 struct nchandle 2864 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2865 { 2866 struct nchandle nch; 2867 struct namecache *ncp; 2868 struct namecache *new_ncp; 2869 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 2870 struct nchash_head *nchpp; 2871 struct mount *mp; 2872 u_int32_t hash; 2873 globaldata_t gd; 2874 int par_locked; 2875 int use_excl; 2876 2877 gd = mycpu; 2878 mp = par_nch->mount; 2879 par_locked = 0; 2880 2881 /* 2882 * This is a good time to call it, no ncp's are locked by 2883 * the caller or us. 2884 */ 2885 cache_hysteresis(1); 2886 2887 /* 2888 * Try to locate an existing entry 2889 */ 2890 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2891 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2892 new_ncp = NULL; 2893 use_excl = 0; 2894 nchpp = NCHHASH(hash); 2895 restart: 2896 rep_ncp = NULL; 2897 if (use_excl) 2898 spin_lock(&nchpp->spin); 2899 else 2900 spin_lock_shared(&nchpp->spin); 2901 2902 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 2903 /* 2904 * Break out if we find a matching entry. Note that 2905 * UNRESOLVED entries may match, but DESTROYED entries 2906 * do not. 2907 * 2908 * We may be able to reuse DESTROYED entries that we come 2909 * across, even if the name does not match, as long as 2910 * nc_nlen is correct and the only hold ref is from the nchpp 2911 * list itself. 2912 */ 2913 if (ncp->nc_parent == par_nch->ncp && 2914 ncp->nc_nlen == nlc->nlc_namelen) { 2915 if (ncp->nc_flag & NCF_DESTROYED) { 2916 if (ncp->nc_refs == 1 && rep_ncp == NULL) 2917 rep_ncp = ncp; 2918 continue; 2919 } 2920 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 2921 continue; 2922 _cache_hold(ncp); 2923 if (use_excl) 2924 spin_unlock(&nchpp->spin); 2925 else 2926 spin_unlock_shared(&nchpp->spin); 2927 if (par_locked) { 2928 _cache_unlock(par_nch->ncp); 2929 par_locked = 0; 2930 } 2931 if (_cache_lock_special(ncp) == 0) { 2932 /* 2933 * Successfully locked but we must re-test 2934 * conditions that might have changed since 2935 * we did not have the lock before. 2936 */ 2937 if (ncp->nc_parent != par_nch->ncp || 2938 ncp->nc_nlen != nlc->nlc_namelen || 2939 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2940 ncp->nc_nlen) || 2941 (ncp->nc_flag & NCF_DESTROYED)) { 2942 _cache_put(ncp); 2943 goto restart; 2944 } 2945 _cache_auto_unresolve(mp, ncp); 2946 if (new_ncp) { 2947 _cache_free(new_ncp); 2948 new_ncp = NULL; /* safety */ 2949 } 2950 goto found; 2951 } 2952 _cache_get(ncp); /* cycle the lock to block */ 2953 _cache_put(ncp); 2954 _cache_drop(ncp); 2955 goto restart; 2956 } 2957 } 2958 2959 /* 2960 * We failed to locate the entry, try to resurrect a destroyed 2961 * entry that we did find that is already correctly linked into 2962 * nchpp and the parent. We must re-test conditions after 2963 * successfully locking rep_ncp. 2964 * 2965 * This case can occur under heavy loads due to not being able 2966 * to safely lock the parent in cache_zap(). Nominally a repeated 2967 * create/unlink load, but only the namelen needs to match. 2968 * 2969 * An exclusive lock on the nchpp is required to process this case, 2970 * otherwise a race can cause duplicate entries to be created with 2971 * one cpu reusing a DESTROYED ncp while another creates a new_ncp. 
2972 */ 2973 if (rep_ncp && use_excl) { 2974 if (_cache_lock_nonblock(rep_ncp) == 0) { 2975 _cache_hold(rep_ncp); 2976 if (rep_ncp->nc_parent == par_nch->ncp && 2977 rep_ncp->nc_nlen == nlc->nlc_namelen && 2978 (rep_ncp->nc_flag & NCF_DESTROYED) && 2979 rep_ncp->nc_refs == 2) { 2980 /* 2981 * Update nc_name. 2982 */ 2983 ncp = rep_ncp; 2984 bcopy(nlc->nlc_nameptr, ncp->nc_name, 2985 nlc->nlc_namelen); 2986 2987 /* 2988 * This takes some care. We must clear the 2989 * NCF_DESTROYED flag before unlocking the 2990 * hash chain so other concurrent searches 2991 * do not skip this element. 2992 * 2993 * We must also unlock the hash chain before 2994 * unresolving the ncp to avoid deadlocks. 2995 * We hold the lock on the ncp so we can safely 2996 * reinitialize nc_flag after that. 2997 */ 2998 ncp->nc_flag &= ~NCF_DESTROYED; 2999 spin_unlock(&nchpp->spin); /* use_excl */ 3000 3001 _cache_setunresolved(ncp); 3002 ncp->nc_flag = NCF_UNRESOLVED; 3003 ncp->nc_error = ENOTCONN; 3004 if (par_locked) { 3005 _cache_unlock(par_nch->ncp); 3006 par_locked = 0; 3007 } 3008 if (new_ncp) { 3009 _cache_free(new_ncp); 3010 new_ncp = NULL; /* safety */ 3011 } 3012 goto found; 3013 } 3014 _cache_put(rep_ncp); 3015 } 3016 } 3017 3018 /* 3019 * Otherwise create a new entry and add it to the cache. The parent 3020 * ncp must also be locked so we can link into it. 3021 * 3022 * We have to relookup after possibly blocking in kmalloc or 3023 * when locking par_nch. 3024 * 3025 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3026 * mount case, in which case nc_name will be NULL. 3027 * 3028 * NOTE: In the rep_ncp != NULL case we are trying to reuse 3029 * a DESTROYED entry, but didn't have an exclusive lock. 3030 * In this situation we do not create a new_ncp. 3031 */ 3032 if (new_ncp == NULL) { 3033 if (use_excl) 3034 spin_unlock(&nchpp->spin); 3035 else 3036 spin_unlock_shared(&nchpp->spin); 3037 if (rep_ncp == NULL) { 3038 new_ncp = cache_alloc(nlc->nlc_namelen); 3039 if (nlc->nlc_namelen) { 3040 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3041 nlc->nlc_namelen); 3042 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3043 } 3044 } 3045 use_excl = 1; 3046 goto restart; 3047 } 3048 3049 /* 3050 * NOTE! The spinlock is held exclusively here because new_ncp 3051 * is non-NULL. 3052 */ 3053 if (par_locked == 0) { 3054 spin_unlock(&nchpp->spin); 3055 _cache_lock(par_nch->ncp); 3056 par_locked = 1; 3057 goto restart; 3058 } 3059 3060 /* 3061 * Link to parent (requires another ref, the one already in new_ncp 3062 * is what we wil lreturn). 3063 * 3064 * WARNING! We still hold the spinlock. We have to set the hash 3065 * table entry atomically. 3066 */ 3067 ncp = new_ncp; 3068 ++ncp->nc_refs; 3069 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3070 spin_unlock(&nchpp->spin); 3071 _cache_unlock(par_nch->ncp); 3072 /* par_locked = 0 - not used */ 3073 found: 3074 /* 3075 * stats and namecache size management 3076 */ 3077 if (ncp->nc_flag & NCF_UNRESOLVED) 3078 ++gd->gd_nchstats->ncs_miss; 3079 else if (ncp->nc_vp) 3080 ++gd->gd_nchstats->ncs_goodhits; 3081 else 3082 ++gd->gd_nchstats->ncs_neghits; 3083 nch.mount = mp; 3084 nch.ncp = ncp; 3085 _cache_mntref(nch.mount); 3086 3087 return(nch); 3088 } 3089 3090 /* 3091 * Attempt to lookup a namecache entry and return with a shared namecache 3092 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 3093 * set or we are unable to lock. 
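 *
 * Illustrative fallback pattern (not taken verbatim from any caller):
 *
 *	error = cache_nlookup_maybe_shared(&par_nch, &nlc, 0, &nch);
 *	if (error == EWOULDBLOCK)
 *		nch = cache_nlookup(&par_nch, &nlc);	(blocking fallback)
 *
 * On any return other than EWOULDBLOCK the shared-locked result in 'nch'
 * must eventually be released with cache_put().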
3094 */ 3095 int 3096 cache_nlookup_maybe_shared(struct nchandle *par_nch, 3097 struct nlcomponent *nlc, 3098 int excl, struct nchandle *res_nch) 3099 { 3100 struct namecache *ncp; 3101 struct nchash_head *nchpp; 3102 struct mount *mp; 3103 u_int32_t hash; 3104 globaldata_t gd; 3105 3106 /* 3107 * If exclusive requested or shared namecache locks are disabled, 3108 * return failure. 3109 */ 3110 if (ncp_shared_lock_disable || excl) 3111 return(EWOULDBLOCK); 3112 3113 gd = mycpu; 3114 mp = par_nch->mount; 3115 3116 /* 3117 * This is a good time to call it, no ncp's are locked by 3118 * the caller or us. 3119 */ 3120 cache_hysteresis(1); 3121 3122 /* 3123 * Try to locate an existing entry 3124 */ 3125 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3126 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3127 nchpp = NCHHASH(hash); 3128 3129 spin_lock_shared(&nchpp->spin); 3130 3131 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3132 /* 3133 * Break out if we find a matching entry. Note that 3134 * UNRESOLVED entries may match, but DESTROYED entries 3135 * do not. 3136 */ 3137 if (ncp->nc_parent == par_nch->ncp && 3138 ncp->nc_nlen == nlc->nlc_namelen && 3139 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3140 (ncp->nc_flag & NCF_DESTROYED) == 0 3141 ) { 3142 _cache_hold(ncp); 3143 spin_unlock_shared(&nchpp->spin); 3144 3145 if (_cache_lock_shared_special(ncp) == 0) { 3146 if (ncp->nc_parent == par_nch->ncp && 3147 ncp->nc_nlen == nlc->nlc_namelen && 3148 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3149 ncp->nc_nlen) == 0 && 3150 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3151 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3152 _cache_auto_unresolve_test(mp, ncp) == 0) { 3153 goto found; 3154 } 3155 _cache_unlock(ncp); 3156 } 3157 _cache_drop(ncp); 3158 return(EWOULDBLOCK); 3159 } 3160 } 3161 3162 /* 3163 * Failure 3164 */ 3165 spin_unlock_shared(&nchpp->spin); 3166 return(EWOULDBLOCK); 3167 3168 /* 3169 * Success 3170 * 3171 * Note that nc_error might be non-zero (e.g ENOENT). 3172 */ 3173 found: 3174 res_nch->mount = mp; 3175 res_nch->ncp = ncp; 3176 ++gd->gd_nchstats->ncs_goodhits; 3177 _cache_mntref(res_nch->mount); 3178 3179 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3180 return(ncp->nc_error); 3181 } 3182 3183 /* 3184 * This is a non-blocking verison of cache_nlookup() used by 3185 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3186 * will return nch.ncp == NULL in that case. 3187 */ 3188 struct nchandle 3189 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3190 { 3191 struct nchandle nch; 3192 struct namecache *ncp; 3193 struct namecache *new_ncp; 3194 struct nchash_head *nchpp; 3195 struct mount *mp; 3196 u_int32_t hash; 3197 globaldata_t gd; 3198 int par_locked; 3199 3200 gd = mycpu; 3201 mp = par_nch->mount; 3202 par_locked = 0; 3203 3204 /* 3205 * Try to locate an existing entry 3206 */ 3207 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3208 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3209 new_ncp = NULL; 3210 nchpp = NCHHASH(hash); 3211 restart: 3212 spin_lock(&nchpp->spin); 3213 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3214 /* 3215 * Break out if we find a matching entry. Note that 3216 * UNRESOLVED entries may match, but DESTROYED entries 3217 * do not. 
3218 */ 3219 if (ncp->nc_parent == par_nch->ncp && 3220 ncp->nc_nlen == nlc->nlc_namelen && 3221 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3222 (ncp->nc_flag & NCF_DESTROYED) == 0 3223 ) { 3224 _cache_hold(ncp); 3225 spin_unlock(&nchpp->spin); 3226 if (par_locked) { 3227 _cache_unlock(par_nch->ncp); 3228 par_locked = 0; 3229 } 3230 if (_cache_lock_special(ncp) == 0) { 3231 if (ncp->nc_parent != par_nch->ncp || 3232 ncp->nc_nlen != nlc->nlc_namelen || 3233 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3234 (ncp->nc_flag & NCF_DESTROYED)) { 3235 kprintf("cache_lookup_nonblock: " 3236 "ncp-race %p %*.*s\n", 3237 ncp, 3238 nlc->nlc_namelen, 3239 nlc->nlc_namelen, 3240 nlc->nlc_nameptr); 3241 _cache_unlock(ncp); 3242 _cache_drop(ncp); 3243 goto failed; 3244 } 3245 _cache_auto_unresolve(mp, ncp); 3246 if (new_ncp) { 3247 _cache_free(new_ncp); 3248 new_ncp = NULL; 3249 } 3250 goto found; 3251 } 3252 _cache_drop(ncp); 3253 goto failed; 3254 } 3255 } 3256 3257 /* 3258 * We failed to locate an entry, create a new entry and add it to 3259 * the cache. The parent ncp must also be locked so we 3260 * can link into it. 3261 * 3262 * We have to relookup after possibly blocking in kmalloc or 3263 * when locking par_nch. 3264 * 3265 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3266 * mount case, in which case nc_name will be NULL. 3267 */ 3268 if (new_ncp == NULL) { 3269 spin_unlock(&nchpp->spin); 3270 new_ncp = cache_alloc(nlc->nlc_namelen); 3271 if (nlc->nlc_namelen) { 3272 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3273 nlc->nlc_namelen); 3274 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3275 } 3276 goto restart; 3277 } 3278 if (par_locked == 0) { 3279 spin_unlock(&nchpp->spin); 3280 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3281 par_locked = 1; 3282 goto restart; 3283 } 3284 goto failed; 3285 } 3286 3287 /* 3288 * Link to parent (requires another ref, the one already in new_ncp 3289 * is what we wil lreturn). 3290 * 3291 * WARNING! We still hold the spinlock. We have to set the hash 3292 * table entry atomically. 3293 */ 3294 ncp = new_ncp; 3295 ++ncp->nc_refs; 3296 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3297 spin_unlock(&nchpp->spin); 3298 _cache_unlock(par_nch->ncp); 3299 /* par_locked = 0 - not used */ 3300 found: 3301 /* 3302 * stats and namecache size management 3303 */ 3304 if (ncp->nc_flag & NCF_UNRESOLVED) 3305 ++gd->gd_nchstats->ncs_miss; 3306 else if (ncp->nc_vp) 3307 ++gd->gd_nchstats->ncs_goodhits; 3308 else 3309 ++gd->gd_nchstats->ncs_neghits; 3310 nch.mount = mp; 3311 nch.ncp = ncp; 3312 _cache_mntref(nch.mount); 3313 3314 return(nch); 3315 failed: 3316 if (new_ncp) { 3317 _cache_free(new_ncp); 3318 new_ncp = NULL; 3319 } 3320 nch.mount = NULL; 3321 nch.ncp = NULL; 3322 return(nch); 3323 } 3324 3325 /* 3326 * This version is non-locking. The caller must validate the result 3327 * for parent-to-child continuity. 3328 * 3329 * It can fail for any reason and will return nch.ncp == NULL in that case. 
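 *
 * Illustrative sketch of the expected caller pattern:
 *
 *	nch = cache_nlookup_nonlocked(&par_nch, &nlc);
 *	if (nch.ncp == NULL) {
 *		...fall back to the locked cache_nlookup() path...
 *	} else {
 *		...validate parent-to-child continuity, use the held,
 *		   unlocked ncp, then cache_drop(&nch)...
 *	}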
3330 */ 3331 struct nchandle 3332 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3333 { 3334 struct nchandle nch; 3335 struct namecache *ncp; 3336 struct nchash_head *nchpp; 3337 struct mount *mp; 3338 u_int32_t hash; 3339 globaldata_t gd; 3340 3341 gd = mycpu; 3342 mp = par_nch->mount; 3343 3344 /* 3345 * Try to locate an existing entry 3346 */ 3347 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3348 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3349 nchpp = NCHHASH(hash); 3350 3351 spin_lock_shared(&nchpp->spin); 3352 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3353 /* 3354 * Break out if we find a matching entry. Note that 3355 * UNRESOLVED entries may match, but DESTROYED entries 3356 * do not. 3357 * 3358 * Resolved NFS entries which have timed out fail so the 3359 * caller can rerun with normal locking. 3360 */ 3361 if (ncp->nc_parent == par_nch->ncp && 3362 ncp->nc_nlen == nlc->nlc_namelen && 3363 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3364 (ncp->nc_flag & NCF_DESTROYED) == 0 3365 ) { 3366 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3367 break; 3368 _cache_hold(ncp); 3369 spin_unlock_shared(&nchpp->spin); 3370 goto found; 3371 } 3372 } 3373 spin_unlock_shared(&nchpp->spin); 3374 nch.mount = NULL; 3375 nch.ncp = NULL; 3376 return nch; 3377 found: 3378 /* 3379 * stats and namecache size management 3380 */ 3381 if (ncp->nc_flag & NCF_UNRESOLVED) 3382 ++gd->gd_nchstats->ncs_miss; 3383 else if (ncp->nc_vp) 3384 ++gd->gd_nchstats->ncs_goodhits; 3385 else 3386 ++gd->gd_nchstats->ncs_neghits; 3387 nch.mount = mp; 3388 nch.ncp = ncp; 3389 _cache_mntref(nch.mount); 3390 3391 return(nch); 3392 } 3393 3394 /* 3395 * The namecache entry is marked as being used as a mount point. 3396 * Locate the mount if it is visible to the caller. The DragonFly 3397 * mount system allows arbitrary loops in the topology and disentangles 3398 * those loops by matching against (mp, ncp) rather than just (ncp). 3399 * This means any given ncp can dive any number of mounts, depending 3400 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3401 * 3402 * We use a very simple frontend cache to reduce SMP conflicts, 3403 * which we have to do because the mountlist scan needs an exclusive 3404 * lock around its ripout info list. Not to mention that there might 3405 * be a lot of mounts. 3406 * 3407 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3408 * down a bit to allow some contention rather than making the cache 3409 * excessively huge. 3410 * 3411 * The hash table is split into per-cpu areas, is 4-way set-associative. 
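 *
 * Indexing sketch (derived from the defines above): ncmount_cache_lookup4()
 * hashes (mp, ncp) with iscsi_crc32() and masks the result with
 * (NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1), so the hash selects the base
 * of one of NCMOUNT_NUMCACHE / NCMOUNT_SET sets, each holding NCMOUNT_SET
 * consecutive entries. ncmount_cache_lookup() then scans that set linearly
 * for an exact (mp, ncp) match and otherwise returns the stalest entry in
 * the set (largest ticks delta) for reuse.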
3412 */ 3413 struct findmount_info { 3414 struct mount *result; 3415 struct mount *nch_mount; 3416 struct namecache *nch_ncp; 3417 }; 3418 3419 static __inline 3420 struct ncmount_cache * 3421 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3422 { 3423 uint32_t hash; 3424 3425 hash = iscsi_crc32(&mp, sizeof(mp)); 3426 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3427 hash ^= hash >> 16; 3428 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3429 3430 return (&ncmount_cache[hash]); 3431 } 3432 3433 static 3434 struct ncmount_cache * 3435 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3436 { 3437 struct ncmount_cache *ncc; 3438 struct ncmount_cache *best; 3439 int delta; 3440 int best_delta; 3441 int i; 3442 3443 ncc = ncmount_cache_lookup4(mp, ncp); 3444 3445 /* 3446 * NOTE: When checking for a ticks overflow implement a slop of 3447 * 2 ticks just to be safe, because ticks is accessed 3448 * non-atomically one CPU can increment it while another 3449 * is still using the old value. 3450 */ 3451 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3452 return ncc; 3453 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3454 if (delta < -2) /* overflow reset */ 3455 ncc->ticks = ticks; 3456 best = ncc; 3457 best_delta = delta; 3458 3459 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3460 ++ncc; 3461 if (ncc->ncp == ncp && ncc->mp == mp) 3462 return ncc; 3463 delta = (int)(ticks - ncc->ticks); 3464 if (delta < -2) 3465 ncc->ticks = ticks; 3466 if (delta > best_delta) { 3467 best_delta = delta; 3468 best = ncc; 3469 } 3470 } 3471 return best; 3472 } 3473 3474 /* 3475 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3476 * doing an expensive mountlist_scan*() if possible. 3477 * 3478 * (mp, ncp) -> mountonpt.k 3479 * 3480 * Returns a referenced mount pointer or NULL 3481 * 3482 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3483 * operations (that is, where the mp_target can be freed out from under us). 3484 * 3485 * Lookups use the ncc->updating counter to validate the contents in order 3486 * to avoid having to obtain the per cache-element spin-lock. In addition, 3487 * the ticks field is only updated when it changes. However, if our per-cpu 3488 * lock fails due to an unmount-in-progress, we fall-back to the 3489 * cache-element's spin-lock. 3490 */ 3491 struct mount * 3492 cache_findmount(struct nchandle *nch) 3493 { 3494 struct findmount_info info; 3495 struct ncmount_cache *ncc; 3496 struct ncmount_cache ncc_copy; 3497 struct mount *target; 3498 struct pcpu_ncache *pcpu; 3499 struct spinlock *spinlk; 3500 int update; 3501 3502 pcpu = pcpu_ncache; 3503 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3504 ncc = NULL; 3505 goto skip; 3506 } 3507 pcpu += mycpu->gd_cpuid; 3508 3509 again: 3510 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3511 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3512 found: 3513 /* 3514 * This is a bit messy for now because we do not yet have 3515 * safe disposal of mount structures. We have to ref 3516 * ncc->mp_target but the 'update' counter only tell us 3517 * whether the cache has changed after the fact. 3518 * 3519 * For now get a per-cpu spinlock that will only contend 3520 * against umount's. This is the best path. If it fails, 3521 * instead of waiting on the umount we fall-back to a 3522 * shared ncc->spin lock, which will generally only cost a 3523 * cache ping-pong. 
3524 */ 3525 update = ncc->updating; 3526 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3527 spinlk = &pcpu->umount_spin; 3528 } else { 3529 spinlk = &ncc->spin; 3530 spin_lock_shared(spinlk); 3531 } 3532 if (update & 1) { /* update in progress */ 3533 spin_unlock_any(spinlk); 3534 goto skip; 3535 } 3536 ncc_copy = *ncc; 3537 cpu_lfence(); 3538 if (ncc->updating != update) { /* content changed */ 3539 spin_unlock_any(spinlk); 3540 goto again; 3541 } 3542 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3543 spin_unlock_any(spinlk); 3544 goto again; 3545 } 3546 if (ncc_copy.isneg == 0) { 3547 target = ncc_copy.mp_target; 3548 if (target->mnt_ncmounton.mount == nch->mount && 3549 target->mnt_ncmounton.ncp == nch->ncp) { 3550 /* 3551 * Cache hit (positive) (avoid dirtying 3552 * the cache line if possible) 3553 */ 3554 if (ncc->ticks != (int)ticks) 3555 ncc->ticks = (int)ticks; 3556 _cache_mntref(target); 3557 } 3558 } else { 3559 /* 3560 * Cache hit (negative) (avoid dirtying 3561 * the cache line if possible) 3562 */ 3563 if (ncc->ticks != (int)ticks) 3564 ncc->ticks = (int)ticks; 3565 target = NULL; 3566 } 3567 spin_unlock_any(spinlk); 3568 3569 return target; 3570 } 3571 skip: 3572 3573 /* 3574 * Slow 3575 */ 3576 info.result = NULL; 3577 info.nch_mount = nch->mount; 3578 info.nch_ncp = nch->ncp; 3579 mountlist_scan(cache_findmount_callback, &info, 3580 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3581 3582 /* 3583 * To reduce multi-re-entry on the cache, relookup in the cache. 3584 * This can still race, obviously, but that's ok. 3585 */ 3586 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3587 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3588 if (info.result) 3589 atomic_add_int(&info.result->mnt_refs, -1); 3590 goto found; 3591 } 3592 3593 /* 3594 * Cache the result. 3595 */ 3596 if ((info.result == NULL || 3597 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3598 spin_lock(&ncc->spin); 3599 atomic_add_int_nonlocked(&ncc->updating, 1); 3600 cpu_sfence(); 3601 KKASSERT(ncc->updating & 1); 3602 if (ncc->mp != nch->mount) { 3603 if (ncc->mp) 3604 atomic_add_int(&ncc->mp->mnt_refs, -1); 3605 atomic_add_int(&nch->mount->mnt_refs, 1); 3606 ncc->mp = nch->mount; 3607 } 3608 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3609 ncc->ticks = (int)ticks; 3610 3611 if (info.result) { 3612 ncc->isneg = 0; 3613 if (ncc->mp_target != info.result) { 3614 if (ncc->mp_target) 3615 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3616 ncc->mp_target = info.result; 3617 atomic_add_int(&info.result->mnt_refs, 1); 3618 } 3619 } else { 3620 ncc->isneg = 1; 3621 if (ncc->mp_target) { 3622 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3623 ncc->mp_target = NULL; 3624 } 3625 } 3626 cpu_sfence(); 3627 atomic_add_int_nonlocked(&ncc->updating, 1); 3628 spin_unlock(&ncc->spin); 3629 } 3630 return(info.result); 3631 } 3632 3633 static 3634 int 3635 cache_findmount_callback(struct mount *mp, void *data) 3636 { 3637 struct findmount_info *info = data; 3638 3639 /* 3640 * Check the mount's mounted-on point against the passed nch. 3641 */ 3642 if (mp->mnt_ncmounton.mount == info->nch_mount && 3643 mp->mnt_ncmounton.ncp == info->nch_ncp 3644 ) { 3645 info->result = mp; 3646 _cache_mntref(mp); 3647 return(-1); 3648 } 3649 return(0); 3650 } 3651 3652 void 3653 cache_dropmount(struct mount *mp) 3654 { 3655 _cache_mntrel(mp); 3656 } 3657 3658 /* 3659 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3660 * or negative). 
3661 * 3662 * A full scan is not required, but for now just do it anyway. 3663 */ 3664 void 3665 cache_ismounting(struct mount *mp) 3666 { 3667 struct ncmount_cache *ncc; 3668 struct mount *ncc_mp; 3669 int i; 3670 3671 if (pcpu_ncache == NULL) 3672 return; 3673 3674 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3675 ncc = &ncmount_cache[i]; 3676 if (ncc->mp != mp->mnt_ncmounton.mount || 3677 ncc->ncp != mp->mnt_ncmounton.ncp) { 3678 continue; 3679 } 3680 spin_lock(&ncc->spin); 3681 atomic_add_int_nonlocked(&ncc->updating, 1); 3682 cpu_sfence(); 3683 KKASSERT(ncc->updating & 1); 3684 if (ncc->mp != mp->mnt_ncmounton.mount || 3685 ncc->ncp != mp->mnt_ncmounton.ncp) { 3686 cpu_sfence(); 3687 ++ncc->updating; 3688 spin_unlock(&ncc->spin); 3689 continue; 3690 } 3691 ncc_mp = ncc->mp; 3692 ncc->ncp = NULL; 3693 ncc->mp = NULL; 3694 if (ncc_mp) 3695 atomic_add_int(&ncc_mp->mnt_refs, -1); 3696 ncc_mp = ncc->mp_target; 3697 ncc->mp_target = NULL; 3698 if (ncc_mp) 3699 atomic_add_int(&ncc_mp->mnt_refs, -1); 3700 ncc->ticks = (int)ticks - hz * 120; 3701 3702 cpu_sfence(); 3703 atomic_add_int_nonlocked(&ncc->updating, 1); 3704 spin_unlock(&ncc->spin); 3705 } 3706 3707 /* 3708 * Pre-cache the mount point 3709 */ 3710 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3711 mp->mnt_ncmounton.ncp); 3712 3713 spin_lock(&ncc->spin); 3714 atomic_add_int_nonlocked(&ncc->updating, 1); 3715 cpu_sfence(); 3716 KKASSERT(ncc->updating & 1); 3717 3718 if (ncc->mp) 3719 atomic_add_int(&ncc->mp->mnt_refs, -1); 3720 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3721 ncc->mp = mp->mnt_ncmounton.mount; 3722 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3723 ncc->ticks = (int)ticks; 3724 3725 ncc->isneg = 0; 3726 if (ncc->mp_target != mp) { 3727 if (ncc->mp_target) 3728 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3729 ncc->mp_target = mp; 3730 atomic_add_int(&mp->mnt_refs, 1); 3731 } 3732 cpu_sfence(); 3733 atomic_add_int_nonlocked(&ncc->updating, 1); 3734 spin_unlock(&ncc->spin); 3735 } 3736 3737 /* 3738 * Scrap any ncmount_cache entries related to mp. Not only do we need to 3739 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3740 * negative hits involving (mp, <any>). 3741 * 3742 * A full scan is required. 
3743 */ 3744 void 3745 cache_unmounting(struct mount *mp) 3746 { 3747 struct ncmount_cache *ncc; 3748 struct pcpu_ncache *pcpu; 3749 struct mount *ncc_mp; 3750 int i; 3751 3752 pcpu = pcpu_ncache; 3753 if (pcpu == NULL) 3754 return; 3755 3756 for (i = 0; i < ncpus; ++i) 3757 spin_lock(&pcpu[i].umount_spin); 3758 3759 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3760 ncc = &ncmount_cache[i]; 3761 if (ncc->mp != mp && ncc->mp_target != mp) 3762 continue; 3763 spin_lock(&ncc->spin); 3764 atomic_add_int_nonlocked(&ncc->updating, 1); 3765 cpu_sfence(); 3766 3767 if (ncc->mp != mp && ncc->mp_target != mp) { 3768 atomic_add_int_nonlocked(&ncc->updating, 1); 3769 cpu_sfence(); 3770 spin_unlock(&ncc->spin); 3771 continue; 3772 } 3773 ncc_mp = ncc->mp; 3774 ncc->ncp = NULL; 3775 ncc->mp = NULL; 3776 if (ncc_mp) 3777 atomic_add_int(&ncc_mp->mnt_refs, -1); 3778 ncc_mp = ncc->mp_target; 3779 ncc->mp_target = NULL; 3780 if (ncc_mp) 3781 atomic_add_int(&ncc_mp->mnt_refs, -1); 3782 ncc->ticks = (int)ticks - hz * 120; 3783 3784 cpu_sfence(); 3785 atomic_add_int_nonlocked(&ncc->updating, 1); 3786 spin_unlock(&ncc->spin); 3787 } 3788 3789 for (i = 0; i < ncpus; ++i) 3790 spin_unlock(&pcpu[i].umount_spin); 3791 } 3792 3793 /* 3794 * Resolve an unresolved namecache entry, generally by looking it up. 3795 * The passed ncp must be locked and refd. 3796 * 3797 * Theoretically since a vnode cannot be recycled while held, and since 3798 * the nc_parent chain holds its vnode as long as children exist, the 3799 * direct parent of the cache entry we are trying to resolve should 3800 * have a valid vnode. If not then generate an error that we can 3801 * determine is related to a resolver bug. 3802 * 3803 * However, if a vnode was in the middle of a recyclement when the NCP 3804 * got locked, ncp->nc_vp might point to a vnode that is about to become 3805 * invalid. cache_resolve() handles this case by unresolving the entry 3806 * and then re-resolving it. 3807 * 3808 * Note that successful resolution does not necessarily return an error 3809 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3810 * will be returned. 3811 */ 3812 int 3813 cache_resolve(struct nchandle *nch, struct ucred *cred) 3814 { 3815 struct namecache *par_tmp; 3816 struct namecache *par; 3817 struct namecache *ncp; 3818 struct nchandle nctmp; 3819 struct mount *mp; 3820 struct vnode *dvp; 3821 int error; 3822 3823 ncp = nch->ncp; 3824 mp = nch->mount; 3825 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3826 restart: 3827 /* 3828 * If the ncp is already resolved we have nothing to do. However, 3829 * we do want to guarentee that a usable vnode is returned when 3830 * a vnode is present, so make sure it hasn't been reclaimed. 3831 */ 3832 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3833 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3834 _cache_setunresolved(ncp); 3835 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3836 return (ncp->nc_error); 3837 } 3838 3839 /* 3840 * If the ncp was destroyed it will never resolve again. This 3841 * can basically only happen when someone is chdir'd into an 3842 * empty directory which is then rmdir'd. We want to catch this 3843 * here and not dive the VFS because the VFS might actually 3844 * have a way to re-resolve the disconnected ncp, which will 3845 * result in inconsistencies in the cdir/nch for proc->p_fd. 
/*
 * Resolve an unresolved namecache entry, generally by looking it up.
 * The passed ncp must be locked and refd.
 *
 * Theoretically since a vnode cannot be recycled while held, and since
 * the nc_parent chain holds its vnode as long as children exist, the
 * direct parent of the cache entry we are trying to resolve should
 * have a valid vnode.  If not then generate an error that we can
 * determine is related to a resolver bug.
 *
 * However, if a vnode was in the middle of being recycled when the NCP
 * got locked, ncp->nc_vp might point to a vnode that is about to become
 * invalid.  cache_resolve() handles this case by unresolving the entry
 * and then re-resolving it.
 *
 * Note that successful resolution does not necessarily return an error
 * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
 * will be returned.
 */
int
cache_resolve(struct nchandle *nch, struct ucred *cred)
{
	struct namecache *par_tmp;
	struct namecache *par;
	struct namecache *ncp;
	struct nchandle nctmp;
	struct mount *mp;
	struct vnode *dvp;
	int error;

	ncp = nch->ncp;
	mp = nch->mount;
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
restart:
	/*
	 * If the ncp is already resolved we have nothing to do.  However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
			return (ncp->nc_error);
	}

	/*
	 * If the ncp was destroyed it will never resolve again.  This
	 * can basically only happen when someone is chdir'd into an
	 * empty directory which is then rmdir'd.  We want to catch this
	 * here and not dive the VFS because the VFS might actually
	 * have a way to re-resolve the disconnected ncp, which will
	 * result in inconsistencies in the cdir/nch for proc->p_fd.
	 */
	if (ncp->nc_flag & NCF_DESTROYED)
		return(EINVAL);

	/*
	 * Mount points need special handling because the parent does not
	 * belong to the same filesystem as the ncp.
	 */
	if (ncp == mp->mnt_ncmountpt.ncp)
		return (cache_resolve_mp(mp));

	/*
	 * We expect an unbroken chain of ncps to at least the mount point,
	 * and even all the way to root (but this code doesn't have to go
	 * past the mount point).
	 */
	if (ncp->nc_parent == NULL) {
		kprintf("EXDEV case 1 %p %*.*s\n", ncp,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		ncp->nc_error = EXDEV;
		return(ncp->nc_error);
	}

	/*
	 * The vp's of the parent directories in the chain are held via vhold()
	 * due to the existence of the child, and should not disappear.
	 * However, there are cases where they can disappear:
	 *
	 *	- due to filesystem I/O errors.
	 *	- due to NFS being stupid about tracking the namespace and
	 *	  destroying the namespace for entire directories quite often.
	 *	- due to forced unmounts.
	 *	- due to an rmdir (parent will be marked DESTROYED)
	 *
	 * When this occurs we have to track the chain backwards and resolve
	 * it, looping until the resolver catches up to the current node.  We
	 * could recurse here but we might run ourselves out of kernel stack
	 * so we do it in a more painful manner.  This situation really should
	 * not occur all that often, and if it does it should not have to go
	 * back too many nodes to resolve the ncp.
	 */
	while ((dvp = cache_dvpref(ncp)) == NULL) {
		/*
		 * This case can occur if a process is chdir'd into a
		 * directory which is then rmdir'd.  If the parent is marked
		 * destroyed there is no point trying to resolve it.
		 */
		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
			return(ENOENT);
		par = ncp->nc_parent;
		_cache_hold(par);
		_cache_lock(par);
		while ((par_tmp = par->nc_parent) != NULL &&
		       par_tmp->nc_vp == NULL) {
			_cache_hold(par_tmp);
			_cache_lock(par_tmp);
			_cache_put(par);
			par = par_tmp;
		}
		if (par->nc_parent == NULL) {
			kprintf("EXDEV case 2 %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			return (EXDEV);
		}
		/*
		 * The parent is not set in stone, ref and lock it to prevent
		 * it from disappearing.  Also note that due to renames it
		 * is possible for our ncp to move and for par to no longer
		 * be one of its parents.  We resolve it anyway, the loop
		 * will handle any moves.
		 */
		_cache_get(par);	/* additional hold/lock */
		_cache_put(par);	/* from earlier hold/lock */
		if (par == nch->mount->mnt_ncmountpt.ncp) {
			cache_resolve_mp(nch->mount);
		} else if ((dvp = cache_dvpref(par)) == NULL) {
			kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			continue;
		} else {
			if (par->nc_flag & NCF_UNRESOLVED) {
				nctmp.mount = mp;
				nctmp.ncp = par;
				par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
			}
			vrele(dvp);
		}
		if ((error = par->nc_error) != 0) {
			if (par->nc_error != EAGAIN) {
				kprintf("EXDEV case 3 %*.*s error %d\n",
					par->nc_nlen, par->nc_nlen, par->nc_name,
					par->nc_error);
				_cache_put(par);
				return(error);
			}
			kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
				par, par->nc_nlen, par->nc_nlen, par->nc_name);
		}
		_cache_put(par);
		/* loop */
	}

	/*
	 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
	 * ncp's and reattach them.  If this occurs the original ncp is marked
	 * EAGAIN to force a relookup.
	 *
	 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
	 *	 ncp must already be resolved.
	 */
	if (dvp) {
		nctmp.mount = mp;
		nctmp.ncp = ncp;
		ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
		vrele(dvp);
	} else {
		ncp->nc_error = EPERM;
	}
	if (ncp->nc_error == EAGAIN) {
		kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
			ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
		goto restart;
	}
	return(ncp->nc_error);
}
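
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It restates the calling contract documented above:
 * the nchandle must be referenced and exclusively locked by the caller,
 * and a negative hit resolves "successfully" with ENOENT.  The function
 * name is hypothetical.
 */
#if 0
static int
cache_resolve_usage_example(struct nchandle *nch, struct ucred *cred)
{
	int error;

	/* nch is assumed to already be referenced and exclusively locked */
	error = cache_resolve(nch, cred);
	if (error == ENOENT) {
		/* negative hit: the entry resolved but the name is absent */
	} else if (error == 0) {
		/* positive hit: nch->ncp->nc_vp is now usable */
	} else {
		/* hard failure, e.g. EXDEV for a disconnected chain */
	}
	return (error);
}
#endif
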
/*
 * Resolve the ncp associated with a mount point.  Such ncp's almost always
 * remain resolved and this routine is rarely called.  NFS MPs tend to force
 * re-resolution more often due to NFS's Mack-truck-smash-the-namecache
 * method of tracking namespace changes.
 *
 * The semantics for this call are that the passed ncp must be locked on
 * entry and will be locked on return.  However, if we actually have to
 * resolve the mount point we temporarily unlock the entry in order to
 * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
 * the unlock we have to recheck the flags after we relock.
 */
static int
cache_resolve_mp(struct mount *mp)
{
	struct namecache *ncp = mp->mnt_ncmountpt.ncp;
	struct vnode *vp;
	int error;

	KKASSERT(mp != NULL);

	/*
	 * If the ncp is already resolved we have nothing to do.  However,
	 * we do want to guarantee that a usable vnode is returned when
	 * a vnode is present, so make sure it hasn't been reclaimed.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
			_cache_setunresolved(ncp);
	}

	if (ncp->nc_flag & NCF_UNRESOLVED) {
		_cache_unlock(ncp);
		while (vfs_busy(mp, 0))
			;
		error = VFS_ROOT(mp, &vp);
		_cache_lock(ncp);

		/*
		 * Recheck the ncp state after relocking.
		 */
		if (ncp->nc_flag & NCF_UNRESOLVED) {
			ncp->nc_error = error;
			if (error == 0) {
				_cache_setvp(mp, ncp, vp);
				vput(vp);
			} else {
				kprintf("[diagnostic] cache_resolve_mp: failed"
					" to resolve mount %p err=%d ncp=%p\n",
					mp, error, ncp);
				_cache_setvp(mp, ncp, NULL);
			}
		} else if (error == 0) {
			vput(vp);
		}
		vfs_unbusy(mp);
	}
	return(ncp->nc_error);
}
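
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It isolates the unlock/relock idiom used by
 * cache_resolve_mp() above: because the ncp lock is dropped around the
 * potentially long-blocking VFS_ROOT() call, another thread may have
 * resolved the entry in the meantime, so NCF_UNRESOLVED must be
 * re-tested after relocking before the result is installed.  The
 * vfs_busy()/vfs_unbusy() bracketing is omitted for brevity and the
 * function name is hypothetical.
 */
#if 0
static void
relock_recheck_idiom_example(struct namecache *ncp, struct mount *mp)
{
	struct vnode *vp;
	int error;

	_cache_unlock(ncp);		/* avoid race-to-root deadlocks */
	error = VFS_ROOT(mp, &vp);	/* may block for a long time */
	_cache_lock(ncp);

	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		/* somebody else resolved it while we were unlocked */
		if (error == 0)
			vput(vp);
		return;
	}
	/* still unresolved: safe to install the result we computed */
	if (error == 0) {
		_cache_setvp(mp, ncp, vp);
		vput(vp);
	} else {
		_cache_setvp(mp, ncp, NULL);
	}
}
#endif
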
/*
 * Resolve the parent vnode.
 */
int
cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp)
{
	struct namecache *par_tmp;
	struct namecache *par;
	struct namecache *ncp;
	struct nchandle nctmp;
	struct mount *mp;
	struct vnode *dvp;
	int error;

	*dvpp = NULL;
	ncp = nch->ncp;
	mp = nch->mount;
	KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

	/*
	 * Treat this as a mount point even if it has a parent (e.g.
	 * null-mount).  Return a NULL dvp and no error.
	 */
	if (ncp == mp->mnt_ncmountpt.ncp)
		return 0;

	/*
	 * If the ncp was destroyed there is no parent directory, return
	 * EINVAL.
	 */
	if (ncp->nc_flag & NCF_DESTROYED)
		return(EINVAL);

	/*
	 * No parent if at the root of a filesystem, no error.  Typically
	 * not applicable to null-mounts.  This case should have been caught
	 * in the above ncmountpt check.
	 */
	if (ncp->nc_parent == NULL)
		return 0;

	/*
	 * Resolve the parent dvp.
	 *
	 * The vp's of the parent directories in the chain are held via vhold()
	 * due to the existence of the child, and should not disappear.
	 * However, there are cases where they can disappear:
	 *
	 *	- due to filesystem I/O errors.
	 *	- due to NFS being stupid about tracking the namespace and
	 *	  destroying the namespace for entire directories quite often.
	 *	- due to forced unmounts.
	 *	- due to an rmdir (parent will be marked DESTROYED)
	 *
	 * When this occurs we have to track the chain backwards and resolve
	 * it, looping until the resolver catches up to the current node.  We
	 * could recurse here but we might run ourselves out of kernel stack
	 * so we do it in a more painful manner.  This situation really should
	 * not occur all that often, and if it does it should not have to go
	 * back too many nodes to resolve the ncp.
	 */
	while ((dvp = cache_dvpref(ncp)) == NULL) {
		/*
		 * This case can occur if a process is chdir'd into a
		 * directory which is then rmdir'd.  If the parent is marked
		 * destroyed there is no point trying to resolve it.
		 */
		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
			return(ENOENT);
		par = ncp->nc_parent;
		_cache_hold(par);
		_cache_lock(par);
		while ((par_tmp = par->nc_parent) != NULL &&
		       par_tmp->nc_vp == NULL) {
			_cache_hold(par_tmp);
			_cache_lock(par_tmp);
			_cache_put(par);
			par = par_tmp;
		}
		if (par->nc_parent == NULL) {
			kprintf("EXDEV case 2 %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			return (EXDEV);
		}

		/*
		 * The parent is not set in stone, ref and lock it to prevent
		 * it from disappearing.  Also note that due to renames it
		 * is possible for our ncp to move and for par to no longer
		 * be one of its parents.  We resolve it anyway, the loop
		 * will handle any moves.
		 */
		_cache_get(par);	/* additional hold/lock */
		_cache_put(par);	/* from earlier hold/lock */
		if (par == nch->mount->mnt_ncmountpt.ncp) {
			cache_resolve_mp(nch->mount);
		} else if ((dvp = cache_dvpref(par)) == NULL) {
			kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
				par->nc_nlen, par->nc_nlen, par->nc_name);
			_cache_put(par);
			continue;
		} else {
			if (par->nc_flag & NCF_UNRESOLVED) {
				nctmp.mount = mp;
				nctmp.ncp = par;
				par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
			}
			vrele(dvp);
		}
		if ((error = par->nc_error) != 0) {
			if (par->nc_error != EAGAIN) {
				kprintf("EXDEV case 3 %*.*s error %d\n",
					par->nc_nlen, par->nc_nlen, par->nc_name,
					par->nc_error);
				_cache_put(par);
				return(error);
			}
			kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
				par, par->nc_nlen, par->nc_nlen, par->nc_name);
		}
		_cache_put(par);
		/* loop */
	}

	/*
	 * We have a referenced dvp.
	 */
	*dvpp = dvp;
	return 0;
}
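
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It restates the calling contract of cache_resolve_dvp()
 * above: on success *dvpp is either NULL (mount point or filesystem
 * root, which is not an error) or a referenced parent vnode that the
 * caller must vrele().  The function name is hypothetical.
 */
#if 0
static int
cache_resolve_dvp_usage_example(struct nchandle *nch, struct ucred *cred)
{
	struct vnode *dvp;
	int error;

	/* nch is assumed to be referenced and exclusively locked */
	error = cache_resolve_dvp(nch, cred, &dvp);
	if (error == 0 && dvp) {
		/* ... operate on the referenced parent directory ... */
		vrele(dvp);
	}
	return (error);
}
#endif
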
/*
 * Clean out negative cache entries when too many have accumulated.
 */
static void
_cache_cleanneg(long count)
{
	struct pcpu_ncache *pn;
	struct namecache *ncp;
	static uint32_t neg_rover;
	uint32_t n;
	long vnegs;

	n = neg_rover++;	/* SMP heuristic, race ok */
	cpu_ccfence();
	n = n % (uint32_t)ncpus;

	/*
	 * Normalize vfscache_negs and count.  count is sometimes based
	 * on vfscache_negs.  vfscache_negs is heuristic and can sometimes
	 * have crazy values.
	 */
	vnegs = vfscache_negs;
	cpu_ccfence();
	if (vnegs <= MINNEG)
		vnegs = MINNEG;
	if (count < 1)
		count = 1;

	pn = &pcpu_ncache[n];
	spin_lock(&pn->neg_spin);
	count = pn->neg_count * count / vnegs + 1;
	spin_unlock(&pn->neg_spin);

	/*
	 * Attempt to clean out the specified number of negative cache
	 * entries.
	 */
	while (count > 0) {
		spin_lock(&pn->neg_spin);
		ncp = TAILQ_FIRST(&pn->neg_list);
		if (ncp == NULL) {
			spin_unlock(&pn->neg_spin);
			break;
		}
		TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);
		spin_unlock(&pn->neg_spin);

		/*
		 * This can race, so we must re-check that the ncp
		 * is still a resolved negative entry after successfully
		 * locking it.
		 */
		if (_cache_lock_special(ncp) == 0) {
			if (ncp->nc_vp == NULL &&
			    (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				cache_zap(ncp);
			} else {
				_cache_unlock(ncp);
				_cache_drop(ncp);
			}
		} else {
			_cache_drop(ncp);
		}
		--count;
	}
}
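
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It isolates the per-cpu scaling performed by
 * _cache_cleanneg() above: with vnegs = 10000 negative entries globally,
 * neg_count = 2500 of them on the chosen cpu's list, and a request of
 * count = 400, that cpu cleans 2500 * 400 / 10000 + 1 = 101 entries.
 * The function name is hypothetical.
 */
#if 0
static long
cleanneg_share_example(long neg_count, long count, long vnegs)
{
	if (vnegs <= MINNEG)
		vnegs = MINNEG;
	if (count < 1)
		count = 1;
	return (neg_count * count / vnegs + 1);
}
#endif
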
/*
 * Clean out positive cache entries when too many have accumulated.
 */
static void
_cache_cleanpos(long count)
{
	static volatile int rover;
	struct nchash_head *nchpp;
	struct namecache *ncp;
	int rover_copy;

	/*
	 * Attempt to clean out the specified number of positive cache
	 * entries.
	 */
	while (count > 0) {
		rover_copy = ++rover;	/* MPSAFEENOUGH */
		cpu_ccfence();
		nchpp = NCHHASH(rover_copy);

		if (TAILQ_FIRST(&nchpp->list) == NULL) {
			--count;
			continue;
		}

		/*
		 * Cycle ncp on list, ignore and do not move DUMMY
		 * ncps.  These are temporary list iterators.
		 *
		 * We must cycle the ncp to the end of the list to
		 * ensure that all ncp's have an equal chance of
		 * being removed.
		 */
		spin_lock(&nchpp->spin);
		ncp = TAILQ_FIRST(&nchpp->list);
		while (ncp && (ncp->nc_flag & NCF_DUMMY))
			ncp = TAILQ_NEXT(ncp, nc_hash);
		if (ncp) {
			TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
			TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
			_cache_hold(ncp);
		}
		spin_unlock(&nchpp->spin);

		if (ncp) {
			if (_cache_lock_special(ncp) == 0) {
				cache_zap(ncp);
			} else {
				_cache_drop(ncp);
			}
		}
		--count;
	}
}

/*
 * This is a kitchen sink function to clean out ncps which we
 * tried to zap from cache_drop() but failed because we were
 * unable to acquire the parent lock.
 *
 * Such entries can also be removed via cache_inval_vp(), such
 * as when unmounting.
 */
static void
_cache_cleandefered(void)
{
	struct nchash_head *nchpp;
	struct namecache *ncp;
	struct namecache dummy;
	int i;

	/*
	 * Create a list iterator.  DUMMY indicates that this is a list
	 * iterator, DESTROYED prevents matches by lookup functions.
	 */
	numdefered = 0;
	pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
	bzero(&dummy, sizeof(dummy));
	dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
	dummy.nc_refs = 1;

	for (i = 0; i <= nchash; ++i) {
		nchpp = &nchashtbl[i];

		spin_lock(&nchpp->spin);
		TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
		ncp = &dummy;
		while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
			if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
				continue;
			TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
			TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
			_cache_hold(ncp);
			spin_unlock(&nchpp->spin);
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				_cache_unlock(ncp);
			}
			_cache_drop(ncp);
			spin_lock(&nchpp->spin);
			ncp = &dummy;
		}
		TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
		spin_unlock(&nchpp->spin);
	}
}
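
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It isolates the DUMMY marker technique used by
 * _cache_cleandefered() above: a caller-supplied marker entry (flagged
 * NCF_DUMMY | NCF_DESTROYED) is threaded onto the hash chain so the
 * bucket spin lock can be dropped while an individual ncp is processed,
 * and the scan resumes from the marker rather than from the head of the
 * possibly-changed list.  The function name is hypothetical.
 */
#if 0
static void
marker_scan_example(struct nchash_head *nchpp, struct namecache *marker)
{
	struct namecache *ncp;

	spin_lock(&nchpp->spin);
	TAILQ_INSERT_HEAD(&nchpp->list, marker, nc_hash);
	while ((ncp = TAILQ_NEXT(marker, nc_hash)) != NULL) {
		/* reposition the marker past the entry we will work on */
		TAILQ_REMOVE(&nchpp->list, marker, nc_hash);
		TAILQ_INSERT_AFTER(&nchpp->list, ncp, marker, nc_hash);
		_cache_hold(ncp);
		spin_unlock(&nchpp->spin);

		/* ... process ncp without holding the bucket lock ... */

		_cache_drop(ncp);
		spin_lock(&nchpp->spin);
	}
	TAILQ_REMOVE(&nchpp->list, marker, nc_hash);
	spin_unlock(&nchpp->spin);
}
#endif
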
/*
 * Name cache initialization, called from vfsinit() when we are booting.
 */
void
nchinit(void)
{
	struct pcpu_ncache *pn;
	globaldata_t gd;
	int i;

	/*
	 * Per-cpu accounting and negative hit list
	 */
	pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
			      M_VFSCACHEAUX, M_WAITOK|M_ZERO);
	for (i = 0; i < ncpus; ++i) {
		pn = &pcpu_ncache[i];
		TAILQ_INIT(&pn->neg_list);
		spin_init(&pn->neg_spin, "ncneg");
		spin_init(&pn->umount_spin, "ncumm");
	}

	/*
	 * Initialise per-cpu namecache effectiveness statistics.
	 */
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		gd->gd_nchstats = &nchstats[i];
	}

	/*
	 * Create a generous namecache hash table
	 */
	nchashtbl = hashinit_ext(vfs_inodehashsize(),
				 sizeof(struct nchash_head),
				 M_VFSCACHEAUX, &nchash);
	for (i = 0; i <= (int)nchash; ++i) {
		TAILQ_INIT(&nchashtbl[i].list);
		spin_init(&nchashtbl[i].spin, "nchinit_hash");
	}
	for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
		spin_init(&ncmount_cache[i].spin, "nchinit_cache");
	nclockwarn = 5 * hz;
}

/*
 * Called from start_init() to bootstrap the root filesystem.  Returns
 * a referenced, unlocked namecache record.
 */
void
cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
{
	nch->ncp = cache_alloc(0);
	nch->mount = mp;
	_cache_mntref(mp);
	if (vp)
		_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * vfs_cache_setroot()
 *
 * Create an association between the root of our namecache and
 * the root vnode.  This routine may be called several times during
 * booting.
 *
 * If the caller intends to save the returned namecache pointer somewhere
 * it must cache_hold() it.
 */
void
vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
{
	struct vnode *ovp;
	struct nchandle onch;

	ovp = rootvnode;
	onch = rootnch;
	rootvnode = nvp;
	if (nch)
		rootnch = *nch;
	else
		cache_zero(&rootnch);
	if (ovp)
		vrele(ovp);
	if (onch.ncp)
		cache_drop(&onch);
}

/*
 * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
 * topology and is being removed as quickly as possible.  The new VOP_N*()
 * API calls are required to make specific adjustments using the supplied
 * ncp pointers rather than just bogusly purging random vnodes.
 *
 * Invalidate all namecache entries to a particular vnode as well as
 * any direct children of that vnode in the namecache.  This is a
 * 'catch all' purge used by filesystems that do not know any better.
 *
 * Note that the linkage between the vnode and its namecache entries will
 * be removed, but the namecache entries themselves might stay put due to
 * active references from elsewhere in the system or due to the existence of
 * the children.  The namecache topology is left intact even if we do not
 * know what the vnode association is.  Such entries will be marked
 * NCF_UNRESOLVED.
 */
void
cache_purge(struct vnode *vp)
{
	cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
}
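
/*
 * Hypothetical sketch added for exposition (kept under #if 0, not
 * compiled).  It shows the kind of "catch all" call site the old-API
 * comment above refers to: a filesystem that does not track which names
 * reference a vnode simply purges everything associated with it when the
 * vnode is torn down, whereas the new VOP_N*() API makes targeted
 * adjustments instead.  This is not a real kernel code path and the
 * function name is hypothetical.
 */
#if 0
static void
legacy_reclaim_example(struct vnode *vp)
{
	/* invalidate the vnode's namecache linkages and direct children */
	cache_purge(vp);

	/* ... filesystem-specific teardown continues ... */
}
#endif
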
__read_mostly static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable getcwd");

/*
 * MPALMOSTSAFE
 */
int
sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap)
{
	u_int buflen;
	int error;
	char *buf;
	char *bp;

	if (disablecwd)
		return (ENODEV);

	buflen = uap->buflen;
	if (buflen == 0)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = kmalloc(buflen, M_TEMP, M_WAITOK);
	bp = kern_getcwd(buf, buflen, &error);
	if (error == 0)
		error = copyout(bp, uap->buf, strlen(bp) + 1);
	kfree(buf, M_TEMP);
	return (error);
}

char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct nchandle nch;
	struct namecache *ncp;

	bp = buf;
	bp += buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;

	nch = fdp->fd_ncdir;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);

	while (ncp && (ncp != fdp->fd_nrdir.ncp ||
	       nch.mount != fdp->fd_nrdir.mount)
	) {
		if (ncp->nc_flag & NCF_DESTROYED) {
			_cache_drop(ncp);
			ncp = NULL;
			break;
		}
		/*
		 * While traversing upwards if we encounter the root
		 * of the current mount we have to skip to the mount point
		 * in the underlying filesystem.
		 */
		if (ncp == nch.mount->mnt_ncmountpt.ncp) {
			nch = nch.mount->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				*error = ERANGE;
				bp = NULL;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			if (ncp_shared_lock_disable)
				_cache_lock(ncp);
			else
				_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		*error = ENOENT;
		bp = NULL;
		goto done;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
	}
	*error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return (bp);
}
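
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It isolates the backwards prepend technique used by
 * kern_getcwd() above (and by cache_fullpath() below): the path is
 * assembled from the tail of the buffer toward the front, so the
 * finished string begins at the returned pointer rather than at buf[0].
 * The function name is hypothetical.
 */
#if 0
static char *
prepend_segment_example(char *buf, size_t buflen,
			const char *name, size_t nlen)
{
	char *bp;
	size_t i;

	bp = buf + buflen - 1;
	*bp = '\0';
	for (i = nlen; i > 0; --i) {
		if (bp == buf)
			return (NULL);	/* ERANGE in the real code */
		*--bp = name[i - 1];
	}
	if (bp == buf)
		return (NULL);
	*--bp = '/';
	return (bp);			/* points at "/name" */
}
#endif
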
/*
 * Thus begins the fullpath magic.
 *
 * The passed nchp is referenced but not locked.
 */
__read_mostly static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0,
    "Disable fullpath lookups");

int
cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
	       char **retbuf, char **freebuf, int guess)
{
	struct nchandle fd_nrdir;
	struct nchandle nch;
	struct namecache *ncp;
	struct mount *mp, *new_mp;
	char *bp, *buf;
	int slash_prefixed;
	int error = 0;
	int i;

	*retbuf = NULL;
	*freebuf = NULL;

	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	if (nchbase)
		fd_nrdir = *nchbase;
	else if (p != NULL)
		fd_nrdir = p->p_fd->fd_nrdir;
	else
		fd_nrdir = rootnch;
	slash_prefixed = 0;
	nch = *nchp;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);
	mp = nch.mount;

	while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
		new_mp = NULL;

		/*
		 * If we are asked to guess the upwards path, we do so
		 * whenever we encounter an ncp marked as a mountpoint.
		 * We try to find the actual mountpoint by looking up the
		 * mount associated with this ncp.
		 */
		if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
			new_mp = mount_get_by_nc(ncp);
		}
		/*
		 * While traversing upwards if we encounter the root
		 * of the current mount we have to skip to the mount point.
		 */
		if (ncp == mp->mnt_ncmountpt.ncp) {
			new_mp = mp;
		}
		if (new_mp) {
			nch = new_mp->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			mp = nch.mount;
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				kfree(buf, M_TEMP);
				error = ENOMEM;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 *
		 * We can only safely access nc_parent with ncp held locked.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		kfree(buf, M_TEMP);
		error = ENOENT;
		goto done;
	}

	if (!slash_prefixed) {
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
	}
	*retbuf = bp;
	*freebuf = buf;
	error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return(error);
}

int
vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
	    char **freebuf, int guess)
{
	struct namecache *ncp;
	struct nchandle nch;
	int error;

	*freebuf = NULL;
	if (disablefullpath)
		return (ENODEV);

	if (p == NULL)
		return (EINVAL);

	/* if vn is NULL the client wants us to use p->p_textvp */
	if (vn == NULL) {
		if ((vn = p->p_textvp) == NULL)
			return (EINVAL);
	}
	spin_lock_shared(&vn->v_spin);
	TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
		if (ncp->nc_nlen)
			break;
	}
	if (ncp == NULL) {
		spin_unlock_shared(&vn->v_spin);
		return (EINVAL);
	}
	_cache_hold(ncp);
	spin_unlock_shared(&vn->v_spin);

	nch.ncp = ncp;
	nch.mount = vn->v_mount;
	error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
	_cache_drop(ncp);
	return (error);
}

void
vfscache_rollup_cpu(struct globaldata *gd)
{
	struct pcpu_ncache *pn;
	long count;

	if (pcpu_ncache == NULL)
		return;
	pn = &pcpu_ncache[gd->gd_cpuid];

	if (pn->vfscache_count) {
		count = atomic_swap_long(&pn->vfscache_count, 0);
		atomic_add_long(&vfscache_count, count);
	}
	if (pn->vfscache_leafs) {
		count = atomic_swap_long(&pn->vfscache_leafs, 0);
		atomic_add_long(&vfscache_leafs, count);
	}
	if (pn->vfscache_negs) {
		count = atomic_swap_long(&pn->vfscache_negs, 0);
		atomic_add_long(&vfscache_negs, count);
	}
	if (pn->numdefered) {
		count = atomic_swap_long(&pn->numdefered, 0);
		atomic_add_long(&numdefered, count);
	}
}
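
/*
 * Illustrative sketch added for exposition (kept under #if 0, not
 * compiled).  It restates the ownership contract of the fullpath
 * routines above: on success *retbuf points into *freebuf, a MAXPATHLEN
 * buffer allocated with M_TEMP that the caller must kfree() when done.
 * The function name is hypothetical.
 */
#if 0
static void
vn_fullpath_usage_example(struct proc *p, struct vnode *vp)
{
	char *retbuf;
	char *freebuf;
	int error;

	error = vn_fullpath(p, vp, &retbuf, &freebuf, 0);
	if (error == 0) {
		kprintf("path: %s\n", retbuf);
		kfree(freebuf, M_TEMP);		/* retbuf points into it */
	}
}
#endif
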